├── LICENSE.md ├── README.md ├── Week 1 Introduction to Statistics and Probability ├── 1.1 Introduction to Statistics and Probability.rtf ├── 1.2 What is Probability Theory.rtf ├── 1.3 What is Statistics.rtf ├── 1.4 A Puzzle.rtf ├── 1.5 A poster.rtf ├── 1.5 History of Probability and Statistics.rtf ├── 1.5 asset-v1_UCSanDiegoX+DSE210x+3T2017+type@asset+block@Histor.pdf ├── 1.7 Week1-IntroMotivation.zip ├── 1.8 HW_1 Py_3.6.ipynb ├── 1.8 HW_1.ipynb ├── 1.9 Programming Assignment.pdf ├── Install Software.docx ├── Install Software.pdf ├── Quiz 1.pdf ├── Week1-IntroMotivation │ ├── 1.What-is-Probability.ipynb │ ├── 2.What-is-Statistics_.ipynb │ ├── 3.Long-term-frequencies.ipynb │ ├── 4.The-Three-card-Puzzle.ipynb │ └── images │ │ ├── AB.png │ │ ├── Hitting.png │ │ └── Scorecard.jpg └── asset-v1_UCSanDiegoX+DSE210x+3T2017+type@asset+block@Week_1_sid.pdf ├── Week 10 Confidence Intervals and Hypothesis Testing ├── 1.1. Confidence Interval on Mean Part 1.rtf ├── 1.2. Confidence Interval on Mean Part 2.rtf ├── 2. Sigma Unknown.rtf ├── 2.1. Sigma Unknown Example.rtf ├── 3 Hypothesis Testing.rtf ├── 4. Hypothesis Testing - p-Values.rtf ├── 5. Tea Testing.rtf ├── 6. Hypothesis Testing - Z and T Tests.rtf ├── Expectation_variance_and_covariance.ipynb ├── HW_10.ipynb ├── Problem Set 10.pdf ├── Programming Assignment.pdf ├── Week_10_Part_1.pdf ├── Week_10_Part_2.pdf ├── Week_10_Part_3.pdf ├── Week_10_Part_4.pdf └── Week_10_Part_5.pdf ├── Week 2 Sets ├── 1 Week_2.pdf ├── 2.1 Notation Elements, sets, and membership.rtf ├── 2.2 Basic Sets Some simple sets.rtf ├── 2.3 Venn Diagrams Visualizing Sets.rtf ├── 2.4 Relations Complement, Intersection, Union.rtf ├── 2.5 Operations Set Operations.rtf ├── 2.6 Cartesian Products Tuples and products.rtf ├── 2.7 Russell's Paradox Russell's Paradox.rtf ├── 2_sets_hw.ipynb ├── Polls _ 2.11 Discussion Section _ DSE210x Courseware _ edX.html ├── Problem Set 2 _ 2.9 Problem Sets _ DSE210x Courseware _ edX.html ├── Problem Set 2.pdf ├── Programming Assignment _ 2.10 Programming Assignment _ DSE210x Courseware _ edX.html ├── Programming Assignment.pdf ├── Quiz 2 _ 2.8 Comprehension Quiz _ DSE210x Courseware _ edX.html ├── Quiz 2.pdf └── Sets.ipynb ├── Week 3 Counting and Combinatorics ├── 1 Sets.rtf ├── 10 Combinations.rtf ├── 11 Binomial Coefficient.rtf ├── 12 Properties of Binomial Coefficient.rtf ├── 13 Pascal.rtf ├── 14 Multinomial Coefficients.rtf ├── 15 Beyond Combinatorics.rtf ├── 2 Disjoint Union.rtf ├── 3 Products.rtf ├── 4 Mix It Up.rtf ├── 5 Counting Cartesian Powers.rtf ├── 6 Counting Variations.rtf ├── 7 Counting Trees.rtf ├── 8 Permutations.rtf ├── 9 Partial Permutations.rtf ├── Problem Set 3 _ 3.17 Problem Sets _ DSE210x Courseware _ edX.html ├── Programming Assignment _ 3.18 Programming Assignment _ DSE210x Courseware _ edX.html ├── Quiz 3 _ 3.16 Comprehension Quiz _ DSE210x Courseware _ edX.html ├── Week_3_Part_1.pdf ├── Week_3_Part_2.pdf └── week3.zip ├── Week 4 Probability and Conditioning ├── 1 Distribution Types.rtf ├── 10 Sequences.rtf ├── 11 Total Probability.rtf ├── 12 Bayes' Rule.rtf ├── 2 Distribution Types.rtf ├── 3 Events.rtf ├── 3_Counting.ipynb ├── 4 Repeated Experiments.rtf ├── 4_Permutations_and_Combinations.ipynb ├── 6 Axioms.rtf ├── 7 Inequalities.rtf ├── 8 Conditional Probability.rtf ├── 9 Independence.rtf ├── HW_4.ipynb ├── Problem Set 4 _ 4.14 Problem Sets _ DSE210x Courseware _ edX.html ├── Programming Assignment _ 4.15 Programming Assignment _ DSE210x Courseware _ edX.html ├── Quiz 4 _ 4.13 Comprehension Quiz _ DSE210x Courseware _ 
edX.html ├── Week_4_Part_1.pdf └── Week_4_Part_2.pdf ├── Week 5 Random Variables, Expectation, and Variance ├── 1 Random Variables.rtf ├── 2 Cumulative Distribution Function.rtf ├── 3 Expectation.rtf ├── 4 Variable Modification.rtf ├── 5 Expectation of Functions.rtf ├── 5_Probability.ipynb ├── 6 Variance.rtf ├── 7 Two Variables.rtf ├── 8 Linearity of Expectations.rtf ├── 9 Covariance.rtf ├── Problem Set 5 _ 5.11 Problem Sets _ DSE210x Courseware _ edX.html ├── Programming Assignment _ 5.12 Programming Assignment _ DSE210x Courseware _ edX.html ├── Quiz 5 _ 5.10 Comprehension Quiz _ DSE210x Courseware _ edX.html ├── Week_5_Part_1.pdf ├── Week_5_Part_2.pdf └── dice_HW.ipynb ├── Week 6 Discrete and Continuous Distribution ├── 1 Distribution Families.rtf ├── 10 Exponential Distribution.rtf ├── 11 Normal Distribution.rtf ├── 12 Gaussian Probability.rtf ├── 2 Bernoulli.rtf ├── 3 Binomial Distribution.rtf ├── 4 Poisson.rtf ├── 5 Geometric.rtf ├── 6 Examples.rtf ├── 6_conditional_probability_hw.ipynb ├── 7 Continuous Distributions.rtf ├── 8 Functions of Random Variables.rtf ├── 9 Uniform Distribution.rtf ├── Problem Set 6 _ 6.12 Problem Sets _ DSE210x Courseware _ edX.html ├── Problem Set 6 _ 6.12 Problem Sets _ DSE210x Courseware _ edX.pdf ├── Programming Assignment _ 6.13 Programming Assignment _ DSE210x Courseware _ edX.html ├── Programming Assignment _ 6.13 Programming Assignment _ DSE210x Courseware _ edX.pdf ├── Quiz 6 _ 6.11 Comprehension Quiz _ DSE210x Courseware _ edX.html ├── Quiz 6 _ 6.11 Comprehension Quiz _ DSE210x Courseware _ edX.pdf ├── Week6_Continuous_Probability.ipynb ├── Week6_Discrete_Probability.ipynb ├── Week_6_Part_1.pdf ├── Week_6_Part_2.pdf ├── Week_6_Part_3.pdf └── Week_6_Part_4.pdf ├── Week 7 Inequalities and Limit Theorems ├── 1 Markov's Inequality.rtf ├── 10_inequalities.ipynb ├── 2 Chebyshev's Inequality.rtf ├── 3 Weak Law of Large Numbers.rtf ├── 4 Moment Generating.rtf ├── 5 Chernoff.rtf ├── 6 Theorem.rtf ├── 7 Proof.rtf ├── Expectation_variance_and_covariance.ipynb ├── Problem Set 7 _ 7.9 Problem Sets _ DSE210x Courseware _ edX.html ├── Problem Set 7 _ 7.9 Problem Sets _ DSE210x Courseware _ edX.pdf ├── Programming Assignment _ 7.10 Programming Assignment _ DSE210x Courseware _ edX.html ├── Programming Assignment _ 7.10 Programming Assignment _ DSE210x Courseware _ edX.pdf ├── Quiz 7 _ 7.8 Comprehension Quiz _ DSE210x Courseware _ edX.html ├── Quiz 7.pdf └── inequalities_HW.ipynb ├── Week 8 Statistics and Parameter Estimation ├── 1. Stats.rtf ├── 11_Statistics_lecture_notebook_.ipynb ├── 2. Mean and Variance.rtf ├── 4. Unbiased Estimation.rtf ├── 5. Standard Deviation.rtf ├── HW_8.ipynb ├── Problem Set 8.pdf ├── Programming Assignment 8.pdf ├── Quiz 8.pdf ├── Week_8_Part_1.pdf └── Week_8_Part_2.pdf └── Week 9 Regression and PCA ├── 1. Review of Linear Algebra.rtf ├── 2. Matrices Notations and Operations.rtf ├── 3. Solving a System of Linear Equations.rtf ├── 4. Linear Regression.rtf ├── 5. Polynomial Regression.rtf ├── 6. Regression Towards the Mean.rtf ├── 7. Components Analysis.rtf ├── HW_9.zip ├── Programming Assignment.pdf ├── Quiz 9.pdf ├── lectures.zip └── more_lectures.zip /LICENSE.md: -------------------------------------------------------------------------------- 1 | © 2012–2018 edX Inc. All rights reserved except where noted. 2 | EdX, Open edX and the edX and Open edX logos are registered trademarks or trademarks of edX Inc. 
| 粤ICP备17044299号-2 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DSE210x-Statistics-and-Probability-in-Data-Science-using-Python 2 | UCSanDiegoX edX Course DSE210x Statistics and Probability in Data Science using Python 3 | https://courses.edx.org/courses/course-v1:UCSanDiegoX+DSE210x+3T2017/course/ 4 | 5 | Welcome to Statistics and Probability in Data Science using Python! 6 | We are delighted to welcome you to Statistics and Probability in Data Science using Python. 7 | In this course, you will learn the motivation, intuition, and theory behind the probabilistic 8 | and statistical foundations of data science, and will get to experiment and practice with these concepts 9 | via Python programs and the Jupyter Notebook platform. 10 | 11 | Course Staff 12 | Instructors 13 | Alon Orlitsky, Professor, ECE and CSE Departments, UC San Diego 14 | Yoav Freund, Professor, CSE Department, UC San Diego 15 | 16 | Teaching Assistants 17 | Matthew Elliot, Graduate Student, CSE, UC San Diego 18 | Rohit Parasnis, Graduate Student, ECE, UC San Diego 19 | Hanwen Yao, Graduate Student, ECE, UC San Diego 20 | Zhen Zhai, Graduate Student, CSE, UC San Diego 21 | 22 | What do you need to know to succeed? 23 | The course is intended for learners with an undergraduate degree or senior undergraduates 24 | interested in broadening their understanding of probability and statistics. 25 | We will assume basic knowledge of the following topics 26 | 27 | Logic (e.g., De Morgan’s Laws) 28 | Set theory (e.g., what are functions) 29 | Calculus (e.g., calculating integrals and derivatives) 30 | Programming (e.g., basic experience with any programming language) 31 | Linear algebra (e.g., vectors and matrices) 32 | The Python programming language will be used throughout the course. 33 | If you would like to learn or gain more practice with Python, please consider 34 | viewing or taking the first course in this MicroMasters, Python for Data Science. 35 | 36 | Overview 37 | The course will cover the following topics: 38 | 39 | Counting and combinatorics 40 | Discrete and continuous probability 41 | Conditional probability and Bayes’ Rule 42 | Random variables 43 | Expectation, variance, and correlation 44 | Common distribution families 45 | Probabilistic inequalities and concentration 46 | Moments and limit theorems 47 | Hypothesis testing 48 | Sampling and confidence intervals 49 | PCA and regression 50 | Entropy and compression 51 | Learning Objectives 52 | The course will teach you how to visualize, understand, and reason about probabilistic 53 | and statistical concepts, and how to apply your knowledge to analyze data sets and draw 54 | meaningful conclusions from data. We will cover both theoretical and practical aspects, 55 | and will start each topic with motivation and intuition and will proceed with rigorous 56 | arguments and provable techniques. Each topic will be accompanied by a Python Notebook 57 | that you could run and modify to experiment with the material learned and get a better 58 | feel for the material covered. 59 | 60 | Course Outline 61 | The course consists of 10 units. In each of the course’s first 10 weeks we will release 62 | one unit, and you will have six weeks to complete it. 
63 | 64 | ● Week 1 - Introduction 65 | 66 | ● Week 2 - Sets 67 | 68 | ● Week 3 - Counting and Combinatorics 69 | 70 | ● Week 4 - Probability and Conditioning 71 | 72 | ● Week 5 - Random Variables, Expectation, and Variance 73 | 74 | ● Week 6 - Discrete and Continuous Distribution Families 75 | 76 | ● Week 7 - Inequalities and Concentration Theorems 77 | 78 | ● Week 8 - Sampling, Confidence Intervals, and Hypothesis Testing 79 | 80 | ● Week 9 - Regression and Principal Component Analysis 81 | 82 | ● Week 10 - Entropy and Compression 83 | -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.1 Introduction to Statistics and Probability.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - [Yoav] Hi, my name is Yoav Freund,\par 4 | and this is Introduction to Probability and Statistics.\par 5 | This is the first video,\par 6 | therefore it is the introduction\par 7 | to Introduction to Probability and Statistics.\par 8 | So we're going to talk about probability.\par 9 | That is, roughly speaking, about tossing dice.\par 10 | And we're going to talk about statistics,\par 11 | which is, roughly speaking,\par 12 | about keeping scores in baseball.\par 13 | So why should you care about probability and statistics?\par 14 | Basically, it's because this is a very powerful tool\par 15 | for dealing with uncertainty.\par 16 | So consider the example here,\par 17 | that we have Google trying to give us a good route\par 18 | from point A to point B.\par 19 | There are two routes that are shown here.\par 20 | One is the shortest route,\par 21 | and the other is the fastest route.\par 22 | So if we think about the shortest route from A to B,\par 23 | that is a certainty.\par 24 | Once we know how the roads are constructed,\par 25 | that is a certain thing.\par 26 | On the other hand,\par 27 | in terms of the fastest route from A to B,\par 28 | that depends on traffic, on other conditions,\par 29 | and that really is something\par 30 | that we have a lot of uncertainty about.\par 31 | Therefore, we need statistics\par 32 | to help us deal with that uncertainty.\par 33 | Here's another example.\par 34 | Search Engines.\par 35 | Suppose you have a search engine\par 36 | and you're looking for some information.\par 37 | The first kind of query that you can make,\par 38 | that is a certainty query,\par 39 | is to find all the web pages\par 40 | that contain the words Trump, or Hillary, and debate. Okay?\par 41 | So that is basically a very specific condition\par 42 | and you can ask your search engine\par 43 | to look for all of those pages.\par 44 | On the other hand, it might be more relevant\par 45 | to ask for the ten most relevant pages for the query\par 46 | "Trump and Hillary debate."
Okay?\par 47 | So that is not a query\par 48 | that has a specific set of well defined answers.\par 49 | It has to do with what words appear in those kind of things\par 50 | and with what pages are really most relevant,\par 51 | most up to date,\par 52 | various things with which we have uncertainty.\par 53 | The last example is about an insurance company.\par 54 | So with an insurance company,\par 55 | you have a contract,\par 56 | and it says with certainty,\par 57 | that if you have life insurance\par 58 | with this company and you die,\par 59 | then the insurance company has to pay your family\par 60 | some prescribed amount of dollars.\par 61 | So that's a certainty.\par 62 | On the other hand,\par 63 | the insurance company itself\par 64 | has to deal with a lot of uncertainty.\par 65 | It doesn't know which people are going to die.\par 66 | So it has to figure out,\par 67 | what is the minimum life insurance premium\par 68 | such that the probability\par 69 | that life insurance company\par 70 | will go bankrupt in 10 years\par 71 | is smaller than 1%.\par 72 | Probably much, much smaller than that.\par 73 | In any case, what the company needs to somehow deal with,\par 74 | is the uncertainty of how many people will die\par 75 | that have insurance,\par 76 | and how much they will have to pay them. Okay?\par 77 | So that's a case we need to deal with uncertainty.\par 78 | So what are you going to learn in this course?\par 79 | First of all,\par 80 | navigation and search engine that I showed you,\par 81 | those are very advanced problems,\par 82 | as is the life insurance market.\par 83 | What you will learn here are the foundations\par 84 | that these kind of methods are based on.\par 85 | So you'll solve basic problems\par 86 | of reasoning under uncertainty.\par 87 | So as an example,\par 88 | you will know how to answer a question of the type;\par 89 | If you flip a coin 100 times,\par 90 | what is the probability of getting at most 10 heads?\par 91 | Or, what is the probability\par 92 | of getting a four of kind hand in poker?\par 93 | These are questions that you would be able to answer.\par 94 | And if you're interested in computer science examples,\par 95 | here are some other questions you might be able to answer\par 96 | which is, suppose you have a hash table\par 97 | with a million elements,\par 98 | and you don't want to allow more than five indirections\par 99 | at most for 10 elements,\par 100 | so how big does the table need to be?\par 101 | That's a calculation that you will be able to do.\par 102 | A similar one is,\par 103 | suppose that you have a router\par 104 | and that the router fails from time to time.\par 105 | And the rates of failure is once a year.\par 106 | So that gives you some information,\par 107 | but you would want to know,\par 108 | specifically let's say,\par 109 | what's the probability that it will fail\par 110 | during the first month?\par 111 | Given that on average it fails once per year.\par 112 | So that's a question that you'll be able to answer.\par 113 | So it's not universal, the belief in statistics.\par 114 | Here is an example from a basketball coach\par 115 | who does not believe in statistics\par 116 | because there are too many other factors and so on.\par 117 | So there are many people\par 118 | that basically don't want to trust statistics.\par 119 | And that is fine.\par 120 | But on the other hand,\par 121 | you can say many other people\par 122 | dealing with the same domain, here, basketball,\par 123 | do trust 
statistics.\par 124 | So this is an app for fans of basketball\par 125 | to have statistics about different players\par 126 | and so they can do fantasy basketball\par 127 | and win a lot of money.\par 128 | So to summarize,\par 129 | what we have all around us\par 130 | when we are doing anything in the world,\par 131 | is uncertainty.\par 132 | And probability and statistics provide a rational way\par 133 | to deal with uncertainty.\par 134 | So what we're going to discuss next is,\par 135 | what is probability, and then what is statistics.\par 136 | So I'll see you then.\par 137 | } 138 | -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.2 What is Probability Theory.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hi, welcome to the course.\par 4 | In this course, there are two\par 5 | main subjects that we will study.\par 6 | One is probability and the other is statistics.\par 7 | These are very related subjects,\par 8 | but they are still different, so let's start\par 9 | by thinking about what is probability.\par 10 | So probability is the mathematical framework\par 11 | for computing probabilities of complex events.\par 12 | That's a mouthful, and we make an assumption\par 13 | that we know the probability of the basic event.\par 14 | What precisely do we mean by probability and by event?\par 15 | Those will be defined later in the class.\par 16 | For now, let's just think in terms of common sense.\par 17 | So let's think about a simple question.\par 18 | We flip a coin and we get tails.\par 19 | We flip it again, we get heads.\par 20 | Okay, what we believe somehow is that\par 21 | the probabilities are equal, but\par 22 | what do we really mean by that?\par 23 | What does that mean, does it mean that\par 24 | we'll exactly get the same number of heads and tails?\par 25 | No, it just means that ...\par 26 | if we flip the coin many times,\par 27 | then for some very large number of coins that we flip,\par 28 | let's say ten thousand, we'll get\par 29 | the number of heads is approximately or about five thousand.\par 30 | Okay, that's what we expect, but\par 31 | what do we mean by about, how can we,\par 32 | how can we express this notion of about\par 33 | in a better way because we might be\par 34 | actually interested in knowing how far from that we are.\par 35 | Okay, so we're going to simulate coin flips.\par 36 | We'll use a pseudo random number generator\par 37 | to simulate coin flips, and instead of heads and tails\par 38 | it'll be more convenient to use one and minus one,\par 39 | and then the number of heads relates to\par 40 | summing all of these plus ones and minus ones\par 41 | and what we expect is that the sum\par 42 | will be zero or close to zero.\par 43 | So we will vary the number of\par 44 | coins flips which we denote by k\par 45 | and here is a little bit of code\par 46 | for generating such random coin flips.\par 47 | Here we're generating the coin flips themselves\par 48 | and here we're summing the coin flips\par 49 | along a particular sequence.\par 50 | We're generating many sequences at once,\par 51 | and this is the number n that we\par 52 | say here is by default one hundred.\par 53 | So this is a central part of the code,\par 54 | but I'm 
not going to show you all of the code,\par 55 | for, to see that, you have to download\par 56 | the notebook yourself and play with it to see the details.\par 57 | So here is a histogram that shows for\par 58 | flipping a coin a thousand times\par 59 | what is the distribution of this sum, that we said.\par 60 | Sum is about zero, but then you see it's not exactly zero,\par 61 | and every time that I rerun this experiment,\par 62 | every time that I rerun the experiment,\par 63 | what you see is that the histogram\par 64 | that you get is slightly different.\par 65 | However, even though it is always\par 66 | each time slightly different, there is\par 67 | something very much in common for all of these coin flips.\par 68 | They're all concentrated around zero,\par 69 | but they're not exactly zero, and for this\par 70 | number of coin flips, one thousand,\par 71 | it is extremely unlikely that they're\par 72 | below two hundred and fifty or above\par 73 | minus two hundred and fifty and above two hundred and fifty.\par 74 | Okay, so with probability theory,\par 75 | we can calculate how small we expect Sk, the sum, to be.\par 76 | The absolute value of the sum, so it can be\par 77 | either negative or positive, and what we show,\par 78 | we will show, is that the probability\par 79 | that this Sk is larger than four times\par 80 | the square root of k is extremely small.\par 81 | It is two times ten to the minus eight,\par 82 | or 0.000002 percent,\par 83 | so we'll have to flip\par 84 | the sequence of one thousand coins\par 85 | many many many times before we can see\par 86 | that it will be bigger than four square root of k.\par 87 | So let's actually do the simulation\par 88 | and see if that is the case.\par 89 | Okay, so here is our simulation.\par 90 | What we see is, here, we have one hundred coin flips.\par 91 | Here, one thousand coin flips,\par 92 | and here ten thousand coin flips,\par 93 | and the red line mark what probability theory says\par 94 | is the boundary in which it is very, very likely\par 95 | that the total number of coin flips resides.\par 96 | So I can rerun this experiment too.\par 97 | And you see that again, each time\par 98 | the distribution is somewhat different\par 99 | but it never goes outside of the red bar.\par 100 | So that's consistent with what we said.\par 101 | Now, here it seems that all of them are very similar.\par 102 | It doesn't really matter if you do\par 103 | one hundred, one thousand, or ten thousand coin flips,\par 104 | but that's really because I'm scaling it\par 105 | according to this boundary, so the boundary\par 106 | here is minus 40 to 40, here it's\par 107 | minus one hundred and something to one hundred and something\par 108 | and here it's minus four hundred to four hundred.\par 109 | If we scale, if we plot the full scale\par 110 | of these coin flips, what we see is the following.\par 111 | We see something like this, so when\par 112 | we plot the whole scale from\par 113 | minus one hundred to one hundred, for hundred coins,\par 114 | and from minus ten thousand to ten thousand\par 115 | for ten thousand coins, then we see\par 116 | that the distribution becomes more and more concentrated\par 117 | around zero, relative to this scale.\par 118 | So if I run it again ...\par 119 | Again you get each time a different distribution\par 120 | but you get that the distribution\par 121 | is more and more concentrated if you\par 122 | flip the coin more and more times\par 123 | and the width of this is square root of k,\par 124 | 
four times two times square of k, so you see\par 125 | the more coin flips you, more times you flip the coin,\par 126 | the closer it is relatively to the range to zero.\par 127 | Okay, so let's summarize.\par 128 | We did some experiments where we summed k random numbers\par 129 | that are correspond to coin flips\par 130 | with probability xi, so we had xi minus one or plus one\par 131 | with probability is half and half.\par 132 | And our experiments show that the sum\par 133 | is almost always in the range\par 134 | minus four square root of k to plus four square root of k.\par 135 | Okay, so we can write it this way.\par 136 | If k goes to infinity, we have 4 square root of k\par 137 | as the range, divided by k, so that is\par 138 | four divided by the square root of k,\par 139 | which is equal to zero, which goes to zero as k increases.\par 140 | And so what we can say is that Sk, relative to k,\par 141 | so the ratio of the number, of the difference\par 142 | between heads and tails, divided by k, that goes to zero.\par 143 | And that's basically what we mean\par 144 | by the probability is being half and half.\par 145 | Okay so again, what is probability theory?\par 146 | It's the math involving proving in a precise way\par 147 | the statements that we made above.\par 148 | Okay, so before, we just kind of did simulations\par 149 | and alluded to something that will prove in the future\par 150 | but that's really what probability theory is,\par 151 | is proving these in a precise way.\par 152 | In most cases, we can approximate the output,\par 153 | these probabilities, using simulations.\par 154 | These are called Monte-Carlo simulations,\par 155 | and that's essentially what we did\par 156 | in this little experiment that we did.\par 157 | So why isn't that enough?\par 158 | Because, first of all, calculating\par 159 | the probability gives you a precise answer,\par 160 | and doing Monte-Carlo simulations\par 161 | just gives you an approximation\par 162 | and you need to run the experiment longer and longer\par 163 | to get more and more accurate answers.\par 164 | And the second is that it is much\par 165 | faster than Monte-Carlo simulations,\par 166 | essentially for the same reasons.\par 167 | Okay, so that is a quick description of what is probability\par 168 | and next time, we're going to talk about what is statistics.\par 169 | See you there.\par 170 | End of transcript. 
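As a companion to the coin-flip experiment described in this transcript, here is a minimal sketch (assuming NumPy; it is not the course's Week1-IntroMotivation notebook, which the lecture asks you to download for the full code) that sums k random ±1 flips and checks how often the absolute sum exceeds 4·sqrt(k):

```python
import numpy as np

def coin_flip_sums(k, n=1000, seed=0):
    """Return the sums S_k of n independent sequences of k fair +/-1 coin flips."""
    rng = np.random.default_rng(seed)
    flips = rng.choice([-1, 1], size=(n, k))   # n sequences, k flips each
    return flips.sum(axis=1)

for k in (100, 1000, 10000):
    sums = coin_flip_sums(k)
    bound = 4 * np.sqrt(k)
    outside = np.mean(np.abs(sums) > bound)    # fraction of sequences with |S_k| > 4*sqrt(k)
    print(f"k={k:6d}  4*sqrt(k)={bound:7.1f}  fraction outside: {outside:.4f}")
```

For a fair coin the printed fractions stay at or very near zero, matching the claim above that the sum almost always lies inside the ±4·sqrt(k) band, and that this band shrinks relative to k as k grows.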
Skip to the start.\par 171 | POLL\par 172 | } 173 | -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.3 What is Statistics.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Okay.\par 4 | So, in the previous video, we talked\par 5 | about what is probability,\par 6 | and this time we're going to talk\par 7 | about what is statistics?\par 8 | So, in probability theory,\par 9 | we compute probabilities of complex events,\par 10 | from the underlying base distribution.\par 11 | Statistics takes us in the opposite direction.\par 12 | We're given data that was generated\par 13 | by some Stochastic process, or some random process,\par 14 | and from that data we infer properties\par 15 | of this Stochastic process.\par 16 | So here is an example.\par 17 | Again, let's go back to our coin.\par 18 | Here's our coin.\par 19 | And, we believe that it is an unbiased coin.\par 20 | So, it gives us exactly half heads and half tails.\par 21 | But how can we be sure?\par 22 | So, previous time we saw that,\par 23 | we talked about the distribution,\par 24 | and now we wanna turn the question around.\par 25 | If we flip the coin 1000 times,\par 26 | and we get 570 heads,\par 27 | then can we conclude that the coin is biased?\par 28 | So, the coin is not a fair coin?\par 29 | What about, can we conclude that the coin is biased?\par 30 | Not a fair coin?\par 31 | What can we conclude if the outcome is 507 heads?\par 32 | It's still not exactly 500, but closer to it.\par 33 | So how can we decide whether the coin\par 34 | is biased or not?\par 35 | So, here is the logic of statistical inference.\par 36 | We wanna say something about the coin,\par 37 | and we use the following kind of logic.\par 38 | Let's say that the coin is fair, okay?\par 39 | And then we can calculate\par 40 | what is the probability that the coin\par 41 | will give us 570 coins.\par 42 | If this probability is extremely small,\par 43 | then we can reject with confidence\par 44 | the hypothesis that the coin is fair.\par 45 | We can say it is very unlikely\par 46 | that a fair coin would generate this sequence,\par 47 | and therefore it is not a fair coin.\par 48 | So, let's see how we can calculate the answer.\par 49 | Recall the simulations we did,\par 50 | with the video, What is Probability.\par 51 | We used xi minus one, xi plus one,\par 52 | instead of heads and tails,\par 53 | and we looked at the sum of all of these variables.\par 54 | And, if the number of heads is 570,\par 55 | then we can easily see that the sum\par 56 | over 1000 plus ones and minus ones,\par 57 | will give us 140.\par 58 | Now, we know, we haven't really shown it yet,\par 59 | but we already, I already stated it\par 60 | in the previous slide,\par 61 | that it's very unlikely that this sum,\par 62 | the absolute value of it,\par 63 | is larger than four square root of k,\par 64 | which in this case is 126.5.\par 65 | So, this is how I calculated this 126.5.\par 66 | And, therefore it is very unlikely\par 67 | that the coin is unbiased, right?\par 68 | We can say the coin is probably biased.\par 69 | Because, it is very unlikely that an unbiased coin\par 70 | would generate this outcome.\par 71 | What about 507 heads?\par 72 | Well, if you have 507 
heads, 493 tails,\par 73 | then Sn is going to be 14,\par 74 | and 14 is much, much smaller than 126.5.\par 75 | So at least according to the rule we just had,\par 76 | we cannot say that the probability is very small,\par 77 | and as we'll see, the probability\par 78 | is actually quite reasonable,\par 79 | that it's quite large, that a fair coin\par 80 | would generate 507 heads.\par 81 | So, we cannot conclude that the coin is biased.\par 82 | It might still be biased, but we might have to\par 83 | flip the coin many, many, many more times,\par 84 | before we can deduce that.\par 85 | Okay, so, as a conclusion,\par 86 | the probability that unbiased coin\par 87 | would generate a sequence\par 88 | with 570 or more heads is extremely small,\par 89 | and from that we can conclude, with high confidence,\par 90 | that the coin is biased.\par 91 | On the other hand, if this sum is larger than 507,\par 92 | that is quite likely.\par 93 | And so getting 507 heads does not provide us\par 94 | with evidence that the coin is biased.\par 95 | Let's think about some real world examples.\par 96 | So statistics, unlike probability\par 97 | which is a part of math,\par 98 | statistics is really about problems in the real world.\par 99 | And so you might ask,\par 100 | "Why should I care about the coin being biased?\par 101 | That's not a problem I kind of face many times."\par 102 | And that's a very valid critique.\par 103 | And we will now give some examples\par 104 | for real problems where the statistics\par 105 | is very closely related to whether or not a coin is biased.\par 106 | So, let's take polls, for instance.\par 107 | Suppose there is elections,\par 108 | that are going to take place in a few days,\par 109 | and we want to know how people plan to vote.\par 110 | Suppose there are just two parties: D and R.\par 111 | We could try and ask all of the potential voters\par 112 | what they plan to vote,\par 113 | but that would be extremely expensive.\par 114 | We cannot really afford to do that.\par 115 | So instead, we can use a poll.\par 116 | What is a poll?\par 117 | We call a small randomly selected set of people,\par 118 | ask them of their opinions,\par 119 | and then we extrapolate from that\par 120 | what do people think in general.\par 121 | So, call n people at random,\par 122 | and count the number of D votes.\par 123 | And the question is, can you say with confidence\par 124 | that there are more D votes, or more R votes?\par 125 | Mathematically, this is exactly equivalent\par 126 | to flipping a biased coin, and asking\par 127 | whether heads is more likely than tails.\par 128 | Or tails more likely than heads.\par 129 | It's the exact same question,\par 130 | and the same math holds for it.\par 131 | Here's another case.\par 132 | This is called A/B testing,\par 133 | which is a very common practice now,\par 134 | on developing web interfaces.\par 135 | You basically think about two alternative designs\par 136 | for your web page, one is A, one is B.\par 137 | And the only difference in this case\par 138 | is whether this bar is to the left,\par 139 | or to the right of the screen, okay?\par 140 | So, to see which design users prefer,\par 141 | we randomly present the design A or design B,\par 142 | when people visit our website.\par 143 | And we measure how long the user stays on the page,\par 144 | or whether the user clicked on an advertisement,\par 145 | or any other indication that the user\par 146 | likes one of the designs more than the other.\par 147 | We want to 
decide with some confidence,\par 148 | which of the two designs is better.\par 149 | Again, this is very similar\par 150 | to making the decision with confidence\par 151 | on whether head is more probable than tails,\par 152 | or vice versa.\par 153 | Okay, so to summarize, statistics is about\par 154 | taking data from some real-world process,\par 155 | and drawing conclusions about this process\par 156 | from the data you collected.\par 157 | And, we talked about several examples.\par 158 | One is using polls to estimate public opinion.\par 159 | Another is performing A/B tests to design web pages.\par 160 | And, of course there are many, many others\par 161 | that are all around us.\par 162 | One is estimating the rate of global warming.\par 163 | Another is deciding\par 164 | whether a medical procedure is effective.\par 165 | So these are slightly more complicated,\par 166 | and they don't map directly\par 167 | to flipping a biased coin,\par 168 | but they are of a similar type.\par 169 | So, this ends the description of probability and statistics.\par 170 | And we are now going to start diving into\par 171 | the details of how to do probability.\par 172 | Thank you.\par 173 | End of transcript. Skip to the start.\par 174 | POLL\par 175 | } 176 | -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.4 A Puzzle.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - In a previous video, I give you a short explanation\par 4 | for what is probability.\par 5 | And you might wonder,\par 6 | "Why do I really need that, all this math?\par 7 | "Does it really help me in real world situations?"\par 8 | So I'm going to give you here a little puzzle,\par 9 | it's a very natural puzzle, it's not something contrived,\par 10 | and I'm going to ask you to think about\par 11 | how you would want to answer this puzzle, okay?\par 12 | So it's called the three card puzzle.\par 13 | And what I want you to imagine\par 14 | is that you have three cards in a hat, okay?\par 15 | So the cards are, one card is red on one side,\par 16 | and blue on the other side, one card is blue on both sides,\par 17 | and one card is red on both sides.\par 18 | So I have those cards here.\par 19 | Let me show you.\par 20 | Here is the card that has red on one side,\par 21 | blue on the other side.\par 22 | This is the card that has red on both sides.\par 23 | And this is the card that has blue on both sides, okay?\par 24 | Simple enough.\par 25 | So we have these cards in the hat, and we mix them,\par 26 | we mix the cards, and do the following.\par 27 | We pick a card at random, okay?\par 28 | And we put it on the table, all right?\par 29 | So here we have a blue card on the table.\par 30 | And the color of the side\par 31 | that is facing up is, I call it U.\par 32 | It's B or R, okay?\par 33 | So here's the bet that I'm suggesting we make.\par 34 | If the other side of the card has a different color,\par 35 | I will pay you $1.\par 36 | If the other side has the same color, you pay me $1, okay?\par 37 | And I think that this is fair.\par 38 | Why do I think it's fair?\par 39 | Because, suppose it's red, actually it's blue.\par 40 | So what can this card be?\par 41 | It can be either the blue on both sides\par 42 | or the 
blue and red, okay?\par 43 | So there are two options, one is with the other side\par 44 | being red and one is with the other side being blue,\par 45 | so they both have the same probability of happening,\par 46 | so having a bet of one to one is a fair one, okay?\par 47 | So let's see this one.\par 48 | Yeah, actually I got red, so I pay you $1.\par 49 | So to decide what is the probability of winning a dollar\par 50 | or losing a dollar, let's use a Monte Carlo simulation\par 51 | like we did earlier, okay?\par 52 | So we're going to have a little program that generates\par 53 | at random an order for the cards, chooses one of the cards,\par 54 | and then chooses a side for the card, and then prints out\par 55 | the card and the outcome of whether which side wins, okay?\par 56 | So here is our, the outcome of our Monte Carlo simulation,\par 57 | you can actually rerun it.\par 58 | So each time I'm getting slightly different outcomes.\par 59 | But if you look at the numbers that come out down here,\par 60 | it shows that different is 17 times, and same is 33 times.\par 61 | So clearly, different happens\par 62 | much fewer times than same, okay?\par 63 | So I basically have here a game that is,\par 64 | even as simple as it is, it's going to, it's unfair to you\par 65 | and I'm going to gain money on average from playing it.\par 66 | Okay, so as we saw, the simulation does not agree\par 67 | with our argument we made before.\par 68 | The argument must be false.\par 69 | In the simulation, the two sides have the same color\par 70 | about twice the number of times\par 71 | that they have different colors.\par 72 | So they have about twice the probability.\par 73 | So what does that mean?\par 74 | It means that you, if you play this game, you are twice\par 75 | as likely to lose a dollar as you are to win.\par 76 | And because of that, on average, you are going to lose\par 77 | 33 cents per iteration because you lose $1\par 78 | with probability 2/3, and you gain a dollar\par 79 | with probability 1/3.\par 80 | So you're losing 33 cents on each iteration.\par 81 | So here's an alternative argument.\par 82 | Okay, so if we pick card at random, then 2/3 of the time\par 83 | we pick a card where the two sides have the same color,\par 84 | and only 1/3 where the color is different.\par 85 | So that basically explains what we saw.\par 86 | So supposedly now we understand the game, but the problem is\par 87 | the original argument also sounds convincing,\par 88 | but it is wrong.\par 89 | So how can we distinguish between this argument\par 90 | and another argument and say which one is right\par 91 | without doing a simulation?\par 92 | Okay, simulation is fine, but sometimes, as you saw already,\par 93 | running a simulation to answer a probability question\par 94 | is hard, you have to run for a long, long time,\par 95 | and you get only an approximate result, okay?\par 96 | So, to be sure that we need,\par 97 | the argument is correct,\par 98 | we need more formalism.\par 99 | That's the point of this video.\par 100 | We need concepts like outcome and event,\par 101 | and those are things that we're going\par 102 | to start talking about next week.\par 103 | End of transcript. 
Skip to the start.\par 104 | POLL\par 105 | } 106 | -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.5 A poster.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hi, last video\par 4 | we talked a little bit about the history\par 5 | of probability and statistics.\par 6 | And I gave you some pointers\par 7 | to the main ideas that were there.\par 8 | Now, I would like to put these things together\par 9 | with the timeline so that you can\par 10 | have a better sense about how probability\par 11 | and statistics developed over the years.\par 12 | So let's look at this poster.\par 13 | This is a short history of probability and statistics\par 14 | and much of it is based on this book\par 15 | by Ian Hacking, The Emergence of Probability.\par 16 | So if you're interested in more\par 17 | you can go read this book.\par 18 | It's an excellent book.\par 19 | As we said in the last video,\par 20 | there are two parts to\par 21 | probability and statistics.\par 22 | The two threads are\par 23 | repeated games of chance, on the one hand,\par 24 | and the strength of evidence\par 25 | and degrees of belief on the other.\par 26 | That's the two arrows, the red one going down\par 27 | and the green one going down,\par 28 | and time goes in this direction,\par 29 | so this is as time progresses.\par 30 | And statistics and probability\par 31 | in its modern form is pretty much agreed to be\par 32 | starting at the time of\par 33 | Pascal and Fermat\par 34 | that were two mathematicians in 1654.\par 35 | In these correspondences, the main ideas\par 36 | were laid out, of course, there were\par 37 | other people involved, but the main\par 38 | interesting thing is around 1650\par 39 | is when modern mathematical probability and statistics\par 40 | started to be developed.\par 41 | Okay, so that's the timeline that we have\par 42 | with the blue line.\par 43 | All right, so let's look a little bit\par 44 | from what happened before that point.\par 45 | Before that point, you had repeated games of chance,\par 46 | so those were games played with different things,\par 47 | like knucklebones\par 48 | and dice and cards.\par 49 | And those raised questions of the type\par 50 | of what is the right way to split\par 51 | the money when you stop a game early?\par 52 | So that is the part of statistics that we will\par 53 | actually deal with quite a lot in the beginning,\par 54 | which has to do with games of chance.\par 55 | The other part that is much less well-defined,\par 56 | but probably even more important is what do we do\par 57 | when we have a state of uncertainty?\par 58 | We have some evidence towards some conclusions\par 59 | but we are not sure how to weigh different\par 60 | evidences that might be contradictory.\par 61 | So these kinds of things come up in law.\par 62 | So here is the law.\par 63 | It comes up in medicine\par 64 | and it comes up in science\par 65 | and later on, technology.\par 66 | Basically, in modern science and technology,\par 67 | probability and statistics are\par 68 | a necessary part.\par 69 | Now, in public policy, it's also a necessary part.\par 70 | So those things existed from before\par 71 | and in these correspondences,\par 72 | Pascal and Fermat also 
related to them.\par 73 | But it's important to remember that these two things\par 74 | are quite different from each other.\par 75 | One is about evidence and about how people think\par 76 | about evidence and the other is much more mechanical.\par 77 | So it has to do with rolling dice and so on.\par 78 | So of course, the rolling of dice\par 79 | did not stop at that point.\par 80 | We have casinos, also, now.\par 81 | And so these questions are natural,\par 82 | and these questions give rise to the frequentist approach\par 83 | to probability and statistics that was\par 84 | described in the other video.\par 85 | And the best known champion of Frequentist Statistics\par 86 | is Andre Kolmogorov, one of the great\par 87 | mathematicians from Russia,\par 88 | and he invented what's called the axiom of probability.\par 89 | So he was central to this view.\par 90 | And in the more recent, current,\par 91 | still alive is Vladimir Vapnik,\par 92 | who has developed some of the foundations\par 93 | for machine learning.\par 94 | Okay, so this is about the frequentist.\par 95 | Now, in the other direction,\par 96 | in the side of evidence and degrees of belief,\par 97 | there was a different line of development,\par 98 | which is called Bayesian Statistics,\par 99 | and we will talk also about that in a later time,\par 100 | in which you take your belief before you see\par 101 | the evidence and you update them when you see the evidence\par 102 | and the champion of that was Bruno De-finetti.\par 103 | Okay, so you have\par 104 | on the one side this Bayesian Statistics approach\par 105 | and on the other side, the Frequentist approach\par 106 | and there's definitely a tension between the two.\par 107 | So this is a pretty famous picture by now\par 108 | of Vapnik standing around next to a board\par 109 | and in the board it says, "All your Bayes\par 110 | "are belong to us."\par 111 | So this is a clear slight of Bayesian Statistics.\par 112 | All right, but then what develops over time\par 113 | is people that are statistics practitioners,\par 114 | people that actually use probability and statistics\par 115 | in order to solve real world problems\par 116 | and I draw them in the middle here,\par 117 | the practitioners, because they are,\par 118 | in general, not dogmatic to one side or the other.\par 119 | They would use Bayesian Statistics when it's appropriate,\par 120 | Frequentist Statistics when it's appropriate\par 121 | and other heuristics when that's appropriate.\par 122 | Okay, so the father of those methods\par 123 | is Ronald Fisher, who has brought statistics\par 124 | to the sciences and also to the social sciences.\par 125 | And then more recent ones are\par 126 | John Tukey and even more recent\par 127 | is Leo Breiman, the inventor\par 128 | of bagging and cart and other important algorithms.\par 129 | So, just to add a little bit,\par 130 | there's this area that we will also talk about\par 131 | called hypothesis testing and P-values,\par 132 | which is the Frequentist approach to arguing\par 133 | about degrees of belief.\par 134 | So it's an interesting contrast\par 135 | and this actual approach is now very, very\par 136 | commonly used in science to accept or reject papers\par 137 | according to the strength of the evidence\par 138 | that they have.\par 139 | And as I said before, these middle ones\par 140 | are the practitioners, I drew different colored\par 141 | arrows in all kinds of ways\par 142 | because they're each unique.\par 143 | They take various ideas 
from probability and statistics\par 144 | and they apply it to real problems\par 145 | in their own unique way.\par 146 | So the modern version of these practitioners\par 147 | today is Machine Learning\par 148 | and even more recently, Big Data,\par 149 | when we try to apply Machine Learning\par 150 | methods to Big Data.\par 151 | So let's zoom out and to see the complete picture\par 152 | and what was important for me to show you here\par 153 | is that while the methods\par 154 | that people are more familiar today with\par 155 | are like Machine Learning, Big Data,\par 156 | and Neural Networks\par 157 | are very popular,\par 158 | there is actually a very long history\par 159 | and in this long history people developed\par 160 | a lot of very important methods\par 161 | that are worthwhile knowing about.\par 162 | So I hope that that gives you a perspective\par 163 | that will be useful for you for the rest of the course.\par 164 | Thank you very much.\par 165 | End of transcript. Skip to the start.\par 166 | Click to download the history Poster\par 167 | } 168 | -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.5 asset-v1_UCSanDiegoX+DSE210x+3T2017+type@asset+block@Histor.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/1.5 asset-v1_UCSanDiegoX+DSE210x+3T2017+type@asset+block@Histor.pdf -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.7 Week1-IntroMotivation.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/1.7 Week1-IntroMotivation.zip -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.9 Programming Assignment.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/1.9 Programming Assignment.pdf -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/Install Software.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/Install Software.docx -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/Install Software.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/Install Software.pdf 
-------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/Quiz 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/Quiz 1.pdf -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/Week1-IntroMotivation/images/AB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/Week1-IntroMotivation/images/AB.png -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/Week1-IntroMotivation/images/Hitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/Week1-IntroMotivation/images/Hitting.png -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/Week1-IntroMotivation/images/Scorecard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/Week1-IntroMotivation/images/Scorecard.jpg -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/asset-v1_UCSanDiegoX+DSE210x+3T2017+type@asset+block@Week_1_sid.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/asset-v1_UCSanDiegoX+DSE210x+3T2017+type@asset+block@Week_1_sid.pdf -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/2.1. 
Sigma Unknown Example.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - [Narrator] So, let's say that you wanted\par 4 | to find the average length\par 5 | of an elephant's trunk,\par 6 | in this case mature African elephant trunks.\par 7 | So if you want to do that, then what you would do,\par 8 | you would go out to the field and take some measurements\par 9 | of some trunks and, let's say that you took\par 10 | eight measurements, you found eight elephants,\par 11 | you took eight measurements.\par 12 | Let's say they are 5.62, 6.07 and so on,\par 13 | up to 5.48 feet.\par 14 | Okay?\par 15 | Now what you want to do is you want to find\par 16 | the confidence interval for the length of the trunk\par 17 | and let's say specifically you want to find the\par 18 | 95% confidence interval.\par 19 | So you want to find the mean such that if you\par 20 | did this experiment many times, 95% of the time\par 21 | you'll come up with an interval that includes the mean.\par 22 | So, first what you do is you find the critical t,\par 23 | and remember the formula is the critical value t,\par 24 | p with n-1 degrees of freedom is, we look at the inverse\par 25 | of the F, or the percentile point function of the\par 26 | t distribution with n-1 degrees of freedom\par 27 | evaluated at (1+p)/2, where p is 95%,\par 28 | so that's F7 because we have eight measurements,\par 29 | so we have seven degrees of freedom,\par 30 | inverse evaluated at 0.975, and when you do that, that gives you\par 31 | 2.3646, and to calculate this you use\par 32 | the t-distribution's ppf, the inverse of the cdf,\par 33 | evaluated at 0.975 with seven degrees of freedom,\par 34 | which gives you this value, so this is 2.3646.\par 35 | Then you calculate the sample mean,\par 36 | which is just the average of these values,\par 37 | so the sample mean in this case is 6.095.\par 38 | Then you calculate the sample variance,\par 39 | which as you remember you take each of those values,\par 40 | subtract the sample mean, square it,\par 41 | add them up and normalize by n-1,\par 42 | by seven, not by eight,\par 43 | because we're using the S correction.\par 44 | When you do that you get that S squared is 0.1705\par 45 | and therefore S, the sample standard deviation,\par 46 | is going to be 0.4130.\par 47 | Then, you calculate the margin of error,\par 48 | which is simply the critical t,\par 49 | multiplied by our estimate of the standard deviation,\par 50 | normalized by the square root of n,\par 51 | which is the number of samples that we take.\par 52 | When you do that you get, you get here,\par 53 | t, which we found to be 2.36,\par 54 | times S, which is this number,\par 55 | divided by square root of eight, is 0.3453.\par 56 | And then finally your confidence interval is just the\par 57 | sample mean plus/minus the margin of error,\par 58 | so plus/minus this value around the sample mean 6.095,\par 59 | so that gives you the interval from\par 60 | 5.7497 up to 6.4403.\par 61 | Okay, so that's all we need to do.\par 62 | And just want to end with\par 63 | just a few comments and observations.\par 64 | So first, when n is large, as we saw both using graphs\par 65 | and using analytical calculation,\par 66 | the t-distribution\par 67 | converges to the standard normal
distribution,\par 68 | and therefore the critical t value,\par 69 | with n-1 degrees of freedom, as n increases is going to\par 70 | converge to the critical z value,\par 71 | evaluated set point p.\par 72 | Also note that our estimate S of the standard deviation,\par 73 | the sample standard deviation, will converge to sigma.\par 74 | When you combine these two together you see\par 75 | that you can just use z-based techniques,\par 76 | and just use S instead of sigma when you don't know sigma,\par 77 | so just use the estimate, and these two techniques will,\par 78 | in the limit, as the normal sample increases,\par 79 | will give you the same value.\par 80 | Also note that when n is small, as is often the case,\par 81 | and this was the case that Gosset was concerned with,\par 82 | then the t-distribution is more accurate,\par 83 | and notice that it will yield a larger margin of error\par 84 | than when sigma is known,\par 85 | that's because there is more uncertainty.\par 86 | And last thing I want to say is that we assume that\par 87 | Xi's are normal, if you go back in the slides,\par 88 | but in reality, of course, the samples are not necessarily\par 89 | going to be normal, so that calculations that we did\par 90 | are valued mostly when the samples\par 91 | that we take are roughly normally distributed.\par 92 | So, let's summarize.\par 93 | We looked at confidence intervals when sigma is not known,\par 94 | we said we need to estimate sigma, which is natural,\par 95 | but we also need to replace the standard normal distribution\par 96 | by the student's t-distribution, which is derived\par 97 | by William Gosset as he was working from Guinness,\par 98 | trying to improve the production of beer, as we see here,\par 99 | and we gave step-by-step instructions\par 100 | for calculating the confidence interval, and an example.\par 101 | So next time we're going to talk about hypothesis testing,\par 102 | see you then.\par 103 | End of transcript. Skip to the start.\par 104 | POLL\par 105 | } 106 | -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/HW_10.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | " **IMPORTANT: ** When submitting this homework notebook, please modify only the cells that start with:<\/font>\n", 8 | "\n", 9 | "```python\n", 10 | "# modify this cell\n", 11 | "```" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 18, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from scipy import stats\n", 21 | "from math import sqrt" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Confidence Interval" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "For a sample with size large enough, by central limit theorem we can assume its mean follows normal distribution. And if we also know the standard deviation of the population, we are able to calculate a confidence interval to estimate the population mean." 
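The sigma-unknown procedure in the lecture transcript above maps almost line for line onto scipy. The sketch below follows the same steps (critical t from the percent point function of the t distribution with n-1 degrees of freedom, then the sample mean, the sample standard deviation with the n-1 correction, and the margin of error). The eight trunk lengths are hypothetical placeholders, since the transcript only quotes a few of them, so the printed interval will not reproduce the lecture's 5.7497 to 6.4403 exactly.

```python
# Sketch of the t-based 95% confidence interval from the sigma-unknown lecture.
# The data values are hypothetical stand-ins for the eight measurements.
import numpy as np
from scipy import stats

x = np.array([5.62, 6.07, 6.64, 5.91, 6.30, 6.55, 6.19, 5.48])  # placeholder sample
n = len(x)
p = 0.95

t_crit = stats.t.ppf((1 + p) / 2, df=n - 1)  # critical t with n-1 = 7 degrees of freedom
m = x.mean()                                 # sample mean
s = x.std(ddof=1)                            # sample standard deviation (divide by n-1)
r = t_crit * s / np.sqrt(n)                  # margin of error

print((m - r, m + r))                        # the 95% confidence interval
```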
36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Problem 1" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "A confidence interval is usually given by sample mean $m$ plus and minus a margin of error $r$:\n", 50 | "$$[m-r,m+r]$$\n", 51 | "The confidence interval is larger (less precise) for large confidence level, and smaller (more precise) for small confidence level.\n", 52 | "\n", 53 | "For this problem, you are asked to write a function **Error** that calculates the margin of error $r$ given sample size $n$, confidence level $p$ and standard deviation of the population $s$." 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | " **Code:**<\/font>\n", 61 | "```python\n", 62 | "Error(40,0.95,20)\n", 63 | "\n", 64 | "Error(40,0.95,10) \n", 65 | "```\n", 66 | "\n", 67 | "\n", 68 | " **Output**<\/font>\n", 69 | "```\n", 70 | "6.1979503230456148\n", 71 | "3.0989751615228074\n", 72 | "```" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 19, 78 | "metadata": { 79 | "collapsed": true, 80 | "nbgrader": { 81 | "grade": false, 82 | "locked": false, 83 | "solution": false 84 | } 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "# modify this cell\n", 89 | "\n", 90 | "def Error(n,p,s):\n", 91 | " # inputs: sample size n, confidence level p and standard deviation s\n", 92 | " # output: margin of error r\n", 93 | " \n", 94 | " #\n", 95 | " # YOUR CODE HERE\n", 96 | " #\n" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 47, 102 | "metadata": { 103 | "nbgrader": { 104 | "grade": true, 105 | "grade_id": "ex1", 106 | "locked": true, 107 | "points": "5", 108 | "solution": false 109 | } 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "# Check Function\n", 114 | "assert abs(Error(60,0.9,20)-4.2469938027546128) < 10**-5 \n", 115 | "assert abs(Error(60,0.9,10)-2.1234969013773064) < 10**-5\n", 116 | "\n", 117 | "#\n", 118 | "# AUTOGRADER TEST - DO NOT REMOVE\n", 119 | "#\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "# Problem 2" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "For this problem, you are asked to write a function **Confidence** that calculates the confidence level $p$ given sample size $n$, margin of error $r$ and standard deviation of the population $s$." 
134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | " **Code:**<\/font>\n", 141 | "```python\n", 142 | "Confidence(40,6,20)\n", 143 | "\n", 144 | "Confidence(40,8,20) \n", 145 | "```\n", 146 | "\n", 147 | "\n", 148 | " **Output**<\/font>\n", 149 | "```\n", 150 | "0.94222042887640267\n", 151 | "0.98858796361399826\n", 152 | "```" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 48, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "# modify this cell\n", 164 | "\n", 165 | "def Confidence(n,r,s):\n", 166 | " # inputs: sample size n, margin of error r, and standard deviation s\n", 167 | " # output: confidnce level r\n", 168 | " \n", 169 | " #\n", 170 | " # YOUR CODE HERE\n", 171 | " #\n" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 49, 177 | "metadata": { 178 | "collapsed": true, 179 | "nbgrader": { 180 | "grade": true, 181 | "grade_id": "ex2", 182 | "locked": true, 183 | "points": "5", 184 | "solution": false 185 | } 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "# Check Function\n", 190 | "assert abs(Confidence(60,5,20)-0.94719248858388649) < 10**-5 \n", 191 | "assert abs(Confidence(60,6,20)-0.97986324844965367) < 10**-5\n", 192 | "\n", 193 | "#\n", 194 | "# AUTOGRADER TEST - DO NOT REMOVE\n", 195 | "#\n" 196 | ] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "Python 2", 202 | "language": "python", 203 | "name": "python2" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 2 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text\/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython2", 215 | "version": "2.7.12" 216 | }, 217 | "toc": { 218 | "colors": { 219 | "hover_highlight": "#DAA520", 220 | "navigate_num": "#000000", 221 | "navigate_text": "#333333", 222 | "running_highlight": "#FF0000", 223 | "selected_highlight": "#FFD700", 224 | "sidebar_border": "#EEEEEE", 225 | "wrapper_background": "#FFFFFF" 226 | }, 227 | "moveMenuLeft": true, 228 | "nav_menu": { 229 | "height": "12px", 230 | "width": "252px" 231 | }, 232 | "navigate_menu": true, 233 | "number_sections": true, 234 | "sideBar": true, 235 | "threshold": 4, 236 | "toc_cell": false, 237 | "toc_section_display": "block", 238 | "toc_window_display": false, 239 | "widenNotebook": false 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 2 244 | } -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/Problem Set 10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 10 Confidence Intervals and Hypothesis Testing/Problem Set 10.pdf -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/Programming Assignment.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 10 Confidence Intervals and Hypothesis Testing/Programming Assignment.pdf 
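The two graded stubs in HW_10.ipynb above are intentionally left blank. Assuming the z-based formulation described in the notebook's introduction (population standard deviation known, normally distributed sample mean), one way they could be filled in is sketched below; this is an illustrative solution, not the course's reference implementation.

```python
# One possible way to fill in the HW_10 stubs, assuming the z-based
# confidence interval described in the notebook (not an official solution).
from math import sqrt
from scipy import stats

def Error(n, p, s):
    # margin of error r for confidence level p: r = z_{(1+p)/2} * s / sqrt(n)
    z = stats.norm.ppf((1 + p) / 2)
    return z * s / sqrt(n)

def Confidence(n, r, s):
    # invert the relation above: p = 2 * Phi(r * sqrt(n) / s) - 1
    z = r * sqrt(n) / s
    return 2 * stats.norm.cdf(z) - 1

print(Error(40, 0.95, 20))    # ~6.198, matching the notebook's sample output
print(Confidence(40, 6, 20))  # ~0.9422, matching the notebook's sample output
```

With this formulation the check-cell values also agree, for example Error(60, 0.9, 20) comes out to about 4.2470.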
-------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_1.pdf -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_2.pdf -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_3.pdf -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_4.pdf -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_5.pdf -------------------------------------------------------------------------------- /Week 2 Sets/1 Week_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 2 Sets/1 Week_2.pdf -------------------------------------------------------------------------------- /Week 2 Sets/2.3 Venn Diagrams Visualizing Sets.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hello and welcome back.\par 4 | So now that we have defined and created sets,\par 5 | the next thing we're going to do is visualize them,\par 6 | and advertise them and as we all know this is\par 7 | best done using Instagram.\par 8 | Except that sets were defined the way we're\par 9 | introducing them now a couple hundred years ago,\par 10 | and at that time the hottest thing was not Instagram,\par 11 | but rather a diagram, and specifically a Venn diagram.\par 12 | And Venn diagrams are named after John 
Venn.\par 13 | Who besides being a mathematician and a priest,\par 14 | was also an avid cricket player and he even designed\par 15 | machines that collect cricket balls.\par 16 | And so it perhaps not so surprising that when\par 17 | he wanted to do visualized sets,\par 18 | he visualized them as regions that often use\par 19 | look round like a ball, like this ball.\par 20 | And he visualized sets as elements\par 21 | as points inside those regions.\par 22 | So for example if you consider the sets here one,\par 23 | then you would have all the integers maybe,\par 24 | minus one, zero, one, two, and we put this circle\par 25 | around zero and one, meaning zero and one intersect\par 26 | and minus one and two are not intersect.\par 27 | Or if the elements are whatever they could be,\par 28 | we can put the circle around the elements that\par 29 | belong to the set like here, so these are elements\par 30 | in the set and the elements outside are not in a set.\par 31 | Or if we don't even want to even draw the elements,\par 32 | we can just draw the circle and we know that whatever is\par 33 | inside will be intersect, and whatever is outside\par 34 | this region will not intersect.\par 35 | And a single set gets a little boring, so one thing led\par 36 | to another and he looked at two sets.\par 37 | So here are a collection of points, and if you have\par 38 | a set A so these will be the elements in a set A,\par 39 | these are all the points in A.\par 40 | And here we have another set, B, so all these four\par 41 | points here will be four points in B.\par 42 | And then if you look in this region here where the two sets\par 43 | overlap, those are the points that are both A and B,\par 44 | and here are the points that are neither A nor B.\par 45 | And of course things get much more interesting still if you\par 46 | have three sets, and so you have three sets, A, B, and C.\par 47 | And these are the points that are in A,\par 48 | but not in B, nor in C.\par 49 | And these are the points that are in A and B, but not in C.\par 50 | And these are the points that are A, B, and C.\par 51 | And outside are the points here are not in any set.\par 52 | So why are we interested in Venn diagrams?\par 53 | Because everyone knows that a\par 54 | picture is worth a thousand words.\par 55 | And when it comes to sets and probability,\par 56 | Venn diagram is going to allow us to replace\par 57 | very long proofs by visual and definitions by visual\par 58 | definitions and visual proofs.\par 59 | So in a sense, this picture is\par 60 | going to be worth a whole dictionary.\par 61 | And if you don't trust me, then maybe you'll\par 62 | believe Jon Stewart who was the host of The Daily Show,\par 63 | and together with his show mates, they wrote a book\par 64 | called Earth, which is supposed to introduce visitors\par 65 | to our planet, to Earth.\par 66 | And in this book, they have a section\par 67 | called Gods of Science.\par 68 | And John Venn gets a honorable mention.\par 69 | And he gets it for two things.\par 70 | First, for getting something right.\par 71 | And then equally important, for attaching his name to it.\par 72 | So if you look at this book, and you will find what\par 73 | according to them was the first Venn diagram,\par 74 | and here it is, it's the original Venn diagram.\par 75 | It consists of the set that contains the names of people,\par 76 | and the other set that contains the names of diagrams,\par 77 | and guess what's in the intersection,\par 78 | guess what's in both of those 
sets?\par 79 | John Venn, so if you want to see how to implement\par 80 | Venn diagrams in Python, then you'll need to\par 81 | download matplotlib_venn package.\par 82 | And there's a link in the notebook,\par 83 | and once you do that you can write\par 84 | import matplotlib.pyplot as plt\par 85 | and import matplotlib_venn as venn\par 86 | and then you need to find two sets\par 87 | and it has to be one, two, three.\par 88 | For example, T to be this set, and then to plot\par 89 | you write venn.venn2, two stands for plotting and\par 90 | Venn diagram of two sets, and then you can label them.\par 91 | You can incorporate the sets S and T and you can label\par 92 | them S, S, and T, otherwise it would just be like A and B.\par 93 | Then you just plot dot show them.\par 94 | And what you'll get is this, you'll get the set S,\par 95 | because of the label S.\par 96 | And the set T, because of this.\par 97 | And then you'll get the relative sizes,\par 98 | so you'll see which one is larger and which is smaller.\par 99 | And you can see the intersection.\par 100 | And you can also do Venn diagrams of three by\par 101 | venn.venn3 and you'll write three sets, S, T, and U.\par 102 | Whatever the set U is going to be.\par 103 | You set the labels that you want,\par 104 | and then you'll get a Venn diagram of three sets.\par 105 | All right, so we talked about set visualizations,\par 106 | specifically used Venn diagrams, we decided\par 107 | that we can use them to visualize, to think, to prove,\par 108 | and to understand sets and that's going to be very\par 109 | useful in the future for us.\par 110 | And next time we're going to talk about set relations.\par 111 | See you then.\par 112 | End of transcript. Skip to the start.\par 113 | Discussion\par 114 | } 115 | -------------------------------------------------------------------------------- /Week 2 Sets/2.7 Russell's Paradox Russell's Paradox.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - So this is our fourth\par 4 | and last video about sets,\par 5 | so I thought I'll show you\par 6 | how even the simple topics we discussed\par 7 | can lead to interesting and surprising consequences.\par 8 | In this case an interesting paradox.\par 9 | The paradox is due to Bertrand Russell,\par 10 | who was a British mathematician,\par 11 | philosopher and author,\par 12 | and amongst other things,\par 13 | he received a Nobel prize for literature.\par 14 | He was also a well known wit\par 15 | who had something to say about almost anything.\par 16 | And he had a special place in his heart\par 17 | for human intelligence.\par 18 | So here are a few of his quotes.\par 19 | He said that in democracy, fools have the right to vote,\par 20 | while in dictatorships fools have the right to rule.\par 21 | That most people would rather die than think,\par 22 | and in fact, most do.\par 23 | And perhaps most relevant to our course,\par 24 | he said that men are born ignorant, not stupid,\par 25 | but it's education that makes them stupid.\par 26 | So with that, let's educate ourselves about his paradox.\par 27 | Let's start with a small review about sets in sets.\par 28 | Remember that sets can be elements.\par 29 | For example, here is a set, and another set,\par 30 | and both of them are elements of the set\par 31 | that contains 
both of them.\par 32 | Also, you remember that every set\par 33 | can be a subset of itself.\par 34 | For example, the empty set is contained in itself.\par 35 | The more interesting question is whether\par 36 | a set can belong to, or be an element of itself.\par 37 | Namely, can a set S be an element of the set S?\par 38 | Now, typically sets do not belong to themselves.\par 39 | For example, if you take the set\par 40 | that contains just 0,\par 41 | it has a single element, which is the number 0.\par 42 | So the set zero is not a member of,\par 43 | or an element of the set 0.\par 44 | Similarly, the empty set contains no elements,\par 45 | and therefore in particular the empty set\par 46 | is not an element of the empty set.\par 47 | On the other hand, some sets do contain themselves,\par 48 | and do belong to themselves.\par 49 | So consider for example NT, non Trump,\par 50 | the set of anything that is not Trump.\par 51 | This set contains several elements.\par 52 | For example, Hillary Clinton is not Trump, so she's in.\par 53 | The number zero is not Donald Trump, so it is in the set.\par 54 | The set \{1,2\} is not Trump, so it's also in NT.\par 55 | And, when you think about it,\par 56 | lots of element are in the set.\par 57 | In fact, everything except Donald Trump is in the set.\par 58 | So, in particular, the set NT itself,\par 59 | it is not Donald Trump.\par 60 | So NT itself is in the set.\par 61 | So we therefore get that the set NT\par 62 | does belong, or is an element, of itself.\par 63 | So that's surprising, and indeed\par 64 | can lead to interesting consequences.\par 65 | One of them is that if you have a set that contains itself,\par 66 | for example if you take this slide,\par 67 | and if it contains itself,\par 68 | then the smaller slide is the same as the original slide,\par 69 | and therefore it contains itself again.\par 70 | And the smaller slide contains itself once more, and so on.\par 71 | So you get an infinite recursion that way.\par 72 | But don't worry about that.\par 73 | The only thing we need to know\par 74 | is that some sets are elements of themselves.\par 75 | For example the set NT is an element of itself.\par 76 | While other sets, like the set containing just 0,\par 77 | is not an element of itself.\par 78 | That's the only thing we need to know.\par 79 | And you notice this is something we could have discussed\par 80 | even after the first lecture.\par 81 | You don't need anything except that\par 82 | you have a set that contains itself,\par 83 | and a set that does not contain itself.\par 84 | So with that, what is Russell's Paradox?\par 85 | It is that you can define a set that cannot exist.\par 86 | So Russell considered the following set R.\par 87 | It's the set of all sets\par 88 | that don't belong to themselves.\par 89 | So in other words, it's the collection of all S,\par 90 | such that S is not an element of itself.\par 91 | Since this is our main definition,\par 92 | it may be worth thinking about it.\par 93 | We define a set such that\par 94 | if a set is an element of itself,\par 95 | then it is not in R.\par 96 | And if the set is not an element of itself,\par 97 | then it is in R.\par 98 | So now, clearly the set that contains 0,\par 99 | as we saw before, is not an element of itself.\par 100 | We saw it in the previous slide.\par 101 | And therefore, by our definition, it is in R.\par 102 | Conversely, NT was an element of itself,\par 103 | and therefore by our definition here, it is not in R.\par 104 | So the question is, 
if you take R itself,\par 105 | is R in R, or is R not in R?\par 106 | So clearly, one of those two things must hold.\par 107 | Either R is in R, or R is not in R.\par 108 | And what we are going to show\par 109 | is that both of those will lead to a contradiction.\par 110 | So this cannot happen.\par 111 | So imagine first that R is in R,\par 112 | namely R is an element of itself.\par 113 | Then by our definition, R cannot be in R,\par 114 | because the set R just contains the elements\par 115 | that are not in itself.\par 116 | So if R is in R, then R is not in our set,\par 117 | in Russell's set.\par 118 | And this means that R is both in R, and not in R,\par 119 | which is a contradiction.\par 120 | Now, if R is not in R, then,\par 121 | by the definition of our set,\par 122 | R is in Russell's set, and therefore R is in R,\par 123 | so again we get that R is both not in R, and in R,\par 124 | which is again a contradiction.\par 125 | And that means that if R existed,\par 126 | then it would both be that R is in R,\par 127 | and R would not be in R.\par 128 | Both of those would hold, and that's impossible,\par 129 | and therefore that means that R has been defined,\par 130 | but it cannot exist.\par 131 | So that's the contradiction.\par 132 | Now let's see what happened.\par 133 | So we defined the sets recursively,\par 134 | and that was the source of our problem.\par 135 | Because when we looked at sets\par 136 | that contained sets that contained themselves,\par 137 | then we got an infinite recursion.\par 138 | And when we considered the set\par 139 | that included sets that do not contain themselves,\par 140 | then we got a contradiction.\par 141 | So either way, once we got to recursive definitions,\par 142 | we were not in Kansas anymore.\par 143 | But don't worry, we're still\par 144 | in the continental United States,\par 145 | so we keep things simple.\par 146 | And we'll only consider sets\par 147 | that are not recursively defined,\par 148 | so we'll avoid sets that are self-referential.\par 149 | And also this material is not needed for the exam,\par 150 | although I think it's something\par 151 | that you probably want to know.\par 152 | Now this paradox has several variations.\par 153 | For example, you have a barber\par 154 | who shaves exactly the people\par 155 | who don't shave themselves.\par 156 | And the question is, does the barber shave himself,\par 157 | or does he not?\par 158 | Because if he shaves himself,\par 159 | then he shaves someone who does shave himself.\par 160 | And if he doesn't shave himself,\par 161 | then he doesn't shave everyone\par 162 | who does not shave themselves.\par 163 | And a similar paradox is, "This sentence is a lie."\par 164 | Is this sentence true, or not true?\par 165 | Even Pinocchio cannot figure this out.\par 166 | Or, "This is not a pipe," by Dali.\par 167 | And just in case you think these paradoxes\par 168 | are just philosophical issues,\par 169 | maybe you want to consider this.\par 170 | - I would never want to belong to any club\par 171 | that would have someone like me for a member.\par 172 | That's the key joke of my adult life\par 173 | in terms of my relationships with women.\par 174 | - I hope you learned, and enjoyed\par 175 | the videos, and next time\par 176 | we'll start talking about counting.\par 177 | Thank you.\par 178 | End of transcript. 
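The matplotlib_venn calls described step by step in the Venn-diagrams lecture above can be collected into a short runnable sketch. The package must be installed first, as the lecture notes, and the contents of S, T, U and the labels are illustrative choices.

```python
# Sketch of the Venn-diagram plotting walked through in the Venn-diagrams lecture.
import matplotlib.pyplot as plt
import matplotlib_venn as venn

S = {1, 2, 3}
T = {3, 4, 5}          # example second set; the lecture leaves T up to the reader

venn.venn2([S, T], set_labels=('S', 'T'))   # two-set diagram with relative sizes
plt.show()

U = {1, 5, 6}
venn.venn3([S, T, U], set_labels=('S', 'T', 'U'))   # three-set version
plt.show()
```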
Skip to the start.\par 179 | POLL\par 180 | } 181 | -------------------------------------------------------------------------------- /Week 2 Sets/Problem Set 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 2 Sets/Problem Set 2.pdf -------------------------------------------------------------------------------- /Week 2 Sets/Programming Assignment.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 2 Sets/Programming Assignment.pdf -------------------------------------------------------------------------------- /Week 2 Sets/Quiz 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 2 Sets/Quiz 2.pdf -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/1 Sets.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 Hello, and welcome back.\par 4 | So we're done talking about just sets on their own,\par 5 | and we're going to start talking about counting,\par 6 | and don't worry, we won't be counting sheep\par 7 | and fall asleep because we'll be counting sets,\par 8 | and in fact, our goal in these next few lectures\par 9 | is going to be to develop techniques\par 10 | so that actually we don't need to count almost at all.\par 11 | If we get it right, we will never need to count\par 12 | to more than just three.\par 13 | So, as we said, we're counting sets,\par 14 | so we need to describe what we're counting.\par 15 | The number of elements in a set S is called its size,\par 16 | or cardinality, and is denoted by these two vertical bars,\par 17 | the size of S, or pound S, again, the size of S.\par 18 | Let's do a few examples.\par 19 | The set of bits consists of zero and one,\par 20 | and it's cardinality is two.\par 21 | It has two elements.\par 22 | Coin has two sides, heads and tails.\par 23 | Its size is also two, and a die has six faces,\par 24 | one up to six, there are 10 digits,\par 25 | and there are 26 letters,\par 26 | and the empty set, noted like so,\par 27 | has no elements, so its cardinality, or its size, is zero,\par 28 | and on the other extreme, the set of integers Z,\par 29 | or the set of natural numbers N,\par 30 | or the set of positive integers,\par 31 | or the set of rational numbers\par 32 | all have infinitely many elements,\par 33 | and the reals also have infinitely many elements,\par 34 | or the size is also infinite.\par 35 | There's actually a difference between these two infinites,\par 36 | but we're not going to worry about that right now.\par 37 | These are called countably infinite,\par 38 | and these are called uncountably infinite,\par 39 | but let's not worry about them.\par 40 | Right?\par 41 | Let's look at two other sets and their sizes.\par 42 | Remember we discussed integer intervals?\par 43 | For m less than or 
equal to n,\par 44 | we define m dot dot n to be the set of integers\par 45 | from m to n, inclusive of both m and n,\par 46 | and for example, three dot dot five\par 47 | is the set three, four, five,\par 48 | and the size of m dot dot n is going to be\par 49 | n minus m plus one, and the question often asked\par 50 | is why the plus one, so first of all,\par 51 | we can try small numbers like five dot dot five.\par 52 | That's just the set five, so it's one,\par 53 | which is five minus five plus one,\par 54 | or one dot dot three.\par 55 | One, two, three.\par 56 | That size is three, and that's three minus one,\par 57 | three minus one, which will give us two plus one,\par 58 | which will give us three, and as you see,\par 59 | it's as easy as one, two, three,\par 60 | and so far we didn't need to count more than three,\par 61 | and to see why we're adding the one,\par 62 | let's look, for example, this interval\par 63 | three, four, five.\par 64 | It's the interval here.\par 65 | So you see we can have three points like that,\par 66 | but when we count five minus three,\par 67 | we're not counting the number of points or elements,\par 68 | we're just counting the distance\par 69 | between five and three, which is two,\par 70 | and what we're counting is\par 71 | the length of this segment from three to five,\par 72 | which is the number of unit intervals here,\par 73 | but what we're interested is in the number of points\par 74 | and there's one more point in the interval\par 75 | because there's a point to the left of every interval\par 76 | and then there's a point to the right\par 77 | of the right-most interval, so we need to add one.\par 78 | So five minus three plus one, which will give us three.\par 79 | Also, we talked about integer multiples.\par 80 | Let me just for clarity, we said\par 81 | that the set n is the set 1 up to n.\par 82 | That'll be denoted by open brace, open parentheses here\par 83 | and then a square bracket on the right\par 84 | to just indicate even more strongly\par 85 | that we're not using zero, so it's one up to n,\par 86 | just another notation.\par 87 | The set of integers between 1 and n\par 88 | that are multiples of d is denoted like that.\par 89 | It's a set of integers between one and n\par 90 | that are divisible by d or the multiples of d.\par 91 | For example, the set of integers that are multiples of three\par 92 | between one and eight is three and six,\par 93 | which is one times three and then two times three,\par 94 | and the set of integers between one and nine\par 95 | that are divisible by three, three, six, and nine,\par 96 | which is one times three, two times three,\par 97 | and three times three, and if you look\par 98 | at the size of the set, as you see,\par 99 | that it's n divided by d, and then we're taking\par 100 | the floor, the largest integer that is\par 101 | less than or equal to this ratio.\par 102 | For example, for integers between one and eight\par 103 | that are divisible by three, it's the floor\par 104 | of eight over three, which is two,\par 105 | and that's because we have one times three\par 106 | and two times three here, and the largest\par 107 | we can multiply is going to be, in this case,\par 108 | the floor of eight over three,\par 109 | and when we have the size of the integers\par 110 | that are multiples of three between one and nine,\par 111 | it's going to be the floor of nine over three,\par 112 | and that's three.\par 113 | That's because we have one times three,\par 114 | two times three, and 
three times three,\par 115 | and again, the number three comes from\par 116 | nine divided by three, and we take the floor of that.\par 117 | Now, in Python,\par 118 | set size is\par 119 | expressed using the len function.\par 120 | For example, we can write print len\par 121 | of minus one and one.\par 122 | It will give us two, and we can do other things,\par 123 | for example, we want to incorporate\par 124 | the sum of the elements in a set,\par 125 | we'll write sum, so print the sum of minus one, one\par 126 | will give us zero.\par 127 | To find the minimum of several elements,\par 128 | using the min function, print min of minus one and one,\par 129 | will give us minus one, and the maximum,\par 130 | using the max function, print the max of minus one and one,\par 131 | will give us one, and we can also do these calculations\par 132 | including sum, including length,\par 133 | by just iterating over elements or writing loops,\par 134 | so we say, for some variable in some set,\par 135 | do something, so, for example, here\par 136 | A is one, two, three, and if one, two,\par 137 | calculate the sum of A, and just write print sum of A,\par 138 | it will give us six, or we can say total is zero,\par 139 | and for i in A, total plus equals i,\par 140 | and print total, and that will give us, again, six.\par 141 | With that, we're going to continue next time\par 142 | by calculating the size of different sets\par 143 | like unions and intersections and so on.\par 144 | See you then.\par 145 | End of transcript. Skip to the start.\par 146 | I have completed this\par 147 | Previous\par 148 | } 149 | -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/11 Binomial Coefficient.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 POLL\par 4 | \par 5 | Assume you are choosing a class schedule and you need to take 2 science classes and 2 art classes. 
If there are 6 science classes and 5 art classes available to choose from, how many different class schedules can you come up with?\par 6 | \par 7 | \tab\par 8 | 25\par 9 | \par 10 | \tab\par 11 | 55\par 12 | \par 13 | \tab\par 14 | 60\par 15 | \par 16 | \tab\par 17 | 150\par 18 | } 19 | -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/4 Mix It Up.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 Hello, and welcome back.\par 4 | So, in the last video we talked about\par 5 | the size of general unions and now\par 6 | we would like to discuss Cartesian Products.\par 7 | So,\par 8 | if you have a set, for example, \{a,b\}\par 9 | and another set, \{1,2,3\},\par 10 | the set \{a,b\} is size two,\par 11 | the set \{1,2,3\} is size three,\par 12 | the size of the Cartesian Product \{a,b\}\par 13 | times \{1,2,3\}, is the size we know\par 14 | of this set here A one, A two, A three,\par 15 | B one, B two, B three, as we know this is\par 16 | the Cartesian Product of these two sets\par 17 | and the size of this set where we have\par 18 | three elements in the first row\par 19 | and\par 20 | so three elements in the first row\par 21 | and three elements in the second row,\par 22 | and these elements are disjoint.\par 23 | The elements here are different from\par 24 | the elements here, so therefore by the addition rule,\par 25 | the number of elements is three plus three\par 26 | which is the same as two times three which is six.\par 27 | Okay.\par 28 | And\par 29 | here is another interpretation of the same result.\par 30 | We have here all the elements,\par 31 | A one, A two, A three,\par 32 | B one, B two, B three,\par 33 | we have two rows and three columns,\par 34 | so the total number of elements\par 35 | that we have is two times three which is six sets.\par 36 | Essentially the same argument but maybe\par 37 | this gives more for an idea of volume\par 38 | oh, and area.\par 39 | Alright.\par 40 | So like this area is two times three which is six.\par 41 | Now,\par 42 | in general we can see that the size\par 43 | of a Cartesian Product is the product\par 44 | of the set sizes.\par 45 | So if you have two sets A and B,\par 46 | then the size of the Cartesian Product\par 47 | A times B is just going to be\par 48 | the size of A times the size of B.\par 49 | 'Kay, and this is called the product rule,\par 50 | which we'll denote by this X.\par 51 | And as you can see it's another\par 52 | application of the addition rule\par 53 | because we just add things,\par 54 | you know the size of A times.\par 55 | So here we added them twice,\par 56 | so it's just like a generalization\par 57 | if you want of the addition rule.\par 58 | Okay.\par 59 | Here's some applications of Cartesian Products.\par 60 | Let's start with\par 61 | the ubiquitous tables.\par 62 | So every table as we had said,\par 63 | that we use, is a Cartesian Product\par 64 | it has records which are\par 65 | viewed as rows.\par 66 | Here we have five records\par 67 | and it has attributes, here we have,\par 68 | which are viewed as columns,\par 69 | here we have three, and the number of cells that we have\par 70 | is the size of this Cartesian Product.\par 71 | Now we use\par 72 | multiplication principle is going,\par 73 | or the product 
rule, it's going to be\par 74 | five times three which is 15 cells.\par 75 | If you have more than two sets,\par 76 | so\par 77 | A times B, the Cartesian Product is set of\par 78 | all (a,b) such that a is in A\par 79 | and b is in B, and this gives\par 80 | you a rectangle,\par 81 | like that, and the size of the rectangle\par 82 | as we saw is just the product\par 83 | of the sizes of these two sets.\par 84 | If we have Cartesian Product of three sets,\par 85 | it's the collection of all (a,b,c) such that\par 86 | a is in A, b is in B, and c is in C,\par 87 | and that will, instead of a rectangle,\par 88 | is going to give us like a cuboid.\par 89 | Something that looks like that,\par 90 | and the number of elements there is\par 91 | going to be the size of this Cartesian Product\par 92 | is the number of elements.\par 93 | It's the size of A times the size of B times the size of C.\par 94 | 'Kay.\par 95 | So let's see an example.\par 96 | We have someone who likes\par 97 | to dress differently.\par 98 | And they have three shirts, let's say two pants,\par 99 | and five pairs of shoes.\par 100 | And one wonders how many days\par 101 | they can go with different outfits.\par 102 | So how many different outfits can they have?\par 103 | So notice that an outfit is just a triple.\par 104 | It's a shirt,\par 105 | a pant,\par 106 | a pair of pants,\par 107 | and a pair of shoes.\par 108 | So for example, and outfit may be\par 109 | the yellow shirt, the blue pants,\par 110 | and the red shoes, 'kay?\par 111 | So an outfit is just an order triple.\par 112 | And the set of outfits,\par 113 | so this is an outfit,\par 114 | and the set of outfits\par 115 | is the set of three tuples, or the triples,\par 116 | which is just a Cartesian Product of\par 117 | all shirts times all pants times all shoes.\par 118 | It's a Cartesian Product.\par 119 | And the size of the number of outfits,\par 120 | mainly the number of different outfits that there are,\par 121 | is by the product rule, is going to be\par 122 | the number of shirts times the number of pants\par 123 | times the number of shoes.\par 124 | And in this case, it's going to be three\par 125 | times two times five.\par 126 | Three\par 127 | times two times five,\par 128 | which is 30.\par 129 | 'Kay.\par 130 | And\par 131 | you know, you might think how this is again\par 132 | not very useful just counting the number of outfits.\par 133 | Maybe the following example, it will convince you\par 134 | that this is indispensable.\par 135 | So let's say you go to Costco\par 136 | and you want\par 137 | to\par 138 | buy tissue paper.\par 139 | 'Kay.\par 140 | Bathroom tissue.\par 141 | And so you get, you know, a package that\par 142 | looks a little bit like that\par 143 | and are trying to count how many\par 144 | rolls there are is hopeless.\par 145 | But what you can do is you can count and see\par 146 | that you know, there are maybe the sizes\par 147 | are three by three by four.\par 148 | And therefore, by the product rule\par 149 | the number of rolls that you have\par 150 | is three times three times four which is 36.\par 151 | Alright.\par 152 | So, if you have n sets,\par 153 | then again you can use the product rule\par 154 | and induction, you get the size of A one\par 155 | Cartesian Product A two up to A n\par 156 | is just the product of the sizes.\par 157 | And here is an application of that.\par 158 | Suppose you want to go for lunch.\par 159 | You want to go to Subway and you ask\par 160 | how many sandwiches can Subway make?\par 
161 | And let's say that they have two breads\par 162 | which wheat and Italian.\par 163 | And they have let's say, five meats, listed here.\par 164 | Three cheeses,\par 165 | four veggies,\par 166 | three sauces,\par 167 | and let's say that you need to create\par 168 | a sandwich, you need to choose one of each.\par 169 | You need to choose a bread and a meat,\par 170 | a cheese and so on.\par 171 | Then the set of all sandwiches that you can make\par 172 | is the Cartesian Product of all breads times\par 173 | all meats times the set of cheeses, and so on.\par 174 | Because every sandwich consists of just\par 175 | one element from each.\par 176 | Okay, and you can choose all those,\par 177 | all those breads and so on.\par 178 | So the number of sandwiches is going to be\par 179 | the product of the number of breads and so on.\par 180 | Okay.\par 181 | And that's going to be by the,\par 182 | this is by the product rule\par 183 | and it's going to be equal to two times five\par 184 | times three times four times three\par 185 | which is 360.\par 186 | Now in fact, they have many more choices\par 187 | and number of sandwiches they can make\par 188 | is actually quite astronomical.\par 189 | Alright.\par 190 | So to summarize, we talked about the product rule,\par 191 | that says that the size of A Cartesian Product with B\par 192 | is the size of A times the size of B.\par 193 | Denotated by this multiplication sign.\par 194 | We said that for multi sets, for multiple sets,\par 195 | the size of the Cartesian Product is again\par 196 | the product of their sizes,\par 197 | and what are we going to talk about next time?\par 198 | Cartesian Powers.\par 199 | See you then.\par 200 | End of transcript. Skip to the start.\par 201 | POLL\par 202 | \par 203 | Which is better for calculating the cardinality of a set: inclusion-exclusion rule or complement rule?\par 204 | \par 205 | \tab\par 206 | It depends\par 207 | \par 208 | \tab\par 209 | Inclusion-Exclusion Rule\par 210 | \par 211 | \tab\par 212 | Complement Rule\par 213 | } 214 | -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/6 Counting Variations.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 POLL\par 4 | \par 5 | A student is given a problem: What is the size of the set of 2-letter sequences that cannot use the letter Q? The student first calculates the total number of sequences using all letters to be 26*26=676. He then calculates the number of forbidden sequences. First, he calculates the number of sequences where a Q is the first letter (i.e. QS) to be 26. Next, he calculates the number of sequences where a Q is the second letter (i.e. BQ) to also be 26. He then adds 26+26 to get the total number of forbidden sequences and subtracts them from the total to get 624. 
What is wrong with his approach?\par 6 | \par 7 | \tab\par 8 | He subtracted the sequence QQ from the total twice.\par 9 | \par 10 | \tab\par 11 | The sequences with a Q as the second letter only total 24.\par 12 | \par 13 | \tab\par 14 | He incorrectly calculated the total number of sequences using any letters.\par 15 | \par 16 | \tab\par 17 | Nothing, his answer is correct.\par 18 | \par 19 | Submit\par 20 | Discussion\par 21 | } 22 | -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/9 Partial Permutations.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Welcome again everyone.\par 4 | Last lecture, we talked about permutations\par 5 | and now we want to talk about partial permutations.\par 6 | These are permutations where you don't want to\par 7 | arrange all the objects that you have,\par 8 | but just some subset.\par 9 | So we know that the number of ways\par 10 | to arrange n objects is n factorial\par 11 | and we would like to determine how many ways,\par 12 | or in how many orders,\par 13 | can you arrange some of the n objects.\par 14 | So for example,\par 15 | let's look at PINs that consist of two digits.\par 16 | So if you allow any digits,\par 17 | then, for example, 11 or 45,\par 18 | in other words, you allow things to repeat,\par 19 | then, the total number of PINs that you have\par 20 | is ten times ten.\par 21 | Ten for the first choice and ten for the second choice,\par 22 | so it's ten times ten which is 100.\par 23 | On the other hand,\par 24 | if you insist on the digits being distinct\par 25 | then, for example,\par 26 | you allow 05 and 32 but not 33, because three repeats,\par 27 | then you have ten options for the first digit\par 28 | and nine options for the second digit\par 29 | and the total number of ways you can do it\par 30 | is ten times nine, which is 90.\par 31 | Similarly, if you look at three letter words,\par 32 | then if you allow any letters,\par 33 | for example, mom or xyz,\par 34 | so for example here we see repetition,\par 35 | then you have 26 for each letter\par 36 | so 26 cubed is the total number of\par 37 | three letter words that you can come up with,\par 38 | but if you insist on distinct letters,\par 39 | and you allow things like abc but not dad,\par 40 | then the number of different sequences you can have\par 41 | is you have 26 options for the first letter,\par 42 | and then 25 for the second, 24 for the third,\par 43 | so you just get this product.\par 44 | And as you can see,\par 45 | it's just the calculation that we did for permutations,\par 46 | except that we're not going all the way down to one,\par 47 | but we're just finding just three out of this 26 elements.\par 48 | In other words,\par 49 | what we're counting here is we're saying we have 26 elements\par 50 | in this case, for example, the 26 letters\par 51 | and we don't want to arrange all of them,\par 52 | but we want to arrange three of them.\par 53 | The question is how many ways can we do it.\par 54 | And another way to view this thing\par 55 | in a completely identical way\par 56 | is to say that we are looking instead of sequences\par 57 | where we allow all possible sequences,\par 58 | this would give us the Cartesian product,\par 59 | now we're looking at sequences\par 60 
| where we insist that all the elements are distinct.\par 61 | Okay, so that's another view of the same problem.\par 62 | So how many partial permutations are there?\par 63 | We call such permutations where you have n objects\par 64 | and you want to arrange k of them,\par 65 | we call it a permutation of k out of n objects,\par 66 | or we call it a k-permutation of n.\par 67 | And we want to calculate\par 68 | how many such k-permutations of n there are.\par 69 | So as we just said,\par 70 | there are n ways to write the first element,\par 71 | and then n minus one to select the second element\par 72 | and so on until we get to n minus k plus one\par 73 | to select the kth element,\par 74 | right, 'cause it's n for the first one\par 75 | and n minus one for the second\par 76 | so it's going to be n minus k plus one for the kth element.\par 77 | And this number, we can easily see\par 78 | that we can continue it up to, if you want,\par 79 | times n minus k all the way down, times one\par 80 | and then divide by this ending\par 81 | so we get that this is\par 82 | n factorial divided by n minus k factorial.\par 83 | And we denote this by n to the k with an underline.\par 84 | That's what we call it today,\par 85 | kth falling power of n.\par 86 | And some people denote it by P(n,k),\par 87 | for permutation of n elements where we take just k of them.\par 88 | So here are some values of the kth falling power of n\par 89 | for small values of k.\par 90 | So for k equals one, n to the one falling power is just n.\par 91 | For k equals two it's n times n minus one.\par 92 | For k equal to three,\par 93 | it's n times n minus one times n minus two\par 94 | and so on up to k,\par 95 | it's n times n minus one times n minus k plus one\par 96 | as we define it to be.\par 97 | So, let's finish with an example of partial permutations.\par 98 | So imagine that you have five programming books,\par 99 | five probability books and six machine-learning books.\par 100 | And you wonder how many ways can you write a list\par 101 | that contains two books from each subject\par 102 | where books from the same subject are listed consecutively,\par 103 | namely next to each other.\par 104 | So we wonder how many such sequences there are.\par 105 | Examples of lists like that would be\par 106 | the third probability book, the first probability book,\par 107 | and then the fifth machine-learning book\par 108 | and the second machine-learning book,\par 109 | and programming book number one\par 110 | and programming book number four.\par 111 | We can see that the probability books are listed together.\par 112 | The machine-learning books are listed together\par 113 | and the programming books are listed together.\par 114 | Or machine-learning two and machine-learning six\par 115 | followed by programming one, programming two\par 116 | and so on again all the books from the same topic\par 117 | are listed next to each other.\par 118 | So we wonder how many such sequences there are.\par 119 | So we're going to use a combination\par 120 | of factorials and permutations\par 121 | and in some sense we're using the product rule many times.\par 122 | And they will show that the answer is three factorial\par 123 | times four to the second falling power\par 124 | times five to the second falling power\par 125 | times six to the second falling power.\par 126 | To see that, notice that to first decide\par 127 | we can first decide on the order of the topics.\par 128 | For example, here machine-learning, programming,\par 129 | 
and then probability.\par 130 | So there are three different topics\par 131 | and the number of ways to arrange them\par 132 | is therefore three factorial\par 133 | to decide on the order of the subjects\par 134 | like here's machine learning, programming, and probability.\par 135 | And then once we do that,\par 136 | we need to decide which machine-learning book\par 137 | we're going to choose first\par 138 | and which one we're going to choose second.\par 139 | And because there are six machine learning books\par 140 | I'm sorry,\par 141 | which programming book we're going to put first\par 142 | and which programming book we're going to put second.\par 143 | 'Cause there are four programming books,\par 144 | then we have four to the second falling power\par 145 | number of ways to do that\par 146 | because we can decide on the first programming book\par 147 | and then the second programming book\par 148 | and then we can decide on which probability books\par 149 | we're going to choose\par 150 | and since there are five of them\par 151 | we have five to the second falling power\par 152 | or five times 4 ways of doing that\par 153 | and then we need to decide which machine-learning books\par 154 | we are listing and in what order\par 155 | and we have six ways to choose the first one\par 156 | and five to do the second one\par 157 | or in other words we have\par 158 | six to the second falling power to do that.\par 159 | And we take the product of all these,\par 160 | by the product rules\par 161 | because we are making a decision for the order\par 162 | and making a separate decision\par 163 | for which programming books we choose,\par 164 | which probability books we choose, and so on,\par 165 | so we take the product,\par 166 | we get the total number of ways of writing such lists.\par 167 | So we have talked about permutations\par 168 | and we've talked about partial permutations\par 169 | and next time we'll talk about combinations.\par 170 | See you then.\par 171 | End of transcript. 
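The falling-power counts in this lecture are easy to check numerically with math.perm from the standard library (Python 3.8 or later); the last line multiplies out the product quoted at the end of the book-list example.

```python
# Numeric check of the partial-permutation (falling power) counts above.
# math.perm(n, k) = n! / (n - k)!  (available in Python 3.8+)
from math import factorial, perm

print(perm(10, 2))   # 90: two-digit PINs with distinct digits (10 * 9)
print(perm(26, 3))   # 15600: three-letter words with distinct letters (26 * 25 * 24)

# The book-list example, using the product quoted at the end of the lecture:
# 3! orderings of the topics, times 4, 5 and 6 books taken 2 at a time in order.
print(factorial(3) * perm(4, 2) * perm(5, 2) * perm(6, 2))   # 43200
```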
Skip to the start.\par 172 | POLL\par 173 | \par 174 | How many partial permutations are there for a 4-digit number where no two consecutive numbers are the same?\par 175 | \par 176 | \tab\par 177 | 5040\par 178 | \par 179 | \tab\par 180 | 6561\par 181 | \par 182 | \tab\par 183 | 7290\par 184 | \par 185 | \tab\par 186 | 10000\par 187 | } 188 | -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/Week_3_Part_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 3 Counting and Combinatorics/Week_3_Part_1.pdf -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/Week_3_Part_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 3 Counting and Combinatorics/Week_3_Part_2.pdf -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/week3.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 3 Counting and Combinatorics/week3.zip -------------------------------------------------------------------------------- /Week 4 Probability and Conditioning/1 Distribution Types.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 Hello, and welcome back.\par 4 | So now that we have discussed randomness,\par 5 | and distributions, let's talk about\par 6 | distribution types.\par 7 | We're going to discuss uniform sample spaces;\par 8 | give some examples of coins, dice, and cards;\par 9 | and then we'll talk about non-uniform spaces,\par 10 | and give an example for tetrahedral die.\par 11 | We'll start with uniform probability spaces.\par 12 | Generally, outcomes may have different probabilities.\par 13 | For example, rain may have 10% probability\par 14 | of occurring on a given day,\par 15 | and 90% probability that it doesn't happen.\par 16 | But what we want to talk about now\par 17 | are uniform, also called equiprobable, spaces\par 18 | where the distribution is uniform.\par 19 | Everything has the same probability.\par 20 | All outcomes are equally likely.\par 21 | For example, if you flip a coin,\par 22 | then if the coin is far,\par 23 | then the probability of heads\par 24 | and the probability of tails are both one half.\par 25 | When you have a uniform space,\par 26 | then all probability specifications\par 27 | and calculations become a lot simpler, as we'll see.\par 28 | So as we said, in uniform probability spaces,\par 29 | all outcomes are equally likely.\par 30 | In other words, for all X in the sample space,\par 31 | the probability of any X is the same P.\par 32 | Now we know that one is going to be\par 33 | the probability of the whole sample space,\par 34 | and as we know, for probability distribution,\par 35 | when you sum the 
probabilities, it has to be one.\par 36 | Okay, and therefore we get the summation\par 37 | of all X in the sample space of P is one.\par 38 | This is just P times the size of the sample space,\par 39 | and that tells us that P, the probability\par 40 | of every element, is one over the size of the sample space,\par 41 | like one half for each side of a coin.\par 42 | For example, if we take a coin,\par 43 | probability of heads and probability of tails.\par 44 | If we assume each is P, then we have that\par 45 | one is P of heads plus P of tails, or 2P,\par 46 | and therefore P must be one half.\par 47 | In uniform spaces in general,\par 48 | we see that every outcome has probability\par 49 | which is one over the size of the sample space,\par 50 | like here one over two.\par 51 | When we have uniform spaces, we'll denote them\par 52 | by this U for uniform.\par 53 | All you need, for a uniform distribution,\par 54 | to know all the probabilities\par 55 | is just the size of the sample space.\par 56 | That makes them so simple for us.\par 57 | And one notation is that when\par 58 | we have a uniform sample space,\par 59 | we'll say that we draw uniformly from the space\par 60 | or we just draw randomly.\par 61 | So randomly, when we don't say anything else,\par 62 | will signify that we're drawing uniformly.\par 63 | So sometimes we just say randomly, and mean uniformly.\par 64 | Let's see a couple of examples.\par 65 | For a coin, the sample space consists\par 66 | of heads and tails, which we'll denote by H and T.\par 67 | The size of the sample space is two.\par 68 | We'll say that we flip a coin,\par 69 | or toss a coin, and the results are equally likely.\par 70 | In other words, the space is uniform.\par 71 | That means the probability of heads\par 72 | is equal to the probability of tails.\par 73 | As we saw in the previous slide,\par 74 | that means that each of them has a probability\par 75 | which is one over the size of the sample space,\par 76 | or one over two.\par 77 | If we take a fair die,\par 78 | the possible outcomes are one up to six.\par 79 | The size of the sample space is six.\par 80 | Therefore, if you roll the die,\par 81 | if we assume the die is fair,\par 82 | the faces are equally likely.\par 83 | In other words, the space is uniform.\par 84 | It's denoted by U.\par 85 | So the probability of one\par 86 | is equal to the probability of two, and so on\par 87 | up to the probability of six.\par 88 | That means that the probability of each element\par 89 | is one over the size of the sample space, or one over six.\par 90 | If we have a deck of cards,\par 91 | then we let the sample space be the set of all cards,\par 92 | in this case 52 in the standard deck.\par 93 | If we draw a card, everything is equally likely.\par 94 | Again, the space is uniform, a U.\par 95 | The probability of getting maybe a three of clubs,\par 96 | or a queen of hearts, all of them equally likely,\par 97 | which is one over the size of the sample space,\par 98 | which is one over 52.\par 99 | Now, in many cases, the space is non-uniform.\par 100 | Uniform spaces are coins and dice and so on,\par 101 | and in those cases everything is pretty simple.\par 102 | But in nature, non-uniform spaces abound.\par 103 | We gave the example of rain;\par 104 | also the grades that we get:\par 105 | an A plus is not as likely as other grades.\par 106 | Or our words.\par 107 | Some words are more likely than others.\par 108 | Some illnesses are more likely than others.\par 109 | Some web pages are visited more often.\par 110 | People that 
we see are not randomly distributed\par 111 | among all seven billion people on the planet,\par 112 | and so on.\par 113 | So we need to look at them.\par 114 | Now, the many examples of non-uniform spaces,\par 115 | and one typical example is a pie chart\par 116 | that we see here.\par 117 | These reflect the different probabilities\par 118 | of possible outcomes, as we see here.\par 119 | When we see a pie chart,\par 120 | what you see is, actually, could be uniform,\par 121 | but typically a non-uniform space.\par 122 | Typically the challenge with non-uniform spaces\par 123 | is how can we remember it if we're going to give examples.\par 124 | So we're going to try to give a simple example\par 125 | of a tetrahedral die.\par 126 | This is a four-sided, or pyramid die as it's also called.\par 127 | It's used in different games, board games.\par 128 | For example, Dungeons and Dragons.\par 129 | Typically, in those cases, the die's equiprobable.\par 130 | Each of the four faces has the same probability.\par 131 | But we're just going to give it some different values.\par 132 | We'll assume different probabilities.\par 133 | We'll try to make it easy to remember.\par 134 | The faces are one, two, three, four,\par 135 | and we'll just give them probabilities\par 136 | of .1, .2, .3, and .4.\par 137 | Notice that, conveniently, they add to one,\par 138 | and therefore this is a probability distribution.\par 139 | Now that we have these examples in mind,\par 140 | let's go over a few things about distributions\par 141 | and see what we can say about them or can't.\par 142 | Notice that random notation may be\par 143 | a little confusing at first.\par 144 | Which expressions are valid, and which are not?\par 145 | For example, we can say the probability\par 146 | that X is equal to three.\par 147 | That's a valid expression.\par 148 | That's the probability that the random outcome\par 149 | is going to turn up three.\par 150 | For example, if you have a fair die,\par 151 | this probability is 1/6th.\par 152 | So that's an okay expression.\par 153 | Can also write P of three.\par 154 | That's an abbreviation for the probability that X is three.\par 155 | This is just the same as writing\par 156 | the probability of X is three.\par 157 | You can also write P of X,\par 158 | but then we need to specify X.\par 159 | For example, we can say that,\par 160 | for X, P of X is a quarter.\par 161 | If you have a fair tetrahedral die, for example.\par 162 | Now things that are maybe not so clear\par 163 | what they can do is if you write probability\par 164 | of one equal to three, or probability\par 165 | of the random variable X.\par 166 | Those could happen, but not so likely.\par 167 | For example, here what you might mean\par 168 | is that you have a random variable,\par 169 | which is always one, and another random variable,\par 170 | and you ask what is the probability\par 171 | that it's equal to three.\par 172 | Then that probability is going to be zero.\par 173 | Here, maybe you mean you have random variable X,\par 174 | and you ask what is the probability\par 175 | of the random outcome that you observe.\par 176 | These are possible, but less common,\par 177 | so make sure that this is what you mean.\par 178 | This typically would mean zero,\par 179 | and this is a random value, as we said.\par 180 | Things that you should not do is,\par 181 | for example, write the probability\par 182 | that little X is equal to three,\par 183 | because little X is a value.\par 184 | Cannot write the 
probability it's equal to three.\par 185 | This is even less likely, and probably it's wrong.\par 186 | So, to summarize, we talked about\par 187 | different distribution types.\par 188 | We talked about uniform sample spaces;\par 189 | coins, dies, and cards as examples.\par 190 | We talked about non-uniform spaces.\par 191 | We gave the example for tetrahedral die.\par 192 | What are we going to do next time?\par 193 | We'll discuss events.\par 194 | See you then.\par 195 | End of transcript. Skip to the start.\par 196 | Discussion\par 197 | Topic: Week 4 / Distributions\par 198 | Show Discussion\par 199 | POLL\par 200 | \par 201 | An outcome in a uniform probability space has probability 1/10, what is the size of the sample space?\par 202 | \par 203 | \tab\par 204 | 5\par 205 | \par 206 | \tab\par 207 | 10\par 208 | \par 209 | \tab\par 210 | 20\par 211 | } 212 | -------------------------------------------------------------------------------- /Week 4 Probability and Conditioning/2 Distribution Types.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 Hello, and welcome back.\par 4 | So now that we have discussed randomness,\par 5 | and distributions, let's talk about\par 6 | distribution types.\par 7 | We're going to discuss uniform sample spaces;\par 8 | give some examples of coins, dice, and cards;\par 9 | and then we'll talk about non-uniform spaces,\par 10 | and give an example for tetrahedral die.\par 11 | We'll start with uniform probability spaces.\par 12 | Generally, outcomes may have different probabilities.\par 13 | For example, rain may have 10% probability\par 14 | of occurring on a given day,\par 15 | and 90% probability that it doesn't happen.\par 16 | But what we want to talk about now\par 17 | are uniform, also called equiprobable, spaces\par 18 | where the distribution is uniform.\par 19 | Everything has the same probability.\par 20 | All outcomes are equally likely.\par 21 | For example, if you flip a coin,\par 22 | then if the coin is far,\par 23 | then the probability of heads\par 24 | and the probability of tails are both one half.\par 25 | When you have a uniform space,\par 26 | then all probability specifications\par 27 | and calculations become a lot simpler, as we'll see.\par 28 | So as we said, in uniform probability spaces,\par 29 | all outcomes are equally likely.\par 30 | In other words, for all X in the sample space,\par 31 | the probability of any X is the same P.\par 32 | Now we know that one is going to be\par 33 | the probability of the whole sample space,\par 34 | and as we know, for probability distribution,\par 35 | when you sum the probabilities, it has to be one.\par 36 | Okay, and therefore we get the summation\par 37 | of all X in the sample space of P is one.\par 38 | This is just P times the size of the sample space,\par 39 | and that tells us that P, the probability\par 40 | of every element, is one over the size of the sample space,\par 41 | or one of the sides of one coin.\par 42 | For example, if we take a coin,\par 43 | probability of heads and probability of tails.\par 44 | If we assume it's P, then we have that\par 45 | one is P of heads plus P of tails, or 2P,\par 46 | and therefore P must be one half.\par 47 | In uniform spaces in general,\par 48 | we see that every outcome has probability\par 49 | which is one 
over the size of the sample space,\par 50 | like here one over two.\par 51 | We call uniform spaces, we'll denote them\par 52 | by this U for uniform.\par 53 | All you need to know for uniform distribution\par 54 | to know all the probabilities\par 55 | is just to know the size of the sample space.\par 56 | That makes them so simple for us.\par 57 | And one notation is that when\par 58 | we have uniform sample space,\par 59 | we'll say that we draw uniformly from the space\par 60 | or we just draw randomly.\par 61 | So randomly, when we don't say anything else,\par 62 | we'll signify that we're drawing uniformly.\par 63 | So sometimes just say randomly, and mean uniformly.\par 64 | Let's see a couple of examples.\par 65 | For a coin, the sample space consists\par 66 | of heads and tails, which we'll denote by H and T.\par 67 | The size of the sample space is two.\par 68 | We'll say that we'll flip a coin,\par 69 | or toss a coin, and the results are equally likely.\par 70 | In other words, the space is uniform.\par 71 | That means the probability of heads\par 72 | is equal to the probability of tails.\par 73 | As we saw in the previous slide,\par 74 | that means that each of them has a probability\par 75 | which is one over the size of the sample space,\par 76 | or one over two.\par 77 | If we take a fair die,\par 78 | the possible outcomes are one up to six.\par 79 | The size of the sample space is six.\par 80 | Therefore, if you roll the die,\par 81 | if we assume the die's fair,\par 82 | the faces are equally likely.\par 83 | In other words, the space is uniform.\par 84 | It's noted by U.\par 85 | So the probability of one\par 86 | is equal to the probability of two, and so on\par 87 | up to the probability of six.\par 88 | That means that the probability of each element\par 89 | is one over the sample size, or one over six.\par 90 | If we have a deck of cards,\par 91 | then we let (mumbles) the set of four cards.\par 92 | In this case, maybe 52 in the standard deck.\par 93 | If we draw a card, everything is equally likely.\par 94 | You know, a U.\par 95 | The probability of getting maybe a three of clubs,\par 96 | or a queen of hearts, all of them equally likely,\par 97 | which is one over the size of the sample space,\par 98 | which is one over 52.\par 99 | Now, in many cases, the space is non-uniform.\par 100 | Uniform spaces are coins and dies and so on,\par 101 | but in those cases everything is pretty good.\par 102 | But in nature, non-uniform spaces abound.\par 103 | We give the example of rain,\par 104 | also grades that we get\par 105 | are not equally likely to be an A plus as other grades.\par 106 | Our words.\par 107 | Some words are more likely than others.\par 108 | Some illnesses are more likely than others.\par 109 | Some web pages are visited more often.\par 110 | People that we see are not randomly distributed\par 111 | among all seven billion people on the planet,\par 112 | and so on.\par 113 | So we need to look at them.\par 114 | Now, the many examples of non-uniform spaces,\par 115 | and one typical example is a pie chart\par 116 | that we see here.\par 117 | These reflect the different probabilities\par 118 | of possible outcomes, as we see here.\par 119 | When we see a pie chart,\par 120 | what you see is, actually, could be uniform,\par 121 | but typically a non-uniform space.\par 122 | Typically the challenge with non-uniform spaces\par 123 | is how can we remember it if we're going to give examples.\par 124 | So we're going to try to give a simple example\par 
125 | of a tetrahedral die.\par 126 | This is a four-sided, or pyramid die as it's also called.\par 127 | It's used in different games, board games.\par 128 | For example, Dungeons and Dragons.\par 129 | Typically, in those cases, the die's equiprobable.\par 130 | Each of the four faces has the same probability.\par 131 | But we're just going to give it some different values.\par 132 | We'll assume different probabilities.\par 133 | We'll try to make it easy to remember.\par 134 | The faces are one, two, three, four,\par 135 | and we'll just give them probabilities\par 136 | of .1, .2, .3, and .4.\par 137 | Notice that, conveniently, they add to one,\par 138 | and therefore this is a probability distribution.\par 139 | Now that we have these examples in mind,\par 140 | let's go over a few things about distributions\par 141 | and see what we can say about them or can't.\par 142 | Notice that random notation may be\par 143 | a little confusing at first.\par 144 | Which expressions are valid, and which are not?\par 145 | For example, we can say the probability\par 146 | that X is equal to three.\par 147 | That's a valid expression.\par 148 | That's the probability that the random outcome\par 149 | is going to turn up three.\par 150 | For example, if you have a fair die,\par 151 | this probability is 1/6th.\par 152 | So that's an okay expression.\par 153 | Can also write P of three.\par 154 | That's an abbreviation for the probability that X is three.\par 155 | This is just the same as writing\par 156 | the probability of X is three.\par 157 | You can also write P of X,\par 158 | but then we need to specify X.\par 159 | For example, we can say that,\par 160 | for X, P of X is a quarter.\par 161 | If you have a fair tetrahedral die, for example.\par 162 | Now things that are maybe not so clear\par 163 | what they can do is if you write probability\par 164 | of one equal to three, or probability\par 165 | of the random variable X.\par 166 | Those could happen, but not so likely.\par 167 | For example, here what you might mean\par 168 | is that you have a random variable,\par 169 | which is always one, and another random variable,\par 170 | and you ask what is the probability\par 171 | that it's equal to three.\par 172 | Then that probability is going to be zero.\par 173 | Here, maybe you mean you have random variable X,\par 174 | and you ask what is the probability\par 175 | of the random outcome that you observe.\par 176 | These are possible, but less common,\par 177 | so make sure that this is what you mean.\par 178 | This typically would mean zero,\par 179 | and this is a random value, as we said.\par 180 | Things that you should not do is,\par 181 | for example, write the probability\par 182 | that little X is equal to three,\par 183 | because little X is a value.\par 184 | Cannot write the probability it's equal to three.\par 185 | This is even less likely, and probably it's wrong.\par 186 | So, to summarize, we talked about\par 187 | different distribution types.\par 188 | We talked about uniform sample spaces;\par 189 | coins, dies, and cards as examples.\par 190 | We talked about non-uniform spaces.\par 191 | We gave the example for tetrahedral die.\par 192 | What are we going to do next time?\par 193 | We'll discuss events.\par 194 | See you then.\par 195 | End of transcript. 
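To make the uniform versus non-uniform contrast concrete, here is a short simulation sketch (an added example, not part of the course notebooks), using the tetrahedral-die probabilities .1, .2, .3, .4 from the lecture; the sample size is arbitrary, and `random.choices` requires Python 3.6 or later.

```python
# Empirical frequencies for a fair (uniform) tetrahedral die versus the
# lecture's non-uniform die with face probabilities .1, .2, .3, .4.
import random
from collections import Counter

faces = [1, 2, 3, 4]
probs = [0.1, 0.2, 0.3, 0.4]          # non-uniform distribution; sums to one

n = 100000
uniform_rolls  = random.choices(faces, k=n)                  # uniform: each face ~ 1/4
weighted_rolls = random.choices(faces, weights=probs, k=n)   # non-uniform die

for name, rolls in [("uniform", uniform_rolls), ("non-uniform", weighted_rolls)]:
    freqs = {face: round(count / n, 3) for face, count in sorted(Counter(rolls).items())}
    print(name, freqs)
```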
Skip to the start.\par 196 | Discussion\par 197 | } 198 | -------------------------------------------------------------------------------- /Week 4 Probability and Conditioning/4_Permutations_and_Combinations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 4 Permutations and Combinations\n", 8 | "\n", 9 | "## 4.1 Permutations\n", 10 | "\n", 11 | "We find the number of $k$-permutations of $A$, first by determining the set of permutations and then by calculating $\\frac{|A|!}{(|A|-k)!}$. We first consider the special case of $k=|A|$, which is equivalent to finding the number of ways of ordering the elements of $A$. First we import the **itertools** library." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 33, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import itertools" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 34, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "A = {1, 2, 3}" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 35, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Permutations of set([1, 2, 3]): \n", 46 | "(1, 3, 2)\n", 47 | "(3, 2, 1)\n", 48 | "(2, 1, 3)\n", 49 | "(3, 1, 2)\n", 50 | "(1, 2, 3)\n", 51 | "(2, 3, 1)\n", 52 | "\n", 53 | "Number of permutations: 6\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "# Find all permutations of A and |A!|\n", 59 | "permute_all = set(itertools.permutations(A))\n", 60 | "print(\"Permutations of %s: \" %A)\n", 61 | "for i in permute_all:\n", 62 | " print(i)\n", 63 | "print;print \"Number of permutations: \", len(permute_all)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 36, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "6\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "# Find |A|! directly\n", 81 | "from math import factorial\n", 82 | "print(factorial(len(A)))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 37, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "A = {1, 2, 3, 4}\n", 94 | "k = 3" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 38, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "3-permutations of set([1, 2, 3, 4]): \n", 107 | "(1, 2, 3)\n", 108 | "(1, 2, 4)\n", 109 | "(1, 3, 2)\n", 110 | "(1, 3, 4)\n", 111 | "(1, 4, 2)\n", 112 | "(1, 4, 3)\n", 113 | "(2, 1, 3)\n", 114 | "(2, 1, 4)\n", 115 | "(2, 3, 1)\n", 116 | "(2, 3, 4)\n", 117 | "(2, 4, 1)\n", 118 | "(2, 4, 3)\n", 119 | "(3, 1, 2)\n", 120 | "(3, 1, 4)\n", 121 | "(3, 2, 1)\n", 122 | "(3, 2, 4)\n", 123 | "(3, 4, 1)\n", 124 | "(3, 4, 2)\n", 125 | "(4, 1, 2)\n", 126 | "(4, 1, 3)\n", 127 | "(4, 2, 1)\n", 128 | "(4, 2, 3)\n", 129 | "(4, 3, 1)\n", 130 | "(4, 3, 2)\n", 131 | "\n", 132 | "Size = 4!/(4-3)! = 24\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "# Print all the k-permutations of A\n", 138 | "n = len(A)\n", 139 | "permute_k = list(itertools.permutations(A, k))\n", 140 | "print(\"%i-permutations of %s: \" %(k,A))\n", 141 | "for i in permute_k:\n", 142 | " print(i)\n", 143 | "print;print \"Size = \", \"%i!/(%i-%i)! 
= \" %(n,n,k), len(permute_k)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 39, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "24\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "# Print |A|!/(|A|-k)! directly\n", 161 | "print(int(factorial(len(A))/factorial(len(A)-k)))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## 4.2 Combinations\n", 169 | "We find the number of $k$-combinations of $A$, first by determining the set of combinations and then by simply calculating ${|A|}\\choose{k}$." 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 40, 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "from scipy.special import binom # to calculate the binomial coefficients |A| choose k" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 41, 186 | "metadata": { 187 | "collapsed": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "A = {1, 2, 3, 4}\n", 192 | "k = 2" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 42, 198 | "metadata": { 199 | "scrolled": true 200 | }, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "2-combinations of set([1, 2, 3, 4]): \n", 207 | "(1, 2)\n", 208 | "(1, 3)\n", 209 | "(1, 4)\n", 210 | "(2, 3)\n", 211 | "(2, 4)\n", 212 | "(3, 4)\n", 213 | "\n", 214 | "Number of combinations = 4!/(2!(4-2)!) = 6\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "# Print all the k-combinations of A\n", 220 | "choose_k = list(itertools.combinations(A,k))\n", 221 | "print(\"%i-combinations of %s: \" %(k,A))\n", 222 | "for i in choose_k:\n", 223 | " print(i)\n", 224 | "print;print(\"Number of combinations = %i!/(%i!(%i-%i)!) = %i\" %(n,k,n,k,len(choose_k) ))" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 43, 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "name": "stdout", 234 | "output_type": "stream", 235 | "text": [ 236 | "6\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "# Print |A|!/(k!(|A|-k)!) directly\n", 242 | "print(int(factorial(len(A))/(factorial(k)*factorial(len(A)-k))))" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "If you want to concatenate characters such as letters of the English alphabet and print them as strings, you can use the join() function." 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 44, 255 | "metadata": { 256 | "collapsed": true 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "A = {'a', 'b', 'c', 'q'}\n", 261 | "k = 3" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 45, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "name": "stdout", 271 | "output_type": "stream", 272 | "text": [ 273 | "3-permutations of set(['a', 'q', 'c', 'b']):\n", 274 | "aqc\n", 275 | "aqb\n", 276 | "acq\n", 277 | "acb\n", 278 | "abq\n", 279 | "abc\n", 280 | "qac\n", 281 | "qab\n", 282 | "qca\n", 283 | "qcb\n", 284 | "qba\n", 285 | "qbc\n", 286 | "caq\n", 287 | "cab\n", 288 | "cqa\n", 289 | "cqb\n", 290 | "cba\n", 291 | "cbq\n", 292 | "baq\n", 293 | "bac\n", 294 | "bqa\n", 295 | "bqc\n", 296 | "bca\n", 297 | "bcq\n", 298 | "\n", 299 | "Size = 4!/(4-3)! 
= 24\n" 300 | ] 301 | } 302 | ], 303 | "source": [ 304 | "# Print all the k-permutations of S\n", 305 | "n = len(A)\n", 306 | "permute_k = list(itertools.permutations(A, k))\n", 307 | "print(\"%i-permutations of %s:\" %(k,A))\n", 308 | "for i in range(0, len(permute_k)):\n", 309 | " print(''.join(permute_k[i]) )\n", 310 | "print;print \"Size = %i!/(%i-%i)! = \" %(n,n,k), len(permute_k)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 46, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "24\n" 323 | ] 324 | } 325 | ], 326 | "source": [ 327 | "# Print |A|!/(|A|-k)! directly\n", 328 | "print(int(factorial(len(A))/factorial(len(A)-k)))" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 47, 334 | "metadata": { 335 | "collapsed": true 336 | }, 337 | "outputs": [], 338 | "source": [ 339 | "A = {'a', 'b', 'c', 'd'}\n", 340 | "k = 2" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 48, 346 | "metadata": { 347 | "scrolled": true 348 | }, 349 | "outputs": [ 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "2-combinations of set(['a', 'c', 'b', 'd']):\n", 355 | "\n", 356 | "ac\n", 357 | "ab\n", 358 | "ad\n", 359 | "cb\n", 360 | "cd\n", 361 | "bd\n", 362 | "\n", 363 | "Size = 4!/(2!(4-2)!) = 6\n" 364 | ] 365 | } 366 | ], 367 | "source": [ 368 | "# Print all the k-combinations of A\n", 369 | "choose_k = list(itertools.combinations(A,k))\n", 370 | "print(\"%i-combinations of %s:\\n\" %(k,A))\n", 371 | "for i in range(0, len(choose_k)):\n", 372 | " print(''.join(choose_k[i]) )\n", 373 | "print;print \"Size = %i!/(%i!(%i-%i)!) = \" %(n,k,n,k), len(choose_k)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 49, 379 | "metadata": {}, 380 | "outputs": [ 381 | { 382 | "name": "stdout", 383 | "output_type": "stream", 384 | "text": [ 385 | "6\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "# Print |A|!/(k!(|A|-k)!) 
directly\n", 391 | "print(int(factorial(len(A))/(factorial(k)*factorial(len(A)-k))))" 392 | ] 393 | } 394 | ], 395 | "metadata": { 396 | "kernelspec": { 397 | "display_name": "Python 2", 398 | "language": "python", 399 | "name": "python2" 400 | }, 401 | "language_info": { 402 | "codemirror_mode": { 403 | "name": "ipython", 404 | "version": 2 405 | }, 406 | "file_extension": ".py", 407 | "mimetype": "text/x-python", 408 | "name": "python", 409 | "nbconvert_exporter": "python", 410 | "pygments_lexer": "ipython2", 411 | "version": "2.7.13" 412 | }, 413 | "toc": { 414 | "colors": { 415 | "hover_highlight": "#DAA520", 416 | "navigate_num": "#000000", 417 | "navigate_text": "#333333", 418 | "running_highlight": "#FF0000", 419 | "selected_highlight": "#FFD700", 420 | "sidebar_border": "#EEEEEE", 421 | "wrapper_background": "#FFFFFF" 422 | }, 423 | "moveMenuLeft": true, 424 | "nav_menu": { 425 | "height": "48px", 426 | "width": "252px" 427 | }, 428 | "navigate_menu": true, 429 | "number_sections": true, 430 | "sideBar": true, 431 | "skip_h1_title": false, 432 | "threshold": 4, 433 | "toc_cell": false, 434 | "toc_position": {}, 435 | "toc_section_display": "block", 436 | "toc_window_display": false, 437 | "widenNotebook": false 438 | } 439 | }, 440 | "nbformat": 4, 441 | "nbformat_minor": 2 442 | } 443 | -------------------------------------------------------------------------------- /Week 4 Probability and Conditioning/Week_4_Part_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 4 Probability and Conditioning/Week_4_Part_1.pdf -------------------------------------------------------------------------------- /Week 4 Probability and Conditioning/Week_4_Part_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 4 Probability and Conditioning/Week_4_Part_2.pdf -------------------------------------------------------------------------------- /Week 5 Random Variables, Expectation, and Variance/4 Variable Modification.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hello and welcome back.\par 4 | So we're talking about expectations of random variables\par 5 | and what we're going to discuss first\par 6 | is what happens when you modify random variables\par 7 | and then we'll talk about expectations\par 8 | of these modifications.\par 9 | We want to discuss modifications\par 10 | of functions of random varables.\par 11 | Sometimes we're interested in\par 12 | not just the random variable itself,\par 13 | but some function of the random variable.\par 14 | For example if a person gets a salary,\par 15 | suppose it's some random number expressing dollars,\par 16 | which we call X then if they get a $10 raise\par 17 | then the salary now, instead of X, is X plus 10,\par 18 | which we might call y.\par 19 | Or if they get a 10% raise, then instead of salary X,\par 20 | the new salary is new random variable Y,\par 21 | which is 1.1 times X.\par 22 | Or if they become CEO then maybe\par 23 | the new salary Y is now 
X square.\par 24 | In all these cases we see that we had a random variable X\par 25 | and now we have a new random variable Y,\par 26 | which is some function of X.\par 27 | And so Y is some g of x for some function g,\par 28 | and you notice the g is a deterministic function.\par 29 | Defined over R, over the real numbers\par 30 | or whatever the domain of X is.\par 31 | Whatever the range of X is.\par 32 | But in all those cases again,\par 33 | you see that g is just a deterministic function.\par 34 | Like X plus 10, or 1.1 X, or X squared.\par 35 | It's a known function.\par 36 | And all the randomness in Y derives from X.\par 37 | So the only reason there's randomness\par 38 | about the new salaries is because there was some randomness\par 39 | about the original salary.\par 40 | As you can see, we're taking advantage of the fact\par 41 | that we're talking about random variables.\par 42 | We're talking about numbers,\par 43 | and we can now define functions over these variables.\par 44 | Okay so were going to see a few examples.\par 45 | So just to reiterate, X is deterministically modified by g.\par 46 | So g is deterministic.\par 47 | And X was random.\par 48 | And Y is g of X, so it's now random as well.\par 49 | So let's see a couple of examples.\par 50 | First is translation.\par 51 | So suppose we have a random variable X\par 52 | and we want to add the constant b to X.\par 53 | For example X two, add two, or something like that.\par 54 | So this is called translation.\par 55 | We are translating X, or we're moving X\par 56 | by this quantity b.\par 57 | So then Y is going to be X plus b.\par 58 | And here is an example.\par 59 | Here is X.\par 60 | It's distributed between one and four\par 61 | according to this distribution.\par 62 | And Y is X plus b.\par 63 | Maybe Y is X plus two.\par 64 | Then what happens is we just\par 65 | move the distribution of X by two.\par 66 | So, what was the probability of one before\par 67 | is now the probability of three.\par 68 | And probability of two becomes the probability of four.\par 69 | And so on.\par 70 | Because we just took X, whatever it was,\par 71 | and we added two to it.\par 72 | So just to say it a little more formally,\par 73 | what is the probability that the newer variable Y has,\par 74 | gets the value y.\par 75 | It's the probability because Y is X plus b.\par 76 | It's the probability that X plus b is equal to y.\par 77 | And that's the probability that X is equal to y minus b.\par 78 | So we can see here the probability that y is six,\par 79 | is the probability that X was two.\par 80 | The probability that y is five\par 81 | is the probability that X was three.\par 82 | And so on.\par 83 | The probability that y is three\par 84 | is the probability that X was one.\par 85 | Okay, so we see that we can relate the probabilities\par 86 | of X, the original X, to the probability of the new y.\par 87 | Using this function.\par 88 | Here's another example, scaling.\par 89 | So suppose we take X and we multiply it by a constant b.\par 90 | So we call it, we said that we scale X by a factor of b.\par 91 | Then Y is b times X.\par 92 | And here is our original X.\par 93 | And now let's multiply it by 1.5.\par 94 | Then the values of X originally\par 95 | were one, two, three, and four.\par 96 | Now they're going to be one times 1.5,\par 97 | which is 1.5.\par 98 | Or two times 1.5, which is three.\par 99 | On to the three we have 4.5.\par 100 | Instead of four we have six.\par 101 | And the probability that y is six\par 102 | 
is the probability that X was four, and so on.\par 103 | So I'll just again, writing it formally.\par 104 | The probability that Y is y,\par 105 | is the probability because Y is b times X,\par 106 | it's the probability that b times X is y.\par 107 | And that's the probability that X is y over b.\par 108 | Okay, and that's what we see here.\par 109 | The probability that y is six\par 110 | is the probability that X was six divided by 1.5.\par 111 | Or the probability that X was four.\par 112 | Now in these two examples,\par 113 | the function that we use b times X\par 114 | or X plus b was one to one.\par 115 | It mapped in different values of X,\par 116 | the different values of Y.\par 117 | But sometimes the function is not one to one,\par 118 | and things get a little more interesting.\par 119 | So let's look at the square function.\par 120 | And let's start in the range where the square is one to one.\par 121 | So let's say that we have a random variable X\par 122 | which is distribute of a zero one two.\par 123 | According to this probability,\par 124 | it's zero probability half.\par 125 | One with a probability of third.\par 126 | And two with probability one sixth.\par 127 | So you see that over the range zero one two,\par 128 | if we square those values,\par 129 | the square function is one to one.\par 130 | And specifically if we let Y equal to be X square,\par 131 | then y will get the value of zero square, which is zero.\par 132 | One square which is one.\par 133 | Or two square which is four.\par 134 | So these are the values.\par 135 | And what is the probability y will get those value,\par 136 | well y would be zero if X was zero\par 137 | which happens probability half.\par 138 | Y will be one if X was one.\par 139 | Which happens probability of one third.\par 140 | And y will be four if X was two.\par 141 | Which happens probability of one sixth.\par 142 | So these are the probabilities.\par 143 | But now let's look at the range\par 144 | where the square function is many to one.\par 145 | In particular let's look at the range\par 146 | where X varies from minus two to plus two.\par 147 | So minus two, minus one, all the way up to plus two.\par 148 | According to a uniform distribution.\par 149 | So X is gets each of those values probability of one fifth.\par 150 | Then Y again is X square.\par 151 | And now Y will have fewer values than X.\par 152 | Particularly it will have the same values\par 153 | that it would have before, zero one four.\par 154 | And let's see, what is the probability that y is zero.\par 155 | Y is zero if X was zero.\par 156 | And that happens probability one fifth.\par 157 | That's what we get here.\par 158 | Now more interestingly, y is one if X was either\par 159 | minus one, or plus one.\par 160 | Because in both cases, X squared is going to be one.\par 161 | And so X is minus one or one, with probability two fifths.\par 162 | And therefore the probability that y is one, is two fifths.\par 163 | And y is four if X was minus two or plus two\par 164 | because in both cases, X squared is going to be four.\par 165 | So y is going to be four.\par 166 | And X is minus two or plus two\par 167 | with probability one fifth plus one fifth,\par 168 | which is two fifths.\par 169 | So we see now that y is ranging over a smaller\par 170 | set of values.\par 171 | Three instead of five.\par 172 | And for each one, or at least for two of them,\par 173 | the probability of one of y comes from multiple values of X.\par 174 | All of them mapped to the same 
way.\par 175 | So let's see this in a picture.\par 176 | So here are the values of X.\par 177 | Zero, minus one, plus one,\par 178 | minus two, and plus two.\par 179 | And here is Y which is g of X,\par 180 | and our g of X, or g is X square.\par 181 | So Y is zero, one, and four.\par 182 | And now zero will map to zero.\par 183 | And minus one and plus one will both map to one\par 184 | by g or by X square.\par 185 | And minus two and plus two will both map to four.\par 186 | So when we look at the inverse mapping of zero of Y,\par 187 | we get the inverse mapping of zero is zero.\par 188 | The inverse mapping of one is minus one and plus one.\par 189 | The inverse mapping, or the inverse image of four\par 190 | is minus two and two.\par 191 | And the probability that Y is four,\par 192 | is the probability that X is in this inverse image of four.\par 193 | And the probability that Y is one,\par 194 | is the probability that X is in this inverse image of one.\par 195 | Which is minus one and one.\par 196 | So we can say therefore that the probability\par 197 | that Y is one, is the probability that g of X\par 198 | is equal to y.\par 199 | Because that's by definition, y is g of X.\par 200 | So supposedly g of X is equal to y.\par 201 | And taking the inverse mapping,\par 202 | it's the probability that X is in the inverse image of y.\par 203 | And what is the probability that X\par 204 | is in the inverse image of y for example here,\par 205 | minus two and two.\par 206 | It's just the sum of the probabilities of those Xs.\par 207 | So it's summation of all X in the inverse image of y\par 208 | of the probability of X.\par 209 | So for example, if we take four,\par 210 | and the probability that Y is four,\par 211 | is the summation of all Xs in the inverse image of four.\par 212 | Namely, minus two and two of the probabilities.\par 213 | So it's one fifth plus one fifth.\par 214 | Okay so as we see,\par 215 | when the mapping is many to one,\par 216 | then to calculate the probability of Y,\par 217 | we need to sum the probability of X\par 218 | in the inverse image of Y.\par 219 | So that's all there is to it.\par 220 | So we want to just introduce this concept\par 221 | of variable modifications.\par 222 | Or functions of random variables\par 223 | and next time we're going to look at\par 224 | the expectations of this modifications.\par 225 | See you then.\par 226 | End of transcript. Skip to the start.\par 227 | POLL\par 228 | \par 229 | Let\rquote s see if we have any risk takers in the class. Suppose you flip a fair coin. With each flip you must bet a certain amount of money. If the coin lands heads, you get double your money in return. If the coin lands tails, you lose your money. 
How much money would you be willing to bet during this game?\par 230 | \par 231 | \tab\par 232 | $0\par 233 | \par 234 | \tab\par 235 | $10\par 236 | \par 237 | \tab\par 238 | $100\par 239 | \par 240 | \tab\par 241 | $1000\par 242 | \par 243 | Submit\par 244 | } 245 | -------------------------------------------------------------------------------- /Week 5 Random Variables, Expectation, and Variance/8 Linearity of Expectations.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Today, I'm going to talk about Linearity of Expectations.\par 4 | And so, if we want to calculate\par 5 | the expected value of\par 6 | the sum of two random variables, x and y,\par 7 | we can just write it as the summation of all xy\par 8 | of the sum, x plus y times the probability,\par 9 | and that's going to be\par 10 | broken up, if you want,\par 11 | to the summation of over x times P xy,\par 12 | plus the summation of all y times P xy.\par 13 | And then we can see that we can take\par 14 | x does not change with y,\par 15 | so we can take it outside the y sum.\par 16 | So we get summation over x\par 17 | of this summation over y of P xy\par 18 | plus summation over y of y, which we took out.\par 19 | And then, inside, we have summation over all x\par 20 | of P xy, where here we exchange the order first\par 21 | and then took the y out.\par 22 | Now, when we sum over P xy,\par 23 | then by the rule of total probability,\par 24 | this will sum to P of x.\par 25 | So we get summation x, P of x.\par 26 | And here, we have a summation of all x of P xy.\par 27 | This will sum to P of y.\par 28 | So, this is going to give us summation over y\par 29 | of y times the probability of y.\par 30 | And this is nothing but the expected value of x.\par 31 | And the second sum is the expected value of y.\par 32 | So, what we see is that the expected value of x plus y\par 33 | is the expected value of x plus the expected value of y.\par 34 | Or, in words, we see that the expectation of the sum\par 35 | is the sum of expectations,\par 36 | and this has many applications,\par 37 | and we're going to actually have a video about that.\par 38 | Now, of course, the next natural thing to consider is\par 39 | whether the variance adds as well.\par 40 | So, the expectations add, namely,\par 41 | the expected value of x plus y\par 42 | is the expected value of x\par 43 | plus the expected value of y,\par 44 | and it's very natural to ask\par 45 | whether the variances add as well, namely,\par 46 | is the variance of x plus y\par 47 | the same as the variance of x plus the variance of y?\par 48 | Now, to figure out if this is correct,\par 49 | let us write the variance of the sum.\par 50 | So, the variance of x plus y\par 51 | is the expected value of x plus y squared\par 52 | minus the expected value of x plus y,\par 53 | the whole thing squared.\par 54 | That's out of the second formulation for the variance.\par 55 | It's the expected value of the random variable squared\par 56 | minus the expected value of the random variable,\par 57 | the whole thing squared.\par 58 | And so, this is the same as\par 59 | writing the expected value of when we square x plus y.\par 60 | It's the expected value of x squared\par 61 | plus 2 xy plus y squared,\par 62 | and here we have minus the expected value of x\par 63 | plus the 
expected value of y, the whole thing squared.\par 64 | And this, we can open up.\par 65 | We can say this is the expected value of x squared\par 66 | plus twice the expected value of xy\par 67 | plus the expected value of y squared minus,\par 68 | and when we open this product,\par 69 | it's the expected value of x, the whole thing squared\par 70 | plus twice the expected value of x\par 71 | times the expected value of y\par 72 | plus the expected value of y,\par 73 | the whole quantity squared.\par 74 | So, we can regroup things, and we get\par 75 | this is the same as the expected value of x squared\par 76 | minus the square of the expected value of x\par 77 | plus the expected value of y squared\par 78 | minus the square of the expected value of y\par 79 | plus twice the expected value of xy\par 80 | minus twice the expected value of x\par 81 | times the expected value of y.\par 82 | Now, the first two terms are the variance of x,\par 83 | and the next two terms are the variance of y,\par 84 | and what we're left with is\par 85 | twice the expected value of xy\par 86 | minus twice the expected value of x\par 87 | times the expected value of y.\par 88 | So, the answer as to whether the variance of x plus y\par 89 | is equal to the variance of x plus the variance of y\par 90 | depends on whether the expected value of xy\par 91 | equals the expected value of x\par 92 | times the expected value of y.\par 93 | So, this is the question that\par 94 | we really probably should have asked before.\par 95 | We saw that the expected value of x plus y\par 96 | is the sum of the expectations;\par 97 | now the question is whether the expectation of the product\par 98 | is the product of the expectations, namely, do expectations multiply?\par 99 | So, that is an even more basic question\par 100 | than whether the variances add,\par 101 | and that's what we want to look at next.\par 102 | So, this is what we're going to do in a separate video\par 103 | because it would take us some time to discuss this.\par 104 | See you then.\par 105 | End of transcript. 
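To see both claims numerically, here is a rough simulation sketch (an added example, not part of the course notebooks); the dependent variable Z below is an arbitrary illustrative choice.

```python
# Expectations add for any X and Y; variances add only when E[XY] = E[X]E[Y]
# (which holds, for example, when X and Y are independent).
import random

def mean(xs):
    return sum(xs) / len(xs)

def var(xs):
    m = mean(xs)
    return mean([(x - m) ** 2 for x in xs])

n = 200000
X = [random.randint(1, 6) for _ in range(n)]   # fair die
Y = [random.randint(1, 6) for _ in range(n)]   # independent second die
Z = [x + random.randint(0, 1) for x in X]      # strongly dependent on X

print(mean([x + y for x, y in zip(X, Y)]), mean(X) + mean(Y))   # ~equal: E[X+Y] = EX + EY
print(mean([x + z for x, z in zip(X, Z)]), mean(X) + mean(Z))   # ~equal even with dependence

print(var([x + y for x, y in zip(X, Y)]), var(X) + var(Y))      # ~equal (independent)
print(var([x + z for x, z in zip(X, Z)]), var(X) + var(Z))      # clearly different (dependent)
```

The gap in the last printed pair is roughly twice E[XZ] minus E[X] times E[Z], exactly the leftover covariance term from the derivation above.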
Skip to the start.\par 106 | POLL\par 107 | \par 108 | \par 109 | Which of the following equation(s) is/are true?\par 110 | \par 111 | a) E(X+2Y) = EX + 2EY \par 112 | b) E(X+Y2) = EX + (EY)2 \par 113 | \par 114 | \tab\par 115 | a\par 116 | \par 117 | \tab\par 118 | b\par 119 | \par 120 | \tab\par 121 | a, b\par 122 | \par 123 | \tab\par 124 | None of the above\par 125 | } 126 | -------------------------------------------------------------------------------- /Week 5 Random Variables, Expectation, and Variance/Week_5_Part_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 5 Random Variables, Expectation, and Variance/Week_5_Part_1.pdf -------------------------------------------------------------------------------- /Week 5 Random Variables, Expectation, and Variance/Week_5_Part_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 5 Random Variables, Expectation, and Variance/Week_5_Part_2.pdf -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/1 Distribution Families.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hello and welcome back.\par 4 | So we have introduced random variables,\par 5 | and in this lecture I thought I'll just start\par 6 | and present what we're going to talk about next.\par 7 | So in relative, the random variables typically belong\par 8 | to certain families of distributions.\par 9 | And as you can see on this slide,\par 10 | there are many distribution families.\par 11 | But don't worry, we are going to focus on\par 12 | the most natural and most important ones,\par 13 | those that are significant\par 14 | both theoretically and practically.\par 15 | So we'll talk about very few,\par 16 | and I'm going to quickly tell you what we'll do for them\par 17 | in this very brief presentation.\par 18 | So the distribution we'll talk about\par 19 | will be for the discrete distributions.\par 20 | We'll talk about the Bernoulli distribution,\par 21 | binomial distributions, Poisson,\par 22 | and geometric distributions.\par 23 | And then after that,\par 24 | we will introduce continuous distributions\par 25 | and then we'll discuss uniform, exponential,\par 26 | and normal distributions.\par 27 | So for each of those distributions,\par 28 | what we are going to do is we will motivate them\par 29 | and discuss some applications.\par 30 | We'll provide the formulation for this distribution\par 31 | or this distribution family.\par 32 | And then we'll visualize them and see what they look like.\par 33 | And we'll provide some examples,\par 34 | and then we'll describe some properties,\par 35 | and typically we'll describe the mean, the variance,\par 36 | and the standard deviations.\par 37 | And for some distributions\par 38 | we'll discuss other properties as well.\par 39 | And then the notebooks that you have\par 40 | have some Python implementations\par 41 | so you can further plot the distributions\par 42 
| and experiment with them and do certain other things.\par 43 | Okay?\par 44 | And now, when you want to show\par 45 | that a certain function is a distribution,\par 46 | we need to establish that we know two things.\par 47 | We need to show that it's nonnegative\par 48 | and we need to show that the values sum to one.\par 49 | Showing that the values are nonnegative\par 50 | is typically obvious,\par 51 | and showing that it sums to one takes a little more effort.\par 52 | So to make it maybe slightly less boring\par 53 | we're going to borrow a page from a company called Blendtec.\par 54 | They make a product which is fairly standard\par 55 | and uninteresting.\par 56 | It's a blender.\par 57 | And so to attract more people to them,\par 58 | they came up with a campaign\par 59 | that features the founder, Tom Dickson.\par 60 | And it's a commercial campaign.\par 61 | It's called Will it Blend?\par 62 | And in this commercial, Tom Dickson, the founder,\par 63 | takes different things and he checks whether they'll blend.\par 64 | So for example,\par 65 | he checks whether a Nike shoe will blend,\par 66 | or whether a garden rake will blend,\par 67 | or whether an iPhone will blend,\par 68 | or whether a Justin Bieber doll will blend,\par 69 | and so on.\par 70 | And typically the answer is the same,\par 71 | and that is yes, these will blend.\par 72 | So we are just going to try to slightly mimic this\par 73 | and we will ask, when we want to see whether it sums to one,\par 74 | we will ask will it add?\par 75 | And we will see\par 76 | whether these things add or don't add to one.\par 77 | Next we're going to start with\par 78 | the first collection of distributions\par 79 | which will be the Bernoulli distribution.\par 80 | See you then.\par 81 | End of transcript. Skip to the start.\par 82 | Previous\par 83 | } 84 | -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/5 Geometric.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 POLL\par 4 | \par 5 | Which of the following distributions is memoryless?\par 6 | \par 7 | \tab\par 8 | Poisson distribution\par 9 | \par 10 | \tab\par 11 | Geometric distribution\par 12 | \par 13 | \tab\par 14 | Both\par 15 | \par 16 | \tab\par 17 | None of them\par 18 | } 19 | -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/6_conditional_probability_hw.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "nbgrader": { 7 | "grade": false, 8 | "locked": false, 9 | "solution": false 10 | } 11 | }, 12 | "source": [ 13 | " **IMPORTANT: ** When submitting this homework notebook, please modify only the cells that start with:<\/font>\n", 14 | "\n", 15 | "```python\n", 16 | "# modify this cell\n", 17 | "```" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Conditional Probability and Baye's Rule\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Problem" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "There are two urns $A$ and $B$. 
Urn $A$ contains $r_A$ red balls and $w_A$ white balls whereas urn $B$ contains $r_B$ red balls and $w_B$ white balls. One of the urns is picked at random and then one ball is picked at random from this urn. Write a function **conditional_probability** that calculates the conditional probability that the randomly chosen ball belonged to urn $A$ given that it is white. Assume that $\\frac{r_A}{w_A}\\neq\\frac{r_B}{w_B}$.\n", 39 | "\n", 40 | " **Code**<\/font>\n", 41 | "```python\n", 42 | "rA, wA, rB, wB = 1., 2., 2., 1.\n", 43 | "conditional__probability(rA, wA, rB, wB) \n", 44 | "```\n", 45 | "\n", 46 | " **Output**<\/font>\n", 47 | "```\n", 48 | "0.6666666666666666\n", 49 | "```" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": true, 57 | "scrolled": true 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "# modify this cell\n", 62 | "\n", 63 | "def conditional__probability(rA, wA, rB, wB):\n", 64 | " # inputs: all of them are of type 'float'\n", 65 | " # output: a variable of type 'float'\n", 66 | " \n", 67 | " #\n", 68 | " # YOUR CODE HERE\n", 69 | " #\n" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": true, 77 | "nbgrader": { 78 | "grade": true, 79 | "grade_id": "ex1", 80 | "locked": true, 81 | "points": "5", 82 | "solution": false 83 | } 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "assert( abs(conditional__probability(2., 4., 3., 3.) -0.5714285714285715) < 10**-5) \n", 88 | "assert( abs(conditional__probability(1., 3., 5., 2.) -0.7241379310344829) < 10**-5) \n", 89 | "\n", 90 | "#\n", 91 | "# AUTOGRADER TEST - DO NOT REMOVE\n", 92 | "#\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "\n", 104 | "\n", 105 | "\n", 106 | "\n", 107 | "\n", 108 | "\n", 109 | "\n" 110 | ] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 2", 116 | "language": "python", 117 | "name": "python2" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 2 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text\/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython2", 129 | "version": "2.7.12" 130 | }, 131 | "toc": { 132 | "colors": { 133 | "hover_highlight": "#DAA520", 134 | "navigate_num": "#000000", 135 | "navigate_text": "#333333", 136 | "running_highlight": "#FF0000", 137 | "selected_highlight": "#FFD700", 138 | "sidebar_border": "#EEEEEE", 139 | "wrapper_background": "#FFFFFF" 140 | }, 141 | "moveMenuLeft": true, 142 | "nav_menu": { 143 | "height": "48px", 144 | "width": "252px" 145 | }, 146 | "navigate_menu": true, 147 | "number_sections": true, 148 | "sideBar": true, 149 | "threshold": 4, 150 | "toc_cell": false, 151 | "toc_section_display": "block", 152 | "toc_window_display": false, 153 | "widenNotebook": false 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 2 158 | } -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/Problem Set 6 _ 6.12 Problem Sets _ DSE210x Courseware _ edX.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 
6 Discrete and Continuous Distribution/Problem Set 6 _ 6.12 Problem Sets _ DSE210x Courseware _ edX.pdf -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/Programming Assignment _ 6.13 Programming Assignment _ DSE210x Courseware _ edX.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 6 Discrete and Continuous Distribution/Programming Assignment _ 6.13 Programming Assignment _ DSE210x Courseware _ edX.pdf -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/Quiz 6 _ 6.11 Comprehension Quiz _ DSE210x Courseware _ edX.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 6 Discrete and Continuous Distribution/Quiz 6 _ 6.11 Comprehension Quiz _ DSE210x Courseware _ edX.pdf -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/Week_6_Part_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 6 Discrete and Continuous Distribution/Week_6_Part_1.pdf -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/Week_6_Part_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 6 Discrete and Continuous Distribution/Week_6_Part_2.pdf -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/Week_6_Part_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 6 Discrete and Continuous Distribution/Week_6_Part_3.pdf -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/Week_6_Part_4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 6 Discrete and Continuous Distribution/Week_6_Part_4.pdf -------------------------------------------------------------------------------- /Week 7 Inequalities and Limit Theorems/Problem Set 7 _ 7.9 Problem Sets _ DSE210x Courseware _ edX.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 7 Inequalities and Limit Theorems/Problem Set 7 _ 7.9 Problem Sets _ DSE210x Courseware _ edX.pdf -------------------------------------------------------------------------------- /Week 7 
Inequalities and Limit Theorems/Programming Assignment _ 7.10 Programming Assignment _ DSE210x Courseware _ edX.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 7 Inequalities and Limit Theorems/Programming Assignment _ 7.10 Programming Assignment _ DSE210x Courseware _ edX.pdf -------------------------------------------------------------------------------- /Week 7 Inequalities and Limit Theorems/Quiz 7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 7 Inequalities and Limit Theorems/Quiz 7.pdf -------------------------------------------------------------------------------- /Week 7 Inequalities and Limit Theorems/inequalities_HW.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | " **IMPORTANT: ** When submitting this homework notebook, please modify only the cells that start with:<\/font>\n", 8 | "\n", 9 | "```python\n", 10 | "# modify this cell\n", 11 | "```" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "**Note:** notice that no packages are imported for this assignment. This is because you do not need any python packages." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "# Probability Inequalities\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "For the binomial distribution $X\\sim B_{p,n}$ with mean $\\mu=np$ and variance $\\sigma^2=np(1-p)$, \n", 33 | "we would like to upper bound the probability $P(X\\ge c\\cdot \\mu)$ for $c\\ge1$. 
\n", 34 | "The lectures introduced three bounds:\n", 35 | "\n", 36 | "Markov: $$P(X\\ge \\alpha\\mu)\\le \\frac{1}{\\alpha},\\quad\\quad\\forall \\alpha\\ge 1,$$\n", 37 | "Chebyshev: $$P(|X-\\mu|\\ge \\alpha\\sigma)\\le \\frac{1}{\\alpha^2},\\quad\\quad \\forall \\alpha\\ge 1,$$\n", 38 | "Note that, while double-sided, this inequality also bounds $P(X\\ge\\mu+\\alpha)$\n", 39 | "$$P(X\\ge \\mu+\\alpha\\sigma)\\le P(|X-\\mu|\\ge \\alpha\\sigma)\\le \\frac{1}{\\alpha^2},$$\n", 40 | "Chernoff: $$P(X\\ge (1+\\delta)\\mu)\\le e^{-\\frac{\\delta^2}{2+\\delta}\\mu},\\quad\\quad\\forall \\delta\\ge0.$$\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "from math import exp" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "import exponential function exp from math" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Problem 1" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "Write three functions **Markov**, **Chebyshev** and **Chernoff** that take $n$, $p$ and $c$ as inputs and return the upper bounds for $P(X\\ge c\\cdot np)$ given by the above Markov, Chebyshev, and Chernoff inequalities as outputs.\n", 73 | "\n", 74 | " **Code:**<\/font>\n", 75 | "```python\n", 76 | "print Markov(100.,0.2,1.5)\n", 77 | "print Chebyshev(100.,0.2,1.5)\n", 78 | "print Chernoff(100.,0.2,1.5)\n", 79 | "```\n", 80 | "\n", 81 | "\n", 82 | " **Output**<\/font>\n", 83 | "```\n", 84 | "0.6666666666666666\n", 85 | "0.16\n", 86 | "0.1353352832366127\n", 87 | "\n", 88 | "```" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": true 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "\n", 100 | "# modify this cell\n", 101 | "\n", 102 | "def Markov(n, p, c):\n", 103 | " # inputs: 3 floats as described above\n", 104 | " # output: a variable of type float\n", 105 | " \n", 106 | " #\n", 107 | " # YOUR CODE HERE\n", 108 | " #\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "\n", 120 | "# modify this cell\n", 121 | "\n", 122 | "def Chebyshev(n, p, c):\n", 123 | " # inputs: 3 floats as described above\n", 124 | " # output: a variable of type float\n", 125 | " \n", 126 | " #\n", 127 | " # YOUR CODE HERE\n", 128 | " #\n" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "\n", 140 | "# modify this cell\n", 141 | "\n", 142 | "def Chernoff(n, p, c):\n", 143 | " # inputs: 3 floats as described above\n", 144 | " # output: a variable of type float\n", 145 | " \n", 146 | " #\n", 147 | " # YOUR CODE HERE\n", 148 | " #\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "nbgrader": { 156 | "grade": true, 157 | "grade_id": "ex1", 158 | "locked": true, 159 | "points": "5", 160 | "solution": false 161 | } 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "assert (Markov(200.,0.25,1.25)-0.8)< 10**-5\n", 166 | "assert (Chebyshev(100.,0.25,1.25)-0.48)< 10**-5\n", 167 | "assert (Chernoff(100.,0.25,1.25)-0.4993517885992762)< 10**-5\n", 168 | "#\n", 169 | "# AUTOGRADER TEST - DO NOT REMOVE\n", 170 | "#\n" 
171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "\n", 182 | "\n", 183 | "\n", 184 | "\n", 185 | "\n", 186 | "\n", 187 | "\n", 188 | "\n" 189 | ] 190 | } 191 | ], 192 | "metadata": { 193 | "kernelspec": { 194 | "display_name": "Python 2", 195 | "language": "python", 196 | "name": "python2" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 2 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text\/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython2", 208 | "version": "2.7.12" 209 | }, 210 | "toc": { 211 | "colors": { 212 | "hover_highlight": "#DAA520", 213 | "navigate_num": "#000000", 214 | "navigate_text": "#333333", 215 | "running_highlight": "#FF0000", 216 | "selected_highlight": "#FFD700", 217 | "sidebar_border": "#EEEEEE", 218 | "wrapper_background": "#FFFFFF" 219 | }, 220 | "moveMenuLeft": true, 221 | "nav_menu": { 222 | "height": "48px", 223 | "width": "252px" 224 | }, 225 | "navigate_menu": true, 226 | "number_sections": true, 227 | "sideBar": true, 228 | "threshold": 4, 229 | "toc_cell": false, 230 | "toc_section_display": "block", 231 | "toc_window_display": false, 232 | "widenNotebook": false 233 | } 234 | }, 235 | "nbformat": 4, 236 | "nbformat_minor": 2 237 | } -------------------------------------------------------------------------------- /Week 8 Statistics and Parameter Estimation/1. Stats.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hello, and welcome back.\par 4 | So far, we talked about probability\par 5 | where everything was designed by us.\par 6 | Like, we said, okay, this is a uniform distribution,\par 7 | it behaves exactly in that way.\par 8 | Or this is a geometric distribution or binomial.\par 9 | Everything was very clean and very precise\par 10 | and behaved exactly the way we wanted it.\par 11 | And now, we're going to move to the real world\par 12 | where things are not exactly the way they're planned.\par 13 | They look a little different, they behave\par 14 | a little differently, sometimes not what we what we expect\par 15 | or not even what we want, but still\par 16 | we need to deal with them.\par 17 | So, that's what we're going to do\par 18 | in the statistics part of this course.\par 19 | So, in some sense, probability and statistics\par 20 | are somewhat of opposites of each other.\par 21 | In probability, as we said, we assume some distribution,\par 22 | we come up with it, and this,\par 23 | and then we take samples and we say,\par 24 | here are the properties of the samples that we take.\par 25 | For example, we have a distribution,\par 26 | we can define the mean, mu, to be the summation\par 27 | of X times P of X, and then we say that if we take\par 28 | many samples, then the average value\par 29 | is going to be roughly mu.\par 30 | Or, if we have a distribution over values\par 31 | that are non-negative, so the random variable\par 32 | will be non-negative, then as we saw for Markov's\par 33 | inequality, the probability that we'll get a value\par 34 | which is bigger than twice the mean,\par 35 | we calculate it here, is at most half.\par 36 | And in statistics, it's, 
as we said,\par 37 | a little bit the opposite.\par 38 | We get samples, and from the samples we want\par 39 | to deduce properties of the distribution,\par 40 | or maybe what the distribution is.\par 41 | So we want to deduce some parameters\par 42 | of the distribution, for example, the mean\par 43 | or the standard deviation, or we want to say\par 44 | what type of distribution it is.\par 45 | Is it Gaussian, is it geometric,\par 46 | or maybe it's none of those.\par 47 | The first thing we're going to do\par 48 | is we're going to look at estimating\par 49 | distribution parameters.\par 50 | So, most distribution families that we saw\par 51 | are determined by parameters.\par 52 | For example, if we have Bernoulli distribution,\par 53 | it's determined by the success probability, P.\par 54 | If we have a binomial distribution,\par 55 | it's determined by the same P and also the number\par 56 | of samples that we take.\par 57 | If we have Poisson distribution,\par 58 | it's determined by the parameter lambda,\par 59 | which is the mean, and so on.\par 60 | If you have a geometric or a uniform\par 61 | or exponential or normal distribution,\par 62 | all of them are determined by parameters.\par 63 | So these are distribution parameters.\par 64 | But, you can view parameters more generally\par 65 | to be any deterministic function of the distribution.\par 66 | And sometimes these are called properties.\par 67 | So for example you can say that the mean,\par 68 | if you have a distribution, for example,\par 69 | binomial PN, then the mean is a parameter\par 70 | of the distribution, in this case it's P times N.\par 71 | Or the variance of the distribution here it's going\par 72 | to be NPQ, that's something that's determined\par 73 | by the distribution and so it's a parameter\par 74 | of the distribution.\par 75 | Or, the standard deviation, or we can take\par 76 | the min or max values.\par 77 | For example, for a geometric distribution,\par 78 | then the smallest value, the min value\par 79 | is the smallest value that has positive probability,\par 80 | which will be one, and the maximum value will be infinity.\par 81 | Or we can look at the mode, which is the value\par 82 | that has the highest probability.\par 83 | All of these are determined by the distribution\par 84 | of interest, and you might want to find\par 85 | what they are from from samples, okay, or the median.\par 86 | So the way we do it is by sampling\par 87 | from the distribution.\par 88 | So a distribution could be discrete,\par 89 | in which case we call it P,\par 90 | it's a probability mass function.\par 91 | Or distribution can be continuous, in which case\par 92 | we have a probability density function, F.\par 93 | And then we're going to take independent samples\par 94 | from P or from F.\par 95 | And we denote the samples by X superscript N,\par 96 | it's short for X one, X two up to X N, which as we said\par 97 | they're chosen independently from P.\par 98 | So they're chosen from P and independently of each other.\par 99 | And from these samples we want to deduce\par 100 | properties of the distribution.\par 101 | Or, instead of looking at distribution,\par 102 | what we often want to do is look at populations.\par 103 | So, a population is a collection of objects,\par 104 | typically many of them.\par 105 | For example we can take all students at UCSD\par 106 | so that's a population of students.\par 107 | Or all patients in a hospital, that's a population.\par 108 | And we want to deduce properties of this 
population.\par 109 | So what we do is we sample N objects\par 110 | from this collection of object.\par 111 | And typically N is much smaller\par 112 | than the population size, so we don't want to take\par 113 | all students at UCSD but we want to take\par 114 | a small sample, called it number N,\par 115 | and from that sample we want to deduce\par 116 | properties of the population as a whole.\par 117 | So in this case we pick for example N students at random,\par 118 | and we want to, as we said, to deduce population parameter\par 119 | from the samples.\par 120 | For example, maybe you want to deduce the average height\par 121 | of all students at UCSD by just sampling 100 of them.\par 122 | And so we can view, so this might look like\par 123 | it's a different problem, because here\par 124 | we have a physical population that we're sampling from.\par 125 | It might look like it's a different problem\par 126 | from estimating parameters of a distribution.\par 127 | But in fact we can view it as the same.\par 128 | So, we can view, for example, if we're looking\par 129 | at the heights, we can view the collection\par 130 | of heights as a distribution.\par 131 | So we have maybe, a person whose height is, you know,\par 132 | five foot, or five foot one, and another person's height\par 133 | is five feet, and so on.\par 134 | So we have all these, and now what we're going to do\par 135 | is we're going to sample from them.\par 136 | So we sample from the population is like sampling\par 137 | from the distribution that has so many,\par 138 | so many people with given height,\par 139 | and so many people with another height, and so on.\par 140 | So we're just sampling from them uniformly\par 141 | from this collection.\par 142 | And there's a small difference\par 143 | between this and this sampling that we had\par 144 | in the previous slide, in the sense that\par 145 | before they were IID and here,\par 146 | if we're sampling from the population,\par 147 | then we're picking a set of people,\par 148 | and these people are going to be distinct.\par 149 | So, it's not exactly IID.\par 150 | For example, if you have a population of size two,\par 151 | and you pick two of them, you know that\par 152 | they are different, and you couldn't pick\par 153 | the same person twice.\par 154 | But, in what we're going to look at\par 155 | N, the number of samples that you pick\par 156 | is much smaller than the population size.\par 157 | And in that case, the probability of repeats\par 158 | if you pick them independently will be\par 159 | fairly small, so we can view what we have\par 160 | as roughly independent.\par 161 | So under this assumption that N the sample size\par 162 | is much smaller than the population,\par 163 | then the selection, even though we're selecting\par 164 | without replacement will be very similar\par 165 | to selecting with replacement, because the probability\par 166 | that we'll get repeats is small.\par 167 | If we get repeats, there will be very few of them.\par 168 | So, with this assumption we can therefore\par 169 | assume that we have the same problem of estimating\par 170 | parameters of population as estimating\par 171 | parameters of a distribution.\par 172 | So, when we have this sample, then we're going\par 173 | to look at the sample and we're going to look\par 174 | at functions of the data.\par 175 | For example, the average of all the values\par 176 | that we get from the data,\par 177 | or the maximum value that we observe.\par 178 | And any function 
of the data\par 179 | is going to be called a statistic.\par 180 | What we want to do is we want to use the statistics\par 181 | to infer properites of the distribution or the population.\par 182 | So I want to look for example at the average\par 183 | or maximum value that we observed in our sample,\par 184 | and from these things we want to deduce\par 185 | some properties of the distribution or the population.\par 186 | And for example, we may want to deduce\par 187 | the parameter, like the mean of the distribution\par 188 | or the maximum of all elements in the population, and so on.\par 189 | Or we may want to deduce the type\par 190 | of distribution that's in effect.\par 191 | What we're going to do in the rest\par 192 | of the presentations in this sequence\par 193 | is see how to do this, and how to do this well.\par 194 | And so this was just a brief introduction\par 195 | to what we're going to discuss in the next presentations.\par 196 | Next, we're going to talk about possibly\par 197 | the simplest problem, which is estimating\par 198 | the mean of a distribution.\par 199 | And so that's what we're going to do next time,\par 200 | and see you then.\par 201 | End of transcript. Skip to the start.\par 202 | POLL\par 203 | } 204 | -------------------------------------------------------------------------------- /Week 8 Statistics and Parameter Estimation/Problem Set 8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 8 Statistics and Parameter Estimation/Problem Set 8.pdf -------------------------------------------------------------------------------- /Week 8 Statistics and Parameter Estimation/Programming Assignment 8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 8 Statistics and Parameter Estimation/Programming Assignment 8.pdf -------------------------------------------------------------------------------- /Week 8 Statistics and Parameter Estimation/Quiz 8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 8 Statistics and Parameter Estimation/Quiz 8.pdf -------------------------------------------------------------------------------- /Week 8 Statistics and Parameter Estimation/Week_8_Part_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 8 Statistics and Parameter Estimation/Week_8_Part_1.pdf -------------------------------------------------------------------------------- /Week 8 Statistics and Parameter Estimation/Week_8_Part_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 8 Statistics and Parameter Estimation/Week_8_Part_2.pdf -------------------------------------------------------------------------------- /Week 9 Regression and PCA/3. 
Solving a System of Linear Equations.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hi, last time we talked about matrices,\par 4 | and now we will see the first use of matrices\par 5 | for solving a system of linear equations.\par 6 | So linear equations, the simplest case\par 7 | of system of linear equation\par 8 | is two equations with two unknowns.\par 9 | And that corresponds to finding a line\par 10 | that passes through two points.\par 11 | So we're given two points in the plane,\par 12 | minus one, two, and one, one.\par 13 | And we want to find the line\par 14 | that passes through these points.\par 15 | So here is the picture, here is our plane,\par 16 | and this is the point two, minus one.\par 17 | And this is the point one, one.\par 18 | And we want to find the line that would be\par 19 | a straight line going through these two points.\par 20 | So any line, other than the vertical line,\par 21 | can be expressed using this expression,\par 22 | y equals w zero, plus w one x,\par 23 | where w zero and w one are some numbers,\par 24 | and x and y are the points along the line\par 25 | connecting these points.\par 26 | W zero is the intercept with the y-axis,\par 27 | and y one is the slope of the line.\par 28 | So to identify the line that passes through the two points,\par 29 | we need to find the w zero and w one\par 30 | that satisfy the two constraints\par 31 | determined by the two points.\par 32 | So for the point minus one, two,\par 33 | if we write the expression, we see that w zero minus w one,\par 34 | should be equal to two.\par 35 | And for the point one, one, we see that w zero plus w one,\par 36 | should be equal to one.\par 37 | So these are two equations with two unknowns,\par 38 | probably know how to solve this directly,\par 39 | but let's do it through matrices\par 40 | because that will generalize.\par 41 | So we want to write these equations in matrix form.\par 42 | So here is the way that we can write it,\par 43 | we can write the matrix, one, minus one,\par 44 | one, one, times the vector, w zero, w one,\par 45 | is equal to the vector, two, one.\par 46 | If you basically write out the two dot products\par 47 | that correspond to the first row and the second row,\par 48 | you see that you get exactly the equations\par 49 | that we had before, and now we can just\par 50 | represent this as, a w times b,\par 51 | a times w, equals b.\par 52 | If we basically call this a, this w, and this b.\par 53 | So this is what we do.\par 54 | So a is called the matrix of coefficient,\par 55 | b is the ordinate or dependent variable vector,\par 56 | and the parameter vector is w.\par 57 | It's this part that we don't know.\par 58 | So how can we find w?\par 59 | So we want to find the w, such that a w equal to b.\par 60 | Now if a is invertible, so a is square,\par 61 | but if it is also invertible, which it is in this case,\par 62 | then we can multiply both sides by a to the minus one,\par 63 | and we get that a to the minus one, times a w,\par 64 | is equal a to the minus one, times b.\par 65 | But that's also equal, on the left here,\par 66 | is equal to w, because a to the minus one, times a\par 67 | is the identity matrix.\par 68 | So here is how this looks when we use numpy.\par 69 | We have the matrix a and the column vector b,\par 
70 | and we want to find the inverse of a.\par 71 | So inverse of a, we just use the command inv(A)\par 72 | and we find the inverse,\par 73 | and then we check that it is indeed the inverse,\par 74 | so if you multiply a times the inverse of a,\par 75 | we get the identity matrix.\par 76 | And then the solution is inverse of a, times b.\par 77 | And so what we get here is that the solution\par 78 | is 1.5, minus 0.5, so that's the vector w\par 79 | that we need to use.\par 80 | Alternatively, we can just use the solve operation\par 81 | in numpy, and that will give us the solution directly.\par 82 | Now that we have the vector w, we want to define the line\par 83 | that is represented by w, and as we've said,\par 84 | this is simply w zero, plus w one, times x.\par 85 | So we define a function\par 86 | that gives you the value y for every value of x,\par 87 | w is fixed here, and what we see is that if we basically\par 88 | write f of minus one, it's two,\par 89 | and f of one is one, so indeed we see that the line\par 90 | goes through the two points we wanted it to go through.\par 91 | And when we draw it, we can see\par 92 | that this is indeed the case.\par 93 | So we found the line that goes through\par 94 | the two points that we were given.\par 95 | Now that was two points, but what about\par 96 | if we have more than two points?\par 97 | So in general, if you have more than two points\par 98 | on the two-dimensional plane,\par 99 | there is no line that goes through these points.\par 100 | So here is an example.\par 101 | Here is one, two points that we had before,\par 102 | and here is a third point.\par 103 | Now this third point doesn't lie\par 104 | on the line connecting these two,\par 105 | so there is no line that would go through all of them.\par 106 | Still, many times we do want to solve problems\par 107 | that have more points than we have dimensions.\par 108 | So when the number of points is larger\par 109 | than the number of dimensions,\par 110 | we say that the system is over-determined.\par 111 | That means that there is no line\par 112 | that goes exactly through the points.\par 113 | However, we do still want to find a line\par 114 | that passes close to the points.\par 115 | So that's what we're going to talk about in the next video.\par 116 | See you then.\par 117 | End of transcript. Skip to the start.\par 118 | POLL\par 119 | } 120 | -------------------------------------------------------------------------------- /Week 9 Regression and PCA/4. 
Linear Regression.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hi, last time we talked about\par 4 | finding a line that passes through two points on the plane.\par 5 | And we raised the question at the end about\par 6 | what about having more than two points.\par 7 | Can we find a line that passes\par 8 | close to these points?\par 9 | Okay, so that's basically the idea of regression and\par 10 | this video is going to introduce you to the\par 11 | notion of regression and\par 12 | how we solve it using numpy.\par 13 | So here's a small example,\par 14 | we have nine points on the plane,\par 15 | defined by their x, y positions,\par 16 | so here are our points, okay?\par 17 | So these are the points, and clearly there is no single line\par 18 | that passes through all of these points.\par 19 | But, also clearly, there is a line that passes very close to\par 20 | all of them, okay this line that is tending upwards.\par 21 | So, how can we find that line?\par 22 | So the line is going to be defined as before,\par 23 | by a function of the form w zero plus w one times x.\par 24 | And we want to find w zero and w one.\par 25 | So previously, we saw how to find that when\par 26 | there are just two points, and so there is a line\par 27 | that passes exactly through the points,\par 28 | and then it was just matrix inversion.\par 29 | Here, there's more than two points,\par 30 | and the system is overconstrained, there is no straight line\par 31 | that passes through all the points.\par 32 | So, while they don't fall,\par 33 | there's no line that falls exactly\par 34 | on all of the points, we can find a line\par 35 | that will be close to the points,\par 36 | but we need to define somehow what we mean by close.\par 37 | So what we are going to use\par 38 | is this idea of square difference.\par 39 | Okay, so for every point, xi yi, we're going to calculate\par 40 | the value of the line at that x,\par 41 | and then take the difference from that and y,\par 42 | which is the actual position of the point,\par 43 | and square that, why do we square it?\par 44 | Because we always want to be positive,\par 45 | if we're not exactly at the point.\par 46 | If we're exactly at the point, we have zero.\par 47 | So, we want this, this cost,\par 48 | the square cost to be a function that is bigger and bigger\par 49 | the further you are from the points,\par 50 | and our goal is to find a minimum.\par 51 | So, this method of looking for,\par 52 | for minimizing the squared difference\par 53 | is called the least square method,\par 54 | and we are going to look for the least square solution.\par 55 | Okay, so we're going to use matrix notation,\par 56 | and we're going to use numpy linalg,\par 57 | the library, to find this optimal vector w,\par 58 | that minimizes the square error.\par 59 | So, we're going to use the following matrices,\par 60 | first we're going to define A to be this matrix,\par 61 | where there is one column that is just all ones,\par 62 | and the second column is the x values,\par 63 | and then we're going to have\par 64 | a column vector that is all the y values,\par 65 | and then we're going to have a small column vector\par 66 | with just two values, that is going to be the weight\par 67 | vector that we're looking for.\par 68 | So, now we can find, we 
can define the errors\par 69 | to be aw, which gives us the,\par 70 | the y vectors as measured by fx,\par 71 | as computed by fx, minus y, that's the difference,\par 72 | and that's, now, it's the difference vector.\par 73 | And what we're interested is in the sum of the squares\par 74 | of the differences, and that turns out to be\par 75 | exactly the square of the norm of d,\par 76 | remember the norm is the length of d, we're looking for\par 77 | the w that will make d as short as possible.\par 78 | Okay, so how do we do this in numpy?\par 79 | We define the vectors a and y, as I said,\par 80 | I'm just printing here a transposed and y transposed,\par 81 | so that they fit nicely in the slide.\par 82 | And, then we just call the,\par 83 | the function in numpy called least square,\par 84 | give it the matrix a, and the vector y,\par 85 | and we're just interested in the perimeter,\par 86 | so we're going to take the first component\par 87 | of the answer from this, and that's the vector w,\par 88 | and if we print out w we see that it's 19,\par 89 | and 0.7166 and so on, okay, so this is w zero,\par 90 | the offset, and this is the slope.\par 91 | And if we now plot this line\par 92 | on the previous graph,\par 93 | we see that it is indeed like the line that we expected,\par 94 | and the little green segments here\par 95 | represent the errors, the differences,\par 96 | so you see that for most of the points\par 97 | the difference is very small,\par 98 | for some points the difference is significantly larger.\par 99 | But, this is the line that would minimize\par 100 | the total of the square of the lengths\par 101 | of these green lines.\par 102 | Okay, so that was a toy example,\par 103 | just to show you how you do this kind of thing,\par 104 | in a small number of examples where you can\par 105 | essentially see everything and,\par 106 | and it makes sense.\par 107 | In real life, we usually have not just\par 108 | nine or 10 or 20 points,\par 109 | we have a huge number of points, and we want to\par 110 | find a line that passes through close to the,\par 111 | to all of these points.\par 112 | So, here is a real data set,\par 113 | which has 25,000 people, their height and their weight.\par 114 | The height in inches, and the weight in pounds.\par 115 | And, if we solve\par 116 | the least square for\par 117 | this data set, what we get is the following,\par 118 | is, this is, these are our points,\par 119 | you see it's now a cloud of points,\par 120 | we have a huge number of points,\par 121 | and the red line is the best line that passes through\par 122 | those points, so what this red line tells us,\par 123 | is, not surprisingly, that as weight,\par 124 | as the height increases, the weight also tends to increase.\par 125 | But, this is by no means explaining all of the variation\par 126 | in the weight, for the same, for the same height\par 127 | you have a big variation in the,\par 128 | in the weight of the person.\par 129 | Okay, but this is the, what we would call\par 130 | the linear regression line.\par 131 | So, to get to the slightly\par 132 | more refined understanding of what this line tells us,\par 133 | it is useful to draw what is called the graph of averages.\par 134 | So this is the graph of averages,\par 135 | and basically what I've done is that I split the\par 136 | height into many ranges,\par 137 | of about one inch I think,\par 138 | and for each of these ranges, I found the,\par 139 | the mean, the mean value, and that mean value\par 140 | is the red 
dot, so that is called the graph of averages,\par 141 | and what you see from this graph of averages\par 142 | is that the tendency described by the,\par 143 | by the, the line that we found,\par 144 | is actually well-represented,\par 145 | well-representing the points of the averages,\par 146 | so the averages are indeed going, increasing linearly.\par 147 | So remember, this, this black line,\par 148 | we found it according, by minimizing the square error\par 149 | for all of the points, but it passes close to\par 150 | the center for these points.\par 151 | And only in the edges where we have very few examples,\par 152 | do we have significant deviation from that,\par 153 | okay, so if we ignore these very,\par 154 | very short or very tall people,\par 155 | that are outliers, we see that the linear graph\par 156 | is a good representation of the graph of averages.\par 157 | We'll see that, in some other cases, this is not the case.\par 158 | The, for every problem that we do in two dimensions,\par 159 | like here from weight, from height to weight,\par 160 | or vice versa, we have two regression lines.\par 161 | One is for, to predict\par 162 | the weight from the height,\par 163 | so if I give you the height of the person,\par 164 | I can find a function that is this straight line\par 165 | that will predict the weight,\par 166 | but I can also do it the other way, I can predict the\par 167 | height from the weight, okay?\par 168 | So, and the two functions will not coincide,\par 169 | so, here is the result of doing that,\par 170 | you see this is our red line,\par 171 | this red line is what we had, the previous line\par 172 | for predicting the weight from the height,\par 173 | the black line is predicting the height from the weight,\par 174 | so if you give me the, the weight,\par 175 | I will predict the height that is associated with it,\par 176 | so you see that these are two different lines,\par 177 | depending on what it is that we're trying to predict,\par 178 | and what we're using to predict it.\par 179 | So, in the next video,\par 180 | we're going to talk about polynomial regression,\par 181 | and that is a case in which\par 182 | linear regression might not be good enough,\par 183 | and we need to do something a little bit more sophisticated.\par 184 | So I'll see you then.\par 185 | End of transcript. Skip to the start.\par 186 | POLL\par 187 | } 188 | -------------------------------------------------------------------------------- /Week 9 Regression and PCA/5. 
Polynomial Regression.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Last time, we talked about linear regression,\par 4 | and we fit a straight line into the data, okay?\par 5 | This time we're going to look at fitting\par 6 | more complicated curves into the data,\par 7 | and we're going to use for that polynomial regression.\par 8 | Okay, so let's get started with a little review.\par 9 | When we had our previous video, we looked at this graph\par 10 | of averages; and we saw that the averages, the red dots,\par 11 | fall along a straight line most of the time.\par 12 | At the extremes they fall, they might deviate\par 13 | because there is so little data.\par 14 | Okay, but most of the time\par 15 | they're very close to the straight line.\par 16 | But we might get data that looks more like this, okay?\par 17 | So here the averages, the red dots,\par 18 | are not really along a straight line.\par 19 | So, it's not going to work very well\par 20 | if we try to do linear regression.\par 21 | So there's nothing stopping us from doing linear regression;\par 22 | we will just get poor results.\par 23 | So this is what we'll get.\par 24 | Here is the straight line, and you see that it's\par 25 | really doesn't capture the shape of the data, okay?\par 26 | So how do we try to capture the shape\par 27 | of this non-linear data?\par 28 | We can try a second degree polynomial, okay?\par 29 | So I'm not going into what polynomials are.\par 30 | You can go to the notebook, and they'll give you\par 31 | some pointers; but this is second degree polynomial.\par 32 | It's basically similar to the first degree polynomial\par 33 | in the first two terms, but then it has a second,\par 34 | a third term that is W2 times X squared, okay?\par 35 | And now we want to fit all\par 36 | of these three parameters to the data.\par 37 | So we do that, again, using a least square;\par 38 | and the only thing that is really different is that\par 39 | instead of just having ones and the values\par 40 | of the Y values, we have also the Y values squared.\par 41 | Okay, and once we have that, we can just use the same\par 42 | least square function to find these, the W0, W1, and W2;\par 43 | and we can plot them, and we see that now\par 44 | with this second degree polynomial, we got a very nice\par 45 | and smooth curves that goes through,\par 46 | again, most of the data, okay?\par 47 | So we can be happy about that.\par 48 | Okay, so now here is an interesting question.\par 49 | You're given some data and you want to fit to it a line\par 50 | or a curve, but you don't really know what degree polynomial\par 51 | to fit; you know, do you wanna fit a first degree,\par 52 | just a linear line or second or third or fourth or fifth?\par 53 | That's a very good question.\par 54 | You don't really know what would work best,\par 55 | so how do you decide?\par 56 | Okay, so there are two phenomena that appear\par 57 | when you try to do such a thing.\par 58 | One is called underfit, and that's what we saw\par 59 | before with the straight line.\par 60 | The straight line was not rich enough.\par 61 | It was not flexible enough in order to fit the data;\par 62 | so it underfit the data.\par 63 | And then there is the opposite problem of overfit.\par 64 | So you use a model and it fits the data too 
well.\par 65 | That sounds strange; what do I mean by\par 66 | it fits the data too well?\par 67 | Why is this a problem?\par 68 | It's a problem because we're not really interested always\par 69 | in just fitting the data that we see,\par 70 | we want to also fit new data that we haven't see yet\par 71 | that comes from the same distribution.\par 72 | So that leads to the concept of\par 73 | training error and test error.\par 74 | What we do is we take the data that we have,\par 75 | and we randomly partition it into two parts, okay?\par 76 | The two parts are essentially\par 77 | statistically equivalent, but they are disjoint.\par 78 | Each example is either in the training set\par 79 | or in the test set, and we use the training set\par 80 | to find the best polynomial; and then we use the test set\par 81 | to test that polynomial that we found on new data.\par 82 | And now we can express what we actually mean by overfitting.\par 83 | If we increase the degree of the polynomial,\par 84 | the training error, the error that we get\par 85 | on the training data, will always decrease.\par 86 | It just keeps decreasing because the polynomials are more\par 87 | and more flexible as you make them higher and higher degree;\par 88 | but if you look at the test error, you'll see that initially\par 89 | it behaves like the training error, but at some point,\par 90 | the training error continues to decrease\par 91 | and the test error will start to increase\par 92 | because we're overfitting the training error\par 93 | and then we're performing badly on the test data.\par 94 | Okay, so this increase is called overfitting.\par 95 | That's what we mean when we say overfitting.\par 96 | So we're going to use a simple data, data set\par 97 | to analyze this and here is the data set;\par 98 | and we see that it has a general tendency going up,\par 99 | and we might think that maybe it has,\par 100 | maybe a straight line would fit it well\par 101 | or maybe a polynomial would, sorry, a parabola\par 102 | would fit it well, a second degree polynomial.\par 103 | But we don't know, so let's see what we can do.\par 104 | Okay, so we split the data into training set and test set,\par 105 | as I said before; and now let's say that we fit\par 106 | degree three polynomial to the training sector.\par 107 | Okay, we get this thing; and what you see is that\par 108 | this second, third degree polynomial fits the blue dots\par 109 | very well because that's the training data.\par 110 | But it doesn't really fit the red dots very well.\par 111 | Okay, so, it doesn't perform well on the test data;\par 112 | and we can see that in the numbers up here the training\par 113 | root mean square error is 0.04\par 114 | and the test root mean square error is 0.5, okay?\par 115 | So the test root mean square error is significantly bigger\par 116 | than the train, and it seems like we're overfitting.\par 117 | But we can only really judge it if we try a bunch\par 118 | of degrees for the polynomials and see what fits best.\par 119 | So we're going to do that.\par 120 | We're going to fit all degrees from zero to five.\par 121 | So here they are; and what is degrees zero?\par 122 | Degree zero is basically a constant.\par 123 | We basically fit the whole data with the constant,\par 124 | and the constant turns out to be simply the mean.\par 125 | Okay, so with the mean, we have very poor fit.\par 126 | It's definitely underfitting, and it's,\par 127 | and the performance is pretty bad\par 128 | on both training and 
testing.\par 129 | When we go to first degree, we see that there is a nice fit\par 130 | to the training data, not perfect;\par 131 | but in the root mean square for the training data is 0.16,\par 132 | and the root mean square for the test data is 0/22.\par 133 | So higher than the training error, but not very high.\par 134 | Once we go to degree two, we see that the training error\par 135 | decreased somewhat; and the test error increased.\par 136 | And when we go to degree three, like we saw before,\par 137 | the training data, the training error decreases even more;\par 138 | but the test data increases very significantly.\par 139 | And what happens at this point is that once you have\par 140 | a degree four polynomial, you can fit the data perfectly.\par 141 | Right, so you can simply go through all of the points;\par 142 | and that's great training error,\par 143 | but the test error is very, very high.\par 144 | Okay, so from that we basically see\par 145 | that the best degree polynomial to choose\par 146 | is the first degree polynomial, the straight line.\par 147 | That was our intuition too, but it's kind of hard\par 148 | to depend on intuition when you have data\par 149 | that is very high dimensional or very large.\par 150 | So what we saw is that the minimum\par 151 | of the root mean square occurred of the equal one.\par 152 | And here is the graph that shows it.\par 153 | We see that the, that the training error keeps going down.\par 154 | The horizontal is a degree and this is\par 155 | the root mean square error; and if we look\par 156 | at the test error, it goes down to one,\par 157 | but then it starts to increase and increase.\par 158 | Okay, so that tells us that the minimum point here,\par 159 | the best model to use is simply a linear function,\par 160 | degree one polynomial.\par 161 | So these concepts that I just introduced,\par 162 | they're very central to statistics\par 163 | and to machine learning in general.\par 164 | So this is just kind of a first dip into them,\par 165 | and you will Learn much more about them\par 166 | when you go into the machine learning course.\par 167 | And in the next video, we'll talk about another\par 168 | related subject, which is principle component analysis.\par 169 | Thank you.\par 170 | End of transcript. Skip to the start.\par 171 | Previous\par 172 | } 173 | -------------------------------------------------------------------------------- /Week 9 Regression and PCA/6. 
Regression Towards the Mean.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - In this video, I want to tell you about something\par 4 | that is an interesting and curious phenomenon,\par 5 | and it is something that shows that statistics can\par 6 | sometimes result in situations that are unintuitive.\par 7 | And it's a very common thing, so it's worth knowing about.\par 8 | It's called regression to the mean.\par 9 | So, here's an example.\par 10 | Suppose students take a standard exam at two time points,\par 11 | maybe before and after a quarter,\par 12 | and they get two grades.\par 13 | The grades are, let's say, on a scale of 0-100,\par 14 | and the average grade is 70.\par 15 | Here is what tends to happen.\par 16 | Suppose that the first grade that the student got is 97.\par 17 | Then the second grade that the student gets\par 18 | is likely to be lower than 97.\par 19 | And the opposite is also true.\par 20 | If the first grade the student got is 55,\par 21 | then the second grade is likely to be higher than 55.\par 22 | So, remember, the mean is 70, and this is what we call\par 23 | regression to the mean, because you're moving,\par 24 | and the second case, you're moving closer to the mean.\par 25 | But the strange thing is that this has nothing to do\par 26 | with whether the student learned or not.\par 27 | It just happens, even if the two grades\par 28 | are completely statistically independent.\par 29 | And also, it happens if you exchange grade 1 and grade 2.\par 30 | So you say, what is grade 1,\par 31 | given that, in grade 2, you got 97?\par 32 | It will tend to be smaller than 97.\par 33 | So, it doesn't really tell us anything.\par 34 | It doesn't tell us that those students\par 35 | that were good initially became bad,\par 36 | or those that were bad initially became good.\par 37 | It is just something that happens because of statistics.\par 38 | It doesn't tell us anything interesting,\par 39 | and we can't conclude anything from that.\par 40 | So, here is one of the first works\par 41 | that showed this.\par 42 | It was a work comparing fathers' and sons' height,\par 43 | and it was done by Sir Francis Galton\par 44 | in 1886, so quite awhile ago.\par 45 | And basically, it says that extreme characteristics,\par 46 | let's say height in parents, are not passed on\par 47 | completely to their offspring.\par 48 | So, you'd think tall parents\par 49 | tend to have tall children, and it's true,\par 50 | but the children would tend to be less tall.\par 51 | And so, here is some data that\par 52 | was actually used to study this.\par 53 | What you have on the x-axis is the height of the father,\par 54 | and what you have on the y-axis is the height of the son.\par 55 | And what you see is that there is\par 56 | definitely relationship between them.\par 57 | As the height of the father increases,\par 58 | the height of the son increases.\par 59 | But if you look at the actual values,\par 60 | you see that the height of the parent can go from,\par 61 | let's say, 58"-77",\par 62 | and the related son height will\par 63 | just go from 63"-72", so it's much closer\par 64 | to the mean height.\par 65 | A different way to look at it that is useful\par 66 | is to look really at the difference.\par 67 | So, instead of looking at the absolute value\par 68 
| of the height of the son, you look at\par 69 | what's the difference between the height of the son\par 70 | and the height of the parent.\par 71 | So, you get this kind of plot.\par 72 | So, in this plot, it is even more striking\par 73 | that the higher the parent is,\par 74 | the shorter, relatively to the parent, the son is.\par 75 | And again, that has nothing to do with whether\par 76 | there is like some kind of limitation\par 77 | in the genetics or so on.\par 78 | It's simply a result of regression\par 79 | to the mean that always happens,\par 80 | even if things are completely independent.\par 81 | So, this is the conclusion.\par 82 | Suppose you have a pair of independent and identically\par 83 | distributed random variables, X1 and X2.\par 84 | If you select pairs X1, X2, such that X1 is\par 85 | far from the mean of the distribution,\par 86 | then X2, for those pairs, will\par 87 | tend to be closer to the mean.\par 88 | And you can do the same if you choose\par 89 | X2 to be far from the mean.\par 90 | You'll get X1 is closer to the mean.\par 91 | So, next time, we're going to talk\par 92 | about principle component analysis.\par 93 | End of transcript. Skip to the start.\par 94 | Previous\par 95 | } 96 | -------------------------------------------------------------------------------- /Week 9 Regression and PCA/7. Components Analysis.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - We are reaching the end of the topic of regression.\par 4 | And the last thing I want to tell you about\par 5 | is a subject called principal components analysis.\par 6 | So, the subject of principal component analysis,\par 7 | to actually fully understand it,\par 8 | you need a good understanding of eigenvectors,\par 9 | eigenvalues, matrix decomposition, and so on.\par 10 | And I'm not going to provide those.\par 11 | I'm just going to give you a somewhat more superficial\par 12 | understanding, but that still gives you\par 13 | some of the intuition about how PCA works.\par 14 | So let's start with a quick review of linear regression.\par 15 | Suppose we have nine points in the plane like that.\par 16 | And here is a plot of these points.\par 17 | And you see that they are not exactly\par 18 | on a straight line, but they kind of tend\par 19 | to go up from the left to the right.\par 20 | So Y tends to increase as X increases.\par 21 | So, we would like to find a line\par 22 | that would represent it.\par 23 | And what we're going to do is look\par 24 | for a line with the formula w zero plus w one times x.\par 25 | And the goal is really to find these two parameters,\par 26 | w zero and w one.\par 27 | And this is actually very easy using NumPy.\par 28 | We just use the least square function\par 29 | and we get the w's that we wanted.\par 30 | And when we plot them, we see this line.\par 31 | And what I added here are the green lines\par 32 | that basically indicate the amount of error\par 33 | associated with each point.\par 34 | And what we're trying to basically do\par 35 | is minimize the square of the lengths of these segments.\par 36 | So in the regression problem,\par 37 | we looked at the function that predicts y from x.\par 38 | Okay, and we saw that if we try to do the opposite,\par 39 | we predict x from y, we get actually a different function,\par 40 | a 
different line.\par 41 | So it matters if we go this way or in the reverse.\par 42 | In general, we call this kind of problem\par 43 | in machine learning, we call it supervised learning.\par 44 | Why?\par 45 | Because the idea is that the output\par 46 | or the thing that we're trying to predict\par 47 | is labeled by some supervisor,\par 48 | somebody that knows what is the correct value of y,\par 49 | and then we are just trying to predict\par 50 | that value of y.\par 51 | Okay?\par 52 | But we can also fit the line\par 53 | without deciding on a direction.\par 54 | So, there is a way to fit a line to this data\par 55 | that doesn't have anything to do\par 56 | with whether we choose x or we choose y\par 57 | or we rotate the whole thing any way we want.\par 58 | So this is called unsupervised learning\par 59 | because here we're just basically given data\par 60 | and nobody identifies a particular component\par 61 | as something that we're trying to predict.\par 62 | So if we want to do unsupervised learning\par 63 | for a linear function by using squared error,\par 64 | this is called PCA, principal component analysis.\par 65 | So both principal component analysis\par 66 | and regression minimize the same loss function,\par 67 | the root mean squared error.\par 68 | But the definitions of the error are different\par 69 | as I'll show you in the next figure.\par 70 | So let me make this a little larger\par 71 | so you can see more clearly.\par 72 | What we see here is the black line\par 73 | is the regression line.\par 74 | And the errors to the regression line\par 75 | are basically the vertical, these green vertical segments.\par 76 | On the other hand, the red line is the PCA solution,\par 77 | and the errors for the PCA solution\par 78 | are not vertical but basically they're orthogonal\par 79 | to the line itself.\par 80 | So, basically the error for this point\par 81 | is this blue line here.\par 82 | And what we see is that when we try\par 83 | to minimize this kind of error,\par 84 | then basically you see that we get a different result.\par 85 | The red line is not the same as the black line.\par 86 | And importantly, if we think about rotating\par 87 | this coordinate system, we will see\par 88 | that the regression result will change\par 89 | because we are changing the relationship\par 90 | between x and y but the PCA result will not change.\par 91 | It is somehow directly associated with the data\par 92 | and not with the coordinate system.\par 93 | So again, errors for the black regression line\par 94 | correspond to the vertical green segment\par 95 | and errors for the red PCA line\par 96 | correspond to the blue segments\par 97 | that are orthogonal to the red line.\par 98 | An alternative way to think about PCA\par 99 | is about maximizing variance.\par 100 | So suppose that we have a set of vectors\par 101 | x one to x n, and then we take a unit vector, u,\par 102 | if you remember from the linear algebra review,\par 103 | a unit vector that has length one\par 104 | and we take the dot product of this u\par 105 | with each one of the vectors,\par 106 | then we get a number.\par 107 | And then we can calculate the mean of that number\par 108 | and the variance of the number.\par 109 | And we're particularly interested in the variance,\par 110 | how much spread out is the data along this projection.\par 111 | So here is one example.\par 112 | We're projecting on this red line,\par 113 | and we see that the points fall pretty far\par 114 | away from the mean.\par 115 | 
On the other hand, if we use this direction,\par 116 | we see that the points fall closer to the mean.\par 117 | So the points in this direction\par 118 | are more bunched together and the variance is smaller.\par 119 | So suppose you consider all possible directions?\par 120 | In the two-dimension case it's very simple.\par 121 | It's all directions from zero to 360 degrees.\par 122 | And for each direction, we compute the standard deviation,\par 123 | so the square root of the variance.\par 124 | And we put a point that distance away\par 125 | from the mean, from the origin point\par 126 | which is the mean.\par 127 | So the collection of all of these points\par 128 | when we draw them out will form an ellipse,\par 129 | something like this.\par 130 | So basically, if you go from this point\par 131 | and you project along this line,\par 132 | then the standard deviation is big.\par 133 | And if you go orthogonal to that,\par 134 | the standard deviation is small.\par 135 | So this ellipse essentially represents\par 136 | all the information that exists\par 137 | in the variance of the projections.\par 138 | So PCA relates to this picture in the following way.\par 139 | The larger axis of the ellipse corresponds\par 140 | to the direction of maximum variance.\par 141 | And that is what's called the first eigenvector\par 142 | of the principal component analysis.\par 143 | So that is the direction that gives you\par 144 | the maximum variance.\par 145 | The smaller axis of this ellipse\par 146 | corresponds to the direction of minimum variance.\par 147 | So that gives you the second eigenvector\par 148 | which is orthogonal to the first one.\par 149 | So the nice thing with this is\par 150 | that it's not really restricted\par 151 | just to two dimensions.\par 152 | You can do it in very high dimensions.\par 153 | And what you get is that the direction\par 154 | that gives you the highest variance\par 155 | is the first eigenvector, and the second direction\par 156 | is the second eigenvector,\par 157 | so the second direction in which you get maximum variance\par 158 | but it is orthogonal to the first direction and so on,\par 159 | you go down.\par 160 | And basically, that kind of describes\par 161 | a big ellipsoid in space that basically represents\par 162 | somehow the distribution of the data.\par 163 | So let's see a real-life example of that\par 164 | right here just in two dimensions.\par 165 | So we're going back to the data\par 166 | that has the weight and the height of 25,000 people.\par 167 | And here is the first component of the PCA analysis.\par 168 | Okay, so this is the direction\par 169 | that gives you the highest variance\par 170 | when you project on it, when you project the data on it.\par 171 | So it's pretty intuitive.\par 172 | This is the direction the data is most,\par 173 | it's most distributed widely across this direction.\par 174 | Okay, so this way of looking at PCA\par 175 | provides one of the common ways to normalize data.\par 176 | Normalizing data is very useful\par 177 | because it puts some of the variation\par 178 | into some parameters and then leaves\par 179 | the rest of the variation to be studied.\par 180 | So how do we use PCA to normalize data?\par 181 | First, we subtract the mean.\par 182 | So by subtracting the mean, we make the new mean zero.\par 183 | And then we rotate the data so that\par 184 | the coordinates are the eigenvectors.\par 185 | So this looks something like this.\par 186 | If we have the original data here,\par 187 | the 
This way of looking at PCA provides one of the common ways to normalize data. Normalizing data is very useful because it absorbs some of the variation into a few parameters and leaves the rest of the variation to be studied. So how do we use PCA to normalize data? First, we subtract the mean; by subtracting the mean, we make the new mean zero. Then we rotate the data so that the coordinate axes are the eigenvectors. It looks something like this: for the original data here, the mean is somewhere around (125, 70). After subtracting the mean, the new mean is (0, 0). After rotating, the maximum variation lies along the x-axis and the smaller variation along the y-axis. This is something that is done quite a lot.

Let's see another small application of PCA, this time in computer vision. Here is a picture of a blob in an image, say a rectangle, and we want to capture the size and orientation of that rectangle. We can map this directly onto our PCA problem: the pixels of the blob (the little blue dots) are the data points. Doing a PCA, we find that the first eigenvector points along the blob, and the associated standard deviation, up to a constant factor, gives the blob's extent in that direction. In other words, the blob is oriented mostly along this direction, and this is roughly its size. This works for any shape; it doesn't have to be a rectangle.
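Both uses just described, normalization and estimating a blob's orientation, come down to the same two steps: subtract the mean, then rotate into the eigenvector basis of the covariance matrix. Here is a hedged NumPy sketch; the synthetic rotated-rectangle "blob", the pca_normalize helper, and the particular numbers are illustrative assumptions, not part of the course materials.

```python
import numpy as np

def pca_normalize(X):
    """Center the data and rotate it so the coordinate axes are the eigenvectors.

    Returns the transformed data, the mean, and the eigenvector matrix,
    with columns ordered from largest to smallest variance.
    """
    mean = X.mean(axis=0)
    centered = X - mean
    cov = np.cov(centered, rowvar=False)
    eigvals, eigvecs = np.linalg.eigh(cov)   # ascending eigenvalues
    order = np.argsort(eigvals)[::-1]        # largest variance first
    eigvecs = eigvecs[:, order]
    return centered @ eigvecs, mean, eigvecs

# Synthetic "blob": pixel coordinates of a rectangle rotated by 30 degrees
# and shifted somewhere into the image.
rng = np.random.default_rng(1)
rect = np.column_stack([rng.uniform(-4, 4, 2000), rng.uniform(-1, 1, 2000)])
angle = np.deg2rad(30)
R = np.array([[np.cos(angle), -np.sin(angle)],
              [np.sin(angle),  np.cos(angle)]])
blob = rect @ R.T + np.array([50.0, 80.0])

normalized, mean, axes = pca_normalize(blob)

# The first eigenvector recovers the blob's orientation, and the standard
# deviation of each rotated coordinate is proportional to its extent.
orientation = np.degrees(np.arctan2(axes[1, 0], axes[0, 0]))
print("estimated orientation (degrees):", orientation % 180)
print("extent along each axis (std):   ", normalized.std(axis=0))
```

After the rotation, the largest variation lies along the first coordinate, which is exactly the normalization step in the lecture, and the recovered angle (about 30 degrees for this synthetic blob) is the orientation PCA reads off the pixels.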
To summarize, we talked about PCA and regression. Both are ways to model data by minimizing the root mean squared error. Regression is a supervised method: you have to choose what quantity you are trying to predict. Principal component analysis is an unsupervised method. Unsupervised methods tend to be used earlier, when you have just received your raw data and are trying to summarize it or reduce its dimension so that the supervised part can be done more efficiently. Both methods are based on linear algebra, and because of that they are very efficient, far more efficient than methods that depend on gradient descent. This is the end of the regression topic. I hope you found it interesting, and we'll continue next week.

End of transcript.

--------------------------------------------------------------------------------
/Week 9 Regression and PCA/HW_9.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 9 Regression and PCA/HW_9.zip
--------------------------------------------------------------------------------
/Week 9 Regression and PCA/Programming Assignment.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 9 Regression and PCA/Programming Assignment.pdf
--------------------------------------------------------------------------------
/Week 9 Regression and PCA/Quiz 9.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 9 Regression and PCA/Quiz 9.pdf
--------------------------------------------------------------------------------
/Week 9 Regression and PCA/lectures.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 9 Regression and PCA/lectures.zip
--------------------------------------------------------------------------------
/Week 9 Regression and PCA/more_lectures.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 9 Regression and PCA/more_lectures.zip
--------------------------------------------------------------------------------