├── .gitignore ├── 01-Estimation-leading_up_to_MLE.R ├── HW ├── Exercise-1c-R.Rmd ├── Exercise-1c-R.html ├── Exercise-1c-R.pdf ├── Exercise-3c-R.Rmd ├── Exercise-3c-R.pdf └── old │ ├── HW1 │ ├── HW1.Rmd │ ├── HW1.html │ ├── HW1.pdf │ ├── HW1.tex │ ├── hw1_q3_solution.Rmd │ └── hw1_q3_solution.html │ ├── HW2 │ ├── HW2.Rmd │ ├── HW2.html │ └── iris.jpg │ ├── HW3 │ ├── HW3.Rmd │ ├── HW3.html │ └── answers │ │ ├── EX09_Questions_and_Solutions.pdf │ │ ├── country_code.csv │ │ ├── federalelections2016.xls │ │ ├── is_republican.csv │ │ └── solution_ex3_q2.R │ ├── HW4 │ └── first_semester │ │ ├── HW4.Rmd │ │ └── HW4.html │ ├── HW5 │ └── HW5.pdf │ ├── Submission guidelines for data analysis course 2020.docx │ ├── Submission guidelines for data analysis course 2020.pdf │ └── Submission guidelines for data analysis course 2019.pdf ├── README.md ├── additional_notes ├── MLE_Bernoulli.Rmd └── MLE_Bernoulli.pdf ├── books └── openintro-statistics-sample.pdf ├── distribution_shiny_app ├── app.R ├── distribution_shiny_app.Rproj └── plot_z_score.R ├── exam_examples ├── Exam_Moed_A.pdf ├── Exam_Moed_A_answers.pdf ├── Exercise_Examples.Rmd ├── Exercise_Examples.pdf ├── Exercise_Examples_Answers.Rmd └── TSLA.csv ├── exercises └── old │ ├── 10 │ ├── 10.pdf │ └── first_semester │ │ ├── 08.pdf │ │ ├── 10.pdf │ │ └── Exam Questiuon.pdf │ ├── 11 │ ├── ex 11.docx │ ├── ex 11.pdf │ └── first_semester │ │ ├── HW4 Q1 and Q2 solution.docx │ │ ├── HW4 Q1 and Q2 solution.pdf │ │ └── ex11.pdf │ ├── 12 │ ├── 12.docx │ ├── 12.pdf │ └── first_semester │ │ └── ex12.pdf │ ├── 13 │ ├── ex 13.pdf │ └── first_semester │ │ └── ex 13.pdf │ ├── 01 - Intro to R │ ├── 00-Introduction.Rmd │ ├── 00-Introduction.html │ ├── 01- More Operations.Rmd │ ├── 01--More-Operations.html │ ├── 01-Syntax, functions, loops, data types.Rmd │ ├── 01-Syntax,-functions,-loops,-data-types.html │ └── example_file.csv │ ├── 02 │ ├── 02.Rmd │ ├── 02.html │ ├── EX 02 Q1 Q2.docx │ ├── EX 02 Q1 Q2.pdf │ ├── t.png │ └── x^2.png │ ├── 03 │ ├── 03- Point estimation and dplyr package.Rmd │ ├── 03-_Point_estimation_and_dplyr_package.html │ ├── 03-_Point_estimation_and_dplyr_package.pdf │ ├── 03-_Point_estimation_and_dplyr_package.tex │ ├── ex03.zip │ ├── hw1_q3_solution.html │ └── hw1_q3_solution.pdf │ ├── 04 │ ├── 04.Rmd │ └── 04.html │ ├── 05 │ ├── 05.Rmd │ ├── 05.html │ ├── EX 05 - Intro to hypothesis tests.pdf │ ├── Q1_2.docx │ ├── Q1_2.pdf │ ├── type1_type2_errors.png │ └── type1_type2_errors2.jpg │ ├── 06 │ ├── EX06.docx │ └── EX06.pdf │ ├── 07 │ ├── EX07.pdf │ └── cs229-notes1.pdf │ ├── 08 │ ├── EX08.pdf │ └── Ex08.docx │ └── 09 │ ├── ex09.docx │ └── ex09.pdf ├── intro_statistics_R.Rproj ├── labs ├── Independence test.Rmd ├── answers │ ├── Data_science_workflow_lab-answers.html │ ├── food_consumption-answers.Rmd │ └── netflix movies and tv shows exercise - answers.Rmd ├── data │ ├── netflix_titles.csv │ └── sf_trees.csv ├── food_consumption.Rmd ├── linear_regression.Rmd └── netflix movies and tv shows exercise.Rmd ├── lectures ├── 00-Introduction.pptx ├── 00-intro-binomial-dist.R ├── 00-introduction │ ├── 00-introduction_script.Rmd │ ├── IWER34_2019.xlsx │ ├── Lego_parts.csv │ └── st02_03.xls ├── 01-Estimation methods and Intervals.Rmd ├── 01-Point Estimation Methods and Intervals.pdf ├── 02-Intervals.Rmd ├── 02-Intervals.pdf ├── 03 - Hypothesis Tests.pdf ├── 03-Hypothesis_tests.Rmd ├── 03-Hypothesis_tests_part_b.Rmd ├── 04 - Statistical inference for Two Samples.pdf ├── 04-Statistical_inference_two_samples - part B.Rmd ├── 
04-Statistical_inference_two_samples.Rmd ├── 05 - Simple Linear Regression.pdf ├── 05-Simple_linear_regression.Rmd ├── 06 - Multiple Linear Regression and Correlation.pdf ├── 06-Multiple_linear_regression_and_correlation.Rmd ├── 06-Note_about_overfitting.Rmd ├── 07 - Regression, Design and Analysis of Single-Factor Experiments.pdf ├── 07-Multiple regression and exercises.Rmd ├── 08-Single_factor_experiments_ANOVA.Rmd ├── 09-One_Two_way_ANOVA.Rmd ├── 09-One_Two_way_ANOVA.pdf ├── Example_for_multicolinearity_problem.R ├── data │ ├── ipf_lifts.csv │ ├── montgomery_13.5_fabric_strength.csv │ ├── montgomery_14.5_adhesion_force.csv │ ├── movie_db_clean.csv │ ├── wildlife_impacts_medium.csv │ └── wildlife_impacts_small.csv ├── files_during_lecture │ ├── 05-file1.R │ ├── 05-file2.R │ └── 05-file3.R ├── images │ ├── Type_IandType_II_errors.jpg │ ├── birds_eye_view1.svg │ ├── link_for_survey_example.png │ ├── speeding_ticket.png │ └── waze_not_accurate.jpg ├── mult_lin_reg_example.R ├── what_is_z_score.R └── xaringan-themer.css ├── misc ├── Distribution tables - x^2, z, f, t.pdf ├── extended_formula_page.pdf ├── init_course_plan.md ├── syllabus_05601823_2022.pdf └── tau_engineering_logo.png ├── population_vs_sample ├── .Rhistory ├── app.R ├── population_vs_sample.Rproj └── rsconnect │ └── shinyapps.io │ └── sarid │ └── population_vs_sample.dcf ├── project ├── Project Instructions.Rmd ├── Project Instructions.pdf ├── example_star_trek_script_analysis.Rmd └── examples │ ├── College Major (I.P, E.T).html │ ├── FIFA 2019 Analysis - Inbar Siloni.Rmd │ ├── FIFA 2019 Analysis - Inbar Siloni.html │ ├── Final_Project_Gil_Shwartz.html │ └── spotify_project (D.S, S.K).html └── xaringan-themer.css /.gitignore: -------------------------------------------------------------------------------- 1 | montgomery.pdf 2 | .Rproj.user 3 | /exercises/04/heights.jpg 4 | /.Rhistory 5 | .RData 6 | /exercises/05/Q1 (on board).docx 7 | /exercises/Submission guidelines for data analysis course.docx 8 | /exercises/07/EX07.docx 9 | /exercises/08/08.docx 10 | /lectures/data/wildlife_impacts.csv 11 | lectures/libs 12 | /HW/HW3/answers/EX09_Questions_and_Solutions.docx 13 | /exercises/10/10.docx 14 | /exercises/11/HW4 Q2 solution.docx 15 | /HW/HW5/HW5.docx 16 | /exercises/12/~$ex12.docx 17 | /exercises/12/ex12.docx 18 | /exercises/13/ex 13.docx 19 | /lectures/*.html 20 | /lectures/*_files 21 | distribution_shiny_app/rsconnect 22 | labs/rsconnect/ 23 | /labs/*_files/ 24 | /labs/*.html 25 | 26 | # ignore temporary files declared via tmp_*.* 27 | tmp_*.* 28 | /tmp_bug_example -------------------------------------------------------------------------------- /01-Estimation-leading_up_to_MLE.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | density_func <- tibble(x = seq(-2, 2, 0.1)) %>% 4 | mutate(dens = dnorm(x = x, mean = 0, sd = 1)) 5 | 6 | set.seed(0) 7 | randomized_dots <- tibble(rnd = rnorm(n = 1000, mean = 0.4, sd = 1.1)) 8 | 9 | # Demonstrate by changing this: 10 | 11 | num_points <- 50 12 | 13 | ggplot(density_func, aes(x = x, y = dens)) + 14 | geom_line() + 15 | geom_density(data = randomized_dots[1:num_points,], aes(rnd), inherit.aes = F) + 16 | geom_point(data = randomized_dots[1:num_points,], 17 | aes(x = rnd, y = 0), alpha = 0.4, inherit.aes = F, size = 3) -------------------------------------------------------------------------------- /HW/Exercise-1c-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 
"תרגיל בית 1ג' (תרגול ב-R)" 3 | subtitle: "היכרות ראשונית עם R" 4 | output: 5 | knit: pagedown::chrome_print 6 | --- 7 | 8 | ```{=html} 9 | 39 | ``` 40 | ```{r setup, include=FALSE} 41 | suppressWarnings(suppressMessages(library(tidyverse))) 42 | library(palmerpenguins) 43 | ``` 44 | 45 | בתרגיל זה תתקינו את R, את חבילת `tidyverse` ותתרגלו מעט עבודה עם נתונים. 46 | 47 | ## חלק ראשון 48 | 49 | התקינו את תוכנת R ואת RStudio. 50 | 51 | את תוכנת R ניתן להוריד בקישור הבא: 52 | 53 | 54 | 55 | את RStudio ניתן להוריד בקישור הבא: 56 | 57 | 58 | 59 | שימו לב להשתמש בגרסאות המתאימות לכם (יש גרסאות שונות ל-Windows, macOS, ו-Linux) 60 | 61 | כעת לאחר שהתקנתם, הפעילו את RStudio והתקינו את חבילת `tidyverse` על ידי הקלדה ב-Console של: 62 | 63 | ```{r, eval=FALSE} 64 | install.packages("tidyverse") 65 | ``` 66 | 67 | זה עשוי לקחת קצת זמן, אז התאזרו בסבלנות. 68 | 69 | אם קיבלתם הודעת שגיאה כגון: *Error in install.packages : cannot open file* זה כנראה נובע מבעיות הרשאה לתיקייה אליה R מנסה להתקין את החבילה. אפשר לפתור את הבעיה הזו על ידי העלאת RStudio עם כפתור ימני של העכבר, ואז Run as Administrator (ואז לנסות להתקין מחדש). 70 | 71 | ## חלק שני 72 | 73 | התקינו את חבילת `palmerpenguins`. השתמשו באותה הפקודה בה השתמשנו בחלק הראשון לצורך התקנת חבילה (רק שנו את השם tidyverse לשם של החבילה `palmerpenguins`). 74 | 75 | כעת השתמשו בקוד הבא ותארו מהם הנתונים הקיימים בטבלה `penguins`: 76 | 77 | ```{r, eval=FALSE} 78 | library(tidyverse) 79 | library(palmerpenguins) 80 | 81 | glimpse(penguins) 82 | ``` 83 | 84 | הריצו גם את הפקודות הבאות וכתבו מה ההבדל בין הפונקציה `glimpse`, `head`, `tail`, `view`, ולרשום `penguins` ב-console. 85 | 86 | ```{r, eval=FALSE} 87 | head(penguins) 88 | tail(penguins) 89 | view(penguins) 90 | penguins 91 | ``` 92 | 93 | ------------------------------------------------------------------------ 94 | 95 | - מה מתארת כל שורה בטבלה? 96 | 97 | - מה מתאר המשתנה island? 98 | 99 | - מה מתאר המשתנה species? 100 | 101 | השתמשו בפונקציה count בדומה לאופן המתואר להלן, וענו על השאלות: 102 | 103 | - כמה איים מתועדים בטבלה? 104 | 105 | - כמה זכרים מתועדים בטבלה? 106 | 107 | - לכמה תצפיות חסרים נתונים זכר/נקבה? 108 | 109 | דוגמה לשימוש בפונקציה count (תצטרכו להרחיב את הדוגמה הזו על ידי שינוי הקוד) 110 | 111 | ```{r, eval=FALSE} 112 | penguins %>% 113 | count(species) 114 | ``` 115 | 116 | ## חלק שלישי 117 | 118 | הקוד הבא מצייר תרשים של הקשר בין אורך מקור ואורך כנף. 119 | 120 | ```{r} 121 | ggplot(data = penguins, 122 | aes(x = bill_length_mm, 123 | y = flipper_length_mm, 124 | color = species)) + 125 | geom_point() 126 | ``` 127 | 128 | ענו על השאלות הבאות: 129 | 130 | 1. מה אומרת הודעת האזהרה של R? מה המשמעות שלה? (Removed 2 rows containins...) 131 | 132 | 2. הסבירו מה הקשר שמתאר התרשים בין שלושת המשתנים שבו (שני הצירים וצבע הנקודות). 133 | 134 | 3. עבור כל פונקציה שהשתמשנו בה, הסבירו מה התפקיד שלה ביצירת התרשים (להלן הפונקציות שבהן השתמשנו `ggplot`, `aes`, `geom_point`). באפשרותכם להיעזר [בעמוד הזה](https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-visualization.pdf) 135 | 136 | 4. כתבו קוד אשר יצייר תרשים שמתאר את הקשר בין אורך המקור, עומק המקור, והאי שבו נמדדה הדגימה (לשם כך היעזרו בקוד של התרשים הקודם, ושנו אותו כך שיתאים לצרכים). 137 | 138 | 5. נתחו את התרשים: מה הקשר שאתם מבחינים בו? מדוע האי שבו נמדדה הדגימה משפיע על הקשר בין האורך והעומק של המקור? 
139 | -------------------------------------------------------------------------------- /HW/Exercise-1c-R.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/HW/Exercise-1c-R.pdf -------------------------------------------------------------------------------- /HW/Exercise-3c-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "עבודה עם Dataset, חילוץ נתונים, ושימוש בפונקציות בסיסיות של `tidyverse`" 3 | subtitle: "`r Sys.Date()`" 4 | output: 5 | knit: pagedown::chrome_print 6 | --- 7 | 8 | ```{=html} 9 | 39 | ``` 40 | ```{r setup, include=FALSE} 41 | suppressWarnings(suppressMessages(library(tidyverse))) 42 | library(palmerpenguins) 43 | ``` 44 | 45 | בתרגיל זה תתנסו במספר פונקציות בסיסיות ב-R (ולמעשה פונקציות של tidyverse). 46 | 47 | ## חלק ראשון 48 | 49 | קראו את התיעוד בעמוד הבא: 50 | 51 | https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-09-22/readme.md 52 | 53 | 54 | השתמשו בפונקציה `read_csv` מחבילת `readr` על מנת לקרוא את שלושת הקבצים הבאים. ניתן להיעזר בקוד הבא: 55 | 56 | ```{r, eval=FALSE} 57 | members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv') 58 | expeditions <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/expeditions.csv') 59 | peaks <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/peaks.csv') 60 | ``` 61 | 62 | על מנת לתאר את הדאטה, באופן כללי, השתמשו בפונקציות `count`, `filter`, `arrange`, `distinct` וענו על השאלות הבאות: 63 | 64 | 1. אילו שנים מתועדות בקבצים (טווח השנים של משלחות להימלאיה) 65 | 66 | 2. כמה אזרחויות שונות היו מעורבות במשלחות להימלאיה, עד לשנת 1950? 67 | 68 | 3. על בסיס הנתונים, מה לדעתכם העונה הטובה ביותר לטפס על פסגת האוורסט? 69 | 70 | 4. כמה משלחות מתועדות בקובץ, שניסו לטפס על האוורסט (ולא הצליחו להעפיל לפסגה) לפני שנת 1953? 71 | 72 | 5. כמה פסגות ברכס ההימלאיה מגיעות לרום של מעל 800 מטר? 73 | 74 | ## חלק שני 75 | 76 | בחלק זה תשתמשו בפונקציה `mutate` על מנת לבחור משתנים ולערוך טרנספורמציות. 77 | 78 | היעזרו בקוד הבא, על מנת לייצר טבלה חדשה עם שני משתנים בוליאניים חדשים: `is_doctor`, `is_leader`. 79 | 80 | הסבירו מה עושה הפונקציה `str_detect`, ומה המשמעות של כל שורה בקוד (מה עושה כל שורה). 81 | 82 | ```{r, eval=FALSE} 83 | leader_table <- members %>% 84 | mutate(is_leader = str_detect(expedition_role, "Leader")) %>% 85 | mutate(is_doctor = str_detect(expedition_role, "Doctor")) 86 | ``` 87 | 88 | השתמשו בפונקציה `count` על מנת לייצר טבלה שתראה את כל הצירופים האפשריים של is_leader, is_doctor, ומספר התצפיות. כמה מובילי משלחות יש שהם גם רופאים? 89 | 90 | 91 | 92 | השתמשו ב-`mutate` ובפונקציה `cut` על מנת לבנות טבלה חדשה שבה יש משתנה שנקרא `decade`, המתאר את העשור שבו יצאה המשלחת. ניתן להיעזר בקוד הבא (השלימו את הקוד). 93 | 94 | לאחר מכן, צרו תרשים שיציג את מספר המשלחות בכל עונה בכל עשור. 95 | 96 | ```{r, eval=FALSE} 97 | expeditions_new <- expeditions %>% 98 | mutate(decade = cut(year, breaks = c(___, ___, ___, ...))) %>% 99 | count(season, ___) 100 | 101 | ggplot(___, aes(x = decade, y = ___, fill = season)) + 102 | geom_col(position = position_dodge()) 103 | 104 | ``` 105 | 106 | מה ההבדל בין שימוש ב-`position_dodge` לבין `position_stack` ו-`position_fill` בקוד לעיל? 
107 | 108 | ## חלק שלישי 109 | 110 | נניח שאתם מתכננים להעפיל לפסגה ברכס ההימלאיה, של מעל ל-8000 מטר. 111 | 112 | 1. צרו טבלה חדשה עם רשימה של פסגות אלו. 113 | 2. מיהן עשרת סוכנויות המסע (`trekking_agency`) שהוציאו הכי הרבה משלחות? 114 | 3. השתמשו ב-`group_by` וב-`summarize` על מנת לחשב את ממוצע התמותה לכל משלחת, באופן כללי, וגם עבור הפסגות מעל 8000 מטר בלבד (ניתן לחלק סעיף זה לשני חישובים נפרדים). 115 | 4. עם אילו חברות הייתם שוקלים לצאת, ועם אילו לא? -------------------------------------------------------------------------------- /HW/Exercise-3c-R.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/HW/Exercise-3c-R.pdf -------------------------------------------------------------------------------- /HW/old/HW1/HW1.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Introduction to Statistics and Data Analysis with R - Homework #1' 3 | author: "Adi Sarid and Afek Adler" 4 | date: "`r Sys.Date()`" 5 | output: 6 | pdf_document: default 7 | html_document: default 8 | --- 9 | 10 | This homework sheet is due on the 2020-04-01 on 24:00. You may submit your answers in pairs. 11 | Submission will be performed electronically via the moodle. 12 | 13 | We urge you to start solving this sheet as soon as possible and, if you have any questions, come to visit us in reception hours next week. 14 | 15 | The exercise is divided into two parts: Technical (programming in `R`) and theoretical. 16 | 17 | Submit the following questions: 18 | 19 | * Q1 - only 3.6.1.6 (specified later). 20 | * Q2 21 | * Q4 22 | * Q5 23 | 24 | 25 | 26 | # Technical (programming in `R`) 27 | 28 | ## Question 1: 29 | 30 | Please read the following chapters in [R4DS - https://r4ds.had.co.nz](https://r4ds.had.co.nz): 31 | 32 | 1. [Introduction](https://r4ds.had.co.nz/introduction.html) 33 | 2. [Explore - Introduction](https://r4ds.had.co.nz/explore-intro.html) 34 | 3. [Explore - Data visualizations](https://r4ds.had.co.nz/data-visualisation.html) 35 | 4. [Workflow](https://r4ds.had.co.nz/workflow-basics.html) 36 | 37 | Solve exercise 3.6.1,3.81, 4.4. **submit the code for question 6 (in 3.6.1)** ("*Recreate the R code necessary to generate the following graphs*"). 38 | 39 | ## Question 2: 40 | 41 | In this question, you will get acquainted (or reminded of) the following distributions: 42 | 43 | * Normal distribution $N(\mu, \sigma)$ 44 | * Student's t $t_{\operatorname{df}}$ 45 | * Chi-square $\chi^2$ 46 | 47 | Complete the blanks (`___`) in the following code, to generate $n=100$ random values from each of these distributions with: 48 | 49 | * Normal with $\mu=3, \sigma = 1.5$ 50 | * Student's-t with $\operatorname{df}=10$ 51 | * Chi-square with $\operatorname{df}=12$ 52 | 53 | Tip: if you type a `?` followed by the command name in the console, you will see its documentation. I.e., type `?rnorm` to see the help on the random number generator for the normal distribution. 54 | 55 | ### Complete the blanks: 56 | 57 | ``` 58 | set.seed(0) # we set the seed of the random generator so that your results will be consistent 59 | 60 | random_normal <- rnorm(n = ___, mean = ___, ___ = 1.5) 61 | random_t <- rt(n = ___, df = ___) 62 | random_chi <- rchisq(n = ___, df = ___) 63 | 64 | ``` 65 | 66 | ### Plot by completing the blanks: 67 | 68 | Plot each of these samples using `ggplot2`. Think, what `geom` would you use to plot the distribution of the sample? 
69 |
70 | ```
71 |
72 | # if you don't have the tidyverse package first install by running
73 | # install.packages("tidyverse")
74 |
75 | library(tidyverse)
76 |
77 | all_random_data <- tibble(random_normal, random_t, random_chi)
78 |
79 | ggplot(all_random_data, aes(random_normal)) +
80 | geom____()
81 |
82 | ggplot(all_random_data, aes(random_t)) +
83 | ___
84 |
85 | ggplot(all_random_data, aes(random_chi)) +
86 | ___
87 |
88 | ```
89 |
90 | ### Answer these:
91 |
92 | 1. Is the original distribution symmetric? Do the plots look symmetric? Why?
93 | 2. Generally speaking (not relating to the specific sample you obtained), what is the relationship between the mean and median of each of these distributions?
94 | 3. What would happen if we increase $n$ from 100 to 1000?
95 |
96 | a. What would the distribution look like?
97 | b. Why?
98 | c. Modify your code and visualize the updates.
99 |
100 |
101 | # Theoretical
102 |
103 | ## Question 3:
104 |
105 | In the smallest branch of the smallest bank, the number of customers in the queue (waiting customers) is a random variable $Q\in\{0,1,2\}$. You cannot have more than 2 customers waiting in the queue, because they've been downsizing and the branch is really small.
106 |
107 | The distribution of $Q$ depends on a parameter $\theta$.
108 |
109 | $$Q = \left\{\begin{array}{ll}0 & \text{w.p. }4\theta^2\\
110 | 1 & \text{w.p. }4\theta-8\theta^2\\
111 | 2 & \text{w.p. }1-4\theta+4\theta^2\end{array}\right.$$
112 |
113 | The bank's headquarters randomly sampled the queue at five independent times. The results were $\{0,1,0,0,0\}$ customers in the queue.
114 |
115 | ### Answer the following questions:
116 |
117 | 1. Find an unbiased estimator $\hat{\Theta}$ for the parameter $\theta$ for a sample of size $n=5$. What is $\hat{\Theta}$ based on the current sample? (you should get 0.45)
118 | 2. Find an unbiased estimator for the expected number of customers waiting in the queue based on a sample of size $n=5$. What is the estimate of the expected number of customers, based on the current sample? (0.2)
119 | 3. Find an estimator for $\theta$ using the maximum likelihood estimation method. (0.45)
120 |
121 | ## Question 4:
122 |
123 | Let $X$ be a Bernoulli random variable. Its probability density function can be formulated as follows:
124 |
125 | \[
126 | f(x ; p)=\left\{\begin{array}{ll}{p^{x}(1-p)^{1-x}} & {x=0,1} \\ {0} & {\text { otherwise }}\end{array}\right.
127 | \]
128 |
129 | 1. Show that $X=1$ with probability $p$ and that $X=0$ with probability $1-p$.
130 | 2. Suppose we get a random sample of size $n$ from a Bernoulli distribution. What is the likelihood function $L(p)$ of the sample? (What is the probability $P\left(X_{1}=x_{1}, X_{2}=x_{2}, \ldots, X_{n}=x_{n}\right)$?)
131 | 3. Apply the $\log$ transformation to this likelihood function. What do you get?
132 | 4. 
Find the $p$ that maximizes $\log L(p)$ 133 | 134 | ## Question 5: 135 | 136 | For the following probability density function: 137 | \[f(x)=\left\{\begin{array}{ll}{\frac{2}{\theta^{2}}(\theta-x)} & {0\linewidth\linewidth\else\Gin@nat@width\fi} 48 | \def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} 49 | \makeatother 50 | % Scale images if necessary, so that they will not overflow the page 51 | % margins by default, and it is still possible to overwrite the defaults 52 | % using explicit options in \includegraphics[width, height, ...]{} 53 | \setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} 54 | % Set default figure placement to htbp 55 | \makeatletter 56 | \def\fps@figure{htbp} 57 | \makeatother 58 | \setlength{\emergencystretch}{3em} % prevent overfull lines 59 | \providecommand{\tightlist}{% 60 | \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} 61 | \setcounter{secnumdepth}{-\maxdimen} % remove section numbering 62 | 63 | \title{Introduction to Statistics and Data Analysis with R - Homework \#1} 64 | \author{Adi Sarid and Afek Adler} 65 | \date{2020-03-18} 66 | 67 | \begin{document} 68 | \maketitle 69 | 70 | This homework sheet is due on the 2020-04-01 on 24:00. You may submit 71 | your answers in pairs. Submission will be performed electronically via 72 | the moodle. 73 | 74 | We urge you to start solving this sheet as soon as possible and, if you 75 | have any questions, come to visit us in reception hours next week. 76 | 77 | The exercise is divided into two parts: Technical (programming in 78 | \texttt{R}) and theoretical. 79 | 80 | Submit the following questions: 81 | 82 | \begin{itemize} 83 | \tightlist 84 | \item 85 | Q1 - only 3.6.1.6 (specified later). 86 | \item 87 | Q2 88 | \item 89 | Q4 90 | \item 91 | Q5 92 | \end{itemize} 93 | 94 | \hypertarget{technical-programming-in-r}{% 95 | \section{\texorpdfstring{Technical (programming in 96 | \texttt{R})}{Technical (programming in R)}}\label{technical-programming-in-r}} 97 | 98 | \hypertarget{question-1}{% 99 | \subsection{Question 1:}\label{question-1}} 100 | 101 | Please read the following chapters in \href{https://r4ds.had.co.nz}{R4DS 102 | - https://r4ds.had.co.nz}: 103 | 104 | \begin{enumerate} 105 | \def\labelenumi{\arabic{enumi}.} 106 | \tightlist 107 | \item 108 | \href{https://r4ds.had.co.nz/introduction.html}{Introduction} 109 | \item 110 | \href{https://r4ds.had.co.nz/explore-intro.html}{Explore - 111 | Introduction} 112 | \item 113 | \href{https://r4ds.had.co.nz/data-visualisation.html}{Explore - Data 114 | visualizations} 115 | \item 116 | \href{https://r4ds.had.co.nz/workflow-basics.html}{Workflow} 117 | \end{enumerate} 118 | 119 | Solve exercise 3.6.1,3.81, 4.4. \textbf{submit the code for question 6 120 | (in 3.6.1)} (``\emph{Recreate the R code necessary to generate the 121 | following graphs}''). 
122 | 123 | \hypertarget{question-2}{% 124 | \subsection{Question 2:}\label{question-2}} 125 | 126 | In this question, you will get acquainted (or reminded of) the following 127 | distributions: 128 | 129 | \begin{itemize} 130 | \tightlist 131 | \item 132 | Normal distribution \(N(\mu, \sigma)\) 133 | \item 134 | Student's t \(t_{\operatorname{df}}\) 135 | \item 136 | Chi-square \(\chi^2\) 137 | \end{itemize} 138 | 139 | Complete the blanks (\texttt{\_\_\_}) in the following code, to generate 140 | \(n=100\) random values from each of these distributions with: 141 | 142 | \begin{itemize} 143 | \tightlist 144 | \item 145 | Normal with \(\mu=3, \sigma = 1.5\) 146 | \item 147 | Student's-t with \(\operatorname{df}=10\) 148 | \item 149 | Chi-square with \(\operatorname{df}=12\) 150 | \end{itemize} 151 | 152 | Tip: if you type a \texttt{?} followed by the command name in the 153 | console, you will see its documentation. I.e., type \texttt{?rnorm} to 154 | see the help on the random number generator for the normal distribution. 155 | 156 | \hypertarget{complete-the-blanks}{% 157 | \subsubsection{Complete the blanks:}\label{complete-the-blanks}} 158 | 159 | \begin{verbatim} 160 | set.seed(0) # we set the seed of the random generator so that your results will be consistent 161 | 162 | random_normal <- rnorm(n = ___, mean = ___, ___ = 1.5) 163 | random_t <- rt(n = ___, df = ___) 164 | random_chi <- rchisq(n = ___, df = ___) 165 | \end{verbatim} 166 | 167 | \hypertarget{plot-by-completing-the-blanks}{% 168 | \subsubsection{Plot by completing the 169 | blanks:}\label{plot-by-completing-the-blanks}} 170 | 171 | Plot each of these samples using \texttt{ggplot2}. Think, what 172 | \texttt{geom} would you use to plot the distribution of the sample? 173 | 174 | \begin{verbatim} 175 | 176 | # if you don't have the tidyverse package first install by running 177 | # install.packages("tidyverse") 178 | 179 | library(tidyverse) 180 | 181 | all_random_data <- tibble(random_normal, random_t, random_chi) 182 | 183 | ggplot(all_random_data, aes(random_normal)) + 184 | geom____() 185 | 186 | ggplot(all_random_data, aes(random_t)) + 187 | ___ 188 | 189 | ggplot(all_random_data, aes(random_chi)) + 190 | ___ 191 | \end{verbatim} 192 | 193 | \hypertarget{answer-these}{% 194 | \subsubsection{Answer these:}\label{answer-these}} 195 | 196 | \begin{enumerate} 197 | \def\labelenumi{\arabic{enumi}.} 198 | \item 199 | Is the original distribution symmetric? does the plots look symmetric, 200 | why? 201 | \item 202 | Generally speaking (not relating to the specific sample you obtained), 203 | what is the relationship between the mean and median of each of these 204 | distributions? 205 | \item 206 | What would happen if we increase \(n\) from 100 to 1000? 207 | 208 | \begin{enumerate} 209 | \def\labelenumii{\alph{enumii}.} 210 | \tightlist 211 | \item 212 | How would the distribution look like? 213 | \item 214 | Why? 215 | \item 216 | Modify your code and visualize the updates. 217 | \end{enumerate} 218 | \end{enumerate} 219 | 220 | \hypertarget{theoretical}{% 221 | \section{Theoretical}\label{theoretical}} 222 | 223 | \hypertarget{question-3}{% 224 | \subsection{Question 3:}\label{question-3}} 225 | 226 | In the smallest branch of the smallest bank, the number of customers in 227 | the queue (waiting customers), is a random variable \(Q\in\{0,1,2\}\). 228 | You cannot have more than 2 customers waiting in the queue, because 229 | the've been downsizing and the branch is really small. 
230 | 231 | The distribution of \(Q\) is dependent on a parameter \(\theta\). 232 | 233 | \[Q = \left\{\begin{array}{ll}0 & \text{w.p. }4\theta^2\\ 234 | 1 & \text{w.p. }4\theta-8\theta^2\\ 235 | 2 & \text{w.p. }1-4\theta+4\theta^2\end{array}\right.\] 236 | 237 | The bank's headquarters randomly sampled the queue during five 238 | independent times. The results were \(\{0,1,0,0,0\}\) customers in the 239 | queue. 240 | 241 | \hypertarget{answer-the-following-questions}{% 242 | \subsubsection{Answer the following 243 | questions:}\label{answer-the-following-questions}} 244 | 245 | \begin{enumerate} 246 | \def\labelenumi{\arabic{enumi}.} 247 | \tightlist 248 | \item 249 | Find an unbiased estimator \(\hat{\Theta}\) for the parameter 250 | \(\theta\) for a sample of size \(n=5\). What is \(\hat{\Theta}\) 251 | based on the current sample? (you should get 0.45) 252 | \item 253 | Find an unbiased estimator for the expected number of customers 254 | waiting in the queue based on a sample of size \(n=5\). What is the 255 | estimate of the expected number of customers, based on the current 256 | sample? (0.2) 257 | \item 258 | Find an estimator for \(\theta\) in the maximum likelihood estimation 259 | method. (0.45) 260 | \end{enumerate} 261 | 262 | \hypertarget{question-4}{% 263 | \subsection{Question 4:}\label{question-4}} 264 | 265 | let \(X\) be a random Bernoulli variable. It's probability density 266 | function can be formulated as follows: 267 | 268 | \[ 269 | f(x ; p)=\left\{\begin{array}{ll}{p^{x}(1-p)^{1-x}} & {x=0,1} \\ {0} & {\text { otherwise }}\end{array}\right. 270 | \] 271 | 272 | \begin{enumerate} 273 | \def\labelenumi{\arabic{enumi}.} 274 | \tightlist 275 | \item 276 | Show that \(X=1\) with probability \(p\) and that \(X=0\) with 277 | probability \(1-p\) 278 | \item 279 | Suppose we get a random sample of size \(n\) from a Bernulli 280 | distribution. What is the likelihood function \(L(p)\) of the sample? 281 | (what is the probability that 282 | \(P\left(X_{1}=x_{1}, X_{2}=x_{2}, \ldots, X_{n}=x_{n}\right)\)) 283 | \item 284 | Apply the \(\log\) transformation on this likelihood function, what do 285 | you get? 
286 | \item 287 | Find the \(p\) that maximizes \(\log L(p)\) 288 | \end{enumerate} 289 | 290 | \hypertarget{question-5}{% 291 | \subsection{Question 5:}\label{question-5}} 292 | 293 | For the following probability density function: 294 | \[f(x)=\left\{\begin{array}{ll}{\frac{2}{\theta^{2}}(\theta-x)} & {0% 7 | data.frame() %>% 8 | mutate(income_fct = cut(Income, 2), 9 | hs_grad_fct = cut(HS.Grad, 2)) 10 | 11 | country_name_code <- read_csv("c:/temp/country_code.csv", col_names = c("country", "code")) %>% 12 | distinct(country, code) 13 | 14 | country_vote <- read_csv("c:/temp/is_republican.csv", 15 | col_names = c("code", "is_rep"), 16 | skip = 1) %>% 17 | mutate(is_rep = !is.na(is_rep)) %>% 18 | left_join(country_name_code) 19 | 20 | x.77_voting_pattern <- newx.77 %>% 21 | mutate(country = rownames(state.x77)) %>% 22 | left_join(country_vote) 23 | 24 | 25 | chisq.test(x = newx.77$income_fct, y = newx.77$hs_grad_fct, 26 | simulate.p.value = F) 27 | 28 | t.test(formula = HS.Grad ~ is_rep, data = x.77_voting_pattern) 29 | -------------------------------------------------------------------------------- /HW/old/HW4/first_semester/HW4.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Introduction to Statistics and Data Analysis with R - Homework #4' 3 | author: "Adi Sarid and Afek Adler" 4 | date: '2019-12-16' 5 | output: 6 | html_document: default 7 | pdf_document: default 8 | --- 9 | 10 | This homework sheet is due on the 2019-12-30 You may submit your answers in pairs. Submit a PDF file with your handwritten solutions. 11 | Submission will be performed electronically via Moodle. 12 | 13 | We urge you to start solving this sheet as soon as possible and, if you have any questions, come to visit us in reception hours next week. 14 | 15 | Across all the exercise. If not mentioned – use $\alpha$ = 0.05. 16 | 17 | 18 | ## Question 1 19 | 20 | A biologist performed a regression on how much a planet diameter (x) affects it's mass (y), Based on 30 samples. His concllusion was $y ̂ = 10+0.1x$. were x is in trillions of tons ($10^{12}$ kilogram). Which regression line would he get if he would use units of ten trillions of ton? ($10^{13}$ kilogram)? \ 21 | Guidance - compute the new $ss_x,ss_{xy}$ as a function of the old ones. See what happens to new $b_1 = ss_{xy}/ss_x$ and $b_0 = \bar{y} -b_1\bar{x}$ 22 | 23 | ## Question 2 24 | 25 | Given that $\bar{x} = 432.2,\sum_{i=1}^{10}x_i^2 = 2,048,810,\sum_{i=1}^{10}y_i^2 = 103,195, \hat{y} = 5.821 + 0.195x$ 26 | 27 | Calculate: 28 | 29 | 1. $\bar{y}$ 30 | 2. $R^2$ 31 | 3. is there a positive linear connection between x and y? Guidance, use hypothesis testing. 32 | 33 | 34 | ## Question 3 35 | 36 | Based on the following samples: 37 | 38 | 1. find the parameters of a simple linear regression model ($y = a + bx$) 39 | 2. Can we say that there is a linear relationship between x and y? Guidance, hypothesis test on the significance level of the regression. 
40 | 41 | | X| Y 42 | | ----|---- 43 | | 0| 0.5 44 | | 1| 2 45 | | 2| 4.2 46 | | 4| 6 47 | | 5| 6.5 48 | | 8| 8.5 49 | -------------------------------------------------------------------------------- /HW/old/HW5/HW5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/HW/old/HW5/HW5.pdf -------------------------------------------------------------------------------- /HW/old/Submission guidelines for data analysis course 2020.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/HW/old/Submission guidelines for data analysis course 2020.docx -------------------------------------------------------------------------------- /HW/old/Submission guidelines for data analysis course 2020.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/HW/old/Submission guidelines for data analysis course 2020.pdf -------------------------------------------------------------------------------- /HW/old/Submission guidelines for data analysis course 2019.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/HW/old/Submission guidelines for data analysis course 2019.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](https://raw.githubusercontent.com/adisarid/intro_statistics_R/bcdb6af4058308ebe999d0a477d6a1bb9030ffa2/misc/tau_engineering_logo.png) 2 | 3 | # Introduction to Statistics and Data Analysis with R 4 | 5 | This is the repository for the course of **introduction to statistics and data analysis**, taught in Tel-Aviv university (0560.1823). The course is taught in the Engineering faculty in the "Digital Sciences for High Tech" track. 6 | 7 | In this repository you will find all the required materials including lecture notes, references, class code, exercises, and more. 8 | 9 | ## Technical Information 10 | 11 | In this course there are 3 lecture hours + 1 exercise (instructor) hour. 12 | 13 | Lecturer: Dr. Adi Sarid. 14 | 15 | * Office hours: Please **coordinate in advance** via email. 16 | * E-mail: adi@sarid-ins.co.il. 17 | * Twitter: @SaridResearch 18 | * Mobile Phone: +972-50-8455450 (Please please try to reach out via email first). 19 | * Personal website: [adisarid.github.io](adisarid.github.io) 20 | 21 | Instructor: Mr. Raphael Shuhendler. 22 | 23 | * Office hours: Please **coordinate in advance** via email. 24 | * E-mail: shuhendler@mail.tau.ac.il. 25 | 26 | The course will be given in Hebrew, but all the supporting materials will be provided in English. 27 | 28 | Garding will be based on: 29 | 30 | * Final exam (60%) 31 | * Final project, pairs (40%) 32 | 33 | You will have homework but it's up to you to make sure you do them and understand them, we will not be grading them (only verifying that you submitted them). 34 | 35 | ## Prerequisites 36 | 37 | The prerequisites for this course are: 38 | 39 | * Introduction to Probability (0560.2801 or equivalent). 40 | * Mathematical Methods 1 (0560.2802 or equivalent). 
41 |
42 | This course is mainly designed for undergraduates with prior knowledge in probability and basic knowledge in math (a bit of algebra and a bit of calculus), doing a BA/BSc in the "Digital Sciences for High-Tech" track. However, it would also fit graduate students who want to strengthen their knowledge in statistics and data analysis (or learn the very basics of R).
43 |
44 | ## Goals
45 |
46 | This is an introductory course in statistics and data analysis. The course covers fundamental terms in statistics, such as significance, hypothesis testing, inference, sampling methods, variable types, and modelling (regression).
47 |
48 | During the course we will use the [R](https://www.r-project.org) language for demonstrations and exercises.
49 |
50 | We will use publicly available "open data sets" (e.g., from [Kaggle](https://kaggle.com) and [tidytuesday](https://github.com/rfordatascience/tidytuesday)) to demonstrate the various topics we will cover.
51 |
52 | ## Topics
53 |
54 | * Overview - from design to implementation: how statistical research is conducted, from the design phase, through data collection, to presentation.
55 | * Statistical inference and parameter estimation (e.g., average, standard deviation, percentiles).
56 | * Hypothesis testing:
57 | * Confidence intervals, unpaired tests, paired tests. Student's t-test, z-test, non-parametric tests.
58 | * Goodness of fit (Chi-square, Kolmogorov-Smirnov).
59 | * The problem with p-values and significance testing in the age of big data. False discovery rate (FDR).
60 | * Analysis of Variance (One-way and Two-way ANOVA).
61 | * Planning experiments (multiple comparisons), sample size calculations, power calculations.
62 | * Linear regression.
63 | * Correlation.
64 | * Logistic regression (if time permits).
65 |
66 | ## Software Prerequisites
67 |
68 | You will need to install [R](https://www.r-project.org) and [RStudio](https://rstudio.com/products/rstudio/download/). RStudio is not mandatory for running R, but it provides a very convenient environment for writing R code. Both are available for free (for RStudio, download the *RStudio Desktop Open Source License* version).
69 |
70 | ## Reading Materials
71 |
72 | OpenIntro Statistics is an introduction to statistics with R. It doesn't contain everything we will learn, but it provides a good intro to some topics. It can be downloaded for free [here](https://leanpub.com/openintro-statistics/) (click on "download sample" and the entire book downloads as a pdf file).
73 |
74 | * Diez, D. M., Barr, C. D., & Cetinkaya-Rundel, M. (2012). OpenIntro Statistics (pp. 174-175). OpenIntro.
75 |
76 | R4DS (R for Data Science) is a highly recommended book for learning R, and specifically the *tidyverse*, which is a collection of useful packages for data science. The book is mostly "technical", i.e., it does not provide many theoretical details. This book is also available in an online format [here](https://r4ds.had.co.nz/).
77 |
78 | * Wickham, H., & Grolemund, G. (2016). R for data science: import, tidy, transform, visualize, and model data. O'Reilly Media, Inc.
79 |
80 | Most of the theory I present during the course comes from these two books:
81 |
82 | * Walpole R.E., Myers R. H., Myers S. L., and Ye K.: Probability & Statistics for Engineers & Scientists. Prentice Hall, 9th ed., 2011. Available [online](https://fac.ksu.edu.sa/sites/default/files/probability_and_statistics_for_engineers_and_scientisst.pdf)
83 | * Runger G. & D. Montgomery: Applied Statistics and Probability for Engineers. Wiley, 7th ed., 2018. 
84 | 85 | Additional books: 86 | 87 | * Johnson, N.L. & Leone, F.C.: Statistics and Experimental Design Vol. 1.2, Wiley, 2nd ed., 1997. 88 | * Draper N. & H. Smith: Applied Regression Analysis, 3rd ed. Wiley, 1998. 89 | * Gibbons J.D.: Nonparametic Statistical Inference, Springer, 2011. 90 | 91 | ## Additional Sources 92 | 93 | You can find various online videos teaching statistics theory along with R coding examples. One such place is the Statistics of DOOM channel on youtube: [https://www.youtube.com/channel/UCMdihazndR0f9XBoSXWqnYg](https://www.youtube.com/channel/UCMdihazndR0f9XBoSXWqnYg). 94 | 95 | A recorded workshop on R [https://tau.cloud.panopto.eu/Panopto/Pages/Sessions/List.aspx?folderID=63ae0b2d-6a79-4d4d-82d5-ac8f0160961b](https://tau.cloud.panopto.eu/Panopto/Pages/Sessions/List.aspx?folderID=63ae0b2d-6a79-4d4d-82d5-ac8f0160961b). 96 | 97 | Online course in R from Tel-Hai College, available in Campus-IL [here](https://campus.gov.il/course/telhai-acd-rfp4-telhai-r/). 98 | 99 | The friendly guide for moving from Excel to R on my [youtube channel](https://www.youtube.com/watch?v=yRTD1zP5iEM&list=PLvH84evAlP42YtWm3XTjfK2ksC4evTg_U). 100 | 101 | ## How this Repository is Arranged 102 | 103 | This repository is arranged with subfolders as follows: 104 | 105 | ``` 106 | ├── exam_examples (examples for questions and exams) 107 | ├── exercises (exercise notes) 108 | ├── HW (home work exercises) 109 | ├── lectures (lecture notes) 110 | └── data (contains datasets we will use) 111 | ├── misc (miscellaneous, feel free to ignore this) 112 | └── project (project instructions and example) 113 | ``` 114 | -------------------------------------------------------------------------------- /additional_notes/MLE_Bernoulli.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "MLE for Bernoulli parameter $p$" 3 | author: "Adi Sarid" 4 | date: "`r Sys.Date()`" 5 | output: pdf_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | The likelihood function of a Bernoulli random variable is the probability of the results, i.e. 
a probability of a binomial random variable with $v$ success out of $n$ trials, with a probability $p$ for success: 13 | 14 | $$L(p)=p^v(1-p)^{(n-v)}$$ 15 | Taking the $\log(L)$ we get: 16 | 17 | $$\log(L(p)) = v\log(p) + (n-v)\log{(1-p)}$$ 18 | Then 19 | 20 | $$\frac{d\log(L(p))}{dp}=\frac{v}{p}-\frac{n-v}{1-p}$$ 21 | 22 | Require that this derivative is 0 (to find the maximum): 23 | 24 | $$\frac{v}{p} = \frac{n-v}{1-p}$$ 25 | 26 | If and only if: 27 | 28 | $$v(1-p) = p(n-v)$$ 29 | 30 | If and only if: 31 | 32 | $$np=v$$ 33 | Hence 34 | 35 | $$\hat{p}=\frac{v}{n}$$ 36 | Using the second derivative (to make sure that this is indeed a **maximum** of the likelihood): 37 | 38 | $$\frac{d\log(L(p))}{d^2p}=-\frac{v}{p^2}-\frac{n-v}{(1-p)^2} < 0$$ 39 | $$\square$$ 40 | 41 | 42 | -------------------------------------------------------------------------------- /additional_notes/MLE_Bernoulli.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/additional_notes/MLE_Bernoulli.pdf -------------------------------------------------------------------------------- /books/openintro-statistics-sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/books/openintro-statistics-sample.pdf -------------------------------------------------------------------------------- /distribution_shiny_app/app.R: -------------------------------------------------------------------------------- 1 | # 2 | # A small shiny app which illustrates the normal distribution, 3 | # and the relationship between alpha, z, one sided and two sided intervals 4 | 5 | library(shiny) 6 | library(tidyverse) 7 | 8 | source("plot_z_score.R") 9 | 10 | z_dense <- tibble(z_range = seq(-3, 3, by = 0.025), 11 | density = dnorm(z_range), 12 | p_range = pnorm(z_range)) 13 | 14 | ui <- fluidPage( 15 | theme = shinythemes::shinytheme("united"), title = "Demonstrating the normal distribution", 16 | fluidRow( 17 | sidebarLayout( 18 | sidebarPanel(width = 3, 19 | fluidRow( 20 | h3("Input parameters"), 21 | sliderInput("alpha", "α-level (alpha)", value = 0.05, 22 | min = 0, max = 1, step = 0.025, 23 | animate = TRUE), 24 | verbatimTextOutput("z"), 25 | # sliderInput("z", "z-score", value = qnorm(0.05), 26 | # min = -3, max = 3, step = 0.05), 27 | radioButtons("alternative", "Two- or one-sided test", 28 | choices = c("One sided" = "one.sided", 29 | "Two sided" = "two.sided"), 30 | selected = "one.sided" 31 | ) 32 | )), 33 | mainPanel( 34 | h3("Illustration of the normal distribution"), 35 | plotOutput("z_plot"), 36 | fluidRow( 37 | p("This app was generated by Adi Sarid, as a tool to demonstrate the relationship between the quantiles, the density, and the cumulative distribution function of a normal distribution. 
38 | Change the controls on the right to see how it reflects on the chart."), 39 | p("The source code for this app is available in the a github repository:", 40 | a("https://github.com/adisarid/intro_statistics_R/tree/master/distribution_shiny_app", 41 | href = "https://github.com/adisarid/intro_statistics_R/tree/master/distribution_shiny_app", 42 | target = "_blank")), 43 | p("The use of this tool is permitted via the cc-by-sa, with attribution to ", 44 | a("Adi Sarid", href = "https://adisarid.github.io", 45 | target = "_blank")) 46 | ) 47 | ) 48 | ) 49 | ) 50 | ) 51 | 52 | # Define server logic required to draw a histogram 53 | server <- function(input, output, session) { 54 | 55 | output$z_plot <- renderPlot({ 56 | 57 | plot_z_score(p = input$alpha, 58 | alternative = input$alternative, 59 | z_dense) 60 | }) 61 | 62 | output$z <- renderText({ 63 | if (input$alternative == "one.sided"){ 64 | paste0("z-score=", qnorm(p = input$alpha)) 65 | } else { 66 | paste0("z-score=", qnorm(p = input$alpha/2)) 67 | } 68 | }) 69 | 70 | } 71 | 72 | # Run the application 73 | shinyApp(ui = ui, server = server) 74 | -------------------------------------------------------------------------------- /distribution_shiny_app/distribution_shiny_app.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /distribution_shiny_app/plot_z_score.R: -------------------------------------------------------------------------------- 1 | # This script illustrates z-score. 
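# plot_z_score() draws the standard normal density and, for a given tail
# probability p (alpha), shades the corresponding one-sided or two-sided
# rejection region(s), marking the matching quantile qnorm(p) or qnorm(p/2).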
2 | ## Example: 3 | # plot_z_score(1, "two.sided") 4 | # plot_z_score(0.05, "one.sided") 5 | plot_z_score <- function(p = NULL, 6 | alternative = c("two.sided", "one.sided"), 7 | z_dense = tibble(z_range = seq(-3, 3, by = 0.05), 8 | density = dnorm(z_range), 9 | p_range = pnorm(z_range))){ 10 | 11 | # prep the plot 12 | base_plot <- ggplot(z_dense, aes(x = z_range, y = density)) + 13 | geom_line() 14 | 15 | subtitle_str <- "" 16 | 17 | 18 | 19 | # split to two sided/one sided alternative 20 | if (alternative[1] == "two.sided"){ 21 | z_of_p <- qnorm(p/2) 22 | div_factor <- 0.5 23 | } else { 24 | z_of_p <- qnorm(p) 25 | div_factor <- 1 26 | } 27 | 28 | density_of_p <- dnorm(z_of_p) 29 | 30 | base_plot <- base_plot + 31 | geom_area(data = z_dense %>% filter(p_range <= p*div_factor), 32 | aes(x = z_range, y = density), fill = "lightblue", alpha = 0.5) + 33 | geom_segment(x = -3, xend = z_of_p, 34 | y = density_of_p, yend = density_of_p, color = "red") + 35 | geom_segment(x = z_of_p, xend = z_of_p, y = density_of_p, yend = 0, color = "red") 36 | 37 | if (alternative[1] == "two.sided"){ 38 | base_plot <- base_plot + 39 | geom_area(data = z_dense %>% filter(p_range >= 1 - p*div_factor), 40 | aes(x = z_range, y = density), fill = "lightblue", alpha = 0.5) + 41 | geom_segment(x = 3, xend = -z_of_p, 42 | y = density_of_p, yend = density_of_p, color = "red") + 43 | geom_segment(x = -z_of_p, xend = -z_of_p, y = density_of_p, yend = 0, color = "red") 44 | 45 | } 46 | 47 | subtitle_str <- paste0(subtitle_str, "p = Phi(z) = pnorm(z) = ", round(p*div_factor, 3), 48 | "; z = qnorm(p) = ", 49 | round(z_of_p, 3)) 50 | 51 | 52 | base_plot + 53 | xlab("z_p") + 54 | ylab("density\ndnorm(z_p)") + 55 | theme_bw() + 56 | ggtitle("The normal distribution", 57 | subtitle = subtitle_str) 58 | 59 | } 60 | -------------------------------------------------------------------------------- /exam_examples/Exam_Moed_A.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exam_examples/Exam_Moed_A.pdf -------------------------------------------------------------------------------- /exam_examples/Exam_Moed_A_answers.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exam_examples/Exam_Moed_A_answers.pdf -------------------------------------------------------------------------------- /exam_examples/Exercise_Examples.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exam_examples/Exercise_Examples.pdf -------------------------------------------------------------------------------- /exam_examples/Exercise_Examples_Answers.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Test Exercises - Answers" 3 | author: "Adi Sarid" 4 | date: "June 2020" 5 | output: pdf_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | -------------------------------------------------------------------------------- /exercises/old/01 - Intro to R/00-Introduction.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "00-Introduction" 3 | author: "Adi Sarid / adi@sarid-ins.co.il" 4 | date: "`r 
Sys.Date()`" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Background 13 | 14 | You are now viewing a document generated via R Markdown. R Markdown is a friendly format for writing R code along with documentation that surrounds it. It's a very powerful tool - a simple text document can then be compiled very easily to an html, a pdf, word and additional formats. 15 | 16 | It is good for documenting and communicating your work to others, and we are going to rely on this format. 17 | 18 | The aim of this exercise is to get you familiar with the RStudio interface, while writing your first R Markdown document along with some R code. 19 | 20 | Ready? Let's get started! 21 | 22 | ## The console and some basic commands 23 | 24 | You can notice that when you open RStudio, the window is divided into 4 segments. In the lower segment (on the left) you can see the console. The console can be used to run R commands. Try running some code in the console, i.e., copy and paste the following code (line by line). 25 | 26 | Note that we are: 27 | 28 | * Using `<-` which is the sign used to set a variable's value. 29 | * We are using `a` and `b` as variable names. 30 | * `cars`, which is an example dataset. 31 | * `?` which is used to get help on commands. 32 | * `plot()` which is used to generate a base-r plot. 33 | 34 | ``` 35 | 1 + 1 36 | a <- 1 37 | b <- 2 38 | a + b 39 | cars 40 | ?cars 41 | ?plot 42 | plot(cars) 43 | ``` 44 | 45 | ### Answer/note these: 46 | 47 | Decipher what each command did. 48 | 49 | a. Did you notice the help pop up when you used `?cars` and `?plot`? 50 | b. Did you notice where the plot came up? 51 | 52 | Look at the console and click Ctrl+L. What happend? 53 | 54 | Set the marker in the console window, click on Ctrl + up arrow. What happened? 55 | 56 | Finally, type the letter c in the console and click Ctrl + up arrow. What does this do? 57 | 58 | *** 59 | 60 | ## File types you can use in RStudio 61 | 62 | Throughout the course we will rely heavily on writing in RMarkdown, however there are some more file types in RStudio. Use the file menu (File -> New -> R Script) to open up a new script, in the script type the code from the previous part above and click Ctrl + Shift + Enter. See what happens. 63 | 64 | Now mark the last two lines and click Ctrl + Enter (without the shift). What is the difference? 65 | 66 | When do you think you would use a script file versus an R Markdown file? 67 | 68 | Another important option is using R Projects. When you open up a new project it will generate an environment file called .Rproj. This file will preserve the relative location of the directory, and this will make it slightly easier to load and save files from within R (it will be a huge benefit later on). 69 | 70 | Let's start a new project: Click on File -> New Project 71 | Follow the wizard's instructions to open up a new project with project type "New Project". 72 | Name it "My first R project", and check the two checkboxes for "Create a .git repository" and for "Open in new session". 73 | 74 | In the new RStudio window that appeared, open up a new RMarkdown file, call it "My brand new RMarkdown", keep the default output format as html, and click OK. Save the RMarkdown file inside the directory (click on the save button in the upper left corner or on File -> Save). 75 | 76 | Now, click on "Knit" (Ctrl + Shift + K). 77 | 78 | ## Master addtional shortcuts and RStudio panes. 79 | 80 | Click on Alt+Shift+K. 
This is the shortcut window - it can help you if you ever forget shortcuts. 81 | Look at the RStudio window and try to answer the following short questions (please ask the one sitting next to you if you are lost!!!) 82 | 83 | 1. Where can you see all the environment variables currently loaded? 84 | 2. Where can you access the history of all the commands you previously ran? 85 | 3. Where would you look for the packages that are available to you? (RStudio has a pane for it, you just have to look for it) 86 | 4. If you are familiar with Git, how would you commit a file from within RStudio? 87 | 5. How would you find cheatsheets from within RStudio's menus? 88 | 89 | 90 | *** 91 | 92 | Congratulations! you've completed your very first exercise in R. 93 | -------------------------------------------------------------------------------- /exercises/old/01 - Intro to R/01- More Operations.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "01 Matrices dataframes and more" 3 | author: "Afek Adler" 4 | date: "r Sys.Date()" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Random Numbers 13 | Random Numbers allow us to simulate situations easily 14 | Setting seed allow us to reproduce the results of an experiment 15 | 16 | ``` {r Seed} 17 | rnorm(5) 18 | rnorm(5) 19 | set.seed(5) 20 | rnorm(5) 21 | set.seed(5) 22 | rnorm(5) 23 | ``` 24 | 25 | 26 | ``` {r Random vectors} 27 | rand_vec = rpois(n = 50, lambda = 10) 28 | mean(rand_vec) 29 | median(rand_vec) 30 | length(rand_vec) 31 | 32 | ``` 33 | ``` {r Random matrices} 34 | r <- 1000 #rows 35 | c <- 3 # columns 36 | rand_mat = matrix(runif(r*c), r, c) 37 | class(rand_mat) 38 | dim(rand_mat) 39 | dim(t(rand_mat)) 40 | rand_mat[5,2] 41 | # rand_mat[5,10] error 42 | ``` 43 | 44 | 45 | ``` {r head,tail, and sample} 46 | head(rand_mat,2) 47 | ``` 48 | 49 | 50 | ``` {r dataframes} 51 | cols <- c('a','b','c') 52 | df <- as.data.frame(rand_mat) 53 | names(df) <- cols 54 | head(df,2) 55 | ``` 56 | 57 | 58 | ``` {r read write csv} 59 | write.csv(df, "example_file.csv") 60 | new_file <- read.csv("example_file.csv") 61 | ``` 62 | ``` {r} 63 | sapply(df, class) 64 | sapply(new_file, class) 65 | ``` 66 | 67 | ``` {r} 68 | df$new_col = (df$a> 0.5)*1 69 | sapply(df, class) 70 | df$new_col = as.factor(df$new_col) 71 | sapply(df, class) 72 | ``` 73 | ``` {r} 74 | x <-rnorm(100) 75 | y <-rnorm(100) 76 | plot(x, y, xlab="This is the x-axis", ylab="This is the y-axis",main="Plot of X vs Y") 77 | ``` 78 | 79 | [normal distribution deriviation - recommended](https://www.youtube.com/watch?v=cTyPuZ9-JZ0) 80 | -------------------------------------------------------------------------------- /exercises/old/01 - Intro to R/01-Syntax, functions, loops, data types.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "01-Syntax, base, functions, loops and data types" 3 | author: "Adi Sarid / adi@sarid-ins.co.il" 4 | date: "`r Sys.Date()`" 5 | output: 6 | pdf_document: default 7 | html_document: default 8 | --- 9 | 10 | ```{r setup, include=FALSE} 11 | knitr::opts_chunk$set(echo = TRUE) 12 | ``` 13 | 14 | ## Goals of this exercise 15 | 16 | Familiarize yourself with the basics of R. This exercise encompasses: 17 | 18 | 1. Setting variables 19 | 2. Various data types, 20 | 3. Working with vectors, data.frames 21 | 4. Installing and loading packages 22 | 5. 
Building functions and base-R iterations 23 | 24 | 25 | 26 | ## Setting variables 27 | 28 | A variables can be defined using the arrow notation (which we have already seen in the previous lesson `<-`). You can also do the same with `=` but its less common and should not be used (apart from a specific case, in function arguments, which we will discuss later). 29 | 30 | ```{r setting new variables} 31 | a <- 1 32 | b <- 2 33 | c = 3 # just to show that this works 34 | 35 | a 36 | b 37 | c 38 | a*b 39 | b*c 40 | 41 | ``` 42 | 43 | Try to run the following code in the console, and think about questions which follow. 44 | ``` 45 | d <- c(a + b) 46 | d2 <- a + b 47 | d3 <- c*(a + b) 48 | ``` 49 | 50 | What does the first line do? 51 | What is the difference between the first line and the second line? (there is a difference even though the result is the same) 52 | What does the third line do? 53 | Why shouldn't you use `c` as a variable name? 54 | 55 | (hint: type `help("c")` in the console) 56 | 57 | 58 | Do not mix the assignment operator `=` (which I told you not to use), with the test equality operator `==`. 59 | 60 | ```{r test equality} 61 | 62 | a == 1 63 | b == a 64 | b > a 65 | 66 | ``` 67 | 68 | Also note the use of logicals: 69 | Please explain what each of the following operators do: `& | ! !=`, you can use the following (and modify it in any way): 70 | 71 | ``` 72 | # TRUE FALSE and the likes 73 | TRUE & FALSE 74 | TRUE | FALSE 75 | TRUE & TRUE 76 | !TRUE 77 | FALSE != TRUE 78 | FALSE == FALSE 79 | ``` 80 | 81 | Try the following code. Bonus points if you can explain what's wrong with it (and why that is). 82 | 83 | ```{r two is not two} 84 | 85 | sqrt2 <- sqrt(2) 86 | sqrt2 87 | 2 == sqrt2^2 88 | 89 | ``` 90 | 91 | ## Data types 92 | 93 | R has a number of "basic" data types: 94 | 95 | * Integer 96 | * Numeric (double) 97 | * Date (posix) 98 | * Factors 99 | * Logicals 100 | 101 | You can use `c()`, `rbind()`, `cbind()` to piece values together into vectors or more complex structures. 102 | Run the following code. 103 | 104 | ```{r} 105 | integer_example <- 10L 106 | integer_example 107 | numeric_example <- pi # pi is a reserved word... 108 | numeric_example 109 | character_examples <- "hello world" 110 | character_examples 111 | date_example <- as.Date("2018-10-01") 112 | date_example 113 | factor_example <- as.factor(c("big", "big", "small", "medium", "small", "big", "bigger")) 114 | factor_example 115 | summary(factor_example) 116 | logical_example <- c(TRUE, TRUE, FALSE, TRUE) 117 | logical_example 118 | ``` 119 | 120 | 121 | *** 122 | 123 | Using the `c()` command try to piece together the `logical_example` with the `factor_example`, i.e. (replace the `???` with something else): 124 | 125 | ``` 126 | c(logical_example, ???) 127 | ``` 128 | What happend to the factor vector? does the resulting vector make sense? 129 | 130 | Do the same with the `date_example` and the `factor_example`. What happend now? What precautions would you take when working with factors? 131 | 132 | *** 133 | 134 | ## data frames 135 | 136 | Data frames are a more complex structure which contains mixed data. R comes bundled with a number of "classical" data frames. Try the following: 137 | 138 | ``` 139 | mtcars 140 | iris 141 | ?mtcars 142 | ?iris 143 | ``` 144 | 145 | What types are the variables (columns) in each of these data sets? (double/factor/date/logical/integer/character) 146 | 147 | ## Packages 148 | 149 | An R package is a bundle of functions which share a common goal or vision. 
So far, we've been using base-r. The `tidyverse` packages is a package of packages. We will be working a lot with it. Let's try to load `tidyverse`. 150 | 151 | ``` 152 | library(tidyverse) 153 | ``` 154 | 155 | Did that work? if you got an error message you might need to install it. The following code will download and install the tidyverse. Be warned, this takes long. 156 | 157 | ``` 158 | install.packages("tidyverse") 159 | ``` 160 | 161 | Now if you installed the package, try to load it again `library(tidyverse)`. To use a function after you loaded a packages you can call `function_name(arg1 = ..., arg2 = ..., ...)`. Use `glimpse` to verify your answers for the previous questions (what types are the variables in mtcars and iris): 162 | 163 | ```{r tidyverse} 164 | library(tidyverse) 165 | 166 | glimpse(iris) 167 | glimpse(mtcars) 168 | 169 | ``` 170 | 171 | Use the function `count` to answer: 172 | 173 | How many flower-types are there in `iris`? 174 | How many cylinder values are there in `mtcars`? 175 | 176 | ``` 177 | count(iris, Species) 178 | count(???, cyl) 179 | 180 | ``` 181 | 182 | Later on, we will learn some more convinient ways to answer such questions. 183 | 184 | 185 | ## Functions and iterations - intermediate exercise 186 | 187 | We will discuss some base-R iterations, however, **in real situations you should do all in your power to avoid base-r loops!**. 188 | 189 | In the following exercise you will build a function which computes the Fibonacci series (0, 1, 1, 2, 3, 5, 8, 13, 21,...), and a loop which does the same. You will compare their runtime using `bench::mark()`. 190 | 191 | WARNING: 192 | This might feel like a **relatively complex** exercise if you're not fluent in programming, and it's not directly related to data analysis. The reasone I am giving you this exercise is that it is a great exercise to reherse elements we were discussing, in one single exercise. 193 | 194 | 1. Functions and the concept of recursion (a function calling itself) 195 | 2. Base-r loops 196 | 3. Conditionals (`if...else if...else`). 197 | 198 | First, if you don't know what the Fibonacci series is, go to [wikipedia, Fibonacci number](https://en.wikipedia.org/wiki/Fibonacci_number) and read about it (just the intro, should suffice). 199 | 200 | Complete the following function so that a call to the function will generate the n^th^ Fibonnacci number. Replace the `???`. 201 | Rows which start with the hash sign `#` are comments and will be ignored. 202 | 203 | Also, if you never heard the term "recursion" up until today, you might want to start with the second function `fib_loop`, and then think about the first one `fibonnaci`. 204 | 205 | ``` {r} 206 | fibonnacci <- function(n){ 207 | if (n == 0) { 208 | # starting condition for F_0 209 | return(0) 210 | } else if (n == 1) { 211 | # starting condition for F_1 212 | return(1) 213 | } else { 214 | # use recursion to calculate the number 215 | return(fibonnacci(n-1)+fibonnacci(n-2)) 216 | } 217 | } 218 | 219 | fibonnacci(30) 220 | 221 | ``` 222 | 223 | An alternative way to compute the Fibonnacci is via loops. Complete the following code: 224 | 225 | ``` {r} 226 | fib_loop <- function(n){ 227 | len <- n 228 | fibvals <- numeric(len) 229 | fibvals[1] <- 1 230 | fibvals[2] <- 1 231 | for (i in 3:len) { 232 | fibvals[i] <- fibvals[i-1]+fibvals[i-2] 233 | } 234 | return( tail(fibvals,1)) 235 | } 236 | fib_loop(30) 237 | 238 | ``` 239 | 240 | Check that your are getting consistent results. 
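(Added note, not part of the original exercise.) The plain recursive `fibonnacci` recomputes the same sub-problems over and over, which is why the benchmark below shows such a large gap. As a sketch only, here is a third, hypothetical variant (`fib_memo` is our name, not from the exercise) that caches intermediate results:

```{r fib memo sketch}
fib_memo <- local({
  cache <- c(0, 1)  # cache[i + 1] stores the i-th Fibonacci number (F_0 = 0, F_1 = 1)
  function(n) {
    if (n + 1 > length(cache)) {
      # Recall() calls the currently running function, i.e. recursion with caching
      cache[n + 1] <<- Recall(n - 1) + Recall(n - 2)
    }
    cache[n + 1]
  }
})
fib_memo(30)  # should agree with fibonnacci(30) and fib_loop(30)
```

If you are curious, you can pass `fib_memo(30)` as a third argument to `bench::mark()` below and see how much the caching helps.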
Now, compare the two functions using: 241 | 242 | ``` 243 | install.packages("bench") # if it is not installed 244 | bench::mark(fibonnacci(30), fib_loop(30)) 245 | ``` 246 | 247 | Which method is quicker? 248 | 249 | Note the use of `::`. I didn't mention this earlier, but instead of loading the package entirely with `library(bench)`, we're just calling the function `mark` from the package `bench` directly, using the double `::`. -------------------------------------------------------------------------------- /exercises/old/02/02.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "EX 02 - Data Handling & MLE" 3 | author: "Afek Adler" 4 | date: "`r Sys.Date()`" 5 | output: 6 | html_document : 7 | number_sections: TRUE 8 | --- 9 | Last exercise we did: 10 | 11 | * Spoke about the R language and saw examples. 12 | * Reminded you to go over basic probability distributions (Normal, Binomial, etc.) 13 | 14 | 15 | Today's topics: 16 | 17 | 1. Basic theory which will be the basis for the coming weeks. 18 | 2. Solve a question. 19 | 20 | **HW 01 is out - submission in two weeks from now** 21 | 22 | 23 | ## Revision - the normal distribution 24 | \[f\left(x | \mu, \sigma^{2}\right)=\frac{1}{\sqrt{2 \pi \sigma^{2}}} e^{-\frac{(x-\mu)^{2}}{2 \sigma^{2}}}\] 25 | 26 | 27 | ## Expectation and Variance of the sample sum (i.i.d) 28 | i.i.d stands for independent and identically distributed random variables. 29 | \[E\left(\sum_{i=1}^{n} X_{i}\right)=\sum_{i=1}^{n} E\left(X_{i}\right)= \sum_{i=1}^{n} \mu = n\mu \] 30 | 31 | \[V\left(\sum_{i=1}^{n} X_{i}\right)=\sum_{i=1}^{n} V\left(X_{i}\right)= \sum_{i=1}^{n} \sigma^{2} = n\sigma^{2} \] 32 | 33 | ## Expectation and Variance of the sample mean (i.i.d) 34 | \[E(\bar{X})=E\left(\frac{1}{n} \sum_{i=1}^{n} X_{i}\right)=\frac{1}{n} \sum_{i=1}^{n} E\left(X_{i}\right)=\frac{1}{n} \sum_{i=1}^{n} \mu=\frac{1}{n} \cdot n \mu=\mu\] 35 | 36 | 37 | \[V(\bar{X})=V\left(\frac{1}{n} \sum_{i=1}^{n} X_{i}\right)=\frac{1}{n^{2}} \sum_{i=1}^{n} V\left(X_{i}\right)=\frac{1}{n^{2}} \sum_{i=1}^{n} \sigma^{2}=\frac{1}{n^{2}} \cdot n \sigma^{2}=\frac{\sigma^{2}}{n}\] 38 | 39 | ## Bias variance decomposition 40 | \[\operatorname{MSE}(\hat{\Theta})=E(\hat{\Theta}-\theta)^{2}\] 41 | \[ \operatorname{MSE}(\boldsymbol{\hat{\Theta}})=E[\boldsymbol{\hat{\Theta}}-E(\boldsymbol{\hat{\Theta}})]^{2}+[\theta-E(\boldsymbol{\hat{\Theta}})]^{2}\] 42 | \[=V(\hat{\Theta})+(\text { bias })^{2}\] 43 | 44 | The MSE of an estimator is a criterion for choosing the best estimator: 45 | 46 | * Even if an estimator is unbiased, it does not necessarily have the lowest MSE 47 | 48 | *** 49 | The distribution of a sample (i.i.d) from the normal distribution (on board). 50 | *** 51 | 52 | 53 | 54 | ## Central limit theorem 55 | Let there be $n$ i.i.d. random variables such that 56 | \[E(X_i) = \mu,\quad V(X_i) = \sigma^{2} \] 57 | Then, for a large $n$, approximately: 58 | \[\sum_{n} X_{i} \sim N\left(n \mu, n \sigma^{2}\right)\] 59 | \[\bar{X} \sim N\left(\mu, \frac{\sigma^{2}}{n}\right)\] 60 | 61 | 62 | If $X_i$ is normally distributed, then this holds exactly for every $n$.
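(Added illustration, not part of the original exercise notes.) A quick simulation sketch of the central limit theorem: sample means of a skewed exponential distribution already look approximately normal for a moderate sample size. The sample size and number of replications are arbitrary choices:

```{r clt simulation sketch}
set.seed(42)
n <- 40                                                   # sample size (arbitrary)
sample_means <- replicate(5000, mean(rexp(n, rate = 1)))  # Exp(1): E(X) = 1, V(X) = 1
hist(sample_means, breaks = 40, probability = TRUE,
     main = "Sample means of Exp(1), n = 40", xlab = "sample mean")
curve(dnorm(x, mean = 1, sd = 1 / sqrt(n)), add = TRUE, lwd = 2)  # CLT approximation
```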
63 | 64 | ## Deriving an unbiased estimate for $\sigma^{2}$ ($S^{2}$) 65 | 66 | \[E\left(S^{2}\right)=E\left[\frac{\sum_{i=1}^{n}\left(X_{i}-\bar{X}\right)^{2}}{n-1}\right]=\frac{1}{n-1} E \sum_{i=1}^{n}\left(X_{i}-\bar{X}\right)^{2}\] 67 | 68 | \[=\frac{1}{n-1} E \sum_{i=1}^{n}\left(X_{i}^{2}+\bar{X}^{2}-2 \bar{X} X_{i}\right)=\frac{1}{n-1} E\left(\sum_{i=1}^{n} X_{i}^{2}-n \bar{X}^{2}\right)\] 69 | 70 | \[=\frac{1}{n-1}\left[\sum_{i=1}^{n} E\left(X_{i}^{2}\right)-n E\left(\bar{X}^{2}\right)\right]\] 71 | 72 | Recall from the probability course - 73 | \[V(X) = E(X^2) - [E(X)]^2\] 74 | We can deduce that $E\left(X_{i}^{2}\right)=\mu^{2}+\sigma^{2}$ and $E\left(\bar{X}^{2}\right)=\mu^{2}+\sigma^{2} / n$ 75 | So - 76 | \[E\left(S^{2}\right)=\frac{1}{n-1}\left[\sum_{i=1}^{n}\left(\mu^{2}+\sigma^{2}\right)-n\left(\mu^{2}+\sigma^{2} / n\right)\right]\] 77 | \[=\frac{1}{n-1}\left(n \mu^{2}+n \sigma^{2}-n \mu^{2}-\sigma^{2}\right) = \sigma^{2} \] 78 | 79 | 80 | # Some distributions that we will use in the future 81 | 82 | ## Student's t-distribution 83 | In probability and statistics, Student's t-distribution (or simply the t-distribution) is any member of a family of continuous probability distributions that arises when estimating the mean of a normally distributed population in situations where the sample size is small and the population standard deviation is unknown. It was developed by William Sealy Gosset under the pseudonym Student. 84 | For small n it has heavier tails than the normal distribution; as n grows (roughly n >= 30) it becomes very close to the normal distribution, and it gets closer as n increases. 85 | 86 | ```{r T, echo=FALSE, out.width = '50%'} 87 | knitr::include_graphics('t.png') 88 | ``` 89 | 90 | 91 | ## The chi-squared distribution 92 | In probability theory and statistics, the chi-square distribution (also chi-squared or χ2-distribution) with k degrees of freedom is the distribution of a sum of the squares of k independent standard normal random variables. The chi-square distribution is a special case of the gamma distribution and is one of the most widely used probability distributions in inferential statistics, notably in hypothesis testing or in construction of confidence intervals.
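(Added illustration, not from the original notes.) A short simulation check that a sum of $k$ squared standard normals indeed follows a $\chi^2_k$ distribution; the choice $k = 5$ is arbitrary:

```{r chi squared simulation sketch}
set.seed(1)
k <- 5
sums_of_squares <- replicate(10000, sum(rnorm(k)^2))   # sum of k squared N(0, 1) draws
hist(sums_of_squares, breaks = 50, probability = TRUE,
     main = paste0("Sum of ", k, " squared standard normals"), xlab = "value")
curve(dchisq(x, df = k), add = TRUE, lwd = 2)           # chi-squared(k) density
```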
93 | 94 | ```{r X^2, echo=FALSE, out.width = '50%'} 95 | knitr::include_graphics(knitr::include_graphics('x^2.png')) 96 | ``` 97 | 98 | **We will also use the F distribution** 99 | 100 | # Q1 & Q2 - Attached in PDF 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /exercises/old/02/EX 02 Q1 Q2.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/02/EX 02 Q1 Q2.docx -------------------------------------------------------------------------------- /exercises/old/02/EX 02 Q1 Q2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/02/EX 02 Q1 Q2.pdf -------------------------------------------------------------------------------- /exercises/old/02/t.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/02/t.png -------------------------------------------------------------------------------- /exercises/old/02/x^2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/02/x^2.png -------------------------------------------------------------------------------- /exercises/old/03/03- Point estimation and dplyr package.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "EX 03 - Data Handling & MLE" 3 | author: "Afek Adler" 4 | date: "`r Sys.Date()`" 5 | output: 6 | pdf_document: default 7 | html_document: 8 | number_sections: yes 9 | --- 10 | 11 | ```{r setup, include=FALSE} 12 | knitr::opts_chunk$set 13 | ``` 14 | 15 | 16 | Last excercise we did: 17 | 18 | * Expectency and Variance of the sample mean and sample sum 19 | * Central limit theoram 20 | * Bias variance decomposition of a point estimator 21 | * Derived an unbiased estimate for $\sigma^{2}(S^{2})$ 22 | * Covered the student's t-distribution and chi square distribution 23 | 24 | Today we will: 25 | 26 | * Cover methods for point estimattion 27 | * Get to know `dplyr` package 28 | * Try to develop a feeling for bayesian estimation. 29 | 30 | 31 | # Loss function 32 | A quick recap of the MSE of an estimator: 33 | \[\operatorname{MSE}(\hat{\Theta})=E((\hat{\Theta}-\theta)^{2})\] 34 | 35 | The squared loss did not come from heaven but from convienince. for example, another good criterion can be: 36 | 37 | \[\operatorname{MAE}(\hat{\Theta})=E(|\hat{\Theta}-\theta)|)\] 38 | 39 | Or many other types of error function. 40 | Also, at the lecture you have seen an *example* of Bayesian estimation where $\hat{\theta}_{\mathrm{MMSE}}=\int \theta \mathrm{p}(\theta | \mathbf{x}) \mathrm{d} \theta=\mathrm{E}(\theta | \mathbf{x})$ ,the derivation of this formula was taken under assumption of a square loss but there are also many other bayesian estimators like the maximum a posteriori estimation - $\hat{\theta}_{\mathrm{MAP}}=\underset{\theta}{\arg \max } p(\mathbf{\theta}| x)$. 41 | In the end of the excercise we will go deeper into this subject. 42 | 43 | 44 | # Point estimaion 45 | 46 | ## Some nice to have charactraists 47 | 48 | * Unbiased. 
If $E(\hat{\theta}) = \theta$ 49 | * Consistent. If the variance of the estimator tends to 0 as $n$ tends to $\infty$ 50 | 51 | Remember that the sample mean is an unbiased estimator of the population mean 52 | 53 | ## Point estimates with the method of moments (MOM) 54 | 55 | The first moment 56 | \[E(X)=\frac{\sum_{i=1}^{n} X_{i}}{n} \Rightarrow E(X)=\bar{X}\] 57 | The second moment 58 | \[E\left(X^{2}\right)=\frac{\sum_{i=1}^{n} X_{i}^{2}}{n} \Rightarrow \] 59 | \[V(X)=E\left(X^{2}\right)-E^{2}(X) \Rightarrow V(X)=\frac{\sum_{i=1}^{n} X_{i}^{2}}{n}-(\bar{X})^{2}\] 60 | 61 | 62 | **Q1: MOM** 63 | 64 | Let \[X = \mathcal{U}\left(\theta , \theta + 6 \right)\] 65 | Estimate $\theta$ with the method of moments \ 66 | 67 | The expectation of this uniform distribution is \[E(X) = (\theta + \theta+ 6 )/2 = \theta +3\] 68 | And \[E(X) = \bar{X} = \theta +3\] 69 | by the equation of the first moment. 70 | Therefore \[\hat{\theta} = \bar{X} -3\] 71 | 72 | 73 | 74 | ## Point estimates with maximum likelihood estimation (MLE) 75 | In statistics, maximum likelihood estimation (MLE) is a method of estimating the parameters of a probability distribution by maximizing a likelihood function, so that under the assumed statistical model the observed data is most probable. The point in the parameter space that maximizes the likelihood function is called the maximum likelihood estimate. The logic of maximum likelihood is both intuitive and flexible, and as such the method has become a dominant means of statistical inference. 76 | 77 | If the likelihood function is differentiable, the derivative test for determining maxima can be applied. In some cases, the first-order conditions of the likelihood function can be solved explicitly; for instance, the ordinary least squares estimator maximizes the likelihood of the linear regression model. Under most circumstances, however, numerical methods will be necessary to find the maximum of the likelihood function. ("Wikipedia") 78 | 79 | **Q2: MLE** 80 | With the binomial distribution - suppose we had a trial with 49 successes out of 80. 81 | 82 | \begin{equation} 83 | L(p)=f_{D}(\mathrm{H}=49 | p)=\left(\begin{array}{c}{80} \\ {49}\end{array}\right) p^{49}(1-p)^{31}\end{equation} 84 | \begin{equation} 85 | 0=\frac{\partial}{\partial p}\left(\left(\begin{array}{c}{80} \\ {49}\end{array}\right) p^{49}(1-p)^{31}\right) , \{\text{discard the binomial coefficient}\} 86 | \end{equation} 87 | \begin{equation} 88 | 0=49 p^{48}(1-p)^{31}-31 p^{49}(1-p)^{30}, \{(uv)' = u'v + v'u\} 89 | \end{equation} 90 | \begin{equation} 91 | =p^{48}(1-p)^{30}[49(1-p)-31 p] 92 | \end{equation} 93 | \begin{equation} 94 | =p^{48}(1-p)^{30}[49-80 p] 95 | \end{equation} 96 | This can also be solved by applying the log to the likelihood. 97 | It's clear that the maximum is at p = 49/80.
But let's see how we do it in R using the built-in [optimize](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/optimize.html) function: 99 | 100 | ``` {r find MLE} 101 | likelihood <- function(p) { 102 | p^49*((1-p)^31) 103 | } 104 | tolerance <- 10^(-4) 105 | pmax <- optimize(likelihood, c(0, 1), tol = tolerance, maximum = T)[[1]] 106 | delta <- abs(pmax - (49/80)) 107 | delta 108 | ``` 109 | 110 | # HW1 q3 111 | 112 | # Best practices for data handling with R 113 | 114 | R main data types: 115 | 116 | * vectors 117 | * matrices 118 | * data.frame - matrices with metadata and added functionality, which allow multiple data types 119 | * tibbles - a modern take on data frames 120 | 121 | `dplyr` is a grammar of data manipulation, providing a consistent set of verbs that help you solve the most common data manipulation challenges: 122 | 123 | * `mutate()` adds new variables that are functions of existing variables. 124 | * `select()` picks variables based on their names. 125 | * `filter()` picks cases based on their values. 126 | * `summarize()` reduces multiple values down to a single summary. 127 | * `arrange()` sorts the rows. 128 | 129 | ```{r Imports, message=FALSE} 130 | library(tidyverse) 131 | library(nycflights13) 132 | ``` 133 | 134 | 135 | This dataset has 19 columns, so the `head` function is not that useful when knitting to html. 136 | It is always useful to know how many missing values we have in our dataset; sometimes missing values are not just given to us as NA. 137 | ```{r describe dataset} 138 | head(flights, 2) 139 | colSums(is.na(flights))/nrow(flights) 140 | sapply(flights, class) 141 | ``` 142 | *** 143 | **At home - find a better way to print the classes and the % of missing values in R**
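(Added hint, not part of the original exercise, and certainly not the only answer.) One possible direction: `purrr` (loaded with the tidyverse) can map over the columns of `flights` and collect everything into a single tibble:

```{r classes and missingness sketch}
tibble(
  column      = names(flights),
  class       = purrr::map_chr(flights, ~ paste(class(.x), collapse = "/")),
  pct_missing = purrr::map_dbl(flights, ~ 100 * mean(is.na(.x)))
)
```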
144 | 145 | 146 | ## `select()` picks variables based on their names. 147 | 148 | ```{r select method} 149 | flight_distance_airtime <- flights %>% select(distance, air_time) 150 | flight_distance_airtime %>% head(2) 151 | ``` 152 | 153 | 154 | ## `mutate()` adds new variables that are functions of existing variables. 155 | ```{r mutate method} 156 | flight_distance_airtime %>% mutate(mean_speed = distance/air_time) %>% head(2) 157 | ``` 158 | If you only want to keep the new variables, use `transmute()`: 159 | ```{r transmute method} 160 | flight_distance_airtime %>% transmute(mean_speed = distance/air_time) %>% head(2) 161 | ``` 162 | 163 | ## `filter()` picks cases based on their values. 164 | ```{r filter method} 165 | flights %>% filter(is.na(dep_delay)) %>% head(2) 166 | ``` 167 | 168 | 169 | ## `arrange()` sorts the rows based on their values. 170 | ```{r arrange method} 171 | flights %>% arrange(desc(month)) %>% head(2) 172 | ``` 173 | 174 | ## `summarize()` reduces multiple values down to a single summary. 175 | 176 | ```{r summarize method} 177 | by_month <- group_by(flights, month) 178 | by_month %>% summarise(count = n()) %>% 179 | ggplot(mapping = aes(x = month, y = count)) + geom_bar(stat = "identity") + coord_cartesian(ylim = c(2*10^4, 3*10^4)) 180 | ``` 181 | 182 | ```{r another way} 183 | ggplot(data = flights) + 184 | geom_bar(mapping = aes(x = month)) + 185 | coord_cartesian(ylim = c(2*10^4, 3*10^4)) 186 | ``` 187 | 188 | 189 | Additional resources: 190 | 191 | + [r4ds](https://r4ds.had.co.nz/transform.html) 192 | + [dplyr](https://dplyr.tidyverse.org/) 193 | + [dplyr cheat sheet](https://github.com/rstudio/cheatsheets/blob/master/data-transformation.pdf) 194 | 195 | # Bayesian estimation 196 | 197 | For a given loss function, we want to minimize the expected posterior loss - 198 | \[\int L(\hat{\theta}, \theta)\, p(\theta|x)\, d\theta\] 199 | 200 | In the lecture, we have seen that when $L(\hat{\theta},\theta) = (\hat{\theta}- \theta)^2$ 201 | then $\hat{\theta} = E(\text{posterior})$; other loss functions lead to different estimators (like we have seen above). 202 | The logic of this method is as follows - we have some distribution over $\theta$, but we need to report a single value. So we formulate an objective function and minimize it with respect to the parameter that we want to find. The most commonly used summaries are the mean, median, and mode of that distribution, and, as we said, these are the Bayesian estimators for different loss functions.
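(Added illustration, not from the original notes.) For the Binomial data of Q2 (49 successes out of 80) with a conjugate Beta prior, the posterior is again a Beta distribution, and the loss functions mentioned above each pick a different summary of it. The Beta(2, 2) prior is an arbitrary choice made only for this sketch:

```{r bayesian estimators sketch}
alpha0 <- 2; beta0 <- 2                  # prior hyper-parameters (arbitrary choice)
successes <- 49; failures <- 31
a_post <- alpha0 + successes             # posterior is Beta(a_post, b_post)
b_post <- beta0 + failures
c(posterior_mean   = a_post / (a_post + b_post),            # minimizes squared loss
  posterior_median = qbeta(0.5, a_post, b_post),            # minimizes absolute loss
  posterior_mode   = (a_post - 1) / (a_post + b_post - 2))  # MAP estimator
```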
203 | 204 | 205 | 206 | 207 | -------------------------------------------------------------------------------- /exercises/old/03/03-_Point_estimation_and_dplyr_package.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/03/03-_Point_estimation_and_dplyr_package.pdf -------------------------------------------------------------------------------- /exercises/old/03/ex03.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/03/ex03.zip -------------------------------------------------------------------------------- /exercises/old/03/hw1_q3_solution.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/03/hw1_q3_solution.pdf -------------------------------------------------------------------------------- /exercises/old/04/04.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "EX 04 - confidense intervals and MLE motivation" 3 | author: "Afek Adler" 4 | date: "`r Sys.Date()`" 5 | output: 6 | html_document : 7 | number_sections: TRUE 8 | --- 9 | 10 | Last excercise we did: 11 | 12 | * Cover methods for point estimation 13 | * Get to know dplyr package 14 | * Talked about what makes an estimator a good one 15 | 16 | Today we will: 17 | 18 | * Explain again some terms (central limit theorem, what is an estimator, what is a moment) 19 | * Revise MLE with an example 20 | * Talk on interavel estimation 21 | 22 | 23 | ## What is an the difference between $\theta$ and $\hat{\theta}$ 24 | $\theta$ is the population parameter (we will never know it). 25 | $\hat{\theta}$ estimates $\theta$ based on a sample from the population. 26 | 27 | ## What is a moment? 28 | The moments of a function describes it's shape. 29 | In the method of moments we assume that our sample is big enough such that the sample moments are approximately equal to the population moments. And that's how we estimate parameters. 30 | 31 | ## Central limit theoram 32 | Q1 in excercise notes 33 | 34 | 35 | # Another MLE and Moments example 36 | Q2 in excercise notes 37 | 38 | 39 | # Interval Estimation 40 | 41 | ## The student's t distribution 42 | t distribution is used to model the expected values of a **small** sample from a population that is distributed noraml with unknown variance. 43 | As N increases, t distribution is getting closer and closer to the normal distribution. 44 | 45 | Lemma: 46 | $\frac{\bar{X}-\mu}{S / \sqrt{n}}$ is t- distributed. 47 | 48 | [Student t dist vs normal dist as function of n](https://rpsychologist.com/d3/tdist/) 49 | 50 | ## confidense interval 51 | 52 | In statistics, a confidence interval (CL) is a type of interval estimate, computed from the statistics of the observed data, that might contain the true value of an unknown population parameter. The interval has an associated confidence level, or coverage that, loosely speaking, quantifies the level of confidence that the deterministic parameter is captured by the interval. More strictly speaking, the confidence level represents the frequency (i.e. the proportion) of possible confidence intervals that contain the true value of the unknown population parameter. 
**In other words, if confidence intervals are constructed using a given confidence level from an infinite number of independent sample statistics, the proportion of those intervals that contain the true value of the parameter will be equal to the confidence level**. [wiki](https://en.wikipedia.org/wiki/Confidence_interval) 53 | 54 | There are one sided and two sided confidence intervals. 55 | 56 | ### confidence interval for the mean based on n samples: 57 | Based on our assumptions we get a different distribution for the sample mean; after we figure out how it is distributed, computing the confidence interval is straightforward. For example, for the mean - 58 | 59 | \[\mu \in\left(\bar{X}-(\text{\# of std's for the chosen confidence level})\cdot \operatorname{std}(\bar{X}),\ \bar{X}+(\text{\# of std's for the chosen confidence level})\cdot \operatorname{std}(\bar{X})\right) \] 60 | 1. When the variance is known and the population is assumed to be normally distributed, or n is "big" (n > 30), the sample mean is distributed 61 | \[\mathcal{N}\left(\bar{X} , \frac{\sigma^{2}}{n}\right)\] 62 | So a two sided confidence interval is: 63 | \[\mu \in\left(\bar{X}-Z_{1-\frac{\alpha}{2}} \frac{\sigma}{\sqrt{n}},\bar{X}+Z_{1-\frac{\alpha}{2}} \frac{\sigma}{\sqrt{n}}\right)\] 64 | 2. When the variance is **not** known and n is "big" (n > 30), the sample mean is distributed 65 | \[\mathcal{N}\left(\bar{X} , \frac{\hat{\sigma}^{2}}{n}\right)\] 66 | 3. When the variance is **not** known and n is **not** "big" (n <= 30), the sample mean is distributed 67 | \[t_{n-1}\left(\bar{X} , \frac{\hat{\sigma}^{2}}{n}\right)\] 68 | 69 | Reminder: 70 | \[\hat{\sigma}^{2} = S^2 =\frac{\sum_{i=1}^{n}\left(X_{i}-\bar{X}\right)^{2}}{n-1}=\frac{\sum_{i=1}^{n} X_{i}^{2}-n \bar{X}^{2}}{n-1} \] 71 | 72 | For your own understanding, for each case, derive the one sided and two sided confidence interval at home. 73 | 74 | * We have also seen in the lecture a CI for $\sigma^2$, and we will encounter it in HW2 as well. 75 | 76 | ### confidence interval for the proportion based on n samples 77 | If n is large enough, the sample proportion is approximately distributed: 78 | \[\mathcal{N}\left(\hat{p} , \frac{\hat{p}\hat{q}}{n}\right)\] 79 | 80 | **Q1:** 81 | An online advertising company is running an A/B test for a new advertisement and wants to build a confidence interval for the click-through rate (CTR) of a given test, such that the confidence interval will be shorter than 5% at a confidence level of 95%. What is the minimum number of samples for this purpose?
82 | 83 | \[0.05 = Length \geq 2*Z_{1-\frac{\alpha}{2}}*\sqrt{\frac{\hat{p}\hat{q}}{n}} = 2*Z_.975*\sqrt{\frac{\hat{p}\hat{q}}{n}} \Rightarrow\] 84 | \[= 2*1.96*\sqrt{\frac{\hat{p}\hat{q}}{n}} \Rightarrow\ 0.01275 \geq \sqrt{\frac{\hat{p}\hat{q}}{n}} \Rightarrow\ n \geq \frac{\hat{p}\hat{q}}{0.00016} \Rightarrow\] 85 | \[n \geq \frac{\hat{p}\hat{q}}{0.00016} = \frac{0.5*0.5}{0.00016} = 1562.5\] 86 | because \[ p(1-p) \leq 0.5*0.5 \ \forall p \in \{0,1\} \] 87 | 88 | If we know for example that the CTR is bounded by 4% than: 89 | \[n \geq 240 = \frac{0.04*0.96}{0.00016} \geq \frac{\hat{p}\hat{q}}{0.00016}\] 90 | 91 | # CL Verification 92 | Let's verify that indeed when we bouild CI (and..our assumptions are correct) than $1-\alpha$ our parameter is inside the CI: 93 | ``` {r confidense interval} 94 | miu = 10 95 | sigma = 3 96 | n = 10 97 | alpha = 0.1 98 | N_tests <- 10000 99 | counter <- 0 100 | error = qnorm(1-alpha/2)*(sigma/sqrt(n)) 101 | for (i in 1:N_tests) 102 | {sample = rnorm(n,miu,sigma) 103 | sample_mean <- mean(sample) 104 | left <- sample_mean-error 105 | right <- sample_mean + error 106 | between <- (left <= miu) & (miu <= right) 107 | counter <- counter+between} 108 | print(counter/N_tests) 109 | ``` 110 | 111 | 112 | -------------------------------------------------------------------------------- /exercises/old/05/05.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "EX 05 - Intro to hypothesis tests" 3 | author: "Afek Adler" 4 | date: "`r Sys.Date()`" 5 | output: 6 | html_document : 7 | number_sections: TRUE 8 | --- 9 | 10 | Last exercise we did: 11 | 12 | * Talk on interval estimation 13 | * Go throw a reminder on MLE with a motivation for machine learning (a mixture process) 14 | 15 | Today we talk about: 16 | 17 | * The null hypothesis 18 | * General framework hypothesis testing 19 | * Type 1 and type 2 errors 20 | * P value 21 | * The connection between hypothesis testing and confidence intervals 22 | * Hypothesis tests 23 | 24 | Some more topics that were covered in the lecture (will not be in the exercise): 25 | 26 | * Type-II error and determining the sample size 27 | * QQ plot (comparing distributions) 28 | 29 | 30 | # The null hypothesis 31 | In inferential statistics, the null hypothesis is a general statement or default position that there is nothing new happening, like there is no association among groups, or no relationship between two measured phenomena. Testing (accepting, approving, rejecting, or disproving) the null hypothesis—and thus concluding that there are or are not grounds for believing that there is a relationship between two phenomena (e.g. that a potential treatment has a measurable effect)—is a central task in the modern practice of science; the field of statistics gives precise criteria for rejecting a null hypothesis. 32 | The null hypothesis is generally assumed to be true until evidence indicates otherwise. 33 | In statistics, it is often denoted H0, pronounced as "H-nought", "H-null", or "H-zero" (or, even, by some, "H-oh"), with the subscript being the digit 0. 34 | 35 | The concept of a null hypothesis is used differently in two approaches to statistical inference. In the significance testing approach of Ronald Fisher, a null hypothesis is rejected if the observed data are significantly unlikely to have occurred if the null hypothesis were true. In this case, the null hypothesis is rejected and an alternative hypothesis is accepted in its place. 
If the data are consistent with the null hypothesis, then the null hypothesis is not rejected. In neither case is the null hypothesis or its alternative proven; the null hypothesis is tested with data and a decision is made based on how likely or unlikely the data are. This is analogous to the legal principle of presumption of innocence, in which a suspect or defendant is assumed to be innocent (null is not rejected) until proven guilty (null is rejected) beyond a reasonable doubt (to a statistically significant degree). [wiki](https://en.wikipedia.org/wiki/Null_hypothesis) 36 | 37 | # General Framework for hypothesis testing 38 | 39 | This is the procedure for hypothesis testing: 40 | 41 | 1. Identify the parameter of interest (i.e., proportion, expectancy, std, etc.) 42 | 2. State the null hypothesis $H_0$ 43 | 3. Specify the alternative hypothesis $H_1$ (one sided, two sided, etc.) 44 | 4. Choose the significance level 45 | 5. Determine what test statistic to use (e.g., $Z$, $T$, $\chi^2$) 46 | 6. State the rejection region for the statistic 47 | 7. Compute the sample quantities, plug them into the test statistic and compute it 48 | 8. Decide if $H_0$ should be rejected based on 6-7 49 | 50 | # Type 1 and Type 2 errors 51 | 52 | ```{r errors 1,echo=FALSE, out.width = "400px"} 53 | photo_path <- 'https://i.stack.imgur.com/R0ncP.png' 54 | destination_path <- 'type1_type2_errors.png' 55 | if (!(file.exists(destination_path))){ 56 | download.file(photo_path,destination_path, mode = 'wb')} 57 | knitr::include_graphics(destination_path) 58 | ``` 59 | ```{r errors 2,echo=FALSE, out.width = "400px"} 60 | photo_path <- 'https://www.dummies.com/wp-content/uploads/436264.image0.jpg' 61 | destination_path <- 'type1_type2_errors2.jpg' 62 | if (!(file.exists(destination_path))){ 63 | download.file(photo_path,destination_path, mode = 'wb')} 64 | knitr::include_graphics(destination_path) 65 | 66 | ``` 67 | 68 | On board: 69 | 70 | * $\alpha$ - the probability of rejecting *$H_0$* when *$H_0$* is true (type I error) 71 | * $1 - \alpha$ 72 | * $\beta$ - the probability of not rejecting *$H_0$* when *$H_1$* is true (type II error) 73 | * $1- \beta$ 74 | * $p_{value}$ - Intuition - a low $p_{value}$ means it's not very likely that $H_0$ generated our sample 75 | * Rejection region $C$ and its counterpart, the acceptance region $\overline{C}$ 76 | 77 | 78 | # The relationship between hypothesis testing and confidence intervals 79 | It can be shown that $H_0$ is accepted if and only if the confidence interval contains the parameter value assumed under $H_0$. 80 | For example, let's look at the hypothesis test for the mean: 81 | 82 | Acceptance region - 83 | \[\mu_{0}-z_{1-\frac{\alpha}{2}} \frac{\sigma}{\sqrt{n}} \leq \bar{x} \leq \mu_{0}+z_{1-\frac{\alpha}{2}} \frac{\sigma}{\sqrt{n}}\] 84 | 85 | And by re-arranging both sides we get: 86 | 87 | \[\bar{x}-z_{1-\frac{\alpha}{2}} \frac{\sigma}{\sqrt{n}} \leq \mu_{0} \leq \bar{x}+z_{1-\frac{\alpha}{2}} \frac{\sigma}{\sqrt{n}}\] 88 | 89 | Which is exactly the confidence interval. 90 | 91 | # Hypothesis tests 92 | 93 | ## Goodness of fit test: 94 | Goodness of fit tests are used to test how well our empirical distribution fits a theoretical 95 | distribution. 96 | Arrange the empirical distribution in $k$ bins, and let $O_i$ be the observed frequency in the $i$th class bin. Let $E_i$ 97 | be the expected frequency under the hypothesized distribution.
98 | 99 | $H_0$ : our observation are distributed according to ~Y (some distribution)\ 100 | $H_1$ : else 101 | 102 | The test statistic is: 103 | 104 | \[ \chi_{0}^{2}=\sum_{i=1}^{k} \frac{\left(O_{i}-E_{i}\right)^{2}}{E_{i}} \] 105 | 106 | We would reject the hypothesis if $\chi_{0}^{2}>\chi_{\alpha, k-p-1}^{2}$ \ 107 | Where p is the number of parameters of the distribution. 108 | 109 | **Q1 - on board** 110 | **Q2 - on board** 111 | 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /exercises/old/05/EX 05 - Intro to hypothesis tests.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/05/EX 05 - Intro to hypothesis tests.pdf -------------------------------------------------------------------------------- /exercises/old/05/Q1_2.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/05/Q1_2.docx -------------------------------------------------------------------------------- /exercises/old/05/Q1_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/05/Q1_2.pdf -------------------------------------------------------------------------------- /exercises/old/05/type1_type2_errors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/05/type1_type2_errors.png -------------------------------------------------------------------------------- /exercises/old/05/type1_type2_errors2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/05/type1_type2_errors2.jpg -------------------------------------------------------------------------------- /exercises/old/06/EX06.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/06/EX06.docx -------------------------------------------------------------------------------- /exercises/old/06/EX06.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/06/EX06.pdf -------------------------------------------------------------------------------- /exercises/old/07/EX07.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/07/EX07.pdf -------------------------------------------------------------------------------- /exercises/old/07/cs229-notes1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/07/cs229-notes1.pdf 
-------------------------------------------------------------------------------- /exercises/old/08/EX08.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/08/EX08.pdf -------------------------------------------------------------------------------- /exercises/old/08/Ex08.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/08/Ex08.docx -------------------------------------------------------------------------------- /exercises/old/09/ex09.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/09/ex09.docx -------------------------------------------------------------------------------- /exercises/old/09/ex09.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/09/ex09.pdf -------------------------------------------------------------------------------- /exercises/old/10/10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/10/10.pdf -------------------------------------------------------------------------------- /exercises/old/10/first_semester/08.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/10/first_semester/08.pdf -------------------------------------------------------------------------------- /exercises/old/10/first_semester/10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/10/first_semester/10.pdf -------------------------------------------------------------------------------- /exercises/old/10/first_semester/Exam Questiuon.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/10/first_semester/Exam Questiuon.pdf -------------------------------------------------------------------------------- /exercises/old/11/ex 11.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/11/ex 11.docx -------------------------------------------------------------------------------- /exercises/old/11/ex 11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/11/ex 11.pdf -------------------------------------------------------------------------------- /exercises/old/11/first_semester/HW4 Q1 and Q2 solution.docx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/11/first_semester/HW4 Q1 and Q2 solution.docx -------------------------------------------------------------------------------- /exercises/old/11/first_semester/HW4 Q1 and Q2 solution.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/11/first_semester/HW4 Q1 and Q2 solution.pdf -------------------------------------------------------------------------------- /exercises/old/11/first_semester/ex11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/11/first_semester/ex11.pdf -------------------------------------------------------------------------------- /exercises/old/12/12.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/12/12.docx -------------------------------------------------------------------------------- /exercises/old/12/12.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/12/12.pdf -------------------------------------------------------------------------------- /exercises/old/12/first_semester/ex12.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/12/first_semester/ex12.pdf -------------------------------------------------------------------------------- /exercises/old/13/ex 13.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/13/ex 13.pdf -------------------------------------------------------------------------------- /exercises/old/13/first_semester/ex 13.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/13/first_semester/ex 13.pdf -------------------------------------------------------------------------------- /intro_statistics_R.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /labs/answers/food_consumption-answers.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lab - food consumption and carbon footprint - answers" 3 | author: "Adi Sarid" 4 | date: "`r Sys.Date()`" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | library(tidyverse) 11 | ``` 12 | 13 | In this lab we cover the data science work flow. 
The lab will walk you through the steps performed in a data science project: 14 | 15 | * Data import 16 | * Data tidying 17 | * Transformation <-> Visualization <-> Modelling 18 | 19 | The lab will also cover the theoretic elements we covered such confidence intervals and hypothesis tests. This lab is to be performed in groups of 3 (i.e., zoom break rooms). 20 | 21 | # First exercise - open up a new project. 22 | 23 | First, open up a new project. To do this, in RStudio go to: 24 | 25 | * File -> New Project -> New Directory -> New Project. 26 | 27 | Provide your project directory name (under directory name), and click ok. Note that everything will close and RStudio will open up in a clean window. But don't worry, you can always view this file by visiting [this link](https://github.com/adisarid/intro_statistics_R/tree/master/labs/). 28 | 29 | Once a new RStudio instance has opened with a clean window of your new project, open up a new RMarkdown, which you will use to answer your questions to this lab by: 30 | 31 | * File -> New file -> R Markdown... 32 | 33 | Give your RMarkdown file a name and use the html outupt type. In your new RMarkdown file you can delete lines 1-10 and delete everything else in the file (lines 11-31). Try to knit it by clicking ctrl+k. 34 | 35 | Now we are ready to do some analysis. 36 | 37 | # Second exercise - get to know your data 38 | 39 | In this lab we are going to analyze food consumption data from *tidytuesday*. You can read about tidytuesday [here](https://github.com/rfordatascience/tidytuesday) - it's a github repository which is updated every Tuesday with data freely available for analyzing and sharpening your data analysis skills. 40 | Today we will use this dataset: [here](https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-02-18/readme.md). 41 | 42 | In groups, find the original source of the data (the nu3.de website within the links) and discuss: 43 | 44 | * What is the origin of the data? 45 | 46 | * Would you consider the data reliable/trust worthy? 47 | 48 | * How was the carbon footprint computed for each food type and country? 49 | 50 | Using the `read_csv` function from the `readr` package, read the food consumption data. 51 | 52 | * Use the following functions to understand how the data is arranged: `glimpse`, `head`, `View`. 53 | 54 | * Comparing the file you read with the original table, in what sense the file you read is more "tidy"? 55 | 56 | ```{r read the data} 57 | food_consumption <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-18/food_consumption.csv') 58 | 59 | glimpse(food_consumption) 60 | 61 | head(food_consumption) 62 | 63 | ``` 64 | 65 | *** 66 | 67 | **Checkpoint** - make sure you summarize your answers in a convinient way, we are going to discuss them in class. 68 | 69 | *** 70 | 71 | # Third exercise - visualization, and descriptive statistics of food consumption 72 | 73 | In the group, discuss: 74 | 75 | * What influences the consumption of various goods (e.g., pork, beef, fish, wheat, etc.). In which countries would you expect to find a low or high consumption of specific products? 76 | 77 | * Write down an equivalent formula for the variance. This is a theoretic part, you can write your answer in RMarkdown by enclosing it with the $ characters like this. 
RMarkdown interprets such test as LaTeX and creates a formula: 78 | 79 | $$\sigma^2=\operatorname{Var}(X)=E\left[(X-E[X])^2\right]=\ldots?$$ 80 | 81 | * Write down two estimates for the variance $\hat{\sigma}^2$ and $s^2$. Which would you prefer to use and why? 82 | 83 | * What plot(s) would you use to visualize the distribution of consumption of each food type? you can use the `ggplot2` cheatsheet to consider this. Once you reach a conclusion - create the chart. 84 | 85 | ```{r foo consumption distribution} 86 | 87 | ggplot(food_consumption, aes(x = food_category, y = consumption)) + 88 | geom_boxplot() + coord_flip() 89 | 90 | ggplot(food_consumption, aes(x = consumption)) + 91 | geom_histogram() + 92 | facet_wrap(~food_category) 93 | 94 | ggplot(food_consumption, aes(x = consumption)) + 95 | geom_density() + 96 | facet_wrap(~food_category, scales = "free") 97 | 98 | ``` 99 | 100 | * Based on the plot, what products have the highest (or lowest) variance? what is the meaning of having a high (or low) variance in this context of food consumption? 101 | 102 | * Verify this by computing the consumption standard deviation of each product, also add to your computation the mean, median and 1st and 3rd quartiles. You can use `group_by` and `summarize` for this. 103 | 104 | ```{r summary stats} 105 | food_descriptives <- food_consumption %>% 106 | group_by(food_category) %>% 107 | summarize(mean = mean(consumption), 108 | q1 = quantile(consumption, probs = 0.25), 109 | q2 = median(consumption), 110 | q3 = quantile(consumption, probs = 0.75), 111 | sd = sd(consumption)) %>% 112 | arrange(desc(mean)) 113 | food_descriptives 114 | ``` 115 | 116 | *** 117 | 118 | **Checkpoint** - together in class, we're going to discuss and solve the exercise so far. 119 | 120 | *** 121 | 122 | # Fourth exercise - modelling - confidence intervals and hypothesis tests of consumption 123 | 124 | In this part we will create a number of confidence intervals. Follow these steps in order to answer this question: 125 | 126 | * First, decide what kind of confidence interval is to be used (what statistic are you using) and write it down as a formula: 127 | 128 | $$T_{\text{df}=n-1} = \frac{\bar{X} - \mu}{S/\sqrt{n}}$$ 129 | 130 | * And the confidence interval is therefore: 131 | 132 | $$\mu\in\bar{X}\pm t_{\alpha/2,n-1}S/\sqrt{n}$$ 133 | 134 | * Use the tibble you created in the last step of the previous part (`food_descriptives`), to create a confidence interval for all the food categories, with $\alpha=0.05$. Use the following code. 135 | 136 | ```{r confidence intervals t statistic} 137 | 138 | # First find the relevant t for the chosen alpha and the relevant degrees of freedom 139 | t0.05_129 <- qt(p = 0.05/2, df = 129) 140 | 141 | # Now use it on the tibble we computed 142 | food_descriptives %>% 143 | mutate(ci_lower_bound = mean + t0.05_129*sd/sqrt(130), 144 | ci_upper_bound = mean - t0.05_129*sd/sqrt(130)) %>% 145 | select(food_category, ci_lower_bound, mean, ci_upper_bound) 146 | 147 | ``` 148 | 149 | * Check your result using the `t.test` function for Fish. 150 | 151 | ```{r fish ci} 152 | 153 | fish_vector <- food_consumption %>% 154 | filter(food_category == "Fish") %>% 155 | pull(consumption) 156 | 157 | t.test(fish_vector) 158 | 159 | ``` 160 | 161 | * Formulate a hypothesis test which examines the expected consumption of Fish vs. Pork (with $H_0$ and $H_1$). 
162 | 163 | $$H_0: \mu_{\text{pork}} = \mu_{\text{fish}}$$ 164 | $$H_1: \mu_{\text{pork}} \neq \mu_{\text{fish}}$$ 165 | 166 | * Assuming that the variance of pork and fish consumption is the same, what test statistic would you use for this hypothesis? 167 | 168 | $$T=\frac{\bar{X}_1 - \bar{X}_2 - (\mu_1 - \mu_2)}{S_p\sqrt{1/n_1 + 1/n_2}}$$ 169 | 170 | $$S_p = \sqrt{\frac{(n_1-1)S_1^2 + (n_2-1)S_2^2}{n_1 + n_2 -2}}$$ 171 | 172 | With df$=n_1+n_2-2$. 173 | 174 | * Is this a paired or unpaired test? 175 | 176 | * Conduct the test by computing the test statistic and its p-value. You can do this either directly or with the `t.test` function, whichever you prefer. If you are using the `t.test`, note that you have to set the `var.equal` argument (to what?). 177 | 178 | ```{r compare fish and pork} 179 | pork_vector <- food_consumption %>% 180 | filter(food_category == "Pork") %>% 181 | pull(consumption) 182 | 183 | t.test(x = fish_vector, y = pork_vector, var.equal = T) 184 | 185 | ``` 186 | 187 | * How would you have conducted the test if the variances were assumed to be unequal? 188 | 189 | ```{r compare fish and pork unequal variance} 190 | t.test(x = fish_vector, y = pork_vector, var.equal = F) 191 | ``` 192 | 193 | *** 194 | 195 | **Checkpoint** - solving this exercise together in class. 196 | 197 | *** 198 | 199 | # Fifth exercise - visualizing the relationship between meat products and vegan products 200 | 201 | In this final part, we're going to use visualizations to examine the relationship between meat/dairy and vegan products. 202 | 203 | * Reclassify all the `food_category` into two types of products: meat/dairy versus vegan. You can use the definition of the following tibble, along with the `left_join` function, but you will probably need to read about it in the documentation. Another option is to use another function called `case_when` or `recode_factor`. 204 | 205 | ```{r food reclassification} 206 | food_types <- tribble(~food_category, ~food_type, 207 | "Beef", "Meat/Dairy", 208 | "Eggs", "Meat/Dairy", 209 | "Fish", "Meat/Dairy", 210 | "Lamb & Goat", "Meat/Dairy", 211 | "Milk - inc. cheese", "Meat/Dairy", 212 | "Pork", "Meat/Dairy", 213 | "Poultry", "Meat/Dairy", 214 | "Rice", "Vegan", 215 | "Soybeans", "Vegan", 216 | "Wheat and Wheat Products", "Vegan", 217 | "Nuts inc. Peanut Butter", "Vegan") 218 | 219 | food_consumption_reclassified <- food_consumption %>% 220 | left_join(food_types) 221 | 222 | ``` 223 | 224 | * In the result you got, summarize the data such that each country will appear only twice (once for Meat/Dairy and once for Vegan consumption values), with the overall consumption for that type. Select only the `consumption` and `food_type` values. You should be using the functions `group_by` and `summarize`. 225 | 226 | ```{r reclassification summary} 227 | food_consumption_summarised <- food_consumption_reclassified %>% 228 | group_by(country, food_type) %>% 229 | summarize(consumption = sum(consumption)) 230 | ``` 231 | 232 | * We would like to create a chart in which each country is a point, the x axis is Meat/dairy and the y-axis is Vegan consumption. What kind of transformations would you need to do on the previous tibble to prepare it for such a plot? 233 | 234 | * Try to use `pivot_wider` in order to make that transformation, and use `ggplot` to create the chart.
235 | 236 | ```{r consumption chart} 237 | food_consumption_summarised %>% 238 | ungroup() %>% 239 | select(country, consumption, food_type) %>% 240 | pivot_wider(names_from = food_type, values_from = consumption) %>% 241 | ggplot(aes(x = `Meat/Dairy`, y = Vegan)) + 242 | geom_point() 243 | ``` 244 | 245 | * Can you identify any relationships between the two variables? (consumption of Meat/Dairy versus consumption of Vegan food?) 246 | 247 | 248 | *** 249 | 250 | **Checkpoint** - Solve exercise together in class. 251 | 252 | *** -------------------------------------------------------------------------------- /labs/answers/netflix movies and tv shows exercise - answers.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Netflix Movies and TV Shows - Exercise" 3 | author: "Adi Sarid" 4 | date: "2022-04-05" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Background 13 | 14 | The following exercise is based on the Netflix Movies and TV Shows data, extracted from Kaggle ([here](https://www.kaggle.com/datasets/shivamb/netflix-shows)). 15 | 16 | The goal of the exercise is to repeat and familiarize yourselves with the topics we were discussing in the past few weeks, but from a practical perspective. 17 | 18 | In this exercise you will: 19 | 20 | 1. Write your solutions in RMarkdown, combining documentation and code. 21 | 22 | 2. You will import and transform data, using different functions from `tidyverse` which we discussed two weeks ago. 23 | 24 | 3. You will do some visualizations to support and interpret your analysis. 25 | 26 | 4. You will test a number of hypothesis: 27 | 28 | 1. Comparing means. 29 | 30 | 2. Comparing distributions. 31 | 32 | Please do the exercise in pairs/groups. 33 | 34 | ## Reading the data 35 | 36 | Read the data, and use `glimpse` to understand it. 37 | 38 | ```{r reading the data, warning=FALSE, message=FALSE} 39 | library(tidyverse) 40 | netflix_raw <- readr::read_csv("https://raw.githubusercontent.com/adisarid/intro_statistics_R/master/labs/data/netflix_titles.csv") 41 | 42 | glimpse(netflix_raw) 43 | ``` 44 | 45 | In the group, discuss the different variables of the data, what type are they? which of them do you think you should transform? why and how? 46 | 47 | ## Data transformation 48 | 49 | Create a new table called `netflix` in which: 50 | 51 | - `type` is a factor. 52 | 53 | - `country` is a factor. 54 | 55 | - `duration` is numeric. 56 | 57 | - Add a new variable called `duration_units` indicating the units of `duration`. 58 | 59 | - Create a set of new logical variables indicating the title type, i.e., is the title: 60 | 61 | - International Movies 62 | 63 | - Dramas 64 | 65 | - Documentaries 66 | 67 | - Comedies 68 | 69 | - Action and Adventure 70 | 71 | Hint: you probably want to use `str_detect` for that. 
72 | 73 | ```{r transform and tidy} 74 | netflix <- netflix_raw %>% 75 | mutate(type = factor(type), 76 | country = factor(country), 77 | duration = parse_number(duration), 78 | duration_units = if_else(type == "Movie", "minutes", "seasons"), 79 | international_movie = str_detect(listed_in, "International Movies"), 80 | drama = str_detect(listed_in, "Dramas"), 81 | comedies = str_detect(listed_in, "Comedies"), 82 | documentaries = str_detect(listed_in, "Documentaries"), 83 | action_adventure = str_detect(listed_in, "Action & Adventure")) 84 | ``` 85 | 86 | ## TV Shows versus Movies 87 | 88 | Use a chart to plot the frequency (number of appearances per each country, per each type movies versus tv shows, for the following countries: "United States", "India", "United Kingdom", "Japan", "South Korea", "Canada", "Spain", "France", "Mexico"). Ignore titles with multiple countries. What position function should you use to compare the proportion of moveis versus tv shows (position_fill versus position_stack?) 89 | 90 | ```{r country appearances} 91 | netflix %>% 92 | filter(country %in% c("United States", "India", "United Kingdom", "Japan", "South Korea", "Canada", "Spain", "France", "Mexico")) %>% 93 | count(type, country) %>% 94 | ggplot(aes(x = country, fill = type, y = n)) + 95 | geom_col(position = position_fill()) + 96 | coord_flip() 97 | ``` 98 | 99 | ------------------------------------------------------------------------ 100 | 101 | What country is with the most TV shows and what country is with the least TV shows? 102 | 103 | Can you find a similarity between some countries in the same region (i.e., European versus APAC versus North America? 104 | 105 | Conduct a hypothesis test which examines if the proportion of tv shows in South Korea is higher than the proportion of tv shows in Japan. What is the p-value? is it a statistically significant finding (in the $\alpha=0.05$ level? 106 | 107 | ------------------------------------------------------------------------ 108 | 109 | ```{r hypothesis prop test} 110 | netflix_apac <- netflix %>% 111 | filter(country %in% c("Japan", "South Korea")) %>% 112 | count(country, type) 113 | 114 | netflix_apac 115 | 116 | prop.test(x = c(169, 158), n = c(169+76, 158+41)) 117 | ``` 118 | 119 | ## Movie duration 120 | 121 | Plot the distribution of movie duration, discuss the type of distribution (e.g., is it normally distributed or not?). Find the mean, sd, and a 95% confidence interval for movie duration. 122 | 123 | ```{r movie duration} 124 | netflix %>% 125 | filter(type == "Movie") %>% 126 | ggplot(aes(duration)) + 127 | geom_density() 128 | 129 | netflix %>% 130 | group_by(type) %>% 131 | summarize(mean(duration, na.rm = T), 132 | sd(duration, na.rm = T)) 133 | 134 | netflix_movies <- netflix %>% 135 | filter(type == "Movie") 136 | 137 | t.test(netflix_movies$duration) 138 | ``` 139 | 140 | ---- 141 | 142 | 1. Compare the two duration distributions of drama and non-drama movies (via a plot). 143 | 144 | 2. Compute the mean and variance of movie duration of dramas versus non-dramas. 145 | 146 | 3. Formulate and test the following hypothesis tests: 147 | 148 | a. Drama movies are longer than non-drama movies. Is this a paired or non-paired test? 149 | 150 | b. The variance of duration is different between drama and non-drama movies. 
151 | 152 | ```{r drama movie hypothesis} 153 | netflix_movies %>% 154 | group_by(drama) %>% 155 | summarize(mean(duration, na.rm = T), 156 | sd(duration, na.rm = T)) 157 | ggplot(netflix_movies, aes(duration, color = drama)) + 158 | geom_density() 159 | t.test(formula = duration ~ drama, data = netflix_movies) 160 | var.test(formula = duration ~ drama, data = netflix_movies, alternative = "two.sided") 161 | ``` 162 | 163 | 164 | -------------------------------------------------------------------------------- /labs/netflix movies and tv shows exercise.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Netflix Movies and TV Shows - Exercise" 3 | author: "Adi Sarid" 4 | date: "2022-04-05" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Background 13 | 14 | The following exercise is based on the Netflix Movies and TV Shows data, extracted from Kaggle ([here](https://www.kaggle.com/datasets/shivamb/netflix-shows)). 15 | 16 | The goal of the exercise is to repeat and familiarize yourselves with the topics we were discussing in the past few weeks, but from a practical perspective. 17 | 18 | In this exercise you will: 19 | 20 | 1. Write your solutions in RMarkdown, combining documentation and code. 21 | 22 | 2. You will import and transform data, using different functions from `tidyverse` which we discussed two weeks ago. 23 | 24 | 3. You will do some visualizations to support and interpret your analysis. 25 | 26 | 4. You will test a number of hypothesis: 27 | 28 | 1. Comparing means. 29 | 30 | 2. Comparing distributions. 31 | 32 | Please do the exercise in pairs/groups. 33 | 34 | ## Reading the data 35 | 36 | Read the data, and use `glimpse` to understand it. 37 | 38 | ```{r reading the data, warning=FALSE, message=FALSE, eval=FALSE} 39 | library(tidyverse) 40 | netflix_raw <- readr::read_csv("https://raw.githubusercontent.com/adisarid/intro_statistics_R/master/labs/data/netflix_titles.csv") 41 | 42 | glimpse(netflix_raw) 43 | ``` 44 | 45 | In the group, discuss the different variables of the data, what type are they? which of them do you think you should transform? why and how? 46 | 47 | ## Data transformation 48 | 49 | Create a new table called `netflix` in which: 50 | 51 | - `type` is a factor. 52 | 53 | - `country` is a factor. 54 | 55 | - `duration` is numeric. 56 | 57 | - Add a new variable called `duration_units` indicating the units of `duration`. 58 | 59 | - Create a set of new logical variables indicating the title type, i.e., is the title: 60 | 61 | - International Movies 62 | 63 | - Dramas 64 | 65 | - Documentaries 66 | 67 | - Comedies 68 | 69 | - Action and Adventure 70 | 71 | Hint: you probably want to use `str_detect` for that. 
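If `str_detect` is new to you, here is a tiny illustration on a made-up vector (not part of the Netflix data). It returns one `TRUE`/`FALSE` per element, which is exactly what we need for the logical columns:

```{r str detect toy example, eval=FALSE}
# Hypothetical genre strings, just to illustrate the pattern matching:
genres <- c("Dramas, International Movies", "Comedies", "Documentaries")
str_detect(genres, "Dramas")
# [1]  TRUE FALSE FALSE
```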
72 | 73 | (Fill-in the blanks) 74 | 75 | ```{r transform and tidy, eval=FALSE} 76 | netflix <- netflix_raw %>% 77 | mutate(type = factor(___), 78 | country = factor(___), 79 | duration = parse_number(___), 80 | duration_units = if_else(type == "Movie", ___, ___), 81 | international_movie = str_detect(listed_in, "International Movies"), 82 | ___ = str_detect(listed_in, "Dramas"), 83 | comedies = str_detect(___, ___), 84 | documentaries = ___, 85 | action_adventure = ___) 86 | ``` 87 | 88 | ## TV Shows versus Movies 89 | 90 | Use a chart to plot the frequency (number of appearances per each country, per each type movies versus tv shows, for the following countries: "United States", "India", "United Kingdom", "Japan", "South Korea", "Canada", "Spain", "France", "Mexico"). Ignore titles with multiple countries. What position function should you use to compare the proportion of movies versus tv shows (position_fill versus position_stack?) 91 | 92 | ```{r country appearances, eval=FALSE} 93 | netflix %>% 94 | filter(country %in% c("United States", "India", "United Kingdom", "Japan", "South Korea", "Canada", "Spain", "France", "Mexico")) %>% 95 | count(___, ___) %>% 96 | ggplot(aes(x = ___, fill = ___, y = n)) + 97 | geom_col(position = ___) + 98 | coord_flip() 99 | ``` 100 | 101 | ------------------------------------------------------------------------ 102 | 103 | What country is with the most TV shows and what country is with the least TV shows? 104 | 105 | Can you find a similarity between some countries in the same region (i.e., European versus APAC versus North America? 106 | 107 | Conduct a hypothesis test which examines if the proportion of tv shows in South Korea is higher than the proportion of tv shows in Japan. What is the p-value? is it a statistically significant finding (in the $\alpha=0.05$ level? 108 | 109 | ------------------------------------------------------------------------ 110 | 111 | ```{r hypothesis prop test, eval=FALSE} 112 | netflix_apac <- netflix %>% 113 | filter(country %in% ___) %>% 114 | count(country, type) 115 | 116 | netflix_apac 117 | 118 | prop.test(x = c(___, ___), n = c(___, ___)) 119 | ``` 120 | 121 | ## Movie duration 122 | 123 | Plot the distribution of movie duration, discuss the type of distribution (e.g., is it normally distributed or not?). Find the mean, sd, and a 95% confidence interval for movie duration. 124 | 125 | ```{r movie duration, eval=FALSE} 126 | netflix %>% 127 | filter(type == "Movie") %>% 128 | ggplot(aes(___)) + 129 | geom____() 130 | 131 | netflix %>% 132 | group_by(type) %>% 133 | summarize(mean(___, na.rm = ___), 134 | sd(___, na.rm = ___)) 135 | 136 | netflix_movies <- netflix %>% 137 | filter(type == "Movie") 138 | 139 | t.test(___) 140 | ``` 141 | 142 | ---- 143 | 144 | 1. Compare the two duration distributions of drama and non-drama movies (via a plot). 145 | 146 | 2. Compute the mean and variance of movie duration of dramas versus non-dramas. 147 | 148 | 3. Formulate and test the following hypothesis tests: 149 | 150 | a. Drama movies are longer than non-drama movies. Is this a paired or non-paired test? 151 | 152 | b. The variance of duration is different between drama and non-drama movies. 
153 | 154 | ```{r drama movie hypothesis, eval=FALSE} 155 | 156 | netflix_movies %>% 157 | group_by(___) %>% 158 | summarize(mean(duration, na.rm = T), 159 | sd(duration, na.rm = T)) 160 | 161 | ggplot(netflix_movies, aes(duration, color = drama)) + 162 | geom_density() 163 | 164 | t.test(formula = ___, data = ___) 165 | 166 | var.test(formula = ___, data = ___, alternative = "___") 167 | ``` 168 | 169 | 170 | -------------------------------------------------------------------------------- /lectures/00-Introduction.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/00-Introduction.pptx -------------------------------------------------------------------------------- /lectures/00-intro-binomial-dist.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | tibble(Infected = 25:100, 3 | Probability = dbinom(25:100, 300, 0.2)) %>% 4 | mutate(color = if_else(Infected %in% c(40, 50, 55), T, F)) %>% 5 | ggplot(aes(x = Infected, y = Probability, fill = color)) + 6 | scale_fill_manual(values = c("TRUE" = saridr::sarid_colors$light_blue_gradient, "FALSE" = "grey")) + 7 | geom_col() + 8 | guides(fill = F) + 9 | scale_y_continuous(labels = scales::percent_format(1)) + 10 | ggtitle("Density function n=300, p=0.2") 11 | 12 | tibble(Infected = 25:100, 13 | Probability = pbinom(25:100, 300, 0.2)) %>% 14 | mutate(label = if_else(Infected %in% c(40, 50, 55), glue::glue("{round(Probability*100,2)}%"), NA_character_)) %>% 15 | ggplot(aes(x = Infected, y = Probability)) + 16 | geom_line() + 17 | geom_label(aes(label = label)) + 18 | guides(fill = F) + 19 | scale_y_continuous(labels = scales::percent_format(1)) + 20 | ggtitle("Distribution function n=300, p=0.2") 21 | -------------------------------------------------------------------------------- /lectures/00-introduction/00-introduction_script.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction - visualizing and summarizing data" 3 | author: "Adi Sarid" 4 | date: "2019-10-19" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | library(tidyverse) 11 | ``` 12 | 13 | 14 | ## The power lifting data set 15 | 16 | We're going to demonstrate with power lifting data. 17 | This data set comes from tidytuesday (2019-10-08), see the documentation here: 18 | https://github.com/rfordatascience/tidytuesday/tree/master/data/2019/2019-10-08 19 | 20 | 21 | ```{r read the data} 22 | ipf_lifts <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-08/ipf_lifts.csv") 23 | 24 | glimpse(ipf_lifts) 25 | ``` 26 | 27 | ## Scatter plot 28 | 29 | A scatter plot allows us to examine the relationship between two continuous variables (i.e., numeric). For example, the following scatter plot will tell us the relationship between squats and bench presses (two types of exercises, which work on the legs -- quadriceps and the hands -- triceps respectively). We're going to sample observations (because all 40k observations will be too much for the chart). 
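A side note (an optional sketch, not needed for what follows): instead of sampling, over-plotting can also be handled by binning the full data, for example with `geom_bin2d`.

```{r binning instead of sampling}
# Optional alternative to sampling: 2-D binning summarises all rows at once.
ipf_lifts %>%
  filter(!is.na(best3squat_kg) & !is.na(best3bench_kg)) %>%
  ggplot(aes(x = best3squat_kg, y = best3bench_kg)) +
  geom_bin2d() +
  theme_bw()
```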
30 | 31 | ```{r scatter plot squat bench age} 32 | set.seed(0) # used to get consistent results 33 | 34 | # sample a subset of the file 35 | sampled_ipf_lifts <- ipf_lifts %>% 36 | filter(!is.na(best3squat_kg) & !is.na(best3bench_kg)) %>% 37 | sample_n(1500) 38 | 39 | # plot squat versus bench press 40 | ggplot(sampled_ipf_lifts, 41 | aes(x = best3squat_kg, y = best3bench_kg)) + 42 | geom_point(alpha = 0.3) + 43 | theme_bw() 44 | 45 | ggplot(sampled_ipf_lifts, 46 | aes(x = age, y = best3bench_kg)) + 47 | geom_point(alpha = 0.3) + 48 | theme_bw() 49 | 50 | ``` 51 | 52 | Think about what these scatter plots teach us: 53 | 54 | * What is the relationship between bench press weight and squat weight? 55 | * What is the meaning of the point with a negative squat weight measurement? 56 | * What do outlier points look like here? 57 | * Is there a relationship between bench press weight and age? 58 | 59 | Later on, we will learn how to extract the linear relationship between such variables (the equation), when there is such a relationship. This will be dealt with in the linear regression chapter of our course. 60 | 61 | ## Single variable: distribution, mean, median, standard deviation 62 | 63 | Now we discuss how to express the properties of a single, continuous variable. 64 | 65 | ### Histogram and Shape 66 | 67 | The distribution of a variable can be described with a histogram or with a density plot. The two are related (they basically show the same thing). 68 | 69 | ```{r histogram and density} 70 | 71 | ggplot(ipf_lifts, aes(x = age)) + 72 | geom_histogram() + 73 | theme_bw() 74 | 75 | ggplot(ipf_lifts, aes(x = age, y = stat(density))) + 76 | geom_histogram() + 77 | geom_density(color = "red", size = 1) + 78 | theme_bw() 79 | 80 | ipf_lifts %>% 81 | select(starts_with("best3")) %>% 82 | pivot_longer(cols = everything(), names_to = "exercise", values_to = "weight") %>% 83 | filter(weight > 0) %>% 84 | ggplot(aes(x = weight, y = stat(density))) + 85 | geom_density(aes(color = exercise), size = 1, bw = 10) + 86 | theme_bw() 87 | 88 | ``` 89 | 90 | 91 | *** 92 | 93 | Questions: 94 | [Mentimeter edit link](https://www.mentimeter.com/s/c53753031b6cccd429aebeedf531eb1d/fb09c578738d/edit) 95 | 96 | 1. Which exercise has relatively lower weight values? [mentimeter](https://www.menti.com/tgdkyggsnu) 97 | 2. The densities look bi-modal (two peaks). Can you guess why? 98 | 3. Which exercise has higher dispersion? 99 | 4. Can you think of familiar distributions (or a combination of such) which would fit these densities?
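A quick numeric companion to question 3 (an optional check, not one of the original questions) -- comparing the standard deviation of weight for each exercise:

```{r dispersion by exercise}
ipf_lifts %>%
  select(starts_with("best3")) %>%
  pivot_longer(cols = everything(), names_to = "exercise", values_to = "weight") %>%
  filter(weight > 0) %>%
  group_by(exercise) %>%
  summarize(sd_weight = sd(weight, na.rm = TRUE))
```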
100 | 101 | *** 102 | 103 | Here are some familiar forms of distributions: 104 | 105 | ```{r density and histogram demonstrations} 106 | 107 | familiar_distributions <- tibble(values = rnorm(1000), dist_type = "normal(mu=0,sig=1)") %>% 108 | bind_rows(tibble(values = runif(1000), dist_type = "uniform(a=0,b=1)"), 109 | tibble(values = rexp(1000), dist_type = "exponential(rate=1)"), 110 | tibble(values = rchisq(1000, df = 3), dist_type = "Chi-square(df=3)"), 111 | tibble(values = rbinom(1000, size = 6, prob = 0.5), dist_type = "binomial(n=6,p=0.5)")) 112 | 113 | ggplot(familiar_distributions, aes(x = values, y = stat(density))) + 114 | geom_histogram() + 115 | geom_density(color = "red", size = 1) + 116 | facet_wrap(~dist_type) + 117 | theme_bw() 118 | 119 | ``` 120 | 121 | Notice a few things: 122 | 123 | * The difference between a discrete distribution and a continuous distribution 124 | * The variance (dispersion) of the distributions (based on the second moment $EX^2 - (EX)^2$) 125 | * The asymmetry of certain distributions - which can be measured via skewness (based on the third moment $E[\left(\frac{X-\mu}{\sigma}\right)^3]$) 126 | * The tendency to yield outliers (extreme values) - which is measured by kurtosis (based on the fourth moment $E[\left(\frac{X-\mu}{\sigma}\right)^4]$) 127 | 128 | These can be computed either from the data, or analytically (when the distribution is known). We will delve deeper into these terms later on. 129 | 130 | ### Mean, standard deviation (and variance) 131 | 132 | Reminder: the mean or expected value of a random variable $X$ is defined as: 133 | 134 | \[ 135 | E[X] = \mu_X = \int_{x=-\infty}^\infty{xf(x)dx} 136 | \] 137 | 138 | In the case of a **discrete** variable, the integral becomes a sum: 139 | 140 | \[ 141 | E[X] = \sum_{x=-\infty}^\infty{xf(x)} 142 | \] 143 | 144 | When we are estimating the mean from a given sample, the weight of each observation is $1/n$; hence, we get the familiar form for computing the mean (average): 145 | 146 | \[ 147 | \bar{x} = \frac{1}{n}\sum_{i=1}^n{x_i} 148 | \] 149 | 150 | The variance is a measure of the dispersion of a distribution, defined as: 151 | 152 | \[ 153 | V(X) = \sigma^2=E[(X-\mu)^2] = \int{(x-\mu)^2f(x)dx} 154 | \] 155 | 156 | Note that $\sigma^2=E[X^2]-(EX)^2$. Two estimates of the standard deviation are commonly used: $\hat{\sigma}$ (the population formula) and $s$ (the sample formula). 157 | 158 | The standard deviation $\sigma$ is the square root of the variance. 159 | 160 | When computing the standard deviation of a population we use: 161 | 162 | \[ 163 | \hat{\sigma} = \sqrt{\frac{1}{n}\sum_{i=1}^n(x_i-\bar{x})^2} 164 | \] 165 | 166 | And for a sample we would use a denominator $n-1$ instead of $n$: 167 | 168 | \[ 169 | s = \sqrt{\frac{1}{n-1}\sum_{i=1}^n(x_i-\bar{x})^2} 170 | \] 171 | 172 | This is called Bessel's correction, which is applied to yield an **unbiased estimate** of the variance. We'll get back to that later on and explain bias in detail, and why this estimate is unbiased.
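As a small optional illustration (a simulation sketch added here, not part of the derivation above), we can see the effect of the $n$ versus $n-1$ denominator by drawing many small samples from a known distribution and averaging the two variance estimates:

```{r bessel correction simulation}
# Draw 10,000 samples of size 5 from N(0, sd = 2), whose true variance is 4,
# and average the two variance estimates across samples.
set.seed(42)
n <- 5
samples <- map(1:10000, ~ rnorm(n, mean = 0, sd = 2))
tibble(biased   = map_dbl(samples, ~ sum((.x - mean(.x))^2) / n),
       unbiased = map_dbl(samples, var)) %>%   # var() already uses the n - 1 denominator
  summarize(mean(biased), mean(unbiased))      # roughly 3.2 versus 4
```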
173 | 174 | ```{r compute mean and std} 175 | 176 | familiar_distributions %>% 177 | group_by(dist_type) %>% 178 | summarize(mean = mean(values), 179 | sd = sd(values), 180 | var = var(values)) 181 | 182 | 183 | ``` 184 | 185 | ### Boxplot, median, quartiles (and percentiles) 186 | 187 | ```{r boxplot example} 188 | 189 | ipf_lifts %>% 190 | select(starts_with("best3")) %>% 191 | pivot_longer(cols = everything(), names_to = "exercise", values_to = "weight") %>% 192 | filter(weight > 0) %>% 193 | ggplot(aes(y = weight, x = exercise)) + 194 | geom_boxplot() + 195 | theme_bw() 196 | 197 | ``` 198 | 199 | The median is the value below which 50\% of the observations fall and above which the other 50\% fall, i.e., the median is the observation right in the middle. If there are an even number of observations, there will be two observations "in the middle" and the median is defined as their average. 200 | 201 | The $P$-th **percentile** of a list of $n$ observations (sorted in an increasing order) is the number located at position $\left\lceil {\frac {P}{100}}\times n\right\rceil$. 202 | 203 | The estimate of percentile $p$ ($0\leq p\leq1$) is the value $v$ which yields: 204 | 205 | \[ 206 | P(X\leq v) = p 207 | \] 208 | 209 | (For the median $p=0.5$; for the quartiles $p\in\{0.25, 0.5, 0.75\}$.) 210 | 211 | The boxplot illustrates the quartiles (box ends) and the median (the line inside the box); the box extends with two whiskers up to $1.5\times IQR$ (the Inter-Quartile Range) beyond the quartiles. Observations beyond the whiskers are considered outliers, and are marked by points. 212 | 213 | ```{r example for IQR} 214 | 215 | ipf_lifts %>% 216 | select(starts_with("best3")) %>% 217 | pivot_longer(cols = everything(), names_to = "exercise", values_to = "weight") %>% 218 | filter(weight > 0) %>% 219 | group_by(exercise) %>% 220 | summarize(quartile1 = quantile(weight, 0.25), 221 | quartile3 = quantile(weight, 0.75)) %>% 222 | mutate(IQR = quartile3 - quartile1) %>% 223 | mutate(bottom_whisker = quartile1 - 1.5*IQR, 224 | top_whisker = quartile3 + 1.5*IQR) 225 | 226 | ``` 227 | 228 | An example of boxplots for the common distributions: 229 | 230 | ```{r boxplot common distributions} 231 | ggplot(familiar_distributions, aes(x = dist_type, y = values)) + 232 | geom_boxplot() + 233 | theme_bw() 234 | ``` 235 | 236 | ### Data transformations 237 | 238 | Sometimes, it's useful to transform data. Transformations can reveal new relationships between variables, and allow us to improve models. We will learn more about transformations later on in the course (when we discuss linear and logistic regression, for example). 239 | 240 | Here is a demonstration from the `diamonds` data set. You will work with this data set in the homework (questions from the R4DS book).
241 | 242 | ```{r relationship with transformations} 243 | 244 | ggplot(diamonds, 245 | aes(x = carat, y = price)) + 246 | geom_point(alpha = 0.3) + 247 | theme_bw() 248 | 249 | ggplot(diamonds, 250 | aes(x = log(carat), y = price)) + 251 | geom_point(alpha = 0.3) + 252 | theme_bw() 253 | 254 | ggplot(diamonds, 255 | aes(x = log(carat), y = log(price))) + 256 | geom_point(alpha = 0.3) + 257 | theme_bw() 258 | 259 | 260 | ``` -------------------------------------------------------------------------------- /lectures/00-introduction/IWER34_2019.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/00-introduction/IWER34_2019.xlsx -------------------------------------------------------------------------------- /lectures/00-introduction/st02_03.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/00-introduction/st02_03.xls -------------------------------------------------------------------------------- /lectures/01-Point Estimation Methods and Intervals.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/01-Point Estimation Methods and Intervals.pdf -------------------------------------------------------------------------------- /lectures/02-Intervals.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/02-Intervals.pdf -------------------------------------------------------------------------------- /lectures/03 - Hypothesis Tests.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/03 - Hypothesis Tests.pdf -------------------------------------------------------------------------------- /lectures/04 - Statistical inference for Two Samples.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/04 - Statistical inference for Two Samples.pdf -------------------------------------------------------------------------------- /lectures/05 - Simple Linear Regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/05 - Simple Linear Regression.pdf -------------------------------------------------------------------------------- /lectures/06 - Multiple Linear Regression and Correlation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/06 - Multiple Linear Regression and Correlation.pdf -------------------------------------------------------------------------------- /lectures/06-Note_about_overfitting.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "A note about overfitting" 3 | author: "Adi Sarid" 4 | 
date: "12/7/2019" 5 | output: html_document 6 | --- 7 | 8 | When the number of features $p$ is very big compared to the sample size $n$, we are prone to undesired effects, also termed "the curse of dimensionality". Mainly, when the number of features is as big as we want we can explain any observed phenomena in the train set, but not for a good reason, it is simply due to an excess in degrees of freedom. 9 | 10 | For example 11 | 12 | * Take $y$ completely random with 100 observations. 13 | * Take 95 parameters $x_1,\ldots,x_{95}$ and set their values randomly also. 14 | * Build a model, any model (we'll use linear regression here). 15 | * Analyze the model's fit. 16 | * Do the process only this time with a train/test split. 17 | 18 | ```{r overfitting in action, message=FALSE, warning=FALSE} 19 | library(tidyverse) 20 | set.seed(0) 21 | xvars <- data.frame(matrix(runif(100*95), ncol=95)) 22 | overfitting <- tibble(y = runif(100)) %>% 23 | bind_cols(xvars) 24 | glimpse(overfitting) 25 | ggplot(overfitting, aes(y)) + 26 | geom_histogram() + 27 | theme_bw() 28 | 29 | # these are just uniformly distributed numbers, should have no kind of relationship between variables 30 | # here's a model with just a few X's, and no overfit. The model is insignificant. 31 | # the only significant coefficient beta is the intercept (which is roughly equal to the average of y) 32 | 33 | lm_no_overfit <- lm(data = overfitting, 34 | formula = y ~ X1 + X2 + X3) 35 | summary(lm_no_overfit) 36 | 37 | # now, see what happens when we add all the 95 features 38 | # mostly, look at the R^2. It's almost 1! 39 | lm_overfit <- lm(data = overfitting, 40 | formula = y ~ .) 41 | summary(lm_overfit) 42 | 43 | # now, see the errors of each model 44 | overfitting <- overfitting %>% 45 | mutate(res_no_overfit = y - predict(lm_no_overfit, newdata = overfitting), 46 | res_overfit = y - predict(lm_overfit, newdata = overfitting)) 47 | overfitting %>% 48 | summarize(mean((res_no_overfit)^2), 49 | mean((res_overfit)^2)) 50 | # 80%+ reduction in mean absolute residual error! 51 | ``` 52 | 53 | It looks as if the over fit model is amazing, but this is a bluff. Let's do this again, only this time with a train/test split. 54 | 55 | ```{r overfitting detection with test set} 56 | overfitting <- overfitting %>% 57 | mutate(is_train = runif(nrow(overfitting)) < 0.8) 58 | 59 | lm_overfit_train <- lm(data = overfitting %>% filter(is_train), 60 | formula = y ~ .) 61 | 62 | overfitting <- overfitting %>% 63 | mutate(res_overfit_train = y - predict(lm_overfit_train, newdata = overfitting)) 64 | 65 | overfitting %>% 66 | filter(!is_train) %>% 67 | summarize(mean((res_no_overfit)^2), 68 | mean((res_overfit)^2), 69 | mean((res_overfit_train)^2)) 70 | 71 | # Now the "true face" of the model is discovered. See how high the error rate of the test set is! 72 | # Beware of overfitting models. Always use train/test. Watch out for n and p. 73 | ``` 74 | 75 | ## To sum up 76 | 77 | * Beware of overfitting. 78 | * Always use a train/test split (also possible train/test/validate or cross-validation). 79 | * Consider the number of parameters $p$ versus the sample size $n$. There is no "iron rule" here but the test set error will help guide you, and also, comparing a nominal model to your model will show you the contribution of your model. 
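As mentioned in the summary above, cross-validation is another way to expose overfitting. Here is a minimal k-fold sketch, reusing the `overfitting` data; the fold assignment and the `cv_mse` helper are my own illustration, not part of the original example.

```{r cross validation sketch}
set.seed(1)
cv_data <- overfitting %>% select(y, starts_with("X"))
k <- 5
fold_id <- sample(rep(1:k, length.out = nrow(cv_data)))

# Mean squared error on held-out folds for a given model formula
cv_mse <- function(formula) {
  map_dbl(1:k, function(fold) {
    fit <- lm(formula, data = cv_data[fold_id != fold, ])
    held_out <- cv_data[fold_id == fold, ]
    mean((held_out$y - predict(fit, newdata = held_out))^2)
  }) %>% mean()
}

cv_mse(y ~ X1 + X2 + X3)  # close to the in-sample error of the small model
cv_mse(y ~ .)             # much larger; R may also warn about a rank-deficient fit
                          # in some folds, itself a symptom of p being close to n
```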
-------------------------------------------------------------------------------- /lectures/07 - Regression, Design and Analysis of Single-Factor Experiments.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/07 - Regression, Design and Analysis of Single-Factor Experiments.pdf -------------------------------------------------------------------------------- /lectures/09-One_Two_way_ANOVA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/09-One_Two_way_ANOVA.pdf -------------------------------------------------------------------------------- /lectures/Example_for_multicolinearity_problem.R: -------------------------------------------------------------------------------- 1 | set.seed(42) 2 | ex1 <- tibble(x1 = runif(100)) %>% 3 | mutate(y = 5 + 3*x1 + rnorm(100, mean = 0, sd = 0.05), 4 | x2 = 2*x1 + rnorm(100, mean = 0, sd = 0.01)) 5 | 6 | ex1 %>% 7 | cor() 8 | 9 | summary(lm(y ~ x1, ex1)) 10 | summary(lm(y ~ x2, ex1)) 11 | summary(lm(y ~ x1 + x2, ex1)) 12 | 13 | car::vif(lm(y ~ x1 + x2, ex1)) 14 | -------------------------------------------------------------------------------- /lectures/data/montgomery_13.5_fabric_strength.csv: -------------------------------------------------------------------------------- 1 | chemical,fabric1,fabric2,fabric3,fabric4,fabric5 2 | 1,1.3,1.6,0.5,1.2,1.1 3 | 2,2.2,2.4,0.4,2,1.8 4 | 3,1.8,1.7,0.6,1.5,1.3 5 | 4,3.9,4.4,2,4.1,3.4 6 | -------------------------------------------------------------------------------- /lectures/data/montgomery_14.5_adhesion_force.csv: -------------------------------------------------------------------------------- 1 | primer_type,application_method,adhesion_force 2 | 1,Dipping,4 3 | 1,Dipping,4.5 4 | 1,Dipping,4.3 5 | 1,Spraying,5.4 6 | 1,Spraying,4.9 7 | 1,Spraying,5.6 8 | 2,Dipping,5.6 9 | 2,Dipping,4.9 10 | 2,Dipping,5.4 11 | 2,Spraying,5.8 12 | 2,Spraying,6.1 13 | 2,Spraying,6.3 14 | 3,Dipping,3.8 15 | 3,Dipping,3.7 16 | 3,Dipping,4 17 | 3,Spraying,5.5 18 | 3,Spraying,5 19 | 3,Spraying,5 20 | -------------------------------------------------------------------------------- /lectures/data/wildlife_impacts_small.csv: -------------------------------------------------------------------------------- 1 | height,n 2 | 0,15684 3 | 1,56 4 | 2,51 5 | 3,30 6 | 4,14 7 | 5,273 8 | 6,15 9 | 7,10 10 | 8,17 11 | 9,7 12 | 10,1078 13 | 11,4 14 | 12,7 15 | 13,3 16 | 14,2 17 | 15,125 18 | 17,2 19 | 18,1 20 | 20,503 21 | 21,3 22 | 22,1 23 | 23,1 24 | 25,97 25 | 26,1 26 | 27,1 27 | 30,325 28 | 35,27 29 | 36,1 30 | 37,1 31 | 40,79 32 | 42,1 33 | 45,6 34 | 46,1 35 | 47,1 36 | 48,1 37 | 50,1572 38 | 55,2 39 | 60,31 40 | 62,2 41 | 65,3 42 | 70,27 43 | 73,1 44 | 75,114 45 | 76,1 46 | 79,1 47 | 80,28 48 | 90,21 49 | 91,1 50 | 95,3 51 | 100,1793 52 | 110,2 53 | 120,7 54 | 125,23 55 | 130,1 56 | 140,2 57 | 142,1 58 | 150,311 59 | 170,1 60 | 173,1 61 | 175,8 62 | 178,1 63 | 180,5 64 | 185,1 65 | 190,5 66 | 200,1433 67 | 220,1 68 | 225,4 69 | 230,2 70 | 240,1 71 | 250,103 72 | 260,1 73 | 270,1 74 | 275,1 75 | 280,1 76 | 290,1 77 | 300,1143 78 | 310,1 79 | 320,1 80 | 325,1 81 | 330,3 82 | 350,62 83 | 370,2 84 | 375,1 85 | 381,1 86 | 400,633 87 | 410,1 88 | 413,1 89 | 423,1 90 | 424,1 91 | 425,1 92 | 430,2 93 | 431,1 94 | 450,33 95 | 460,1 96 | 480,3 97 | 500,1460 98 | 510,2 
99 | 530,1 100 | 540,1 101 | 545,1 102 | 550,15 103 | 560,2 104 | 565,1 105 | 570,1 106 | 580,3 107 | 600,358 108 | 620,1 109 | 650,11 110 | 680,2 111 | 700,290 112 | 710,1 113 | 725,1 114 | 750,32 115 | 754,1 116 | 800,608 117 | 809,1 118 | 850,11 119 | 883,1 120 | 890,1 121 | 900,137 122 | 922,1 123 | 950,3 124 | 995,1 125 | 1000,1354 126 | 1003,1 127 | 1025,1 128 | 1040,1 129 | 1050,2 130 | 1060,2 131 | 1100,91 132 | 1126,1 133 | 1150,4 134 | 1165,1 135 | 1174,1 136 | 1180,1 137 | 1200,301 138 | 1240,1 139 | 1250,15 140 | 1275,1 141 | 1300,100 142 | 1320,1 143 | 1323,1 144 | 1350,10 145 | 1370,1 146 | 1380,1 147 | 1400,109 148 | 1440,1 149 | 1450,2 150 | 1464,1 151 | 1480,1 152 | 1485,1 153 | 1490,1 154 | 1500,930 155 | 1550,1 156 | 1580,2 157 | 1600,78 158 | 1617,1 159 | 1640,2 160 | 1643,1 161 | 1650,1 162 | 1700,109 163 | 1750,4 164 | 1775,1 165 | 1800,161 166 | 1820,1 167 | 1830,1 168 | 1850,7 169 | 1870,1 170 | 1880,1 171 | 1900,61 172 | 1950,2 173 | 1975,3 174 | 1980,1 175 | 1990,2 176 | 2000,983 177 | 2020,1 178 | 2080,1 179 | 2100,37 180 | 2150,1 181 | 2200,78 182 | 2250,1 183 | 2300,57 184 | 2350,1 185 | 2375,1 186 | 2400,79 187 | 2448,1 188 | 2500,467 189 | 2510,1 190 | 2530,1 191 | 2600,43 192 | 2650,2 193 | 2700,35 194 | 2750,2 195 | 2800,54 196 | 2830,1 197 | 2850,4 198 | 2900,22 199 | 2950,1 200 | 2975,1 201 | 2992,1 202 | 2999,1 203 | 3000,1076 204 | 3027,1 205 | 3050,1 206 | 3100,13 207 | 3150,1 208 | 3200,31 209 | 3225,1 210 | 3250,2 211 | 3280,1 212 | 3300,25 213 | 3330,1 214 | 3400,55 215 | 3500,241 216 | 3550,1 217 | 3600,21 218 | 3664,1 219 | 3700,20 220 | 3740,1 221 | 3750,4 222 | 3800,25 223 | 3850,2 224 | 3900,9 225 | 3950,3 226 | 3960,1 227 | 4000,699 228 | 4100,7 229 | 4130,1 230 | 4200,16 231 | 4300,19 232 | 4400,22 233 | 4500,126 234 | 4600,13 235 | 4650,1 236 | 4700,6 237 | 4750,3 238 | 4800,16 239 | 4900,8 240 | 4940,1 241 | 4950,2 242 | 5000,585 243 | 5100,2 244 | 5200,19 245 | 5300,7 246 | 5330,1 247 | 5360,1 248 | 5380,1 249 | 5400,15 250 | 5450,1 251 | 5500,70 252 | 5600,9 253 | 5605,1 254 | 5700,7 255 | 5800,9 256 | 5850,1 257 | 5900,7 258 | 5960,1 259 | 5979,1 260 | 6000,375 261 | 6100,5 262 | 6200,10 263 | 6300,7 264 | 6400,4 265 | 6500,61 266 | 6600,6 267 | 6700,7 268 | 6800,8 269 | 6900,4 270 | 7000,246 271 | 7080,1 272 | 7100,2 273 | 7200,5 274 | 7300,3 275 | 7400,10 276 | 7500,56 277 | 7600,6 278 | 7700,5 279 | 7800,6 280 | 7900,1 281 | 8000,199 282 | 8300,4 283 | 8400,2 284 | 8500,30 285 | 8600,1 286 | 8700,3 287 | 8800,4 288 | 8900,1 289 | 9000,91 290 | 9200,2 291 | 9300,2 292 | 9400,4 293 | 9430,1 294 | 9500,29 295 | 9700,2 296 | 9800,3 297 | 9950,1 298 | 10000,164 299 | 10100,2 300 | 10200,4 301 | 10300,1 302 | 10400,8 303 | 10500,17 304 | 10700,2 305 | 10800,2 306 | 10900,1 307 | 11000,93 308 | 11200,1 309 | 11300,2 310 | 11400,1 311 | 11500,9 312 | 11600,1 313 | 11700,1 314 | 12000,46 315 | 12300,1 316 | 12500,16 317 | 12600,1 318 | 13000,27 319 | 13250,1 320 | 13400,1 321 | 13500,4 322 | 13700,1 323 | 13800,1 324 | 14000,19 325 | 14400,1 326 | 15000,16 327 | 15500,1 328 | 16000,9 329 | 16500,1 330 | 17000,2 331 | 17500,1 332 | 18000,3 333 | 18500,2 334 | 19000,1 335 | 20000,4 336 | 21000,1 337 | 22000,1 338 | 23000,1 339 | 24300,1 340 | 25000,2 341 | NA,18038 342 | -------------------------------------------------------------------------------- /lectures/files_during_lecture/05-file1.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | wildlife_small <- 
read_csv("lectures/data/wildlife_impacts_small.csv", col_types = cols()) %>% 4 | mutate(rounded_height = round(height/1000)) %>% 5 | group_by(rounded_height) %>% 6 | summarize(n = sum(n)) %>% 7 | filter(!is.na(rounded_height)) 8 | 9 | 10 | wildlife_hist <- ggplot(wildlife_small, aes(x = rounded_height, y = n)) + 11 | geom_col(fill = "darkorange", color = "black") + theme_bw() + xlab("Height [k feet]") + scale_y_log10() 12 | wildlife_points <- ggplot(wildlife_small, aes(x = rounded_height, y = n)) + 13 | geom_point() + 14 | theme_linedraw() + 15 | scale_y_log10() + 16 | stat_smooth(method = "lm") + 17 | xlab("Height [k feet]") 18 | cowplot::plot_grid(wildlife_hist, wildlife_points) 19 | 20 | 21 | 22 | 23 | wildlife_lm <- lm(formula = log(n) ~ rounded_height, data = wildlife_small) 24 | 25 | ggplot(tibble(res = wildlife_lm$residuals), 26 | aes(sample = res)) + 27 | geom_qq() 28 | 29 | 30 | 31 | summary(wildlife_lm) 32 | -------------------------------------------------------------------------------- /lectures/files_during_lecture/05-file2.R: -------------------------------------------------------------------------------- 1 | tibble(number = rnorm(150, mean = 2, sd = 5)) %>% 2 | ggplot(aes(sample = number)) + 3 | geom_qq() 4 | 5 | tibble(number = rnorm(150, mean = 0, sd = 1)) %>% 6 | ggplot(aes(x = number)) + 7 | geom_density() 8 | -------------------------------------------------------------------------------- /lectures/files_during_lecture/05-file3.R: -------------------------------------------------------------------------------- 1 | mtcars_lm <- lm(formula = mpg ~ disp, data = mtcars) 2 | 3 | mtcars_lm$residuals 4 | mtcars_lm$fitted.values 5 | 6 | summary(mtcars_lm) 7 | 8 | mtcars_new <- mtcars %>% 9 | mutate(resid = mtcars_lm$residuals) %>% 10 | mutate(prediction = mtcars_lm$fitted.values) 11 | 12 | 13 | ggplot(mtcars_new, aes(y = resid, x = disp)) + 14 | geom_point() 15 | 16 | ggplot(mtcars_new, aes(x = resid)) + 17 | geom_density() 18 | 19 | ggplot(mtcars_new, aes(sample = resid)) + 20 | geom_qq() 21 | -------------------------------------------------------------------------------- /lectures/images/Type_IandType_II_errors.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/images/Type_IandType_II_errors.jpg -------------------------------------------------------------------------------- /lectures/images/birds_eye_view1.svg: -------------------------------------------------------------------------------- 1 | 2 | 17 | 19 | 42 | 44 | 45 | 47 | image/svg+xml 48 | 50 | 51 | 52 | 53 | 54 | 59 | 61 | Estimation Methods 72 | 79 | 80 | 82 | Method of Moments 93 | 100 | 101 | 103 | Maximum Likelihood 114 | 121 | 122 | Create estimates(such as the average)to estimate the value of 143 | 145 | Parameters 156 | 163 | 164 | 167 | ExpectancyVarianceProportions... 
193 | 200 | 201 | 203 | ConfidenceIntervals 219 | 226 | 227 | 229 | HypothesisTests 245 | 252 | 253 | Using statistics such as Z, and Twe can compute 274 | 278 | 282 | 286 | 290 | 291 | 292 | -------------------------------------------------------------------------------- /lectures/images/link_for_survey_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/images/link_for_survey_example.png -------------------------------------------------------------------------------- /lectures/images/speeding_ticket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/images/speeding_ticket.png -------------------------------------------------------------------------------- /lectures/images/waze_not_accurate.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/images/waze_not_accurate.jpg -------------------------------------------------------------------------------- /lectures/mult_lin_reg_example.R: -------------------------------------------------------------------------------- 1 | # Coded example for manual multiple linear regression 2 | 3 | library(tidyverse) 4 | 5 | # In this example we illustrate manual computation of multiple linear regression 6 | 7 | # The model using the lm function ---- 8 | 9 | lm(mpg ~ hp + wt, mtcars) %>% 10 | summary() 11 | 12 | # Now let's do this manually ---- 13 | 14 | X <- mtcars %>% 15 | mutate(ones = 1) %>% 16 | select(ones, hp, wt) %>% 17 | as.matrix() 18 | 19 | # The beta coefficients ---- 20 | 21 | XtX <- crossprod(X) # same as t(X) %*% X 22 | 23 | XtX_inv <- solve(XtX) # Important! not the same as (XtX)^(-1) 24 | 25 | beta <- XtX_inv %*% t(X) %*% mtcars$mpg 26 | 27 | # We saw that \hat{\sigma^2}(X^tX)^{-1} is an estimate 28 | 29 | # Residual sum of square 30 | 31 | y_hat <- X %*% beta 32 | 33 | e_i <- (mtcars$mpg - y_hat) 34 | 35 | sigma_sq <- (e_i^2)/(32 - 3) # df = 32 - (p + 1) = 32 - 2 - 1 = 29 36 | 37 | # the sum of squared errors 38 | sum(sigma_sq) 39 | 40 | # the residual standard error 41 | sqrt(sum(sigma_sq)) 42 | 43 | # The standard deviation of coefficients ---- 44 | sqrt(sum(sigma_sq) * diag(XtX_inv)) 45 | -------------------------------------------------------------------------------- /lectures/what_is_z_score.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | # This script illustrates z-score. 
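# plot_z_score arguments:
#   z           - a z value (standard normal quantile); supply either z or p, not both
#   p           - a tail probability; with alternative = "two.sided" it is split
#                 equally between the two tails
#   alternative - "two.sided" (the default) or "one.sided"
# The function draws the standard normal density and shades the tail area(s)
# corresponding to the supplied z or p.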
4 | 5 | plot_z_score <- function(z = NULL, p = NULL, 6 | alternative = c("two.sided", "one.sided")){ 7 | 8 | if (!is.null(z) & !is.null(p)){ 9 | stop("Can't have both z and p...") 10 | } 11 | 12 | if (alternative[1] == "two.sided" & !is.null(p)){ 13 | div_factor <- 0.5 14 | } else { 15 | div_factor <- 1 16 | } 17 | 18 | if (!is.null(p)){ 19 | if (p > 0.5){ 20 | p <- 1-p 21 | } 22 | } 23 | 24 | z_dense <- tibble(z_range = seq(-3, 3, by = 0.05), 25 | density = dnorm(z_range), 26 | p_range = pnorm(z_range)) 27 | 28 | base_plot <- ggplot(z_dense, aes(x = z_range, y = density)) + 29 | geom_line() 30 | 31 | subtitle_str <- "" 32 | 33 | if (!is.null(z)){ 34 | p <- pnorm(q = z)*div_factor 35 | } 36 | 37 | z_of_p <- qnorm(p*div_factor) 38 | density_of_p <- dnorm(z_of_p) 39 | 40 | base_plot <- base_plot + 41 | geom_area(data = z_dense %>% filter(p_range <= p*div_factor), 42 | aes(x = z_range, y = density), fill = "lightblue", alpha = 0.5) + 43 | geom_segment(x = -3, xend = z_of_p, 44 | y = density_of_p, yend = density_of_p, color = "red") + 45 | geom_segment(x = z_of_p, xend = z_of_p, y = density_of_p, yend = 0, color = "red") 46 | 47 | if (alternative[1] == "two.sided"){ 48 | base_plot <- base_plot + 49 | geom_area(data = z_dense %>% filter(p_range >= 1 - p*div_factor), 50 | aes(x = z_range, y = density), fill = "lightblue", alpha = 0.5) + 51 | geom_segment(x = 3, xend = -z_of_p, 52 | y = density_of_p, yend = density_of_p, color = "red") + 53 | geom_segment(x = -z_of_p, xend = -z_of_p, y = density_of_p, yend = 0, color = "red") 54 | 55 | } 56 | 57 | subtitle_str <- paste0(subtitle_str, "p = Phi(z) = pnorm(z) = ", round(p*div_factor, 3), 58 | "; z = qnorm(p) = ", 59 | round(z_of_p, 3)) 60 | 61 | 62 | base_plot + 63 | xlab("z_p") + 64 | ylab("density\ndnorm(z_p)") + 65 | theme_bw() + 66 | ggtitle("The normal distribution", 67 | subtitle = subtitle_str) 68 | 69 | } 70 | 71 | plot_z_score(p=0.05, alternative = "two.sided") 72 | plot_z_score(z = -1.649, alternative = "one.sided") 73 | -------------------------------------------------------------------------------- /lectures/xaringan-themer.css: -------------------------------------------------------------------------------- 1 | /* ------------------------------------------------------- 2 | * 3 | * !! This file was generated by xaringanthemer !! 4 | * 5 | * Changes made to this file directly will be overwritten 6 | * if you used xaringanthemer in your xaringan slides Rmd 7 | * 8 | * Issues or likes? 9 | * - https://github.com/gadenbuie/xaringanthemer 10 | * - https://www.garrickadenbuie.com 11 | * 12 | * Need help? 
Try: 13 | * - vignette(package = "xaringanthemer") 14 | * - ?xaringanthemer::style_xaringan 15 | * - xaringan wiki: https://github.com/yihui/xaringan/wiki 16 | * - remarkjs wiki: https://github.com/gnab/remark/wiki 17 | * 18 | * Version: 0.4.1 19 | * 20 | * ------------------------------------------------------- */ 21 | @import url(https://fonts.googleapis.com/css?family=Rubik:300,300i&display=swap); 22 | @import url(https://fonts.googleapis.com/css?family=Open+Sans&display=swap); 23 | @import url(https://fonts.googleapis.com/css?family=Fira+Mono&display=swap); 24 | 25 | 26 | :root { 27 | /* Fonts */ 28 | --text-font-family: Rubik; 29 | --text-font-is-google: 1; 30 | --text-font-family-fallback: -apple-system, BlinkMacSystemFont, avenir next, avenir, helvetica neue, helvetica, Ubuntu, roboto, noto, segoe ui, arial; 31 | --text-font-base: sans-serif; 32 | --header-font-family: 'Open Sans'; 33 | --header-font-is-google: 1; 34 | --header-font-family-fallback: Georgia, serif; 35 | --code-font-family: 'Fira Mono'; 36 | --code-font-is-google: 1; 37 | --base-font-size: 20px; 38 | --text-font-size: 1rem; 39 | --code-font-size: 0.9rem; 40 | --code-inline-font-size: 1em; 41 | --header-h1-font-size: 2.3rem; 42 | --header-h2-font-size: 2.25rem; 43 | --header-h3-font-size: 1.75rem; 44 | 45 | /* Colors */ 46 | --text-color: #272822; 47 | --header-color: #43418A; 48 | --background-color: #FFFFFF; 49 | --link-color: #43418A; 50 | --text-bold-color: #43418A; 51 | --code-highlight-color: rgba(255,255,0,0.5); 52 | --inverse-text-color: #FFFFFF; 53 | --inverse-background-color: #43418A; 54 | --inverse-header-color: #FFFFFF; 55 | --inverse-link-color: #43418A; 56 | --title-slide-background-color: #43418A; 57 | --title-slide-text-color: #FFFFFF; 58 | --header-background-color: #43418A; 59 | --header-background-text-color: #FFFFFF; 60 | --base: #43418A; 61 | --white: #FFFFFF; 62 | --black: #272822; 63 | } 64 | 65 | html { 66 | font-size: var(--base-font-size); 67 | } 68 | 69 | body { 70 | font-family: var(--text-font-family), var(--text-font-family-fallback), var(--text-font-base); 71 | font-weight: 300; 72 | color: var(--text-color); 73 | } 74 | h1, h2, h3 { 75 | font-family: var(--header-font-family), var(--header-font-family-fallback); 76 | font-weight: 600; 77 | color: var(--header-color); 78 | } 79 | .remark-slide-content { 80 | background-color: var(--background-color); 81 | font-size: 1rem; 82 | padding: 16px 64px 16px 64px; 83 | width: 100%; 84 | height: 100%; 85 | } 86 | .remark-slide-content h1 { 87 | font-size: var(--header-h1-font-size); 88 | } 89 | .remark-slide-content h2 { 90 | font-size: var(--header-h2-font-size); 91 | } 92 | .remark-slide-content h3 { 93 | font-size: var(--header-h3-font-size); 94 | } 95 | .remark-code, .remark-inline-code { 96 | font-family: var(--code-font-family), Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace; 97 | } 98 | .remark-code { 99 | font-size: var(--code-font-size); 100 | } 101 | .remark-inline-code { 102 | font-size: var(--code-inline-font-size); 103 | color: #43418A; 104 | } 105 | .remark-slide-number { 106 | color: #43418A; 107 | opacity: 1; 108 | font-size: 0.9rem; 109 | } 110 | strong { 111 | font-weight: bold; 112 | color: var(--text-bold-color); 113 | } 114 | a, a > code { 115 | color: var(--link-color); 116 | text-decoration: none; 117 | } 118 | .footnote { 119 | position: absolute; 120 | bottom: 60px; 121 | padding-right: 4em; 122 | font-size: 0.9em; 123 | } 124 | .remark-code-line-highlighted { 125 | background-color: 
var(--code-highlight-color); 126 | } 127 | .inverse { 128 | background-color: var(--inverse-background-color); 129 | color: var(--inverse-text-color); 130 | 131 | } 132 | .inverse h1, .inverse h2, .inverse h3 { 133 | color: var(--inverse-header-color); 134 | } 135 | .inverse a, .inverse a > code { 136 | color: var(--inverse-link-color); 137 | } 138 | .title-slide, .title-slide h1, .title-slide h2, .title-slide h3 { 139 | color: var(--title-slide-text-color); 140 | } 141 | .title-slide { 142 | background-color: var(--title-slide-background-color); 143 | } 144 | .title-slide .remark-slide-number { 145 | display: none; 146 | } 147 | /* Two-column layout */ 148 | .left-column { 149 | width: 20%; 150 | height: 92%; 151 | float: left; 152 | } 153 | .left-column h2, .left-column h3 { 154 | color: #43418A99; 155 | } 156 | .left-column h2:last-of-type, .left-column h3:last-child { 157 | color: #43418A; 158 | } 159 | .right-column { 160 | width: 75%; 161 | float: right; 162 | padding-top: 1em; 163 | } 164 | .pull-left { 165 | float: left; 166 | width: 47%; 167 | } 168 | .pull-right { 169 | float: right; 170 | width: 47%; 171 | } 172 | .pull-right + * { 173 | clear: both; 174 | } 175 | img, video, iframe { 176 | max-width: 100%; 177 | } 178 | blockquote { 179 | border-left: solid 5px #43418A80; 180 | padding-left: 1em; 181 | } 182 | .remark-slide table { 183 | margin: auto; 184 | border-top: 1px solid #666; 185 | border-bottom: 1px solid #666; 186 | } 187 | .remark-slide table thead th { 188 | border-bottom: 1px solid #ddd; 189 | } 190 | th, td { 191 | padding: 5px; 192 | } 193 | .remark-slide thead, .remark-slide tfoot, .remark-slide tr:nth-child(even) { 194 | background: #D9D9E7; 195 | } 196 | table.dataTable tbody { 197 | background-color: var(--background-color); 198 | color: var(--text-color); 199 | } 200 | table.dataTable.display tbody tr.odd { 201 | background-color: var(--background-color); 202 | } 203 | table.dataTable.display tbody tr.even { 204 | background-color: #D9D9E7; 205 | } 206 | table.dataTable.hover tbody tr:hover, table.dataTable.display tbody tr:hover { 207 | background-color: rgba(255, 255, 255, 0.5); 208 | } 209 | .dataTables_wrapper .dataTables_length, .dataTables_wrapper .dataTables_filter, .dataTables_wrapper .dataTables_info, .dataTables_wrapper .dataTables_processing, .dataTables_wrapper .dataTables_paginate { 210 | color: var(--text-color); 211 | } 212 | .dataTables_wrapper .dataTables_paginate .paginate_button { 213 | color: var(--text-color) !important; 214 | } 215 | 216 | /* Horizontal alignment of code blocks */ 217 | .remark-slide-content.left pre, 218 | .remark-slide-content.center pre, 219 | .remark-slide-content.right pre { 220 | text-align: start; 221 | width: max-content; 222 | max-width: 100%; 223 | } 224 | .remark-slide-content.left pre, 225 | .remark-slide-content.right pre { 226 | min-width: 50%; 227 | min-width: min(40ch, 100%); 228 | } 229 | .remark-slide-content.center pre { 230 | min-width: 66%; 231 | min-width: min(50ch, 100%); 232 | } 233 | .remark-slide-content.left pre { 234 | margin-left: unset; 235 | margin-right: auto; 236 | } 237 | .remark-slide-content.center pre { 238 | margin-left: auto; 239 | margin-right: auto; 240 | } 241 | .remark-slide-content.right pre { 242 | margin-left: auto; 243 | margin-right: unset; 244 | } 245 | 246 | /* Slide Header Background for h1 elements */ 247 | .remark-slide-content.header_background > h1 { 248 | display: block; 249 | position: absolute; 250 | top: 0; 251 | left: 0; 252 | width: 100%; 253 | background: 
var(--header-background-color); 254 | color: var(--header-background-text-color); 255 | padding: 2rem 64px 1.5rem 64px; 256 | margin-top: 0; 257 | box-sizing: border-box; 258 | } 259 | .remark-slide-content.header_background { 260 | padding-top: 7rem; 261 | } 262 | 263 | @page { margin: 0; } 264 | @media print { 265 | .remark-slide-scaler { 266 | width: 100% !important; 267 | height: 100% !important; 268 | transform: scale(1) !important; 269 | top: 0 !important; 270 | left: 0 !important; 271 | } 272 | } 273 | 274 | .base { 275 | color: var(--base); 276 | } 277 | .bg-base { 278 | background-color: var(--base); 279 | } 280 | .white { 281 | color: var(--white); 282 | } 283 | .bg-white { 284 | background-color: var(--white); 285 | } 286 | .black { 287 | color: var(--black); 288 | } 289 | .bg-black { 290 | background-color: var(--black); 291 | } 292 | 293 | 294 | 295 | /* Extra CSS */ 296 | .medium { 297 | font-size: 85%; 298 | code-font-size: 85%; 299 | } 300 | .small { 301 | zoom: 70%; 302 | } 303 | .extra-small { 304 | font-size: 50%; 305 | code-font-size: 50%; 306 | } 307 | .tiny { 308 | font-size: 50%; 309 | code-font-size: 50%; 310 | zoom: 50%; 311 | } 312 | .full-width { 313 | display: flex; 314 | width: 100%; 315 | flex: 1 1 auto; 316 | } 317 | -------------------------------------------------------------------------------- /misc/Distribution tables - x^2, z, f, t.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/misc/Distribution tables - x^2, z, f, t.pdf -------------------------------------------------------------------------------- /misc/extended_formula_page.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/misc/extended_formula_page.pdf -------------------------------------------------------------------------------- /misc/init_course_plan.md: -------------------------------------------------------------------------------- 1 | # Plan for course 2 | 3 | ## Weeks 1-2 4 | 5 | ### Lecture: 6 | 7 | * Intro/overview 8 | * Sampling methods 9 | * Statistical inference and parameter estimation 10 | * Mean, median, std, biased/unbiased, maximum likelihood, percentiles, skewness, desired properties of estimtes (e.g. unbiased), Cramer-Rao bound(?), Chebychev. 11 | * Variable types 12 | * Reminder on types of distributions(?) + visualizations. 13 | 14 | ### Exercise: 15 | 16 | * Technical exercises on R 17 | * Intro to R 18 | * Tidyverse 19 | * Visualizations 20 | * Motivation example 21 | 22 | ### Homework: 23 | 24 | * The first will present an example research (verbally) and ask questions about the sampling method, type of variables, problems with bias. 25 | * Show a chart and analyze it (e.g., boxplot, density, ecdf, scatter plot matrix). What a specific plot teaches that others don't. 26 | * Technical part (maybe adopted from R4DS) 27 | 28 | ## Weeks 3-4 29 | 30 | ### Lecture: 31 | 32 | * Hypothesis tests. 33 | * Use cases and examples for z, student's t, how are they related to assumptions. 34 | * Chi-square test for independence of variables. 35 | * A-parametric tests (e.g., Wilcoxon) - what do they mean, when should we use them, examples. 36 | * Goodness-of-fit. 37 | * Problems with p-values, FDR. 38 | * Multidimensional CIs? 39 | * Relationship between significance (p-value) versus confidence interval. 
40 | 41 | ### Exercise: 42 | 43 | * Show another test not covered in the lecture; or 44 | * Review a concrete example in class, i.e., take a data set and run a number of hypothesis tests. 45 | * Illustrate how 100 tests of independent random variables might show significant values (even though they are drawn from the same distribution). 46 | * Example for FDR. 47 | 48 | ### Homework: 49 | 50 | * Theoretical question, i.e., "story" followed by what test would you use and why. 51 | * Practical example - parsons example of running the tests + analysis. 52 | 53 | ## Weeks 5-6 54 | 55 | ### Lecture: 56 | 57 | * Analysis of variance, one-way, two-way. Examples. 58 | * Dunnett's test. 59 | * Related tests. 60 | 61 | ### Exercise: 62 | 63 | * Example for use in R. 64 | * Dunnett's test (if not covered in lecture). 65 | 66 | ### Homework: 67 | 68 | * Complex data set with parsons example which involves both data restructuring (i.e. `pivot_wider`/`pivot_longer`) and then `aov`, `summary`. 69 | 70 | ## Weeks 7: 71 | 72 | ### Lecture: 73 | 74 | * Choosing the right sample type. 75 | * Deciding on sample size via margin of error and power calculations. 76 | * Consider sub-groups. 77 | 78 | ### Exercise: 79 | 80 | * Show examples and implementation in R. 81 | 82 | ### Homework: 83 | 84 | * Provide a research use cases, students will provide a detailed plan for the experiment. 85 | * Present a flawed experiment - student need to find the flaws. -------------------------------------------------------------------------------- /misc/syllabus_05601823_2022.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/misc/syllabus_05601823_2022.pdf -------------------------------------------------------------------------------- /misc/tau_engineering_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/misc/tau_engineering_logo.png -------------------------------------------------------------------------------- /population_vs_sample/.Rhistory: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | runApp() 3 | runApp() 4 | runApp() 5 | runApp() 6 | runApp() 7 | runApp() 8 | runApp() 9 | runApp() 10 | runApp() 11 | runApp() 12 | runApp() 13 | runApp() 14 | -------------------------------------------------------------------------------- /population_vs_sample/app.R: -------------------------------------------------------------------------------- 1 | 2 | library(shiny) 3 | library(tidyverse) 4 | 5 | # Define UI for application that draws a histogram 6 | ui <- fluidPage( 7 | 8 | theme = shinythemes::shinytheme("united"), title = "Population vs. 
sample", 9 | h1("Population versus sample (normal distribution)"), 10 | 11 | fluidRow( 12 | sidebarLayout( 13 | sidebarPanel(width = 2, 14 | fluidRow( 15 | h3("Population parameters"), 16 | numericInput("expectancy", "Expectancy (μ)", 17 | value = 0), 18 | numericInput("std", "Standard Deviation (σ)", 19 | value = 1), 20 | h3("Sample size"), 21 | numericInput("sample_size", "Sample size (n)", 22 | value = 30), 23 | numericInput("num_bins", "Histogram bins", 24 | value = 30) 25 | 26 | )), 27 | mainPanel(column(plotOutput("population_distribution"), width = 6), 28 | column(plotOutput("sample_histogram"), width = 6)) 29 | ) 30 | ) 31 | 32 | ) 33 | 34 | # Define server logic required to draw a histogram 35 | server <- function(input, output) { 36 | 37 | output$population_distribution <- renderPlot({ 38 | 39 | x_range <- seq(input$expectancy - 3*input$std, input$expectancy + 3*input$std, 40 | by = 0.01) 41 | 42 | y_range <- dnorm(x_range, 43 | mean = input$expectancy, 44 | sd = input$std) 45 | 46 | tibble(x = x_range, y = y_range) %>% 47 | ggplot(aes(x, y)) + 48 | geom_line() + 49 | coord_cartesian(xlim = c(-5, 5)) + 50 | ggtitle("The population distribution") + 51 | ylab("Density function") 52 | 53 | }) 54 | 55 | output$sample_histogram <- renderPlot({ 56 | 57 | smp <- tibble(x = 58 | rnorm(n = input$sample_size, 59 | mean = input$expectancy, 60 | sd = input$std)) 61 | 62 | smp %>% 63 | ggplot(aes(x = x)) + 64 | geom_histogram(bins = input$num_bins) + 65 | coord_cartesian(xlim = c(-5, 5)) + 66 | ggtitle(glue::glue("The sample distribution: mean={round(mean(smp$x),2)}, sd={round(sd(smp$x), 2)}")) + 67 | xlab("x") 68 | 69 | }) 70 | 71 | 72 | } 73 | 74 | # Run the application 75 | shinyApp(ui = ui, server = server) 76 | -------------------------------------------------------------------------------- /population_vs_sample/population_vs_sample.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /population_vs_sample/rsconnect/shinyapps.io/sarid/population_vs_sample.dcf: -------------------------------------------------------------------------------- 1 | name: population_vs_sample 2 | title: population_vs_sample 3 | username: 4 | account: sarid 5 | server: shinyapps.io 6 | hostUrl: https://api.shinyapps.io/v1 7 | appId: 3766450 8 | bundleId: 4324931 9 | url: https://sarid.shinyapps.io/population_vs_sample/ 10 | when: 1615360163.46147 11 | asMultiple: FALSE 12 | asStatic: FALSE 13 | -------------------------------------------------------------------------------- /project/Project Instructions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Project Instructions" 3 | author: "Adi Sarid" 4 | date: "`r Sys.Date()`" 5 | output: 6 | html_document: default 7 | pdf_document: default 8 | subtitle: Intro to Statistics and Data Analysis with R (0560.1823) 9 | --- 10 | 11 | ```{r setup, include=FALSE} 12 | knitr::opts_chunk$set(echo = TRUE) 13 | ``` 14 | 15 | ## Background 16 | 17 | The following document contains instructions to the project in the Introduction to Statistics and Data Analysis with R course. 18 | 19 | The project has a weight of 40% of you final grade. 
20 | 21 | ## Goal 22 | 23 | The goal of the project is to demonstrate and practice the different elements we have been talking about, which are a part of most data analysis/data science projects. 24 | 25 | ## Methods 26 | 27 | In this project you will handle the different phases of a data analysis project: 28 | 29 | 1. Data **Import** (reading the data into R). 30 | 31 | 2. Data **Tidying** (arranging the data into something you can work with). 32 | 33 | 3. Understanding the data: 34 | 35 | a. **Transforming** variables. 36 | 37 | b. **Visualizing** (use `ggplot2` to show distribution of variables, relationships between variables, and to hypothesize). 38 | 39 | c. **Modelling**: using a few of the tools we have learned during the course (like hypothesis testing, regression, analysis of variance, etc.) to examine your hypotheses. 40 | 41 | 4. **Communicating** your findings via a written report. 42 | 43 | ## Instructions and Schedule 44 | 45 | The project should be performed **in pairs** (same groups as the homework submissions). 46 | 47 | ### Choosing a dataset 48 | 49 | First, you should select a dataset on which you will perform the project. I recommend using a data set from either [Kaggle](https://www.kaggle.com) or from [tidytuesday](https://github.com/rfordatascience/tidytuesday), or [government data](https://data.gov.il/dataset/). You can select something else. 50 | 51 | In any case, please do not choose something "too popular" (e.g., no built-in `R` datasets, and no data sets that we've worked on in the lectures). 52 | 53 | In your work you must document: 54 | 55 | * The dataset name 56 | * Source (a URL with the data and documentation of the dataset) 57 | * A **direct link** to download the raw data you are using 58 | 59 | ### Consultation 60 | 61 | I'm dedicating a weekly reception hour, Thursdays 09:00, in Zoom. You can bring questions regarding the project, coding, `R`, etc. Please coordinate in advance (send me an email if you want to join the reception hour). 62 | 63 | ### Submission 64 | 65 | Final submissions should be made by **June 10th 2022.** 66 | 67 | Please submit your file to Moodle as `statintro_final_studentname_studentID.zip` which bundles an Rmd version, data files, and a knitted html version of your report. The Rmd should compile standalone on any computer. 68 | 69 | ## Grading 70 | 71 | You will be graded along the following lines: 72 | 73 | * Data import, tidying, and transformations (20%): Your ability to use the proper methods to import the data, tidy it, and apply the required transformations towards the next stages. 74 | 75 | * Visualizations (20%): Your ability to utilize visualizations to articulate your hypotheses and to illustrate different patterns and relationships in the data. You should be able to match the proper types of charts to whatever it is you are trying to show. 76 | 77 | * Modeling (20%): Your ability to match the appropriate statistical tests/models to the problem, verifying (or highlighting) certain assumptions which are valid or invalid in this case. Please provide at least two relevant models/hypothesis tests that we learned. 78 | 79 | * Communication, documentation, explanations (20%): You should be able to explain the different steps you are taking, lead the reader in a logical and appealing manner, explain your results, and highlight the research or business implications of your findings. For example, make sure you start with a data description, research questions, hypotheses, etc. 
80 | 81 | * Code (20%): Readability, proper use and proper documentation of code. You may use tidyverse code or base R. 82 | 83 | *** 84 | 85 | **Good luck!** 86 | 87 | \newpage 88 | 89 | # Appendix: Questions and answers 90 | 91 | Some more questions and answers. 92 | 93 | ## How should you report the results? 94 | 95 | In tests such as a t-test or goodness of fit, you should explain in plain text what you are doing, what assumptions the test entails, and whether they indeed hold in this case or not. Then add the code chunk and include the output. 96 | 97 | For example, in linear regression, you should also report a QQ plot of the residuals and check homoscedasticity. 98 | 99 | ## Where can I see examples for projects? 100 | 101 | You can see examples for projects from previous semesters [here](https://github.com/adisarid/intro_statistics_R/tree/master/project/examples). -------------------------------------------------------------------------------- /project/Project Instructions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/project/Project Instructions.pdf -------------------------------------------------------------------------------- /project/example_star_trek_script_analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Where no R has gone before - Analyzing Star Trek scripts" 3 | author: "Adi Sarid" 4 | date: "12 1 2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | # Background 13 | 14 | In this markdown, I analyze [this](https://www.kaggle.com/gjbroughton/start-trek-scripts/) data of Star Trek scripts. 15 | 16 | ## Goals 17 | 18 | My main goal is to demonstrate various elements which I expect to see in the project, while allowing the students to ask questions as we build the analysis in real time together. 19 | 20 | Here are some related research questions which we can examine: 21 | 22 | * What is the number of words/script lines spoken by each Starfleet captain? 23 | * What is the difference between a captain, number.1, and other characters? 24 | * What is the relationship between sentence length (number of words) and Starfleet captains? 25 | * Is there a gender bias in different series/episodes? Which series reflects more female characters (in terms of script words or script lines)? 26 | * What is the number of words per episode? Is it randomly distributed? 27 | 28 | # Data Import and Tidying 29 | 30 | The source is in JSON files - non-tabular data which is hard to work with. Hence, we must first turn it into a tidy format. The data was read from the JSON files and put into the tidy format loaded below. 31 | 32 | ```{r read startrek script data} 33 | suppressWarnings(suppressMessages(library(tidyverse))) 34 | trek <- read_csv("https://github.com/adisarid/startrek_plumber_api/blob/master/raw_data/characters_words.csv?raw=true") 35 | glimpse(trek) 36 | ``` 37 | 38 | # Transformation, Visualization, and Modelling 39 | 40 | ## Which Starfleet captains appear in the data? 41 | 42 | First, let's determine which Starfleet captains appear in the data. 43 | 44 | ```{r starfleet captain} 45 | trek %>% 46 | group_by(character, series) %>% 47 | summarize(num_words = sum(total_words)) %>% 48 | arrange(desc(num_words)) 49 | ``` 50 | 51 | The Starfleet captains who appear are PICARD, JANEWAY, KIRK, SISKO, and ARCHER.
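A minimal sketch of that filtering step might look like the chunk below (illustrative only: the `character` column comes from the `glimpse()` output above, but the exact spelling of the captain values is an assumption about the data):

```{r filter captains sketch}
# Keep only the rows that belong to the five Starfleet captains identified above
captains <- c("PICARD", "JANEWAY", "KIRK", "SISKO", "ARCHER")

captain_lines <- trek %>% 
  filter(character %in% captains)

captain_lines
```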
With a filter like the one sketched above, we can examine the captains in more detail. 52 | 53 | # Conclusions -------------------------------------------------------------------------------- /project/examples/FIFA 2019 Analysis - Inbar Siloni.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "FIFA 2019 Analysis - Inbar Siloni" 3 | author: "Inbar Siloni" 4 | date: "18 1 2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | # Background 12 | 13 | In this markdown, I analyze [this](https://www.kaggle.com/karangadiya/fifa19/download) data of FIFA players. 14 | 15 | ## Goals 16 | 17 | In my research, I will focus on two main questions: 18 | 19 | * Is there a correlation between a player's abilities, his age and his value? 20 | 21 | * Do players score the same if they are right-footed and left-footed? 22 | 23 | 24 | ```{r libraries, message=FALSE,warning=FALSE, echo=FALSE} 25 | library(tidyverse) 26 | library(broom) 27 | library(knitr) 28 | library(readxl) 29 | library(here) 30 | library(janitor) 31 | library(dplyr) 32 | library(ggcorrplot) 33 | library(scales) 34 | library(RColorBrewer) 35 | library(stringi) 36 | library(agricolae) 37 | ``` 38 | 39 | ## Importing the data set 40 | 41 | The dataset was in an Excel file, a convenient format to work with. I had to filter the clubs I wanted to focus on: the 16 top clubs of the 2019 Champions League. Also, I formatted the column names for my convenience. I deleted a few data columns that I will not be using, to make the data more approachable. 42 | ```{r read fifa dataset,message=FALSE, warning=FALSE, echo=FALSE} 43 | alufot <- c("Atlético Madrid", "FC Barcelona", "Real Madrid", 44 | "FC Bayern München", "Tottenham Hotspur", "Paris Saint-Germain", 45 | "Juventus", "Chelsea", "Borussia Dortmund", "Liverpool", "Atalanta", 46 | "Valencia CF", "RB Leipzig", "Napoli", "Olympique Lyonnais", "Manchester City") 47 | 48 | fifa_data <- read_excel("fifa_data.xlsx") %>% 49 | clean_names() %>% 50 | filter(club %in% alufot) 51 | fifa_data <- fifa_data[, c(4,8:10,12,15,22,55:88)] 52 | View(fifa_data) 53 | ``` 54 | 55 | # Transformation, Visualization, and Modelling 56 | 57 | ## Doing some transformation 58 | First, I applied a few transformations to several variables. 
59 | ```{r transforming the dataset, echo=FALSE} 60 | positions <- unique(fifa_data$position) 61 | gk <- "GK" 62 | defs <- positions[str_detect(positions, "B$")] 63 | mids <- positions[str_detect(positions, "M$")] 64 | f1 <- positions[str_detect(positions, "F$")] 65 | f2 <- positions[str_detect(positions, "S$")] 66 | f3 <- positions[str_detect(positions, "T$")] 67 | f4 <- positions[str_detect(positions, "W$")] 68 | fwds <- c(f1, f2, f3, f4) 69 | fifa_data <- fifa_data %>% 70 | mutate(position_group = ifelse(position %in% gk, "GK", ifelse(position %in% defs, "DEF", ifelse(position %in% mids, "MID", ifelse(position %in% fwds, "FWD", "Unknown"))))) 71 | fifa_data <- fifa_data %>% 72 | mutate( 73 | value_multiplier = ifelse(str_detect(value, "K"), 1000, 74 | ifelse(str_detect(value, "M"), 1000000, 1)), 75 | value_numeric_pounds = as.numeric(str_extract(value,"[[:digit:]]+\\.*[[:digit:]]*")) 76 | * value_multiplier, 77 | age_group = ifelse(age <= 20, "20 and under", 78 | ifelse(age > 20 & age <=25, "21 to 25", 79 | ifelse(age > 25 & age <= 30, "25 to 30", 80 | ifelse(age > 30 & age <= 35, "31 to 35", "Over 35")))), 81 | club=stri_trans_general(club, "Latin-ASCII"), 82 | ) 83 | View(fifa_data) 84 | glimpse(fifa_data) 85 | ``` 86 | 87 | 88 | ## Doing some visualization 89 | 90 | Let's take a look at the clubs whose players we are going to analyze! Ladies and gentlemen, I present to you the UEFA Champions League's top 16 clubs of 2019: 91 | 92 | ```{r clubs} 93 | fifa_data %>% 94 | group_by(club) %>% 95 | summarize() 96 | ``` 97 | 98 | 99 | Now let's take a look at the age distribution of the players: 100 | 101 | ```{r age distribution} 102 | ggplot(data=fifa_data, aes(x=age))+ 103 | geom_density(fill="violet") 104 | ``` 105 | 106 | 107 | ```{r age range} 108 | fifa_data %>% 109 | summarise( 110 | min = min(age), 111 | median = median(age), 112 | max = max(age) 113 | ) 114 | ``` 115 | 116 | 117 | ```{r age group count} 118 | fifa_data %>% 119 | group_by(age_group) %>% 120 | count() 121 | ``` 122 | 123 | We can see that most of the players are under the age of 30 and over the age of 20. The biggest age group is 21 to 25. 124 | 125 | Let's see how their value is distributed: 126 | 127 | ```{r players value distribution} 128 | ggplot(data=fifa_data, aes(x=value_numeric_pounds))+ 129 | geom_density(fill="lavenderblush")+ 130 | scale_x_log10(labels = dollar_format(prefix = "€")) 131 | ``` 132 | 133 | So, as we can see, there are two "humps" in this distribution. A possible explanation for this is the difference in value between squad players (those who play in the league, for example) and players in the club's youth teams or substitutes for the substitutes. Both player types' values are distributed roughly normally, but most of the subs are valued, at most, around the minimum for squad players. 134 | 135 | ## Modelling 136 | 137 | Let's get down to business. 
138 | 139 | First, we will check if there is a correlation between preferred foot and scoring: 140 | 141 | ```{r finishing/preferred leg distribution} 142 | fit1<-lm(formula = finishing~preferred_foot, data= fifa_data) 143 | summary(fit1) 144 | fifa_data %>% 145 | ggplot(aes(x = preferred_foot, y = finishing, 146 | fill = preferred_foot))+ 147 | geom_boxplot()+ 148 | scale_fill_brewer(palette = "Pastel1") 149 | ``` 150 | ```{r another finishing/preferred leg distribution} 151 | t.test(fifa_data$finishing~fifa_data$preferred_foot) 152 | fifa_data %>% 153 | ggplot(aes(x = finishing, 154 | fill = preferred_foot))+ 155 | geom_histogram() 156 | ``` 157 | 158 | 159 | 160 | My null hypothesis was that preferred foot and scoring are not correlated. As we can see in the t-test and linear regression above, the p-value is 0.07948, so at a significance level of 0.05 we cannot say that there is a correlation, and we do not reject the null hypothesis. 161 | 162 | Now to the next research question: let's see if there is a correlation between abilities, age and value! 163 | 164 | First, let's take a look at age vs. value: 165 | 166 | ```{r players value/age distribution} 167 | fit2<-lm(formula = log1p(value_numeric_pounds)~age_group, data= fifa_data) 168 | summary(fit2) 169 | fifa_data %>% 170 | ggplot(aes(x=age_group, y=value_numeric_pounds, fill=age_group))+ 171 | geom_boxplot()+ 172 | scale_y_log10(labels = dollar_format(prefix = "€"))+ 173 | stat_smooth(method = "lm")+ 174 | scale_fill_brewer(palette = "Pastel1") 175 | ``` 176 | 177 | From the regression and chart we can see that players aged 21-35 are valued higher, most notably those in the 25-30 age group. This supports my explanation of the "humps" in the value distribution chart. 178 | 179 | Let's see the correlation for different abilities: 180 | 181 | So, obviously, field players require different abilities than goalkeepers. We will check the correlation for all players and for each group individually. 182 | 183 | Here we check for all players: 184 | 185 | ```{r abilities correlation} 186 | abilities <- fifa_data %>% select(crossing:gk_reflexes) 187 | ability_corr <- cor(abilities) 188 | ggcorrplot(ability_corr, type = "upper", hc.order = T, hc.method = "complete", 189 | colors = c("lightskyblue","white","brown1")) 190 | ``` 191 | 192 | We see that some abilities are correlated with others, and some aren't. Let's see if we can explain this. 193 | Here we check just for field players: 194 | 195 | ```{r field players abilities correlation} 196 | abilities <- fifa_data %>% filter(!position == "GK") %>% select(crossing:sliding_tackle) 197 | ability_corr <- cor(abilities) 198 | ggcorrplot(ability_corr, type = "upper", hc.order = T, hc.method = "complete", 199 | colors = c("lightskyblue","white","brown1")) 200 | ``` 201 | 202 | Once again we see a very strong correlation between some abilities, while others don't correlate at all. The reason may be that abilities like short passing and long passing rely on the same underlying skills, while agility and jumping do not. Overall we learn that most of the abilities are correlated with one another. 
203 | And here we check just for goalkeepers: 204 | 205 | ```{r goal keepers abilities correlation} 206 | abilities <- fifa_data %>% filter(position == "GK") %>% select(gk_diving:gk_reflexes) 207 | ability_corr <- cor(abilities) 208 | ggcorrplot(ability_corr, type = "upper", hc.order = T, hc.method = "complete", 209 | colors = c("lightskyblue","white","indianred1")) 210 | ``` 211 | 212 | For goalkeepers the situation is different. Since their abilities are measured only for, well, goalkeeping, it makes sense that all the abilities will be correlated. Still, we see that kicking is not as correlated as the other abilities, which makes sense because it relates to foot ability and not to body ability (the others are related mostly to the hands). 213 | 214 | Now that we have established the relation between the different abilities, let's see if there is a correlation between the value of a player and his abilities: 215 | 216 | ```{r abilities/value distribution} 217 | fit6<-lm(formula = log1p(value_numeric_pounds) ~ overall, data = fifa_data) 218 | summary(fit6) 219 | fifa_data %>% 220 | ggplot(aes(y=value_numeric_pounds, x= overall))+ 221 | geom_point(color="seagreen3")+ 222 | scale_y_log10(labels = dollar_format(prefix = "€"))+ 223 | stat_smooth(method = "lm",color="black") 224 | ``` 225 | 226 | We see there definitely is a correlation! The p-value is very small and the chart shows a strong connection. But does age matter? 227 | 228 | Let's check if there is a correlation between abilities, value and age: 229 | 230 | ```{r value/age+abilities distribution} 231 | fit <- lm(formula = log1p(value_numeric_pounds) ~ age_group+overall, data = fifa_data) 232 | summary(fit) 233 | fit.av <- aov(fit) 234 | summary(fit.av) 235 | fifa_data %>% 236 | ggplot(aes(y=value_numeric_pounds, x=overall, 237 | group = age_group, 238 | color = age_group))+ 239 | geom_point(size = 2)+ 240 | scale_y_log10(labels = dollar_format(prefix = "€"))+ 241 | stat_smooth(method = "lm")+ 242 | scale_color_brewer(palette = "Set1") 243 | ``` 244 | 245 | 246 | So, as we can see, while value and abilities are correlated, age does affect this correlation. We see that for young players, the same abilities will earn a higher value, while for older players the same abilities will earn a lower value (drastically lower if the player is over 35). All the tests show a clear correlation of value with age and abilities (low p-values). Also, in all of the tests we can clearly see that for players over 35 the value drops much faster. 247 | 248 | 249 | # Conclusion 250 | 251 | I checked two main questions: 252 | 253 | * Is there a correlation between a player's abilities, his age and his value? 254 | 255 | * Do players score the same if they are right-footed and left-footed? 256 | 257 | The first question showed a strong correlation between the three variables, but the second one did not. 258 | I have learned that it doesn't matter if you are a leftie or a rightie; as long as you stay young and talented, football is the right place for you. 259 | 260 | Also, I really held myself back the entire project because I know I'm not supposed to show my opinions, but Real Madrid are the best club in the world, and my next project will show that they rule in every aspect. 261 | 262 | Thank you for reading! -------------------------------------------------------------------------------- /xaringan-themer.css: -------------------------------------------------------------------------------- 1 | /* ------------------------------------------------------- 2 | * 3 | * !! 
This file was generated by xaringanthemer !! 4 | * 5 | * Changes made to this file directly will be overwritten 6 | * if you used xaringanthemer in your xaringan slides Rmd 7 | * 8 | * Issues or likes? 9 | * - https://github.com/gadenbuie/xaringanthemer 10 | * - https://www.garrickadenbuie.com 11 | * 12 | * Need help? Try: 13 | * - vignette(package = "xaringanthemer") 14 | * - ?xaringanthemer::style_xaringan 15 | * - xaringan wiki: https://github.com/yihui/xaringan/wiki 16 | * - remarkjs wiki: https://github.com/gnab/remark/wiki 17 | * 18 | * Version: 0.4.1 19 | * 20 | * ------------------------------------------------------- */ 21 | @import url(https://fonts.googleapis.com/css?family=Rubik:300,300i&display=swap); 22 | @import url(https://fonts.googleapis.com/css?family=Open+Sans&display=swap); 23 | @import url(https://fonts.googleapis.com/css?family=Fira+Mono&display=swap); 24 | 25 | 26 | :root { 27 | /* Fonts */ 28 | --text-font-family: Rubik; 29 | --text-font-is-google: 1; 30 | --text-font-family-fallback: -apple-system, BlinkMacSystemFont, avenir next, avenir, helvetica neue, helvetica, Ubuntu, roboto, noto, segoe ui, arial; 31 | --text-font-base: sans-serif; 32 | --header-font-family: 'Open Sans'; 33 | --header-font-is-google: 1; 34 | --header-font-family-fallback: Georgia, serif; 35 | --code-font-family: 'Fira Mono'; 36 | --code-font-is-google: 1; 37 | --base-font-size: 20px; 38 | --text-font-size: 1rem; 39 | --code-font-size: 0.9rem; 40 | --code-inline-font-size: 1em; 41 | --header-h1-font-size: 2.75rem; 42 | --header-h2-font-size: 2.25rem; 43 | --header-h3-font-size: 1.75rem; 44 | 45 | /* Colors */ 46 | --text-color: #272822; 47 | --header-color: #43418A; 48 | --background-color: #FFFFFF; 49 | --link-color: #43418A; 50 | --text-bold-color: #43418A; 51 | --code-highlight-color: rgba(255,255,0,0.5); 52 | --inverse-text-color: #FFFFFF; 53 | --inverse-background-color: #43418A; 54 | --inverse-header-color: #FFFFFF; 55 | --inverse-link-color: #43418A; 56 | --title-slide-background-color: #43418A; 57 | --title-slide-text-color: #FFFFFF; 58 | --header-background-color: #43418A; 59 | --header-background-text-color: #FFFFFF; 60 | --base: #43418A; 61 | --white: #FFFFFF; 62 | --black: #272822; 63 | } 64 | 65 | html { 66 | font-size: var(--base-font-size); 67 | } 68 | 69 | body { 70 | font-family: var(--text-font-family), var(--text-font-family-fallback), var(--text-font-base); 71 | font-weight: 300; 72 | color: var(--text-color); 73 | } 74 | h1, h2, h3 { 75 | font-family: var(--header-font-family), var(--header-font-family-fallback); 76 | font-weight: 600; 77 | color: var(--header-color); 78 | } 79 | .remark-slide-content { 80 | background-color: var(--background-color); 81 | font-size: 1rem; 82 | padding: 16px 64px 16px 64px; 83 | width: 100%; 84 | height: 100%; 85 | } 86 | .remark-slide-content h1 { 87 | font-size: var(--header-h1-font-size); 88 | } 89 | .remark-slide-content h2 { 90 | font-size: var(--header-h2-font-size); 91 | } 92 | .remark-slide-content h3 { 93 | font-size: var(--header-h3-font-size); 94 | } 95 | .remark-code, .remark-inline-code { 96 | font-family: var(--code-font-family), Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace; 97 | } 98 | .remark-code { 99 | font-size: var(--code-font-size); 100 | } 101 | .remark-inline-code { 102 | font-size: var(--code-inline-font-size); 103 | color: #43418A; 104 | } 105 | .remark-slide-number { 106 | color: #43418A; 107 | opacity: 1; 108 | font-size: 0.9rem; 109 | } 110 | strong { 111 | font-weight: bold; 112 | color: 
var(--text-bold-color); 113 | } 114 | a, a > code { 115 | color: var(--link-color); 116 | text-decoration: none; 117 | } 118 | .footnote { 119 | position: absolute; 120 | bottom: 60px; 121 | padding-right: 4em; 122 | font-size: 0.9em; 123 | } 124 | .remark-code-line-highlighted { 125 | background-color: var(--code-highlight-color); 126 | } 127 | .inverse { 128 | background-color: var(--inverse-background-color); 129 | color: var(--inverse-text-color); 130 | 131 | } 132 | .inverse h1, .inverse h2, .inverse h3 { 133 | color: var(--inverse-header-color); 134 | } 135 | .inverse a, .inverse a > code { 136 | color: var(--inverse-link-color); 137 | } 138 | .title-slide, .title-slide h1, .title-slide h2, .title-slide h3 { 139 | color: var(--title-slide-text-color); 140 | } 141 | .title-slide { 142 | background-color: var(--title-slide-background-color); 143 | } 144 | .title-slide .remark-slide-number { 145 | display: none; 146 | } 147 | /* Two-column layout */ 148 | .left-column { 149 | width: 20%; 150 | height: 92%; 151 | float: left; 152 | } 153 | .left-column h2, .left-column h3 { 154 | color: #43418A99; 155 | } 156 | .left-column h2:last-of-type, .left-column h3:last-child { 157 | color: #43418A; 158 | } 159 | .right-column { 160 | width: 75%; 161 | float: right; 162 | padding-top: 1em; 163 | } 164 | .pull-left { 165 | float: left; 166 | width: 47%; 167 | } 168 | .pull-right { 169 | float: right; 170 | width: 47%; 171 | } 172 | .pull-right + * { 173 | clear: both; 174 | } 175 | img, video, iframe { 176 | max-width: 100%; 177 | } 178 | blockquote { 179 | border-left: solid 5px #43418A80; 180 | padding-left: 1em; 181 | } 182 | .remark-slide table { 183 | margin: auto; 184 | border-top: 1px solid #666; 185 | border-bottom: 1px solid #666; 186 | } 187 | .remark-slide table thead th { 188 | border-bottom: 1px solid #ddd; 189 | } 190 | th, td { 191 | padding: 5px; 192 | } 193 | .remark-slide thead, .remark-slide tfoot, .remark-slide tr:nth-child(even) { 194 | background: #D9D9E7; 195 | } 196 | table.dataTable tbody { 197 | background-color: var(--background-color); 198 | color: var(--text-color); 199 | } 200 | table.dataTable.display tbody tr.odd { 201 | background-color: var(--background-color); 202 | } 203 | table.dataTable.display tbody tr.even { 204 | background-color: #D9D9E7; 205 | } 206 | table.dataTable.hover tbody tr:hover, table.dataTable.display tbody tr:hover { 207 | background-color: rgba(255, 255, 255, 0.5); 208 | } 209 | .dataTables_wrapper .dataTables_length, .dataTables_wrapper .dataTables_filter, .dataTables_wrapper .dataTables_info, .dataTables_wrapper .dataTables_processing, .dataTables_wrapper .dataTables_paginate { 210 | color: var(--text-color); 211 | } 212 | .dataTables_wrapper .dataTables_paginate .paginate_button { 213 | color: var(--text-color) !important; 214 | } 215 | 216 | /* Horizontal alignment of code blocks */ 217 | .remark-slide-content.left pre, 218 | .remark-slide-content.center pre, 219 | .remark-slide-content.right pre { 220 | text-align: start; 221 | width: max-content; 222 | max-width: 100%; 223 | } 224 | .remark-slide-content.left pre, 225 | .remark-slide-content.right pre { 226 | min-width: 50%; 227 | min-width: min(40ch, 100%); 228 | } 229 | .remark-slide-content.center pre { 230 | min-width: 66%; 231 | min-width: min(50ch, 100%); 232 | } 233 | .remark-slide-content.left pre { 234 | margin-left: unset; 235 | margin-right: auto; 236 | } 237 | .remark-slide-content.center pre { 238 | margin-left: auto; 239 | margin-right: auto; 240 | } 241 | 
.remark-slide-content.right pre { 242 | margin-left: auto; 243 | margin-right: unset; 244 | } 245 | 246 | /* Slide Header Background for h1 elements */ 247 | .remark-slide-content.header_background > h1 { 248 | display: block; 249 | position: absolute; 250 | top: 0; 251 | left: 0; 252 | width: 100%; 253 | background: var(--header-background-color); 254 | color: var(--header-background-text-color); 255 | padding: 2rem 64px 1.5rem 64px; 256 | margin-top: 0; 257 | box-sizing: border-box; 258 | } 259 | .remark-slide-content.header_background { 260 | padding-top: 7rem; 261 | } 262 | 263 | @page { margin: 0; } 264 | @media print { 265 | .remark-slide-scaler { 266 | width: 100% !important; 267 | height: 100% !important; 268 | transform: scale(1) !important; 269 | top: 0 !important; 270 | left: 0 !important; 271 | } 272 | } 273 | 274 | .base { 275 | color: var(--base); 276 | } 277 | .bg-base { 278 | background-color: var(--base); 279 | } 280 | .white { 281 | color: var(--white); 282 | } 283 | .bg-white { 284 | background-color: var(--white); 285 | } 286 | .black { 287 | color: var(--black); 288 | } 289 | .bg-black { 290 | background-color: var(--black); 291 | } 292 | 293 | 294 | 295 | /* Extra CSS */ 296 | .small { 297 | font-size: 70%; 298 | } 299 | .extra-small { 300 | font-size: 50%; 301 | } 302 | .full-width { 303 | display: flex; 304 | width: 100%; 305 | flex: 1 1 auto; 306 | } 307 | --------------------------------------------------------------------------------