├── .gitignore ├── 01-Estimation-leading_up_to_MLE.R ├── HW ├── Exercise-1c-R.Rmd ├── Exercise-1c-R.html ├── Exercise-1c-R.pdf ├── Exercise-3c-R.Rmd ├── Exercise-3c-R.pdf └── old │ ├── HW1 │ ├── HW1.Rmd │ ├── HW1.html │ ├── HW1.pdf │ ├── HW1.tex │ ├── hw1_q3_solution.Rmd │ └── hw1_q3_solution.html │ ├── HW2 │ ├── HW2.Rmd │ ├── HW2.html │ └── iris.jpg │ ├── HW3 │ ├── HW3.Rmd │ ├── HW3.html │ └── answers │ │ ├── EX09_Questions_and_Solutions.pdf │ │ ├── country_code.csv │ │ ├── federalelections2016.xls │ │ ├── is_republican.csv │ │ └── solution_ex3_q2.R │ ├── HW4 │ └── first_semester │ │ ├── HW4.Rmd │ │ └── HW4.html │ ├── HW5 │ └── HW5.pdf │ ├── Submission guidelines for data analysis course 2020.docx │ ├── Submission guidelines for data analysis course 2020.pdf │ └── Submission guidelines for data analysis course 2019.pdf ├── README.md ├── additional_notes ├── MLE_Bernoulli.Rmd └── MLE_Bernoulli.pdf ├── books └── openintro-statistics-sample.pdf ├── distribution_shiny_app ├── app.R ├── distribution_shiny_app.Rproj └── plot_z_score.R ├── exam_examples ├── Exam_Moed_A.pdf ├── Exam_Moed_A_answers.pdf ├── Exercise_Examples.Rmd ├── Exercise_Examples.pdf ├── Exercise_Examples_Answers.Rmd └── TSLA.csv ├── exercises └── old │ ├── 10 │ ├── 10.pdf │ └── first_semester │ │ ├── 08.pdf │ │ ├── 10.pdf │ │ └── Exam Questiuon.pdf │ ├── 11 │ ├── ex 11.docx │ ├── ex 11.pdf │ └── first_semester │ │ ├── HW4 Q1 and Q2 solution.docx │ │ ├── HW4 Q1 and Q2 solution.pdf │ │ └── ex11.pdf │ ├── 12 │ ├── 12.docx │ ├── 12.pdf │ └── first_semester │ │ └── ex12.pdf │ ├── 13 │ ├── ex 13.pdf │ └── first_semester │ │ └── ex 13.pdf │ ├── 01 - Intro to R │ ├── 00-Introduction.Rmd │ ├── 00-Introduction.html │ ├── 01- More Operations.Rmd │ ├── 01--More-Operations.html │ ├── 01-Syntax, functions, loops, data types.Rmd │ ├── 01-Syntax,-functions,-loops,-data-types.html │ └── example_file.csv │ ├── 02 │ ├── 02.Rmd │ ├── 02.html │ ├── EX 02 Q1 Q2.docx │ ├── EX 02 Q1 Q2.pdf │ ├── t.png │ └── x^2.png │ ├── 03 │ ├── 03- Point estimation and dplyr package.Rmd │ ├── 03-_Point_estimation_and_dplyr_package.html │ ├── 03-_Point_estimation_and_dplyr_package.pdf │ ├── 03-_Point_estimation_and_dplyr_package.tex │ ├── ex03.zip │ ├── hw1_q3_solution.html │ └── hw1_q3_solution.pdf │ ├── 04 │ ├── 04.Rmd │ └── 04.html │ ├── 05 │ ├── 05.Rmd │ ├── 05.html │ ├── EX 05 - Intro to hypothesis tests.pdf │ ├── Q1_2.docx │ ├── Q1_2.pdf │ ├── type1_type2_errors.png │ └── type1_type2_errors2.jpg │ ├── 06 │ ├── EX06.docx │ └── EX06.pdf │ ├── 07 │ ├── EX07.pdf │ └── cs229-notes1.pdf │ ├── 08 │ ├── EX08.pdf │ └── Ex08.docx │ └── 09 │ ├── ex09.docx │ └── ex09.pdf ├── intro_statistics_R.Rproj ├── labs ├── Independence test.Rmd ├── answers │ ├── Data_science_workflow_lab-answers.html │ ├── food_consumption-answers.Rmd │ └── netflix movies and tv shows exercise - answers.Rmd ├── data │ ├── netflix_titles.csv │ └── sf_trees.csv ├── food_consumption.Rmd ├── linear_regression.Rmd └── netflix movies and tv shows exercise.Rmd ├── lectures ├── 00-Introduction.pptx ├── 00-intro-binomial-dist.R ├── 00-introduction │ ├── 00-introduction_script.Rmd │ ├── IWER34_2019.xlsx │ ├── Lego_parts.csv │ └── st02_03.xls ├── 01-Estimation methods and Intervals.Rmd ├── 01-Point Estimation Methods and Intervals.pdf ├── 02-Intervals.Rmd ├── 02-Intervals.pdf ├── 03 - Hypothesis Tests.pdf ├── 03-Hypothesis_tests.Rmd ├── 03-Hypothesis_tests_part_b.Rmd ├── 04 - Statistical inference for Two Samples.pdf ├── 04-Statistical_inference_two_samples - part B.Rmd ├── 
04-Statistical_inference_two_samples.Rmd ├── 05 - Simple Linear Regression.pdf ├── 05-Simple_linear_regression.Rmd ├── 06 - Multiple Linear Regression and Correlation.pdf ├── 06-Multiple_linear_regression_and_correlation.Rmd ├── 06-Note_about_overfitting.Rmd ├── 07 - Regression, Design and Analysis of Single-Factor Experiments.pdf ├── 07-Multiple regression and exercises.Rmd ├── 08-Single_factor_experiments_ANOVA.Rmd ├── 09-One_Two_way_ANOVA.Rmd ├── 09-One_Two_way_ANOVA.pdf ├── Example_for_multicolinearity_problem.R ├── data │ ├── ipf_lifts.csv │ ├── montgomery_13.5_fabric_strength.csv │ ├── montgomery_14.5_adhesion_force.csv │ ├── movie_db_clean.csv │ ├── wildlife_impacts_medium.csv │ └── wildlife_impacts_small.csv ├── files_during_lecture │ ├── 05-file1.R │ ├── 05-file2.R │ └── 05-file3.R ├── images │ ├── Type_IandType_II_errors.jpg │ ├── birds_eye_view1.svg │ ├── link_for_survey_example.png │ ├── speeding_ticket.png │ └── waze_not_accurate.jpg ├── mult_lin_reg_example.R ├── what_is_z_score.R └── xaringan-themer.css ├── misc ├── Distribution tables - x^2, z, f, t.pdf ├── extended_formula_page.pdf ├── init_course_plan.md ├── syllabus_05601823_2022.pdf └── tau_engineering_logo.png ├── population_vs_sample ├── .Rhistory ├── app.R ├── population_vs_sample.Rproj └── rsconnect │ └── shinyapps.io │ └── sarid │ └── population_vs_sample.dcf ├── project ├── Project Instructions.Rmd ├── Project Instructions.pdf ├── example_star_trek_script_analysis.Rmd └── examples │ ├── College Major (I.P, E.T).html │ ├── FIFA 2019 Analysis - Inbar Siloni.Rmd │ ├── FIFA 2019 Analysis - Inbar Siloni.html │ ├── Final_Project_Gil_Shwartz.html │ └── spotify_project (D.S, S.K).html └── xaringan-themer.css /.gitignore: -------------------------------------------------------------------------------- 1 | montgomery.pdf 2 | .Rproj.user 3 | /exercises/04/heights.jpg 4 | /.Rhistory 5 | .RData 6 | /exercises/05/Q1 (on board).docx 7 | /exercises/Submission guidelines for data analysis course.docx 8 | /exercises/07/EX07.docx 9 | /exercises/08/08.docx 10 | /lectures/data/wildlife_impacts.csv 11 | lectures/libs 12 | /HW/HW3/answers/EX09_Questions_and_Solutions.docx 13 | /exercises/10/10.docx 14 | /exercises/11/HW4 Q2 solution.docx 15 | /HW/HW5/HW5.docx 16 | /exercises/12/~$ex12.docx 17 | /exercises/12/ex12.docx 18 | /exercises/13/ex 13.docx 19 | /lectures/*.html 20 | /lectures/*_files 21 | distribution_shiny_app/rsconnect 22 | labs/rsconnect/ 23 | /labs/*_files/ 24 | /labs/*.html 25 | 26 | # ignore temporary files declared via tmp_*.* 27 | tmp_*.* 28 | /tmp_bug_example -------------------------------------------------------------------------------- /01-Estimation-leading_up_to_MLE.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | density_func <- tibble(x = seq(-2, 2, 0.1)) %>% 4 | mutate(dens = dnorm(x = x, mean = 0, sd = 1)) 5 | 6 | set.seed(0) 7 | randomized_dots <- tibble(rnd = rnorm(n = 1000, mean = 0.4, sd = 1.1)) 8 | 9 | # Demonstrate by changing this: 10 | 11 | num_points <- 50 12 | 13 | ggplot(density_func, aes(x = x, y = dens)) + 14 | geom_line() + 15 | geom_density(data = randomized_dots[1:num_points,], aes(rnd), inherit.aes = F) + 16 | geom_point(data = randomized_dots[1:num_points,], 17 | aes(x = rnd, y = 0), alpha = 0.4, inherit.aes = F, size = 3) -------------------------------------------------------------------------------- /HW/Exercise-1c-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 
"תרגיל בית 1ג' (תרגול ב-R)" 3 | subtitle: "היכרות ראשונית עם R" 4 | output: 5 | knit: pagedown::chrome_print 6 | --- 7 | 8 | ```{=html} 9 | 39 | ``` 40 | ```{r setup, include=FALSE} 41 | suppressWarnings(suppressMessages(library(tidyverse))) 42 | library(palmerpenguins) 43 | ``` 44 | 45 | בתרגיל זה תתקינו את R, את חבילת `tidyverse` ותתרגלו מעט עבודה עם נתונים. 46 | 47 | ## חלק ראשון 48 | 49 | התקינו את תוכנת R ואת RStudio. 50 | 51 | את תוכנת R ניתן להוריד בקישור הבא: 52 | 53 | 54 | 55 | את RStudio ניתן להוריד בקישור הבא: 56 | 57 | 58 | 59 | שימו לב להשתמש בגרסאות המתאימות לכם (יש גרסאות שונות ל-Windows, macOS, ו-Linux) 60 | 61 | כעת לאחר שהתקנתם, הפעילו את RStudio והתקינו את חבילת `tidyverse` על ידי הקלדה ב-Console של: 62 | 63 | ```{r, eval=FALSE} 64 | install.packages("tidyverse") 65 | ``` 66 | 67 | זה עשוי לקחת קצת זמן, אז התאזרו בסבלנות. 68 | 69 | אם קיבלתם הודעת שגיאה כגון: *Error in install.packages : cannot open file* זה כנראה נובע מבעיות הרשאה לתיקייה אליה R מנסה להתקין את החבילה. אפשר לפתור את הבעיה הזו על ידי העלאת RStudio עם כפתור ימני של העכבר, ואז Run as Administrator (ואז לנסות להתקין מחדש). 70 | 71 | ## חלק שני 72 | 73 | התקינו את חבילת `palmerpenguins`. השתמשו באותה הפקודה בה השתמשנו בחלק הראשון לצורך התקנת חבילה (רק שנו את השם tidyverse לשם של החבילה `palmerpenguins`). 74 | 75 | כעת השתמשו בקוד הבא ותארו מהם הנתונים הקיימים בטבלה `penguins`: 76 | 77 | ```{r, eval=FALSE} 78 | library(tidyverse) 79 | library(palmerpenguins) 80 | 81 | glimpse(penguins) 82 | ``` 83 | 84 | הריצו גם את הפקודות הבאות וכתבו מה ההבדל בין הפונקציה `glimpse`, `head`, `tail`, `view`, ולרשום `penguins` ב-console. 85 | 86 | ```{r, eval=FALSE} 87 | head(penguins) 88 | tail(penguins) 89 | view(penguins) 90 | penguins 91 | ``` 92 | 93 | ------------------------------------------------------------------------ 94 | 95 | - מה מתארת כל שורה בטבלה? 96 | 97 | - מה מתאר המשתנה island? 98 | 99 | - מה מתאר המשתנה species? 100 | 101 | השתמשו בפונקציה count בדומה לאופן המתואר להלן, וענו על השאלות: 102 | 103 | - כמה איים מתועדים בטבלה? 104 | 105 | - כמה זכרים מתועדים בטבלה? 106 | 107 | - לכמה תצפיות חסרים נתונים זכר/נקבה? 108 | 109 | דוגמה לשימוש בפונקציה count (תצטרכו להרחיב את הדוגמה הזו על ידי שינוי הקוד) 110 | 111 | ```{r, eval=FALSE} 112 | penguins %>% 113 | count(species) 114 | ``` 115 | 116 | ## חלק שלישי 117 | 118 | הקוד הבא מצייר תרשים של הקשר בין אורך מקור ואורך כנף. 119 | 120 | ```{r} 121 | ggplot(data = penguins, 122 | aes(x = bill_length_mm, 123 | y = flipper_length_mm, 124 | color = species)) + 125 | geom_point() 126 | ``` 127 | 128 | ענו על השאלות הבאות: 129 | 130 | 1. מה אומרת הודעת האזהרה של R? מה המשמעות שלה? (Removed 2 rows containins...) 131 | 132 | 2. הסבירו מה הקשר שמתאר התרשים בין שלושת המשתנים שבו (שני הצירים וצבע הנקודות). 133 | 134 | 3. עבור כל פונקציה שהשתמשנו בה, הסבירו מה התפקיד שלה ביצירת התרשים (להלן הפונקציות שבהן השתמשנו `ggplot`, `aes`, `geom_point`). באפשרותכם להיעזר [בעמוד הזה](https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-visualization.pdf) 135 | 136 | 4. כתבו קוד אשר יצייר תרשים שמתאר את הקשר בין אורך המקור, עומק המקור, והאי שבו נמדדה הדגימה (לשם כך היעזרו בקוד של התרשים הקודם, ושנו אותו כך שיתאים לצרכים). 137 | 138 | 5. נתחו את התרשים: מה הקשר שאתם מבחינים בו? מדוע האי שבו נמדדה הדגימה משפיע על הקשר בין האורך והעומק של המקור? 
139 | -------------------------------------------------------------------------------- /HW/Exercise-1c-R.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/HW/Exercise-1c-R.pdf -------------------------------------------------------------------------------- /HW/Exercise-3c-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "עבודה עם Dataset, חילוץ נתונים, ושימוש בפונקציות בסיסיות של `tidyverse`" 3 | subtitle: "`r Sys.Date()`" 4 | output: 5 | knit: pagedown::chrome_print 6 | --- 7 | 8 | ```{=html} 9 | 39 | ``` 40 | ```{r setup, include=FALSE} 41 | suppressWarnings(suppressMessages(library(tidyverse))) 42 | library(palmerpenguins) 43 | ``` 44 | 45 | בתרגיל זה תתנסו במספר פונקציות בסיסיות ב-R (ולמעשה פונקציות של tidyverse). 46 | 47 | ## חלק ראשון 48 | 49 | קראו את התיעוד בעמוד הבא: 50 | 51 | https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-09-22/readme.md 52 | 53 | 54 | השתמשו בפונקציה `read_csv` מחבילת `readr` על מנת לקרוא את שלושת הקבצים הבאים. ניתן להיעזר בקוד הבא: 55 | 56 | ```{r, eval=FALSE} 57 | members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv') 58 | expeditions <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/expeditions.csv') 59 | peaks <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/peaks.csv') 60 | ``` 61 | 62 | על מנת לתאר את הדאטה, באופן כללי, השתמשו בפונקציות `count`, `filter`, `arrange`, `distinct` וענו על השאלות הבאות: 63 | 64 | 1. אילו שנים מתועדות בקבצים (טווח השנים של משלחות להימלאיה) 65 | 66 | 2. כמה אזרחויות שונות היו מעורבות במשלחות להימלאיה, עד לשנת 1950? 67 | 68 | 3. על בסיס הנתונים, מה לדעתכם העונה הטובה ביותר לטפס על פסגת האוורסט? 69 | 70 | 4. כמה משלחות מתועדות בקובץ, שניסו לטפס על האוורסט (ולא הצליחו להעפיל לפסגה) לפני שנת 1953? 71 | 72 | 5. כמה פסגות ברכס ההימלאיה מגיעות לרום של מעל 800 מטר? 73 | 74 | ## חלק שני 75 | 76 | בחלק זה תשתמשו בפונקציה `mutate` על מנת לבחור משתנים ולערוך טרנספורמציות. 77 | 78 | היעזרו בקוד הבא, על מנת לייצר טבלה חדשה עם שני משתנים בוליאניים חדשים: `is_doctor`, `is_leader`. 79 | 80 | הסבירו מה עושה הפונקציה `str_detect`, ומה המשמעות של כל שורה בקוד (מה עושה כל שורה). 81 | 82 | ```{r, eval=FALSE} 83 | leader_table <- members %>% 84 | mutate(is_leader = str_detect(expedition_role, "Leader")) %>% 85 | mutate(is_doctor = str_detect(expedition_role, "Doctor")) 86 | ``` 87 | 88 | השתמשו בפונקציה `count` על מנת לייצר טבלה שתראה את כל הצירופים האפשריים של is_leader, is_doctor, ומספר התצפיות. כמה מובילי משלחות יש שהם גם רופאים? 89 | 90 | 91 | 92 | השתמשו ב-`mutate` ובפונקציה `cut` על מנת לבנות טבלה חדשה שבה יש משתנה שנקרא `decade`, המתאר את העשור שבו יצאה המשלחת. ניתן להיעזר בקוד הבא (השלימו את הקוד). 93 | 94 | לאחר מכן, צרו תרשים שיציג את מספר המשלחות בכל עונה בכל עשור. 95 | 96 | ```{r, eval=FALSE} 97 | expeditions_new <- expeditions %>% 98 | mutate(decade = cut(year, breaks = c(___, ___, ___, ...))) %>% 99 | count(season, ___) 100 | 101 | ggplot(___, aes(x = decade, y = ___, fill = season)) + 102 | geom_col(position = position_dodge()) 103 | 104 | ``` 105 | 106 | מה ההבדל בין שימוש ב-`position_dodge` לבין `position_stack` ו-`position_fill` בקוד לעיל? 
107 | 108 | ## חלק שלישי 109 | 110 | נניח שאתם מתכננים להעפיל לפסגה ברכס ההימלאיה, של מעל ל-8000 מטר. 111 | 112 | 1. צרו טבלה חדשה עם רשימה של פסגות אלו. 113 | 2. מיהן עשרת סוכנויות המסע (`trekking_agency`) שהוציאו הכי הרבה משלחות? 114 | 3. השתמשו ב-`group_by` וב-`summarize` על מנת לחשב את ממוצע התמותה לכל משלחת, באופן כללי, וגם עבור הפסגות מעל 8000 מטר בלבד (ניתן לחלק סעיף זה לשני חישובים נפרדים). 115 | 4. עם אילו חברות הייתם שוקלים לצאת, ועם אילו לא? -------------------------------------------------------------------------------- /HW/Exercise-3c-R.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/HW/Exercise-3c-R.pdf -------------------------------------------------------------------------------- /HW/old/HW1/HW1.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Introduction to Statistics and Data Analysis with R - Homework #1' 3 | author: "Adi Sarid and Afek Adler" 4 | date: "`r Sys.Date()`" 5 | output: 6 | pdf_document: default 7 | html_document: default 8 | --- 9 | 10 | This homework sheet is due on the 2020-04-01 on 24:00. You may submit your answers in pairs. 11 | Submission will be performed electronically via the moodle. 12 | 13 | We urge you to start solving this sheet as soon as possible and, if you have any questions, come to visit us in reception hours next week. 14 | 15 | The exercise is divided into two parts: Technical (programming in `R`) and theoretical. 16 | 17 | Submit the following questions: 18 | 19 | * Q1 - only 3.6.1.6 (specified later). 20 | * Q2 21 | * Q4 22 | * Q5 23 | 24 | 25 | 26 | # Technical (programming in `R`) 27 | 28 | ## Question 1: 29 | 30 | Please read the following chapters in [R4DS - https://r4ds.had.co.nz](https://r4ds.had.co.nz): 31 | 32 | 1. [Introduction](https://r4ds.had.co.nz/introduction.html) 33 | 2. [Explore - Introduction](https://r4ds.had.co.nz/explore-intro.html) 34 | 3. [Explore - Data visualizations](https://r4ds.had.co.nz/data-visualisation.html) 35 | 4. [Workflow](https://r4ds.had.co.nz/workflow-basics.html) 36 | 37 | Solve exercise 3.6.1,3.81, 4.4. **submit the code for question 6 (in 3.6.1)** ("*Recreate the R code necessary to generate the following graphs*"). 38 | 39 | ## Question 2: 40 | 41 | In this question, you will get acquainted (or reminded of) the following distributions: 42 | 43 | * Normal distribution $N(\mu, \sigma)$ 44 | * Student's t $t_{\operatorname{df}}$ 45 | * Chi-square $\chi^2$ 46 | 47 | Complete the blanks (`___`) in the following code, to generate $n=100$ random values from each of these distributions with: 48 | 49 | * Normal with $\mu=3, \sigma = 1.5$ 50 | * Student's-t with $\operatorname{df}=10$ 51 | * Chi-square with $\operatorname{df}=12$ 52 | 53 | Tip: if you type a `?` followed by the command name in the console, you will see its documentation. I.e., type `?rnorm` to see the help on the random number generator for the normal distribution. 54 | 55 | ### Complete the blanks: 56 | 57 | ``` 58 | set.seed(0) # we set the seed of the random generator so that your results will be consistent 59 | 60 | random_normal <- rnorm(n = ___, mean = ___, ___ = 1.5) 61 | random_t <- rt(n = ___, df = ___) 62 | random_chi <- rchisq(n = ___, df = ___) 63 | 64 | ``` 65 | 66 | ### Plot by completing the blanks: 67 | 68 | Plot each of these samples using `ggplot2`. Think, what `geom` would you use to plot the distribution of the sample? 
69 |
70 | ```
71 |
72 | # if you don't have the tidyverse package first install by running
73 | # install.packages("tidyverse")
74 |
75 | library(tidyverse)
76 |
77 | all_random_data <- tibble(random_normal, random_t, random_chi)
78 |
79 | ggplot(all_random_data, aes(random_normal)) +
80 | geom____()
81 |
82 | ggplot(all_random_data, aes(random_t)) +
83 | ___
84 |
85 | ggplot(all_random_data, aes(random_chi)) +
86 | ___
87 |
88 | ```
89 |
90 | ### Answer these:
91 |
92 | 1. Is the original distribution symmetric? Do the plots look symmetric? Why?
93 | 2. Generally speaking (not relating to the specific sample you obtained), what is the relationship between the mean and median of each of these distributions?
94 | 3. What would happen if we increase $n$ from 100 to 1000?
95 |
96 | a. What would the distribution look like?
97 | b. Why?
98 | c. Modify your code and visualize the updates.
99 |
100 |
101 | # Theoretical
102 |
103 | ## Question 3:
104 |
105 | In the smallest branch of the smallest bank, the number of customers in the queue (waiting customers) is a random variable $Q\in\{0,1,2\}$. You cannot have more than 2 customers waiting in the queue, because they've been downsizing and the branch is really small.
106 |
107 | The distribution of $Q$ depends on a parameter $\theta$.
108 |
109 | $$Q = \left\{\begin{array}{ll}0 & \text{w.p. }4\theta^2\\
110 | 1 & \text{w.p. }4\theta-8\theta^2\\
111 | 2 & \text{w.p. }1-4\theta+4\theta^2\end{array}\right.$$
112 |
113 | The bank's headquarters randomly sampled the queue at five independent times. The results were $\{0,1,0,0,0\}$ customers in the queue.
114 |
115 | ### Answer the following questions:
116 |
117 | 1. Find an unbiased estimator $\hat{\Theta}$ for the parameter $\theta$ for a sample of size $n=5$. What is $\hat{\Theta}$ based on the current sample? (you should get 0.45)
118 | 2. Find an unbiased estimator for the expected number of customers waiting in the queue based on a sample of size $n=5$. What is the estimate of the expected number of customers, based on the current sample? (0.2)
119 | 3. Find an estimator for $\theta$ using the maximum likelihood estimation method. (0.45)
120 |
121 | ## Question 4:
122 |
123 | Let $X$ be a Bernoulli random variable. Its probability density function can be formulated as follows:
124 |
125 | \[
126 | f(x ; p)=\left\{\begin{array}{ll}{p^{x}(1-p)^{1-x}} & {x=0,1} \\ {0} & {\text { otherwise }}\end{array}\right.
127 | \]
128 |
129 | 1. Show that $X=1$ with probability $p$ and that $X=0$ with probability $1-p$.
130 | 2. Suppose we get a random sample of size $n$ from a Bernoulli distribution. What is the likelihood function $L(p)$ of the sample? (What is the probability $P\left(X_{1}=x_{1}, X_{2}=x_{2}, \ldots, X_{n}=x_{n}\right)$?)
131 | 3. Apply the $\log$ transformation to this likelihood function. What do you get?
132 | 4. 
Find the $p$ that maximizes $\log L(p)$ 133 | 134 | ## Question 5: 135 | 136 | For the following probability density function: 137 | \[f(x)=\left\{\begin{array}{ll}{\frac{2}{\theta^{2}}(\theta-x)} & {0\linewidth\linewidth\else\Gin@nat@width\fi} 48 | \def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} 49 | \makeatother 50 | % Scale images if necessary, so that they will not overflow the page 51 | % margins by default, and it is still possible to overwrite the defaults 52 | % using explicit options in \includegraphics[width, height, ...]{} 53 | \setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} 54 | % Set default figure placement to htbp 55 | \makeatletter 56 | \def\fps@figure{htbp} 57 | \makeatother 58 | \setlength{\emergencystretch}{3em} % prevent overfull lines 59 | \providecommand{\tightlist}{% 60 | \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} 61 | \setcounter{secnumdepth}{-\maxdimen} % remove section numbering 62 | 63 | \title{Introduction to Statistics and Data Analysis with R - Homework \#1} 64 | \author{Adi Sarid and Afek Adler} 65 | \date{2020-03-18} 66 | 67 | \begin{document} 68 | \maketitle 69 | 70 | This homework sheet is due on the 2020-04-01 on 24:00. You may submit 71 | your answers in pairs. Submission will be performed electronically via 72 | the moodle. 73 | 74 | We urge you to start solving this sheet as soon as possible and, if you 75 | have any questions, come to visit us in reception hours next week. 76 | 77 | The exercise is divided into two parts: Technical (programming in 78 | \texttt{R}) and theoretical. 79 | 80 | Submit the following questions: 81 | 82 | \begin{itemize} 83 | \tightlist 84 | \item 85 | Q1 - only 3.6.1.6 (specified later). 86 | \item 87 | Q2 88 | \item 89 | Q4 90 | \item 91 | Q5 92 | \end{itemize} 93 | 94 | \hypertarget{technical-programming-in-r}{% 95 | \section{\texorpdfstring{Technical (programming in 96 | \texttt{R})}{Technical (programming in R)}}\label{technical-programming-in-r}} 97 | 98 | \hypertarget{question-1}{% 99 | \subsection{Question 1:}\label{question-1}} 100 | 101 | Please read the following chapters in \href{https://r4ds.had.co.nz}{R4DS 102 | - https://r4ds.had.co.nz}: 103 | 104 | \begin{enumerate} 105 | \def\labelenumi{\arabic{enumi}.} 106 | \tightlist 107 | \item 108 | \href{https://r4ds.had.co.nz/introduction.html}{Introduction} 109 | \item 110 | \href{https://r4ds.had.co.nz/explore-intro.html}{Explore - 111 | Introduction} 112 | \item 113 | \href{https://r4ds.had.co.nz/data-visualisation.html}{Explore - Data 114 | visualizations} 115 | \item 116 | \href{https://r4ds.had.co.nz/workflow-basics.html}{Workflow} 117 | \end{enumerate} 118 | 119 | Solve exercise 3.6.1,3.81, 4.4. \textbf{submit the code for question 6 120 | (in 3.6.1)} (``\emph{Recreate the R code necessary to generate the 121 | following graphs}''). 
122 | 123 | \hypertarget{question-2}{% 124 | \subsection{Question 2:}\label{question-2}} 125 | 126 | In this question, you will get acquainted (or reminded of) the following 127 | distributions: 128 | 129 | \begin{itemize} 130 | \tightlist 131 | \item 132 | Normal distribution \(N(\mu, \sigma)\) 133 | \item 134 | Student's t \(t_{\operatorname{df}}\) 135 | \item 136 | Chi-square \(\chi^2\) 137 | \end{itemize} 138 | 139 | Complete the blanks (\texttt{\_\_\_}) in the following code, to generate 140 | \(n=100\) random values from each of these distributions with: 141 | 142 | \begin{itemize} 143 | \tightlist 144 | \item 145 | Normal with \(\mu=3, \sigma = 1.5\) 146 | \item 147 | Student's-t with \(\operatorname{df}=10\) 148 | \item 149 | Chi-square with \(\operatorname{df}=12\) 150 | \end{itemize} 151 | 152 | Tip: if you type a \texttt{?} followed by the command name in the 153 | console, you will see its documentation. I.e., type \texttt{?rnorm} to 154 | see the help on the random number generator for the normal distribution. 155 | 156 | \hypertarget{complete-the-blanks}{% 157 | \subsubsection{Complete the blanks:}\label{complete-the-blanks}} 158 | 159 | \begin{verbatim} 160 | set.seed(0) # we set the seed of the random generator so that your results will be consistent 161 | 162 | random_normal <- rnorm(n = ___, mean = ___, ___ = 1.5) 163 | random_t <- rt(n = ___, df = ___) 164 | random_chi <- rchisq(n = ___, df = ___) 165 | \end{verbatim} 166 | 167 | \hypertarget{plot-by-completing-the-blanks}{% 168 | \subsubsection{Plot by completing the 169 | blanks:}\label{plot-by-completing-the-blanks}} 170 | 171 | Plot each of these samples using \texttt{ggplot2}. Think, what 172 | \texttt{geom} would you use to plot the distribution of the sample? 173 | 174 | \begin{verbatim} 175 | 176 | # if you don't have the tidyverse package first install by running 177 | # install.packages("tidyverse") 178 | 179 | library(tidyverse) 180 | 181 | all_random_data <- tibble(random_normal, random_t, random_chi) 182 | 183 | ggplot(all_random_data, aes(random_normal)) + 184 | geom____() 185 | 186 | ggplot(all_random_data, aes(random_t)) + 187 | ___ 188 | 189 | ggplot(all_random_data, aes(random_chi)) + 190 | ___ 191 | \end{verbatim} 192 | 193 | \hypertarget{answer-these}{% 194 | \subsubsection{Answer these:}\label{answer-these}} 195 | 196 | \begin{enumerate} 197 | \def\labelenumi{\arabic{enumi}.} 198 | \item 199 | Is the original distribution symmetric? does the plots look symmetric, 200 | why? 201 | \item 202 | Generally speaking (not relating to the specific sample you obtained), 203 | what is the relationship between the mean and median of each of these 204 | distributions? 205 | \item 206 | What would happen if we increase \(n\) from 100 to 1000? 207 | 208 | \begin{enumerate} 209 | \def\labelenumii{\alph{enumii}.} 210 | \tightlist 211 | \item 212 | How would the distribution look like? 213 | \item 214 | Why? 215 | \item 216 | Modify your code and visualize the updates. 217 | \end{enumerate} 218 | \end{enumerate} 219 | 220 | \hypertarget{theoretical}{% 221 | \section{Theoretical}\label{theoretical}} 222 | 223 | \hypertarget{question-3}{% 224 | \subsection{Question 3:}\label{question-3}} 225 | 226 | In the smallest branch of the smallest bank, the number of customers in 227 | the queue (waiting customers), is a random variable \(Q\in\{0,1,2\}\). 228 | You cannot have more than 2 customers waiting in the queue, because 229 | the've been downsizing and the branch is really small. 
230 | 231 | The distribution of \(Q\) is dependent on a parameter \(\theta\). 232 | 233 | \[Q = \left\{\begin{array}{ll}0 & \text{w.p. }4\theta^2\\ 234 | 1 & \text{w.p. }4\theta-8\theta^2\\ 235 | 2 & \text{w.p. }1-4\theta+4\theta^2\end{array}\right.\] 236 | 237 | The bank's headquarters randomly sampled the queue during five 238 | independent times. The results were \(\{0,1,0,0,0\}\) customers in the 239 | queue. 240 | 241 | \hypertarget{answer-the-following-questions}{% 242 | \subsubsection{Answer the following 243 | questions:}\label{answer-the-following-questions}} 244 | 245 | \begin{enumerate} 246 | \def\labelenumi{\arabic{enumi}.} 247 | \tightlist 248 | \item 249 | Find an unbiased estimator \(\hat{\Theta}\) for the parameter 250 | \(\theta\) for a sample of size \(n=5\). What is \(\hat{\Theta}\) 251 | based on the current sample? (you should get 0.45) 252 | \item 253 | Find an unbiased estimator for the expected number of customers 254 | waiting in the queue based on a sample of size \(n=5\). What is the 255 | estimate of the expected number of customers, based on the current 256 | sample? (0.2) 257 | \item 258 | Find an estimator for \(\theta\) in the maximum likelihood estimation 259 | method. (0.45) 260 | \end{enumerate} 261 | 262 | \hypertarget{question-4}{% 263 | \subsection{Question 4:}\label{question-4}} 264 | 265 | let \(X\) be a random Bernoulli variable. It's probability density 266 | function can be formulated as follows: 267 | 268 | \[ 269 | f(x ; p)=\left\{\begin{array}{ll}{p^{x}(1-p)^{1-x}} & {x=0,1} \\ {0} & {\text { otherwise }}\end{array}\right. 270 | \] 271 | 272 | \begin{enumerate} 273 | \def\labelenumi{\arabic{enumi}.} 274 | \tightlist 275 | \item 276 | Show that \(X=1\) with probability \(p\) and that \(X=0\) with 277 | probability \(1-p\) 278 | \item 279 | Suppose we get a random sample of size \(n\) from a Bernulli 280 | distribution. What is the likelihood function \(L(p)\) of the sample? 281 | (what is the probability that 282 | \(P\left(X_{1}=x_{1}, X_{2}=x_{2}, \ldots, X_{n}=x_{n}\right)\)) 283 | \item 284 | Apply the \(\log\) transformation on this likelihood function, what do 285 | you get? 
286 | \item 287 | Find the \(p\) that maximizes \(\log L(p)\) 288 | \end{enumerate} 289 | 290 | \hypertarget{question-5}{% 291 | \subsection{Question 5:}\label{question-5}} 292 | 293 | For the following probability density function: 294 | \[f(x)=\left\{\begin{array}{ll}{\frac{2}{\theta^{2}}(\theta-x)} & {0% 7 | data.frame() %>% 8 | mutate(income_fct = cut(Income, 2), 9 | hs_grad_fct = cut(HS.Grad, 2)) 10 | 11 | country_name_code <- read_csv("c:/temp/country_code.csv", col_names = c("country", "code")) %>% 12 | distinct(country, code) 13 | 14 | country_vote <- read_csv("c:/temp/is_republican.csv", 15 | col_names = c("code", "is_rep"), 16 | skip = 1) %>% 17 | mutate(is_rep = !is.na(is_rep)) %>% 18 | left_join(country_name_code) 19 | 20 | x.77_voting_pattern <- newx.77 %>% 21 | mutate(country = rownames(state.x77)) %>% 22 | left_join(country_vote) 23 | 24 | 25 | chisq.test(x = newx.77$income_fct, y = newx.77$hs_grad_fct, 26 | simulate.p.value = F) 27 | 28 | t.test(formula = HS.Grad ~ is_rep, data = x.77_voting_pattern) 29 | -------------------------------------------------------------------------------- /HW/old/HW4/first_semester/HW4.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Introduction to Statistics and Data Analysis with R - Homework #4' 3 | author: "Adi Sarid and Afek Adler" 4 | date: '2019-12-16' 5 | output: 6 | html_document: default 7 | pdf_document: default 8 | --- 9 | 10 | This homework sheet is due on the 2019-12-30 You may submit your answers in pairs. Submit a PDF file with your handwritten solutions. 11 | Submission will be performed electronically via Moodle. 12 | 13 | We urge you to start solving this sheet as soon as possible and, if you have any questions, come to visit us in reception hours next week. 14 | 15 | Across all the exercise. If not mentioned – use $\alpha$ = 0.05. 16 | 17 | 18 | ## Question 1 19 | 20 | A biologist performed a regression on how much a planet diameter (x) affects it's mass (y), Based on 30 samples. His concllusion was $y ̂ = 10+0.1x$. were x is in trillions of tons ($10^{12}$ kilogram). Which regression line would he get if he would use units of ten trillions of ton? ($10^{13}$ kilogram)? \ 21 | Guidance - compute the new $ss_x,ss_{xy}$ as a function of the old ones. See what happens to new $b_1 = ss_{xy}/ss_x$ and $b_0 = \bar{y} -b_1\bar{x}$ 22 | 23 | ## Question 2 24 | 25 | Given that $\bar{x} = 432.2,\sum_{i=1}^{10}x_i^2 = 2,048,810,\sum_{i=1}^{10}y_i^2 = 103,195, \hat{y} = 5.821 + 0.195x$ 26 | 27 | Calculate: 28 | 29 | 1. $\bar{y}$ 30 | 2. $R^2$ 31 | 3. is there a positive linear connection between x and y? Guidance, use hypothesis testing. 32 | 33 | 34 | ## Question 3 35 | 36 | Based on the following samples: 37 | 38 | 1. find the parameters of a simple linear regression model ($y = a + bx$) 39 | 2. Can we say that there is a linear relationship between x and y? Guidance, hypothesis test on the significance level of the regression. 
40 | 41 | | X| Y 42 | | ----|---- 43 | | 0| 0.5 44 | | 1| 2 45 | | 2| 4.2 46 | | 4| 6 47 | | 5| 6.5 48 | | 8| 8.5 49 | -------------------------------------------------------------------------------- /HW/old/HW5/HW5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/HW/old/HW5/HW5.pdf -------------------------------------------------------------------------------- /HW/old/Submission guidelines for data analysis course 2020.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/HW/old/Submission guidelines for data analysis course 2020.docx -------------------------------------------------------------------------------- /HW/old/Submission guidelines for data analysis course 2020.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/HW/old/Submission guidelines for data analysis course 2020.pdf -------------------------------------------------------------------------------- /HW/old/Submission guidelines for data analysis course 2019.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/HW/old/Submission guidelines for data analysis course 2019.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](https://raw.githubusercontent.com/adisarid/intro_statistics_R/bcdb6af4058308ebe999d0a477d6a1bb9030ffa2/misc/tau_engineering_logo.png) 2 | 3 | # Introduction to Statistics and Data Analysis with R 4 | 5 | This is the repository for the course of **introduction to statistics and data analysis**, taught in Tel-Aviv university (0560.1823). The course is taught in the Engineering faculty in the "Digital Sciences for High Tech" track. 6 | 7 | In this repository you will find all the required materials including lecture notes, references, class code, exercises, and more. 8 | 9 | ## Technical Information 10 | 11 | In this course there are 3 lecture hours + 1 exercise (instructor) hour. 12 | 13 | Lecturer: Dr. Adi Sarid. 14 | 15 | * Office hours: Please **coordinate in advance** via email. 16 | * E-mail: adi@sarid-ins.co.il. 17 | * Twitter: @SaridResearch 18 | * Mobile Phone: +972-50-8455450 (Please please try to reach out via email first). 19 | * Personal website: [adisarid.github.io](adisarid.github.io) 20 | 21 | Instructor: Mr. Raphael Shuhendler. 22 | 23 | * Office hours: Please **coordinate in advance** via email. 24 | * E-mail: shuhendler@mail.tau.ac.il. 25 | 26 | The course will be given in Hebrew, but all the supporting materials will be provided in English. 27 | 28 | Garding will be based on: 29 | 30 | * Final exam (60%) 31 | * Final project, pairs (40%) 32 | 33 | You will have homework but it's up to you to make sure you do them and understand them, we will not be grading them (only verifying that you submitted them). 34 | 35 | ## Prerequisites 36 | 37 | The prerequisites for this course are: 38 | 39 | * Introduction to Probability (0560.2801 or equivalent). 40 | * Mathematical Methods 1 (0560.2802 or equivalent). 
41 |
42 | This course is mainly designed for undergraduates with prior knowledge in probability and basic knowledge in math (a bit of algebra and a bit of calculus), doing a BA/BSc in the "Digital Sciences for High-Tech" track. However, it would also fit graduate students who want to strengthen their knowledge in statistics and data analysis (or learn the very basics of R).
43 |
44 | ## Goals
45 |
46 | This is an introductory course in statistics and data analysis. The course covers fundamental terms in statistics, such as significance, hypothesis testing, inference, sampling methods, variable types, and modelling (regression).
47 |
48 | During the course we will use the [R](https://www.r-project.org) language for demonstrations and exercises.
49 |
50 | We will use publicly available "open data sets" (e.g., from [Kaggle](https://kaggle.com) and [tidytuesday](https://github.com/rfordatascience/tidytuesday)) to demonstrate the various topics we will cover.
51 |
52 | ## Topics
53 |
54 | * Overview - from design to implementation: how statistical research is conducted, from the design phase, through data collection, to presentation.
55 | * Statistical inference and parameter estimation (e.g., average, standard deviation, percentiles).
56 | * Hypothesis testing:
57 | * Confidence intervals, unpaired tests, paired tests. Student's t-test, z-test, non-parametric tests.
58 | * Goodness of fit (Chi-square, Kolmogorov-Smirnov).
59 | * The problem with p-values and significance testing in the age of big data. False discovery rate (FDR).
60 | * Analysis of Variance (One-way and Two-way ANOVA).
61 | * Planning experiments (multiple comparisons), sample size calculations, power calculations.
62 | * Linear regression.
63 | * Correlation.
64 | * Logistic regression (if time permits).
65 |
66 | ## Software Prerequisites
67 |
68 | You will need to install [R](https://www.r-project.org) and [RStudio](https://rstudio.com/products/rstudio/download/). RStudio is not mandatory for running R, but it provides a very convenient environment for writing R code. Both are available for free (for RStudio, download the *RStudio Desktop Open Source License* version).
69 |
70 | ## Reading Materials
71 |
72 | OpenIntro Statistics is an introduction to statistics with R. It doesn't contain everything we will learn, but it provides a good intro to some topics. It can be downloaded for free [here](https://leanpub.com/openintro-statistics/) (click on "download sample" and the entire book downloads as a pdf file).
73 |
74 | * Diez, D. M., Barr, C. D., & Cetinkaya-Rundel, M. (2012). OpenIntro Statistics (pp. 174-175). OpenIntro.
75 |
76 | R4DS (R for Data Science) is a highly recommended book for learning R, and specifically the *tidyverse*, which is a collection of useful packages for data science. The book is mostly "technical", i.e., it does not provide many theoretical details. This book is also available in an online format [here](https://r4ds.had.co.nz/).
77 |
78 | * Wickham, H., & Grolemund, G. (2016). R for data science: import, tidy, transform, visualize, and model data. O'Reilly Media, Inc.
79 |
80 | Most of the theory I present during the course comes from these two books:
81 |
82 | * Walpole R.E., Myers R. H., Myers S. L., and Ye K.: Probability & Statistics for Engineers & Scientists. Prentice Hall, 9th ed., 2011. Available [online](https://fac.ksu.edu.sa/sites/default/files/probability_and_statistics_for_engineers_and_scientisst.pdf)
83 | * Runger G. & D. Montgomery: Applied Statistics and Probability for Engineers. Wiley, 7th ed., 2018. 
84 | 85 | Additional books: 86 | 87 | * Johnson, N.L. & Leone, F.C.: Statistics and Experimental Design Vol. 1.2, Wiley, 2nd ed., 1997. 88 | * Draper N. & H. Smith: Applied Regression Analysis, 3rd ed. Wiley, 1998. 89 | * Gibbons J.D.: Nonparametic Statistical Inference, Springer, 2011. 90 | 91 | ## Additional Sources 92 | 93 | You can find various online videos teaching statistics theory along with R coding examples. One such place is the Statistics of DOOM channel on youtube: [https://www.youtube.com/channel/UCMdihazndR0f9XBoSXWqnYg](https://www.youtube.com/channel/UCMdihazndR0f9XBoSXWqnYg). 94 | 95 | A recorded workshop on R [https://tau.cloud.panopto.eu/Panopto/Pages/Sessions/List.aspx?folderID=63ae0b2d-6a79-4d4d-82d5-ac8f0160961b](https://tau.cloud.panopto.eu/Panopto/Pages/Sessions/List.aspx?folderID=63ae0b2d-6a79-4d4d-82d5-ac8f0160961b). 96 | 97 | Online course in R from Tel-Hai College, available in Campus-IL [here](https://campus.gov.il/course/telhai-acd-rfp4-telhai-r/). 98 | 99 | The friendly guide for moving from Excel to R on my [youtube channel](https://www.youtube.com/watch?v=yRTD1zP5iEM&list=PLvH84evAlP42YtWm3XTjfK2ksC4evTg_U). 100 | 101 | ## How this Repository is Arranged 102 | 103 | This repository is arranged with subfolders as follows: 104 | 105 | ``` 106 | ├── exam_examples (examples for questions and exams) 107 | ├── exercises (exercise notes) 108 | ├── HW (home work exercises) 109 | ├── lectures (lecture notes) 110 | └── data (contains datasets we will use) 111 | ├── misc (miscellaneous, feel free to ignore this) 112 | └── project (project instructions and example) 113 | ``` 114 | -------------------------------------------------------------------------------- /additional_notes/MLE_Bernoulli.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "MLE for Bernoulli parameter $p$" 3 | author: "Adi Sarid" 4 | date: "`r Sys.Date()`" 5 | output: pdf_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | The likelihood function of a Bernoulli random variable is the probability of the results, i.e. 
a probability of a binomial random variable with $v$ success out of $n$ trials, with a probability $p$ for success: 13 | 14 | $$L(p)=p^v(1-p)^{(n-v)}$$ 15 | Taking the $\log(L)$ we get: 16 | 17 | $$\log(L(p)) = v\log(p) + (n-v)\log{(1-p)}$$ 18 | Then 19 | 20 | $$\frac{d\log(L(p))}{dp}=\frac{v}{p}-\frac{n-v}{1-p}$$ 21 | 22 | Require that this derivative is 0 (to find the maximum): 23 | 24 | $$\frac{v}{p} = \frac{n-v}{1-p}$$ 25 | 26 | If and only if: 27 | 28 | $$v(1-p) = p(n-v)$$ 29 | 30 | If and only if: 31 | 32 | $$np=v$$ 33 | Hence 34 | 35 | $$\hat{p}=\frac{v}{n}$$ 36 | Using the second derivative (to make sure that this is indeed a **maximum** of the likelihood): 37 | 38 | $$\frac{d\log(L(p))}{d^2p}=-\frac{v}{p^2}-\frac{n-v}{(1-p)^2} < 0$$ 39 | $$\square$$ 40 | 41 | 42 | -------------------------------------------------------------------------------- /additional_notes/MLE_Bernoulli.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/additional_notes/MLE_Bernoulli.pdf -------------------------------------------------------------------------------- /books/openintro-statistics-sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/books/openintro-statistics-sample.pdf -------------------------------------------------------------------------------- /distribution_shiny_app/app.R: -------------------------------------------------------------------------------- 1 | # 2 | # A small shiny app which illustrates the normal distribution, 3 | # and the relationship between alpha, z, one sided and two sided intervals 4 | 5 | library(shiny) 6 | library(tidyverse) 7 | 8 | source("plot_z_score.R") 9 | 10 | z_dense <- tibble(z_range = seq(-3, 3, by = 0.025), 11 | density = dnorm(z_range), 12 | p_range = pnorm(z_range)) 13 | 14 | ui <- fluidPage( 15 | theme = shinythemes::shinytheme("united"), title = "Demonstrating the normal distribution", 16 | fluidRow( 17 | sidebarLayout( 18 | sidebarPanel(width = 3, 19 | fluidRow( 20 | h3("Input parameters"), 21 | sliderInput("alpha", "α-level (alpha)", value = 0.05, 22 | min = 0, max = 1, step = 0.025, 23 | animate = TRUE), 24 | verbatimTextOutput("z"), 25 | # sliderInput("z", "z-score", value = qnorm(0.05), 26 | # min = -3, max = 3, step = 0.05), 27 | radioButtons("alternative", "Two- or one-sided test", 28 | choices = c("One sided" = "one.sided", 29 | "Two sided" = "two.sided"), 30 | selected = "one.sided" 31 | ) 32 | )), 33 | mainPanel( 34 | h3("Illustration of the normal distribution"), 35 | plotOutput("z_plot"), 36 | fluidRow( 37 | p("This app was generated by Adi Sarid, as a tool to demonstrate the relationship between the quantiles, the density, and the cumulative distribution function of a normal distribution. 
38 | Change the controls on the right to see how it reflects on the chart."), 39 | p("The source code for this app is available in the a github repository:", 40 | a("https://github.com/adisarid/intro_statistics_R/tree/master/distribution_shiny_app", 41 | href = "https://github.com/adisarid/intro_statistics_R/tree/master/distribution_shiny_app", 42 | target = "_blank")), 43 | p("The use of this tool is permitted via the cc-by-sa, with attribution to ", 44 | a("Adi Sarid", href = "https://adisarid.github.io", 45 | target = "_blank")) 46 | ) 47 | ) 48 | ) 49 | ) 50 | ) 51 | 52 | # Define server logic required to draw a histogram 53 | server <- function(input, output, session) { 54 | 55 | output$z_plot <- renderPlot({ 56 | 57 | plot_z_score(p = input$alpha, 58 | alternative = input$alternative, 59 | z_dense) 60 | }) 61 | 62 | output$z <- renderText({ 63 | if (input$alternative == "one.sided"){ 64 | paste0("z-score=", qnorm(p = input$alpha)) 65 | } else { 66 | paste0("z-score=", qnorm(p = input$alpha/2)) 67 | } 68 | }) 69 | 70 | } 71 | 72 | # Run the application 73 | shinyApp(ui = ui, server = server) 74 | -------------------------------------------------------------------------------- /distribution_shiny_app/distribution_shiny_app.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /distribution_shiny_app/plot_z_score.R: -------------------------------------------------------------------------------- 1 | # This script illustrates z-score. 
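# plot_z_score() draws the standard normal density and, for a given tail
# probability p (alpha), shades the corresponding one-sided or two-sided
# rejection region(s), marking the matching quantile qnorm(p) or qnorm(p/2).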
2 | ## Example: 3 | # plot_z_score(1, "two.sided") 4 | # plot_z_score(0.05, "one.sided") 5 | plot_z_score <- function(p = NULL, 6 | alternative = c("two.sided", "one.sided"), 7 | z_dense = tibble(z_range = seq(-3, 3, by = 0.05), 8 | density = dnorm(z_range), 9 | p_range = pnorm(z_range))){ 10 | 11 | # prep the plot 12 | base_plot <- ggplot(z_dense, aes(x = z_range, y = density)) + 13 | geom_line() 14 | 15 | subtitle_str <- "" 16 | 17 | 18 | 19 | # split to two sided/one sided alternative 20 | if (alternative[1] == "two.sided"){ 21 | z_of_p <- qnorm(p/2) 22 | div_factor <- 0.5 23 | } else { 24 | z_of_p <- qnorm(p) 25 | div_factor <- 1 26 | } 27 | 28 | density_of_p <- dnorm(z_of_p) 29 | 30 | base_plot <- base_plot + 31 | geom_area(data = z_dense %>% filter(p_range <= p*div_factor), 32 | aes(x = z_range, y = density), fill = "lightblue", alpha = 0.5) + 33 | geom_segment(x = -3, xend = z_of_p, 34 | y = density_of_p, yend = density_of_p, color = "red") + 35 | geom_segment(x = z_of_p, xend = z_of_p, y = density_of_p, yend = 0, color = "red") 36 | 37 | if (alternative[1] == "two.sided"){ 38 | base_plot <- base_plot + 39 | geom_area(data = z_dense %>% filter(p_range >= 1 - p*div_factor), 40 | aes(x = z_range, y = density), fill = "lightblue", alpha = 0.5) + 41 | geom_segment(x = 3, xend = -z_of_p, 42 | y = density_of_p, yend = density_of_p, color = "red") + 43 | geom_segment(x = -z_of_p, xend = -z_of_p, y = density_of_p, yend = 0, color = "red") 44 | 45 | } 46 | 47 | subtitle_str <- paste0(subtitle_str, "p = Phi(z) = pnorm(z) = ", round(p*div_factor, 3), 48 | "; z = qnorm(p) = ", 49 | round(z_of_p, 3)) 50 | 51 | 52 | base_plot + 53 | xlab("z_p") + 54 | ylab("density\ndnorm(z_p)") + 55 | theme_bw() + 56 | ggtitle("The normal distribution", 57 | subtitle = subtitle_str) 58 | 59 | } 60 | -------------------------------------------------------------------------------- /exam_examples/Exam_Moed_A.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exam_examples/Exam_Moed_A.pdf -------------------------------------------------------------------------------- /exam_examples/Exam_Moed_A_answers.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exam_examples/Exam_Moed_A_answers.pdf -------------------------------------------------------------------------------- /exam_examples/Exercise_Examples.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exam_examples/Exercise_Examples.pdf -------------------------------------------------------------------------------- /exam_examples/Exercise_Examples_Answers.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Test Exercises - Answers" 3 | author: "Adi Sarid" 4 | date: "June 2020" 5 | output: pdf_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | -------------------------------------------------------------------------------- /exercises/old/01 - Intro to R/00-Introduction.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "00-Introduction" 3 | author: "Adi Sarid / adi@sarid-ins.co.il" 4 | date: "`r 
Sys.Date()`" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Background 13 | 14 | You are now viewing a document generated via R Markdown. R Markdown is a friendly format for writing R code along with documentation that surrounds it. It's a very powerful tool - a simple text document can then be compiled very easily to an html, a pdf, word and additional formats. 15 | 16 | It is good for documenting and communicating your work to others, and we are going to rely on this format. 17 | 18 | The aim of this exercise is to get you familiar with the RStudio interface, while writing your first R Markdown document along with some R code. 19 | 20 | Ready? Let's get started! 21 | 22 | ## The console and some basic commands 23 | 24 | You can notice that when you open RStudio, the window is divided into 4 segments. In the lower segment (on the left) you can see the console. The console can be used to run R commands. Try running some code in the console, i.e., copy and paste the following code (line by line). 25 | 26 | Note that we are: 27 | 28 | * Using `<-` which is the sign used to set a variable's value. 29 | * We are using `a` and `b` as variable names. 30 | * `cars`, which is an example dataset. 31 | * `?` which is used to get help on commands. 32 | * `plot()` which is used to generate a base-r plot. 33 | 34 | ``` 35 | 1 + 1 36 | a <- 1 37 | b <- 2 38 | a + b 39 | cars 40 | ?cars 41 | ?plot 42 | plot(cars) 43 | ``` 44 | 45 | ### Answer/note these: 46 | 47 | Decipher what each command did. 48 | 49 | a. Did you notice the help pop up when you used `?cars` and `?plot`? 50 | b. Did you notice where the plot came up? 51 | 52 | Look at the console and click Ctrl+L. What happend? 53 | 54 | Set the marker in the console window, click on Ctrl + up arrow. What happened? 55 | 56 | Finally, type the letter c in the console and click Ctrl + up arrow. What does this do? 57 | 58 | *** 59 | 60 | ## File types you can use in RStudio 61 | 62 | Throughout the course we will rely heavily on writing in RMarkdown, however there are some more file types in RStudio. Use the file menu (File -> New -> R Script) to open up a new script, in the script type the code from the previous part above and click Ctrl + Shift + Enter. See what happens. 63 | 64 | Now mark the last two lines and click Ctrl + Enter (without the shift). What is the difference? 65 | 66 | When do you think you would use a script file versus an R Markdown file? 67 | 68 | Another important option is using R Projects. When you open up a new project it will generate an environment file called .Rproj. This file will preserve the relative location of the directory, and this will make it slightly easier to load and save files from within R (it will be a huge benefit later on). 69 | 70 | Let's start a new project: Click on File -> New Project 71 | Follow the wizard's instructions to open up a new project with project type "New Project". 72 | Name it "My first R project", and check the two checkboxes for "Create a .git repository" and for "Open in new session". 73 | 74 | In the new RStudio window that appeared, open up a new RMarkdown file, call it "My brand new RMarkdown", keep the default output format as html, and click OK. Save the RMarkdown file inside the directory (click on the save button in the upper left corner or on File -> Save). 75 | 76 | Now, click on "Knit" (Ctrl + Shift + K). 77 | 78 | ## Master addtional shortcuts and RStudio panes. 79 | 80 | Click on Alt+Shift+K. 
This is the shortcut window - it can help you if you ever forget shortcuts. 81 | Look at the RStudio window and try to answer the following short questions (please ask the one sitting next to you if you are lost!!!) 82 | 83 | 1. Where can you see all the environment variables currently loaded? 84 | 2. Where can you access the history of all the commands you previously ran? 85 | 3. Where would you look for the packages that are available to you? (RStudio has a pane for it, you just have to look for it) 86 | 4. If you are familiar with Git, how would you commit a file from within RStudio? 87 | 5. How would you find cheatsheets from within RStudio's menus? 88 | 89 | 90 | *** 91 | 92 | Congratulations! you've completed your very first exercise in R. 93 | -------------------------------------------------------------------------------- /exercises/old/01 - Intro to R/01- More Operations.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "01 Matrices dataframes and more" 3 | author: "Afek Adler" 4 | date: "r Sys.Date()" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Random Numbers 13 | Random Numbers allow us to simulate situations easily 14 | Setting seed allow us to reproduce the results of an experiment 15 | 16 | ``` {r Seed} 17 | rnorm(5) 18 | rnorm(5) 19 | set.seed(5) 20 | rnorm(5) 21 | set.seed(5) 22 | rnorm(5) 23 | ``` 24 | 25 | 26 | ``` {r Random vectors} 27 | rand_vec = rpois(n = 50, lambda = 10) 28 | mean(rand_vec) 29 | median(rand_vec) 30 | length(rand_vec) 31 | 32 | ``` 33 | ``` {r Random matrices} 34 | r <- 1000 #rows 35 | c <- 3 # columns 36 | rand_mat = matrix(runif(r*c), r, c) 37 | class(rand_mat) 38 | dim(rand_mat) 39 | dim(t(rand_mat)) 40 | rand_mat[5,2] 41 | # rand_mat[5,10] error 42 | ``` 43 | 44 | 45 | ``` {r head,tail, and sample} 46 | head(rand_mat,2) 47 | ``` 48 | 49 | 50 | ``` {r dataframes} 51 | cols <- c('a','b','c') 52 | df <- as.data.frame(rand_mat) 53 | names(df) <- cols 54 | head(df,2) 55 | ``` 56 | 57 | 58 | ``` {r read write csv} 59 | write.csv(df, "example_file.csv") 60 | new_file <- read.csv("example_file.csv") 61 | ``` 62 | ``` {r} 63 | sapply(df, class) 64 | sapply(new_file, class) 65 | ``` 66 | 67 | ``` {r} 68 | df$new_col = (df$a> 0.5)*1 69 | sapply(df, class) 70 | df$new_col = as.factor(df$new_col) 71 | sapply(df, class) 72 | ``` 73 | ``` {r} 74 | x <-rnorm(100) 75 | y <-rnorm(100) 76 | plot(x, y, xlab="This is the x-axis", ylab="This is the y-axis",main="Plot of X vs Y") 77 | ``` 78 | 79 | [normal distribution deriviation - recommended](https://www.youtube.com/watch?v=cTyPuZ9-JZ0) 80 | -------------------------------------------------------------------------------- /exercises/old/01 - Intro to R/01-Syntax, functions, loops, data types.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "01-Syntax, base, functions, loops and data types" 3 | author: "Adi Sarid / adi@sarid-ins.co.il" 4 | date: "`r Sys.Date()`" 5 | output: 6 | pdf_document: default 7 | html_document: default 8 | --- 9 | 10 | ```{r setup, include=FALSE} 11 | knitr::opts_chunk$set(echo = TRUE) 12 | ``` 13 | 14 | ## Goals of this exercise 15 | 16 | Familiarize yourself with the basics of R. This exercise encompasses: 17 | 18 | 1. Setting variables 19 | 2. Various data types, 20 | 3. Working with vectors, data.frames 21 | 4. Installing and loading packages 22 | 5. 
Building functions and base-R iterations 23 | 24 | 25 | 26 | ## Setting variables 27 | 28 | A variables can be defined using the arrow notation (which we have already seen in the previous lesson `<-`). You can also do the same with `=` but its less common and should not be used (apart from a specific case, in function arguments, which we will discuss later). 29 | 30 | ```{r setting new variables} 31 | a <- 1 32 | b <- 2 33 | c = 3 # just to show that this works 34 | 35 | a 36 | b 37 | c 38 | a*b 39 | b*c 40 | 41 | ``` 42 | 43 | Try to run the following code in the console, and think about questions which follow. 44 | ``` 45 | d <- c(a + b) 46 | d2 <- a + b 47 | d3 <- c*(a + b) 48 | ``` 49 | 50 | What does the first line do? 51 | What is the difference between the first line and the second line? (there is a difference even though the result is the same) 52 | What does the third line do? 53 | Why shouldn't you use `c` as a variable name? 54 | 55 | (hint: type `help("c")` in the console) 56 | 57 | 58 | Do not mix the assignment operator `=` (which I told you not to use), with the test equality operator `==`. 59 | 60 | ```{r test equality} 61 | 62 | a == 1 63 | b == a 64 | b > a 65 | 66 | ``` 67 | 68 | Also note the use of logicals: 69 | Please explain what each of the following operators do: `& | ! !=`, you can use the following (and modify it in any way): 70 | 71 | ``` 72 | # TRUE FALSE and the likes 73 | TRUE & FALSE 74 | TRUE | FALSE 75 | TRUE & TRUE 76 | !TRUE 77 | FALSE != TRUE 78 | FALSE == FALSE 79 | ``` 80 | 81 | Try the following code. Bonus points if you can explain what's wrong with it (and why that is). 82 | 83 | ```{r two is not two} 84 | 85 | sqrt2 <- sqrt(2) 86 | sqrt2 87 | 2 == sqrt2^2 88 | 89 | ``` 90 | 91 | ## Data types 92 | 93 | R has a number of "basic" data types: 94 | 95 | * Integer 96 | * Numeric (double) 97 | * Date (posix) 98 | * Factors 99 | * Logicals 100 | 101 | You can use `c()`, `rbind()`, `cbind()` to piece values together into vectors or more complex structures. 102 | Run the following code. 103 | 104 | ```{r} 105 | integer_example <- 10L 106 | integer_example 107 | numeric_example <- pi # pi is a reserved word... 108 | numeric_example 109 | character_examples <- "hello world" 110 | character_examples 111 | date_example <- as.Date("2018-10-01") 112 | date_example 113 | factor_example <- as.factor(c("big", "big", "small", "medium", "small", "big", "bigger")) 114 | factor_example 115 | summary(factor_example) 116 | logical_example <- c(TRUE, TRUE, FALSE, TRUE) 117 | logical_example 118 | ``` 119 | 120 | 121 | *** 122 | 123 | Using the `c()` command try to piece together the `logical_example` with the `factor_example`, i.e. (replace the `???` with something else): 124 | 125 | ``` 126 | c(logical_example, ???) 127 | ``` 128 | What happend to the factor vector? does the resulting vector make sense? 129 | 130 | Do the same with the `date_example` and the `factor_example`. What happend now? What precautions would you take when working with factors? 131 | 132 | *** 133 | 134 | ## data frames 135 | 136 | Data frames are a more complex structure which contains mixed data. R comes bundled with a number of "classical" data frames. Try the following: 137 | 138 | ``` 139 | mtcars 140 | iris 141 | ?mtcars 142 | ?iris 143 | ``` 144 | 145 | What types are the variables (columns) in each of these data sets? (double/factor/date/logical/integer/character) 146 | 147 | ## Packages 148 | 149 | An R package is a bundle of functions which share a common goal or vision. 
So far, we've been using base-r. The `tidyverse` packages is a package of packages. We will be working a lot with it. Let's try to load `tidyverse`. 150 | 151 | ``` 152 | library(tidyverse) 153 | ``` 154 | 155 | Did that work? if you got an error message you might need to install it. The following code will download and install the tidyverse. Be warned, this takes long. 156 | 157 | ``` 158 | install.packages("tidyverse") 159 | ``` 160 | 161 | Now if you installed the package, try to load it again `library(tidyverse)`. To use a function after you loaded a packages you can call `function_name(arg1 = ..., arg2 = ..., ...)`. Use `glimpse` to verify your answers for the previous questions (what types are the variables in mtcars and iris): 162 | 163 | ```{r tidyverse} 164 | library(tidyverse) 165 | 166 | glimpse(iris) 167 | glimpse(mtcars) 168 | 169 | ``` 170 | 171 | Use the function `count` to answer: 172 | 173 | How many flower-types are there in `iris`? 174 | How many cylinder values are there in `mtcars`? 175 | 176 | ``` 177 | count(iris, Species) 178 | count(???, cyl) 179 | 180 | ``` 181 | 182 | Later on, we will learn some more convinient ways to answer such questions. 183 | 184 | 185 | ## Functions and iterations - intermediate exercise 186 | 187 | We will discuss some base-R iterations, however, **in real situations you should do all in your power to avoid base-r loops!**. 188 | 189 | In the following exercise you will build a function which computes the Fibonacci series (0, 1, 1, 2, 3, 5, 8, 13, 21,...), and a loop which does the same. You will compare their runtime using `bench::mark()`. 190 | 191 | WARNING: 192 | This might feel like a **relatively complex** exercise if you're not fluent in programming, and it's not directly related to data analysis. The reasone I am giving you this exercise is that it is a great exercise to reherse elements we were discussing, in one single exercise. 193 | 194 | 1. Functions and the concept of recursion (a function calling itself) 195 | 2. Base-r loops 196 | 3. Conditionals (`if...else if...else`). 197 | 198 | First, if you don't know what the Fibonacci series is, go to [wikipedia, Fibonacci number](https://en.wikipedia.org/wiki/Fibonacci_number) and read about it (just the intro, should suffice). 199 | 200 | Complete the following function so that a call to the function will generate the n^th^ Fibonnacci number. Replace the `???`. 201 | Rows which start with the hash sign `#` are comments and will be ignored. 202 | 203 | Also, if you never heard the term "recursion" up until today, you might want to start with the second function `fib_loop`, and then think about the first one `fibonnaci`. 204 | 205 | ``` {r} 206 | fibonnacci <- function(n){ 207 | if (n == 0) { 208 | # starting condition for F_0 209 | return(0) 210 | } else if (n == 1) { 211 | # starting condition for F_1 212 | return(1) 213 | } else { 214 | # use recursion to calculate the number 215 | return(fibonnacci(n-1)+fibonnacci(n-2)) 216 | } 217 | } 218 | 219 | fibonnacci(30) 220 | 221 | ``` 222 | 223 | An alternative way to compute the Fibonnacci is via loops. Complete the following code: 224 | 225 | ``` {r} 226 | fib_loop <- function(n){ 227 | len <- n 228 | fibvals <- numeric(len) 229 | fibvals[1] <- 1 230 | fibvals[2] <- 1 231 | for (i in 3:len) { 232 | fibvals[i] <- fibvals[i-1]+fibvals[i-2] 233 | } 234 | return( tail(fibvals,1)) 235 | } 236 | fib_loop(30) 237 | 238 | ``` 239 | 240 | Check that your are getting consistent results. 
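(Added note, not part of the original exercise.) The plain recursive `fibonnacci` recomputes the same sub-problems over and over, which is why the benchmark below shows such a large gap. As a sketch only, here is a third, hypothetical variant (`fib_memo` is our name, not from the exercise) that caches intermediate results:

```{r fib memo sketch}
fib_memo <- local({
  cache <- c(0, 1)  # cache[i + 1] stores the i-th Fibonacci number (F_0 = 0, F_1 = 1)
  function(n) {
    if (n + 1 > length(cache)) {
      # Recall() calls the currently running function, i.e. recursion with caching
      cache[n + 1] <<- Recall(n - 1) + Recall(n - 2)
    }
    cache[n + 1]
  }
})
fib_memo(30)  # should agree with fibonnacci(30) and fib_loop(30)
```

If you are curious, you can pass `fib_memo(30)` as a third argument to `bench::mark()` below and see how much the caching helps.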
Now, compare the two functions using: 241 | 242 | ``` 243 | install.packages("bench") # if it is not installed 244 | bench::mark(fibonnacci(30), fib_loop(30)) 245 | ``` 246 | 247 | Which method is quicker? 248 | 249 | Note the use of `::`. I didn't mention this earlier, but instead of loading the package entirely with `library(bench)`, we're just calling the function `mark` from the package `bench` directly, using the double `::`. -------------------------------------------------------------------------------- /exercises/old/02/02.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "EX 02 - Data Handling & MLE" 3 | author: "Afek Adler" 4 | date: "`r Sys.Date()`" 5 | output: 6 | html_document : 7 | number_sections: TRUE 8 | --- 9 | Last exercise we did: 10 | 11 | * Spoke about the R language and saw examples. 12 | * Reminded you to go over basic probability distributions (Normal, Binomial, etc.) 13 | 14 | 15 | Today's topics: 16 | 17 | 1. Basic theory which will be the basis for the coming weeks. 18 | 2. Solve a question. 19 | 20 | **HW 01 is out - submission in two weeks from now** 21 | 22 | 23 | ## Revision - the normal distribution 24 | \[f\left(x | \mu, \sigma^{2}\right)=\frac{1}{\sqrt{2 \pi \sigma^{2}}} e^{-\frac{(x-\mu)^{2}}{2 \sigma^{2}}}\] 25 | 26 | 27 | ## Expectation and Variance of the sample sum (i.i.d) 28 | i.i.d stands for independent and identically distributed random variables. 29 | \[E\left(\sum_{i=1}^{n} X_{i}\right)=\sum_{i=1}^{n} E\left(X_{i}\right)= \sum_{i=1}^{n} \mu = n\mu \] 30 | 31 | \[V\left(\sum_{i=1}^{n} X_{i}\right)=\sum_{i=1}^{n} V\left(X_{i}\right)= \sum_{i=1}^{n} \sigma^{2} = n\sigma^{2} \] 32 | 33 | ## Expectation and Variance of the sample mean (i.i.d) 34 | \[E(\bar{X})=E\left(\frac{1}{n} \sum_{i=1}^{n} X_{i}\right)=\frac{1}{n} \sum_{i=1}^{n} E\left(X_{i}\right)=\frac{1}{n} \sum_{i=1}^{n} \mu=\frac{1}{n} \cdot n \mu=\mu\] 35 | 36 | 37 | \[V(\bar{X})=V\left(\frac{1}{n} \sum_{i=1}^{n} X_{i}\right)=\frac{1}{n^{2}} \sum_{i=1}^{n} V\left(X_{i}\right)=\frac{1}{n^{2}} \sum_{i=1}^{n} \sigma^{2}=\frac{1}{n^{2}} \cdot n \sigma^{2}=\frac{\sigma^{2}}{n}\] 38 | 39 | ## Bias variance decomposition 40 | \[\operatorname{MSE}(\hat{\Theta})=E(\hat{\Theta}-\theta)^{2}\] 41 | \[ \operatorname{MSE}(\boldsymbol{\hat{\Theta}})=E[\boldsymbol{\hat{\Theta}}-E(\boldsymbol{\hat{\Theta}})]^{2}+[\theta-E(\boldsymbol{\hat{\Theta}})]^{2}\] 42 | \[=V(\hat{\Theta})+(\text { bias })^{2}\] 43 | 44 | The MSE of an estimator is a criterion for choosing the best estimator: 45 | 46 | * Even if an estimator is unbiased, it does not necessarily have the lowest MSE 47 | 48 | *** 49 | The distribution of a sample (i.i.d) from the normal distribution (on board). 50 | *** 51 | 52 | 53 | 54 | ## Central limit theorem 55 | Let there be $n$ i.i.d. random variables such that 56 | \[E(X_i) = \mu,\quad V(X_i) = \sigma^{2} \] 57 | Then, for a large $n$, approximately: 58 | \[\sum_{n} X_{i} \sim N\left(n \mu, n \sigma^{2}\right)\] 59 | \[\bar{X} \sim N\left(\mu, \frac{\sigma^{2}}{n}\right)\] 60 | 61 | 62 | If $X_i$ is normally distributed, then this holds exactly for every $n$.
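(Added illustration, not part of the original exercise notes.) A quick simulation sketch of the central limit theorem: sample means of a skewed exponential distribution already look approximately normal for a moderate sample size. The sample size and number of replications are arbitrary choices:

```{r clt simulation sketch}
set.seed(42)
n <- 40                                                   # sample size (arbitrary)
sample_means <- replicate(5000, mean(rexp(n, rate = 1)))  # Exp(1): E(X) = 1, V(X) = 1
hist(sample_means, breaks = 40, probability = TRUE,
     main = "Sample means of Exp(1), n = 40", xlab = "sample mean")
curve(dnorm(x, mean = 1, sd = 1 / sqrt(n)), add = TRUE, lwd = 2)  # CLT approximation
```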
63 | 64 | ## Deriving an unbiased estimate for $\sigma^{2}$ ($S^{2}$) 65 | 66 | \[E\left(S^{2}\right)=E\left[\frac{\sum_{i=1}^{n}\left(X_{i}-\bar{X}\right)^{2}}{n-1}\right]=\frac{1}{n-1} E \sum_{i=1}^{n}\left(X_{i}-\bar{X}\right)^{2}\] 67 | 68 | \[=\frac{1}{n-1} E \sum_{i=1}^{n}\left(X_{i}^{2}+\bar{X}^{2}-2 \bar{X} X_{i}\right)=\frac{1}{n-1} E\left(\sum_{i=1}^{n} X_{i}^{2}-n \bar{X}^{2}\right)\] 69 | 70 | \[=\frac{1}{n-1}\left[\sum_{i=1}^{n} E\left(X_{i}^{2}\right)-n E\left(\bar{X}^{2}\right)\right]\] 71 | 72 | Recall from the probability course - 73 | \[V(X) = E(X^2) - [E(X)]^2\] 74 | We can deduce that $E\left(X_{i}^{2}\right)=\mu^{2}+\sigma^{2}$ and $E\left(\bar{X}^{2}\right)=\mu^{2}+\sigma^{2} / n$ 75 | So - 76 | \[E\left(S^{2}\right)=\frac{1}{n-1}\left[\sum_{i=1}^{n}\left(\mu^{2}+\sigma^{2}\right)-n\left(\mu^{2}+\sigma^{2} / n\right)\right]\] 77 | \[=\frac{1}{n-1}\left(n \mu^{2}+n \sigma^{2}-n \mu^{2}-\sigma^{2}\right) = \sigma^{2} \] 78 | 79 | 80 | # Some distributions that we will use in the future 81 | 82 | ## Student's t-distribution 83 | In probability and statistics, Student's t-distribution (or simply the t-distribution) is any member of a family of continuous probability distributions that arises when estimating the mean of a normally distributed population in situations where the sample size is small and the population standard deviation is unknown. It was developed by William Sealy Gosset under the pseudonym Student. 84 | For small n it has heavier tails than the normal distribution; as n grows (roughly n >= 30) it becomes very close to the normal distribution, and it gets closer as n increases. 85 | 86 | ```{r T, echo=FALSE, out.width = '50%'} 87 | knitr::include_graphics('t.png') 88 | ``` 89 | 90 | 91 | ## The chi-squared distribution 92 | In probability theory and statistics, the chi-square distribution (also chi-squared or χ2-distribution) with k degrees of freedom is the distribution of a sum of the squares of k independent standard normal random variables. The chi-square distribution is a special case of the gamma distribution and is one of the most widely used probability distributions in inferential statistics, notably in hypothesis testing or in construction of confidence intervals.
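(Added illustration, not from the original notes.) A short simulation check that a sum of $k$ squared standard normals indeed follows a $\chi^2_k$ distribution; the choice $k = 5$ is arbitrary:

```{r chi squared simulation sketch}
set.seed(1)
k <- 5
sums_of_squares <- replicate(10000, sum(rnorm(k)^2))   # sum of k squared N(0, 1) draws
hist(sums_of_squares, breaks = 50, probability = TRUE,
     main = paste0("Sum of ", k, " squared standard normals"), xlab = "value")
curve(dchisq(x, df = k), add = TRUE, lwd = 2)           # chi-squared(k) density
```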
93 | 94 | ```{r X^2, echo=FALSE, out.width = '50%'} 95 | knitr::include_graphics(knitr::include_graphics('x^2.png')) 96 | ``` 97 | 98 | **We will also use the F distribution** 99 | 100 | # Q1 & Q2 - Attached in PDF 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /exercises/old/02/EX 02 Q1 Q2.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/02/EX 02 Q1 Q2.docx -------------------------------------------------------------------------------- /exercises/old/02/EX 02 Q1 Q2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/02/EX 02 Q1 Q2.pdf -------------------------------------------------------------------------------- /exercises/old/02/t.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/02/t.png -------------------------------------------------------------------------------- /exercises/old/02/x^2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/02/x^2.png -------------------------------------------------------------------------------- /exercises/old/03/03- Point estimation and dplyr package.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "EX 03 - Data Handling & MLE" 3 | author: "Afek Adler" 4 | date: "`r Sys.Date()`" 5 | output: 6 | pdf_document: default 7 | html_document: 8 | number_sections: yes 9 | --- 10 | 11 | ```{r setup, include=FALSE} 12 | knitr::opts_chunk$set 13 | ``` 14 | 15 | 16 | Last excercise we did: 17 | 18 | * Expectency and Variance of the sample mean and sample sum 19 | * Central limit theoram 20 | * Bias variance decomposition of a point estimator 21 | * Derived an unbiased estimate for $\sigma^{2}(S^{2})$ 22 | * Covered the student's t-distribution and chi square distribution 23 | 24 | Today we will: 25 | 26 | * Cover methods for point estimattion 27 | * Get to know `dplyr` package 28 | * Try to develop a feeling for bayesian estimation. 29 | 30 | 31 | # Loss function 32 | A quick recap of the MSE of an estimator: 33 | \[\operatorname{MSE}(\hat{\Theta})=E((\hat{\Theta}-\theta)^{2})\] 34 | 35 | The squared loss did not come from heaven but from convienince. for example, another good criterion can be: 36 | 37 | \[\operatorname{MAE}(\hat{\Theta})=E(|\hat{\Theta}-\theta)|)\] 38 | 39 | Or many other types of error function. 40 | Also, at the lecture you have seen an *example* of Bayesian estimation where $\hat{\theta}_{\mathrm{MMSE}}=\int \theta \mathrm{p}(\theta | \mathbf{x}) \mathrm{d} \theta=\mathrm{E}(\theta | \mathbf{x})$ ,the derivation of this formula was taken under assumption of a square loss but there are also many other bayesian estimators like the maximum a posteriori estimation - $\hat{\theta}_{\mathrm{MAP}}=\underset{\theta}{\arg \max } p(\mathbf{\theta}| x)$. 41 | In the end of the excercise we will go deeper into this subject. 42 | 43 | 44 | # Point estimaion 45 | 46 | ## Some nice to have charactraists 47 | 48 | * Unbiased. 
If $E(\hat{\theta}) = \theta$ 49 | * Consistent. If the variance of the estimator tends to 0 as $n$ tends to $\infty$ 50 | 51 | Remember that the sample mean is an unbiased estimator of the population mean 52 | 53 | ## Point estimates with the method of moments (MOM) 54 | 55 | The first moment 56 | \[E(X)=\frac{\sum_{i=1}^{n} X_{i}}{n} \Rightarrow E(X)=\bar{X}\] 57 | The second moment 58 | \[E\left(X^{2}\right)=\frac{\sum_{i=1}^{n} X_{i}^{2}}{n} \Rightarrow \] 59 | \[V(X)=E\left(X^{2}\right)-E^{2}(X) \Rightarrow V(X)=\frac{\sum_{i=1}^{n} X_{i}^{2}}{n}-(\bar{X})^{2}\] 60 | 61 | 62 | **Q1: MOM** 63 | 64 | Let \[X = \mathcal{U}\left(\theta , \theta + 6 \right)\] 65 | Estimate $\theta$ with the method of moments \ 66 | 67 | The expectation of this uniform distribution is \[E(X) = (\theta + \theta+ 6 )/2 = \theta +3\] 68 | And \[E(X) = \bar{X} = \theta +3\] 69 | by the equation of the first moment. 70 | Therefore \[\hat{\theta} = \bar{X} -3\] 71 | 72 | 73 | 74 | ## Point estimates with maximum likelihood estimation (MLE) 75 | In statistics, maximum likelihood estimation (MLE) is a method of estimating the parameters of a probability distribution by maximizing a likelihood function, so that under the assumed statistical model the observed data is most probable. The point in the parameter space that maximizes the likelihood function is called the maximum likelihood estimate. The logic of maximum likelihood is both intuitive and flexible, and as such the method has become a dominant means of statistical inference. 76 | 77 | If the likelihood function is differentiable, the derivative test for determining maxima can be applied. In some cases, the first-order conditions of the likelihood function can be solved explicitly; for instance, the ordinary least squares estimator maximizes the likelihood of the linear regression model. Under most circumstances, however, numerical methods will be necessary to find the maximum of the likelihood function. ("Wikipedia") 78 | 79 | **Q2: MLE** 80 | With the binomial distribution - suppose we had a trial with 49 successes out of 80. 81 | 82 | \begin{equation} 83 | L(p)=f_{D}(\mathrm{H}=49 | p)=\left(\begin{array}{c}{80} \\ {49}\end{array}\right) p^{49}(1-p)^{31}\end{equation} 84 | \begin{equation} 85 | 0=\frac{\partial}{\partial p}\left(\left(\begin{array}{c}{80} \\ {49}\end{array}\right) p^{49}(1-p)^{31}\right) , \{\text{discard the binomial coefficient}\} 86 | \end{equation} 87 | \begin{equation} 88 | 0=49 p^{48}(1-p)^{31}-31 p^{49}(1-p)^{30}, \{(uv)' = u'v + v'u\} 89 | \end{equation} 90 | \begin{equation} 91 | =p^{48}(1-p)^{30}[49(1-p)-31 p] 92 | \end{equation} 93 | \begin{equation} 94 | =p^{48}(1-p)^{30}[49-80 p] 95 | \end{equation} 96 | This can also be solved by applying the log to the likelihood. 97 | It's clear that the maximum is at p = 49/80.
But let's see how we do it in R using the built-in [optimize](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/optimize.html) function: 99 | 100 | ``` {r find MLE} 101 | likelihood <- function(p) { 102 | p^49*((1-p)^31) 103 | } 104 | tolerance <- 10^(-4) 105 | pmax <- optimize(likelihood, c(0, 1), tol = tolerance, maximum = T)[[1]] 106 | delta <- abs(pmax - (49/80)) 107 | delta 108 | ``` 109 | 110 | # HW1 q3 111 | 112 | # Best practices for data handling with R 113 | 114 | R main data types: 115 | 116 | * vectors 117 | * matrices 118 | * data.frame - matrices with metadata and added functionality, which allow multiple data types 119 | * tibbles - a modern take on data frames 120 | 121 | `dplyr` is a grammar of data manipulation, providing a consistent set of verbs that help you solve the most common data manipulation challenges: 122 | 123 | * `mutate()` adds new variables that are functions of existing variables. 124 | * `select()` picks variables based on their names. 125 | * `filter()` picks cases based on their values. 126 | * `summarize()` reduces multiple values down to a single summary. 127 | * `arrange()` sorts the rows. 128 | 129 | ```{r Imports, message=FALSE} 130 | library(tidyverse) 131 | library(nycflights13) 132 | ``` 133 | 134 | 135 | This dataset has 19 columns, so the `head` function is not that useful when knitting to html. 136 | It is always useful to know how many missing values we have in our dataset; sometimes missing values are not just given to us as NA. 137 | ```{r describe dataset} 138 | head(flights, 2) 139 | colSums(is.na(flights))/nrow(flights) 140 | sapply(flights, class) 141 | ``` 142 | *** 143 | **At home - find a better way to print the classes and the % of missing values in R**
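(Added hint, not part of the original exercise, and certainly not the only answer.) One possible direction: `purrr` (loaded with the tidyverse) can map over the columns of `flights` and collect everything into a single tibble:

```{r classes and missingness sketch}
tibble(
  column      = names(flights),
  class       = purrr::map_chr(flights, ~ paste(class(.x), collapse = "/")),
  pct_missing = purrr::map_dbl(flights, ~ 100 * mean(is.na(.x)))
)
```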
144 | 145 | 146 | ## `select()` picks variables based on their names. 147 | 148 | ```{r select method} 149 | flight_distance_airtime <- flights %>% select(distance, air_time) 150 | flight_distance_airtime %>% head(2) 151 | ``` 152 | 153 | 154 | ## `mutate()` adds new variables that are functions of existing variables. 155 | ```{r mutate method} 156 | flight_distance_airtime %>% mutate(mean_speed = distance/air_time) %>% head(2) 157 | ``` 158 | If you only want to keep the new variables, use `transmute()`: 159 | ```{r transmute method} 160 | flight_distance_airtime %>% transmute(mean_speed = distance/air_time) %>% head(2) 161 | ``` 162 | 163 | ## `filter()` picks cases based on their values. 164 | ```{r filter method} 165 | flights %>% filter(is.na(dep_delay)) %>% head(2) 166 | ``` 167 | 168 | 169 | ## `arrange()` sorts the rows based on their values. 170 | ```{r arrange method} 171 | flights %>% arrange(desc(month)) %>% head(2) 172 | ``` 173 | 174 | ## `summarize()` reduces multiple values down to a single summary. 175 | 176 | ```{r summarize method} 177 | by_month <- group_by(flights, month) 178 | by_month %>% summarise(count = n()) %>% 179 | ggplot(mapping = aes(x = month, y = count)) + geom_bar(stat = "identity") + coord_cartesian(ylim = c(2*10^4, 3*10^4)) 180 | ``` 181 | 182 | ```{r another way} 183 | ggplot(data = flights) + 184 | geom_bar(mapping = aes(x = month)) + 185 | coord_cartesian(ylim = c(2*10^4, 3*10^4)) 186 | ``` 187 | 188 | 189 | Additional resources: 190 | 191 | + [r4ds](https://r4ds.had.co.nz/transform.html) 192 | + [dplyr](https://dplyr.tidyverse.org/) 193 | + [dplyr cheat sheet](https://github.com/rstudio/cheatsheets/blob/master/data-transformation.pdf) 194 | 195 | # Bayesian estimation 196 | 197 | For a given loss function, we want to minimize the expected posterior loss - 198 | \[\int L(\hat{\theta}, \theta)\, p(\theta|x)\, d\theta\] 199 | 200 | In the lecture, we have seen that when $L(\hat{\theta},\theta) = (\hat{\theta}- \theta)^2$ 201 | then $\hat{\theta} = E(\text{posterior})$; other loss functions lead to different estimators (like we have seen above). 202 | The logic of this method is as follows - we have some distribution over $\theta$, but we need to report a single value. So we formulate an objective function and minimize it with respect to the parameter that we want to find. The most commonly used summaries are the mean, median, and mode of that distribution, and, as we said, these are the Bayesian estimators for different loss functions.
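(Added illustration, not from the original notes.) For the Binomial data of Q2 (49 successes out of 80) with a conjugate Beta prior, the posterior is again a Beta distribution, and the loss functions mentioned above each pick a different summary of it. The Beta(2, 2) prior is an arbitrary choice made only for this sketch:

```{r bayesian estimators sketch}
alpha0 <- 2; beta0 <- 2                  # prior hyper-parameters (arbitrary choice)
successes <- 49; failures <- 31
a_post <- alpha0 + successes             # posterior is Beta(a_post, b_post)
b_post <- beta0 + failures
c(posterior_mean   = a_post / (a_post + b_post),            # minimizes squared loss
  posterior_median = qbeta(0.5, a_post, b_post),            # minimizes absolute loss
  posterior_mode   = (a_post - 1) / (a_post + b_post - 2))  # MAP estimator
```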
203 | 204 | 205 | 206 | 207 | -------------------------------------------------------------------------------- /exercises/old/03/03-_Point_estimation_and_dplyr_package.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/03/03-_Point_estimation_and_dplyr_package.pdf -------------------------------------------------------------------------------- /exercises/old/03/ex03.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/03/ex03.zip -------------------------------------------------------------------------------- /exercises/old/03/hw1_q3_solution.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/03/hw1_q3_solution.pdf -------------------------------------------------------------------------------- /exercises/old/04/04.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "EX 04 - confidense intervals and MLE motivation" 3 | author: "Afek Adler" 4 | date: "`r Sys.Date()`" 5 | output: 6 | html_document : 7 | number_sections: TRUE 8 | --- 9 | 10 | Last excercise we did: 11 | 12 | * Cover methods for point estimation 13 | * Get to know dplyr package 14 | * Talked about what makes an estimator a good one 15 | 16 | Today we will: 17 | 18 | * Explain again some terms (central limit theorem, what is an estimator, what is a moment) 19 | * Revise MLE with an example 20 | * Talk on interavel estimation 21 | 22 | 23 | ## What is an the difference between $\theta$ and $\hat{\theta}$ 24 | $\theta$ is the population parameter (we will never know it). 25 | $\hat{\theta}$ estimates $\theta$ based on a sample from the population. 26 | 27 | ## What is a moment? 28 | The moments of a function describes it's shape. 29 | In the method of moments we assume that our sample is big enough such that the sample moments are approximately equal to the population moments. And that's how we estimate parameters. 30 | 31 | ## Central limit theoram 32 | Q1 in excercise notes 33 | 34 | 35 | # Another MLE and Moments example 36 | Q2 in excercise notes 37 | 38 | 39 | # Interval Estimation 40 | 41 | ## The student's t distribution 42 | t distribution is used to model the expected values of a **small** sample from a population that is distributed noraml with unknown variance. 43 | As N increases, t distribution is getting closer and closer to the normal distribution. 44 | 45 | Lemma: 46 | $\frac{\bar{X}-\mu}{S / \sqrt{n}}$ is t- distributed. 47 | 48 | [Student t dist vs normal dist as function of n](https://rpsychologist.com/d3/tdist/) 49 | 50 | ## confidense interval 51 | 52 | In statistics, a confidence interval (CL) is a type of interval estimate, computed from the statistics of the observed data, that might contain the true value of an unknown population parameter. The interval has an associated confidence level, or coverage that, loosely speaking, quantifies the level of confidence that the deterministic parameter is captured by the interval. More strictly speaking, the confidence level represents the frequency (i.e. the proportion) of possible confidence intervals that contain the true value of the unknown population parameter. 
**In other words, if confidence intervals are constructed using a given confidence level from an infinite number of independent sample statistics, the proportion of those intervals that contain the true value of the parameter will be equal to the confidence level**. [wiki](https://en.wikipedia.org/wiki/Confidence_interval) 53 | 54 | There are one sided and two sided confidence intervals. 55 | 56 | ### confidence interval for the mean based on n samples: 57 | Based on our assumptions we get a different distribution for the sample mean; after we figure out how it is distributed, computing the confidence interval is straightforward. For example, for the mean - 58 | 59 | \[\mu \in\left(\bar{X}-(\text{\# of std's for the chosen confidence level})\cdot \operatorname{std}(\bar{X}),\ \bar{X}+(\text{\# of std's for the chosen confidence level})\cdot \operatorname{std}(\bar{X})\right) \] 60 | 1. When the variance is known and the population is assumed to be normally distributed, or n is "big" (n > 30), the sample mean is distributed 61 | \[\mathcal{N}\left(\bar{X} , \frac{\sigma^{2}}{n}\right)\] 62 | So a two sided confidence interval is: 63 | \[\mu \in\left(\bar{X}-Z_{1-\frac{\alpha}{2}} \frac{\sigma}{\sqrt{n}},\bar{X}+Z_{1-\frac{\alpha}{2}} \frac{\sigma}{\sqrt{n}}\right)\] 64 | 2. When the variance is **not** known and n is "big" (n > 30), the sample mean is distributed 65 | \[\mathcal{N}\left(\bar{X} , \frac{\hat{\sigma}^{2}}{n}\right)\] 66 | 3. When the variance is **not** known and n is **not** "big" (n <= 30), the sample mean is distributed 67 | \[t_{n-1}\left(\bar{X} , \frac{\hat{\sigma}^{2}}{n}\right)\] 68 | 69 | Reminder: 70 | \[\hat{\sigma}^{2} = S^2 =\frac{\sum_{i=1}^{n}\left(X_{i}-\bar{X}\right)^{2}}{n-1}=\frac{\sum_{i=1}^{n} X_{i}^{2}-n \bar{X}^{2}}{n-1} \] 71 | 72 | For your own understanding, for each case, derive the one sided and two sided confidence interval at home. 73 | 74 | * We have also seen in the lecture a CI for $\sigma^2$, and we will encounter it in HW2 as well. 75 | 76 | ### confidence interval for the proportion based on n samples 77 | If n is large enough, the sample proportion is approximately distributed: 78 | \[\mathcal{N}\left(\hat{p} , \frac{\hat{p}\hat{q}}{n}\right)\] 79 | 80 | **Q1:** 81 | An online advertising company is running an A/B test for a new advertisement and wants to build a confidence interval for the click-through rate (CTR) of a given test, such that the confidence interval will be shorter than 5% at a confidence level of 95%. What is the minimum number of samples for this purpose?
82 | 83 | \[0.05 = Length \geq 2*Z_{1-\frac{\alpha}{2}}*\sqrt{\frac{\hat{p}\hat{q}}{n}} = 2*Z_.975*\sqrt{\frac{\hat{p}\hat{q}}{n}} \Rightarrow\] 84 | \[= 2*1.96*\sqrt{\frac{\hat{p}\hat{q}}{n}} \Rightarrow\ 0.01275 \geq \sqrt{\frac{\hat{p}\hat{q}}{n}} \Rightarrow\ n \geq \frac{\hat{p}\hat{q}}{0.00016} \Rightarrow\] 85 | \[n \geq \frac{\hat{p}\hat{q}}{0.00016} = \frac{0.5*0.5}{0.00016} = 1562.5\] 86 | because \[ p(1-p) \leq 0.5*0.5 \ \forall p \in \{0,1\} \] 87 | 88 | If we know for example that the CTR is bounded by 4% than: 89 | \[n \geq 240 = \frac{0.04*0.96}{0.00016} \geq \frac{\hat{p}\hat{q}}{0.00016}\] 90 | 91 | # CL Verification 92 | Let's verify that indeed when we bouild CI (and..our assumptions are correct) than $1-\alpha$ our parameter is inside the CI: 93 | ``` {r confidense interval} 94 | miu = 10 95 | sigma = 3 96 | n = 10 97 | alpha = 0.1 98 | N_tests <- 10000 99 | counter <- 0 100 | error = qnorm(1-alpha/2)*(sigma/sqrt(n)) 101 | for (i in 1:N_tests) 102 | {sample = rnorm(n,miu,sigma) 103 | sample_mean <- mean(sample) 104 | left <- sample_mean-error 105 | right <- sample_mean + error 106 | between <- (left <= miu) & (miu <= right) 107 | counter <- counter+between} 108 | print(counter/N_tests) 109 | ``` 110 | 111 | 112 | -------------------------------------------------------------------------------- /exercises/old/05/05.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "EX 05 - Intro to hypothesis tests" 3 | author: "Afek Adler" 4 | date: "`r Sys.Date()`" 5 | output: 6 | html_document : 7 | number_sections: TRUE 8 | --- 9 | 10 | Last exercise we did: 11 | 12 | * Talk on interval estimation 13 | * Go throw a reminder on MLE with a motivation for machine learning (a mixture process) 14 | 15 | Today we talk about: 16 | 17 | * The null hypothesis 18 | * General framework hypothesis testing 19 | * Type 1 and type 2 errors 20 | * P value 21 | * The connection between hypothesis testing and confidence intervals 22 | * Hypothesis tests 23 | 24 | Some more topics that were covered in the lecture (will not be in the exercise): 25 | 26 | * Type-II error and determining the sample size 27 | * QQ plot (comparing distributions) 28 | 29 | 30 | # The null hypothesis 31 | In inferential statistics, the null hypothesis is a general statement or default position that there is nothing new happening, like there is no association among groups, or no relationship between two measured phenomena. Testing (accepting, approving, rejecting, or disproving) the null hypothesis—and thus concluding that there are or are not grounds for believing that there is a relationship between two phenomena (e.g. that a potential treatment has a measurable effect)—is a central task in the modern practice of science; the field of statistics gives precise criteria for rejecting a null hypothesis. 32 | The null hypothesis is generally assumed to be true until evidence indicates otherwise. 33 | In statistics, it is often denoted H0, pronounced as "H-nought", "H-null", or "H-zero" (or, even, by some, "H-oh"), with the subscript being the digit 0. 34 | 35 | The concept of a null hypothesis is used differently in two approaches to statistical inference. In the significance testing approach of Ronald Fisher, a null hypothesis is rejected if the observed data are significantly unlikely to have occurred if the null hypothesis were true. In this case, the null hypothesis is rejected and an alternative hypothesis is accepted in its place. 
If the data are consistent with the null hypothesis, then the null hypothesis is not rejected. In neither case is the null hypothesis or its alternative proven; the null hypothesis is tested with data and a decision is made based on how likely or unlikely the data are. This is analogous to the legal principle of presumption of innocence, in which a suspect or defendant is assumed to be innocent (null is not rejected) until proven guilty (null is rejected) beyond a reasonable doubt (to a statistically significant degree). [wiki](https://en.wikipedia.org/wiki/Null_hypothesis) 36 | 37 | # General Framework for hypothesis testing 38 | 39 | This is the procedure for hypothesis testing: 40 | 41 | 1. Identify the parameter of interest (i.e., proportion, expectancy, std, etc.) 42 | 2. State the null hypothesis $H_0$ 43 | 3. Specify the alternative hypothesis $H_1$ (one sided, two sided, etc.) 44 | 4. Choose the significance level 45 | 5. Determine what test statistic to use (e.g., $Z$, $T$, $\chi^2$) 46 | 6. State the rejection region for the statistic 47 | 7. Compute the sample quantities, plug them into the test statistic and compute it 48 | 8. Decide if $H_0$ should be rejected based on 6-7 49 | 50 | # Type 1 and Type 2 errors 51 | 52 | ```{r errors 1,echo=FALSE, out.width = "400px"} 53 | photo_path <- 'https://i.stack.imgur.com/R0ncP.png' 54 | destination_path <- 'type1_type2_errors.png' 55 | if (!(file.exists(destination_path))){ 56 | download.file(photo_path,destination_path, mode = 'wb')} 57 | knitr::include_graphics(destination_path) 58 | ``` 59 | ```{r errors 2,echo=FALSE, out.width = "400px"} 60 | photo_path <- 'https://www.dummies.com/wp-content/uploads/436264.image0.jpg' 61 | destination_path <- 'type1_type2_errors2.jpg' 62 | if (!(file.exists(destination_path))){ 63 | download.file(photo_path,destination_path, mode = 'wb')} 64 | knitr::include_graphics(destination_path) 65 | 66 | ``` 67 | 68 | On board: 69 | 70 | * $\alpha$ - the probability of rejecting *$H_0$* when *$H_0$* is true (type I error) 71 | * $1 - \alpha$ 72 | * $\beta$ - the probability of not rejecting *$H_0$* when *$H_1$* is true (type II error) 73 | * $1- \beta$ 74 | * $p_{value}$ - Intuition - a low $p_{value}$ means it's not very likely that $H_0$ generated our sample 75 | * Rejection region $C$ and its counterpart, the acceptance region $\overline{C}$ 76 | 77 | 78 | # The relationship between hypothesis testing and confidence intervals 79 | It can be shown that $H_0$ is accepted if and only if the confidence interval contains the parameter value assumed under $H_0$. 80 | For example, let's look at the hypothesis test for the mean: 81 | 82 | Acceptance region - 83 | \[\mu_{0}-z_{1-\frac{\alpha}{2}} \frac{\sigma}{\sqrt{n}} \leq \bar{x} \leq \mu_{0}+z_{1-\frac{\alpha}{2}} \frac{\sigma}{\sqrt{n}}\] 84 | 85 | And by re-arranging both sides we get: 86 | 87 | \[\bar{x}-z_{1-\frac{\alpha}{2}} \frac{\sigma}{\sqrt{n}} \leq \mu_{0} \leq \bar{x}+z_{1-\frac{\alpha}{2}} \frac{\sigma}{\sqrt{n}}\] 88 | 89 | Which is exactly the confidence interval. 90 | 91 | # Hypothesis tests 92 | 93 | ## Goodness of fit test: 94 | Goodness of fit tests are used to test how well our empirical distribution fits a theoretical 95 | distribution. 96 | Arrange the empirical distribution in $k$ bins, and let $O_i$ be the observed frequency in the $i$th class bin. Let $E_i$ 97 | be the expected frequency under the hypothesized distribution.
98 | 99 | $H_0$ : our observation are distributed according to ~Y (some distribution)\ 100 | $H_1$ : else 101 | 102 | The test statistic is: 103 | 104 | \[ \chi_{0}^{2}=\sum_{i=1}^{k} \frac{\left(O_{i}-E_{i}\right)^{2}}{E_{i}} \] 105 | 106 | We would reject the hypothesis if $\chi_{0}^{2}>\chi_{\alpha, k-p-1}^{2}$ \ 107 | Where p is the number of parameters of the distribution. 108 | 109 | **Q1 - on board** 110 | **Q2 - on board** 111 | 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /exercises/old/05/EX 05 - Intro to hypothesis tests.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/05/EX 05 - Intro to hypothesis tests.pdf -------------------------------------------------------------------------------- /exercises/old/05/Q1_2.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/05/Q1_2.docx -------------------------------------------------------------------------------- /exercises/old/05/Q1_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/05/Q1_2.pdf -------------------------------------------------------------------------------- /exercises/old/05/type1_type2_errors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/05/type1_type2_errors.png -------------------------------------------------------------------------------- /exercises/old/05/type1_type2_errors2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/05/type1_type2_errors2.jpg -------------------------------------------------------------------------------- /exercises/old/06/EX06.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/06/EX06.docx -------------------------------------------------------------------------------- /exercises/old/06/EX06.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/06/EX06.pdf -------------------------------------------------------------------------------- /exercises/old/07/EX07.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/07/EX07.pdf -------------------------------------------------------------------------------- /exercises/old/07/cs229-notes1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/07/cs229-notes1.pdf 
-------------------------------------------------------------------------------- /exercises/old/08/EX08.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/08/EX08.pdf -------------------------------------------------------------------------------- /exercises/old/08/Ex08.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/08/Ex08.docx -------------------------------------------------------------------------------- /exercises/old/09/ex09.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/09/ex09.docx -------------------------------------------------------------------------------- /exercises/old/09/ex09.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/09/ex09.pdf -------------------------------------------------------------------------------- /exercises/old/10/10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/10/10.pdf -------------------------------------------------------------------------------- /exercises/old/10/first_semester/08.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/10/first_semester/08.pdf -------------------------------------------------------------------------------- /exercises/old/10/first_semester/10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/10/first_semester/10.pdf -------------------------------------------------------------------------------- /exercises/old/10/first_semester/Exam Questiuon.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/10/first_semester/Exam Questiuon.pdf -------------------------------------------------------------------------------- /exercises/old/11/ex 11.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/11/ex 11.docx -------------------------------------------------------------------------------- /exercises/old/11/ex 11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/11/ex 11.pdf -------------------------------------------------------------------------------- /exercises/old/11/first_semester/HW4 Q1 and Q2 solution.docx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/11/first_semester/HW4 Q1 and Q2 solution.docx -------------------------------------------------------------------------------- /exercises/old/11/first_semester/HW4 Q1 and Q2 solution.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/11/first_semester/HW4 Q1 and Q2 solution.pdf -------------------------------------------------------------------------------- /exercises/old/11/first_semester/ex11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/11/first_semester/ex11.pdf -------------------------------------------------------------------------------- /exercises/old/12/12.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/12/12.docx -------------------------------------------------------------------------------- /exercises/old/12/12.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/12/12.pdf -------------------------------------------------------------------------------- /exercises/old/12/first_semester/ex12.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/12/first_semester/ex12.pdf -------------------------------------------------------------------------------- /exercises/old/13/ex 13.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/13/ex 13.pdf -------------------------------------------------------------------------------- /exercises/old/13/first_semester/ex 13.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/exercises/old/13/first_semester/ex 13.pdf -------------------------------------------------------------------------------- /intro_statistics_R.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /labs/answers/food_consumption-answers.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lab - food consumption and carbon footprint - answers" 3 | author: "Adi Sarid" 4 | date: "`r Sys.Date()`" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | library(tidyverse) 11 | ``` 12 | 13 | In this lab we cover the data science work flow. 
The lab will walk you through the steps performed in a data science project: 14 | 15 | * Data import 16 | * Data tidying 17 | * Transformation <-> Visualization <-> Modelling 18 | 19 | The lab will also cover the theoretic elements we covered such confidence intervals and hypothesis tests. This lab is to be performed in groups of 3 (i.e., zoom break rooms). 20 | 21 | # First exercise - open up a new project. 22 | 23 | First, open up a new project. To do this, in RStudio go to: 24 | 25 | * File -> New Project -> New Directory -> New Project. 26 | 27 | Provide your project directory name (under directory name), and click ok. Note that everything will close and RStudio will open up in a clean window. But don't worry, you can always view this file by visiting [this link](https://github.com/adisarid/intro_statistics_R/tree/master/labs/). 28 | 29 | Once a new RStudio instance has opened with a clean window of your new project, open up a new RMarkdown, which you will use to answer your questions to this lab by: 30 | 31 | * File -> New file -> R Markdown... 32 | 33 | Give your RMarkdown file a name and use the html outupt type. In your new RMarkdown file you can delete lines 1-10 and delete everything else in the file (lines 11-31). Try to knit it by clicking ctrl+k. 34 | 35 | Now we are ready to do some analysis. 36 | 37 | # Second exercise - get to know your data 38 | 39 | In this lab we are going to analyze food consumption data from *tidytuesday*. You can read about tidytuesday [here](https://github.com/rfordatascience/tidytuesday) - it's a github repository which is updated every Tuesday with data freely available for analyzing and sharpening your data analysis skills. 40 | Today we will use this dataset: [here](https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-02-18/readme.md). 41 | 42 | In groups, find the original source of the data (the nu3.de website within the links) and discuss: 43 | 44 | * What is the origin of the data? 45 | 46 | * Would you consider the data reliable/trust worthy? 47 | 48 | * How was the carbon footprint computed for each food type and country? 49 | 50 | Using the `read_csv` function from the `readr` package, read the food consumption data. 51 | 52 | * Use the following functions to understand how the data is arranged: `glimpse`, `head`, `View`. 53 | 54 | * Comparing the file you read with the original table, in what sense the file you read is more "tidy"? 55 | 56 | ```{r read the data} 57 | food_consumption <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-18/food_consumption.csv') 58 | 59 | glimpse(food_consumption) 60 | 61 | head(food_consumption) 62 | 63 | ``` 64 | 65 | *** 66 | 67 | **Checkpoint** - make sure you summarize your answers in a convinient way, we are going to discuss them in class. 68 | 69 | *** 70 | 71 | # Third exercise - visualization, and descriptive statistics of food consumption 72 | 73 | In the group, discuss: 74 | 75 | * What influences the consumption of various goods (e.g., pork, beef, fish, wheat, etc.). In which countries would you expect to find a low or high consumption of specific products? 76 | 77 | * Write down an equivalent formula for the variance. This is a theoretic part, you can write your answer in RMarkdown by enclosing it with the $ characters like this. 
RMarkdown interprets such test as LaTeX and creates a formula: 78 | 79 | $$\sigma^2=\operatorname{Var}(X)=E\left[(X-E[X])^2\right]=\ldots?$$ 80 | 81 | * Write down two estimates for the variance $\hat{\sigma}^2$ and $s^2$. Which would you prefer to use and why? 82 | 83 | * What plot(s) would you use to visualize the distribution of consumption of each food type? you can use the `ggplot2` cheatsheet to consider this. Once you reach a conclusion - create the chart. 84 | 85 | ```{r foo consumption distribution} 86 | 87 | ggplot(food_consumption, aes(x = food_category, y = consumption)) + 88 | geom_boxplot() + coord_flip() 89 | 90 | ggplot(food_consumption, aes(x = consumption)) + 91 | geom_histogram() + 92 | facet_wrap(~food_category) 93 | 94 | ggplot(food_consumption, aes(x = consumption)) + 95 | geom_density() + 96 | facet_wrap(~food_category, scales = "free") 97 | 98 | ``` 99 | 100 | * Based on the plot, what products have the highest (or lowest) variance? what is the meaning of having a high (or low) variance in this context of food consumption? 101 | 102 | * Verify this by computing the consumption standard deviation of each product, also add to your computation the mean, median and 1st and 3rd quartiles. You can use `group_by` and `summarize` for this. 103 | 104 | ```{r summary stats} 105 | food_descriptives <- food_consumption %>% 106 | group_by(food_category) %>% 107 | summarize(mean = mean(consumption), 108 | q1 = quantile(consumption, probs = 0.25), 109 | q2 = median(consumption), 110 | q3 = quantile(consumption, probs = 0.75), 111 | sd = sd(consumption)) %>% 112 | arrange(desc(mean)) 113 | food_descriptives 114 | ``` 115 | 116 | *** 117 | 118 | **Checkpoint** - together in class, we're going to discuss and solve the exercise so far. 119 | 120 | *** 121 | 122 | # Fourth exercise - modelling - confidence intervals and hypothesis tests of consumption 123 | 124 | In this part we will create a number of confidence intervals. Follow these steps in order to answer this question: 125 | 126 | * First, decide what kind of confidence interval is to be used (what statistic are you using) and write it down as a formula: 127 | 128 | $$T_{\text{df}=n-1} = \frac{\bar{X} - \mu}{S/\sqrt{n}}$$ 129 | 130 | * And the confidence interval is therefore: 131 | 132 | $$\mu\in\bar{X}\pm t_{\alpha/2,n-1}S/\sqrt{n}$$ 133 | 134 | * Use the tibble you created in the last step of the previous part (`food_descriptives`), to create a confidence interval for all the food categories, with $\alpha=0.05$. Use the following code. 135 | 136 | ```{r confidence intervals t statistic} 137 | 138 | # First find the relevant t for the chosen alpha and the relevant degrees of freedom 139 | t0.05_129 <- qt(p = 0.05/2, df = 129) 140 | 141 | # Now use it on the tibble we computed 142 | food_descriptives %>% 143 | mutate(ci_lower_bound = mean + t0.05_129*sd/sqrt(130), 144 | ci_upper_bound = mean - t0.05_129*sd/sqrt(130)) %>% 145 | select(food_category, ci_lower_bound, mean, ci_upper_bound) 146 | 147 | ``` 148 | 149 | * Check your result using the `t.test` function for Fish. 150 | 151 | ```{r fish ci} 152 | 153 | fish_vector <- food_consumption %>% 154 | filter(food_category == "Fish") %>% 155 | pull(consumption) 156 | 157 | t.test(fish_vector) 158 | 159 | ``` 160 | 161 | * Formulate a hypothesis test which examines the expected consumption of Fish vs. Pork (with $H_0$ and $H_1$). 
162 | 163 | $$H_0: \mu_{\text{pork}} = \mu_{\text{fish}}$$ 164 | $$H_1: \mu_{\text{pork}} \neq \mu_{\text{fish}}$$ 165 | 166 | * Assuming that the variance of pork and fish consumption is the same, what test statistic would you use for this hypothesis? 167 | 168 | $$T=\frac{\bar{X}_1 - \bar{X}_2 - (\mu_1 - \mu_2)}{S_p\sqrt{1/n_1 + 1/n_2}}$$ 169 | 170 | $$S_p = \sqrt{\frac{(n_1-1)S_1^2 + (n_2-1)S_2^2}{n_1 + n_2 -2}}$$ 171 | 172 | With df$=n_1+n_2-2$. 173 | 174 | * Is this a paired or unpaired test? 175 | 176 | * Conduct the test by computing the test statistic and its p-value. You can do this either directly or with the `t.test` function, whichever you prefer. If you are using the `t.test`, note that you have to set the `var.equal` argument (to what?). 177 | 178 | ```{r compare fish and pork} 179 | pork_vector <- food_consumption %>% 180 | filter(food_category == "Pork") %>% 181 | pull(consumption) 182 | 183 | t.test(x = fish_vector, y = pork_vector, var.equal = T) 184 | 185 | ``` 186 | 187 | * How would you have conducted the test if the variances were assumed to be unequal? 188 | 189 | ```{r compare fish and pork unequal variance} 190 | t.test(x = fish_vector, y = pork_vector, var.equal = F) 191 | ``` 192 | 193 | *** 194 | 195 | **Checkpoint** - solving this exercise together in class. 196 | 197 | *** 198 | 199 | # Fifth exercise - visualizing the relationship between meat products and vegan products 200 | 201 | In this final part, we're going to use visualizations to examine the relationship between meat/dairy and vegan products. 202 | 203 | * Reclassify all the `food_category` into two types of products: meat/dairy versus vegan. You can use the definition of the following tibble, along with the `left_join` function, but you will probably need to read about it in the documentation. Another option is to use another function called `case_when` or `recode_factor`. 204 | 205 | ```{r food reclassification} 206 | food_types <- tribble(~food_category, ~food_type, 207 | "Beef", "Meat/Dairy", 208 | "Eggs", "Meat/Dairy", 209 | "Fish", "Meat/Dairy", 210 | "Lamb & Goat", "Meat/Dairy", 211 | "Milk - inc. cheese", "Meat/Dairy", 212 | "Pork", "Meat/Dairy", 213 | "Poultry", "Meat/Dairy", 214 | "Rice", "Vegan", 215 | "Soybeans", "Vegan", 216 | "Wheat and Wheat Products", "Vegan", 217 | "Nuts inc. Peanut Butter", "Vegan") 218 | 219 | food_consumption_reclassified <- food_consumption %>% 220 | left_join(food_types) 221 | 222 | ``` 223 | 224 | * In the result you got, summarize the data such that each country will appear only twice (once for Meat/Dairy and once for Vegan consumption values), with the overall consumption for that type. Select only the `consumption` and `food_type` values. You should be using the functions `group_by` and `summarize`. 225 | 226 | ```{r reclassification summary} 227 | food_consumption_summarised <- food_consumption_reclassified %>% 228 | group_by(country, food_type) %>% 229 | summarize(consumption = sum(consumption)) 230 | ``` 231 | 232 | * We would like to create a chart in which each country is a point, the x axis is Meat/dairy and the y-axis is Vegan consumption. What kind of transformations would you need to do on the previous tibble to prepare it for such a plot? 233 | 234 | * Try to use `pivot_wider` in order to make that transformation, and use `ggplot` to create the chart.
235 | 236 | ```{r consumption chart} 237 | food_consumption_summarised %>% 238 | ungroup() %>% 239 | select(country, consumption, food_type) %>% 240 | pivot_wider(names_from = food_type, values_from = consumption) %>% 241 | ggplot(aes(x = `Meat/Dairy`, y = Vegan)) + 242 | geom_point() 243 | ``` 244 | 245 | * Can you identify any relationships between the two variables? (consumption of Meat/Dairy versus consumption of Vegan food?) 246 | 247 | 248 | *** 249 | 250 | **Checkpoint** - Solve exercise together in class. 251 | 252 | *** -------------------------------------------------------------------------------- /labs/answers/netflix movies and tv shows exercise - answers.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Netflix Movies and TV Shows - Exercise" 3 | author: "Adi Sarid" 4 | date: "2022-04-05" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Background 13 | 14 | The following exercise is based on the Netflix Movies and TV Shows data, extracted from Kaggle ([here](https://www.kaggle.com/datasets/shivamb/netflix-shows)). 15 | 16 | The goal of the exercise is to repeat and familiarize yourselves with the topics we were discussing in the past few weeks, but from a practical perspective. 17 | 18 | In this exercise you will: 19 | 20 | 1. Write your solutions in RMarkdown, combining documentation and code. 21 | 22 | 2. You will import and transform data, using different functions from `tidyverse` which we discussed two weeks ago. 23 | 24 | 3. You will do some visualizations to support and interpret your analysis. 25 | 26 | 4. You will test a number of hypothesis: 27 | 28 | 1. Comparing means. 29 | 30 | 2. Comparing distributions. 31 | 32 | Please do the exercise in pairs/groups. 33 | 34 | ## Reading the data 35 | 36 | Read the data, and use `glimpse` to understand it. 37 | 38 | ```{r reading the data, warning=FALSE, message=FALSE} 39 | library(tidyverse) 40 | netflix_raw <- readr::read_csv("https://raw.githubusercontent.com/adisarid/intro_statistics_R/master/labs/data/netflix_titles.csv") 41 | 42 | glimpse(netflix_raw) 43 | ``` 44 | 45 | In the group, discuss the different variables of the data, what type are they? which of them do you think you should transform? why and how? 46 | 47 | ## Data transformation 48 | 49 | Create a new table called `netflix` in which: 50 | 51 | - `type` is a factor. 52 | 53 | - `country` is a factor. 54 | 55 | - `duration` is numeric. 56 | 57 | - Add a new variable called `duration_units` indicating the units of `duration`. 58 | 59 | - Create a set of new logical variables indicating the title type, i.e., is the title: 60 | 61 | - International Movies 62 | 63 | - Dramas 64 | 65 | - Documentaries 66 | 67 | - Comedies 68 | 69 | - Action and Adventure 70 | 71 | Hint: you probably want to use `str_detect` for that. 
72 | 73 | ```{r transform and tidy} 74 | netflix <- netflix_raw %>% 75 | mutate(type = factor(type), 76 | country = factor(country), 77 | duration = parse_number(duration), 78 | duration_units = if_else(type == "Movie", "minutes", "seasons"), 79 | international_movie = str_detect(listed_in, "International Movies"), 80 | drama = str_detect(listed_in, "Dramas"), 81 | comedies = str_detect(listed_in, "Comedies"), 82 | documentaries = str_detect(listed_in, "Documentaries"), 83 | action_adventure = str_detect(listed_in, "Action & Adventure")) 84 | ``` 85 | 86 | ## TV Shows versus Movies 87 | 88 | Use a chart to plot the frequency (number of appearances per each country, per each type movies versus tv shows, for the following countries: "United States", "India", "United Kingdom", "Japan", "South Korea", "Canada", "Spain", "France", "Mexico"). Ignore titles with multiple countries. What position function should you use to compare the proportion of moveis versus tv shows (position_fill versus position_stack?) 89 | 90 | ```{r country appearances} 91 | netflix %>% 92 | filter(country %in% c("United States", "India", "United Kingdom", "Japan", "South Korea", "Canada", "Spain", "France", "Mexico")) %>% 93 | count(type, country) %>% 94 | ggplot(aes(x = country, fill = type, y = n)) + 95 | geom_col(position = position_fill()) + 96 | coord_flip() 97 | ``` 98 | 99 | ------------------------------------------------------------------------ 100 | 101 | What country is with the most TV shows and what country is with the least TV shows? 102 | 103 | Can you find a similarity between some countries in the same region (i.e., European versus APAC versus North America? 104 | 105 | Conduct a hypothesis test which examines if the proportion of tv shows in South Korea is higher than the proportion of tv shows in Japan. What is the p-value? is it a statistically significant finding (in the $\alpha=0.05$ level? 106 | 107 | ------------------------------------------------------------------------ 108 | 109 | ```{r hypothesis prop test} 110 | netflix_apac <- netflix %>% 111 | filter(country %in% c("Japan", "South Korea")) %>% 112 | count(country, type) 113 | 114 | netflix_apac 115 | 116 | prop.test(x = c(169, 158), n = c(169+76, 158+41)) 117 | ``` 118 | 119 | ## Movie duration 120 | 121 | Plot the distribution of movie duration, discuss the type of distribution (e.g., is it normally distributed or not?). Find the mean, sd, and a 95% confidence interval for movie duration. 122 | 123 | ```{r movie duration} 124 | netflix %>% 125 | filter(type == "Movie") %>% 126 | ggplot(aes(duration)) + 127 | geom_density() 128 | 129 | netflix %>% 130 | group_by(type) %>% 131 | summarize(mean(duration, na.rm = T), 132 | sd(duration, na.rm = T)) 133 | 134 | netflix_movies <- netflix %>% 135 | filter(type == "Movie") 136 | 137 | t.test(netflix_movies$duration) 138 | ``` 139 | 140 | ---- 141 | 142 | 1. Compare the two duration distributions of drama and non-drama movies (via a plot). 143 | 144 | 2. Compute the mean and variance of movie duration of dramas versus non-dramas. 145 | 146 | 3. Formulate and test the following hypothesis tests: 147 | 148 | a. Drama movies are longer than non-drama movies. Is this a paired or non-paired test? 149 | 150 | b. The variance of duration is different between drama and non-drama movies. 
151 | 152 | ```{r drama movie hypothesis} 153 | netflix_movies %>% 154 | group_by(drama) %>% 155 | summarize(mean(duration, na.rm = T), 156 | sd(duration, na.rm = T)) 157 | ggplot(netflix_movies, aes(duration, color = drama)) + 158 | geom_density() 159 | t.test(formula = duration ~ drama, data = netflix_movies) 160 | var.test(formula = duration ~ drama, data = netflix_movies, alternative = "two.sided") 161 | ``` 162 | 163 | 164 | -------------------------------------------------------------------------------- /labs/netflix movies and tv shows exercise.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Netflix Movies and TV Shows - Exercise" 3 | author: "Adi Sarid" 4 | date: "2022-04-05" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Background 13 | 14 | The following exercise is based on the Netflix Movies and TV Shows data, extracted from Kaggle ([here](https://www.kaggle.com/datasets/shivamb/netflix-shows)). 15 | 16 | The goal of the exercise is to repeat and familiarize yourselves with the topics we were discussing in the past few weeks, but from a practical perspective. 17 | 18 | In this exercise you will: 19 | 20 | 1. Write your solutions in RMarkdown, combining documentation and code. 21 | 22 | 2. You will import and transform data, using different functions from `tidyverse` which we discussed two weeks ago. 23 | 24 | 3. You will do some visualizations to support and interpret your analysis. 25 | 26 | 4. You will test a number of hypothesis: 27 | 28 | 1. Comparing means. 29 | 30 | 2. Comparing distributions. 31 | 32 | Please do the exercise in pairs/groups. 33 | 34 | ## Reading the data 35 | 36 | Read the data, and use `glimpse` to understand it. 37 | 38 | ```{r reading the data, warning=FALSE, message=FALSE, eval=FALSE} 39 | library(tidyverse) 40 | netflix_raw <- readr::read_csv("https://raw.githubusercontent.com/adisarid/intro_statistics_R/master/labs/data/netflix_titles.csv") 41 | 42 | glimpse(netflix_raw) 43 | ``` 44 | 45 | In the group, discuss the different variables of the data, what type are they? which of them do you think you should transform? why and how? 46 | 47 | ## Data transformation 48 | 49 | Create a new table called `netflix` in which: 50 | 51 | - `type` is a factor. 52 | 53 | - `country` is a factor. 54 | 55 | - `duration` is numeric. 56 | 57 | - Add a new variable called `duration_units` indicating the units of `duration`. 58 | 59 | - Create a set of new logical variables indicating the title type, i.e., is the title: 60 | 61 | - International Movies 62 | 63 | - Dramas 64 | 65 | - Documentaries 66 | 67 | - Comedies 68 | 69 | - Action and Adventure 70 | 71 | Hint: you probably want to use `str_detect` for that. 
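If `str_detect` is new to you, here is a tiny illustration on a made-up vector (not part of the Netflix data). It returns one `TRUE`/`FALSE` per element, which is exactly what we need for the logical columns:

```{r str detect toy example, eval=FALSE}
# Hypothetical genre strings, just to illustrate the pattern matching:
genres <- c("Dramas, International Movies", "Comedies", "Documentaries")
str_detect(genres, "Dramas")
# [1]  TRUE FALSE FALSE
```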
72 | 73 | (Fill-in the blanks) 74 | 75 | ```{r transform and tidy, eval=FALSE} 76 | netflix <- netflix_raw %>% 77 | mutate(type = factor(___), 78 | country = factor(___), 79 | duration = parse_number(___), 80 | duration_units = if_else(type == "Movie", ___, ___), 81 | international_movie = str_detect(listed_in, "International Movies"), 82 | ___ = str_detect(listed_in, "Dramas"), 83 | comedies = str_detect(___, ___), 84 | documentaries = ___, 85 | action_adventure = ___) 86 | ``` 87 | 88 | ## TV Shows versus Movies 89 | 90 | Use a chart to plot the frequency (number of appearances per each country, per each type movies versus tv shows, for the following countries: "United States", "India", "United Kingdom", "Japan", "South Korea", "Canada", "Spain", "France", "Mexico"). Ignore titles with multiple countries. What position function should you use to compare the proportion of movies versus tv shows (position_fill versus position_stack?) 91 | 92 | ```{r country appearances, eval=FALSE} 93 | netflix %>% 94 | filter(country %in% c("United States", "India", "United Kingdom", "Japan", "South Korea", "Canada", "Spain", "France", "Mexico")) %>% 95 | count(___, ___) %>% 96 | ggplot(aes(x = ___, fill = ___, y = n)) + 97 | geom_col(position = ___) + 98 | coord_flip() 99 | ``` 100 | 101 | ------------------------------------------------------------------------ 102 | 103 | What country is with the most TV shows and what country is with the least TV shows? 104 | 105 | Can you find a similarity between some countries in the same region (i.e., European versus APAC versus North America? 106 | 107 | Conduct a hypothesis test which examines if the proportion of tv shows in South Korea is higher than the proportion of tv shows in Japan. What is the p-value? is it a statistically significant finding (in the $\alpha=0.05$ level? 108 | 109 | ------------------------------------------------------------------------ 110 | 111 | ```{r hypothesis prop test, eval=FALSE} 112 | netflix_apac <- netflix %>% 113 | filter(country %in% ___) %>% 114 | count(country, type) 115 | 116 | netflix_apac 117 | 118 | prop.test(x = c(___, ___), n = c(___, ___)) 119 | ``` 120 | 121 | ## Movie duration 122 | 123 | Plot the distribution of movie duration, discuss the type of distribution (e.g., is it normally distributed or not?). Find the mean, sd, and a 95% confidence interval for movie duration. 124 | 125 | ```{r movie duration, eval=FALSE} 126 | netflix %>% 127 | filter(type == "Movie") %>% 128 | ggplot(aes(___)) + 129 | geom____() 130 | 131 | netflix %>% 132 | group_by(type) %>% 133 | summarize(mean(___, na.rm = ___), 134 | sd(___, na.rm = ___)) 135 | 136 | netflix_movies <- netflix %>% 137 | filter(type == "Movie") 138 | 139 | t.test(___) 140 | ``` 141 | 142 | ---- 143 | 144 | 1. Compare the two duration distributions of drama and non-drama movies (via a plot). 145 | 146 | 2. Compute the mean and variance of movie duration of dramas versus non-dramas. 147 | 148 | 3. Formulate and test the following hypothesis tests: 149 | 150 | a. Drama movies are longer than non-drama movies. Is this a paired or non-paired test? 151 | 152 | b. The variance of duration is different between drama and non-drama movies. 
153 | 154 | ```{r drama movie hypothesis, eval=FALSE} 155 | 156 | netflix_movies %>% 157 | group_by(___) %>% 158 | summarize(mean(duration, na.rm = T), 159 | sd(duration, na.rm = T)) 160 | 161 | ggplot(netflix_movies, aes(duration, color = drama)) + 162 | geom_density() 163 | 164 | t.test(formula = ___, data = ___) 165 | 166 | var.test(formula = ___, data = ___, alternative = "___") 167 | ``` 168 | 169 | 170 | -------------------------------------------------------------------------------- /lectures/00-Introduction.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/00-Introduction.pptx -------------------------------------------------------------------------------- /lectures/00-intro-binomial-dist.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | tibble(Infected = 25:100, 3 | Probability = dbinom(25:100, 300, 0.2)) %>% 4 | mutate(color = if_else(Infected %in% c(40, 50, 55), T, F)) %>% 5 | ggplot(aes(x = Infected, y = Probability, fill = color)) + 6 | scale_fill_manual(values = c("TRUE" = saridr::sarid_colors$light_blue_gradient, "FALSE" = "grey")) + 7 | geom_col() + 8 | guides(fill = F) + 9 | scale_y_continuous(labels = scales::percent_format(1)) + 10 | ggtitle("Density function n=300, p=0.2") 11 | 12 | tibble(Infected = 25:100, 13 | Probability = pbinom(25:100, 300, 0.2)) %>% 14 | mutate(label = if_else(Infected %in% c(40, 50, 55), glue::glue("{round(Probability*100,2)}%"), NA_character_)) %>% 15 | ggplot(aes(x = Infected, y = Probability)) + 16 | geom_line() + 17 | geom_label(aes(label = label)) + 18 | guides(fill = F) + 19 | scale_y_continuous(labels = scales::percent_format(1)) + 20 | ggtitle("Distribution function n=300, p=0.2") 21 | -------------------------------------------------------------------------------- /lectures/00-introduction/00-introduction_script.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction - visualizing and summarizing data" 3 | author: "Adi Sarid" 4 | date: "2019-10-19" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | library(tidyverse) 11 | ``` 12 | 13 | 14 | ## The power lifting data set 15 | 16 | We're going to demonstrate with power lifting data. 17 | This data set comes from tidytuesday (2019-10-08), see the documentation here: 18 | https://github.com/rfordatascience/tidytuesday/tree/master/data/2019/2019-10-08 19 | 20 | 21 | ```{r read the data} 22 | ipf_lifts <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-08/ipf_lifts.csv") 23 | 24 | glimpse(ipf_lifts) 25 | ``` 26 | 27 | ## Scatter plot 28 | 29 | A scatter plot allows us to examine the relationship between two continuous variables (i.e., numeric). For example, the following scatter plot will tell us the relationship between squats and bench presses (two types of exercises, which work on the legs -- quadriceps and the hands -- triceps respectively). We're going to sample observations (because all 40k observations will be too much for the chart). 
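A side note (an optional sketch, not needed for what follows): instead of sampling, over-plotting can also be handled by binning the full data, for example with `geom_bin2d`.

```{r binning instead of sampling}
# Optional alternative to sampling: 2-D binning summarises all rows at once.
ipf_lifts %>%
  filter(!is.na(best3squat_kg) & !is.na(best3bench_kg)) %>%
  ggplot(aes(x = best3squat_kg, y = best3bench_kg)) +
  geom_bin2d() +
  theme_bw()
```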
30 | 31 | ```{r scatter plot squat bench age} 32 | set.seed(0) # used to get consistent results 33 | 34 | # sample a subset of the file 35 | sampled_ipf_lifts <- ipf_lifts %>% 36 | filter(!is.na(best3squat_kg) & !is.na(best3bench_kg)) %>% 37 | sample_n(1500) 38 | 39 | # plot squat versus bench press 40 | ggplot(sampled_ipf_lifts, 41 | aes(x = best3squat_kg, y = best3bench_kg)) + 42 | geom_point(alpha = 0.3) + 43 | theme_bw() 44 | 45 | ggplot(sampled_ipf_lifts, 46 | aes(x = age, y = best3bench_kg)) + 47 | geom_point(alpha = 0.3) + 48 | theme_bw() 49 | 50 | ``` 51 | 52 | Think about what these scatter plots teach us: 53 | 54 | * What is the relationship between bench press weight and squat weight? 55 | * What is the meaning of the point with a negative squat weight measurement? 56 | * What do outlier points look like here? 57 | * Is there a relationship between bench press weight and age? 58 | 59 | Later on, we will learn how to extract the linear relationship between such variables (the equation), when there is such a relationship. This will be dealt with in the linear regression chapter of our course. 60 | 61 | ## Single variable: distribution, mean, median, standard deviation 62 | 63 | Now we discuss how to express the properties of a single, continuous variable. 64 | 65 | ### Histogram and Shape 66 | 67 | The distribution of a variable can be described with a histogram or with a density plot. The two are related (they basically show the same thing). 68 | 69 | ```{r histogram and density} 70 | 71 | ggplot(ipf_lifts, aes(x = age)) + 72 | geom_histogram() + 73 | theme_bw() 74 | 75 | ggplot(ipf_lifts, aes(x = age, y = stat(density))) + 76 | geom_histogram() + 77 | geom_density(color = "red", size = 1) + 78 | theme_bw() 79 | 80 | ipf_lifts %>% 81 | select(starts_with("best3")) %>% 82 | pivot_longer(cols = everything(), names_to = "exercise", values_to = "weight") %>% 83 | filter(weight > 0) %>% 84 | ggplot(aes(x = weight, y = stat(density))) + 85 | geom_density(aes(color = exercise), size = 1, bw = 10) + 86 | theme_bw() 87 | 88 | ``` 89 | 90 | 91 | *** 92 | 93 | Questions: 94 | [Mentimeter edit link](https://www.mentimeter.com/s/c53753031b6cccd429aebeedf531eb1d/fb09c578738d/edit) 95 | 96 | 1. Which exercise has relatively lower weight values? [mentimeter](https://www.menti.com/tgdkyggsnu) 97 | 2. The densities look bi-modal (two peaks). Can you guess why? 98 | 3. Which exercise has higher dispersion? 99 | 4. Can you think of familiar distributions (or a combination of such) which would fit these densities?
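A quick numeric companion to question 3 (an optional check, not one of the original questions) -- comparing the standard deviation of weight for each exercise:

```{r dispersion by exercise}
ipf_lifts %>%
  select(starts_with("best3")) %>%
  pivot_longer(cols = everything(), names_to = "exercise", values_to = "weight") %>%
  filter(weight > 0) %>%
  group_by(exercise) %>%
  summarize(sd_weight = sd(weight, na.rm = TRUE))
```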
100 | 101 | *** 102 | 103 | Here are some familiar forms of distributions: 104 | 105 | ```{r density and histogram demonstrations} 106 | 107 | familiar_distributions <- tibble(values = rnorm(1000), dist_type = "normal(mu=0,sig=1)") %>% 108 | bind_rows(tibble(values = runif(1000), dist_type = "uniform(a=0,b=1)"), 109 | tibble(values = rexp(1000), dist_type = "exponential(rate=1)"), 110 | tibble(values = rchisq(1000, df = 3), dist_type = "Chi-square(df=3)"), 111 | tibble(values = rbinom(1000, size = 6, prob = 0.5), dist_type = "binomial(n=6,p=0.5)")) 112 | 113 | ggplot(familiar_distributions, aes(x = values, y = stat(density))) + 114 | geom_histogram() + 115 | geom_density(color = "red", size = 1) + 116 | facet_wrap(~dist_type) + 117 | theme_bw() 118 | 119 | ``` 120 | 121 | Notice a few things: 122 | 123 | * The difference between a discrete distribution and a continuous distribution 124 | * The variance (dispersion) of the distributions (based on the second moment $EX^2 - (EX)^2$) 125 | * The asymmetry of certain distributions - which can be measured via skewness (based on the third moment $E[\left(\frac{X-\mu}{\sigma}\right)^3]$) 126 | * The tendency to yield outliers (extreme values) - which is measured by kurtosis (based on the fourth moment $E[\left(\frac{X-\mu}{\sigma}\right)^4]$) 127 | 128 | These can be computed either from the data, or analytically (when the distribution is known). We will delve deeper into these terms later on. 129 | 130 | ### Mean, standard deviation (and variance) 131 | 132 | Reminder: the mean or expected value of a random variable $X$ is defined as: 133 | 134 | \[ 135 | E[X] = \mu_X = \int_{x=-\infty}^\infty{xf(x)dx} 136 | \] 137 | 138 | In the case of a **discrete** variable, the integral becomes a sum: 139 | 140 | \[ 141 | E[X] = \sum_{x=-\infty}^\infty{xf(x)} 142 | \] 143 | 144 | When we are estimating the mean from a given sample, the weight of each observation is $1/n$; hence, we get the familiar form for computing the mean (average): 145 | 146 | \[ 147 | \bar{x} = \frac{1}{n}\sum_{i=1}^n{x_i} 148 | \] 149 | 150 | The variance is a measure of the dispersion of a distribution, defined as: 151 | 152 | \[ 153 | V(X) = \sigma^2=E[(X-\mu)^2] = \int{(x-\mu)^2f(x)dx} 154 | \] 155 | 156 | Note that $\sigma^2=E[X^2]-(EX)^2$. Two estimates of the standard deviation are commonly used: $\hat{\sigma}$ (the population formula) and $s$ (the sample formula). 157 | 158 | The standard deviation $\sigma$ is the square root of the variance. 159 | 160 | When computing the standard deviation of a population we use: 161 | 162 | \[ 163 | \hat{\sigma} = \sqrt{\frac{1}{n}\sum_{i=1}^n(x_i-\bar{x})^2} 164 | \] 165 | 166 | And for a sample we would use a denominator $n-1$ instead of $n$: 167 | 168 | \[ 169 | s = \sqrt{\frac{1}{n-1}\sum_{i=1}^n(x_i-\bar{x})^2} 170 | \] 171 | 172 | This is called Bessel's correction, which is applied to yield an **unbiased estimate** of the variance. We'll get back to that later on and explain bias in detail, and why this estimate is unbiased.
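As a small optional illustration (a simulation sketch added here, not part of the derivation above), we can see the effect of the $n$ versus $n-1$ denominator by drawing many small samples from a known distribution and averaging the two variance estimates:

```{r bessel correction simulation}
# Draw 10,000 samples of size 5 from N(0, sd = 2), whose true variance is 4,
# and average the two variance estimates across samples.
set.seed(42)
n <- 5
samples <- map(1:10000, ~ rnorm(n, mean = 0, sd = 2))
tibble(biased   = map_dbl(samples, ~ sum((.x - mean(.x))^2) / n),
       unbiased = map_dbl(samples, var)) %>%   # var() already uses the n - 1 denominator
  summarize(mean(biased), mean(unbiased))      # roughly 3.2 versus 4
```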
173 | 174 | ```{r compute mean and std} 175 | 176 | familiar_distributions %>% 177 | group_by(dist_type) %>% 178 | summarize(mean = mean(values), 179 | sd = sd(values), 180 | var = var(values)) 181 | 182 | 183 | ``` 184 | 185 | ### Boxplot, median, quartiles (and percentiles) 186 | 187 | ```{r boxplot example} 188 | 189 | ipf_lifts %>% 190 | select(starts_with("best3")) %>% 191 | pivot_longer(cols = everything(), names_to = "exercise", values_to = "weight") %>% 192 | filter(weight > 0) %>% 193 | ggplot(aes(y = weight, x = exercise)) + 194 | geom_boxplot() + 195 | theme_bw() 196 | 197 | ``` 198 | 199 | The median is the value below which 50\% of the observations fall and above which the other 50\% fall, i.e., the median is the observation right in the middle. If there are an even number of observations, there will be two observations "in the middle" and the median is defined as their average. 200 | 201 | The $P$-th **percentile** of a list of $n$ observations (sorted in an increasing order) is the number located at position $\left\lceil {\frac {P}{100}}\times n\right\rceil$. 202 | 203 | The estimate of percentile $p$ ($0\leq p\leq1$) is the value $v$ which yields: 204 | 205 | \[ 206 | P(X\leq v) = p 207 | \] 208 | 209 | (For the median $p=0.5$; for the quartiles $p\in\{0.25, 0.5, 0.75\}$.) 210 | 211 | The boxplot illustrates the quartiles (box ends) and the median (the line inside the box); the box extends with two whiskers up to $1.5\times IQR$ (the Inter-Quartile Range) beyond the quartiles. Observations beyond the whiskers are considered outliers, and are marked by points. 212 | 213 | ```{r example for IQR} 214 | 215 | ipf_lifts %>% 216 | select(starts_with("best3")) %>% 217 | pivot_longer(cols = everything(), names_to = "exercise", values_to = "weight") %>% 218 | filter(weight > 0) %>% 219 | group_by(exercise) %>% 220 | summarize(quartile1 = quantile(weight, 0.25), 221 | quartile3 = quantile(weight, 0.75)) %>% 222 | mutate(IQR = quartile3 - quartile1) %>% 223 | mutate(bottom_whisker = quartile1 - 1.5*IQR, 224 | top_whisker = quartile3 + 1.5*IQR) 225 | 226 | ``` 227 | 228 | An example of boxplots for the common distributions: 229 | 230 | ```{r boxplot common distributions} 231 | ggplot(familiar_distributions, aes(x = dist_type, y = values)) + 232 | geom_boxplot() + 233 | theme_bw() 234 | ``` 235 | 236 | ### Data transformations 237 | 238 | Sometimes, it's useful to transform data. Transformations can reveal new relationships between variables, and allow us to improve models. We will learn more about transformations later on in the course (when we discuss linear and logistic regression, for example). 239 | 240 | Here is a demonstration from the `diamonds` data set. You will work with this data set in the homework (questions from the R4DS book).
241 | 242 | ```{r relationship with transformations} 243 | 244 | ggplot(diamonds, 245 | aes(x = carat, y = price)) + 246 | geom_point(alpha = 0.3) + 247 | theme_bw() 248 | 249 | ggplot(diamonds, 250 | aes(x = log(carat), y = price)) + 251 | geom_point(alpha = 0.3) + 252 | theme_bw() 253 | 254 | ggplot(diamonds, 255 | aes(x = log(carat), y = log(price))) + 256 | geom_point(alpha = 0.3) + 257 | theme_bw() 258 | 259 | 260 | ``` -------------------------------------------------------------------------------- /lectures/00-introduction/IWER34_2019.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/00-introduction/IWER34_2019.xlsx -------------------------------------------------------------------------------- /lectures/00-introduction/st02_03.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/00-introduction/st02_03.xls -------------------------------------------------------------------------------- /lectures/01-Point Estimation Methods and Intervals.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/01-Point Estimation Methods and Intervals.pdf -------------------------------------------------------------------------------- /lectures/02-Intervals.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/02-Intervals.pdf -------------------------------------------------------------------------------- /lectures/03 - Hypothesis Tests.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/03 - Hypothesis Tests.pdf -------------------------------------------------------------------------------- /lectures/04 - Statistical inference for Two Samples.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/04 - Statistical inference for Two Samples.pdf -------------------------------------------------------------------------------- /lectures/05 - Simple Linear Regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/05 - Simple Linear Regression.pdf -------------------------------------------------------------------------------- /lectures/06 - Multiple Linear Regression and Correlation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/06 - Multiple Linear Regression and Correlation.pdf -------------------------------------------------------------------------------- /lectures/06-Note_about_overfitting.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "A note about overfitting" 3 | author: "Adi Sarid" 4 | 
date: "12/7/2019" 5 | output: html_document 6 | --- 7 | 8 | When the number of features $p$ is very big compared to the sample size $n$, we are prone to undesired effects, also termed "the curse of dimensionality". Mainly, when the number of features is as big as we want we can explain any observed phenomena in the train set, but not for a good reason, it is simply due to an excess in degrees of freedom. 9 | 10 | For example 11 | 12 | * Take $y$ completely random with 100 observations. 13 | * Take 95 parameters $x_1,\ldots,x_{95}$ and set their values randomly also. 14 | * Build a model, any model (we'll use linear regression here). 15 | * Analyze the model's fit. 16 | * Do the process only this time with a train/test split. 17 | 18 | ```{r overfitting in action, message=FALSE, warning=FALSE} 19 | library(tidyverse) 20 | set.seed(0) 21 | xvars <- data.frame(matrix(runif(100*95), ncol=95)) 22 | overfitting <- tibble(y = runif(100)) %>% 23 | bind_cols(xvars) 24 | glimpse(overfitting) 25 | ggplot(overfitting, aes(y)) + 26 | geom_histogram() + 27 | theme_bw() 28 | 29 | # these are just uniformly distributed numbers, should have no kind of relationship between variables 30 | # here's a model with just a few X's, and no overfit. The model is insignificant. 31 | # the only significant coefficient beta is the intercept (which is roughly equal to the average of y) 32 | 33 | lm_no_overfit <- lm(data = overfitting, 34 | formula = y ~ X1 + X2 + X3) 35 | summary(lm_no_overfit) 36 | 37 | # now, see what happens when we add all the 95 features 38 | # mostly, look at the R^2. It's almost 1! 39 | lm_overfit <- lm(data = overfitting, 40 | formula = y ~ .) 41 | summary(lm_overfit) 42 | 43 | # now, see the errors of each model 44 | overfitting <- overfitting %>% 45 | mutate(res_no_overfit = y - predict(lm_no_overfit, newdata = overfitting), 46 | res_overfit = y - predict(lm_overfit, newdata = overfitting)) 47 | overfitting %>% 48 | summarize(mean((res_no_overfit)^2), 49 | mean((res_overfit)^2)) 50 | # 80%+ reduction in mean absolute residual error! 51 | ``` 52 | 53 | It looks as if the over fit model is amazing, but this is a bluff. Let's do this again, only this time with a train/test split. 54 | 55 | ```{r overfitting detection with test set} 56 | overfitting <- overfitting %>% 57 | mutate(is_train = runif(nrow(overfitting)) < 0.8) 58 | 59 | lm_overfit_train <- lm(data = overfitting %>% filter(is_train), 60 | formula = y ~ .) 61 | 62 | overfitting <- overfitting %>% 63 | mutate(res_overfit_train = y - predict(lm_overfit_train, newdata = overfitting)) 64 | 65 | overfitting %>% 66 | filter(!is_train) %>% 67 | summarize(mean((res_no_overfit)^2), 68 | mean((res_overfit)^2), 69 | mean((res_overfit_train)^2)) 70 | 71 | # Now the "true face" of the model is discovered. See how high the error rate of the test set is! 72 | # Beware of overfitting models. Always use train/test. Watch out for n and p. 73 | ``` 74 | 75 | ## To sum up 76 | 77 | * Beware of overfitting. 78 | * Always use a train/test split (also possible train/test/validate or cross-validation). 79 | * Consider the number of parameters $p$ versus the sample size $n$. There is no "iron rule" here but the test set error will help guide you, and also, comparing a nominal model to your model will show you the contribution of your model. 
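As mentioned in the summary above, cross-validation is another way to expose overfitting. Here is a minimal k-fold sketch, reusing the `overfitting` data; the fold assignment and the `cv_mse` helper are my own illustration, not part of the original example.

```{r cross validation sketch}
set.seed(1)
cv_data <- overfitting %>% select(y, starts_with("X"))
k <- 5
fold_id <- sample(rep(1:k, length.out = nrow(cv_data)))

# Mean squared error on held-out folds for a given model formula
cv_mse <- function(formula) {
  map_dbl(1:k, function(fold) {
    fit <- lm(formula, data = cv_data[fold_id != fold, ])
    held_out <- cv_data[fold_id == fold, ]
    mean((held_out$y - predict(fit, newdata = held_out))^2)
  }) %>% mean()
}

cv_mse(y ~ X1 + X2 + X3)  # close to the in-sample error of the small model
cv_mse(y ~ .)             # much larger; R may also warn about a rank-deficient fit
                          # in some folds, itself a symptom of p being close to n
```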
-------------------------------------------------------------------------------- /lectures/07 - Regression, Design and Analysis of Single-Factor Experiments.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/07 - Regression, Design and Analysis of Single-Factor Experiments.pdf -------------------------------------------------------------------------------- /lectures/09-One_Two_way_ANOVA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/09-One_Two_way_ANOVA.pdf -------------------------------------------------------------------------------- /lectures/Example_for_multicolinearity_problem.R: -------------------------------------------------------------------------------- 1 | set.seed(42) 2 | ex1 <- tibble(x1 = runif(100)) %>% 3 | mutate(y = 5 + 3*x1 + rnorm(100, mean = 0, sd = 0.05), 4 | x2 = 2*x1 + rnorm(100, mean = 0, sd = 0.01)) 5 | 6 | ex1 %>% 7 | cor() 8 | 9 | summary(lm(y ~ x1, ex1)) 10 | summary(lm(y ~ x2, ex1)) 11 | summary(lm(y ~ x1 + x2, ex1)) 12 | 13 | car::vif(lm(y ~ x1 + x2, ex1)) 14 | -------------------------------------------------------------------------------- /lectures/data/montgomery_13.5_fabric_strength.csv: -------------------------------------------------------------------------------- 1 | chemical,fabric1,fabric2,fabric3,fabric4,fabric5 2 | 1,1.3,1.6,0.5,1.2,1.1 3 | 2,2.2,2.4,0.4,2,1.8 4 | 3,1.8,1.7,0.6,1.5,1.3 5 | 4,3.9,4.4,2,4.1,3.4 6 | -------------------------------------------------------------------------------- /lectures/data/montgomery_14.5_adhesion_force.csv: -------------------------------------------------------------------------------- 1 | primer_type,application_method,adhesion_force 2 | 1,Dipping,4 3 | 1,Dipping,4.5 4 | 1,Dipping,4.3 5 | 1,Spraying,5.4 6 | 1,Spraying,4.9 7 | 1,Spraying,5.6 8 | 2,Dipping,5.6 9 | 2,Dipping,4.9 10 | 2,Dipping,5.4 11 | 2,Spraying,5.8 12 | 2,Spraying,6.1 13 | 2,Spraying,6.3 14 | 3,Dipping,3.8 15 | 3,Dipping,3.7 16 | 3,Dipping,4 17 | 3,Spraying,5.5 18 | 3,Spraying,5 19 | 3,Spraying,5 20 | -------------------------------------------------------------------------------- /lectures/data/wildlife_impacts_small.csv: -------------------------------------------------------------------------------- 1 | height,n 2 | 0,15684 3 | 1,56 4 | 2,51 5 | 3,30 6 | 4,14 7 | 5,273 8 | 6,15 9 | 7,10 10 | 8,17 11 | 9,7 12 | 10,1078 13 | 11,4 14 | 12,7 15 | 13,3 16 | 14,2 17 | 15,125 18 | 17,2 19 | 18,1 20 | 20,503 21 | 21,3 22 | 22,1 23 | 23,1 24 | 25,97 25 | 26,1 26 | 27,1 27 | 30,325 28 | 35,27 29 | 36,1 30 | 37,1 31 | 40,79 32 | 42,1 33 | 45,6 34 | 46,1 35 | 47,1 36 | 48,1 37 | 50,1572 38 | 55,2 39 | 60,31 40 | 62,2 41 | 65,3 42 | 70,27 43 | 73,1 44 | 75,114 45 | 76,1 46 | 79,1 47 | 80,28 48 | 90,21 49 | 91,1 50 | 95,3 51 | 100,1793 52 | 110,2 53 | 120,7 54 | 125,23 55 | 130,1 56 | 140,2 57 | 142,1 58 | 150,311 59 | 170,1 60 | 173,1 61 | 175,8 62 | 178,1 63 | 180,5 64 | 185,1 65 | 190,5 66 | 200,1433 67 | 220,1 68 | 225,4 69 | 230,2 70 | 240,1 71 | 250,103 72 | 260,1 73 | 270,1 74 | 275,1 75 | 280,1 76 | 290,1 77 | 300,1143 78 | 310,1 79 | 320,1 80 | 325,1 81 | 330,3 82 | 350,62 83 | 370,2 84 | 375,1 85 | 381,1 86 | 400,633 87 | 410,1 88 | 413,1 89 | 423,1 90 | 424,1 91 | 425,1 92 | 430,2 93 | 431,1 94 | 450,33 95 | 460,1 96 | 480,3 97 | 500,1460 98 | 510,2 
99 | 530,1 100 | 540,1 101 | 545,1 102 | 550,15 103 | 560,2 104 | 565,1 105 | 570,1 106 | 580,3 107 | 600,358 108 | 620,1 109 | 650,11 110 | 680,2 111 | 700,290 112 | 710,1 113 | 725,1 114 | 750,32 115 | 754,1 116 | 800,608 117 | 809,1 118 | 850,11 119 | 883,1 120 | 890,1 121 | 900,137 122 | 922,1 123 | 950,3 124 | 995,1 125 | 1000,1354 126 | 1003,1 127 | 1025,1 128 | 1040,1 129 | 1050,2 130 | 1060,2 131 | 1100,91 132 | 1126,1 133 | 1150,4 134 | 1165,1 135 | 1174,1 136 | 1180,1 137 | 1200,301 138 | 1240,1 139 | 1250,15 140 | 1275,1 141 | 1300,100 142 | 1320,1 143 | 1323,1 144 | 1350,10 145 | 1370,1 146 | 1380,1 147 | 1400,109 148 | 1440,1 149 | 1450,2 150 | 1464,1 151 | 1480,1 152 | 1485,1 153 | 1490,1 154 | 1500,930 155 | 1550,1 156 | 1580,2 157 | 1600,78 158 | 1617,1 159 | 1640,2 160 | 1643,1 161 | 1650,1 162 | 1700,109 163 | 1750,4 164 | 1775,1 165 | 1800,161 166 | 1820,1 167 | 1830,1 168 | 1850,7 169 | 1870,1 170 | 1880,1 171 | 1900,61 172 | 1950,2 173 | 1975,3 174 | 1980,1 175 | 1990,2 176 | 2000,983 177 | 2020,1 178 | 2080,1 179 | 2100,37 180 | 2150,1 181 | 2200,78 182 | 2250,1 183 | 2300,57 184 | 2350,1 185 | 2375,1 186 | 2400,79 187 | 2448,1 188 | 2500,467 189 | 2510,1 190 | 2530,1 191 | 2600,43 192 | 2650,2 193 | 2700,35 194 | 2750,2 195 | 2800,54 196 | 2830,1 197 | 2850,4 198 | 2900,22 199 | 2950,1 200 | 2975,1 201 | 2992,1 202 | 2999,1 203 | 3000,1076 204 | 3027,1 205 | 3050,1 206 | 3100,13 207 | 3150,1 208 | 3200,31 209 | 3225,1 210 | 3250,2 211 | 3280,1 212 | 3300,25 213 | 3330,1 214 | 3400,55 215 | 3500,241 216 | 3550,1 217 | 3600,21 218 | 3664,1 219 | 3700,20 220 | 3740,1 221 | 3750,4 222 | 3800,25 223 | 3850,2 224 | 3900,9 225 | 3950,3 226 | 3960,1 227 | 4000,699 228 | 4100,7 229 | 4130,1 230 | 4200,16 231 | 4300,19 232 | 4400,22 233 | 4500,126 234 | 4600,13 235 | 4650,1 236 | 4700,6 237 | 4750,3 238 | 4800,16 239 | 4900,8 240 | 4940,1 241 | 4950,2 242 | 5000,585 243 | 5100,2 244 | 5200,19 245 | 5300,7 246 | 5330,1 247 | 5360,1 248 | 5380,1 249 | 5400,15 250 | 5450,1 251 | 5500,70 252 | 5600,9 253 | 5605,1 254 | 5700,7 255 | 5800,9 256 | 5850,1 257 | 5900,7 258 | 5960,1 259 | 5979,1 260 | 6000,375 261 | 6100,5 262 | 6200,10 263 | 6300,7 264 | 6400,4 265 | 6500,61 266 | 6600,6 267 | 6700,7 268 | 6800,8 269 | 6900,4 270 | 7000,246 271 | 7080,1 272 | 7100,2 273 | 7200,5 274 | 7300,3 275 | 7400,10 276 | 7500,56 277 | 7600,6 278 | 7700,5 279 | 7800,6 280 | 7900,1 281 | 8000,199 282 | 8300,4 283 | 8400,2 284 | 8500,30 285 | 8600,1 286 | 8700,3 287 | 8800,4 288 | 8900,1 289 | 9000,91 290 | 9200,2 291 | 9300,2 292 | 9400,4 293 | 9430,1 294 | 9500,29 295 | 9700,2 296 | 9800,3 297 | 9950,1 298 | 10000,164 299 | 10100,2 300 | 10200,4 301 | 10300,1 302 | 10400,8 303 | 10500,17 304 | 10700,2 305 | 10800,2 306 | 10900,1 307 | 11000,93 308 | 11200,1 309 | 11300,2 310 | 11400,1 311 | 11500,9 312 | 11600,1 313 | 11700,1 314 | 12000,46 315 | 12300,1 316 | 12500,16 317 | 12600,1 318 | 13000,27 319 | 13250,1 320 | 13400,1 321 | 13500,4 322 | 13700,1 323 | 13800,1 324 | 14000,19 325 | 14400,1 326 | 15000,16 327 | 15500,1 328 | 16000,9 329 | 16500,1 330 | 17000,2 331 | 17500,1 332 | 18000,3 333 | 18500,2 334 | 19000,1 335 | 20000,4 336 | 21000,1 337 | 22000,1 338 | 23000,1 339 | 24300,1 340 | 25000,2 341 | NA,18038 342 | -------------------------------------------------------------------------------- /lectures/files_during_lecture/05-file1.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | wildlife_small <- 
read_csv("lectures/data/wildlife_impacts_small.csv", col_types = cols()) %>% 4 | mutate(rounded_height = round(height/1000)) %>% 5 | group_by(rounded_height) %>% 6 | summarize(n = sum(n)) %>% 7 | filter(!is.na(rounded_height)) 8 | 9 | 10 | wildlife_hist <- ggplot(wildlife_small, aes(x = rounded_height, y = n)) + 11 | geom_col(fill = "darkorange", color = "black") + theme_bw() + xlab("Height [k feet]") + scale_y_log10() 12 | wildlife_points <- ggplot(wildlife_small, aes(x = rounded_height, y = n)) + 13 | geom_point() + 14 | theme_linedraw() + 15 | scale_y_log10() + 16 | stat_smooth(method = "lm") + 17 | xlab("Height [k feet]") 18 | cowplot::plot_grid(wildlife_hist, wildlife_points) 19 | 20 | 21 | 22 | 23 | wildlife_lm <- lm(formula = log(n) ~ rounded_height, data = wildlife_small) 24 | 25 | ggplot(tibble(res = wildlife_lm$residuals), 26 | aes(sample = res)) + 27 | geom_qq() 28 | 29 | 30 | 31 | summary(wildlife_lm) 32 | -------------------------------------------------------------------------------- /lectures/files_during_lecture/05-file2.R: -------------------------------------------------------------------------------- 1 | tibble(number = rnorm(150, mean = 2, sd = 5)) %>% 2 | ggplot(aes(sample = number)) + 3 | geom_qq() 4 | 5 | tibble(number = rnorm(150, mean = 0, sd = 1)) %>% 6 | ggplot(aes(x = number)) + 7 | geom_density() 8 | -------------------------------------------------------------------------------- /lectures/files_during_lecture/05-file3.R: -------------------------------------------------------------------------------- 1 | mtcars_lm <- lm(formula = mpg ~ disp, data = mtcars) 2 | 3 | mtcars_lm$residuals 4 | mtcars_lm$fitted.values 5 | 6 | summary(mtcars_lm) 7 | 8 | mtcars_new <- mtcars %>% 9 | mutate(resid = mtcars_lm$residuals) %>% 10 | mutate(prediction = mtcars_lm$fitted.values) 11 | 12 | 13 | ggplot(mtcars_new, aes(y = resid, x = disp)) + 14 | geom_point() 15 | 16 | ggplot(mtcars_new, aes(x = resid)) + 17 | geom_density() 18 | 19 | ggplot(mtcars_new, aes(sample = resid)) + 20 | geom_qq() 21 | -------------------------------------------------------------------------------- /lectures/images/Type_IandType_II_errors.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/images/Type_IandType_II_errors.jpg -------------------------------------------------------------------------------- /lectures/images/birds_eye_view1.svg: -------------------------------------------------------------------------------- 1 | 2 | 17 | 19 | 42 | 44 | 45 | 47 | image/svg+xml 48 | 50 | 51 | 52 | 53 | 54 | 59 | 61 | Estimation Methods 72 | 79 | 80 | 82 | Method of Moments 93 | 100 | 101 | 103 | Maximum Likelihood 114 | 121 | 122 | Create estimates(such as the average)to estimate the value of 143 | 145 | Parameters 156 | 163 | 164 | 167 | ExpectancyVarianceProportions... 
193 | 200 | 201 | 203 | ConfidenceIntervals 219 | 226 | 227 | 229 | HypothesisTests 245 | 252 | 253 | Using statistics such as Z, and Twe can compute 274 | 278 | 282 | 286 | 290 | 291 | 292 | -------------------------------------------------------------------------------- /lectures/images/link_for_survey_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/images/link_for_survey_example.png -------------------------------------------------------------------------------- /lectures/images/speeding_ticket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/images/speeding_ticket.png -------------------------------------------------------------------------------- /lectures/images/waze_not_accurate.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/lectures/images/waze_not_accurate.jpg -------------------------------------------------------------------------------- /lectures/mult_lin_reg_example.R: -------------------------------------------------------------------------------- 1 | # Coded example for manual multiple linear regression 2 | 3 | library(tidyverse) 4 | 5 | # In this example we illustrate manual computation of multiple linear regression 6 | 7 | # The model using the lm function ---- 8 | 9 | lm(mpg ~ hp + wt, mtcars) %>% 10 | summary() 11 | 12 | # Now let's do this manually ---- 13 | 14 | X <- mtcars %>% 15 | mutate(ones = 1) %>% 16 | select(ones, hp, wt) %>% 17 | as.matrix() 18 | 19 | # The beta coefficients ---- 20 | 21 | XtX <- crossprod(X) # same as t(X) %*% X 22 | 23 | XtX_inv <- solve(XtX) # Important! not the same as (XtX)^(-1) 24 | 25 | beta <- XtX_inv %*% t(X) %*% mtcars$mpg 26 | 27 | # We saw that \hat{\sigma^2}(X^tX)^{-1} is an estimate 28 | 29 | # Residual sum of square 30 | 31 | y_hat <- X %*% beta 32 | 33 | e_i <- (mtcars$mpg - y_hat) 34 | 35 | sigma_sq <- (e_i^2)/(32 - 3) # df = 32 - (p + 1) = 32 - 2 - 1 = 29 36 | 37 | # the sum of squared errors 38 | sum(sigma_sq) 39 | 40 | # the residual standard error 41 | sqrt(sum(sigma_sq)) 42 | 43 | # The standard deviation of coefficients ---- 44 | sqrt(sum(sigma_sq) * diag(XtX_inv)) 45 | -------------------------------------------------------------------------------- /lectures/what_is_z_score.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | # This script illustrates z-score. 
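# plot_z_score arguments:
#   z           - a z value (standard normal quantile); supply either z or p, not both
#   p           - a tail probability; with alternative = "two.sided" it is split
#                 equally between the two tails
#   alternative - "two.sided" (the default) or "one.sided"
# The function draws the standard normal density and shades the tail area(s)
# corresponding to the supplied z or p.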
4 | 5 | plot_z_score <- function(z = NULL, p = NULL, 6 | alternative = c("two.sided", "one.sided")){ 7 | 8 | if (!is.null(z) & !is.null(p)){ 9 | stop("Can't have both z and p...") 10 | } 11 | 12 | if (alternative[1] == "two.sided" & !is.null(p)){ 13 | div_factor <- 0.5 14 | } else { 15 | div_factor <- 1 16 | } 17 | 18 | if (!is.null(p)){ 19 | if (p > 0.5){ 20 | p <- 1-p 21 | } 22 | } 23 | 24 | z_dense <- tibble(z_range = seq(-3, 3, by = 0.05), 25 | density = dnorm(z_range), 26 | p_range = pnorm(z_range)) 27 | 28 | base_plot <- ggplot(z_dense, aes(x = z_range, y = density)) + 29 | geom_line() 30 | 31 | subtitle_str <- "" 32 | 33 | if (!is.null(z)){ 34 | p <- pnorm(q = z)*div_factor 35 | } 36 | 37 | z_of_p <- qnorm(p*div_factor) 38 | density_of_p <- dnorm(z_of_p) 39 | 40 | base_plot <- base_plot + 41 | geom_area(data = z_dense %>% filter(p_range <= p*div_factor), 42 | aes(x = z_range, y = density), fill = "lightblue", alpha = 0.5) + 43 | geom_segment(x = -3, xend = z_of_p, 44 | y = density_of_p, yend = density_of_p, color = "red") + 45 | geom_segment(x = z_of_p, xend = z_of_p, y = density_of_p, yend = 0, color = "red") 46 | 47 | if (alternative[1] == "two.sided"){ 48 | base_plot <- base_plot + 49 | geom_area(data = z_dense %>% filter(p_range >= 1 - p*div_factor), 50 | aes(x = z_range, y = density), fill = "lightblue", alpha = 0.5) + 51 | geom_segment(x = 3, xend = -z_of_p, 52 | y = density_of_p, yend = density_of_p, color = "red") + 53 | geom_segment(x = -z_of_p, xend = -z_of_p, y = density_of_p, yend = 0, color = "red") 54 | 55 | } 56 | 57 | subtitle_str <- paste0(subtitle_str, "p = Phi(z) = pnorm(z) = ", round(p*div_factor, 3), 58 | "; z = qnorm(p) = ", 59 | round(z_of_p, 3)) 60 | 61 | 62 | base_plot + 63 | xlab("z_p") + 64 | ylab("density\ndnorm(z_p)") + 65 | theme_bw() + 66 | ggtitle("The normal distribution", 67 | subtitle = subtitle_str) 68 | 69 | } 70 | 71 | plot_z_score(p=0.05, alternative = "two.sided") 72 | plot_z_score(z = -1.649, alternative = "one.sided") 73 | -------------------------------------------------------------------------------- /lectures/xaringan-themer.css: -------------------------------------------------------------------------------- 1 | /* ------------------------------------------------------- 2 | * 3 | * !! This file was generated by xaringanthemer !! 4 | * 5 | * Changes made to this file directly will be overwritten 6 | * if you used xaringanthemer in your xaringan slides Rmd 7 | * 8 | * Issues or likes? 9 | * - https://github.com/gadenbuie/xaringanthemer 10 | * - https://www.garrickadenbuie.com 11 | * 12 | * Need help? 
Try: 13 | * - vignette(package = "xaringanthemer") 14 | * - ?xaringanthemer::style_xaringan 15 | * - xaringan wiki: https://github.com/yihui/xaringan/wiki 16 | * - remarkjs wiki: https://github.com/gnab/remark/wiki 17 | * 18 | * Version: 0.4.1 19 | * 20 | * ------------------------------------------------------- */ 21 | @import url(https://fonts.googleapis.com/css?family=Rubik:300,300i&display=swap); 22 | @import url(https://fonts.googleapis.com/css?family=Open+Sans&display=swap); 23 | @import url(https://fonts.googleapis.com/css?family=Fira+Mono&display=swap); 24 | 25 | 26 | :root { 27 | /* Fonts */ 28 | --text-font-family: Rubik; 29 | --text-font-is-google: 1; 30 | --text-font-family-fallback: -apple-system, BlinkMacSystemFont, avenir next, avenir, helvetica neue, helvetica, Ubuntu, roboto, noto, segoe ui, arial; 31 | --text-font-base: sans-serif; 32 | --header-font-family: 'Open Sans'; 33 | --header-font-is-google: 1; 34 | --header-font-family-fallback: Georgia, serif; 35 | --code-font-family: 'Fira Mono'; 36 | --code-font-is-google: 1; 37 | --base-font-size: 20px; 38 | --text-font-size: 1rem; 39 | --code-font-size: 0.9rem; 40 | --code-inline-font-size: 1em; 41 | --header-h1-font-size: 2.3rem; 42 | --header-h2-font-size: 2.25rem; 43 | --header-h3-font-size: 1.75rem; 44 | 45 | /* Colors */ 46 | --text-color: #272822; 47 | --header-color: #43418A; 48 | --background-color: #FFFFFF; 49 | --link-color: #43418A; 50 | --text-bold-color: #43418A; 51 | --code-highlight-color: rgba(255,255,0,0.5); 52 | --inverse-text-color: #FFFFFF; 53 | --inverse-background-color: #43418A; 54 | --inverse-header-color: #FFFFFF; 55 | --inverse-link-color: #43418A; 56 | --title-slide-background-color: #43418A; 57 | --title-slide-text-color: #FFFFFF; 58 | --header-background-color: #43418A; 59 | --header-background-text-color: #FFFFFF; 60 | --base: #43418A; 61 | --white: #FFFFFF; 62 | --black: #272822; 63 | } 64 | 65 | html { 66 | font-size: var(--base-font-size); 67 | } 68 | 69 | body { 70 | font-family: var(--text-font-family), var(--text-font-family-fallback), var(--text-font-base); 71 | font-weight: 300; 72 | color: var(--text-color); 73 | } 74 | h1, h2, h3 { 75 | font-family: var(--header-font-family), var(--header-font-family-fallback); 76 | font-weight: 600; 77 | color: var(--header-color); 78 | } 79 | .remark-slide-content { 80 | background-color: var(--background-color); 81 | font-size: 1rem; 82 | padding: 16px 64px 16px 64px; 83 | width: 100%; 84 | height: 100%; 85 | } 86 | .remark-slide-content h1 { 87 | font-size: var(--header-h1-font-size); 88 | } 89 | .remark-slide-content h2 { 90 | font-size: var(--header-h2-font-size); 91 | } 92 | .remark-slide-content h3 { 93 | font-size: var(--header-h3-font-size); 94 | } 95 | .remark-code, .remark-inline-code { 96 | font-family: var(--code-font-family), Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace; 97 | } 98 | .remark-code { 99 | font-size: var(--code-font-size); 100 | } 101 | .remark-inline-code { 102 | font-size: var(--code-inline-font-size); 103 | color: #43418A; 104 | } 105 | .remark-slide-number { 106 | color: #43418A; 107 | opacity: 1; 108 | font-size: 0.9rem; 109 | } 110 | strong { 111 | font-weight: bold; 112 | color: var(--text-bold-color); 113 | } 114 | a, a > code { 115 | color: var(--link-color); 116 | text-decoration: none; 117 | } 118 | .footnote { 119 | position: absolute; 120 | bottom: 60px; 121 | padding-right: 4em; 122 | font-size: 0.9em; 123 | } 124 | .remark-code-line-highlighted { 125 | background-color: 
var(--code-highlight-color); 126 | } 127 | .inverse { 128 | background-color: var(--inverse-background-color); 129 | color: var(--inverse-text-color); 130 | 131 | } 132 | .inverse h1, .inverse h2, .inverse h3 { 133 | color: var(--inverse-header-color); 134 | } 135 | .inverse a, .inverse a > code { 136 | color: var(--inverse-link-color); 137 | } 138 | .title-slide, .title-slide h1, .title-slide h2, .title-slide h3 { 139 | color: var(--title-slide-text-color); 140 | } 141 | .title-slide { 142 | background-color: var(--title-slide-background-color); 143 | } 144 | .title-slide .remark-slide-number { 145 | display: none; 146 | } 147 | /* Two-column layout */ 148 | .left-column { 149 | width: 20%; 150 | height: 92%; 151 | float: left; 152 | } 153 | .left-column h2, .left-column h3 { 154 | color: #43418A99; 155 | } 156 | .left-column h2:last-of-type, .left-column h3:last-child { 157 | color: #43418A; 158 | } 159 | .right-column { 160 | width: 75%; 161 | float: right; 162 | padding-top: 1em; 163 | } 164 | .pull-left { 165 | float: left; 166 | width: 47%; 167 | } 168 | .pull-right { 169 | float: right; 170 | width: 47%; 171 | } 172 | .pull-right + * { 173 | clear: both; 174 | } 175 | img, video, iframe { 176 | max-width: 100%; 177 | } 178 | blockquote { 179 | border-left: solid 5px #43418A80; 180 | padding-left: 1em; 181 | } 182 | .remark-slide table { 183 | margin: auto; 184 | border-top: 1px solid #666; 185 | border-bottom: 1px solid #666; 186 | } 187 | .remark-slide table thead th { 188 | border-bottom: 1px solid #ddd; 189 | } 190 | th, td { 191 | padding: 5px; 192 | } 193 | .remark-slide thead, .remark-slide tfoot, .remark-slide tr:nth-child(even) { 194 | background: #D9D9E7; 195 | } 196 | table.dataTable tbody { 197 | background-color: var(--background-color); 198 | color: var(--text-color); 199 | } 200 | table.dataTable.display tbody tr.odd { 201 | background-color: var(--background-color); 202 | } 203 | table.dataTable.display tbody tr.even { 204 | background-color: #D9D9E7; 205 | } 206 | table.dataTable.hover tbody tr:hover, table.dataTable.display tbody tr:hover { 207 | background-color: rgba(255, 255, 255, 0.5); 208 | } 209 | .dataTables_wrapper .dataTables_length, .dataTables_wrapper .dataTables_filter, .dataTables_wrapper .dataTables_info, .dataTables_wrapper .dataTables_processing, .dataTables_wrapper .dataTables_paginate { 210 | color: var(--text-color); 211 | } 212 | .dataTables_wrapper .dataTables_paginate .paginate_button { 213 | color: var(--text-color) !important; 214 | } 215 | 216 | /* Horizontal alignment of code blocks */ 217 | .remark-slide-content.left pre, 218 | .remark-slide-content.center pre, 219 | .remark-slide-content.right pre { 220 | text-align: start; 221 | width: max-content; 222 | max-width: 100%; 223 | } 224 | .remark-slide-content.left pre, 225 | .remark-slide-content.right pre { 226 | min-width: 50%; 227 | min-width: min(40ch, 100%); 228 | } 229 | .remark-slide-content.center pre { 230 | min-width: 66%; 231 | min-width: min(50ch, 100%); 232 | } 233 | .remark-slide-content.left pre { 234 | margin-left: unset; 235 | margin-right: auto; 236 | } 237 | .remark-slide-content.center pre { 238 | margin-left: auto; 239 | margin-right: auto; 240 | } 241 | .remark-slide-content.right pre { 242 | margin-left: auto; 243 | margin-right: unset; 244 | } 245 | 246 | /* Slide Header Background for h1 elements */ 247 | .remark-slide-content.header_background > h1 { 248 | display: block; 249 | position: absolute; 250 | top: 0; 251 | left: 0; 252 | width: 100%; 253 | background: 
var(--header-background-color); 254 | color: var(--header-background-text-color); 255 | padding: 2rem 64px 1.5rem 64px; 256 | margin-top: 0; 257 | box-sizing: border-box; 258 | } 259 | .remark-slide-content.header_background { 260 | padding-top: 7rem; 261 | } 262 | 263 | @page { margin: 0; } 264 | @media print { 265 | .remark-slide-scaler { 266 | width: 100% !important; 267 | height: 100% !important; 268 | transform: scale(1) !important; 269 | top: 0 !important; 270 | left: 0 !important; 271 | } 272 | } 273 | 274 | .base { 275 | color: var(--base); 276 | } 277 | .bg-base { 278 | background-color: var(--base); 279 | } 280 | .white { 281 | color: var(--white); 282 | } 283 | .bg-white { 284 | background-color: var(--white); 285 | } 286 | .black { 287 | color: var(--black); 288 | } 289 | .bg-black { 290 | background-color: var(--black); 291 | } 292 | 293 | 294 | 295 | /* Extra CSS */ 296 | .medium { 297 | font-size: 85%; 298 | code-font-size: 85%; 299 | } 300 | .small { 301 | zoom: 70%; 302 | } 303 | .extra-small { 304 | font-size: 50%; 305 | code-font-size: 50%; 306 | } 307 | .tiny { 308 | font-size: 50%; 309 | code-font-size: 50%; 310 | zoom: 50%; 311 | } 312 | .full-width { 313 | display: flex; 314 | width: 100%; 315 | flex: 1 1 auto; 316 | } 317 | -------------------------------------------------------------------------------- /misc/Distribution tables - x^2, z, f, t.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/misc/Distribution tables - x^2, z, f, t.pdf -------------------------------------------------------------------------------- /misc/extended_formula_page.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/misc/extended_formula_page.pdf -------------------------------------------------------------------------------- /misc/init_course_plan.md: -------------------------------------------------------------------------------- 1 | # Plan for course 2 | 3 | ## Weeks 1-2 4 | 5 | ### Lecture: 6 | 7 | * Intro/overview 8 | * Sampling methods 9 | * Statistical inference and parameter estimation 10 | * Mean, median, std, biased/unbiased, maximum likelihood, percentiles, skewness, desired properties of estimtes (e.g. unbiased), Cramer-Rao bound(?), Chebychev. 11 | * Variable types 12 | * Reminder on types of distributions(?) + visualizations. 13 | 14 | ### Exercise: 15 | 16 | * Technical exercises on R 17 | * Intro to R 18 | * Tidyverse 19 | * Visualizations 20 | * Motivation example 21 | 22 | ### Homework: 23 | 24 | * The first will present an example research (verbally) and ask questions about the sampling method, type of variables, problems with bias. 25 | * Show a chart and analyze it (e.g., boxplot, density, ecdf, scatter plot matrix). What a specific plot teaches that others don't. 26 | * Technical part (maybe adopted from R4DS) 27 | 28 | ## Weeks 3-4 29 | 30 | ### Lecture: 31 | 32 | * Hypothesis tests. 33 | * Use cases and examples for z, student's t, how are they related to assumptions. 34 | * Chi-square test for independence of variables. 35 | * A-parametric tests (e.g., Wilcoxon) - what do they mean, when should we use them, examples. 36 | * Goodness-of-fit. 37 | * Problems with p-values, FDR. 38 | * Multidimensional CIs? 39 | * Relationship between significance (p-value) versus confidence interval. 
40 | 41 | ### Exercise: 42 | 43 | * Show another test not covered in the lecture; or 44 | * Review a concrete example in class, i.e., take a data set and run a number of hypothesis tests. 45 | * Illustrate how 100 tests of independent random variables might show significant values (even though they are drawn from the same distribution). 46 | * Example for FDR. 47 | 48 | ### Homework: 49 | 50 | * Theoretical question, i.e., "story" followed by what test would you use and why. 51 | * Practical example - parsons example of running the tests + analysis. 52 | 53 | ## Weeks 5-6 54 | 55 | ### Lecture: 56 | 57 | * Analysis of variance, one-way, two-way. Examples. 58 | * Dunnett's test. 59 | * Related tests. 60 | 61 | ### Exercise: 62 | 63 | * Example for use in R. 64 | * Dunnett's test (if not covered in lecture). 65 | 66 | ### Homework: 67 | 68 | * Complex data set with parsons example which involves both data restructuring (i.e. `pivot_wider`/`pivot_longer`) and then `aov`, `summary`. 69 | 70 | ## Weeks 7: 71 | 72 | ### Lecture: 73 | 74 | * Choosing the right sample type. 75 | * Deciding on sample size via margin of error and power calculations. 76 | * Consider sub-groups. 77 | 78 | ### Exercise: 79 | 80 | * Show examples and implementation in R. 81 | 82 | ### Homework: 83 | 84 | * Provide a research use cases, students will provide a detailed plan for the experiment. 85 | * Present a flawed experiment - student need to find the flaws. -------------------------------------------------------------------------------- /misc/syllabus_05601823_2022.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/misc/syllabus_05601823_2022.pdf -------------------------------------------------------------------------------- /misc/tau_engineering_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/misc/tau_engineering_logo.png -------------------------------------------------------------------------------- /population_vs_sample/.Rhistory: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | runApp() 3 | runApp() 4 | runApp() 5 | runApp() 6 | runApp() 7 | runApp() 8 | runApp() 9 | runApp() 10 | runApp() 11 | runApp() 12 | runApp() 13 | runApp() 14 | -------------------------------------------------------------------------------- /population_vs_sample/app.R: -------------------------------------------------------------------------------- 1 | 2 | library(shiny) 3 | library(tidyverse) 4 | 5 | # Define UI for application that draws a histogram 6 | ui <- fluidPage( 7 | 8 | theme = shinythemes::shinytheme("united"), title = "Population vs. 
sample", 9 | h1("Population versus sample (normal distribution)"), 10 | 11 | fluidRow( 12 | sidebarLayout( 13 | sidebarPanel(width = 2, 14 | fluidRow( 15 | h3("Population parameters"), 16 | numericInput("expectancy", "Expectancy (μ)", 17 | value = 0), 18 | numericInput("std", "Standard Deviation (σ)", 19 | value = 1), 20 | h3("Sample size"), 21 | numericInput("sample_size", "Sample size (n)", 22 | value = 30), 23 | numericInput("num_bins", "Histogram bins", 24 | value = 30) 25 | 26 | )), 27 | mainPanel(column(plotOutput("population_distribution"), width = 6), 28 | column(plotOutput("sample_histogram"), width = 6)) 29 | ) 30 | ) 31 | 32 | ) 33 | 34 | # Define server logic required to draw a histogram 35 | server <- function(input, output) { 36 | 37 | output$population_distribution <- renderPlot({ 38 | 39 | x_range <- seq(input$expectancy - 3*input$std, input$expectancy + 3*input$std, 40 | by = 0.01) 41 | 42 | y_range <- dnorm(x_range, 43 | mean = input$expectancy, 44 | sd = input$std) 45 | 46 | tibble(x = x_range, y = y_range) %>% 47 | ggplot(aes(x, y)) + 48 | geom_line() + 49 | coord_cartesian(xlim = c(-5, 5)) + 50 | ggtitle("The population distribution") + 51 | ylab("Density function") 52 | 53 | }) 54 | 55 | output$sample_histogram <- renderPlot({ 56 | 57 | smp <- tibble(x = 58 | rnorm(n = input$sample_size, 59 | mean = input$expectancy, 60 | sd = input$std)) 61 | 62 | smp %>% 63 | ggplot(aes(x = x)) + 64 | geom_histogram(bins = input$num_bins) + 65 | coord_cartesian(xlim = c(-5, 5)) + 66 | ggtitle(glue::glue("The sample distribution: mean={round(mean(smp$x),2)}, sd={round(sd(smp$x), 2)}")) + 67 | xlab("x") 68 | 69 | }) 70 | 71 | 72 | } 73 | 74 | # Run the application 75 | shinyApp(ui = ui, server = server) 76 | -------------------------------------------------------------------------------- /population_vs_sample/population_vs_sample.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /population_vs_sample/rsconnect/shinyapps.io/sarid/population_vs_sample.dcf: -------------------------------------------------------------------------------- 1 | name: population_vs_sample 2 | title: population_vs_sample 3 | username: 4 | account: sarid 5 | server: shinyapps.io 6 | hostUrl: https://api.shinyapps.io/v1 7 | appId: 3766450 8 | bundleId: 4324931 9 | url: https://sarid.shinyapps.io/population_vs_sample/ 10 | when: 1615360163.46147 11 | asMultiple: FALSE 12 | asStatic: FALSE 13 | -------------------------------------------------------------------------------- /project/Project Instructions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Project Instructions" 3 | author: "Adi Sarid" 4 | date: "`r Sys.Date()`" 5 | output: 6 | html_document: default 7 | pdf_document: default 8 | subtitle: Intro to Statistics and Data Analysis with R (0560.1823) 9 | --- 10 | 11 | ```{r setup, include=FALSE} 12 | knitr::opts_chunk$set(echo = TRUE) 13 | ``` 14 | 15 | ## Background 16 | 17 | The following document contains instructions to the project in the Introduction to Statistics and Data Analysis with R course. 18 | 19 | The project has a weight of 40% of you final grade. 
20 | 21 | ## Goal 22 | 23 | The goal of the project is to demonstrate and practice the different elements we have been talking about, which are a part of most data analysis/data science projects. 24 | 25 | ## Methods 26 | 27 | In this project you will handle the different phases of a data analysis project: 28 | 29 | 1. Data **Import** (reading the data into R). 30 | 31 | 2. Data **Tidying** (arranging the data into something you can work with). 32 | 33 | 3. Understanding the data: 34 | 35 | a. **Transforming** variables. 36 | 37 | b. **Visualizing** (use `ggplot2` to show distribution of variables, relationships between variables, and to hypothesize). 38 | 39 | c. **Modelling**: using a few of the tools we have learned during the course (like hypothesis testing, regression, analysis of variance, etc.) to examine your hypotheses. 40 | 41 | 4. **Communicating** your findings via a written report. 42 | 43 | ## Instructions and Schedule 44 | 45 | The project should be performed **in pairs** (same groups as the homework submissions). 46 | 47 | ### Choosing a dataset 48 | 49 | First, you should select a dataset on which you will perform the project. I recommend using a data set from either [Kaggle](https://www.kaggle.com) or from [tidytuesday](https://github.com/rfordatascience/tidytuesday), or [government data](https://data.gov.il/dataset/). You can select something else. 50 | 51 | In any case, please do not choose something "too popular" (e.g., no built-in `R` datasets, and no data sets that we've worked on in the lectures). 52 | 53 | In your work you must document: 54 | 55 | * The dataset name 56 | * Source (a URL with the data and documentation of the dataset) 57 | * A **direct link** to download the raw data you are using 58 | 59 | ### Consultation 60 | 61 | I'm dedicating a weekly reception hour, Thursdays 09:00, in Zoom. You can bring questions regarding the project, coding, `R`, etc. Please coordinate in advance (send me an email if you want to join the reception hour). 62 | 63 | ### Submission 64 | 65 | Final submissions should be made by **June 10th 2022.** 66 | 67 | Please submit your file to Moodle as `statintro_final_studentname_studentID.zip` which bundles an Rmd version, data files, and a knitted html version of your report. The Rmd should compile standalone on any computer. 68 | 69 | ## Grading 70 | 71 | You will be graded along the following lines: 72 | 73 | * Data import, tidying, and transformations (20%): Your ability to use the proper methods to import the data, tidy it, and apply the required transformations towards the next stages. 74 | 75 | * Visualizations (20%): Your ability to utilize visualizations to articulate your hypotheses and to illustrate different patterns and relationships in the data. You should be able to match the proper types of charts to whatever it is you are trying to show. 76 | 77 | * Modeling (20%): Your ability to match the appropriate statistical tests/models to the problem, verifying (or highlighting) certain assumptions which are valid or invalid in this case. Please provide at least two relevant models/hypothesis tests that we learned. 78 | 79 | * Communication, documentation, explanations (20%): You should be able to explain the different steps you are taking, lead the reader in a logical and appealing manner, explain your results, and highlight the research or business implications of your findings. For example, make sure you start with a data description, research questions, hypotheses, etc. 
80 | 81 | * Code (20%): Readability, proper use and proper documentation of code. You may use tidyverse code or base R. 82 | 83 | *** 84 | 85 | **Good luck!** 86 | 87 | \newpage 88 | 89 | # Appendix: Questions and answers 90 | 91 | Some more questions and answers. 92 | 93 | ## How should you report the results? 94 | 95 | In tests such as a t-test or goodness of fit, you should explain in plain text what you are doing, what assumptions the test entails, and whether they indeed hold in this case or not. Then add the code chunk and include the output. 96 | 97 | For example, in linear regression, you should also report a QQ plot of the residuals and check homoscedasticity. 98 | 99 | ## Where can I see examples for projects? 100 | 101 | You can see examples for projects from previous semesters [here](https://github.com/adisarid/intro_statistics_R/tree/master/project/examples). -------------------------------------------------------------------------------- /project/Project Instructions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adisarid/intro_statistics_R/429046aa026fd3a17cd756189829181dcd455993/project/Project Instructions.pdf -------------------------------------------------------------------------------- /project/example_star_trek_script_analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Where no R has gone before - Analyzing Star Trek scripts" 3 | author: "Adi Sarid" 4 | date: "12 1 2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | # Background 13 | 14 | In this markdown, I analyze [this](https://www.kaggle.com/gjbroughton/start-trek-scripts/) data of Star Trek scripts. 15 | 16 | ## Goals 17 | 18 | My main goal is to demonstrate various elements which I expect to see in the project, while allowing the students to ask questions as we build the analysis in real time together. 19 | 20 | Here are some related research questions which we can examine: 21 | 22 | * What is the number of words/script lines spoken by each Starfleet captain? 23 | * What is the difference between a captain, number.1, and other characters? 24 | * What is the relationship between sentence length (number of words) and Starfleet captains? 25 | * Is there a gender bias in different series/episodes? Which series reflects more female characters (in terms of script words or script lines)? 26 | * What is the number of words per episode? Is it randomly distributed? 27 | 28 | # Data Import and Tidying 29 | 30 | The source is in JSON files - non-tabular data which is hard to work with. Hence, we must first turn it into a tidy format. The data was read from the JSON files and put into the tidy format loaded below. 31 | 32 | ```{r read startrek script data} 33 | suppressWarnings(suppressMessages(library(tidyverse))) 34 | trek <- read_csv("https://github.com/adisarid/startrek_plumber_api/blob/master/raw_data/characters_words.csv?raw=true") 35 | glimpse(trek) 36 | ``` 37 | 38 | # Transformation, Visualization, and Modelling 39 | 40 | ## Which Starfleet captains appear in the data? 41 | 42 | First, let's determine which Starfleet captains appear in the data. 43 | 44 | ```{r starfleet captain} 45 | trek %>% 46 | group_by(character, series) %>% 47 | summarize(num_words = sum(total_words)) %>% 48 | arrange(desc(num_words)) 49 | ``` 50 | 51 | The Starfleet captains who appear are PICARD, JANEWAY, KIRK, SISKO, and ARCHER.
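A minimal sketch of that filtering step might look like the chunk below (illustrative only: the `character` column comes from the `glimpse()` output above, but the exact spelling of the captain values is an assumption about the data):

```{r filter captains sketch}
# Keep only the rows that belong to the five Starfleet captains identified above
captains <- c("PICARD", "JANEWAY", "KIRK", "SISKO", "ARCHER")

captain_lines <- trek %>% 
  filter(character %in% captains)

captain_lines
```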
With a filter like the one sketched above, we can examine the captains in more detail. 52 | 53 | # Conclusions -------------------------------------------------------------------------------- /project/examples/FIFA 2019 Analysis - Inbar Siloni.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "FIFA 2019 Analysis - Inbar Siloni" 3 | author: "Inbar Siloni" 4 | date: "18 1 2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | # Background 12 | 13 | In this markdown, I analyze [this](https://www.kaggle.com/karangadiya/fifa19/download) data of FIFA players. 14 | 15 | ## Goals 16 | 17 | In my research, I will focus on two main questions: 18 | 19 | * Is there a correlation between a player's abilities, his age and his value? 20 | 21 | * Do players score the same if they are right-footed and left-footed? 22 | 23 | 24 | ```{r libraries, message=FALSE,warning=FALSE, echo=FALSE} 25 | library(tidyverse) 26 | library(broom) 27 | library(knitr) 28 | library(readxl) 29 | library(here) 30 | library(janitor) 31 | library(dplyr) 32 | library(ggcorrplot) 33 | library(scales) 34 | library(RColorBrewer) 35 | library(stringi) 36 | library(agricolae) 37 | ``` 38 | 39 | ## Importing the data set 40 | 41 | The dataset was in an Excel file, a convenient format to work with. I had to filter the clubs I wanted to focus on: the 16 top clubs of the 2019 Champions League. Also, I formatted the column names for my convenience. I deleted a few data columns that I will not be using, to make the data more approachable. 42 | ```{r read fifa dataset,message=FALSE, warning=FALSE, echo=FALSE} 43 | alufot <- c("Atlético Madrid", "FC Barcelona", "Real Madrid", 44 | "FC Bayern München", "Tottenham Hotspur", "Paris Saint-Germain", 45 | "Juventus", "Chelsea", "Borussia Dortmund", "Liverpool", "Atalanta", 46 | "Valencia CF", "RB Leipzig", "Napoli", "Olympique Lyonnais", "Manchester City") 47 | 48 | fifa_data <- read_excel("fifa_data.xlsx") %>% 49 | clean_names() %>% 50 | filter(club %in% alufot) 51 | fifa_data <- fifa_data[, c(4,8:10,12,15,22,55:88)] 52 | View(fifa_data) 53 | ``` 54 | 55 | # Transformation, Visualization, and Modelling 56 | 57 | ## Doing some transformation 58 | First, I applied a few transformations to several variables. 
59 | ```{r transforming the dataset, echo=FALSE} 60 | positions <- unique(fifa_data$position) 61 | gk <- "GK" 62 | defs <- positions[str_detect(positions, "B$")] 63 | mids <- positions[str_detect(positions, "M$")] 64 | f1 <- positions[str_detect(positions, "F$")] 65 | f2 <- positions[str_detect(positions, "S$")] 66 | f3 <- positions[str_detect(positions, "T$")] 67 | f4 <- positions[str_detect(positions, "W$")] 68 | fwds <- c(f1, f2, f3, f4) 69 | fifa_data <- fifa_data %>% 70 | mutate(position_group = ifelse(position %in% gk, "GK", ifelse(position %in% defs, "DEF", ifelse(position %in% mids, "MID", ifelse(position %in% fwds, "FWD", "Unknown"))))) 71 | fifa_data <- fifa_data %>% 72 | mutate( 73 | value_multiplier = ifelse(str_detect(value, "K"), 1000, 74 | ifelse(str_detect(value, "M"), 1000000, 1)), 75 | value_numeric_pounds = as.numeric(str_extract(value,"[[:digit:]]+\\.*[[:digit:]]*")) 76 | * value_multiplier, 77 | age_group = ifelse(age <= 20, "20 and under", 78 | ifelse(age > 20 & age <=25, "21 to 25", 79 | ifelse(age > 25 & age <= 30, "25 to 30", 80 | ifelse(age > 30 & age <= 35, "31 to 35", "Over 35")))), 81 | club=stri_trans_general(club, "Latin-ASCII"), 82 | ) 83 | View(fifa_data) 84 | glimpse(fifa_data) 85 | ``` 86 | 87 | 88 | ## Doing some visualization 89 | 90 | Let's take a look at the clubs whose players we are going to analyze! Ladies and gentlemen, I present to you the UEFA Champions League's top 16 clubs of 2019: 91 | 92 | ```{r clubs} 93 | fifa_data %>% 94 | group_by(club) %>% 95 | summarize() 96 | ``` 97 | 98 | 99 | Now let's take a look at the age distribution of the players: 100 | 101 | ```{r age distribution} 102 | ggplot(data=fifa_data, aes(x=age))+ 103 | geom_density(fill="violet") 104 | ``` 105 | 106 | 107 | ```{r age range} 108 | fifa_data %>% 109 | summarise( 110 | min = min(age), 111 | median = median(age), 112 | max = max(age) 113 | ) 114 | ``` 115 | 116 | 117 | ```{r age group count} 118 | fifa_data %>% 119 | group_by(age_group) %>% 120 | count() 121 | ``` 122 | 123 | We can see that most of the players are under the age of 30 and over the age of 20. The biggest age group is 21 to 25. 124 | 125 | Let's see how their value is distributed: 126 | 127 | ```{r players value distribution} 128 | ggplot(data=fifa_data, aes(x=value_numeric_pounds))+ 129 | geom_density(fill="lavenderblush")+ 130 | scale_x_log10(labels = dollar_format(prefix = "€")) 131 | ``` 132 | 133 | So, as we can see, there are two "humps" in this distribution. A possible explanation for this is the difference in value between squad players (those who play in the league, for example) and players in the club's youth teams or substitutes for the substitutes. Both player types' values are distributed roughly normally, but most of the subs are valued, at most, around the minimum for squad players. 134 | 135 | ## Modelling 136 | 137 | Let's get down to business. 
138 | 139 | First, we will check if there is a correlation between preferred foot and scoring: 140 | 141 | ```{r finishing/preferred leg distribution} 142 | fit1<-lm(formula = finishing~preferred_foot, data= fifa_data) 143 | summary(fit1) 144 | fifa_data %>% 145 | ggplot(aes(x = preferred_foot, y = finishing, 146 | fill = preferred_foot))+ 147 | geom_boxplot()+ 148 | scale_fill_brewer(palette = "Pastel1") 149 | ``` 150 | ```{r another finishing/preferred leg distribution} 151 | t.test(fifa_data$finishing~fifa_data$preferred_foot) 152 | fifa_data %>% 153 | ggplot(aes(x = finishing, 154 | fill = preferred_foot))+ 155 | geom_histogram() 156 | ``` 157 | 158 | 159 | 160 | My null hypothesis was that preferred foot and scoring are not correlated. As we can see in the t-test and linear regression above, the p-value is 0.07948, so at a significance level of 0.05 we cannot say that there is a correlation, and we do not reject the null hypothesis. 161 | 162 | Now to the next research question: let's see if there is a correlation between abilities, age and value! 163 | 164 | First, let's take a look at age vs. value: 165 | 166 | ```{r players value/age distribution} 167 | fit2<-lm(formula = log1p(value_numeric_pounds)~age_group, data= fifa_data) 168 | summary(fit2) 169 | fifa_data %>% 170 | ggplot(aes(x=age_group, y=value_numeric_pounds, fill=age_group))+ 171 | geom_boxplot()+ 172 | scale_y_log10(labels = dollar_format(prefix = "€"))+ 173 | stat_smooth(method = "lm")+ 174 | scale_fill_brewer(palette = "Pastel1") 175 | ``` 176 | 177 | From the regression and chart we can see that players aged 21-35 are valued higher, most notably those in the 25-30 age group. This supports my explanation of the "humps" in the value distribution chart. 178 | 179 | Let's see the correlation for different abilities: 180 | 181 | So, obviously, field players require different abilities than goalkeepers. We will check the correlation for all players and for each group individually. 182 | 183 | Here we check for all players: 184 | 185 | ```{r abilities correlation} 186 | abilities <- fifa_data %>% select(crossing:gk_reflexes) 187 | ability_corr <- cor(abilities) 188 | ggcorrplot(ability_corr, type = "upper", hc.order = T, hc.method = "complete", 189 | colors = c("lightskyblue","white","brown1")) 190 | ``` 191 | 192 | We see that some abilities are correlated with others, and some aren't. Let's see if we can explain this. 193 | Here we check just for field players: 194 | 195 | ```{r field players abilities correlation} 196 | abilities <- fifa_data %>% filter(!position == "GK") %>% select(crossing:sliding_tackle) 197 | ability_corr <- cor(abilities) 198 | ggcorrplot(ability_corr, type = "upper", hc.order = T, hc.method = "complete", 199 | colors = c("lightskyblue","white","brown1")) 200 | ``` 201 | 202 | Once again we see a very strong correlation between some abilities, while others don't correlate at all. The reason may be that abilities like short passing and long passing rely on the same underlying skills, while agility and jumping do not. Overall we learn that most of the abilities are correlated with one another. 
203 | And here we check just for goalkeepers: 204 | 205 | ```{r goal keepers abilities correlation} 206 | abilities <- fifa_data %>% filter(position == "GK") %>% select(gk_diving:gk_reflexes) 207 | ability_corr <- cor(abilities) 208 | ggcorrplot(ability_corr, type = "upper", hc.order = T, hc.method = "complete", 209 | colors = c("lightskyblue","white","indianred1")) 210 | ``` 211 | 212 | For goalkeepers the situation is different. Since their abilities are measured only for, well, goalkeeping, it makes sense that all the abilities will be correlated. Still, we see that kicking is not as correlated as the other abilities, which makes sense because it relates to foot ability and not to body ability (the others are related mostly to the hands). 213 | 214 | Now that we have established the relation between the different abilities, let's see if there is a correlation between the value of a player and his abilities: 215 | 216 | ```{r abilities/value distribution} 217 | fit6<-lm(formula = log1p(value_numeric_pounds) ~ overall, data = fifa_data) 218 | summary(fit6) 219 | fifa_data %>% 220 | ggplot(aes(y=value_numeric_pounds, x= overall))+ 221 | geom_point(color="seagreen3")+ 222 | scale_y_log10(labels = dollar_format(prefix = "€"))+ 223 | stat_smooth(method = "lm",color="black") 224 | ``` 225 | 226 | We see there definitely is a correlation! The p-value is very small and the chart shows a strong connection. But does age matter? 227 | 228 | Let's check if there is a correlation between abilities, value and age: 229 | 230 | ```{r value/age+abilities distribution} 231 | fit <- lm(formula = log1p(value_numeric_pounds) ~ age_group+overall, data = fifa_data) 232 | summary(fit) 233 | fit.av <- aov(fit) 234 | summary(fit.av) 235 | fifa_data %>% 236 | ggplot(aes(y=value_numeric_pounds, x=overall, 237 | group = age_group, 238 | color = age_group))+ 239 | geom_point(size = 2)+ 240 | scale_y_log10(labels = dollar_format(prefix = "€"))+ 241 | stat_smooth(method = "lm")+ 242 | scale_color_brewer(palette = "Set1") 243 | ``` 244 | 245 | 246 | So, as we can see, while value and abilities are correlated, age does affect this correlation. We see that for young players, the same abilities will earn a higher value, while for older players the same abilities will earn a lower value (drastically lower if the player is over 35). All the tests show a clear correlation of value with age and abilities (low p-values). Also, in all of the tests we can clearly see that for players over 35 the value drops much faster. 247 | 248 | 249 | # Conclusion 250 | 251 | I checked two main questions: 252 | 253 | * Is there a correlation between a player's abilities, his age and his value? 254 | 255 | * Do players score the same if they are right-footed and left-footed? 256 | 257 | The first question showed a strong correlation between the three variables, but the second one did not. 258 | I have learned that it doesn't matter if you are a leftie or a rightie; as long as you stay young and talented, football is the right place for you. 259 | 260 | Also, I really held myself back the entire project because I know I'm not supposed to show my opinions, but Real Madrid are the best club in the world, and my next project will show that they rule in every aspect. 261 | 262 | Thank you for reading! -------------------------------------------------------------------------------- /xaringan-themer.css: -------------------------------------------------------------------------------- 1 | /* ------------------------------------------------------- 2 | * 3 | * !! 
This file was generated by xaringanthemer !! 4 | * 5 | * Changes made to this file directly will be overwritten 6 | * if you used xaringanthemer in your xaringan slides Rmd 7 | * 8 | * Issues or likes? 9 | * - https://github.com/gadenbuie/xaringanthemer 10 | * - https://www.garrickadenbuie.com 11 | * 12 | * Need help? Try: 13 | * - vignette(package = "xaringanthemer") 14 | * - ?xaringanthemer::style_xaringan 15 | * - xaringan wiki: https://github.com/yihui/xaringan/wiki 16 | * - remarkjs wiki: https://github.com/gnab/remark/wiki 17 | * 18 | * Version: 0.4.1 19 | * 20 | * ------------------------------------------------------- */ 21 | @import url(https://fonts.googleapis.com/css?family=Rubik:300,300i&display=swap); 22 | @import url(https://fonts.googleapis.com/css?family=Open+Sans&display=swap); 23 | @import url(https://fonts.googleapis.com/css?family=Fira+Mono&display=swap); 24 | 25 | 26 | :root { 27 | /* Fonts */ 28 | --text-font-family: Rubik; 29 | --text-font-is-google: 1; 30 | --text-font-family-fallback: -apple-system, BlinkMacSystemFont, avenir next, avenir, helvetica neue, helvetica, Ubuntu, roboto, noto, segoe ui, arial; 31 | --text-font-base: sans-serif; 32 | --header-font-family: 'Open Sans'; 33 | --header-font-is-google: 1; 34 | --header-font-family-fallback: Georgia, serif; 35 | --code-font-family: 'Fira Mono'; 36 | --code-font-is-google: 1; 37 | --base-font-size: 20px; 38 | --text-font-size: 1rem; 39 | --code-font-size: 0.9rem; 40 | --code-inline-font-size: 1em; 41 | --header-h1-font-size: 2.75rem; 42 | --header-h2-font-size: 2.25rem; 43 | --header-h3-font-size: 1.75rem; 44 | 45 | /* Colors */ 46 | --text-color: #272822; 47 | --header-color: #43418A; 48 | --background-color: #FFFFFF; 49 | --link-color: #43418A; 50 | --text-bold-color: #43418A; 51 | --code-highlight-color: rgba(255,255,0,0.5); 52 | --inverse-text-color: #FFFFFF; 53 | --inverse-background-color: #43418A; 54 | --inverse-header-color: #FFFFFF; 55 | --inverse-link-color: #43418A; 56 | --title-slide-background-color: #43418A; 57 | --title-slide-text-color: #FFFFFF; 58 | --header-background-color: #43418A; 59 | --header-background-text-color: #FFFFFF; 60 | --base: #43418A; 61 | --white: #FFFFFF; 62 | --black: #272822; 63 | } 64 | 65 | html { 66 | font-size: var(--base-font-size); 67 | } 68 | 69 | body { 70 | font-family: var(--text-font-family), var(--text-font-family-fallback), var(--text-font-base); 71 | font-weight: 300; 72 | color: var(--text-color); 73 | } 74 | h1, h2, h3 { 75 | font-family: var(--header-font-family), var(--header-font-family-fallback); 76 | font-weight: 600; 77 | color: var(--header-color); 78 | } 79 | .remark-slide-content { 80 | background-color: var(--background-color); 81 | font-size: 1rem; 82 | padding: 16px 64px 16px 64px; 83 | width: 100%; 84 | height: 100%; 85 | } 86 | .remark-slide-content h1 { 87 | font-size: var(--header-h1-font-size); 88 | } 89 | .remark-slide-content h2 { 90 | font-size: var(--header-h2-font-size); 91 | } 92 | .remark-slide-content h3 { 93 | font-size: var(--header-h3-font-size); 94 | } 95 | .remark-code, .remark-inline-code { 96 | font-family: var(--code-font-family), Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace; 97 | } 98 | .remark-code { 99 | font-size: var(--code-font-size); 100 | } 101 | .remark-inline-code { 102 | font-size: var(--code-inline-font-size); 103 | color: #43418A; 104 | } 105 | .remark-slide-number { 106 | color: #43418A; 107 | opacity: 1; 108 | font-size: 0.9rem; 109 | } 110 | strong { 111 | font-weight: bold; 112 | color: 
var(--text-bold-color); 113 | } 114 | a, a > code { 115 | color: var(--link-color); 116 | text-decoration: none; 117 | } 118 | .footnote { 119 | position: absolute; 120 | bottom: 60px; 121 | padding-right: 4em; 122 | font-size: 0.9em; 123 | } 124 | .remark-code-line-highlighted { 125 | background-color: var(--code-highlight-color); 126 | } 127 | .inverse { 128 | background-color: var(--inverse-background-color); 129 | color: var(--inverse-text-color); 130 | 131 | } 132 | .inverse h1, .inverse h2, .inverse h3 { 133 | color: var(--inverse-header-color); 134 | } 135 | .inverse a, .inverse a > code { 136 | color: var(--inverse-link-color); 137 | } 138 | .title-slide, .title-slide h1, .title-slide h2, .title-slide h3 { 139 | color: var(--title-slide-text-color); 140 | } 141 | .title-slide { 142 | background-color: var(--title-slide-background-color); 143 | } 144 | .title-slide .remark-slide-number { 145 | display: none; 146 | } 147 | /* Two-column layout */ 148 | .left-column { 149 | width: 20%; 150 | height: 92%; 151 | float: left; 152 | } 153 | .left-column h2, .left-column h3 { 154 | color: #43418A99; 155 | } 156 | .left-column h2:last-of-type, .left-column h3:last-child { 157 | color: #43418A; 158 | } 159 | .right-column { 160 | width: 75%; 161 | float: right; 162 | padding-top: 1em; 163 | } 164 | .pull-left { 165 | float: left; 166 | width: 47%; 167 | } 168 | .pull-right { 169 | float: right; 170 | width: 47%; 171 | } 172 | .pull-right + * { 173 | clear: both; 174 | } 175 | img, video, iframe { 176 | max-width: 100%; 177 | } 178 | blockquote { 179 | border-left: solid 5px #43418A80; 180 | padding-left: 1em; 181 | } 182 | .remark-slide table { 183 | margin: auto; 184 | border-top: 1px solid #666; 185 | border-bottom: 1px solid #666; 186 | } 187 | .remark-slide table thead th { 188 | border-bottom: 1px solid #ddd; 189 | } 190 | th, td { 191 | padding: 5px; 192 | } 193 | .remark-slide thead, .remark-slide tfoot, .remark-slide tr:nth-child(even) { 194 | background: #D9D9E7; 195 | } 196 | table.dataTable tbody { 197 | background-color: var(--background-color); 198 | color: var(--text-color); 199 | } 200 | table.dataTable.display tbody tr.odd { 201 | background-color: var(--background-color); 202 | } 203 | table.dataTable.display tbody tr.even { 204 | background-color: #D9D9E7; 205 | } 206 | table.dataTable.hover tbody tr:hover, table.dataTable.display tbody tr:hover { 207 | background-color: rgba(255, 255, 255, 0.5); 208 | } 209 | .dataTables_wrapper .dataTables_length, .dataTables_wrapper .dataTables_filter, .dataTables_wrapper .dataTables_info, .dataTables_wrapper .dataTables_processing, .dataTables_wrapper .dataTables_paginate { 210 | color: var(--text-color); 211 | } 212 | .dataTables_wrapper .dataTables_paginate .paginate_button { 213 | color: var(--text-color) !important; 214 | } 215 | 216 | /* Horizontal alignment of code blocks */ 217 | .remark-slide-content.left pre, 218 | .remark-slide-content.center pre, 219 | .remark-slide-content.right pre { 220 | text-align: start; 221 | width: max-content; 222 | max-width: 100%; 223 | } 224 | .remark-slide-content.left pre, 225 | .remark-slide-content.right pre { 226 | min-width: 50%; 227 | min-width: min(40ch, 100%); 228 | } 229 | .remark-slide-content.center pre { 230 | min-width: 66%; 231 | min-width: min(50ch, 100%); 232 | } 233 | .remark-slide-content.left pre { 234 | margin-left: unset; 235 | margin-right: auto; 236 | } 237 | .remark-slide-content.center pre { 238 | margin-left: auto; 239 | margin-right: auto; 240 | } 241 | 
.remark-slide-content.right pre { 242 | margin-left: auto; 243 | margin-right: unset; 244 | } 245 | 246 | /* Slide Header Background for h1 elements */ 247 | .remark-slide-content.header_background > h1 { 248 | display: block; 249 | position: absolute; 250 | top: 0; 251 | left: 0; 252 | width: 100%; 253 | background: var(--header-background-color); 254 | color: var(--header-background-text-color); 255 | padding: 2rem 64px 1.5rem 64px; 256 | margin-top: 0; 257 | box-sizing: border-box; 258 | } 259 | .remark-slide-content.header_background { 260 | padding-top: 7rem; 261 | } 262 | 263 | @page { margin: 0; } 264 | @media print { 265 | .remark-slide-scaler { 266 | width: 100% !important; 267 | height: 100% !important; 268 | transform: scale(1) !important; 269 | top: 0 !important; 270 | left: 0 !important; 271 | } 272 | } 273 | 274 | .base { 275 | color: var(--base); 276 | } 277 | .bg-base { 278 | background-color: var(--base); 279 | } 280 | .white { 281 | color: var(--white); 282 | } 283 | .bg-white { 284 | background-color: var(--white); 285 | } 286 | .black { 287 | color: var(--black); 288 | } 289 | .bg-black { 290 | background-color: var(--black); 291 | } 292 | 293 | 294 | 295 | /* Extra CSS */ 296 | .small { 297 | font-size: 70%; 298 | } 299 | .extra-small { 300 | font-size: 50%; 301 | } 302 | .full-width { 303 | display: flex; 304 | width: 100%; 305 | flex: 1 1 auto; 306 | } 307 | --------------------------------------------------------------------------------