├── .Rbuildignore
├── .github
├── .gitignore
└── workflows
│ ├── R-CMD-check.yaml
│ └── pkgdown.yaml
├── .gitignore
├── CRAN-SUBMISSION
├── Code_of_Conduct.md
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R
├── data.R
├── load_student.R
└── zzz.R
├── README.Rmd
├── README.md
├── _pkgdown.yml
├── cran-comments.md
├── data
├── countrycode.rda
├── school.rda
├── student_subset_2000.rda
├── student_subset_2003.rda
├── student_subset_2006.rda
├── student_subset_2009.rda
├── student_subset_2012.rda
├── student_subset_2015.rda
├── student_subset_2018.rda
└── student_subset_2022.rda
├── inst
└── CITATION
├── learningtower.Rproj
├── man
├── countrycode.Rd
├── figures
│ ├── README_school_data_missing_values_summary.png
│ ├── README_student_data_missing_values_summary.png
│ ├── conversation_holden.png
│ ├── logo.png
│ ├── pisa_image.png
│ ├── readme.gif
│ └── readme.png
├── load_student.Rd
├── school.Rd
└── student.Rd
├── student_full_data
├── student_2000.rds
├── student_2003.rds
├── student_2006.rds
├── student_2009.rds
├── student_2012.rds
├── student_2015.rds
├── student_2018.rds
└── student_2022.rds
├── tests
├── testthat.R
└── testthat
│ ├── test-countrycode-col-types.R
│ ├── test-merge.R
│ ├── test-school-col-types.R
│ ├── test-student-col-types.R
│ └── test-test-load.R
└── vignettes
├── .gitignore
├── articles
├── Australia_trends.Rmd
└── exploring_time.Rmd
├── learningtower_school.Rmd
└── learningtower_student.Rmd
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^README\.Rmd$
4 | ^_pkgdown\.yml$
5 | ^docs$
6 | ^pkgdown$
7 | ^\.github$
8 | ^README_cache$
9 | ^data_raw$
10 | ^LICENSE\.md$
11 | ^pkgdown$
12 | ^vignettes/articles$
13 | ^student_full_data$
14 | ^school_full_data$
15 | ^CODE_OF_CONDUCT\.md$
16 | ^CRAN-RELEASE$
17 | ^cran-comments\.md$
18 | ^man/figures/readme.gif$
19 | ^man/figures/README_student_data_missing_values_summary.png$
20 | ^man/figures/README_school_data_missing_values_summary.png$
21 | ^\.DS_Store$
22 | ^CRAN-SUBMISSION$
23 |
--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches:
6 | - master
7 | - '*'
8 | pull_request:
9 | branches:
10 | - master
11 | - '*'
12 |
13 | name: R-CMD-check
14 |
15 | jobs:
16 | R-CMD-check:
17 | runs-on: ${{ matrix.config.os }}
18 |
19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }})
20 |
21 | strategy:
22 | fail-fast: false
23 | matrix:
24 | config:
25 | - {os: macos-latest, r: 'release'}
26 | - {os: windows-latest, r: 'release'}
27 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
28 | - {os: ubuntu-latest, r: 'release'}
29 | - {os: ubuntu-latest, r: 'oldrel-1'}
30 |
31 | env:
32 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
33 | R_KEEP_PKG_SOURCE: yes
34 |
35 | steps:
36 | - uses: actions/checkout@v3
37 |
38 | - uses: r-lib/actions/setup-pandoc@v2
39 |
40 | - uses: r-lib/actions/setup-r@v2
41 | with:
42 | r-version: ${{ matrix.config.r }}
43 | http-user-agent: ${{ matrix.config.http-user-agent }}
44 | use-public-rspm: true
45 |
46 | - uses: r-lib/actions/setup-r-dependencies@v2
47 | with:
48 | extra-packages: any::rcmdcheck
49 | needs: check
50 |
51 | - uses: r-lib/actions/check-r-package@v2
52 | with:
53 | upload-snapshots: true
54 |
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 | branches: [main, master]
8 | release:
9 | types: [published]
10 | workflow_dispatch:
11 |
12 | name: pkgdown
13 |
14 | jobs:
15 | pkgdown:
16 | runs-on: ubuntu-latest
17 | # Only restrict concurrency for non-PR jobs
18 | concurrency:
19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
20 | env:
21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
22 | steps:
23 | - uses: actions/checkout@v3
24 |
25 | - uses: r-lib/actions/setup-pandoc@v2
26 |
27 | - uses: r-lib/actions/setup-r@v2
28 | with:
29 | use-public-rspm: true
30 |
31 | - uses: r-lib/actions/setup-r-dependencies@v2
32 | with:
33 | extra-packages: any::pkgdown, local::.
34 | needs: website
35 |
36 | - name: Install additional dependencies
37 | run: |
38 | install.packages("remotes")
39 | install.packages("tidyverse")
40 | install.packages("rmarkdown")
41 | install.packages("broom")
42 | install.packages("knitr")
43 | install.packages("ggrepel")
44 | install.packages("patchwork")
45 | install.packages("gghighlight")
46 | install.packages("brolgar")
47 | install.packages("tsibble")
48 | install.packages("lme4")
49 | install.packages("gganimate")
50 | install.packages("ggfortify")
51 | install.packages("sjPlot")
52 | install.packages("kableExtra")
53 | shell: Rscript {0}
54 |
55 | - name: Build site
56 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
57 | shell: Rscript {0}
58 |
59 | - name: Deploy to GitHub pages
60 | if: github.event_name != 'pull_request'
61 | uses: JamesIves/github-pages-deploy-action@v4.4.1
62 | with:
63 | clean: false
64 | branch: gh-pages
65 | folder: docs
66 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | docs
6 | inst/doc
7 | doc
8 | Meta
9 | .DS_Store
10 |
--------------------------------------------------------------------------------
/CRAN-SUBMISSION:
--------------------------------------------------------------------------------
1 | Version: 1.1.0
2 | Date: 2024-12-21 01:56:18 UTC
3 | SHA: e46f5094a6d05f4406087ca802e3466209a0abf3
4 |
--------------------------------------------------------------------------------
/Code_of_Conduct.md:
--------------------------------------------------------------------------------
1 | # Contributor Code of Conduct
2 |
3 | As contributors and maintainers of this project, we pledge to respect all people who
4 | contribute through reporting issues, posting feature requests, updating documentation,
5 | submitting pull requests or patches, and other activities.
6 |
7 | We are committed to making participation in this project a harassment-free experience for
8 | everyone, regardless of level of experience, gender, gender identity and expression,
9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
10 |
11 | Examples of unacceptable behavior by participants include the use of sexual language or
12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment,
13 | insults, or other unprofessional conduct.
14 |
15 | Project maintainers have the right and responsibility to remove, edit, or reject comments,
16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this
17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed
18 | from the project team.
19 |
20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by
21 | opening an issue or contacting one or more of the project maintainers.
22 |
23 | This Code of Conduct is adapted from the Contributor Covenant
24 | (http://contributor-covenant.org), version 1.0.0, available at
25 | http://contributor-covenant.org/version/1/0/0/
26 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: learningtower
2 | Title: OECD PISA Datasets from 2000-2022 in an Easy-to-Use Format
3 | Version: 1.1.0
4 | Authors@R: c(
5 | person(given = "Kevin", family = "Wang", role = c("aut", "cre"),
6 | email = "kevinwangstats@gmail.com"),
7 | person(given = "Paul", family = "Yacobellis", role = "aut",
8 | email = "pyacobellis@hotmail.com"),
9 | person(given = "Erika", family = "Siregar", role = "aut",
10 | email = "erika.mukhlisina@gmail.com"),
11 | person(given = "Sarah", family = "Romanes", role = "aut",
12 | email = "srom8308@uni.sydney.edu.au"),
13 | person(given = "Kim", family = "Fitter", role = "aut",
14 | email = "kimfitter@yahoo.com"),
15 | person(given = "Giulio", family = "Valentino Dalla Riva", role = "aut",
16 | email = "me@gvdallariva.net"),
17 | person(given = "Dianne", family = "Cook", role = "aut",
18 | email = "dicook@monash.edu"),
19 | person(given = "Nick", family = "Tierney", role = "aut",
20 | email = "nicholas.tierney@gmail.com"),
21 | person(given = "Priya", family = "Dingorkar", role = "aut",
22 | email = "priyadingorkar@gmail.com"),
23 | person(given = "Shabarish", family = "Sai Subramanian", role = "aut",
24 | email = "shabarish161@gmail.com"),
25 | person(given = "Guan Ru", family = "Chen", role = "aut",
26 | email = "rix09207@gmail.com")
27 | )
28 | Description: The Programme for International Student Assessment (PISA) is a global study conducted by the Organization for Economic Cooperation and Development (OECD) in member and non-member countries to assess educational systems by assessing 15-year-old school students' academic performance in mathematics, science, and reading. This dataset contains information on their scores and other socioeconomic characteristics, information about their school and its infrastructure, as well as the countries that are taking part in the program.
29 | Depends: R (>= 3.5.0)
30 | Encoding: UTF-8
31 | RoxygenNote: 7.3.2
32 | VignetteBuilder: knitr
33 | License: MIT + file LICENSE
34 | URL: https://kevinwang09.github.io/learningtower/, https://github.com/kevinwang09/learningtower
35 | BugReports: https://github.com/kevinwang09/learningtower/issues
36 | Imports:
37 | tibble,
38 | dplyr
39 | Suggests:
40 | testthat (>= 3.0.0),
41 | knitr,
42 | rmarkdown,
43 | ggplot2,
44 | forcats,
45 | scales
46 | Config/testthat/edition: 3
47 | LazyData: true
48 | LazyDataCompression: xz
49 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2024
2 | COPYRIGHT HOLDER: learningtower authors
3 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | Copyright (c) 2024 learningtower authors
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(load_student)
4 | importFrom(dplyr,bind_rows)
5 | importFrom(tibble,tibble)
6 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # learningtower 1.1.0
2 |
3 | * Added 2022 data set.
4 | * Updates to the README and contributor information.
5 | * Added a `NEWS.md` file to track changes to the package.
6 | * `year` column in both the `student` and the `school` datasets are changed from a factor column to an integer column.
7 | * `school_id` column in both the `student` and the `school` datasets are changed from a factor column to a character column.
8 |
--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
1 | #' @title Processed and Sampled PISA Student Data (2000-2022)
2 | #'
3 | #' @description This dataset provides a clean and processed subset of the OECD PISA student data
4 | #' for the years 2000-2022. The original data is sourced from
5 | #' \url{https://www.oecd.org/en/about/programmes/pisa/pisa-data.html} and has been prepared for analysis.
6 | #' A sampling of 50 students per country (for OECD countries) has been included for each year.
7 | #' The data curation and sampling process are documented in \url{https://github.com/kevinwang09/learningtower_masonry/blob/master/Code/student_bind_rows.Rmd}
8 | #'
9 | #' @format A tibble of the following variables
10 | #' \itemize{
11 | #' \item \code{year}: Year of the PISA data. Integer.
12 | #' \item \code{country}: Country 3 character code. Note that some regions/territories are coded as "country" for ease of input. Factor.
13 | #' \item \code{school_id}: Unique school identifier for each country and year. Character.
14 | #' \item \code{student_id}: Unique student identifier within each school. Integer.
15 | #' \item \code{mother_educ}: Mother's highest level of education, from "less than ISCED1" to "ISCED 3A". Factor.
16 | #' \item \code{father_educ}: Father's highest level of education, from "less than ISCED1" to "ISCED 3A". Factor.
17 | #' \item \code{gender}: Gender of the student. Only "male" and "female" are recorded. Factor.
18 | #' Note that we call this variable gender and not sex as this term was used in the OECD PISA database.
19 | #' \item \code{computer}: Possession of computer. Only "yes" and "no" are recorded. Factor.
20 | #' \item \code{internet}: Access to internet. Only "yes" and "no" are recorded. Factor.
21 | #' \item \code{math}: Simulated score in mathematics. Numeric.
22 | #' \item \code{read}: Simulated score in reading. Numeric.
23 | #' \item \code{science}: Simulated score in science. Numeric.
24 | #' \item \code{stu_wgt}: The final survey weight score for the student score. Numeric.
25 | #' \item \code{desk}: Possession of desk to study at. Only "yes" and "no" are recorded. Factor.
26 | #' \item \code{room}: Possession of a room of your own. Only "yes" and "no" are recorded. Factor.
27 | #' \item \code{dishwasher}: Possession of a dishwasher. Only "yes" and "no" are recorded. Factor.
28 | #' Note that in 2015 and 2018, all entries are missing.
29 | #' \item \code{television}: Number of televisions.
30 | #' "0", "1", "2" are code for no, one and two TVs in the house. "3+" codes for three or more TVs. Factor.
31 | #' Note that in 2003, all entries are missing.
32 | #' \item \code{computer_n}: Number of computers.
33 | #' "0", "1", "2" are code for no, one and two computers in the house. "3+" codes for three or more computers. Factor.
34 | #' Note that in 2003, all entries are missing.
35 | #' \item \code{car}: Number of cars.
36 | #' "0", "1", "2" are code for no, one and two cars in the house. "3+" codes for three or more cars. Factor.
37 | #' Note that in 2003, all entries are missing.
38 | #' \item \code{book}: Number of books. Factor.
39 | #' Note that encoding is different in the years 2000 and 2003 compared to all other years. Factor.
40 | #' Evaluate \code{table(student$book, student$year)} for a demo.
41 | #' \item \code{wealth}: Index of family wealth. Numeric.
42 | #' Note that in 2003, all entries are missing.
43 | #' \item \code{escs}: Index of economic, social and cultural status. Numeric.
44 | #' }
45 | #' @docType data
46 | #' @name student
47 | #' @rdname student
48 | #' @importFrom dplyr bind_rows
49 | #' @examples
50 | #' library(dplyr)
51 | #' data(student_subset_2000)
52 | #' data(student_subset_2003)
53 | #' dplyr::bind_rows(
54 | #' student_subset_2000,
55 | #' student_subset_2003
56 | #' )
57 | NULL
58 |
59 |
60 | #' @docType data
61 | #' @name student_subset_2000
62 | #' @rdname student
63 | NULL
64 |
65 | #' @docType data
66 | #' @name student_subset_2003
67 | #' @rdname student
68 | NULL
69 |
70 | #' @docType data
71 | #' @name student_subset_2006
72 | #' @rdname student
73 | NULL
74 |
75 | #' @docType data
76 | #' @name student_subset_2009
77 | #' @rdname student
78 | NULL
79 |
80 | #' @docType data
81 | #' @name student_subset_2012
82 | #' @rdname student
83 | NULL
84 |
85 | #' @docType data
86 | #' @name student_subset_2015
87 | #' @rdname student
88 | NULL
89 |
90 | #' @docType data
91 | #' @name student_subset_2018
92 | #' @rdname student
93 | NULL
94 |
95 | #' @docType data
96 | #' @name student_subset_2022
97 | #' @rdname student
98 | NULL
99 |
100 |
101 | #' @title Subset of the School data available for the years 2000-2022 from the PISA OECD database
102 | #'
103 | #' @description A subset data containing school weight and other information
104 | #' from the triennial testing of 15 year olds around
105 | #' the globe. Original data available from
106 | #' \url{https://www.oecd.org/en/about/programmes/pisa/pisa-data.html}.
107 | #'
108 | #' @format A tibble of the following variables
109 | #' \itemize{
110 | #' \item \code{year}: Year of the PISA data. Integer.
111 | #' \item \code{country}: Country 3 character code. Note that some regions/territories are coded as country for ease of input. Character.
112 | #' \item \code{school_id}: The school identification number, unique for each country and year combination. Character.
113 | #' \item \code{fund_gov}: Percentage of total funding for school year from government. Numeric.
114 | #' \item \code{fund_fees}: Percentage of total funding for school year from student fees or school charges paid by parents. Numeric.
115 | #' \item \code{fund_donation}: Percentage of total funding for school year from
116 | #' benefactors, donations, bequests, sponsorship, parent fundraising. Numeric.
117 | #' \item \code{enrol_boys}: Number of boys in the school. Numeric.
118 | #' \item \code{enrol_girls}: Number of girls in the school. Numeric.
119 | #' \item \code{stratio}: Student-Teacher ratio. Numeric.
120 | #' \item \code{public_private}: Is the school a public or private school. Factor.
121 | #' \item \code{staff_shortage}: Shortage of staff. Numeric.
122 | #' \item \code{sch_wgt}: The final survey weight score for the schools. Numeric.
123 | #' \item \code{school_size}: The school size. Numeric.
124 | #' }
125 | #' @docType data
126 | #' @name school
127 | NULL
128 |
129 | #' @title Country iso3c and name mapping for PISA OECD countries participants.
130 | #'
131 | #' @description A dataset containing mapping of the country ISO code to the country names.
132 | #' More information on participating countries can be found at
133 | #' \url{https://www.oecd.org/pisa/aboutpisa/pisa-participants.htm}.
134 | #'
135 | #' @format A tibble of the following variables
136 | #' \itemize{
137 | #' \item \code{country}: Country 3 character code. Note that some regions/territories are coded as country for ease of input. Character.
138 | #' \item \code{country_name}: Country name. Note that some regions/territories are coded as country for ease of input. Character.
139 | #' }
140 | #' @docType data
141 | #' @name countrycode
142 | NULL
143 |
144 |
--------------------------------------------------------------------------------
/R/load_student.R:
--------------------------------------------------------------------------------
#' @title Load the full PISA student data for one or more survey years
#'
#' @description \code{load_student()} downloads the complete PISA student
#' data for the requested survey year(s) between 2000 and 2022. Pass one or
#' more years, or the string \code{"all"} to obtain every available year.
#' The data for each year is downloaded from the package's GitHub repository,
#' so an internet connection is required.
#'
#' @param year The survey year(s) to download, given as numbers or strings.
#' Must be among 2000, 2003, 2006, 2009, 2012, 2015, 2018 and 2022, or the
#' string \code{"all"}. If \code{"all"} appears anywhere in \code{year},
#' every available year is downloaded.
#'
#' @importFrom dplyr bind_rows
#' @importFrom tibble tibble
#'
#' @return The student data for the requested year(s) from 2000-2022,
#' row-bound into a single tibble.
#'
#' @usage load_student(year = "2000")
#'
#' @examples
#' \dontrun{
#' library(learningtower)
#' student_all <- load_student("all")
#' student_2000 <- load_student("2000")
#' }
#'
#' @export
load_student <- function(year = "2000") {
  survey_years <- c("2000", "2003", "2006", "2009",
                    "2012", "2015", "2018", "2022")

  # Accept numeric input such as load_student(2018) or c(2012, 2018).
  year <- as.character(year)

  # Fail early, naming the offending value(s), before any download starts.
  unknown <- setdiff(year, c(survey_years, "all"))
  if (length(unknown) > 0) {
    stop(
      "`year` must be one of ", paste(survey_years, collapse = ", "),
      " or \"all\"; got: ", paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }

  # If "all" is in the year vector, we will download everything
  if ("all" %in% year) {
    year <- survey_years
  }

  # Download each year into its own list slot and bind once at the end,
  # rather than re-copying a growing result on every iteration.
  yearly <- lapply(year, function(this_year) {
    message("Downloading year ", this_year, "...\n")
    download_single_student(year = this_year)
  })

  # Leading with an empty tibble keeps the result a tibble, including the
  # case where no year was requested at all.
  dplyr::bind_rows(tibble::tibble(), yearly)
}
51 |
# Download and read the full student data for a single survey year.
#
# `year` is pasted into the file name `student_<year>.rds` under the
# `student_full_data` directory of the package's GitHub repository. The file
# is downloaded to a temporary location and read with `readRDS()`; the object
# stored in the file is returned. Any download or read error propagates to
# the caller.
download_single_student <- function(year) {
  url_git <- base::paste0(
    "https://github.com/kevinwang09/learningtower/raw/master/student_full_data/student_",
    year, ".rds"
  )
  tmp <- tempfile(fileext = ".rds")
  # Remove the temporary download on exit (success or error) so repeated
  # calls do not pile up large files in the session's temp directory.
  on.exit(unlink(tmp), add = TRUE)
  # .rds files are binary: request a binary transfer explicitly so the file
  # is not corrupted by text-mode writing on Windows.
  utils::download.file(url = url_git, destfile = tmp, mode = "wb")
  base::readRDS(file = tmp)
}
58 |
--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
# Span of PISA survey years covered by the package, as a display string.
# Used when building the package startup message.
data_range <- function() {
  "2000 - 2022"
}
4 |
# Package attach hook: emits a startup message describing the range of years
# covered and pointing users to `load_student()` for the full data.
#
# `libname` and `pkgname` are supplied by R when the package is attached.
.onAttach <- function(libname, pkgname) {
  startup_note <- paste0(
    "The learningtower package (version ",
    # Read the version from the package metadata instead of hard-coding it,
    # so the message cannot go stale when the package version is bumped.
    as.character(utils::packageVersion(pkgname)), ")",
    " provides data from OECD PISA database between ", data_range(), ".",
    " For package size reasons, only a small subset is provided in the package. Use the function `load_student()` to access the full data."
  )
  packageStartupMessage(startup_note)
}
12 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 |
5 | ```{r, include = FALSE}
6 | knitr::opts_chunk$set(
7 | collapse = TRUE,
8 | warning = FALSE,
9 | message = FALSE,
10 | comment = "#>",
11 | fig.path = "man/figures/README-",
12 | out.width = "54%",
13 | fig.align = "center")
14 | library(tidyverse)
15 | library(learningtower)
16 | ```
17 |
18 | # learningtower
19 |
20 | [](https://github.com/kevinwang09/learningtower/actions/workflows/R-CMD-check.yaml)
21 |
22 | The goal of `learningtower` is to provide a user-friendly R package to provide easy access to a subset of variables from PISA data collected from the [OECD](https://www.oecd.org/pisa/data/). Version `r utils::packageVersion("learningtower")` of this package provides the data for the years `r learningtower:::data_range()`. The survey data is published every three years. This is an excellent real world dataset for data exploring, data visualising and statistical computations.
23 |
24 | ## What is the PISA dataset?
25 |
26 |
27 |
28 |
29 |
30 |
31 | The Programme for International Student Assessment (PISA) is an international assessment measuring student performance in reading, mathematical and scientific literacy.
32 |
33 | PISA assesses the extent to which 15-year-old students have acquired some of the knowledge and skills that are essential for full participation in society, and how well they are prepared for lifelong learning in the areas of reading, mathematical and scientific literacy.
34 |
35 | In 2022, PISA involved 79 countries and 600,000+ students worldwide.
36 |
37 | Read more about the Programme [here](https://www.oecd.org/en/about/programmes/pisa.html).
38 |
39 |
40 | ## Installation
41 |
42 | You can install the `learningtower` package from [CRAN](https://CRAN.R-project.org) with:
43 |
44 | ``` r
45 | install.packages("learningtower")
46 | ```
47 |
48 | To install the development version of `learningtower` from [GitHub](https://github.com/) use:
49 |
50 | ``` r
51 | devtools::install_github("kevinwang09/learningtower")
52 | ```
53 |
54 | ## Data Description
55 |
56 | The `learningtower` package gives access to a subset of variables from the PISA data, which is originally collected by and available from the [OECD](https://www.oecd.org/pisa/data/) on a three-year basis.
57 |
58 | The `learningtower` package contains mainly three datasets:
59 |
60 | + `student`
61 | + `school`
62 | + `countrycode`
63 |
64 | This provides us with information about the students' scores in mathematics, reading and science, their school details, and which country they are from. The data provided in this package is a cleaned version of the full data published by the PISA organisation, with reproducible code available in [this repository](https://github.com/kevinwang09/learningtower_masonry).
65 |
66 | The number of entries for the `student` and `school` data are shown below.
67 |
68 | ```{r, eval=FALSE, echo=FALSE}
69 | library(learningtower)
70 | student = load_student("all")
71 | data(school)
72 |
73 | library(dplyr)
74 |
75 | student_summary <- student |>
76 | group_by(year) |>
77 | tally(name = "Number of Students")
78 |
79 | school_summary <- school |>
80 | group_by(year) |>
81 | tally(name = "Number of Schools")
82 |
83 | combined_summary <- full_join(student_summary, school_summary, by = "year")
84 |
85 | knitr::kable(combined_summary)
86 | ```
87 |
88 | | Year | Number of Students | Number of Schools |
89 | |------|--------------------:|------------------:|
90 | | 2000 | 127,236 | 8,526 |
91 | | 2003 | 276,165 | 10,274 |
92 | | 2006 | 398,750 | 14,365 |
93 | | 2009 | 515,958 | 18,641 |
94 | | 2012 | 480,174 | 18,139 |
95 | | 2015 | 519,334 | 17,908 |
96 | | 2018 | 612,004 | 21,903 |
97 | | 2022 | 613,744 | 21,629 |
98 |
99 | ### Student Dataset
100 |
101 | The `student` dataset comprises the scores from the triennial testing of 15-year-olds worldwide. In addition, this dataset contains interesting information on their parents' qualifications, family wealth, gender, and possession of computers, internet, cars, books, rooms, desks, and similar other variables.
102 |
103 | The full dataset is approximately 50MB in size, which is much larger than CRAN's allowed package size limit. As a result, the package itself only includes a random sample of 50 rows from each of the [38 OECD countries](https://en.wikipedia.org/wiki/OECD#Member_countries) for each of the survey years, i.e. `student_subset_2000`, `student_subset_2003` etc.
104 |
105 | The `student` subset dataset can be loaded easily. See `?student` for detailed information on the measured variables.
106 |
107 | ```{r}
108 | library(learningtower)
109 |
110 | data(student_subset_2018)
111 | dim(student_subset_2018)
112 | ```
113 |
114 | The entire `student` data can be downloaded using the `load_student` function.
115 |
116 | ```{r, eval=FALSE}
117 | #load the entire student data for a single year
118 | student_data_2018 <- load_student(2018)
119 |
120 | #load the entire student data for two of the years (2012, 2018)
121 | student_data_2012_2018 <- load_student(c(2012, 2018))
122 |
123 | #load the entire student data
124 | student_data_all <- load_student("all")
125 | ```
126 |
127 | Note that because of changing data specifications over the survey years, not all variables were measured consistently across the years.
128 |
129 |
130 |
131 |
132 |
133 | ### School Dataset
134 |
135 | The `school` dataset comprises school weight and other information such as the funding distribution of the schools, whether the school is private or public, the enrollment of boys and girls, the school size, and similar other characteristics of interest of different schools these 15-year-olds attend throughout the world.
136 |
137 | - The school subset dataset can be loaded as follows
138 |
139 | ```{r}
140 | # loading the school data
141 | data(school)
142 | ```
143 |
144 | See `?school` for more information on the different variables present in the school dataset.
145 |
146 |
147 |
148 |
149 |
150 | ### Countrycode Dataset
151 |
152 | The countrycode dataset contains mapping of the [country ISO code to the country name](https://www.oecd.org/content/dam/oecd/en/about/programmes/edu/pisa/publications/technical-report/PISA2015_TechRep_Final.pdf). More information on the participating countries can be found [here](https://www.oecd.org/en/about/programmes/pisa/pisa-participants.html).
153 |
154 | ```{r}
155 | # loading the countrycode data
156 | data(countrycode)
157 | head(countrycode)
158 | ```
159 |
160 |
161 | Notes on countries
162 | + Not all data entries in the `countrycode` are countries. For example, "QCN" refers to "Shanghai-China".
163 | + Due to differences in country codes, not all `student_subset_yyyy` data has all 38 OECD countries.
164 |
165 |
166 | See `?countrycode` for more detailed information on the countries that participated in the PISA experiment.
167 |
168 | ## Exploring the data
169 |
170 | The plot below shows the weighted mean of the mathematics scores of these 15-year-old students for a few selected countries over the available years.
171 |
172 | ```{r, eval = FALSE, echo = FALSE}
173 | library(dplyr)
174 | library(learningtower)
175 |
176 | student <- load_student("all")
177 |
178 | p = student |>
179 | dplyr::filter(country %in% c("SGP","CAN", "FIN", "NZL",
180 | "USA", "JPN", "GBR", "AUS")) |>
181 | group_by(year, country) |>
182 | summarise(math = weighted.mean(math, stu_wgt, na.rm=TRUE)) |>
183 | ggplot(aes(x=year, y=math, group=country, color = country)) +
184 | geom_line(alpha=0.6, linewidth = 2) +
185 | geom_point(alpha=0.6, size=3)+
186 | ylim(c(450, 600)) +
187 | theme_minimal() +
188 | labs(x = "Year",
189 | y = "Score",
190 | title = "Math Scores 2000 - 2022") +
191 | theme(text = element_text(size=10),
192 | legend.title = element_blank()) +
193 | scale_color_brewer(palette = "Dark2")
194 |
195 | ggsave(p, filename = "man/figures/readme.png", width=6, height=4)
196 | ```
197 |
198 |
199 |
200 |
201 |
202 |
203 | - Similarly, you can find more code examples and data visualizations for exploring `learningtower` through our vignettes and articles
204 |
205 | - Further data exploration can be found in our articles exploring temporal trends [here](https://kevinwang09.github.io/learningtower/articles/articles/exploring_time.html).
206 |
207 | ## Citation
208 |
209 | To cite the `learningtower` package, please use:
210 |
211 | ```{r}
212 | citation("learningtower")
213 | ```
214 |
215 | ## Motivation for `learningtower`
216 |
217 | + The PISA 2018 results were released on 3 December 2019. This led to wringing of hands in the Australian press, with titles of stories like [Vital Signs: Australia's slipping student scores will lead to greater income inequality](https://theconversation.com/vital-signs-australias-slipping-student-scores-will-lead-to-greater-income-inequality-128301) and [In China, Nicholas studied maths 20 hours a week. In Australia, it's three](https://www.smh.com.au/education/in-china-nicholas-studied-maths-20-hours-a-week-in-australia-it-s-three-20191203-p53ggv.html).
218 |
219 |
220 |
221 |
222 |
223 | + Australia's neighbours, New Zealand and Indonesia, are also worrying: [New Zealand top-end in OECD's latest PISA report but drop in achievements 'worrying'](https://www.stuff.co.nz/national/education/117890945/new-zealand-topend-in-oecds-latest-pisa-report-but-drop-in-achievements-worrying), [Not even mediocre? Indonesian students score low in math, reading, science: PISA report](https://www.thejakartapost.com/news/2019/12/04/not-even-mediocre-indonesian-students-score-low-in-math-reading-science-pisa-report.html).
224 |
225 | + The data from this survey, and from all of the surveys conducted since the first collection in 2000, is publicly available. We decided to make a more convenient subset of the data available in a new R package, called `learningtower`.
226 |
227 |
228 | ## Acknowledgement
229 |
230 | The work to make the data available is the effort of several researchers from Australia, New Zealand and Indonesia, conducted as part of the [ROpenSci OzUnconf](https://ozunconf19.ropensci.org) held in Sydney, Dec 11-13, 2019.
231 |
232 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # learningtower
3 |
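4 | [![R-CMD-check](https://github.com/kevinwang09/learningtower/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/kevinwang09/learningtower/actions/workflows/R-CMD-check.yaml)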
5 |
6 | The goal of `learningtower` is to provide a user-friendly R package that
7 | gives easy access to a subset of variables from PISA data collected
8 | from the [OECD](https://www.oecd.org/pisa/data/). Version 1.1.0 of this
9 | package provides the data for the years 2000 - 2022. The survey data is
10 | published every three years. This is an excellent real world dataset for
11 | data exploration, data visualisation and statistical computation.
12 |
13 | ## What is the PISA dataset?
14 |
15 |
16 |
17 |
18 |
19 | The Programme for International Student Assessment (PISA) is an
20 | international assessment measuring student performance in reading,
21 | mathematical and scientific literacy.
22 |
23 | PISA assesses the extent to which 15-year-old students have acquired
24 | some of the knowledge and skills that are essential for full
25 | participation in society, and how well they are prepared for lifelong
26 | learning in the areas of reading, mathematical and scientific literacy.
27 |
28 | In 2022, PISA involved 79 countries and 600,000+ students worldwide.
29 |
30 | Read more about the Programme
31 | [here](https://www.oecd.org/en/about/programmes/pisa.html).
32 |
33 | ## Installation
34 |
35 | You can install the `learningtower` package from
36 | [CRAN](https://CRAN.R-project.org) with:
37 |
38 | ``` r
39 | install.packages("learningtower")
40 | ```
41 |
42 | To install the development version of `learningtower` from
43 | [GitHub](https://github.com/) use:
44 |
45 | ``` r
46 | devtools::install_github("kevinwang09/learningtower")
47 | ```
48 |
49 | ## Data Description
50 |
51 | The `learningtower` package gives access to a subset of variables from
52 | PISA data, which are collected on a three-year basis and are
53 | available from the
54 | [OECD](https://www.oecd.org/pisa/data/).
55 |
56 | The `learningtower` package contains mainly three datasets:
57 |
58 | - `student`
59 | - `school`
60 | - `countrycode`
61 |
62 | This provides us with information about the students' scores in
63 | mathematics, reading and science, their school details, and which
64 | country they are from. The data provided in this package is a cleaned
65 | version of the full data published by the PISA organisation, with
66 | reproducible code available in [this
67 | repository](https://github.com/kevinwang09/learningtower_masonry).
68 |
69 | The number of entries for the `student` and `school` data are shown
70 | below.
71 |
72 | | Year | Number of Students | Number of Schools |
73 | |------|-------------------:|------------------:|
74 | | 2000 | 127,236 | 8,526 |
75 | | 2003 | 276,165 | 10,274 |
76 | | 2006 | 398,750 | 14,365 |
77 | | 2009 | 515,958 | 18,641 |
78 | | 2012 | 480,174 | 18,139 |
79 | | 2015 | 519,334 | 17,908 |
80 | | 2018 | 612,004 | 21,903 |
81 | | 2022 | 613,744 | 21,629 |
82 |
83 | ### Student Dataset
84 |
85 | The `student` dataset comprises the scores from the triennial testing
86 | of 15-year-olds worldwide. In addition, this dataset contains
87 | interesting information on their parents' qualifications, family wealth,
88 | gender, and possession of computers, internet, cars, books, rooms,
89 | desks, and similar other variables.
90 |
91 | The full dataset is approximately 50MB in size, which is much larger
92 | than CRAN's allowed package size limit. As a result, the package
93 | itself only includes a random sample of 50 rows from each of the [38 OECD
94 | countries](https://en.wikipedia.org/wiki/OECD#Member_countries), for
95 | each of the survey years, i.e. `student_subset_2000`,
96 | `student_subset_2003` etc.
97 |
98 | The `student` subset dataset can be loaded easily. See `?student` for
99 | detailed information on the measured variables.
100 |
101 | ``` r
102 | library(learningtower)
103 |
104 | data(student_subset_2018)
105 | dim(student_subset_2018)
106 | #> [1] 1900 22
107 | ```
108 |
109 | The entire `student` data can be downloaded using the `load_student`
110 | function.
111 |
112 | ``` r
113 | #load the entire student data for a single year
114 | student_data_2018 <- load_student(2018)
115 |
116 | #load the entire student data for two of the years (2012, 2018)
117 | student_data_2012_2018 <- load_student(c(2012, 2018))
118 |
119 | #load the entire student data
120 | student_data_all <- load_student("all")
121 | ```
122 |
123 | Note that because of changing data specification over the survey years,
124 | not all variables were measured consistently across the years.
125 |
126 |
127 |
128 |
129 |
130 | ### School Dataset
131 |
132 | The `school` dataset comprises school weight and other information such
133 | as the funding distribution of the schools, whether the school is
134 | private or public, the enrollment of boys and girls, the school size,
135 | and similar other characteristics of interest of different schools these
136 | 15-year-olds attend throughout the world.
137 |
138 | - The school subset dataset can be loaded as follows
139 |
140 | ``` r
141 | # loading the school data
142 | data(school)
143 | ```
144 |
145 | See `?school` for more information on the different variables present in
146 | the school dataset.
147 |
148 |
149 |
150 |
151 |
152 | ### Countrycode Dataset
153 |
154 | The countrycode dataset contains mapping of the [country ISO code to the
155 | country
156 | name](https://www.oecd.org/content/dam/oecd/en/about/programmes/edu/pisa/publications/technical-report/PISA2015_TechRep_Final.pdf).
157 | More information on the participating countries can be found
158 | [here](https://www.oecd.org/en/about/programmes/pisa/pisa-participants.html).
159 |
160 | ``` r
161 | # loading the countrycode data
162 | data(countrycode)
163 | head(countrycode)
164 | #> # A tibble: 6 × 2
165 | #> country country_name
166 | #> <chr> <chr>
167 | #> 1 AZE Azerbaijan
168 | #> 2 ARG Argentina
169 | #> 3 AUS Australia
170 | #> 4 AUT Austria
171 | #> 5 BEL Belgium
172 | #> 6 BRA Brazil
173 | ```
174 |
175 |
176 |
177 | Notes on countries
178 |
179 |
180 | - Not all data entries in the `countrycode` are countries. For example,
181 | “QCN” refers to “Shanghai-China”.
182 | - Due to differences in country codes, not all `student_subset_yyyy`
183 | data has all 38 OECD countries.
184 |
185 |
186 |
187 | See `?countrycode` for more detailed information on the countries that
188 | participated in the PISA experiment.
189 |
190 | ## Exploring the data
191 |
192 | The plot below shows the weighted mean of mathematics scores
193 | of these 15-year-old students for a few selected countries over the
194 | available years.
195 |
196 |
197 |
198 |
199 |
200 | - Similarly, you can find more code examples and data visualizations for
201 | exploring `learningtower` through our vignettes and articles
202 |
203 | - Further data exploration can be found in our articles exploring
204 | temporal trends
205 | [here](https://kevinwang09.github.io/learningtower/articles/articles/exploring_time.html).
206 |
207 | ## Citation
208 |
209 | To cite the `learningtower` package, please use:
210 |
211 | ``` r
212 | citation("learningtower")
213 | #> To cite package 'learningtower' in publications use:
214 | #>
215 | #> Wang K, Yacobellis P, Siregar E, Romanes S, Fitter K, Dalla Riva G,
216 | #> Cook D, Tierney N, Dingorkar P, Sai Subramanian S, Chen G (2024).
217 | #> _learningtower: OECD PISA Datasets from 2000-2022 in an Easy-to-Use
218 | #> Format_. R package version 1.1.0,
219 | #> https://github.com/kevinwang09/learningtower,
220 | #> <https://kevinwang09.github.io/learningtower/>.
221 | #>
222 | #> A BibTeX entry for LaTeX users is
223 | #>
224 | #> @Manual{,
225 | #> title = {learningtower: OECD PISA Datasets from 2000-2022 in an Easy-to-Use Format},
226 | #> author = {Kevin Wang and Paul Yacobellis and Erika Siregar and Sarah Romanes and Kim Fitter and Giulio Valentino {Dalla Riva} and Dianne Cook and Nick Tierney and Priya Dingorkar and Shabarish {Sai Subramanian} and Guan Ru Chen},
227 | #> note = {R package version 1.1.0, https://github.com/kevinwang09/learningtower},
228 | #> url = {https://kevinwang09.github.io/learningtower/},
229 | #> year = {2024},
230 | #> }
231 | ```
232 |
233 | ## Motivation for `learningtower`
234 |
235 | - The PISA 2018 results were released on 3 December 2019. This led to
236 | wringing of hands in the Australian press, with titles of stories like
237 | [Vital Signs: Australia’s slipping student scores will lead to greater
238 | income
239 | inequality](https://theconversation.com/vital-signs-australias-slipping-student-scores-will-lead-to-greater-income-inequality-128301)
240 | and [In China, Nicholas studied maths 20 hours a week. In Australia,
241 | it’s
242 | three](https://www.smh.com.au/education/in-china-nicholas-studied-maths-20-hours-a-week-in-australia-it-s-three-20191203-p53ggv.html).
243 |
244 |
245 |
246 |
247 |
248 | - Australia’s neighbours, New Zealand and Indonesia, are also worrying:
249 | [New Zealand top-end in OECD’s latest PISA report but drop in
250 | achievements
251 | ‘worrying’](https://www.stuff.co.nz/national/education/117890945/new-zealand-topend-in-oecds-latest-pisa-report-but-drop-in-achievements-worrying),
252 | [Not even mediocre? Indonesian students score low in math, reading,
253 | science: PISA
254 | report](https://www.thejakartapost.com/news/2019/12/04/not-even-mediocre-indonesian-students-score-low-in-math-reading-science-pisa-report.html).
255 |
256 | - The data from this survey, and from all of the surveys conducted since
257 | the first collection in 2000, is publicly available. We decided to
258 | make a more convenient subset of the data available in a new R
259 | package, called `learningtower`.
260 |
261 | ## Acknowledgement
262 |
263 | The work to make the data available is the effort of several researchers
264 | from Australia, New Zealand and Indonesia, conducted as part of the
265 | [ROpenSci OzUnconf](https://ozunconf19.ropensci.org) held in Sydney, Dec
266 | 11-13, 2019.
267 |
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: https://kevinwang09.github.io/learningtower/
2 | template:
3 | params:
4 | bootswatch: flatly
5 |
--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
1 | ## Test environments
2 |
3 | * ubuntu 16.04 (GitHub Actions), R devel
4 | * MacOS (GitHub Actions), R devel
5 | * Windows (GitHub Actions), R devel
6 | * win-builder (devel)
7 |
8 | ## R CMD check results
9 | There were no ERRORs or WARNINGs.
10 |
11 | Winbuilder gave some notes, but these are on URLs that are valid: https://win-builder.r-project.org/O7qG479Sn4gS/00check.log.
12 |
--------------------------------------------------------------------------------
/data/countrycode.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/countrycode.rda
--------------------------------------------------------------------------------
/data/school.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/school.rda
--------------------------------------------------------------------------------
/data/student_subset_2000.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2000.rda
--------------------------------------------------------------------------------
/data/student_subset_2003.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2003.rda
--------------------------------------------------------------------------------
/data/student_subset_2006.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2006.rda
--------------------------------------------------------------------------------
/data/student_subset_2009.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2009.rda
--------------------------------------------------------------------------------
/data/student_subset_2012.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2012.rda
--------------------------------------------------------------------------------
/data/student_subset_2015.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2015.rda
--------------------------------------------------------------------------------
/data/student_subset_2018.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2018.rda
--------------------------------------------------------------------------------
/data/student_subset_2022.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2022.rda
--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
1 | bibentry( # Citation entry shown by citation("learningtower")
2 |   bibtype = "Manual",
3 |   title = "learningtower: OECD PISA Datasets from 2000-2022 in an Easy-to-Use Format",
4 |   author = c(
5 |     person(given = "Kevin", family = "Wang"),
6 |     person(given = "Paul", family = "Yacobellis"),
7 |     person(given = "Erika", family = "Siregar"),
8 |     person(given = "Sarah", family = "Romanes"),
9 |     person(given = "Kim", family = "Fitter"),
10 |     person(given = "Giulio Valentino", family = "Dalla Riva"),
11 |     person(given = "Dianne", family = "Cook"),
12 |     person(given = "Nick", family = "Tierney"),
13 |     person(given = "Priya", family = "Dingorkar"),
14 |     person(given = "Shabarish", family = "Sai Subramanian"),
15 |     person(given = "Guan Ru", family = "Chen")
16 |   ),
17 |   note = "R package version 1.1.0, https://github.com/kevinwang09/learningtower",
18 |   url = "https://kevinwang09.github.io/learningtower/",
19 |   year = "2024"
20 | )
21 |
--------------------------------------------------------------------------------
/learningtower.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: knitr
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageCheckArgs: --as-cran
22 |
--------------------------------------------------------------------------------
/man/countrycode.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{countrycode}
5 | \alias{countrycode}
6 | \title{Country iso3c and name mapping for PISA OECD participating countries.}
7 | \format{
8 | A tibble of the following variables
9 | \itemize{
10 | \item \code{country}: Country 3 character code. Note that some regions/territories are coded as country for ease of input. Character.
11 | \item \code{country_name}: Country name. Note that some regions/territories are coded as country for ease of input. Character.
12 | }
13 | }
14 | \description{
15 | A dataset containing mapping of the country ISO code to the country names.
16 | More information on participating countries can be found at
17 | \url{https://www.oecd.org/pisa/aboutpisa/pisa-participants.htm}.
18 | }
19 |
--------------------------------------------------------------------------------
/man/figures/README_school_data_missing_values_summary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/man/figures/README_school_data_missing_values_summary.png
--------------------------------------------------------------------------------
/man/figures/README_student_data_missing_values_summary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/man/figures/README_student_data_missing_values_summary.png
--------------------------------------------------------------------------------
/man/figures/conversation_holden.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/man/figures/conversation_holden.png
--------------------------------------------------------------------------------
/man/figures/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/man/figures/logo.png
--------------------------------------------------------------------------------
/man/figures/pisa_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/man/figures/pisa_image.png
--------------------------------------------------------------------------------
/man/figures/readme.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/man/figures/readme.gif
--------------------------------------------------------------------------------
/man/figures/readme.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/man/figures/readme.png
--------------------------------------------------------------------------------
/man/load_student.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/load_student.R
3 | \name{load_student}
4 | \alias{load_student}
5 | \title{load_student() function allows the user to extract the PISA student scores for any desired year
6 | from 2000-2022}
7 | \usage{
8 | load_student(year = "2000")
9 | }
10 | \arguments{
11 | \item{year}{The year (or vector of years) of PISA student data to load,
12 | for example \code{2018} or \code{c(2012, 2018)}. Use the string \code{"all"} to load
13 | the student data for every available year. Defaults to \code{"2000"}.}
14 | }
15 | \value{
16 | A dataset of PISA scores of students that took the test in the year(s) selected by the user,
17 | from the years 2000-2022
18 | }
19 | \description{
20 | load_student() extracts the students' scores for any of the survey years
21 | from 2000-2022. The function takes one or more of these years as its argument, or the string "all",
22 | which returns all the PISA scores of the students from the years 2000-2022.
23 | }
24 | \examples{
25 | \dontrun{
26 | library(learningtower)
27 | student_all <- load_student("all")
28 | student_2000 <- load_student("2000")
29 | }
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/man/school.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{school}
5 | \alias{school}
6 | \title{Subset of the School data available for the years 2000-2022 from the PISA OECD database}
7 | \format{
8 | A tibble of the following variables
9 | \itemize{
10 | \item \code{year}: Year of the PISA data. Integer.
11 | \item \code{country}: Country 3 character code. Note that some regions/territories are coded as country for ease of input. Character.
12 | \item \code{school_id}: The school identification number, unique for each country and year combination. Character.
13 | \item \code{fund_gov}: Percentage of total funding for school year from government. Numeric.
14 | \item \code{fund_fees}: Percentage of total funding for school year from student fees or school charges paid by parents. Numeric.
15 | \item \code{fund_donation}: Percentage of total funding for school year from
16 | benefactors, donations, bequests, sponsorship, parent fundraising. Numeric.
17 | \item \code{enrol_boys}: Number of boys in the school. Numeric.
18 | \item \code{enrol_girls}: Number of girls in the school. Numeric.
19 | \item \code{stratio}: Student-Teacher ratio. Numeric.
20 | \item \code{public_private}: Is the school a public or private school. Factor.
21 | \item \code{staff_shortage}: Shortage of staff. Numeric.
22 | \item \code{sch_wgt}: The final survey weight score for the schools. Numeric.
23 | \item \code{school_size}: The school size. Numeric.
24 | }
25 | }
26 | \description{
27 | A subset data containing school weight and other information
28 | from the triennial testing of 15 year olds around
29 | the globe. Original data available from
30 | \url{https://www.oecd.org/en/about/programmes/pisa/pisa-data.html}.
31 | }
32 |
--------------------------------------------------------------------------------
/man/student.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{student}
5 | \alias{student}
6 | \alias{student_subset_2000}
7 | \alias{student_subset_2003}
8 | \alias{student_subset_2006}
9 | \alias{student_subset_2009}
10 | \alias{student_subset_2012}
11 | \alias{student_subset_2015}
12 | \alias{student_subset_2018}
13 | \alias{student_subset_2022}
14 | \title{Processed and Sampled PISA Student Data (2000-2022)}
15 | \format{
16 | A tibble of the following variables
17 | \itemize{
18 | \item \code{year}: Year of the PISA data. Integer.
19 | \item \code{country}: Country 3 character code. Note that some regions/territories are coded as "country" for ease of input. Factor.
20 | \item \code{school_id}: Unique school identifier for each country and year. Character.
21 | \item \code{student_id}: Unique student identifier within each school. Integer.
22 | \item \code{mother_educ}: Mother's highest level of education, from "less than ISCED1" to "ISCED 3A". Factor.
23 | \item \code{father_educ}: Father's highest level of education, from "less than ISCED1" to "ISCED 3A". Factor.
24 | \item \code{gender}: Gender of the student. Only "male" and "female" are recorded. Factor.
25 | Note that we call this variable gender and not sex as this term was used in the OECD PISA database.
26 | \item \code{computer}: Possession of computer. Only "yes" and "no" are recorded. Factor.
27 | \item \code{internet}: Access to internet. Only "yes" and "no" are recorded. Factor.
28 | \item \code{math}: Simulated score in mathematics. Numeric.
29 | \item \code{read}: Simulated score in reading. Numeric.
30 | \item \code{science}: Simulated score in science. Numeric.
31 | \item \code{stu_wgt}: The final survey weight score for the student score. Numeric.
32 | \item \code{desk}: Possession of desk to study at. Only "yes" and "no" are recorded. Factor.
33 | \item \code{room}: Possession of a room of your own. Only "yes" and "no" are recorded. Factor.
34 | \item \code{dishwasher}: Possession of a dishwasher. Only "yes" and "no" are recorded. Factor.
35 | Note that in 2015 and 2018, all entries are missing.
36 | \item \code{television}: Number of televisions.
37 | "0", "1", "2" are code for no, one and two TVs in the house. "3+" codes for three or more TVs. Factor.
38 | Note that in 2003, all entries are missing.
39 | \item \code{computer_n}: Number of computers.
40 | "0", "1", "2" are code for no, one and two computers in the house. "3+" codes for three or more computers. Factor.
41 | Note that in 2003, all entries are missing.
42 | \item \code{car}: Number of cars.
43 | "0", "1", "2" are code for no, one and two cars in the house. "3+" codes for three or more cars. Factor.
44 | Note that in 2003, all entries are missing.
45 | \item \code{book}: Number of books. Factor.
46 | Note that encoding is different in the years 2000 and 2003 compared to all other years.
47 | Evaluate \code{table(student$book, student$year)} for a demo.
48 | \item \code{wealth}: Index of family wealth. Numeric.
49 | Note that in 2003, all entries are missing.
50 | \item \code{escs}: Index of economic, social and cultural status. Numeric.
51 | }
52 | }
53 | \description{
54 | This dataset provides a clean and processed subset of the OECD PISA student data
55 | for the years 2000-2022. The original data is sourced from
56 | \url{https://www.oecd.org/en/about/programmes/pisa/pisa-data.html} and has been prepared for analysis.
57 | A sampling of 50 students per country (for OECD countries) has been included for each year.
58 | The data curation and sampling process are documented in \url{https://github.com/kevinwang09/learningtower_masonry/blob/master/Code/student_bind_rows.Rmd}
59 | }
60 | \examples{
61 | library(dplyr)
62 | data(student_subset_2000)
63 | data(student_subset_2003)
64 | dplyr::bind_rows(
65 | student_subset_2000,
66 | student_subset_2003
67 | )
68 | }
69 |
--------------------------------------------------------------------------------
/student_full_data/student_2000.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2000.rds
--------------------------------------------------------------------------------
/student_full_data/student_2003.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2003.rds
--------------------------------------------------------------------------------
/student_full_data/student_2006.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2006.rds
--------------------------------------------------------------------------------
/student_full_data/student_2009.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2009.rds
--------------------------------------------------------------------------------
/student_full_data/student_2012.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2012.rds
--------------------------------------------------------------------------------
/student_full_data/student_2015.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2015.rds
--------------------------------------------------------------------------------
/student_full_data/student_2018.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2018.rds
--------------------------------------------------------------------------------
/student_full_data/student_2022.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2022.rds
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(learningtower)
3 |
4 | test_check("learningtower")
5 |
--------------------------------------------------------------------------------
/tests/testthat/test-countrycode-col-types.R:
--------------------------------------------------------------------------------
1 | expected_countrycode_columns <- c("country", "country_name")
2 | expected_countrycode_types <- c("character", "character")
3 |
4 | test_that("countrycode dataset has correct structure", {
5 |   data("countrycode", package = "learningtower")
6 |
7 |   # Column names must match the expected names
8 |   expect_named(countrycode, expected_countrycode_columns, info = "Column names for countrycode dataset")
9 |
10 |   # Check each column's class with inherits() rather than comparing class(x)[1]
11 |   for (i in seq_along(expected_countrycode_columns)) {
12 |     expect_true(inherits(countrycode[[expected_countrycode_columns[i]]], expected_countrycode_types[i]),
13 |                 info = paste("Column", expected_countrycode_columns[i], "in countrycode dataset should be", expected_countrycode_types[i]))
14 |   }
15 | })
16 |
--------------------------------------------------------------------------------
/tests/testthat/test-merge.R:
--------------------------------------------------------------------------------
1 | test_that("Merging student and school data works correctly", {
2 | # Load datasets
3 | student_data <- load_student(2000)
4 | data("school", package = "learningtower")
5 |
6 | # Perform merge
7 | expect_no_warning(
8 | merged_data <- dplyr::left_join(student_data, school, by = c("year", "school_id", "country"), relationship = "many-to-one")
9 | )
10 |
11 | # Check that all columns from both datasets are present
12 | expected_columns <- unique(c(colnames(student_data), colnames(school)))
13 | expect_named(merged_data, expected_columns,
14 | info = "All columns from both datasets should be present after merging")
15 |
16 | # Check for no NA values in key columns after merge
17 | expect_true(all(!is.na(merged_data$school_id)),
18 | info = "No NA values should be introduced in school_id column after merging")
19 | })
20 |
21 |
22 | test_that("Merging student and countrycode data works correctly", {
23 | # Load datasets
24 | student_data <- load_student(2000)
25 | data("countrycode", package = "learningtower")
26 |
27 | # Perform merge
28 | expect_no_warning(
29 | merged_data <- dplyr::left_join(student_data, countrycode, by = "country", relationship = "many-to-one")
30 | )
31 |
32 | # Check that all columns from both datasets are present
33 | expected_columns <- unique(c(colnames(student_data), colnames(countrycode)))
34 | expect_named(merged_data, expected_columns,
35 | info = "All columns from both datasets should be present after merging")
36 |
37 | # Check for no NA values in the country column after merge
38 | expect_true(all(!is.na(merged_data$country)),
39 | info = "No NA values should be introduced in the country column after merging")
40 | })
41 |
42 | test_that("Sequential merging of student, school, and countrycode works", {
43 | # Load datasets
44 | student_data <- load_student(2000)
45 | data("school", package = "learningtower")
46 | data("countrycode", package = "learningtower")
47 |
48 | # Merge student and school
49 | expect_no_warning(
50 | merged_data <- dplyr::left_join(student_data, school, by = c("year", "school_id", "country"), relationship = "many-to-one")
51 | )
52 |
53 | # Merge with countrycode
54 | expect_no_warning(
55 | final_data <- dplyr::left_join(merged_data, countrycode, by = "country", relationship = "many-to-one")
56 | )
57 |
58 | # Check that all columns from all datasets are present
59 | expected_columns <- unique(c(colnames(student_data), colnames(school), colnames(countrycode)))
60 | expect_named(final_data, expected_columns,
61 | info = "All columns from student, school, and countrycode should be present after merging")
62 |
63 | # Check for no NA values in key columns
64 | expect_true(all(!is.na(final_data$school_id)),
65 | info = "No NA values should be introduced in school_id column after merging")
66 | expect_true(all(!is.na(final_data$country)),
67 | info = "No NA values should be introduced in the country column after merging")
68 | expect_true(all(!is.na(final_data$country_name)),
69 | info = "No NA values should be introduced in the country_name column after merging")
70 | })
71 |
--------------------------------------------------------------------------------
/tests/testthat/test-school-col-types.R:
--------------------------------------------------------------------------------
1 | expected_school_columns <- c(
2 | "year", "country", "school_id", "fund_gov", "fund_fees", "fund_donation",
3 | "enrol_boys", "enrol_girls", "stratio", "public_private", "staff_shortage",
4 | "sch_wgt", "school_size"
5 | )
6 |
7 | expected_school_types <- c(
8 | "integer", "character", "character", "numeric", "numeric", "numeric", "numeric",
9 | "numeric", "numeric", "factor", "numeric", "numeric", "numeric"
10 | )
11 |
12 | test_that("school dataset has correct structure", {
13 | data("school", package = "learningtower") # load the school dataset shipped with learningtower
14 |
15 | # Check column names (expect_named also checks their order)
16 | expect_named(school, expected_school_columns, info = "Column names for school dataset")
17 |
18 | # Check column types: the first class of each column must equal the expected type at the same position
19 | for (i in seq_along(expected_school_columns)) {
20 | expect_true(class(school[[expected_school_columns[i]]])[1] == expected_school_types[i],
21 | info = paste("Column", expected_school_columns[i], "in school dataset should be", expected_school_types[i]))
22 | }
23 | })
24 |
--------------------------------------------------------------------------------
/tests/testthat/test-student-col-types.R:
--------------------------------------------------------------------------------
1 | # Define expected column names and types for student data
2 | expected_student_columns <- c(
3 | "year", "country", "school_id", "student_id", "mother_educ", "father_educ",
4 | "gender", "computer", "internet", "math", "read", "science", "stu_wgt",
5 | "desk", "room", "dishwasher", "television", "computer_n", "car", "book",
6 | "wealth", "escs"
7 | )
8 |
9 | expected_student_types <- c(
10 | "integer", "factor", "character", "integer", "factor", "factor", "factor",
11 | "factor", "factor", "numeric", "numeric", "numeric", "numeric", "factor",
12 | "factor", "factor", "factor", "factor", "factor", "factor", "numeric",
13 | "numeric"
14 | )
15 |
16 | test_that("student_subset_* datasets have correct structure", {
17 | for (year in c("2000", "2003", "2006", "2009", "2012", "2015", "2018", "2022")) {
18 | data_name <- paste0("student_subset_", year)
19 | dataset <- get(data_name)
20 |
21 | # Check column names
22 | expect_named(dataset,
23 | expected_student_columns,
24 | info = paste("Column names for", data_name))
25 |
26 | # Check column types
27 | for (i in seq_along(expected_student_columns)) {
28 | expect_true(class(dataset[[expected_student_columns[i]]])[1] == expected_student_types[i],
29 | info = paste("Column", expected_student_columns[i], "in", data_name, "should be", expected_student_types[i]))
30 | }
31 | }
32 | })
33 |
34 | test_that("load_student() returns correct structure for full datasets", {
35 | for (year in c("2000", "2003", "2006", "2009", "2012", "2015", "2018", "2022")) {
36 | dataset <- load_student(year)
37 |
38 | # Check column names
39 | expect_named(dataset, expected_student_columns, info = paste("Column names for full dataset of", year))
40 |
41 | # Check column types
42 | for (i in seq_along(expected_student_columns)) {
43 | expect_true(class(dataset[[expected_student_columns[i]]])[1] == expected_student_types[i],
44 | info = paste("Column", expected_student_columns[i], "in full dataset of", year, "should be", expected_student_types[i]))
45 | }
46 | }
47 | })
48 |
--------------------------------------------------------------------------------
/tests/testthat/test-test-load.R:
--------------------------------------------------------------------------------
1 | test_that("load student year can be done using both an integer and a character value", {
2 | expect_equal(load_student(2000), load_student("2000"))
3 | })
4 |
5 |
6 | test_that("load multiple years, test is on whether the data can be properly bound", {
7 | expect_no_error(load_student(c("2000", "2003"))) # requesting several years at once must not raise an error
8 | })
9 |
10 | test_that("load non-existing years", {
11 | expect_error(load_student("2001"))
12 | })
13 |
14 | test_that("test for load_student class functions",{
15 | expect_s3_class(object = load_student(year = 2000),
16 | class = c("tbl_df", "tbl", "data.frame"))
17 | })
18 |
--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 |
--------------------------------------------------------------------------------
/vignettes/articles/Australia_trends.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "How did Australia do in the PISA study"
3 | author: "The Freemasons"
4 | date: "`r Sys.Date()`"
5 | output:
6 | rmarkdown::html_vignette:
7 | fig_height: 10
8 | fig_width: 14
9 | number_sections: true
10 | vignette: >
11 | %\VignetteIndexEntry{How did Australia do in the PISA study}
12 | %\VignetteEncoding{UTF-8}
13 | %\VignetteEngine{knitr::rmarkdown}
14 | editor_options:
15 | chunk_output_type: console
16 | ---
17 |
18 | ```{r setup, include = FALSE}
19 | knitr::opts_chunk$set(
20 | collapse = TRUE,
21 | comment = "#>",
22 | warning = FALSE,
23 | message = FALSE,
24 | error = FALSE,
25 | outwidth = "100%",
26 | fig.width = 8,
27 | fig.height = 6)
28 | ```
29 |
30 | # Introduction
31 |
32 | The purpose of this article is to explore some of the variables that influenced Australia's performance in the PISA study. Note that this is an observational study (as opposed to a controlled experiment), so we can only identify factors that are correlated with academic performance rather than specific causes.
33 |
34 |
35 | # Loading the packages and data
36 |
37 | ```{r}
38 | #loading the data and libraries
39 | library(learningtower)
40 | library(tidyverse)
41 | library(lme4)
42 | library(ggfortify)
43 | library(sjPlot)
44 | library(patchwork)
45 | library(ggrepel)
46 | library(kableExtra)
47 |
48 | student <- load_student("all")
49 | data(school)
50 | data(countrycode)
51 |
52 | theme_set(theme_classic(18)
53 | + theme(legend.position = "bottom"))
54 | ```
55 |
56 | # Visualise predictors over time
57 |
58 | Since we are expecting some time variations in the data, let's quickly visualize the time trends.
59 |
60 | ```{r}
61 | #filtering the data for Australia
62 | aus_data = student |>
63 | dplyr::filter(country %in% c("AUS")) |>
64 | dplyr::mutate(mother_educ = mother_educ |> fct_relevel("less than ISCED1"),
65 | father_educ = father_educ |> fct_relevel("less than ISCED1"))
66 | ```
67 |
68 |
69 | ## Numeric variables
70 |
71 | A boxplot is a standardized method of presenting the distribution of data. It indicates whether or not our data is symmetrical. Box plots are important because they give a visual overview of the data, allowing researchers to rapidly discover median values, data set dispersion, and skewness. Here we visualize the distribution of the numeric variables across the years via boxplots.
72 |
73 | ```{r, fig.height = 9, fig.width = 15}
74 | # plotting the distribution of numeric variables via boxplots
75 | aus_data |>
76 | select(where(is.numeric), -school_id, -student_id) |>
77 | pivot_longer(cols = -year) |>
78 | ggplot(aes(x = factor(year),
79 | y = value,
80 | colour = factor(year))) +
81 | geom_boxplot() +
82 | facet_wrap(~name, scales = "free_y") +
83 | theme(legend.position = "none") +
84 | labs(x = "Year",
85 | y = "",
86 | title = "The distribution of numerical variables in the student dataset over all years")
87 | ```
88 |
89 | ## Factor variables
90 |
91 | Missing data is a common issue that data professionals must deal with on a daily basis. In this section we visualize the number of missing values across the years for all the factor variables in the student dataset.
92 |
93 | ```{r, fig.height = 15, fig.width = 15}
94 | #checking the missing values in the factor variables of the data
95 | aus_fct_plotdata = aus_data |>
96 | select(year, where(is.factor)) |>
97 | dplyr::select(-country) |>
98 | pivot_longer(cols = -year) |>
99 | group_by(year, name, value) |>
100 | tally() |>
101 | dplyr::mutate(
102 | value = coalesce(value, "missing"),
103 | percent = n/sum(n),
104 | year = year |> as.character() |> as.integer()) |>
105 | group_by(name, value) |>
106 | dplyr::mutate(last_point = ifelse(year == max(year), as.character(value), NA))
107 |
108 | aus_fct_plotdata |> # share of each factor level (including "missing") per year, one panel per variable
109 | ggplot(aes(x = year, y = percent,
110 | label = last_point,
111 | group = value)) +
112 | geom_point() +
113 | geom_line() +
114 | geom_label_repel(direction = "both", nudge_x = 3, seed = 2020, segment.size = 0) +
115 | facet_wrap(~name, scales = "free_y", ncol = 3) +
116 | scale_x_continuous(breaks = sort(unique(aus_fct_plotdata$year))) + # one tick per PISA round present in the data, so newer rounds are not dropped
117 | scale_y_continuous(labels = scales::percent) +
118 | labs(x = "Year",
119 | y = "Percentage of missing values",
120 | title = "Missing values in the student dataset's factor variables")
121 | ```
122 |
123 | We initially investigate the most current 2018 data before generalizing the models/results into any patterns due to the quantity of missing values in the data in previous years and also to decrease the time complexity in modeling.
124 |
125 | # Linear regression model for the 2018 study
126 |
127 | Linear regression analysis predicts the value of one variable depending on the value of other variables. Because they are well known and can be trained rapidly, linear regression models have become an effective way of scientifically and consistently predicting the future.
128 |
129 | We begin by doing a basic data exploration using linear regression models. To begin, we fit three linear models (one for each subject of math, reading, and science) to the 2018 Australian data to gain an understanding of the key variables that may be impacting test scores.
130 |
131 | We filter the student data (the complete student data was loaded using `load_student("all")`) to pick the scores in Australia and relevel some variables for further analyses.
132 |
133 | ```{r}
134 | #filtering the data to Australia, defining the predictors and selecting the scores
135 | student_predictors = c("mother_educ", "father_educ", "gender", "internet",
136 | "desk", "room", "television", "computer_n",
137 | "car", "book", "wealth", "escs")
138 |
139 | student_formula_rhs = paste(student_predictors, collapse = "+")
140 |
141 | aus2018 = aus_data |>
142 | dplyr::filter(year == "2018") |>
143 | dplyr::select(
144 | math, read, science,
145 | all_of(student_predictors)) |>
146 | na.omit()
147 | ```
148 |
149 | ## Checking correlation matrix of the numeric variables
150 |
151 | A correlation matrix is a table that displays the coefficients of correlation between variables. Each cell in the table represents the relationship between two variables.
152 |
153 | ```{r}
154 | #correlation matrix for the numeric variables
155 | aus2018 |>
156 | select(where(is.numeric)) |>
157 | cor(use = "pairwise.complete.obs") |>
158 | round(2) |>
159 | kbl(caption = "Correlation Matrix") |>
160 | kable_styling(full_width = NULL,
161 | position = "center",
162 | bootstrap_options = c("hover", "striped"))
163 | ```
164 |
165 | ## Fitting three linear models
166 |
167 |
168 | ```{r}
169 | #fitting linear models for the three subjects maths, reading and science
170 |
171 | aus2018_math = lm(formula = as.formula(paste("math ~ ", student_formula_rhs)) , data = aus2018)
172 |
173 | aus2018_read = lm(formula = as.formula(paste("read ~ ", student_formula_rhs)) , data = aus2018)
174 |
175 | aus2018_science = lm(formula = as.formula(paste("science ~ ", student_formula_rhs)) , data = aus2018)
176 |
177 | sjPlot::tab_model(aus2018_math, aus2018_read, aus2018_science,
178 | show.ci = FALSE, show.aic = TRUE, show.se = TRUE,
179 | show.stat = TRUE,
180 | show.obs = FALSE)
181 | ```
182 |
183 |
184 | Some interesting discoveries from these models:
185 |
186 | 1. All three response variables seem to be influenced by the same set of factors.
187 |
188 | 2. Father's education level (`father_educ`) seems to have a much stronger effect than mother's education level (`mother_educ`).
189 |
190 | 3. While most estimates agree in signs across the three subjects, the most notable exception to this is `gender`, where girls tend to perform better than boys in reading.
191 |
192 | 4. The most influential predictors are those associated with socioeconomic status (`escs`) and education (`book`). A number of variables that should not be directly causal to academic performance also showed up as significant. This is likely due to their associations with socio-economic status.
193 |
194 | Note that in making these conclusions, we have ignored the effects of multicollinearity.
195 |
196 | Upon checking the classical diagnostic plots of these models, we see no major violation of the assumptions of linear models. The large amount of variation in the data may help to explain why the models only have moderately low $R^2$ values (~ 0.20).
197 |
198 | ```{r, fig.height = 30, fig.width = 12}
199 | #plotting the outcome of linear models
200 | autoplot(aus2018_math) + labs(title = "2018 Australia maths model") +
201 | autoplot(aus2018_read) + labs(title = "2018 Australia read model") +
202 | autoplot(aus2018_science) + labs(title = "2018 Australia science model")
203 | ```
204 |
205 |
206 | # Linear mixed model
207 |
208 | Linear mixed models are an extension of simple linear models that allow for both fixed and random effects.
209 |
210 | We already know that the socio-economic status (SES) of a student is often the most influential predictor, and it is likely that students with similar SES will attend the same schools in their neighborhood and receive a similar quality of education from the same teachers.
211 |
212 | Thus, it is likely that there will be a grouping effect on the students if they attended the same school. This would imply that some observations in our data are not independent observations.
213 |
214 | By building random effects in our linear model, that is building a linear mixed model, we should be able to produce a model with better fit if we consider this grouping effect of schools into our model.
215 |
216 | ```{r}
217 | # joining school and student data, building a linear mixed model
218 | lmm2018 = aus_data |>
219 | filter(year == 2018) |>
220 | dplyr::select(
221 | school_id,
222 | math, read, science,
223 | all_of(student_predictors)) |>
224 | na.omit()
225 |
226 | lmm2018_math = lmer(formula = as.formula(paste("math ~ ", student_formula_rhs, "+ (escs | school_id)")), data = lmm2018)
227 |
228 | lmm2018_read = lmer(formula = as.formula(paste("read ~ ", student_formula_rhs, "+ (escs | school_id)")), data = lmm2018)
229 |
230 | lmm2018_science = lmer(formula = as.formula(paste("science ~ ", student_formula_rhs, "+ (escs | school_id)")), data = lmm2018)
231 |
232 | sjPlot::tab_model(lmm2018_math, lmm2018_read, lmm2018_science,
233 | show.ci = FALSE, show.aic = TRUE, show.se = TRUE,
234 | show.stat = TRUE,
235 | show.obs = FALSE)
236 | ```
237 |
238 | We see that the linear mixed model improved on the fit of the model, as judged by the AIC.
239 |
240 | ```{r}
241 | # subtracting AIC values of the two models
242 | bind_cols(
243 | AIC(aus2018_math) - AIC(lmm2018_math),
244 | AIC(aus2018_read) - AIC(lmm2018_read),
245 | AIC(aus2018_science) - AIC(lmm2018_science)
246 | ) |>
247 | rename(maths = ...1,
248 | read = ...2,
249 | science = ...3) |>
250 | kbl(caption = "AIC Values") |>
251 | kable_styling(full_width = NULL,
252 | position = "center",
253 | bootstrap_options = c("hover", "striped"))
254 | ```
255 |
256 | # Integrating with `school` data
257 |
258 | We now take this dataset on students and merge it with some variables from the `school` data, which is also a part of this `learningtower` package. This gives us access to the school-level variables, which is helpful in modelling the data.
259 |
260 | ```{r}
261 | #taking into account the school dataset variables and fitting a linear mixed model
262 | selected_vars = c("father_educ", "gender", "internet",
263 | "desk", "computer_n", "car",
264 | "book", "wealth", "escs")
265 |
266 | data(school)
267 |
268 | aus_school_2018 = school |>
269 | dplyr::filter(country == "AUS", year == "2018") |>
270 | dplyr::mutate(school_size = log10(school_size)) |> ## We take the log due to the scale
271 | dplyr::select(-year, -country, -contains("fund"), -sch_wgt)
272 |
273 | lmm2018_sch = lmm2018 |>
274 | left_join(aus_school_2018, by = c("school_id")) |> na.omit()
275 |
276 | school_predictors = c("stratio", "public_private", "staff_shortage", "school_size")
277 | school_formula_rhs = paste(school_predictors, collapse = "+")
278 |
279 | lmm2018_sch_math = lmer(formula = as.formula(paste("math ~ ", student_formula_rhs, "+ (escs | school_id) + ",
280 | school_formula_rhs)), data = lmm2018_sch)
281 |
282 | lmm2018_sch_read = lmer(formula = as.formula(paste("read ~ ", student_formula_rhs, "+ (escs | school_id) + ",
283 | school_formula_rhs)), data = lmm2018_sch)
284 |
285 | lmm2018_sch_science = lmer(formula = as.formula(paste("science ~ ", student_formula_rhs, "+ (escs | school_id) + ",
286 | school_formula_rhs)), data = lmm2018_sch)
287 |
288 |
289 | sjPlot::tab_model(lmm2018_sch_math, lmm2018_sch_read, lmm2018_sch_science,
290 | show.ci = FALSE, show.aic = TRUE, show.se = TRUE,
291 | show.stat = TRUE,
292 | show.obs = FALSE)
293 | ```
294 |
295 | We note the following:
296 |
297 | 1. The school size (`school_size`) is a strong predictor for academic performance, implying larger schools tend to do better. This is likely a confounding variable for the urban/rural region of the school which can imply a difference in available funding of school facilities.
298 |
299 | 2. Private schools tend to do better than public schools (note the reference level and the negative coefficient estimate in the variable `public_private`).
300 |
301 | 3. Perhaps surprisingly, the student-teacher ratio (`stratio`) wasn't found to be significant but the shortage of staff (`staff_shortage`) was significant. This would imply that as long as the school is adequately supported by staff, further reduction in the student-teacher ratio does not have a statistically significant effect on student performance.
302 |
303 | # Visualising coefficient estimates over the years
304 |
305 | All analyses above focused on the year 2018 for Australia, but what about the other years? We also visualize the academic performance of students as a function of time in the [time trend article](https://kevinwang09.github.io/learningtower/articles/exploring_time.html), so in this section, we attempt to visualize the effect of some interesting variables and their linear model coefficient estimates for each PISA study over time.
306 |
307 | We would expect the availability of technology (e.g. computer) could be beneficial for students at the start of the 21st century, but it is not clear if students will be helped by these technologies as time goes by.
308 |
309 | The construction is as follows:
310 |
311 | 1. We first split the entire Australian data by year and fit a linear model, with `math` as the response variable.
312 |
313 | 2. We extract the coefficient estimate for every predictor from every linear model and combine the result.
314 |
315 | 3. We then plot the years on the x-axis and the coefficient estimates on the y-axis as points and join each variable using a line. For categorical variables, we split the categories as separate lines.
316 |
317 | 4. Additionally, we show the approximate 95% confidence interval (estimate ± 2 standard errors) of each coefficient estimate using a transparent ribbon and show the y = 0 line, i.e. whenever the ribbon does not cross the horizontal line, the p-value for testing that coefficient against zero will be approximately < 0.05.
318 |
319 | ```{r, fig.height = 12, fig.width = 15}
320 | #Fitting a linear model, extracting the coefficients and visualizing every predictor
321 | aus_student_years = aus_data |>
322 | dplyr::select(
323 | math,
324 | all_of(student_predictors),
325 | year) |>
326 | na.omit()
327 |
328 | aus_student_years_coef = aus_student_years |>
329 | group_by(year) |>
330 | nest() |>
331 | dplyr::mutate(math_lm_coef = purrr::map(.x = data,
332 | .f = ~ lm(formula = as.formula(paste("math ~ ", student_formula_rhs)), data = .x) |>
333 | broom::tidy())) |>
334 | dplyr::select(-data) |>
335 | tidyr::unnest(math_lm_coef)
336 |
337 | aus_student_years_coef |>
338 | dplyr::filter(str_detect(term, "computer|father_educ|escs|wealth")) |>
339 | dplyr::mutate(
340 | year = year |> as.character() |> as.integer(),
341 | facet = case_when(
342 | str_detect(term, "computer") ~ "Number of computer",
343 | str_detect(term, "father_educ") ~ "Education of father",
344 | # str_detect(term, "mother_educ") ~ "Education of mother",
345 | str_detect(term, "wealth") ~ "Wealth",
346 | str_detect(term, "escs") ~ "Socio-economic index"),
347 | last_point = ifelse(year == 2018, term, NA)) |>
348 | ggplot(aes(x = year, y = estimate,
349 | colour = term,
350 | group = term,
351 | label = last_point)) +
352 | geom_hline(yintercept = 0) +
353 | geom_point(position = position_dodge(width = 0.8), size = 2) +
354 | geom_line(position = position_dodge(width = 0.8), size = 1.5) +
355 | geom_linerange(aes(ymin = estimate - 2*std.error,
356 | ymax = estimate + 2*std.error),
357 | size = 4, alpha = 0.7,
358 | position = position_dodge(width = 0.8)) +
359 | geom_label_repel(direction = "both", nudge_x = 2, seed = 2020, segment.size = 0) +
360 | scale_x_continuous(limits = c(2005.5, 2022),
361 | breaks = c(2006, 2009, 2012, 2015, 2018)) +
362 | facet_wrap(~facet, scales = "free_y") +
363 | theme(legend.position = "none") +
364 | labs(x = "Year",
365 | y = "Estimate",
366 | title = "Graphing coefficient estimates throughout time")
367 | ```
368 |
369 | We note the following:
370 |
371 | 1. Even though in 2018 we found the education of the father to be statistically significant for students' academic performance, this was not always the case. From 2006 to 2018, the education of the father seems to have had an increasingly positive influence on students.
372 |
373 | 2. It is clear that access to computers is ever more prevalent in Australia. But surprisingly, the positive influence of computers is decreasing. It is not clear why this would be the case. One possible reason is that students might have access to computers outside of their homes (e.g. from schools) and thus the advantages of accessing computers are dampened.
374 |
375 | 3. Quite interestingly, the influence of socio-economic index is dropping, implying a gradual move towards equality.
376 |
377 | # Session info
378 | ```{r}
379 | sessionInfo()
380 | ```
381 |
--------------------------------------------------------------------------------
/vignettes/articles/exploring_time.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Exploring temporal trends"
3 | author: "The Freemasons"
4 | date: "`r Sys.Date()`"
5 | output:
6 | rmarkdown::html_vignette:
7 | fig_height: 10
8 | fig_width: 14
9 | number_sections: true
10 | vignette: >
11 | %\VignetteIndexEntry{Temporal trends}
12 | %\VignetteEngine{knitr::rmarkdown}
13 | %\VignetteEncoding{UTF-8}
14 | ---
15 |
16 | ```{r setup, include = FALSE}
17 | knitr::opts_chunk$set(
18 | collapse = TRUE,
19 | comment = "#>",
20 | warning = FALSE,
21 | message = FALSE,
22 | error = FALSE,
23 | outwidth = "100%",
24 | fig.width = 8,
25 | fig.height = 6)
26 | ```
27 |
28 | # Introduction
29 |
30 | One of the most interesting things that we can explore in this PISA data is the temporal trend for each country/region.
31 |
32 |
33 | # Loading packages and data
34 |
35 | ```{r}
36 | #loading the data and libraries
37 | library(learningtower)
38 | library(tidyverse)
39 | library(patchwork)
40 | library(brolgar)
41 | library(gghighlight)
42 | library(ggrepel)
43 | library(tsibble)
44 | library(kableExtra)
45 |
46 | student <- load_student("all")
47 | data(countrycode)
48 |
49 | theme_set(theme_classic(18) +
50 | theme(legend.position = "bottom"))
51 | ```
52 |
53 | # Basic time series visualisation
54 |
55 | We begin by visualizing the time series trend of countries independent of when and for how long they participated in the PISA survey. The following code computes the weighted means of each subject (maths, reading, and science) for each country and year. The weighted averages are then plotted as three time series plots, with each joined line in the plots representing a country's performance in that subject throughout the time that they participated in the PISA research.
56 |
57 | ```{r}
58 | #calculating the weighted means for all three subjects and plotting them
59 | w_mean = function(x, w){weighted.mean(x = x, w = w, na.rm=TRUE)}
60 |
61 | stu_summ = student |>
62 | group_by(year, country) |>
63 | summarise_at(.vars = vars(math, read, science),
64 | .funs = list(wmean = ~w_mean(., w = stu_wgt),
65 | min = ~min(., na.rm = TRUE),
66 | max = ~max(., na.rm = TRUE))) |>
67 | ungroup()
68 |
69 |
70 | stu_wmean_long = stu_summ |>
71 | select(year, country, contains("wmean")) |>
72 | pivot_longer(cols = contains("wmean"),
73 | names_to = "wmean_names",
74 | values_to = "wmean_values")
75 |
76 | stu_wmean_long |>
77 | ggplot(aes(x = year, y = wmean_values, group = country)) +
78 | geom_line() +
79 | facet_wrap(~wmean_names) +
80 | labs(x = "Year", y = "Weighted mean values",
81 | title = "Weighted means of countries in all subjects")
82 |
83 | ```
84 |
85 |
86 | ## Australia, New Zealand, Indonesia
87 |
88 | A core of this package was built at the 2019 OzUnconf in Australia. Hence, we focus on three countries in the APAC region for more detailed visualizations. In the plot below, the dark line is the weighted mean score of each country for each subject. The shading indicates the minimum and maximum of scores for a given year. We can see that, compared with the range of scores, the variation in the mean of the data is almost negligible. We explore this effect in detail later.
89 |
90 | ```{r}
91 | #plotting the weighted mean score with minimum and maximum range for the three countries
92 | stu_summ_long2 = stu_summ |>
93 | filter(country %in% c("AUS", "NZL", "IDN")) |>
94 | pivot_longer(cols = math_wmean:science_max,
95 | names_to = "names",
96 | values_to = "values") |>
97 | separate(col = names, into = c("subject", "statistics"), sep = "_") |>
98 | pivot_wider(names_from = "statistics",
99 | values_from = "values")
100 |
101 |
102 | stu_summ_long2 |>
103 | ggplot(aes(x = year, y = wmean)) +
104 | geom_ribbon(aes(ymin = min, ymax = max), fill = "grey70") +
105 | geom_line(colour = "black", size = 2) +
106 | facet_grid(subject~country, labeller = label_both) +
107 | labs(x = "Year", y = "Test score values",
108 | title = "Weighted means/Min and Max range")
109 | ```
110 |
111 | # `brolgar` visualisations
112 |
113 | [brolgar](https://github.com/njtierney/brolgar) is a new R package that makes visualization of time series easier. We now use this package to pick out some interesting patterns in the data.
114 |
115 | ## Linear model for every country
116 |
117 | We now consider fitting a linear model for every country's performance in maths. This summarises the general trend of performance in mathematics seen in the "spaghetti" plot above.
118 |
119 | There are many countries/regions that did not participate in all 7 PISA studies (between 2000 and 2018, a study is conducted every three years). As we are interested in calculating linear models, we retain only those countries/regions that participated in 5 or more studies.
120 |
121 | For simplicity of interpretation, we center each country/region's performance to the first time that country/region participated in the PISA study. Hence, the intercept terms of the linear models (x-axis in the plot below) represent the weighted means of the countries/regions when they first participated in PISA study. The slope terms of the linear models (y-axis in the plot below) represent the average annual increase in the weighted mean score for each country/region.
122 |
123 | Based on this interpretation, it appears that if a country/region has a good initial performance in the PISA study, then that country is likely to have reached "saturation" where it is hard for it to improve any further, and thus only has a small annual increase or even a decrease.
124 |
125 | ```{r}
126 | #considering countries whose participation is 5 or more, calculating the math slope and plotting it
127 | complete_nations = stu_summ |> # keep countries with >= 5 PISA rounds and index them by years since their first round
128 | group_by(country) |>
129 | filter(n() >= 5) |>
130 | mutate(year_subtract = year - min(year)) |> # computed while still grouped, so min(year) is each country's own first participation
131 | ungroup() |>
132 | as_tsibble(key = country, index = year_subtract)
133 |
134 | math_slope = complete_nations |>
135 | select(
136 | year_subtract,
137 | country,
138 | math_wmean) |>
139 | key_slope(math_wmean ~ year_subtract) |>
140 | left_join(countrycode, by = "country")
141 |
142 | math_slope |>
143 | ggplot(aes(x = .intercept, y = .slope_year_subtract)) +
144 | geom_point() +
145 | geom_text_repel(aes(label = country_name), size = 3) +
146 | geom_hline(yintercept = 0, colour = "red") +
147 | labs(x = "Weighted mean math score in first participation",
148 | y = "Avg. increase in weighted mean score every year",
149 | title = "Countries performance in maths") +
150 | scale_y_continuous(limits = c(-5, 8))
151 |
152 |
153 | math_slope_near <- math_slope |>
154 | keys_near(key = country, var = .slope_year_subtract)
155 |
156 | math_slope_near |>
157 | kbl(caption = "Summary Statistics") |>
158 | kable_styling(full_width = NULL,
159 | position = "center",
160 | bootstrap_options = c("hover", "striped"))
161 | ```
162 |
163 | ## Highlighting monotone countries for subjects
164 |
165 | Some countries have, since their initial participation in the PISA study, always exhibited a monotone trend (increasing or decreasing). We use the `brolgar` package to highlight these countries.
166 |
167 | Quite interestingly, the countries exhibiting monotone decreasing patterns are Australia, New Zealand and Netherlands. Despite this decreasing pattern, all three countries remain on the top of the world in terms of their performance. This is consistent with the idea of "saturation" above as we can see a cluster of countries towards the top of the score range of each subject. On the other hand, Qatar and Peru are the two countries that massively improved their performance since the PISA study began.
168 |
169 | ```{r}
170 | # Flag, per country and subject, whether the weighted mean only ever rises/falls.
171 | feature_monotone <- complete_nations |>
172 |   features_at(
173 |     .var = vars(math_wmean, read_wmean, science_wmean),
174 |     features = feat_monotonic
175 |   ) |>
176 |   dplyr::select(country, contains("increase"), contains("decrease"))
177 |
178 | # Long form; the regex splits on the LAST underscore (subject vs direction).
179 | feature_monotone_long <- feature_monotone |>
180 |   pivot_longer(cols = -country, names_to = "names", values_to = "monotone_value") |>
181 |   separate(col = names, into = c("subject", "direction"), sep = "_(?!.*_)")
182 |
183 | # Yearly scores in long form, tagged with the monotonicity flags and country names.
184 | plot_tbl <- complete_nations |>
185 |   as_tibble() |>
186 |   select(year, country, math_wmean, read_wmean, science_wmean) |>
187 |   pivot_longer(cols = contains("_wmean"), names_to = "subject", values_to = "wmean_value") |>
188 |   left_join(feature_monotone_long, by = c("country", "subject")) |>
189 |   left_join(countrycode, by = "country")
190 |
191 | # Highlight only the flagged series, one panel per direction and subject.
192 | plot_tbl |>
193 |   ggplot(aes(x = year, y = wmean_value, group = interaction(country, subject))) +
194 |   geom_line() +
195 |   gghighlight::gghighlight(monotone_value, label_key = country_name) +
196 |   facet_grid(direction ~ subject) +
197 |   labs(
198 |     x = "Year",
199 |     y = "Weighted means",
200 |     title = "Monotonic trending of countries")
201 | ```
202 |
203 |
204 | ## Highlighting variance
205 |
206 | As the PISA study spans multiple countries and schools across time, there is a huge amount of variation in the data that the simple linear analyses above are not able to fully capture. Here, we turn our attention to the variability itself and visualize it. We primarily use the standard deviation and the coefficient of variation to visualize the general trends of countries/regions over time.
207 |
208 | ```{r}
209 | # Per year and country: mean against standard deviation of the maths score.
210 | student |>
211 |   group_by(year, country) |>
212 |   summarise_at(
213 |     .vars = vars(math, read, science, wealth, escs),
214 |     .funs = list(
215 |       mean = \(v) mean(v, na.rm = TRUE),
216 |       sd = \(v) sd(v, na.rm = TRUE))) |>
217 |   ggplot(aes(x = math_mean, y = math_sd, colour = factor(year))) +
218 |   geom_point(size = 3) +
219 |   scale_colour_brewer(palette = "Dark2") +
220 |   labs(x = "Mean maths score", y = "SD maths score",
221 |        title = "Highlighting variance") +
222 |   facet_wrap(~ year) +
223 |   theme(legend.position = "none")
224 |
225 | # Coefficient of variation: standard deviation relative to the mean, NAs dropped.
226 | cv <- function(x) sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE)
227 |
228 |
229 | # SD and coefficient of variation per year/country, for countries seen 5+ times.
230 | stu_var_summ <- student |>
231 |   group_by(year, country) |>
232 |   summarise_at(
233 |     .vars = vars(math, read, science),
234 |     .funs = list(sd = \(v) sd(v, na.rm = TRUE), cv = \(v) cv(v))) |>
235 |   group_by(country) |>
236 |   filter(n() >= 5) |>
237 |   ungroup()
238 |
239 | # One row per year/country/subject/statistic ("math_sd" -> "math", "sd").
240 | stu_var_summ_long <- stu_var_summ |>
241 |   pivot_longer(
242 |     cols = -c("year", "country"),
243 |     names_to = "names",
244 |     values_to = "values") |>
245 |   separate(col = "names", into = c("subject", "statistic"), sep = "_")
246 |
247 | stu_var_summ_long |>
248 |   ggplot(aes(x = year, y = values, group = country)) +
249 |   geom_line() +
250 |   facet_grid(statistic ~ subject, scales = "free_y") +
251 |   labs(
252 |     x = "Year",
253 |     y = "Values",
254 |     title = "Highlighting variance")
255 | ```
256 |
257 | In the plot above, we see that countries with high variation tend to see it decrease as time passes, while countries with low variation tend to stay that way. This implies that most countries/regions typically exhibit a non-increasing pattern in the variation of their performance.
258 |
259 | We now zoom into the mathematics performance (measured using coefficient of variation) panel and take a close look at the countries, highlighting certain countries of interest.
260 |
261 | ```{r}
262 | # Per-country series of the maths CV; keys_near() picks the countries to label,
263 | # based on each country's median CV. Unpicked countries end up with `stat` NA.
264 | stu_var_summ <- stu_var_summ |> as_tsibble(key = country, index = year)
265 |
266 | stu_var_near <- stu_var_summ |>
267 |   features(math_cv, feat_brolgar) |>
268 |   keys_near(key = country, var = median)
269 |
270 | stu_var_plotdf <- stu_var_summ_long |>
271 |   filter(subject == "math", statistic == "cv") |>
272 |   left_join(stu_var_near, by = "country") |>
273 |   left_join(countrycode, by = "country") |>
274 |   as_tibble() |>
275 |   mutate(label_stats_country = ifelse(is.na(stat), NA, paste0(stat, ":", country_name)))
276 |
277 | stu_var_plotdf |>
278 |   ggplot(aes(x = year, y = values, group = country, colour = stat)) +
279 |   geom_line() +
280 |   gghighlight::gghighlight(!is.na(stat), label_key = label_stats_country) +
281 |   labs(y = "Coefficient of variation across students",
282 |        x = "Year",
283 |        title = "Maths scores using coefficient of variation")
284 | ```
285 |
286 | We again see Qatar appearing in this visualization. Qatar is highlighted as it is a country with a large amount of variation, implying a high level of inequality in the performance in mathematics. But what is particularly interesting here is that Qatar has consistently lowered the variation every time it has participated in the PISA study. Combined with the visualizations above, we might conjecture that Qatar is not only improving its performance but also the equality of access.
287 |
288 | ## Gender gap over time
289 |
290 | One of the ongoing myths in education is that there is a difference in the performance of different subjects by gender. While this may appear true in selected cases, it is important to note that gender is often a confounding variable masking the effect of some genuine underlying cause. Together with the large amount of variation in the data across the socioeconomic status of different families in different countries/regions, it is never possible to draw a generalized conclusion.
291 |
292 | That being said, we now visualize the differences in the average test scores for each gender (PISA study chose a binary coding). Across the three subjects, there are more countries with a higher average for the boys in maths. In reading, girls completely dominate in every country while performance in science is more evenly split between the genders.
293 |
294 | ```{r}
295 | # Weighted mean score per year/country/gender, one column per gender x subject.
296 | stu_gender_summ <- student |>
297 |   filter(!is.na(gender)) |>
298 |   group_by(year, country, gender) |>
299 |   summarise_at(
300 |     .vars = vars(math, read, science),
301 |     .funs = list(wmean = ~w_mean(., w = stu_wgt))) |>
302 |   group_by(country) |>
303 |   filter(n() >= 10) |>  # at least 10 year-by-gender rows per country
304 |   ungroup() |>
305 |   pivot_longer(cols = contains("_wmean"), names_to = "names", values_to = "values") |>
306 |   pivot_wider(names_from = c("gender", "names"), values_from = "values")
307 |
308 | # Gender gap per subject: female minus male weighted mean.
309 | stu_ggap_summ <- stu_gender_summ |>
310 |   dplyr::transmute(
311 |     year, country,
312 |     gap_math_wmean = female_math_wmean - male_math_wmean,
313 |     gap_read_wmean = female_read_wmean - male_read_wmean,
314 |     gap_science_wmean = female_science_wmean - male_science_wmean)
315 |
316 | # Long form so that each gap measure gets its own facet.
317 | stu_ggap_summ_long <- stu_ggap_summ |>
318 |   pivot_longer(cols = contains("gap"), names_to = "gap_names", values_to = "gap_values")
319 |
320 | stu_ggap_summ_long |>
321 |   ggplot(aes(x = year, y = gap_values)) +
322 |   geom_point() +
323 |   geom_line(aes(group = country)) +
324 |   geom_hline(yintercept = 0, colour = "red") +
325 |   facet_wrap(~ gap_names) +
326 |   labs(
327 |     title = "Average gender gaps across subjects and years",
328 |     subtitle = "Gap = avg. female score - avg. male score",
329 |     x = "Year",
330 |     y = "Gender Gap Values"
331 |   )
332 | ```
333 |
334 | ### Highlighting key countries across all three subjects
335 |
336 |
337 | ```{r, fig.width = 18}
338 | # For each gap measure: nest the data, let keys_near() pick countries based on
339 | # the `range2` feature, and join those picks back onto the nested rows.
340 | stu_ggap_summ_nest <- stu_ggap_summ |>
341 |   pivot_longer(contains("_wmean"), names_to = "names", values_to = "values") |>
342 |   group_by(names) |>
343 |   nest() |>
344 |   mutate(
345 |     f_tbl = map(.x = data, .f = \(d) d |>
346 |       as_tsibble(key = country, index = year) |>
347 |       features(values, feat_brolgar) |>
348 |       keys_near(key = country, var = range2)),
349 |     f_data = map2(.x = data, .y = f_tbl,
350 |                   .f = \(d, near) left_join(d, near, by = "country")))
351 |
352 | stu_ggap_summ_plotdf <- stu_ggap_summ_nest |>
353 |   select(names, f_data) |>
354 |   unnest(f_data) |>
355 |   left_join(countrycode, by = "country") |>
356 |   mutate(label_stats_country = ifelse(is.na(stat), NA, paste0(stat, ":", country_name)))
357 |
358 | stu_ggap_summ_plotdf |>
359 |   ggplot(aes(x = year, y = values)) +
360 |   geom_line(aes(group = country, colour = country)) +
361 |   gghighlight(!is.na(stat), label_key = label_stats_country,
362 |               calculate_per_facet = TRUE, keep_scales = TRUE) +
363 |   facet_wrap(~ names) +
364 |   labs(x = "Year", y = "Values",
365 |        title = "Highlighting key countries across all three subjects")
366 | ```
367 |
368 |
369 | ### Highlighting key countries for maths only
370 |
371 | ```{r}
372 | # Countries picked by keys_near() on the per-country median maths gap.
373 | stu_gap_math_near <- stu_ggap_summ |>
374 |   as_tsibble(key = country, index = year) |>
375 |   features(gap_math_wmean, feat_brolgar) |>
376 |   keys_near(key = country, var = median)
377 |
378 | # Attach the picks and country names; unpicked countries get an NA label.
379 | stu_gap_math_plotdf <- stu_ggap_summ |>
380 |   as_tibble() |>
381 |   left_join(stu_gap_math_near, by = "country") |>
382 |   left_join(countrycode, by = "country") |>
383 |   mutate(label_stats_country = ifelse(is.na(stat), NA, paste0(stat, ":", country_name)))
384 |
385 | p_math <- stu_gap_math_plotdf |>
386 |   ggplot(aes(x = year, y = gap_math_wmean, group = country, colour = stat)) +
387 |   geom_line() +
388 |   gghighlight::gghighlight(!is.na(stat), label_key = label_stats_country) +
389 |   labs(x = "Year", y = "Values", title = "Highlighting key countries for maths")
390 |
391 | # Print the plot.
392 | p_math
393 | ```
394 |
395 |
396 | # Session info
397 | ```{r}
398 | utils::sessionInfo()  # R version and packages in use when this article was built
399 | ```
400 |
--------------------------------------------------------------------------------
/vignettes/learningtower_school.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Using the Student and School Data"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 | %\VignetteIndexEntry{learningtower_school}
6 | %\VignetteEncoding{UTF-8}
7 | %\VignetteEngine{knitr::rmarkdown}
8 | editor_options:
9 | chunk_output_type: console
10 | ---
11 |
12 | ```{r setup, include = FALSE}
13 | # Default chunk options for this vignette.
14 | # `out.width` (with the dot) is the option knitr reads for figure display width.
15 | knitr::opts_chunk$set(
16 |   echo = TRUE,
17 |   collapse = TRUE,
18 |   comment = "#>",
19 |   warning = FALSE, message = FALSE, error = FALSE,
20 |   out.width = "100%",
21 |   fig.width = 6,
22 |   fig.height = 4,
23 |   fig.align = "center")
24 | ```
25 |
26 | # Introduction
27 |
28 | The goal of `learningtower` is to provide a user-friendly R package that gives easy access to a subset of variables from PISA data collected from the [OECD](https://www.oecd.org/en/about/programmes/pisa/pisa-data.html). Version `r utils::packageVersion("learningtower")` of this package provides the data for the years `r learningtower:::data_range()`. The survey data is published every three years. This is an excellent real-world dataset for data exploration, data visualization and statistical computation.
29 |
30 | This vignette documents how to access the data, and shows a few ways of integrating the data.
31 |
32 | # Using both the `student` and `school` data
33 |
34 | The full `student` data is too big to fit inside the package. Hence, in our package, we provide a random subset of the student data, stored as `student_subset_yyyy` data objects (where `yyyy` denotes the specific year of the study). These subset data can be used to understand the data structure before using the full dataset, which is available for download.
35 |
36 | In the `student_subset_2018` and `school` data, there are three common columns, `school_id`, `country` and `year`. It should be noted that `school_id` is only meaningful within a country within a specific year, meaning that when we join the two datasets, we need to use the keys `c("school_id", "country", "year")`.
37 |
38 | ## Using the student subset data and school data
39 |
40 | ```{r}
41 | library(dplyr)
42 | library(ggplot2)
43 | library(forcats)
44 | library(learningtower)
45 |
46 | # Load the 2018 student subset, the school data and the country-code lookup.
47 | data(student_subset_2018)
48 | data(school)
49 | data(countrycode)
50 |
51 | # Three-letter codes of the countries used in the examples below.
52 | selected_countries <- c("AUS", "FIN", "JPN", "USA", "NZL", "ESP")
53 |
54 | # `school_id` is only meaningful within a country and year, so the join uses
55 | # all three keys.
56 | school_student_subset_2018 <- student_subset_2018 |>
57 |   left_join(school, by = c("school_id", "country", "year"))
58 |
59 | # For a few selected countries: within-country share of joined rows by school
60 | # type (public/private), drawn as stacked horizontal bars.
61 | school_student_subset_2018 |>
62 |   dplyr::filter(country %in% selected_countries) |>
63 |   group_by(country, public_private) |>
64 |   tally() |>
65 |   # tally() leaves the data grouped by country, so sum(n) is the country total
66 |   dplyr::mutate(percent = n / sum(n)) |>
67 |   dplyr::ungroup() |>
68 |   left_join(countrycode, by = "country") |>
69 |   ggplot(aes(x = percent,
70 |              y = country_name,
71 |              fill = public_private)) +
72 |   geom_col(position = position_stack()) +
73 |   scale_x_continuous(labels = scales::percent) +
74 |   scale_fill_manual(values = c("#FF7F0EFF", "#1F77B4FF")) +
75 |   labs(
76 |     title = "Distribution of public and private schools in the year 2018",
77 |     y = "",
78 |     x = "Percentage of schools",
79 |     fill = ""
80 |   )
81 | ```
82 |
83 | - The graph assists us in understanding the distribution of public and private schools in a few countries based on the datasets. Taking a closer look at the above plot, we can infer that most countries have more public schools than private schools. Interestingly, Spain had a nearly equal mix of public and private schools in the year 2018.
84 |
85 | - Similarly, we may derive additional intriguing patterns and analysis by considering the other variables in the school dataset.
86 |
87 | ```{r, echo=FALSE}
88 | # Full 2018 student data (retrieved via load_student()), joined to the school data.
89 | student_data_2018 <- load_student("2018")
90 | data(school)
91 | data(countrycode)
92 |
93 | school_student_2018 <- left_join(
94 |   student_data_2018,
95 |   school,
96 |   by = c("school_id", "country", "year"))
97 |
98 | # Mean `fund_gov` per country in `selected_countries` (set in an earlier chunk).
99 | school_student_2018 |>
100 |   dplyr::filter(country %in% selected_countries) |>
101 |   group_by(country) |>
102 |   summarise(avg_fund_gov = mean(fund_gov, na.rm = TRUE)) |>
103 |   left_join(countrycode, by = "country") |>
104 |   # order the axis by funding; only the display name needs to be a factor
105 |   mutate(country_name = fct_reorder(country_name, avg_fund_gov)) |>
106 |   ggplot(aes(x = country_name, y = avg_fund_gov)) +
107 |   geom_segment(aes(xend = country_name, yend = 0)) +
108 |   geom_point(size = 4, color = "orange") +
109 |   coord_flip() +
110 |   theme_bw() +
111 |   labs(x = "",
112 |        y = "Average percentage of government funding",
113 |        title = "Funding for schools in the year 2018 from government")
114 | ```
115 |
116 | - The above figure shows the average percentage of overall financing in various schools for a random sample of countries. We conclude that countries such as Finland and the United States received the most funding from their governments, whilst Qatar received the least funding.
117 |
118 | - In addition, to perform a detailed analysis of the school data and the entire student data, the student data can be downloaded for the desired years using the `load_student` function available in this package.
119 |
120 | - Similarly, you may import student data for any chosen year and experiment with the growth of PISA scores, or carry out additional analyses of these datasets using their other variables to better understand the data. Refer to our articles [here](https://kevinwang09.github.io/learningtower/articles/exploring_time.html) for additional interesting analyses and plots.
121 |
--------------------------------------------------------------------------------
/vignettes/learningtower_student.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Using the Student and Country Data"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 | %\VignetteIndexEntry{learningtower_student}
6 | %\VignetteEncoding{UTF-8}
7 | %\VignetteEngine{knitr::rmarkdown}
8 | editor_options:
9 | chunk_output_type: console
10 | ---
11 |
12 | ```{r setup, include = FALSE}
13 | # The title differs from the VignetteIndexEntry, so disable rmarkdown's title check.
14 | options(rmarkdown.html_vignette.check_title = FALSE)
15 | # `out.width` (with the dot) is the option knitr reads for figure display width.
16 | knitr::opts_chunk$set(
17 |   echo = TRUE,
18 |   collapse = TRUE,
19 |   comment = "#>",
20 |   warning = FALSE, message = FALSE, error = FALSE,
21 |   out.width = "100%",
22 |   fig.width = 8,
23 |   fig.height = 6,
24 |   fig.align = "center")
25 | ```
26 |
27 | # Introduction
28 |
29 | The goal of `learningtower` is to provide a user-friendly R package that gives easy access to a subset of variables from PISA data collected from the [OECD](https://www.oecd.org/en/about/programmes/pisa/pisa-data.html). Version `r utils::packageVersion("learningtower")` of this package provides the data for the years `r learningtower:::data_range()`. The survey data is published every three years. This is an excellent real-world dataset for data exploration, data visualization and statistical computation.
30 |
31 | This vignette documents how to access the data, and shows a few typical methods to explore the data.
32 |
33 | # Exploring the `student` data
34 |
35 | ## Usage of the subset of the `student` data
36 |
37 | - In `learningtower`, the main data is the student data. This data contains information regarding student test scores and some selected variables regarding their schooling and socio-economic status. The original and complete data may be obtained from [OECD](https://www.oecd.org/pisa/data/).
38 |
39 | - However, the full `student` data is too big to fit inside the package. Hence, in our package, we provide a random subset of the student data, stored as `student_subset_20xx` data objects (where `xx` denotes the specific year of the study). These subset data can be used to understand the data structure before using the full dataset, which is available for download.
40 |
41 | - The student subset data is constructed by randomly sampling from the full student data. For each year and each country, we randomly sample approximately 50 observations.
42 |
43 | - The complete student dataset is [available for download](https://github.com/kevinwang09/learningtower/tree/master/student_full_data) and can be loaded using the `load_student()` function included in this package.
44 |
45 |
46 | Below is a quick example of loading the 2018 subset student data.
47 |
48 | ```{r}
49 | library(dplyr)
50 | library(ggplot2)
51 | library(learningtower)
52 |
53 | # Load the 2018 student subset and the country-code lookup provided by
54 | # the package.
55 | data(student_subset_2018, countrycode)
56 |
57 | # Compact overview of the columns and their types.
58 | glimpse(student_subset_2018)
59 | ```
60 |
61 | ```{r}
62 | selected_countries <- c("AUS", "USA", "TUR", "SWE",
63 |                         "CHE", "NZL", "BEL", "DEU")
64 |
65 | # Maths score distribution by gender for the selected countries. No grouping is
66 | # needed here: nothing is summarised and the boxplot splits by `fill` itself.
67 | student_subset_2018 |>
68 |   dplyr::filter(country %in% selected_countries) |>
69 |   dplyr::left_join(countrycode, by = "country") |>
70 |   ggplot(aes(x = math,
71 |              y = country_name,
72 |              fill = gender)) +
73 |   geom_boxplot() +
74 |   scale_fill_manual(values = c("#FF7F0EFF", "#1F77B4FF")) +
75 |   theme_classic() +
76 |   labs(x = "Math score", y = "")
77 | ```
78 |
79 | - In the figure above, we see that, in the student subset data for the year 2018, boys perform better than girls in countries like the USA and Belgium. However, in countries such as Turkey and Switzerland, girls perform better than boys, or are on the same level as boys, when it comes to their average mathematics scores.
80 |
81 | - Furthermore, if we want to learn more about the trend in each year of the selected countries or know more about the yearly student scores, the complete student data can be retrieved for that/those years or all years using the `load_student()` function included in this package.
82 |
83 | ## Usage of the entire student data
84 |
85 | - The complete student data can be downloaded and loaded for one or more years. Here are the various ways to retrieve the entire student dataset for the chosen year(s) for additional study or analysis purposes.
86 |
87 | ```
88 | #load the entire student data for the year 2018
89 | student_data_2018 <- load_student(2018)
90 |
91 | #load the entire student data for two of the years (2012, 2018)
92 | student_data_2012_2018 <- load_student(c(2012, 2018))
93 |
94 | #load the entire student data for all available years
95 | student_data_all <- load_student("all")
96 | ```
97 |
98 | - Now that we can download and load the entire student data, let us plot the difference in scores for the few randomly picked countries seen previously and observe how their average mathematics scores have changed from the year 2012 to 2018.
99 |
100 | ```{r}
101 | # Full student data for 2012 and 2018, retrieved via load_student().
102 | student_data_2012_2018 <- load_student(c(2012, 2018))
103 |
104 | # Average maths score per country and year for the selected countries.
105 | # Filtering first keeps the grouping small, and dropping the groups before
106 | # select() stops dplyr from silently re-adding the grouping column `country`.
107 | plot_data <- student_data_2012_2018 |>
108 |   dplyr::filter(country %in% selected_countries) |>
109 |   group_by(country, year) |>
110 |   dplyr::summarise(avg_math = mean(math, na.rm = TRUE), .groups = "drop") |>
111 |   left_join(countrycode, by = "country") |>
112 |   dplyr::select(country_name, year, avg_math) |>
113 |   dplyr::mutate(
114 |     # labels sit left of 2012 (name and score) and right of 2018 (score only)
115 |     label_x_pos = ifelse(year == 2012, 2012 - 2, 2018 + 1),
116 |     label = ifelse(
117 |       year == 2012,
118 |       paste0(country_name, ", ", round(avg_math)),
119 |       round(avg_math)))
120 |
121 | # Slope chart: one line per country between the two study years.
122 | plot_data |>
123 |   ggplot(aes(x = year,
124 |              y = avg_math,
125 |              label = label,
126 |              colour = country_name,
127 |              group = country_name)) +
128 |   geom_point() +
129 |   geom_line() +
130 |   geom_vline(xintercept = c(2012, 2018), linetype = "dashed", linewidth = 0.1) +
131 |   geom_text(aes(x = label_x_pos),
132 |             position = position_nudge(y = 0)) +
133 |   scale_x_continuous(breaks = c(2012, 2018),
134 |                      limits = c(2008, 2020)) +
135 |   scale_colour_manual(values = c("#1F77B4FF", "#FF7F0EFF", "#2CA02CFF", "#D62728FF",
136 |                                  "#9467BDFF", "#8C564BFF", "#E377C2FF", "#7F7F7FFF")) +
137 |   labs(x = "",
138 |        y = "Average maths score") +
139 |   theme_classic() +
140 |   theme(axis.ticks.y = element_blank(),
141 |         axis.text.y = element_blank(),
142 |         legend.position = "none")
143 | ```
144 |
145 | - The figure above assists us in deducing the score change in the different countries from the year 2012 to 2018. This figure enables us to deduce that Albania, Qatar, and Peru have significantly boosted their average mathematics score between these years. We also observe a drop in the average mathematics score for Japan.
146 |
147 | - Similarly, you may import student data for any chosen year and experiment with the PISA scores, or carry out additional analyses of these datasets using their other variables to better understand the data. Refer to our articles [here](https://kevinwang09.github.io/learningtower/articles/exploring_time.html) for additional interesting analyses and plots.
148 |
149 |
--------------------------------------------------------------------------------