├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ └── pkgdown.yaml ├── .gitignore ├── CRAN-SUBMISSION ├── Code_of_Conduct.md ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── data.R ├── load_student.R └── zzz.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── cran-comments.md ├── data ├── countrycode.rda ├── school.rda ├── student_subset_2000.rda ├── student_subset_2003.rda ├── student_subset_2006.rda ├── student_subset_2009.rda ├── student_subset_2012.rda ├── student_subset_2015.rda ├── student_subset_2018.rda └── student_subset_2022.rda ├── inst └── CITATION ├── learningtower.Rproj ├── man ├── countrycode.Rd ├── figures │ ├── README_school_data_missing_values_summary.png │ ├── README_student_data_missing_values_summary.png │ ├── conversation_holden.png │ ├── logo.png │ ├── pisa_image.png │ ├── readme.gif │ └── readme.png ├── load_student.Rd ├── school.Rd └── student.Rd ├── student_full_data ├── student_2000.rds ├── student_2003.rds ├── student_2006.rds ├── student_2009.rds ├── student_2012.rds ├── student_2015.rds ├── student_2018.rds └── student_2022.rds ├── tests ├── testthat.R └── testthat │ ├── test-countrycode-col-types.R │ ├── test-merge.R │ ├── test-school-col-types.R │ ├── test-student-col-types.R │ └── test-test-load.R └── vignettes ├── .gitignore ├── articles ├── Australia_trends.Rmd └── exploring_time.Rmd ├── learningtower_school.Rmd └── learningtower_student.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^_pkgdown\.yml$ 5 | ^docs$ 6 | ^pkgdown$ 7 | ^\.github$ 8 | ^README_cache$ 9 | ^data_raw$ 10 | ^LICENSE\.md$ 11 | ^pkgdown$ 12 | ^vignettes/articles$ 13 | ^student_full_data$ 14 | ^school_full_data$ 15 | ^CODE_OF_CONDUCT\.md$ 16 | ^CRAN-RELEASE$ 17 | ^cran-comments\.md$ 18 | ^man/figures/readme.gif$ 19 | ^man/figures/README_student_data_missing_values_summary.png$ 20 | 
^man/figures/README_school_data_missing_values_summary.png$ 21 | ^\.DS_Store$ 22 | ^CRAN-SUBMISSION$ 23 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: 6 | - master 7 | - '*' 8 | pull_request: 9 | branches: 10 | - master 11 | - '*' 12 | 13 | name: R-CMD-check 14 | 15 | jobs: 16 | R-CMD-check: 17 | runs-on: ${{ matrix.config.os }} 18 | 19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | config: 25 | - {os: macos-latest, r: 'release'} 26 | - {os: windows-latest, r: 'release'} 27 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 28 | - {os: ubuntu-latest, r: 'release'} 29 | - {os: ubuntu-latest, r: 'oldrel-1'} 30 | 31 | env: 32 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 33 | R_KEEP_PKG_SOURCE: yes 34 | 35 | steps: 36 | - uses: actions/checkout@v3 37 | 38 | - uses: r-lib/actions/setup-pandoc@v2 39 | 40 | - uses: r-lib/actions/setup-r@v2 41 | with: 42 | r-version: ${{ matrix.config.r }} 43 | http-user-agent: ${{ matrix.config.http-user-agent }} 44 | use-public-rspm: true 45 | 46 | - uses: r-lib/actions/setup-r-dependencies@v2 47 | with: 48 | extra-packages: any::rcmdcheck 49 | needs: check 50 | 51 | - uses: r-lib/actions/check-r-package@v2 52 | with: 53 | upload-snapshots: true 54 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: 
-------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | steps: 23 | - uses: actions/checkout@v3 24 | 25 | - uses: r-lib/actions/setup-pandoc@v2 26 | 27 | - uses: r-lib/actions/setup-r@v2 28 | with: 29 | use-public-rspm: true 30 | 31 | - uses: r-lib/actions/setup-r-dependencies@v2 32 | with: 33 | extra-packages: any::pkgdown, local::. 34 | needs: website 35 | 36 | - name: Install additional dependencies 37 | run: | 38 | install.packages("remotes") 39 | install.packages("tidyverse") 40 | install.packages("rmarkdown") 41 | install.packages("broom") 42 | install.packages("knitr") 43 | install.packages("ggrepel") 44 | install.packages("patchwork") 45 | install.packages("gghighlight") 46 | install.packages("brolgar") 47 | install.packages("tsibble") 48 | install.packages("lme4") 49 | install.packages("gganimate") 50 | install.packages("ggfortify") 51 | install.packages("sjPlot") 52 | install.packages("kableExtra") 53 | shell: Rscript {0} 54 | 55 | - name: Build site 56 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 57 | shell: Rscript {0} 58 | 59 | - name: Deploy to GitHub pages 60 | if: github.event_name != 'pull_request' 61 | uses: JamesIves/github-pages-deploy-action@v4.4.1 62 | with: 63 | clean: false 64 | branch: gh-pages 65 | folder: docs 66 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | docs 6 | inst/doc 7 | doc 8 | Meta 9 | .DS_Store 10 | -------------------------------------------------------------------------------- /CRAN-SUBMISSION: -------------------------------------------------------------------------------- 1 | Version: 1.1.0 2 | Date: 2024-12-21 01:56:18 UTC 3 | SHA: e46f5094a6d05f4406087ca802e3466209a0abf3 4 | -------------------------------------------------------------------------------- /Code_of_Conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who 4 | contribute through reporting issues, posting feature requests, updating documentation, 5 | submitting pull requests or patches, and other activities. 6 | 7 | We are committed to making participation in this project a harassment-free experience for 8 | everyone, regardless of level of experience, gender, gender identity and expression, 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 10 | 11 | Examples of unacceptable behavior by participants include the use of sexual language or 12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment, 13 | insults, or other unprofessional conduct. 14 | 15 | Project maintainers have the right and responsibility to remove, edit, or reject comments, 16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 18 | from the project team. 
19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 21 | opening an issue or contacting one or more of the project maintainers. 22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (http:contributor-covenant.org), version 1.0.0, available at 25 | http://contributor-covenant.org/version/1/0/0/ 26 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: learningtower 2 | Title: OECD PISA Datasets from 2000-2022 in an Easy-to-Use Format 3 | Version: 1.1.0 4 | Authors@R: c( 5 | person(given = "Kevin", family = "Wang", role = c("aut", "cre"), 6 | email = "kevinwangstats@gmail.com"), 7 | person(given = "Paul", family = "Yacobellis", role = "aut", 8 | email = "pyacobellis@hotmail.com"), 9 | person(given = "Erika", family = "Siregar", role = "aut", 10 | email = "erika.mukhlisina@gmail.com"), 11 | person(given = "Sarah", family = "Romanes", role = "aut", 12 | email = "srom8308@uni.sydney.edu.au"), 13 | person(given = "Kim", family = "Fitter", role = "aut", 14 | email = "kimfitter@yahoo.com"), 15 | person(given = "Giulio", family = "Valentino Dalla Riva", role = "aut", 16 | email = "me@gvdallariva.net"), 17 | person(given = "Dianne", family = "Cook", role = "aut", 18 | email = "dicook@monash.edu"), 19 | person(given = "Nick", family = "Tierney", role = "aut", 20 | email = "nicholas.tierney@gmail.com"), 21 | person(given = "Priya", family = "Dingorkar", role = "aut", 22 | email = "priyadingorkar@gmail.com"), 23 | person(given = "Shabarish", family = "Sai Subramanian", role = "aut", 24 | email = "shabarish161@gmail.com"), 25 | person(given = "Guan Ru", family = "Chen", role = "aut", 26 | email = "rix09207@gmail.com") 27 | ) 28 | Description: The Programme for International Student Assessment (PISA) is a global study conducted by the Organization for Economic Cooperation and 
Development (OECD) in member and non-member countries to assess educational systems by assessing 15-year-old school students' academic performance in mathematics, science, and reading. This dataset contains information on their scores and other socioeconomic characteristics, information about their school and its infrastructure, as well as the countries that are taking part in the program. 29 | Depends: R (>= 3.5.0) 30 | Encoding: UTF-8 31 | RoxygenNote: 7.3.2 32 | VignetteBuilder: knitr 33 | License: MIT + file LICENSE 34 | URL: https://kevinwang09.github.io/learningtower/, https://github.com/kevinwang09/learningtower 35 | BugReports: https://github.com/kevinwang09/learningtower/issues 36 | Imports: 37 | tibble, 38 | dplyr 39 | Suggests: 40 | testthat (>= 3.0.0), 41 | knitr, 42 | rmarkdown, 43 | ggplot2, 44 | forcats, 45 | scales 46 | Config/testthat/edition: 3 47 | LazyData: true 48 | LazyDataCompression: xz 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2024 2 | COPYRIGHT HOLDER: learningtower authors 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2024 learningtower authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(load_student) 4 | importFrom(dplyr,bind_rows) 5 | importFrom(tibble,tibble) 6 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # learningtower 1.1.0 2 | 3 | * Added 2022 data set. 4 | * Updates to the README and contributor information. 5 | * Added a `NEWS.md` file to track changes to the package. 6 | * `year` column in both the `student` and the `school` datasets are changed from a factor column to an integer column. 7 | * `school_id` column in both the `student` and the `school` datasets are changed from a factor column to a character column. 8 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' @title Processed and Sampled PISA Student Data (2000-2022) 2 | #' 3 | #' @description This dataset provides a clean and processed subset of the OECD PISA student data 4 | #' for the years 2000-2022. The original data is sourced from 5 | #' \url{https://www.oecd.org/en/about/programmes/pisa/pisa-data.html} and has been prepared for analysis. 
6 | #' A sampling of 50 students per country (for OECD countries) has been included for each year. 7 | #' The data curation and sampling process are documented in \url{https://github.com/kevinwang09/learningtower_masonry/blob/master/Code/student_bind_rows.Rmd} 8 | #' 9 | #' @format A tibble of the following variables 10 | #' \itemize{ 11 | #' \item \code{year}: Year of the PISA data. Integer. 12 | #' \item \code{country}: Country 3 character code. Note that some regions/territories are coded as "country" for ease of input. Factor. 13 | #' \item \code{school_id}: Unique school identifier for each country and year. Character. 14 | #' \item \code{student_id}: Unique student identifier within each school. Integer. 15 | #' \item \code{mother_educ}: Mother's highest level of education, from "less than ISCED1" to "ISCED 3A". Factor. 16 | #' \item \code{father_educ}: Father's highest level of education, from "less than ISCED1" to "ISCED 3A". Factor. 17 | #' \item \code{gender}: Gender of the student. Only "male" and "female" are recorded. Factor. 18 | #' Note that we call this variable gender and not sex as this term was used in the OECD PISA database. 19 | #' \item \code{computer}: Possession of computer. Only "yes" and "no" are recorded. Factor. 20 | #' \item \code{internet}: Access to internet. Only "yes" and "no" are recorded. Factor. 21 | #' \item \code{math}: Simulated score in mathematics. Numeric. 22 | #' \item \code{read}: Simulated score in reading. Numeric. 23 | #' \item \code{science}: Simulated score in science. Numeric. 24 | #' \item \code{stu_wgt}: The final survey weight score for the student score. Numeric. 25 | #' \item \code{desk}: Possession of desk to study at. Only "yes" and "no" are recorded. Factor. 26 | #' \item \code{room}: Possession of a room of your own. Only "yes" and "no" are recorded. Factor. 27 | #' \item \code{dishwasher}: Possession of a dishwasher. Only "yes" and "no" are recorded. Factor. 
28 | #' Note that in 2015 and 2018, all entries are missing. 29 | #' \item \code{television}: Number of televisions. 30 | #' "0", "1", "2" are code for no, one and two TVs in the house. "3+" codes for three or more TVs. Factor. 31 | #' Note that in 2003, all entries are missing. 32 | #' \item \code{computer_n}: Number of computers. 33 | #' "0", "1", "2" are code for no, one and two computers in the house. "3+" codes for three or more computers. Factor. 34 | #' Note that in 2003, all entries are missing. 35 | #' \item \code{car}: Number of cars. 36 | #' "0", "1", "2" are code for no, one and two cars in the house. "3+" codes for three or more cars Factor. 37 | #' Note that in 2003, all entries are missing. 38 | #' \item \code{book}: Number of books. Factor. 39 | #' Note that encoding is different in the years 2000 and 2003 compared to all other years. Factor. 40 | #' Evaluate \code{table(student$book, student$year)} for a demo. 41 | #' \item \code{wealth}: Index of family wealth. Numeric. 42 | #' Note that in 2003, all entries are missing. 43 | #' \item \code{escs}: Index of economic, social and cultural status. Numeric. 
44 | #' } 45 | #' @docType data 46 | #' @name student 47 | #' @rdname student 48 | #' @importFrom dplyr bind_rows 49 | #' @examples 50 | #' library(dplyr) 51 | #' data(student_subset_2000) 52 | #' data(student_subset_2003) 53 | #' dplyr::bind_rows( 54 | #' student_subset_2000, 55 | #' student_subset_2003 56 | #' ) 57 | NULL 58 | 59 | 60 | #' @docType data 61 | #' @name student_subset_2000 62 | #' @rdname student 63 | NULL 64 | 65 | #' @docType data 66 | #' @name student_subset_2003 67 | #' @rdname student 68 | NULL 69 | 70 | #' @docType data 71 | #' @name student_subset_2006 72 | #' @rdname student 73 | NULL 74 | 75 | #' @docType data 76 | #' @name student_subset_2009 77 | #' @rdname student 78 | NULL 79 | 80 | #' @docType data 81 | #' @name student_subset_2012 82 | #' @rdname student 83 | NULL 84 | 85 | #' @docType data 86 | #' @name student_subset_2015 87 | #' @rdname student 88 | NULL 89 | 90 | #' @docType data 91 | #' @name student_subset_2018 92 | #' @rdname student 93 | NULL 94 | 95 | #' @docType data 96 | #' @name student_subset_2022 97 | #' @rdname student 98 | NULL 99 | 100 | 101 | #' @title Subset of the School data available for the years 2000-2022 from the PISA OECD database 102 | #' 103 | #' @description A subset data containing school weight and other information 104 | #' from the triennial testing of 15 year olds around 105 | #' the globe. Original data available from 106 | #' \url{https://www.oecd.org/en/about/programmes/pisa/pisa-data.html}. 107 | #' 108 | #' @format A tibble of the following variables 109 | #' \itemize{ 110 | #' \item \code{year}: Year of the PISA data. Integer. 111 | #' \item \code{country}: Country 3 character code. Note that some regions/territories are coded as country for ease of input. Character. 112 | #' \item \code{school_id}: The school identification number, unique for each country and year combination. Character. 113 | #' \item \code{fund_gov}: Percentage of total funding for school year from government. Numeric. 
114 | #' \item \code{fund_fees}: Percentage of total funding for school year from student fees or school charges paid by parents. Numeric. 115 | #' \item \code{fund_donation}: Percentage of total funding for school year from 116 | #' benefactors, donations, bequests, sponsorship, parent fundraising. Numeric. 117 | #' \item \code{enrol_boys}: Number of boys in the school. Numeric. 118 | #' \item \code{enrol_girls}: Number of girls in the school. Numeric. 119 | #' \item \code{stratio}: Student-Teacher ratio. Numeric. 120 | #' \item \code{public_private}: Is the school a public or private school. Factor. 121 | #' \item \code{staff_shortage}: Shortage of staff. Numeric. 122 | #' \item \code{sch_wgt}: The final survey weight score for the schools. Numeric. 123 | #' \item \code{school_size}: The school size. Numeric. 124 | #' } 125 | #' @docType data 126 | #' @name school 127 | NULL 128 | 129 | #' @title Country iso3c and name mapping for PISA OECD countries participants. 130 | #' 131 | #' @description A dataset containing mapping of the country ISO code to the country names. 132 | #' More information on participating countries can be found at 133 | #' \url{https://www.oecd.org/pisa/aboutpisa/pisa-participants.htm}. 134 | #' 135 | #' @format A tibble of the following variables 136 | #' \itemize{ 137 | #' \item \code{country}: Country 3 character code. Note that some regions/territories are coded as country for ease of input. Character. 138 | #' \item \code{country_name}: Country name. Note that some regions/territories are coded as country for ease of input. Character. 
139 | #' }
140 | #' @docType data
141 | #' @name countrycode
142 | NULL
143 | 
144 | 
--------------------------------------------------------------------------------
/R/load_student.R:
--------------------------------------------------------------------------------
1 | #'@title Download the full PISA student data for one or more survey years
2 | #'
3 | #'@description \code{load_student()} downloads the full student data for each
4 | #'requested PISA survey year between 2000 and 2022 from the package's GitHub
5 | #'repository and row-binds the yearly tables into a single tibble. Pass the
6 | #'string "all" to get every available year.
7 | #'
8 | #'@param year A vector of survey years, given as numbers or strings. Each value
9 | #'must be one of 2000, 2003, 2006, 2009, 2012, 2015, 2018 or 2022, or the
10 | #'string "all", which selects every available year.
11 | #'
12 | #'@importFrom dplyr bind_rows
13 | #'@importFrom tibble tibble
14 | #'
15 | #'@return A tibble of the PISA scores of the students that took the test in
16 | #'the selected year(s), from the years 2000-2022.
17 | #'
18 | #'@usage load_student(year = "2000")
19 | #'
20 | #'@examples
21 | #' \dontrun{
22 | #' library(learningtower)
23 | #' student_all <- load_student("all")
24 | #' student_2000 <- load_student("2000")
25 | #' }
26 | #'
27 | #'@export
28 | load_student <- function(year = "2000") {
29 |   valid_years <- c("2000", "2003", "2006", "2009",
30 |                    "2012", "2015", "2018", "2022")
31 | 
32 |   year <- as.character(year)
33 | 
34 |   ## Fail before any download starts, and name the offending value(s).
35 |   unknown <- setdiff(year, c(valid_years, "all"))
36 |   if (length(unknown) > 0) {
37 |     stop(
38 |       "`year` must be one of ", paste(valid_years, collapse = ", "),
39 |       " or \"all\"; got: ", paste(unknown, collapse = ", "),
40 |       call. = FALSE
41 |     )
42 |   }
43 | 
44 |   ## If "all" is in the year vector, we will download everything
45 |   if ("all" %in% year) {
46 |     year <- valid_years
47 |   }
48 | 
49 |   ## Download every year first and bind once at the end; growing the result
50 |   ## inside the loop re-copies all previously bound rows on each iteration.
51 |   yearly_data <- lapply(year, function(this_year) {
52 |     message("Downloading year ", this_year, "...\n")
53 |     download_single_student(year = this_year)
54 |   })
55 | 
56 |   ## Seeding with an empty tibble keeps the result a tibble, including when
57 |   ## `year` is empty.
58 |   dplyr::bind_rows(tibble::tibble(), yearly_data)
59 | }
60 | 
61 | ## Download the full student data of a single survey year from the GitHub
62 | ## repository and read it into memory. Internal helper of load_student().
63 | download_single_student <- function(year) {
64 |   url_git <- base::paste0(
65 |     "https://github.com/kevinwang09/learningtower/raw/master/",
66 |     "student_full_data/student_", year, ".rds"
67 |   )
68 |   tmp <- tempfile(fileext = ".rds")
69 |   ## The downloaded copy is only needed until it has been read.
70 |   on.exit(unlink(tmp), add = TRUE)
71 |   ## .rds files are binary, so request a binary transfer explicitly.
72 |   utils::download.file(url = url_git, destfile = tmp, mode = "wb")
73 |   base::readRDS(file = tmp)
74 | }
75 | 
--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 | ## Range of PISA survey years covered by the package, as shown to users.
2 | data_range <- function() {
3 |   "2000 - 2022"
4 | }
5 | 
6 | ## Show a startup message describing the data when the package is attached.
7 | .onAttach <- function(libname, pkgname) {
8 |   ## Read the version from the installed package so that the message cannot
9 |   ## drift from DESCRIPTION.
10 |   version <- as.character(utils::packageVersion(pkgname, lib.loc = libname))
11 |   m <- paste0(
12 |     "The learningtower package (version ", version, ")",
13 |     " provides data from OECD PISA database between ", data_range(), ".",
14 |     " For package size reasons, only a small subset is provided in the",
15 |     " package. Use the function `load_student()` to access the full data."
16 |   )
17 |   packageStartupMessage(m)
18 | }
19 | 
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 | 
5 | ```{r, include = FALSE}
6 | knitr::opts_chunk$set(
7 |   collapse = TRUE,
8 |   warning = FALSE,
9 |   message = FALSE,
10 |   comment = "#>",
11 |   fig.path = "man/figures/README-",
12 |   out.width = "54%",
13 |   fig.align = "center")
14 | library(tidyverse)
15 | library(learningtower)
16 | ```
17 | 
18 | # learningtower
19 | 
20 | [![R-CMD-check](https://github.com/kevinwang09/learningtower/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/kevinwang09/learningtower/actions/workflows/R-CMD-check.yaml)
21 | 
22 | The goal of `learningtower` is to provide a user-friendly R package to provide easy access to a subset of variables from PISA data collected from the [OECD](https://www.oecd.org/pisa/data/). 
Version `r utils::packageVersion("learningtower")` of this package provides the data for the years `r learningtower:::data_range()`. The survey data is published every three years. This is an excellent real world dataset for data exploring, data visualising and statistical computations. 23 | 24 | ## What is the PISA dataset? 25 | 26 |

27 | 28 |

29 | 30 | 31 | The Programme for International Student Assessment (PISA) is an international assessment measuring student performance in reading, mathematical and scientific literacy. 32 | 33 | PISA assesses the extent to which 15-year-old students have acquired some of the knowledge and skills that are essential for full participation in society, and how well they are prepared for lifelong learning in the areas of reading, mathematical and scientific literacy. 34 | 35 | In 2022, PISA involved 79 countries and 600,000+ students worldwide. 36 | 37 | Read more about the Programme [here](https://www.oecd.org/en/about/programmes/pisa.html). 38 | 39 | 40 | ## Installation 41 | 42 | You can install the `learningtower` package from [CRAN](https://CRAN.R-project.org) with: 43 | 44 | ``` r 45 | install.packages("learningtower") 46 | ``` 47 | 48 | To install the development version of `learningtower` from [GitHub](https://github.com/) use: 49 | 50 | ``` r 51 | devtools::install_github("kevinwang09/learningtower") 52 | ``` 53 | 54 | ## Data Description 55 | 56 | The `learningtower` gives access to a subset of variables from PISA data originally collected and are available from [OECD](https://www.oecd.org/pisa/data/), collected on a three year basis. 57 | 58 | The `learningtower` package contains mainly three datasets: 59 | 60 | + `student` 61 | + `school` 62 | + `countrycode` 63 | 64 | This provides us with information about the students scores in mathematics, reading and science, their school details, and which country they are from. The data provided in this package is a cleaned version of the full published PISA organisation, with reproducible code available in [this repository](https://github.com/kevinwang09/learningtower_masonry). 65 | 66 | The number of entries for the `student` and `school` data are shown below. 
67 | 68 | ```{r, eval=FALSE, echo=FALSE} 69 | library(learningtower) 70 | student = load_student("all") 71 | data(school) 72 | 73 | library(dplyr) 74 | 75 | student_summary <- student |> 76 | group_by(year) |> 77 | tally(name = "Number of Students") 78 | 79 | school_summary <- school |> 80 | group_by(year) |> 81 | tally(name = "Number of Schools") 82 | 83 | combined_summary <- full_join(student_summary, school_summary, by = "year") 84 | 85 | knitr::kable(combined_summary) 86 | ``` 87 | 88 | | Year | Number of Students | Number of Schools | 89 | |------|--------------------:|------------------:| 90 | | 2000 | 127,236 | 8,526 | 91 | | 2003 | 276,165 | 10,274 | 92 | | 2006 | 398,750 | 14,365 | 93 | | 2009 | 515,958 | 18,641 | 94 | | 2012 | 480,174 | 18,139 | 95 | | 2015 | 519,334 | 17,908 | 96 | | 2018 | 612,004 | 21,903 | 97 | | 2022 | 613,744 | 21,629 | 98 | 99 | ### Student Dataset 100 | 101 | The `student` dataset comprises of the scores from the triennial testing of 15-year-olds worldwide. In addition, this dataset contains interesting information on their parents qualifications, family wealth, gender, and possession of computers, internet, cars, books, rooms, desks, and similar other variables. 102 | 103 | The full dataset is approximately 50MB in size, which is much larger than the CRAN's allowed package size limit. As the result, the package itself only includes a random 50 rows from the [38 OECD countries](https://en.wikipedia.org/wiki/OECD#Member_countries), for each of the survey years. i.e. `student_subset_2000`, `student_subset_2003` etc. 104 | 105 | The `student` subset dataset can be loaded easily. See `?student` for detailed information on the measured variables. 106 | 107 | ```{r} 108 | library(learningtower) 109 | 110 | data(student_subset_2018) 111 | dim(student_subset_2018) 112 | ``` 113 | 114 | The entire `student` data can be downloaded using the `load_student` function. 
115 | 116 | ```{r, eval=FALSE} 117 | #load the entire student data for a single year 118 | student_data_2018 <- load_student(2018) 119 | 120 | #load the entire student data for two of the years (2012, 2018) 121 | student_data_2012_2018 <- load_student(c(2012, 2018)) 122 | 123 | #load the entire student data 124 | student_data_all <- load_student("all") 125 | ``` 126 | 127 | Note that because of changing data specifications over the survey years, not all variables were measured consistently across the years. 128 | 129 | 

130 | 131 |

132 | 133 | ### School Dataset 134 | 135 | The `school` dataset comprises school weight and other information such as the funding distribution of the schools, whether the school is private or public, the enrollment of boys and girls, the school size, and similar other characteristics of interest of different schools these 15-year-olds attend throughout the world. 136 | 137 | - The school subset dataset can be loaded as follows 138 | 139 | ```{r} 140 | # loading the school data 141 | data(school) 142 | ``` 143 | 144 | See `?school` for more information on the different variables present in the school dataset. 145 | 146 | 

147 | 148 |

149 | 150 | ### Countrycode Dataset 151 | 152 | The countrycode dataset contains mapping of the [country ISO code to the country name](https://www.oecd.org/content/dam/oecd/en/about/programmes/edu/pisa/publications/technical-report/PISA2015_TechRep_Final.pdf). More information on the participating countries can be found [here](https://www.oecd.org/en/about/programmes/pisa/pisa-participants.html). 153 | 154 | ```{r} 155 | # loading the countrycode data 156 | data(countrycode) 157 | head(countrycode) 158 | ``` 159 | 160 |
161 | Notes on countries 162 | + Not all data entries in the `countrycode` are countries. For example, "QCN" refers to "Shanghai-China". 163 | + Due to differences in country codes, not all `student_subset_yyyy` data has all 38 OECD countries. 164 |
165 | 166 | See `?countrycode` for more detailed information on the countries that participated in the PISA experiment. 167 | 168 | ## Exploring the data 169 | 170 | The plot below shows the weighted mean of mathematics scores of these 15-year-old students for a few selected countries over the available years. 171 | 172 | ```{r, eval = FALSE, echo = FALSE} 173 | library(dplyr) 174 | library(learningtower) 175 | 176 | student <- load_student("all") 177 | 178 | p = student |> 179 | dplyr::filter(country %in% c("SGP","CAN", "FIN", "NZL", 180 | "USA", "JPN", "GBR", "AUS")) |> 181 | group_by(year, country) |> 182 | summarise(math = weighted.mean(math, stu_wgt, na.rm=TRUE)) |> 183 | ggplot(aes(x=year, y=math, group=country, color = country)) + 184 | geom_line(alpha=0.6, linewidth = 2) + 185 | geom_point(alpha=0.6, size=3)+ 186 | ylim(c(450, 600)) + 187 | theme_minimal() + 188 | labs(x = "Year", 189 | y = "Score", 190 | title = "Math Scores 2000 - 2022") + 191 | theme(text = element_text(size=10), 192 | legend.title = element_blank()) + 193 | scale_color_brewer(palette = "Dark2") 194 | 195 | ggsave(p, filename = "man/figures/readme.png", width=6, height=4) 196 | ``` 197 | 198 | 

199 | 200 |

201 | 202 | 203 | - Similarly, you can find more code examples and data visualizations for exploring `learningtower` through our vignettes and articles 204 | 205 | - Further data exploration can be found in our articles exploring temporal trends [here](https://kevinwang09.github.io/learningtower/articles/articles/exploring_time.html). 206 | 207 | ## Citation 208 | 209 | To cite the `learningtower` package, please use: 210 | 211 | ```{r} 212 | citation("learningtower") 213 | ``` 214 | 215 | ## Motivation for `learningtower` 216 | 217 | + The PISA 2018 results were released on 3 December 2019. This led to wringing of hands in the Australian press, with titles of stories like [Vital Signs: Australia's slipping student scores will lead to greater income inequality](https://theconversation.com/vital-signs-australias-slipping-student-scores-will-lead-to-greater-income-inequality-128301) and [In China, Nicholas studied maths 20 hours a week. In Australia, it's three](https://www.smh.com.au/education/in-china-nicholas-studied-maths-20-hours-a-week-in-australia-it-s-three-20191203-p53ggv.html). 218 | 219 |

220 | 221 |

222 | 223 | + Australia's neighbours, New Zealand and Indonesia, are also worried: [New Zealand top-end in OECD's latest PISA report but drop in achievements 'worrying'](https://www.stuff.co.nz/national/education/117890945/new-zealand-topend-in-oecds-latest-pisa-report-but-drop-in-achievements-worrying), [Not even mediocre? Indonesian students score low in math, reading, science: PISA report](https://www.thejakartapost.com/news/2019/12/04/not-even-mediocre-indonesian-students-score-low-in-math-reading-science-pisa-report.html). 224 | 225 | + The data from this survey, and from all of the surveys conducted since the first collection in 2000, are publicly available. We decided to make a more convenient subset of the data available in a new R package called `learningtower`. 226 | 227 | 228 | ## Acknowledgement 229 | 230 | The work to make the data available is the effort of several researchers from Australia, New Zealand and Indonesia, conducted as part of the [ROpenSci OzUnconf](https://ozunconf19.ropensci.org) held in Sydney, Dec 11-13, 2019. 231 | 232 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # learningtower 3 | 4 | [![R-CMD-check](https://github.com/kevinwang09/learningtower/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/kevinwang09/learningtower/actions/workflows/R-CMD-check.yaml) 5 | 6 | The goal of `learningtower` is to provide a user-friendly R package to 7 | provide easy access to a subset of variables from PISA data collected 8 | from the [OECD](https://www.oecd.org/pisa/data/). Version 1.1.0 of this 9 | package provides the data for the years 2000 - 2022. The survey data is 10 | published every three years. This is an excellent real world dataset for 11 | data exploring, data visualising and statistical computations. 12 | 13 | ## What is the PISA dataset? 14 | 15 |

16 | 17 |

18 | 19 | The Programme for International Student Assessment (PISA) is an 20 | international assessment measuring student performance in reading, 21 | mathematical and scientific literacy. 22 | 23 | PISA assesses the extent to which 15-year-old students have acquired 24 | some of the knowledge and skills that are essential for full 25 | participation in society, and how well they are prepared for lifelong 26 | learning in the areas of reading, mathematical and scientific literacy. 27 | 28 | In 2022, PISA involved 79 countries and 600,000+ students worldwide. 29 | 30 | Read more about the Programme 31 | [here](https://www.oecd.org/en/about/programmes/pisa.html). 32 | 33 | ## Installation 34 | 35 | You can install the `learningtower` package from 36 | [CRAN](https://CRAN.R-project.org) with: 37 | 38 | ``` r 39 | install.packages("learningtower") 40 | ``` 41 | 42 | To install the development version of `learningtower` from 43 | [GitHub](https://github.com/) use: 44 | 45 | ``` r 46 | devtools::install_github("kevinwang09/learningtower") 47 | ``` 48 | 49 | ## Data Description 50 | 51 | The `learningtower` package gives access to a subset of variables from 52 | the PISA data originally collected by, and available from, the 53 | [OECD](https://www.oecd.org/pisa/data/), collected on a three-year 54 | basis. 55 | 56 | The `learningtower` package mainly contains three datasets: 57 | 58 | - `student` 59 | - `school` 60 | - `countrycode` 61 | 62 | These provide us with information about the students' scores in 63 | mathematics, reading and science, their school details, and which 64 | country they are from. The data provided in this package is a cleaned 65 | version of the full data published by the PISA organisation, with reproducible code 66 | available in [this 67 | repository](https://github.com/kevinwang09/learningtower_masonry). 68 | 69 | The number of entries for the `student` and `school` data are shown 70 | below.
71 | 72 | | Year | Number of Students | Number of Schools | 73 | |------|-------------------:|------------------:| 74 | | 2000 | 127,236 | 8,526 | 75 | | 2003 | 276,165 | 10,274 | 76 | | 2006 | 398,750 | 14,365 | 77 | | 2009 | 515,958 | 18,641 | 78 | | 2012 | 480,174 | 18,139 | 79 | | 2015 | 519,334 | 17,908 | 80 | | 2018 | 612,004 | 21,903 | 81 | | 2022 | 613,744 | 21,629 | 82 | 83 | ### Student Dataset 84 | 85 | The `student` dataset comprises the scores from the triennial testing 86 | of 15-year-olds worldwide. In addition, this dataset contains 87 | interesting information on their parents' qualifications, family wealth, 88 | gender, and possession of computers, internet, cars, books, rooms, 89 | desks, and other similar variables. 90 | 91 | The full dataset is approximately 50MB in size, which is much larger 92 | than CRAN’s allowed package size limit. As a result, the package 93 | itself only includes a random sample of 50 rows from each of the [38 OECD 94 | countries](https://en.wikipedia.org/wiki/OECD#Member_countries) for 95 | each of the survey years, i.e. `student_subset_2000`, 96 | `student_subset_2003`, etc. 97 | 98 | The `student` subset dataset can be loaded easily. See `?student` for 99 | detailed information on the measured variables. 100 | 101 | ``` r 102 | library(learningtower) 103 | 104 | data(student_subset_2018) 105 | dim(student_subset_2018) 106 | #> [1] 1900 22 107 | ``` 108 | 109 | The entire `student` data can be downloaded using the `load_student` 110 | function.
111 | 112 | ``` r 113 | #load the entire student data for a single year 114 | student_data_2018 <- load_student(2018) 115 | 116 | #load the entire student data for two of the years (2012, 2018) 117 | student_data_2012_2018 <- load_student(c(2012, 2018)) 118 | 119 | #load the entire student data 120 | student_data_all <- load_student("all") 121 | ``` 122 | 123 | Note that because of changing data specifications over the survey years, 124 | not all variables were measured consistently across the years. 125 | 126 |

127 | 128 |

129 | 130 | ### School Dataset 131 | 132 | The `school` dataset comprises school weight and other information such 133 | as the funding distribution of the schools, whether the school is 134 | private or public, the enrollment of boys and girls, the school size, 135 | and other similar characteristics of interest of the different schools these 136 | 15-year-olds attend throughout the world. 137 | 138 | - The school subset dataset can be loaded as follows: 139 | 140 | ``` r 141 | # loading the school data 142 | data(school) 143 | ``` 144 | 145 | See `?school` for more information on the different variables present in 146 | the school dataset. 147 | 148 |

149 | 150 |

151 | 152 | ### Countrycode Dataset 153 | 154 | The countrycode dataset contains mapping of the [country ISO code to the 155 | country 156 | name](https://www.oecd.org/content/dam/oecd/en/about/programmes/edu/pisa/publications/technical-report/PISA2015_TechRep_Final.pdf). 157 | More information on the participating countries can be found 158 | [here](https://www.oecd.org/en/about/programmes/pisa/pisa-participants.html). 159 | 160 | ``` r 161 | # loading the countrycode data 162 | data(countrycode) 163 | head(countrycode) 164 | #> # A tibble: 6 × 2 165 | #> country country_name 166 | #> 167 | #> 1 AZE Azerbaijan 168 | #> 2 ARG Argentina 169 | #> 3 AUS Australia 170 | #> 4 AUT Austria 171 | #> 5 BEL Belgium 172 | #> 6 BRA Brazil 173 | ``` 174 | 175 |
176 | 177 | Notes on countries 178 | 179 | 180 | - Not all data entries in the `countrycode` are countries. For example, 181 | “QCN” refers to “Shanghai-China”. 182 | - Due to differences in country codes, not all `student_subset_yyyy` 183 | data has all 38 OECD countries. 184 | 185 |
186 | 187 | See `?countrycode` for more detailed information on the countries that 188 | participated in the PISA experiment. 189 | 190 | ## Exploring the data 191 | 192 | The plot below shows the weighted mean of mathematics scores of 193 | 15-year-old students for a few selected countries over the 194 | available years. 195 | 196 |

197 | 198 |

199 | 200 | - Similarly, you can find more code examples and data visualizations for 201 | exploring `learningtower` through our vignettes and articles 202 | 203 | - Further data exploration can be found in our articles exploring 204 | temporal trends 205 | [here](https://kevinwang09.github.io/learningtower/articles/articles/exploring_time.html). 206 | 207 | ## Citation 208 | 209 | To cite the `learningtower` package, please use: 210 | 211 | ``` r 212 | citation("learningtower") 213 | #> To cite package 'learningtower' in publications use: 214 | #> 215 | #> Wang K, Yacobellis P, Siregar E, Romanes S, Fitter K, Dalla Riva G, 216 | #> Cook D, Tierney N, Dingorkar P, Sai Subramanian S, Chen G (2024). 217 | #> _learningtower: OECD PISA Datasets from 2000-2022 in an Easy-to-Use 218 | #> Format_. R package version 1.1.0, 219 | #> https://github.com/kevinwang09/learningtower, 220 | #> . 221 | #> 222 | #> A BibTeX entry for LaTeX users is 223 | #> 224 | #> @Manual{, 225 | #> title = {learningtower: OECD PISA Datasets from 2000-2022 in an Easy-to-Use Format}, 226 | #> author = {Kevin Wang and Paul Yacobellis and Erika Siregar and Sarah Romanes and Kim Fitter and Giulio Valentino {Dalla Riva} and Dianne Cook and Nick Tierney and Priya Dingorkar and Shabarish {Sai Subramanian} and Guan Ru Chen}, 227 | #> note = {R package version 1.1.0, https://github.com/kevinwang09/learningtower}, 228 | #> url = {https://kevinwang09.github.io/learningtower/}, 229 | #> year = {2024}, 230 | #> } 231 | ``` 232 | 233 | ## Motivation for `learningtower` 234 | 235 | - The PISA 2018 results were released on 3 December 2019. This led to 236 | wringing of hands in the Australian press, with titles of stories like 237 | [Vital Signs: Australia’s slipping student scores will lead to greater 238 | income 239 | inequality](https://theconversation.com/vital-signs-australias-slipping-student-scores-will-lead-to-greater-income-inequality-128301) 240 | and [In China, Nicholas studied maths 20 hours a week. 
In Australia, 241 | it’s 242 | three](https://www.smh.com.au/education/in-china-nicholas-studied-maths-20-hours-a-week-in-australia-it-s-three-20191203-p53ggv.html). 243 | 244 |

245 | 246 |

247 | 248 | - Australia’s neighbours, New Zealand and Indonesia, are also worried: 249 | [New Zealand top-end in OECD’s latest PISA report but drop in 250 | achievements 251 | ‘worrying’](https://www.stuff.co.nz/national/education/117890945/new-zealand-topend-in-oecds-latest-pisa-report-but-drop-in-achievements-worrying), 252 | [Not even mediocre? Indonesian students score low in math, reading, 253 | science: PISA 254 | report](https://www.thejakartapost.com/news/2019/12/04/not-even-mediocre-indonesian-students-score-low-in-math-reading-science-pisa-report.html). 255 | 256 | - The data from this survey, and from all of the surveys conducted since the 257 | first collection in 2000, are publicly available. We decided to 258 | make a more convenient subset of the data available in a new R 259 | package called `learningtower`. 260 | 261 | ## Acknowledgement 262 | 263 | The work to make the data available is the effort of several researchers 264 | from Australia, New Zealand and Indonesia, conducted as part of the 265 | [ROpenSci OzUnconf](https://ozunconf19.ropensci.org) held in Sydney, Dec 266 | 11-13, 2019. 267 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://kevinwang09.github.io/learningtower/ 2 | template: 3 | params: 4 | bootswatch: flatly 5 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Test environments 2 | 3 | * ubuntu 16.04 (GitHub Actions), R devel 4 | * MacOS (GitHub Actions), R devel 5 | * Windows (GitHub Actions), R devel 6 | * win-builder (devel) 7 | 8 | ## R CMD check results 9 | There were no ERRORs or WARNINGs. 10 | 11 | Winbuilder gave some notes, but these are on URLs that are valid: https://win-builder.r-project.org/O7qG479Sn4gS/00check.log.
12 | -------------------------------------------------------------------------------- /data/countrycode.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/countrycode.rda -------------------------------------------------------------------------------- /data/school.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/school.rda -------------------------------------------------------------------------------- /data/student_subset_2000.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2000.rda -------------------------------------------------------------------------------- /data/student_subset_2003.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2003.rda -------------------------------------------------------------------------------- /data/student_subset_2006.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2006.rda -------------------------------------------------------------------------------- /data/student_subset_2009.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2009.rda -------------------------------------------------------------------------------- 
/data/student_subset_2012.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2012.rda -------------------------------------------------------------------------------- /data/student_subset_2015.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2015.rda -------------------------------------------------------------------------------- /data/student_subset_2018.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2018.rda -------------------------------------------------------------------------------- /data/student_subset_2022.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/data/student_subset_2022.rda -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry( 2 | bibtype = "Manual", 3 | title = "learningtower: OECD PISA Datasets from 2000-2022 in an Easy-to-Use Format", 4 | author = c( 5 | person("Kevin", "Wang"), 6 | person("Paul", "Yacobellis"), 7 | person("Erika", "Siregar"), 8 | person("Sarah", "Romanes"), 9 | person("Kim", "Fitter"), 10 | person("Giulio Valentino", "Dalla Riva"), 11 | person("Dianne", "Cook"), 12 | person("Nick", "Tierney"), 13 | person("Priya", "Dingorkar"), 14 | person("Shabarish", "Sai Subramanian"), 15 | person("Guan Ru", "Chen") 16 | ), 17 | note = "R package version 1.1.0, 
https://github.com/kevinwang09/learningtower", 18 | url = "https://kevinwang09.github.io/learningtower/", 19 | year = "2024" 20 | ) 21 | -------------------------------------------------------------------------------- /learningtower.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageCheckArgs: --as-cran 22 | -------------------------------------------------------------------------------- /man/countrycode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{countrycode} 5 | \alias{countrycode} 6 | \title{Country iso3c and name mapping for PISA OECD countries participants.} 7 | \format{ 8 | A tibble of the following variables 9 | \itemize{ 10 | \item \code{country}: Country 3 character code. Note that some regions/territories are coded as country for ease of input. Character. 11 | \item \code{country_name}: Country name. Note that some regions/territories are coded as country for ease of input. Character. 12 | } 13 | } 14 | \description{ 15 | A dataset containing mapping of the country ISO code to the country names. 16 | More information on participating countries can be found at 17 | \url{https://www.oecd.org/pisa/aboutpisa/pisa-participants.htm}. 
18 | } 19 | -------------------------------------------------------------------------------- /man/figures/README_school_data_missing_values_summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/man/figures/README_school_data_missing_values_summary.png -------------------------------------------------------------------------------- /man/figures/README_student_data_missing_values_summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/man/figures/README_student_data_missing_values_summary.png -------------------------------------------------------------------------------- /man/figures/conversation_holden.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/man/figures/conversation_holden.png -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/man/figures/logo.png -------------------------------------------------------------------------------- /man/figures/pisa_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/man/figures/pisa_image.png -------------------------------------------------------------------------------- /man/figures/readme.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/man/figures/readme.gif -------------------------------------------------------------------------------- /man/figures/readme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/man/figures/readme.png -------------------------------------------------------------------------------- /man/load_student.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/load_student.R 3 | \name{load_student} 4 | \alias{load_student} 5 | \title{load_student() function allows the user to extract the PISA student scores for any desired year 6 | from 2000-2022} 7 | \usage{ 8 | load_student(year = "2000") 9 | } 10 | \arguments{ 11 | \item{year}{the year (or a vector of years) of PISA data to load, 12 | e.g. 2018 or c(2012, 2018); pass the string "all" to load the student 13 | data for every available year} 14 | } 15 | \value{ 16 | A dataset of PISA scores of the students that took the test in the year(s) selected by the user, 17 | from the years 2000-2022 18 | } 19 | \description{ 20 | The load_student() function extracts the data of students' scores for any of the 21 | years from 2000-2022. The function takes one or more of these years as its argument, or the string "all", 22 | which returns all the PISA scores of the students from the years 2000-2022.
23 | } 24 | \examples{ 25 | \dontrun{ 26 | library(learningtower) 27 | student_all <- load_student("all") 28 | student_2000 <- load_student("2000") 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /man/school.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{school} 5 | \alias{school} 6 | \title{Subset of the School data available for the years 2000-2022 from the PISA OECD database} 7 | \format{ 8 | A tibble of the following variables 9 | \itemize{ 10 | \item \code{year}: Year of the PISA data. Integer. 11 | \item \code{country}: Country 3 character code. Note that some regions/territories are coded as country for ease of input. Character. 12 | \item \code{school_id}: The school identification number, unique for each country and year combination. Character. 13 | \item \code{fund_gov}: Percentage of total funding for school year from government. Numeric. 14 | \item \code{fund_fees}: Percentage of total funding for school year from student fees or school charges paid by parents. Numeric. 15 | \item \code{fund_donation}: Percentage of total funding for school year from 16 | benefactors, donations, bequests, sponsorship, parent fundraising. Numeric. 17 | \item \code{enrol_boys}: Number of boys in the school. Numeric. 18 | \item \code{enrol_girls}: Number of girls in the school. Numeric. 19 | \item \code{stratio}: Student-Teacher ratio. Numeric. 20 | \item \code{public_private}: Is the school a public or private school. Factor. 21 | \item \code{staff_shortage}: Shortage of staff. Numeric. 22 | \item \code{sch_wgt}: The final survey weight score for the schools. Numeric. 23 | \item \code{school_size}: The school size. Numeric.
24 | } 25 | } 26 | \description{ 27 | A subset data containing school weight and other information 28 | from the triennial testing of 15 year olds around 29 | the globe. Original data available from 30 | \url{https://www.oecd.org/en/about/programmes/pisa/pisa-data.html}. 31 | } 32 | -------------------------------------------------------------------------------- /man/student.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{student} 5 | \alias{student} 6 | \alias{student_subset_2000} 7 | \alias{student_subset_2003} 8 | \alias{student_subset_2006} 9 | \alias{student_subset_2009} 10 | \alias{student_subset_2012} 11 | \alias{student_subset_2015} 12 | \alias{student_subset_2018} 13 | \alias{student_subset_2022} 14 | \title{Processed and Sampled PISA Student Data (2000-2022)} 15 | \format{ 16 | A tibble of the following variables 17 | \itemize{ 18 | \item \code{year}: Year of the PISA data. Integer. 19 | \item \code{country}: Country 3 character code. Note that some regions/territories are coded as "country" for ease of input. Factor. 20 | \item \code{school_id}: Unique school identifier for each country and year. Character. 21 | \item \code{student_id}: Unique student identifier within each school. Integer. 22 | \item \code{mother_educ}: Mother's highest level of education, from "less than ISCED1" to "ISCED 3A". Factor. 23 | \item \code{father_educ}: Father's highest level of education, from "less than ISCED1" to "ISCED 3A". Factor. 24 | \item \code{gender}: Gender of the student. Only "male" and "female" are recorded. Factor. 25 | Note that we call this variable gender and not sex as this term was used in the OECD PISA database. 26 | \item \code{computer}: Possession of computer. Only "yes" and "no" are recorded. Factor. 27 | \item \code{internet}: Access to internet. Only "yes" and "no" are recorded. Factor. 
28 | \item \code{math}: Simulated score in mathematics. Numeric. 29 | \item \code{read}: Simulated score in reading. Numeric. 30 | \item \code{science}: Simulated score in science. Numeric. 31 | \item \code{stu_wgt}: The final survey weight score for the student score. Numeric. 32 | \item \code{desk}: Possession of desk to study at. Only "yes" and "no" are recorded. Factor. 33 | \item \code{room}: Possession of a room of your own. Only "yes" and "no" are recorded. Factor. 34 | \item \code{dishwasher}: Possession of a dishwasher. Only "yes" and "no" are recorded. Factor. 35 | Note that in 2015 and 2018, all entries are missing. 36 | \item \code{television}: Number of televisions. 37 | "0", "1", "2" are code for no, one and two TVs in the house. "3+" codes for three or more TVs. Factor. 38 | Note that in 2003, all entries are missing. 39 | \item \code{computer_n}: Number of computers. 40 | "0", "1", "2" are code for no, one and two computers in the house. "3+" codes for three or more computers. Factor. 41 | Note that in 2003, all entries are missing. 42 | \item \code{car}: Number of cars. 43 | "0", "1", "2" are code for no, one and two cars in the house. "3+" codes for three or more cars Factor. 44 | Note that in 2003, all entries are missing. 45 | \item \code{book}: Number of books. Factor. 46 | Note that encoding is different in the years 2000 and 2003 compared to all other years. Factor. 47 | Evaluate \code{table(student$book, student$year)} for a demo. 48 | \item \code{wealth}: Index of family wealth. Numeric. 49 | Note that in 2003, all entries are missing. 50 | \item \code{escs}: Index of economic, social and cultural status. Numeric. 51 | } 52 | } 53 | \description{ 54 | This dataset provides a clean and processed subset of the OECD PISA student data 55 | for the years 2000-2022. The original data is sourced from 56 | \url{https://www.oecd.org/en/about/programmes/pisa/pisa-data.html} and has been prepared for analysis. 
57 | A sampling of 50 students per country (for OECD countries) has been included for each year. 58 | The data curation and sampling process are documented in \url{https://github.com/kevinwang09/learningtower_masonry/blob/master/Code/student_bind_rows.Rmd} 59 | } 60 | \examples{ 61 | library(dplyr) 62 | data(student_subset_2000) 63 | data(student_subset_2003) 64 | dplyr::bind_rows( 65 | student_subset_2000, 66 | student_subset_2003 67 | ) 68 | } 69 | -------------------------------------------------------------------------------- /student_full_data/student_2000.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2000.rds -------------------------------------------------------------------------------- /student_full_data/student_2003.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2003.rds -------------------------------------------------------------------------------- /student_full_data/student_2006.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2006.rds -------------------------------------------------------------------------------- /student_full_data/student_2009.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2009.rds -------------------------------------------------------------------------------- /student_full_data/student_2012.rds: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2012.rds -------------------------------------------------------------------------------- /student_full_data/student_2015.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2015.rds -------------------------------------------------------------------------------- /student_full_data/student_2018.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2018.rds -------------------------------------------------------------------------------- /student_full_data/student_2022.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinwang09/learningtower/3bad6594624463027a0b67e30ff8d3610b28a555/student_full_data/student_2022.rds -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(learningtower) 3 | 4 | test_check("learningtower") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-countrycode-col-types.R: -------------------------------------------------------------------------------- 1 | expected_countrycode_columns <- c("country", "country_name") 2 | expected_countrycode_types <- c("character", "character") 3 | 4 | test_that("countrycode dataset has correct structure", { 5 | data("countrycode", package = "learningtower") # Replace with your package name 6 | 7 | # Check column names 8 | expect_named(countrycode, expected_countrycode_columns, info = "Column 
names for countrycode dataset") 9 | 10 | # Check column types 11 | for (i in seq_along(expected_countrycode_columns)) { 12 | expect_true(class(countrycode[[expected_countrycode_columns[i]]])[1] == expected_countrycode_types[i], 13 | info = paste("Column", expected_countrycode_columns[i], "in countrycode dataset should be", expected_countrycode_types[i])) 14 | } 15 | }) 16 | -------------------------------------------------------------------------------- /tests/testthat/test-merge.R: -------------------------------------------------------------------------------- 1 | test_that("Merging student and school data works correctly", { 2 | # Load datasets 3 | student_data <- load_student(2000) 4 | data("school", package = "learningtower") 5 | 6 | # Perform merge 7 | expect_no_warning( 8 | merged_data <- dplyr::left_join(student_data, school, by = c("year", "school_id", "country"), relationship = "many-to-one") 9 | ) 10 | 11 | # Check that all columns from both datasets are present 12 | expected_columns <- unique(c(colnames(student_data), colnames(school))) 13 | expect_named(merged_data, expected_columns, 14 | info = "All columns from both datasets should be present after merging") 15 | 16 | # Check for no NA values in key columns after merge 17 | expect_true(all(!is.na(merged_data$school_id)), 18 | info = "No NA values should be introduced in school_id column after merging") 19 | }) 20 | 21 | 22 | test_that("Merging student and countrycode data works correctly", { 23 | # Load datasets 24 | student_data <- load_student(2000) 25 | data("countrycode", package = "learningtower") 26 | 27 | # Perform merge 28 | expect_no_warning( 29 | merged_data <- dplyr::left_join(student_data, countrycode, by = "country", relationship = "many-to-one") 30 | ) 31 | 32 | # Check that all columns from both datasets are present 33 | expected_columns <- unique(c(colnames(student_data), colnames(countrycode))) 34 | expect_named(merged_data, expected_columns, 35 | info = "All columns from both 
datasets should be present after merging") 36 | 37 | # Check for no NA values in the country column after merge 38 | expect_true(all(!is.na(merged_data$country)), 39 | info = "No NA values should be introduced in the country column after merging") 40 | }) 41 | 42 | test_that("Sequential merging of student, school, and countrycode works", { 43 | # Load datasets 44 | student_data <- load_student(2000) 45 | data("school", package = "learningtower") 46 | data("countrycode", package = "learningtower") 47 | 48 | # Merge student and school 49 | expect_no_warning( 50 | merged_data <- dplyr::left_join(student_data, school, by = c("year", "school_id", "country"), relationship = "many-to-one") 51 | ) 52 | 53 | # Merge with countrycode 54 | expect_no_warning( 55 | final_data <- dplyr::left_join(merged_data, countrycode, by = "country", relationship = "many-to-one") 56 | ) 57 | 58 | # Check that all columns from all datasets are present 59 | expected_columns <- unique(c(colnames(student_data), colnames(school), colnames(countrycode))) 60 | expect_named(final_data, expected_columns, 61 | info = "All columns from student, school, and countrycode should be present after merging") 62 | 63 | # Check for no NA values in key columns 64 | expect_true(all(!is.na(final_data$school_id)), 65 | info = "No NA values should be introduced in school_id column after merging") 66 | expect_true(all(!is.na(final_data$country)), 67 | info = "No NA values should be introduced in the country column after merging") 68 | expect_true(all(!is.na(final_data$country_name)), 69 | info = "No NA values should be introduced in the country_name column after merging") 70 | }) 71 | -------------------------------------------------------------------------------- /tests/testthat/test-school-col-types.R: -------------------------------------------------------------------------------- 1 | expected_school_columns <- c( 2 | "year", "country", "school_id", "fund_gov", "fund_fees", "fund_donation", 3 | "enrol_boys", 
"enrol_girls", "stratio", "public_private", "staff_shortage", 4 | "sch_wgt", "school_size" 5 | ) 6 | 7 | expected_school_types <- c( 8 | "integer", "character", "character", "numeric", "numeric", "numeric", "numeric", 9 | "numeric", "numeric", "factor", "numeric", "numeric", "numeric" 10 | ) 11 | 12 | test_that("school dataset has correct structure", { 13 | data("school", package = "learningtower") # Replace with your package name 14 | 15 | # Check column names 16 | expect_named(school, expected_school_columns, info = "Column names for school dataset") 17 | 18 | # Check column types 19 | for (i in seq_along(expected_school_columns)) { 20 | expect_true(class(school[[expected_school_columns[i]]])[1] == expected_school_types[i], 21 | info = paste("Column", expected_school_columns[i], "in school dataset should be", expected_school_types[i])) 22 | } 23 | }) 24 | -------------------------------------------------------------------------------- /tests/testthat/test-student-col-types.R: -------------------------------------------------------------------------------- 1 | # Define expected column names and types for student data 2 | expected_student_columns <- c( 3 | "year", "country", "school_id", "student_id", "mother_educ", "father_educ", 4 | "gender", "computer", "internet", "math", "read", "science", "stu_wgt", 5 | "desk", "room", "dishwasher", "television", "computer_n", "car", "book", 6 | "wealth", "escs" 7 | ) 8 | 9 | expected_student_types <- c( 10 | "integer", "factor", "character", "integer", "factor", "factor", "factor", 11 | "factor", "factor", "numeric", "numeric", "numeric", "numeric", "factor", 12 | "factor", "factor", "factor", "factor", "factor", "factor", "numeric", 13 | "numeric" 14 | ) 15 | 16 | test_that("student_subset_* datasets have correct structure", { 17 | for (year in c("2000", "2003", "2006", "2009", "2012", "2015", "2018", "2022")) { 18 | data_name <- paste0("student_subset_", year) 19 | dataset <- get(data_name) 20 | 21 | # Check column names 
22 | expect_named(dataset, 23 | expected_student_columns, 24 | info = paste("Column names for", data_name)) 25 | 26 | # Check column types 27 | for (i in seq_along(expected_student_columns)) { 28 | expect_true(class(dataset[[expected_student_columns[i]]])[1] == expected_student_types[i], 29 | info = paste("Column", expected_student_columns[i], "in", data_name, "should be", expected_student_types[i])) 30 | } 31 | } 32 | }) 33 | 34 | test_that("load_student() returns correct structure for full datasets", { 35 | for (year in c("2000", "2003", "2006", "2009", "2012", "2015", "2018", "2022")) { 36 | dataset <- load_student(year) 37 | 38 | # Check column names 39 | expect_named(dataset, expected_student_columns, info = paste("Column names for full dataset of", year)) 40 | 41 | # Check column types 42 | for (i in seq_along(expected_student_columns)) { 43 | expect_true(class(dataset[[expected_student_columns[i]]])[1] == expected_student_types[i], 44 | info = paste("Column", expected_student_columns[i], "in full dataset of", year, "should be", expected_student_types[i])) 45 | } 46 | } 47 | }) 48 | -------------------------------------------------------------------------------- /tests/testthat/test-test-load.R: -------------------------------------------------------------------------------- 1 | test_that("load student year can be done using both an integer and a character value", { 2 | expect_equal(load_student(2000), load_student("2000")) 3 | }) 4 | 5 | 6 | test_that("load multiple years, test is on if the data be be properly binded", { 7 | expect_no_error(load_student(c("2000", "2003"))) 8 | }) 9 | 10 | test_that("load non-existing years", { 11 | expect_error(load_student("2001")) 12 | }) 13 | 14 | test_that("test for load_student class functions",{ 15 | expect_s3_class(object = load_student(year = 2000), 16 | class = c("tbl_df", "tbl", "data.frame")) 17 | }) 18 | -------------------------------------------------------------------------------- /vignettes/.gitignore: 
-------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/articles/Australia_trends.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How did Australia do in the PISA study" 3 | author: "The Freemasons" 4 | date: "`r Sys.Date()`" 5 | output: 6 | rmarkdown::html_vignette: 7 | fig_height: 10 8 | fig_width: 14 9 | number_sections: true 10 | vignette: > 11 | %\VignetteIndexEntry{How did Australia do in the PISA study} 12 | %\VignetteEncoding{UTF-8} 13 | %\VignetteEngine{knitr::rmarkdown} 14 | editor_options: 15 | chunk_output_type: console 16 | --- 17 | 18 | ```{r setup, include = FALSE} 19 | knitr::opts_chunk$set( 20 | collapse = TRUE, 21 | comment = "#>", 22 | warning = FALSE, 23 | message = FALSE, 24 | error = FALSE, 25 | outwidth = "100%", 26 | fig.width = 8, 27 | fig.height = 6) 28 | ``` 29 | 30 | # Introduction 31 | 32 | The purpose of this article is to explore some of the variables that influenced Australia's performance in the PISA study. Note that this is an observational study (as opposed to a controlled experiment), and we are making inferences about factors that are correlated with academic performance rather than specific causes. 33 | 34 | 35 | # Loading the packages and data 36 | 37 | ```{r} 38 | #loading the data and libraries 39 | library(learningtower) 40 | library(tidyverse) 41 | library(lme4) 42 | library(ggfortify) 43 | library(sjPlot) 44 | library(patchwork) 45 | library(ggrepel) 46 | library(kableExtra) 47 | 48 | student <- load_student("all") 49 | data(school) 50 | data(countrycode) 51 | 52 | theme_set(theme_classic(18) 53 | + theme(legend.position = "bottom")) 54 | ``` 55 | 56 | # Visualise predictors over time 57 | 58 | Since we are expecting some time variations in the data, let's quickly visualize the time trends.
59 | 60 | ```{r} 61 | #filtering the data for Australia 62 | aus_data = student |> 63 | dplyr::filter(country %in% c("AUS")) |> 64 | dplyr::mutate(mother_educ = mother_educ |> fct_relevel("less than ISCED1"), 65 | father_educ = father_educ |> fct_relevel("less than ISCED1")) 66 | ``` 67 | 68 | 69 | ## Numeric variables 70 | 71 | A boxplot is a standardized method of presenting the distribution of data. It shows whether or not our data is symmetrical. Box plots are important because they give a visual overview of the data, allowing researchers to rapidly discover median values, data set dispersion, and skewness. Here we visualize the distribution of each numeric variable across the years via boxplots. 72 | 73 | ```{r, fig.height = 9, fig.width = 15} 74 | # plotting the distribution of numeric variables via boxplots 75 | aus_data |> 76 | select(where(is.numeric), -school_id, -student_id) |> 77 | pivot_longer(cols = -year) |> 78 | ggplot(aes(x = factor(year), 79 | y = value, 80 | colour = factor(year))) + 81 | geom_boxplot() + 82 | facet_wrap(~name, scales = "free_y") + 83 | theme(legend.position = "none") + 84 | labs(x = "Year", 85 | y = "", 86 | title = "The distribution of numerical variables in the student dataset over all years") 87 | ``` 88 | 89 | ## Factor variables 90 | 91 | Missing data is a common issue that data professionals must deal with on a daily basis. In this section we visualize the number of missing values across the years for all the factor variables in the student dataset.
92 | 93 | ```{r, fig.height = 15, fig.width = 15} 94 | #checking the missing values in the factor variables of the data 95 | aus_fct_plotdata = aus_data |> 96 | select(year, where(is.factor)) |> 97 | dplyr::select(-country) |> 98 | pivot_longer(cols = -year) |> 99 | group_by(year, name, value) |> 100 | tally() |> 101 | dplyr::mutate( 102 | value = coalesce(value, "missing"), 103 | percent = n/sum(n), 104 | year = year |> as.character() |> as.integer()) |> 105 | group_by(name, value) |> 106 | dplyr::mutate(last_point = ifelse(year == max(year), as.character(value), NA)) 107 | 108 | aus_fct_plotdata |> 109 | ggplot(aes(x = year, y = percent, 110 | label = last_point, 111 | group = value)) + 112 | geom_point() + 113 | geom_line() + 114 | geom_label_repel(direction = "both", nudge_x = 3, seed = 2020, segment.size = 0) + 115 | facet_wrap(~name, scales = "free_y", ncol = 3) + 116 | scale_x_continuous(breaks = c(2000, 2003, 2006, 2009, 2012, 2015, 2018)) + 117 | scale_y_continuous(labels = scales::percent) + 118 | labs(x = "Year", 119 | y = "Percentage of missing values", 120 | title = "Missing values in the student dataset's factor variables") 121 | ``` 122 | 123 | We initially investigate the most current 2018 data before generalizing the models/results into any patterns due to the quantity of missing values in the data in previous years and also to decrease the time complexity in modeling. 124 | 125 | # Linear regression model for the 2018 study 126 | 127 | Linear regression analysis predicts the value of one variable depending on the value of other variables. Because they are well known and can be trained rapidly, linear regression models have become a effective way of scientifically and consistently predicting the future. 128 | 129 | We begin by doing a basic data exploration using linear regression models. 
To begin, we fit three linear models (one for each subject of math, reading, and science) to the 2018 Australian data to gain an understanding of the key variables that may be impacting test scores. 130 | 131 | We filter the student data (the complete student data is loaded using `load_student("all")`) to pick the scores in Australia and relevel some variables for further analyses. 132 | 133 | ```{r} 134 | #filtering the data to Australia, defining the predictors and selecting the scores 135 | student_predictors = c("mother_educ", "father_educ", "gender", "internet", 136 | "desk", "room", "television", "computer_n", 137 | "car", "book", "wealth", "escs") 138 | 139 | student_formula_rhs = paste(student_predictors, collapse = "+") 140 | 141 | aus2018 = aus_data |> 142 | dplyr::filter(year == "2018") |> 143 | dplyr::select( 144 | math, read, science, 145 | all_of(student_predictors)) |> 146 | na.omit() 147 | ``` 148 | 149 | ## Checking correlation matrix of the numeric variables 150 | 151 | A correlation matrix is a table that displays the coefficients of correlation between variables. Each cell in the table represents the relationship between two variables.
152 | 153 | ```{r} 154 | #correlation matrix for the numeric variables 155 | aus2018 |> 156 | select(where(is.numeric)) |> 157 | cor(use = "pairwise.complete.obs") |> 158 | round(2) |> 159 | kbl(caption = "Correlation Matrix") |> 160 | kable_styling(full_width = NULL, 161 | position = "center", 162 | bootstrap_options = c("hover", "striped")) 163 | ``` 164 | 165 | ## Fitting three linear models 166 | 167 | 168 | ```{r} 169 | #fitting linear models for the three subjects maths, reading and science 170 | 171 | aus2018_math = lm(formula = as.formula(paste("math ~ ", student_formula_rhs)) , data = aus2018) 172 | 173 | aus2018_read = lm(formula = as.formula(paste("read ~ ", student_formula_rhs)) , data = aus2018) 174 | 175 | aus2018_science = lm(formula = as.formula(paste("science ~ ", student_formula_rhs)) , data = aus2018) 176 | 177 | sjPlot::tab_model(aus2018_math, aus2018_read, aus2018_science, 178 | show.ci = FALSE, show.aic = TRUE, show.se = TRUE, 179 | show.stat = TRUE, 180 | show.obs = FALSE) 181 | ``` 182 | 183 | 184 | Some interesting discoveries from these models: 185 | 186 | 1. All three response variables seem to be influenced by the same set of factors. 187 | 188 | 2. Father's education level (`father_educ`) seems to have a much stronger effect than mother's education level (`mother_educ`). 189 | 190 | 3. While most estimates agree in signs across the three subjects, the most notable exception to this is `gender`, where girls tend to perform better than boys in reading. 191 | 192 | 4. The most influential predictors are those associated with socioeconomic status (`escs`) and education (`book`). A number of variables that should not be directly causal to academic performance also showed up as significant. This is likely due to their associations with socio-economic status. 193 | 194 | Note that in making these conclusions, we have ignored the effects of multicollinearity. 
195 | 196 | Upon checking the classical diagnostic plots of these models, we see no major violation of the assumptions of linear models. The large amount of variation in the data may help to explain why the models only have moderately low $R^2$ values (~ 0.20). 197 | 198 | ```{r, fig.height = 30, fig.width = 12} 199 | #plotting the outcome of linear models 200 | autoplot(aus2018_math) + labs(title = "2018 Australia maths model") + 201 | autoplot(aus2018_read) + labs(title = "2018 Australia read model") + 202 | autoplot(aus2018_science) + labs(title = "2018 Australia science model") 203 | ``` 204 | 205 | 206 | # Linear mixed model 207 | 208 | Linear mixed models are an extension of simple linear models that allow for both fixed and random effects. 209 | 210 | We already know that the socio-economic status (SES) of a student is often the most influential predictor and it is likely that students with similar SES will attend the same schools in their neighborhood and receive a similar quality of education from the same teachers. 211 | 212 | Thus, it is likely that there will be a grouping effect on the students if they attended the same school. This would imply that some observations in our data are not independent observations. 213 | 214 | By building random effects into our linear model, that is, building a linear mixed model, we should be able to produce a model with a better fit, since it takes this grouping effect of schools into account.
215 | 216 | ```{r} 217 | # joining school and student data, building a linear mixed model 218 | lmm2018 = aus_data |> 219 | filter(year == 2018) |> 220 | dplyr::select( 221 | school_id, 222 | math, read, science, 223 | all_of(student_predictors)) |> 224 | na.omit() 225 | 226 | lmm2018_math = lmer(formula = as.formula(paste("math ~ ", student_formula_rhs, "+ (escs | school_id)")), data = lmm2018) 227 | 228 | lmm2018_read = lmer(formula = as.formula(paste("read ~ ", student_formula_rhs, "+ (escs | school_id)")), data = lmm2018) 229 | 230 | lmm2018_science = lmer(formula = as.formula(paste("science ~ ", student_formula_rhs, "+ (escs | school_id)")), data = lmm2018) 231 | 232 | sjPlot::tab_model(lmm2018_math, lmm2018_read, lmm2018_science, 233 | show.ci = FALSE, show.aic = TRUE, show.se = TRUE, 234 | show.stat = TRUE, 235 | show.obs = FALSE) 236 | ``` 237 | 238 | We see that the linear mixed model improved on the fit of the model, as judged by the AIC. 239 | 240 | ```{r} 241 | # subtracting AIC values of the two models 242 | bind_cols( 243 | AIC(aus2018_math) - AIC(lmm2018_math), 244 | AIC(aus2018_read) - AIC(lmm2018_read), 245 | AIC(aus2018_science) - AIC(lmm2018_science) 246 | ) |> 247 | rename(maths = ...1, 248 | read = ...2, 249 | science = ...3) |> 250 | kbl(caption = "AIC Values") |> 251 | kable_styling(full_width = NULL, 252 | position = "center", 253 | bootstrap_options = c("hover", "striped")) 254 | ``` 255 | 256 | # Integrating with `school` data 257 | 258 | We now take this dataset on students and merge it with some variables from the `school` data, which is also part of the `learningtower` package. This gives us access to school-level variables, which are helpful in modelling the data.
259 | 260 | ```{r} 261 | #taking into account the school dataset variables and fitting a linear mixed model 262 | selected_vars = c("father_educ", "gender", "internet", 263 | "desk", "computer_n", "car", 264 | "book", "wealth", "escs") 265 | 266 | data(school) 267 | 268 | aus_school_2018 = school |> 269 | dplyr::filter(country == "AUS", year == "2018") |> 270 | dplyr::mutate(school_size = log10(school_size)) |> ## We take the log due to the scale 271 | dplyr::select(-year, -country, -contains("fund"), -sch_wgt) 272 | 273 | lmm2018_sch = lmm2018 |> 274 | left_join(aus_school_2018, by = c("school_id")) |> na.omit() 275 | 276 | school_predictors = c("stratio", "public_private", "staff_shortage", "school_size") 277 | school_formula_rhs = paste(school_predictors, collapse = "+") 278 | 279 | lmm2018_sch_math = lmer(formula = as.formula(paste("math ~ ", student_formula_rhs, "+ (escs | school_id) + ", 280 | school_formula_rhs)), data = lmm2018_sch) 281 | 282 | lmm2018_sch_read = lmer(formula = as.formula(paste("read ~ ", student_formula_rhs, "+ (escs | school_id) + ", 283 | school_formula_rhs)), data = lmm2018_sch) 284 | 285 | lmm2018_sch_science = lmer(formula = as.formula(paste("science ~ ", student_formula_rhs, "+ (escs | school_id) + ", 286 | school_formula_rhs)), data = lmm2018_sch) 287 | 288 | 289 | sjPlot::tab_model(lmm2018_sch_math, lmm2018_sch_read, lmm2018_sch_science, 290 | show.ci = FALSE, show.aic = TRUE, show.se = TRUE, 291 | show.stat = TRUE, 292 | show.obs = FALSE) 293 | ``` 294 | 295 | We note the following: 296 | 297 | 1. The school size (`school_size`) is a strong predictor for academic performance, implying larger schools tend to do better. This is likely a confounding variable for the urban/rural region of the school, which can imply a difference in available funding of school facilities. 298 | 299 | 2. Private schools tend to do better than public schools (note the reference level and the negative coefficient estimate in the variable `public_private`).
300 | 301 | 3. Perhaps surprisingly, the student-teacher ratio (`stratio`) wasn't found to be significant but the shortage of staff (`staff_shortage`) was significant. This would imply that as long as the school is adequately supported by staff, further reduction in the student-teacher ratio does not have a statistically significant effect on student performance. 302 | 303 | # Visualising coefficient estimates over the years 304 | 305 | All analyses above focused on the year 2018 for Australia, but what about the other years? We also visualize the academic performances of students as a function of time in the [time trend article](https://kevinwang09.github.io/learningtower/articles/exploring_time.html), so in this section, we attempt to visualize the effect of some interesting variables and their linear model coefficient estimates for each PISA study over time. 306 | 307 | We would expect the availability of technology (e.g. computer) could be beneficial for students at the start of the 21st century, but it is not clear if students will be helped by these technologies as time goes by. 308 | 309 | The construction goes as follows: 310 | 311 | 1. We first split the entire Australian data by year and fit a linear model, with `math` as the response variable. 312 | 313 | 2. We extract the coefficient estimate for every predictor from every linear model and combine the result. 314 | 315 | 3. We then plot the years on the x-axis and the coefficient estimates on the y-axis as points and join each variable using a line. For categorical variables, we split the categories as separate lines. 316 | 317 | 4. Additionally, we show the 95% confidence interval of each coefficient estimate using a transparent ribbon and show the y = 0 line, i.e. whenever the ribbon crosses the horizontal line, the coefficient estimate is not significantly different from zero at the 5% level (the p-value for testing it will be > 0.05).
318 | 319 | ```{r, fig.height = 12, fig.width = 15} 320 | #Fitting a linear model, extracting the coefficients and visualizing every predictor 321 | aus_student_years = aus_data |> 322 | dplyr::select( 323 | math, 324 | all_of(student_predictors), 325 | year) |> 326 | na.omit() 327 | 328 | aus_student_years_coef = aus_student_years |> 329 | group_by(year) |> 330 | nest() |> 331 | dplyr::mutate(math_lm_coef = purrr::map(.x = data, 332 | .f = ~ lm(formula = as.formula(paste("math ~ ", student_formula_rhs)), data = .x) |> 333 | broom::tidy())) |> 334 | dplyr::select(-data) |> 335 | tidyr::unnest(math_lm_coef) 336 | 337 | aus_student_years_coef |> 338 | dplyr::filter(str_detect(term, "computer|father_educ|escs|wealth")) |> 339 | dplyr::mutate( 340 | year = year |> as.character() |> as.integer(), 341 | facet = case_when( 342 | str_detect(term, "computer") ~ "Number of computer", 343 | str_detect(term, "father_educ") ~ "Education of father", 344 | # str_detect(term, "mother_educ") ~ "Education of mother", 345 | str_detect(term, "wealth") ~ "Wealth", 346 | str_detect(term, "escs") ~ "Socio-economic index"), 347 | last_point = ifelse(year == 2018, term, NA)) |> 348 | ggplot(aes(x = year, y = estimate, 349 | colour = term, 350 | group = term, 351 | label = last_point)) + 352 | geom_hline(yintercept = 0) + 353 | geom_point(position = position_dodge(width = 0.8), size = 2) + 354 | geom_line(position = position_dodge(width = 0.8), size = 1.5) + 355 | geom_linerange(aes(ymin = estimate - 2*std.error, 356 | ymax = estimate + 2*std.error), 357 | size = 4, alpha = 0.7, 358 | position = position_dodge(width = 0.8)) + 359 | geom_label_repel(direction = "both", nudge_x = 2, seed = 2020, segment.size = 0) + 360 | scale_x_continuous(limits = c(2005.5, 2022), 361 | breaks = c(2006, 2009, 2012, 2015, 2018)) + 362 | facet_wrap(~facet, scales = "free_y") + 363 | theme(legend.position = "none") + 364 | labs(x = "Year", 365 | y = "Estimate", 366 | title = "Graphing coefficient estimates 
throughout time") 367 | ``` 368 | 369 | We note the following: 370 | 371 | 1. Even though in 2018 we found that the education of the father was statistically significant for students' academic performance, this was not always the case. From 2006 to 2018, the education of the father seems to have had an ever more positive influence on students. 372 | 373 | 2. It is clear that access to computers is ever more prevalent in Australia. But surprisingly, the positive influence of computers is decreasing. It is not clear why this would be the case. One possible reason is that students might have access to computers outside of their homes (e.g. from schools) and thus the advantages of accessing computers are dampened. 374 | 375 | 3. Quite interestingly, the influence of the socio-economic index is dropping, implying a gradual move towards equality. 376 | 377 | # Session info 378 | ```{r} 379 | sessionInfo() 380 | ``` 381 | -------------------------------------------------------------------------------- /vignettes/articles/exploring_time.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Exploring temporal trends" 3 | author: "The Freemasons" 4 | date: "`r Sys.Date()`" 5 | output: 6 | rmarkdown::html_vignette: 7 | fig_height: 10 8 | fig_width: 14 9 | number_sections: true 10 | vignette: > 11 | %\VignetteIndexEntry{Temporal trends} 12 | %\VignetteEngine{knitr::rmarkdown} 13 | %\VignetteEncoding{UTF-8} 14 | --- 15 | 16 | ```{r setup, include = FALSE} 17 | knitr::opts_chunk$set( 18 | collapse = TRUE, 19 | comment = "#>", 20 | warning = FALSE, 21 | message = FALSE, 22 | error = FALSE, 23 | outwidth = "100%", 24 | fig.width = 8, 25 | fig.height = 6) 26 | ``` 27 | 28 | # Introduction 29 | 30 | One of the most interesting things that we can explore in the PISA data is the temporal trend for each country/region.
31 | 32 | 33 | # Loading packages and data 34 | 35 | ```{r} 36 | #loading the data and libraries 37 | library(learningtower) 38 | library(tidyverse) 39 | library(patchwork) 40 | library(brolgar) 41 | library(gghighlight) 42 | library(ggrepel) 43 | library(tsibble) 44 | library(kableExtra) 45 | 46 | student <- load_student("all") 47 | data(countrycode) 48 | 49 | theme_set(theme_classic(18) + 50 | theme(legend.position = "bottom")) 51 | ``` 52 | 53 | # Basic time series visualisation 54 | 55 | We begin by visualizing the time series trend of countries independent of when and for how long they participated in the PISA survey. The following code computes the weighted means of each subject (maths, reading, and science) for each country and year. The weighted averages are then plotted as three time series plots, with each joined line in the plots representing a country's performance in that subject throughout the time that they participated in the PISA research. 56 | 57 | ```{r} 58 | #calculating the weighted means for all three subjects and plotting them 59 | w_mean = function(x, w){weighted.mean(x = x, w = w, na.rm=TRUE)} 60 | 61 | stu_summ = student |> 62 | group_by(year, country) |> 63 | summarise_at(.vars = vars(math, read, science), 64 | .funs = list(wmean = ~w_mean(., w = stu_wgt), 65 | min = ~min(., na.rm = TRUE), 66 | max = ~max(., na.rm = TRUE))) |> 67 | ungroup() 68 | 69 | 70 | stu_wmean_long = stu_summ |> 71 | select(year, country, contains("wmean")) |> 72 | pivot_longer(cols = contains("wmean"), 73 | names_to = "wmean_names", 74 | values_to = "wmean_values") 75 | 76 | stu_wmean_long |> 77 | ggplot(aes(x = year, y = wmean_values, group = country)) + 78 | geom_line() + 79 | facet_wrap(~wmean_names) + 80 | labs(x = "Year", y = "Weighted mean values", 81 | title = "Weighted means of countries in all subjects") 82 | 83 | ``` 84 | 85 | 86 | ## Australia, New Zealand, Indonesia 87 | 88 | A core of this package was built at the 2019 OzUnconf in Australia. 
Hence, we focus on three countries in the APAC region for more detailed visualizations. In the plot below, the dark line is the weighted mean score of each country for each subject. The shading indicates the minimum and maximum of scores for a given year. We can see that, relative to the range of scores, the variation in the mean of the data is almost negligible. We explore this effect in detail later. 89 | 90 | ```{r} 91 | #plotting the weighted mean score with minimum and maximum range for the three countries 92 | stu_summ_long2 = stu_summ |> 93 | filter(country %in% c("AUS", "NZL", "IDN")) |> 94 | pivot_longer(cols = math_wmean:science_max, 95 | names_to = "names", 96 | values_to = "values") |> 97 | separate(col = names, into = c("subject", "statistics"), sep = "_") |> 98 | pivot_wider(names_from = "statistics", 99 | values_from = "values") 100 | 101 | 102 | stu_summ_long2 |> 103 | ggplot(aes(x = year, y = wmean)) + 104 | geom_ribbon(aes(ymin = min, ymax = max), fill = "grey70") + 105 | geom_line(colour = "black", size = 2) + 106 | facet_grid(subject~country, labeller = label_both) + 107 | labs(x = "Year", y = "Test score values", 108 | title = "Weighted means/Min and Max range") 109 | ``` 110 | 111 | # `brolgar` visualisations 112 | 113 | [brolgar](https://github.com/njtierney/brolgar) is a new R package that makes visualization of time series easier. We now use this package to pick out some interesting patterns in the data. 114 | 115 | ## Linear model for every country 116 | 117 | We now consider fitting a linear model for every country's performance in maths. This extracts the general trend of performance in mathematics from the "spaghetti" plot above. 118 | 119 | There are many countries/regions that did not participate in all 7 PISA studies (between 2000 and 2018, a study was conducted every three years). As we are interested in calculating linear models, we retain only those countries/regions that participated in 5 or more studies.
120 | 121 | For simplicity of interpretation, we center each country/region's performance to the first time that country/region participated in the PISA study. Hence, the intercept terms of the linear models (x-axis in the plot below) represent the weighted means of the countries/regions when they first participated in the PISA study. The slope terms of the linear models (y-axis in the plot below) represent the average annual increase in the weighted mean score for each country/region. 122 | 123 | Based on this interpretation, it appears that if a country/region has a good initial performance in the PISA study, then that country is likely to have reached "saturation" where it is hard for it to improve any further, and thus only has a small annual increase or even a decrease. 124 | 125 | ```{r} 126 | #considering countries whose participation is 5 or more, calculating the math slope and plotting it 127 | complete_nations = stu_summ |> 128 | group_by(country) |> 129 | filter(n() >= 5) |> 130 | mutate(year_subtract = year - min(year)) |> # still grouped: min(year) is each country's first participation year 131 | ungroup() |> 132 | as_tsibble(key = country, index = year_subtract) 133 | 134 | math_slope = complete_nations |> 135 | select( 136 | year_subtract, 137 | country, 138 | math_wmean) |> 139 | key_slope(math_wmean ~ year_subtract) |> 140 | left_join(countrycode, by = "country") 141 | 142 | math_slope |> 143 | ggplot(aes(x = .intercept, y = .slope_year_subtract)) + 144 | geom_point() + 145 | geom_text_repel(aes(label = country_name), size = 3) + 146 | geom_hline(yintercept = 0, colour = "red") + 147 | labs(x = "Weighted mean math score in first participation", 148 | y = "Avg.
increase in weighted mean score every year", 149 | title = "Countries performance in maths") + 150 | scale_y_continuous(limits = c(-5, 8)) 151 | 152 | 153 | math_slope_near <- math_slope |> 154 | keys_near(key = country, var = .slope_year_subtract) 155 | 156 | math_slope_near |> 157 | kbl(caption = "Summary Statistics") |> 158 | kable_styling(full_width = NULL, 159 | position = "center", 160 | bootstrap_options = c("hover", "striped")) 161 | ``` 162 | 163 | ## Highlighting monotone countries for subjects 164 | 165 | Some countries have, since their initial participation in the PISA study, always exhibited a monotone trend (increasing or decreasing). We use the `brolgar` package to highlight these countries. 166 | 167 | Quite interestingly, the countries exhibiting monotone decreasing patterns are Australia, New Zealand and the Netherlands. Despite this decreasing pattern, all three countries remain at the top of the world in terms of their performance. This is consistent with the idea of "saturation" above as we can see a cluster of countries towards the top of the score range of each subject. On the other hand, Qatar and Peru are the two countries that have massively improved their performance since the PISA study began.

```{r}
# Flag, per country and subject, whether the weighted mean score is
# monotonically increasing or decreasing over time.
feature_monotone <- complete_nations |>
  features_at(.var = vars(math_wmean, read_wmean, science_wmean),
              features = feat_monotonic) |>
  select(country, contains("increase"), contains("decrease"))

# Split e.g. "math_wmean_increase" at its last underscore into
# subject = "math_wmean" and direction = "increase".
feature_monotone_long <- feature_monotone |>
  pivot_longer(cols = -country,
               names_to = "names",
               values_to = "monotone_value") |>
  separate(col = names, into = c("subject", "direction"), sep = "_(?!.*_)")

# Long table of scores; every (country, subject) row is matched to both
# directions so that the plot can be faceted by direction.
plot_tbl <- complete_nations |>
  as_tibble() |>
  select(year, country, math_wmean, read_wmean, science_wmean) |>
  pivot_longer(cols = contains("_wmean"),
               names_to = "subject",
               values_to = "wmean_value") |>
  left_join(feature_monotone_long, by = c("country", "subject")) |>
  left_join(countrycode, by = "country")

plot_tbl |>
  ggplot(aes(x = year,
             y = wmean_value,
             group = interaction(country, subject))) +
  geom_line() +
  gghighlight::gghighlight(monotone_value, label_key = country_name) +
  facet_grid(direction ~ subject) +
  labs(x = "Year",
       y = "Weighted means",
       title = "Monotonic trending of countries")
```


## Highlighting variance

As the PISA study spans multiple countries and schools across time, there is a huge amount of variation in the data that the simple linear analyses above are not able to fully capture. Here, we turn our attention to the variability itself and visualize it. We primarily use the standard deviation and the coefficient of variation to visualize the general trends of countries/regions over time.

```{r}
# Mean and standard deviation of each score/index per year and country;
# plotted as SD against mean for maths, one panel per year.
student |>
  group_by(year, country) |>
  summarise(
    across(
      c(math, read, science, wealth, escs),
      list(
        mean = \(x) mean(x, na.rm = TRUE),
        sd = \(x) sd(x, na.rm = TRUE)
      )
    ),
    .groups = "drop_last"
  ) |>
  ggplot(aes(x = math_mean, y = math_sd, colour = factor(year))) +
  geom_point(size = 3) +
  scale_colour_brewer(palette = "Dark2") +
  labs(x = "Mean maths score",
       y = "SD maths score",
       title = "Highlighting variance") +
  facet_wrap(~year) +
  theme(legend.position = "none")

# Coefficient of variation: standard deviation relative to the mean,
# ignoring missing values.
cv <- function(x) {
  sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE)
}

# SD and CV of the three subjects per year and country, restricted to
# countries with at least five participations.
stu_var_summ <- student |>
  group_by(year, country) |>
  summarise(
    across(
      c(math, read, science),
      list(
        sd = \(x) sd(x, na.rm = TRUE),
        cv = \(x) cv(x)
      )
    ),
    .groups = "drop_last"
  ) |>
  group_by(country) |>
  filter(n() >= 5) |>
  ungroup()

# Columns such as "math_sd" are split into subject = "math", statistic = "sd".
stu_var_summ_long <- stu_var_summ |>
  pivot_longer(cols = -c("year", "country"),
               names_to = "names",
               values_to = "values") |>
  separate(col = "names", into = c("subject", "statistic"), sep = "_")

stu_var_summ_long |>
  ggplot(aes(x = year, y = values,
             group = country)) +
  geom_line() +
  facet_grid(statistic ~ subject, scales = "free_y") +
  labs(x = "Year",
       y = "Values",
       title = "Highlighting variance")
```

In the plot above, we see that countries with high variation tend to see it decrease as time passes, while countries with low variation tend to stay that way. This implies that most countries/regions typically exhibit a non-increasing pattern in terms of the variation in performance.

We now zoom into the mathematics performance (measured using coefficient of variation) panel and take a close look at the countries, highlighting certain countries of interest.

```{r}
stu_var_summ <- stu_var_summ |>
  as_tsibble(key = country, index = year)

# Countries whose median maths CV is closest to the five-number summary
# of all countries' medians.
stu_var_near <- stu_var_summ |>
  features(math_cv, feat_brolgar) |>
  keys_near(key = country, var = median)

# Only the highlighted countries receive a label ("<stat>:<country name>").
stu_var_plotdf <- stu_var_summ_long |>
  filter(subject == "math", statistic == "cv") |>
  left_join(stu_var_near, by = "country") |>
  left_join(countrycode, by = "country") |>
  as_tibble() |>
  mutate(label_stats_country = ifelse(is.na(stat), NA, paste0(stat, ":", country_name)))

stu_var_plotdf |>
  ggplot(aes(x = year, y = values,
             group = country, colour = stat)) +
  geom_line() +
  gghighlight::gghighlight(!is.na(stat), label_key = label_stats_country) +
  labs(y = "Coefficient of variation across students",
       x = "Year",
       title = "Maths scores using coefficient of variation")
```

We again see Qatar appearing in this visualization. Qatar is highlighted as it is the country with a large amount of variation, implying a high level of inequality in the performance in mathematics. But what is particularly interesting here is that Qatar is consistently lowering the variation every time it participates in the PISA study. Combined with the visualizations above, we might conjecture that Qatar is not only improving its performance but also the equality of access.

## Gender gap over time

One of the ongoing myths in education is that there is a difference in the performance of different subjects by gender. While this may appear true in selected cases, it is important to note gender is often a confounding variable masking the effect of some genuine underlying cause. Together with the large amount of variation in the data across socioeconomic status of different families in different countries/regions, it is never possible to draw a generalized conclusion.

That being said, we now visualize the differences in the average test scores for each gender (the PISA study chose a binary coding). Across the three subjects, there are more countries with a higher average for the boys in maths. In reading, girls completely dominate in every country while performance in science is more evenly split between the genders.

```{r}
# Weighted mean score per year, country and gender, for countries with at
# least ten (year, gender) rows, spread into one column per gender and subject
# (e.g. `female_math_wmean`).
stu_gender_summ <- student |>
  filter(!is.na(gender)) |>
  group_by(year, country, gender) |>
  summarise(
    # A formula lambda is kept here because it also reads the `stu_wgt`
    # column of the current group.
    across(c(math, read, science),
           list(wmean = ~ w_mean(.x, w = stu_wgt))),
    .groups = "drop_last"
  ) |>
  group_by(country) |>
  filter(n() >= 10) |>
  ungroup() |>
  pivot_longer(cols = contains("_wmean"),
               names_to = "names",
               values_to = "values") |>
  pivot_wider(names_from = c("gender", "names"),
              values_from = "values")

# Gap = female weighted mean - male weighted mean, per subject.
stu_ggap_summ <- stu_gender_summ |>
  dplyr::transmute(
    year, country,
    gap_math_wmean = female_math_wmean - male_math_wmean,
    gap_read_wmean = female_read_wmean - male_read_wmean,
    gap_science_wmean = female_science_wmean - male_science_wmean)

stu_ggap_summ_long <- stu_ggap_summ |>
  pivot_longer(cols = contains("gap"),
               names_to = "gap_names",
               values_to = "gap_values")

stu_ggap_summ_long |>
  ggplot(aes(x = year, y = gap_values)) +
  geom_point() +
  geom_line(aes(group = country)) +
  geom_hline(yintercept = 0, colour = "red") +
  facet_wrap(~gap_names) +
  labs(title = "Average gender gaps across subjects and years",
       subtitle = "Gap = avg. female score - avg. male score",
       x = "Year",
       y = "Gender Gap Values")
```

### Highlighting key countries across all three subjects


```{r, fig.width = 18}
# For each subject separately: find the countries nearest to the summary
# statistics of `range2`, then join those flags back onto the subject's data.
stu_ggap_summ_nest <- stu_ggap_summ |>
  pivot_longer(contains("_wmean"),
               names_to = "names",
               values_to = "values") |>
  group_by(names) |>
  nest() |>
  mutate(
    f_tbl = map(
      .x = data,
      .f = \(tbl) tbl |>
        as_tsibble(key = country, index = year) |>
        features(values, feat_brolgar) |>
        keys_near(key = country, var = range2)
    ),
    f_data = map2(
      .x = data, .y = f_tbl,
      .f = \(tbl, near) left_join(tbl, near, by = "country")
    )
  )

stu_ggap_summ_plotdf <- stu_ggap_summ_nest |>
  select(names, f_data) |>
  unnest(f_data) |>
  left_join(countrycode, by = "country") |>
  mutate(label_stats_country = ifelse(is.na(stat), NA, paste0(stat, ":", country_name)))

stu_ggap_summ_plotdf |>
  ggplot(aes(x = year, y = values)) +
  geom_line(aes(group = country, colour = country)) +
  gghighlight(!is.na(stat), label_key = label_stats_country, calculate_per_facet = TRUE, keep_scales = TRUE) +
  facet_wrap(~names) +
  labs(x = "Year",
       y = "Values",
       title = "Highlighting key countries across all three subjects")
```


### Highlighting key countries for maths only

```{r}
# Countries whose median maths gap is closest to the summary statistics.
stu_gap_math_near <- stu_ggap_summ |>
  as_tsibble(key = country, index = year) |>
  features(gap_math_wmean, feat_brolgar) |>
  keys_near(key = country, var = median)

stu_gap_math_plotdf <- stu_ggap_summ |>
  as_tibble() |>
  left_join(stu_gap_math_near, by = "country") |>
  left_join(countrycode, by = "country") |>
  mutate(label_stats_country = ifelse(is.na(stat), NA, paste0(stat, ":", country_name)))

p_math <- stu_gap_math_plotdf |>
  ggplot(aes(x = year, y = gap_math_wmean,
             group = country, colour = stat)) +
  geom_line() +
  gghighlight::gghighlight(!is.na(stat), label_key = label_stats_country) +
  labs(x = "Year",
       y = "Values",
       title = "Highlighting key countries for maths")

p_math
```


# Session info
```{r}
sessionInfo()
```

--------------------------------------------------------------------------------
/vignettes/learningtower_school.Rmd:
--------------------------------------------------------------------------------
---
title: "Using the Student and School Data"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{learningtower_school}
  %\VignetteEncoding{UTF-8}
  %\VignetteEngine{knitr::rmarkdown}
editor_options:
  chunk_output_type: console
---

```{r setup, include = FALSE}
# `out.width` is the knitr chunk option for the displayed figure width.
knitr::opts_chunk$set(
  echo = TRUE,
  collapse = TRUE,
  comment = "#>",
  warning = FALSE,
  message = FALSE,
  error = FALSE,
  out.width = "100%",
  fig.width = 6,
  fig.height = 4,
  fig.align = "center")
```

# Introduction

The goal of `learningtower` is to provide a user-friendly R package that gives easy access to a subset of variables from PISA data collected from the [OECD](https://www.oecd.org/en/about/programmes/pisa/pisa-data.html). Version `r utils::packageVersion("learningtower")` of this package provides the data for the years `r learningtower:::data_range()`. The survey data is published every three years. This is an excellent real world dataset for data exploration, data visualization and statistical computations.

This vignette documents how to access the data, and shows a few ways of integrating the data.

# Using both the `student` and `school` data

The full `student` data is too big to fit inside the package.
Hence, in our package, we provide a random subset of the student data, stored as `student_subset_yyyy` data objects (where `yyyy` denotes the specific year of the study). These subset data can be used to understand the data structure before using the full dataset which is available for download.

In the `student_subset_2018` and `school` data, there are three common columns, `school_id`, `country` and `year`. It should be noted that `school_id` is only meaningful within a country within a specific year; meaning that when we join the two data, we need to use the keys `c("school_id", "country", "year")`.

## Using the student subset data and school data

```{r}
library(dplyr)
library(ggplot2)
library(forcats)
library(learningtower)

# load the student subset data
data(student_subset_2018)

# load the school data
data(school)

# load the country data
data(countrycode)

selected_countries <- c("AUS", "FIN", "JPN", "USA", "NZL", "ESP")

# join the student and school data; `school_id` is only unique within a
# country and year, so all three keys are required
school_student_subset_2018 <- left_join(
  student_subset_2018,
  school,
  by = c("school_id", "country", "year"))

# share of public and private schools in a few selected countries
school_student_subset_2018 |>
  dplyr::filter(country %in% selected_countries) |>
  # one row per school, so that a school with several sampled students is
  # counted once and the shares are shares of schools, not of students
  dplyr::distinct(country, school_id, public_private) |>
  group_by(country, public_private) |>
  tally() |>
  # `tally()` leaves the data grouped by country: shares are within country
  dplyr::mutate(percent = n / sum(n)) |>
  dplyr::ungroup() |>
  left_join(countrycode, by = "country") |>
  ggplot(aes(x = percent,
             y = country_name,
             fill = public_private)) +
  geom_col(position = position_stack()) +
  scale_x_continuous(labels = scales::percent) +
  scale_fill_manual(values = c("#FF7F0EFF", "#1F77B4FF")) +
  labs(title = "Distribution of public and private schools in the year 2018",
       y = "",
       x = "Percentage of schools",
       fill = "")
```

- The graph assists us in understanding the distribution of public and private schools in a few countries based on the datasets. Taking a closer look at the above plot, we can infer that most countries have more public schools than private schools. Interestingly, Spain had a nearly equal mix of public and private schools in the year 2018.

- Similarly, we may derive additional intriguing patterns and analysis by considering the other variables in the school dataset.

```{r, echo=FALSE}
student_data_2018 <- load_student("2018")
data(school)

data(countrycode)

school_student_2018 <- left_join(
  student_data_2018,
  school,
  by = c("school_id", "country", "year"))

# average share of government funding per country, ordered for plotting
school_student_2018 |>
  dplyr::filter(country %in% selected_countries) |>
  group_by(country) |>
  summarise(avg_fund_gov = mean(fund_gov, na.rm = TRUE)) |>
  left_join(countrycode, by = "country") |>
  mutate(country_name = fct_reorder(country_name, avg_fund_gov)) |>
  ggplot(aes(x = country_name, y = avg_fund_gov)) +
  geom_segment(aes(xend = country_name, yend = 0)) +
  geom_point(size = 4, color = "orange") +
  coord_flip() +
  theme_bw() +
  labs(x = "",
       y = "Average percentage of government funding",
       title = "Funding for schools in the year 2018 from government")
```

- The above figure shows the average percentage of overall financing in various schools for a random sample of countries. We conclude that countries such as Finland and the United States received the most funding from their governments, whilst Qatar received the least funding. <!-- NOTE(review): Qatar is not among `selected_countries` in this vignette; confirm which plotted country has the lowest share and update this sentence. -->

- In addition, to perform a detailed analysis on the school and entire student data, the full student data can be downloaded for the desired years using the `load_student` function available in this package.

- Similarly, you may import student data for any chosen year and experiment with PISA scores growth or additional analysis of these datasets with their other variables that help users comprehend the data. Refer to our articles [here](https://kevinwang09.github.io/learningtower/articles/exploring_time.html) for additional interesting analyses and plots.

--------------------------------------------------------------------------------
/vignettes/learningtower_student.Rmd:
--------------------------------------------------------------------------------
---
title: "Using the Student and Country Data"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{learningtower_student}
  %\VignetteEncoding{UTF-8}
  %\VignetteEngine{knitr::rmarkdown}
editor_options:
  chunk_output_type: console
---

```{r setup, include = FALSE}
options(rmarkdown.html_vignette.check_title = FALSE)
# `out.width` is the knitr chunk option for the displayed figure width.
knitr::opts_chunk$set(
  echo = TRUE,
  collapse = TRUE,
  comment = "#>",
  warning = FALSE,
  message = FALSE,
  error = FALSE,
  out.width = "100%",
  fig.width = 8,
  fig.height = 6,
  fig.align = "center")
```

# Introduction

The goal of `learningtower` is to provide a user-friendly R package that gives easy access to a subset of variables from PISA data collected from the [OECD](https://www.oecd.org/en/about/programmes/pisa/pisa-data.html). Version `r utils::packageVersion("learningtower")` of this package provides the data for the years `r learningtower:::data_range()`. The survey data is published every three years. This is an excellent real world dataset for data exploration, data visualization and statistical computations.

This vignette documents how to access the data, and shows a few typical methods to explore the data.

# Exploring the `student` data

## Usage of the subset of the `student` data

- In `learningtower`, the main data is the student data. This data contains information regarding student test scores and some selected variables regarding their schooling and socio-economic status. The original and complete data may be obtained from [OECD](https://www.oecd.org/pisa/data/).

- However, the full `student` data is too big to fit inside the package. Hence, in our package, we provide a random subset of the student data, stored as `student_subset_yyyy` data objects (where `yyyy` denotes the specific year of the study). These subset data can be used to understand the data structure before using the full dataset which is available for download.

- The student subset data is constructed by randomly sampling from the full student data. For each year and each country, we randomly sample approximately 50 observations.

- The complete student dataset is [available for download](https://github.com/kevinwang09/learningtower/tree/master/student_full_data) and can be loaded using the `load_student()` function included in this package.


Below is a quick example of loading the 2018 subset student data.

```{r}
library(dplyr)
library(ggplot2)
library(learningtower)

# load the subset student data for the year 2018
data(student_subset_2018)
# load the countrycode data
data(countrycode)

glimpse(student_subset_2018)
```

```{r}
selected_countries <- c("AUS", "USA", "TUR", "SWE",
                        "CHE", "NZL", "BEL", "DEU")

# maths score distribution by gender for the selected countries
student_subset_2018 |>
  dplyr::filter(country %in% selected_countries) |>
  dplyr::left_join(countrycode, by = "country") |>
  ggplot(aes(x = math,
             y = country_name,
             fill = gender)) +
  geom_boxplot() +
  scale_fill_manual(values = c("#FF7F0EFF", "#1F77B4FF")) +
  theme_classic() +
  labs(x = "Math score",
       y = "")
```

- In the figure above, we see that from the student subset data for the year 2018, in countries like the USA and Belgium, boys perform better than girls. However, in countries such as Turkey and Switzerland, girls perform better than the boys or are on the same level with boys when it comes to their average mathematics scores.

- Furthermore, if we want to learn more about the trend in each year of the selected countries or know more about the yearly student scores, the complete student data can be retrieved for that/those years or all years using the `load_student()` function included in this package.

## Usage of the entire student data

- To download and load the complete student data for one or more years, here are the various ways to retrieve the entire student dataset for additional study or analysis purposes.

```
# load the entire student data for the year 2018
student_data_2018 <- load_student(2018)

# load the entire student data for two of the years (2012, 2018)
student_data_2012_2018 <- load_student(c(2012, 2018))

# load the entire student data for all available years
student_data_all <- load_student("all")
```

- Now that we can download and load the entire student data, let us plot the difference in score between a few randomly picked countries seen previously and observe how they have grown in terms of their average mathematics score from the year 2012 to 2018.

```{r}
student_data_2012_2018 <- load_student(c(2012, 2018))

# average maths score per selected country and year, plus the text labels and
# their x positions (left of 2012, right of 2018) for the slope chart
plot_data <- student_data_2012_2018 |>
  dplyr::filter(country %in% selected_countries) |>
  group_by(country, year) |>
  # drop the grouping here so that later verbs work on plain rows
  dplyr::summarise(avg_math = mean(math, na.rm = TRUE), .groups = "drop") |>
  left_join(countrycode, by = "country") |>
  dplyr::select(country_name, year, avg_math) |>
  dplyr::mutate(
    label_x_pos = ifelse(year == 2012, 2012 - 2, 2018 + 1),
    label = ifelse(
      year == 2012,
      paste0(country_name, ", ", round(avg_math)),
      round(avg_math)))

plot_data |>
  ggplot(aes(x = year,
             y = avg_math,
             label = label,
             colour = country_name,
             group = country_name)) +
  geom_point() +
  geom_line() +
  geom_vline(xintercept = c(2012, 2018),
             linetype = "dashed",
             linewidth = 0.1) +
  geom_text(aes(x = label_x_pos),
            position = position_nudge(y = 0)) +
  scale_x_continuous(breaks = c(2012, 2018),
                     limits = c(2008, 2020)) +
  scale_colour_manual(values = c("#1F77B4FF", "#FF7F0EFF", "#2CA02CFF", "#D62728FF",
                                 "#9467BDFF", "#8C564BFF", "#E377C2FF", "#7F7F7FFF")) +
  labs(x = "",
       y = "Average maths score") +
  theme_classic() +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        legend.position = "none")
```

- The figure above assists us in deducing the score change in the different countries from the year 2012 to 2018. This figure enables us to deduce that Albania, Qatar, and Peru have significantly boosted their average mathematics score between these years. We also observe a drop in average mathematics score for Japan. <!-- NOTE(review): none of the countries named here are in `selected_countries` for this vignette; confirm against the rendered figure and update. -->

- Similarly, you may import student data for any chosen year and experiment with the PISA scores or additional analysis of these datasets with their other variables that help users comprehend the data. Refer to our articles [here](https://kevinwang09.github.io/learningtower/articles/exploring_time.html) for additional interesting analyses and plots.


--------------------------------------------------------------------------------