├── .gitignore ├── figures ├── 001_input.png ├── 003_corr.png ├── 004_diag.png ├── 002_inspect.png ├── 005_compute.png ├── 006_pca_plots.png └── 007_pca_output.png ├── DESCRIPTION ├── Interactive_PCA_Explorer.Rproj ├── rsconnect └── shinyapps.io │ └── benmarwick │ └── interactive_pca.dcf ├── LICENSE ├── CONDUCT.md ├── readme.md ├── CONTRIBUTING.md ├── iris.csv ├── ui.R └── server.R /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | -------------------------------------------------------------------------------- /figures/001_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmarwick/Interactive_PCA_Explorer/HEAD/figures/001_input.png -------------------------------------------------------------------------------- /figures/003_corr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmarwick/Interactive_PCA_Explorer/HEAD/figures/003_corr.png -------------------------------------------------------------------------------- /figures/004_diag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmarwick/Interactive_PCA_Explorer/HEAD/figures/004_diag.png -------------------------------------------------------------------------------- /figures/002_inspect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmarwick/Interactive_PCA_Explorer/HEAD/figures/002_inspect.png -------------------------------------------------------------------------------- /figures/005_compute.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmarwick/Interactive_PCA_Explorer/HEAD/figures/005_compute.png -------------------------------------------------------------------------------- /figures/006_pca_plots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmarwick/Interactive_PCA_Explorer/HEAD/figures/006_pca_plots.png -------------------------------------------------------------------------------- /figures/007_pca_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmarwick/Interactive_PCA_Explorer/HEAD/figures/007_pca_output.png -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Title: Interactive PCA Explorer 2 | Author: Ben Marwick 3 | License: GPL-3 4 | # DisplayMode: Showcase 5 | Type: Shiny -------------------------------------------------------------------------------- /Interactive_PCA_Explorer.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: XeLaTeX 14 | -------------------------------------------------------------------------------- /rsconnect/shinyapps.io/benmarwick/interactive_pca.dcf: -------------------------------------------------------------------------------- 1 | name: interactive_pca 2 | account: benmarwick 3 | server: shinyapps.io 4 | appId: 97567 5 | bundleId: 431924 6 | url: https://benmarwick.shinyapps.io/interactive_pca/ 7 | when: 1461510915.71543 8 | asMultiple: FALSE 9 | asStatic: FALSE 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Ben Marwick 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who 4 | contribute through reporting issues, posting feature requests, updating documentation, 5 | submitting pull requests or patches, and other activities. 6 | 7 | We are committed to making participation in this project a harassment-free experience for 8 | everyone, regardless of level of experience, gender, gender identity and expression, 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 10 | 11 | Examples of unacceptable behavior by participants include the use of sexual language or 12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment, 13 | insults, or other unprofessional conduct. 14 | 15 | Project maintainers have the right and responsibility to remove, edit, or reject comments, 16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 18 | from the project team. 19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 21 | opening an issue or contacting one or more of the project maintainers. 22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (http:contributor-covenant.org), version 1.0.0, available at 25 | http://contributor-covenant.org/version/1/0/0/ 26 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Interactive PCA Explorer 2 | 3 | This [Shiny](http://shiny.rstudio.com/) application takes a CSV file of clean data, allows you to inspect the data and compute a Principal Components Analysis, and will return several diagnostic plots and tables. The plots include a tableplot, a correlation matrix, a scree plot, and a biplot of Principal Components. 4 | 5 | You can chose which columns to include in the PCA, and which column to use as a grouping variable. You can choose the center and/or scale the data, or not. You can choose which PCs to include on the biplot. 6 | 7 | The biplot of PCs is interactive, so you can click on points or select points and inspect the details of those points in a table. 8 | 9 | ## How to run or install 10 | 11 | There are two ways to run/install this app. 12 | 13 | First, you can run it on your computer like so: 14 | 15 | ``` 16 | library(shiny) 17 | runGitHub("interactive_pca_explorer", "benmarwick") 18 | 19 | ``` 20 | 21 | Second, you can clone this repo to have the code on your computer, and run the app from there, like so: 22 | 23 | ``` 24 | # First clone the repository with git. If you have cloned it into 25 | # ~/interactive_pca_explorer, first change your working directory to ~/interactive_pca_explorer, then use runApp() to start the app. 26 | setwd("~/interactive_pca_explorer") # change to match where you downloaded this repo to 27 | runApp() # runs the app 28 | ``` 29 | 30 | This app depends on several R packages (ggplot2, DT, GGally, psych, Hmisc, MASS, tabplot). The app will check to see if you have them installed, and if you don't, it will try to download and install them for you. 31 | 32 | ## How to use 33 | 34 | Start on the first (left-most) tab to upload your CSV file, then click on each tab, in order from left to right, to see the results. 35 | 36 | ## Screenshots 37 | 38 | Here's what it looks like. Here we have input a CSV file that contain the [iris data](https://en.wikipedia.org/wiki/Iris_flower_data_set) (included with this app). 39 | 40 | ![](figures/001_input.png) 41 | 42 | Then we can see some simple descriptions of the data, and the raw data at the bottom of the page. 43 | 44 | 45 | ![](figures/002_inspect.png) 46 | 47 | Below we see how we can choose the variables to explore in a correlation matrix. We also have a table that summarizes the correlations and gives p-values. 48 | 49 | ![](figures/003_corr.png) 50 | 51 | Below we have a few popular diagnostic tests that many people like to do before doing a PCA. They're not very informative and can be skipped, but people coming from SPSS might feel more comfortable if they can see them here also. 52 | 53 | ![](figures/004_diag.png) 54 | 55 | Below are the options for computing the PCA. We can choose which columns to include, and a few details about the PCA function. We are using the [prcomp](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/prcomp.html) function to compute the PCA. 56 | 57 | ![](figures/005_compute.png) 58 | 59 | Here are the classic PCA plots. First is the scree plot summarizing how important the first few PCs are. Second is the interactive PC biplot. You can see that I've used my mouse to draw a rectangle around a few of the points in the biplot (this is called 'brushing') and in the table below we can see the details of those points in the selected area. We can choose which column to use for grouping (this only affects the colouring of the plot, it doesn't change the PCA results), and we can choose which PCs to show on the plot. 60 | 61 | ![](figures/006_pca_plots.png) 62 | 63 | Finally we have some of the raw output from the PCA. 64 | 65 | ![](figures/007_pca_output.png) 66 | 67 | 68 | ## Feedback, contributing, etc. 69 | 70 | Please [open an issue](https://github.com/benmarwick/wordcountaddin/issues/new) if you find something that doesn't work as expected. Note that this project is released with a [Guide to Contributing](CONTRIBUTING.md) and a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms. -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | ## Pull requests 4 | 5 | Requirements for making a pull request: 6 | 7 | * Some knowledge of [git]() 8 | * Some knowledge of [GitHub]() 9 | 10 | Read more about pull requests on GitHub at [https://help.github.com/articles/using-pull-requests/](https://help.github.com/articles/using-pull-requests/). If you haven't done this before, Hadley Wickham provides a nice overview of git (), as well as best practices for submitting pull requests (). 11 | 12 | Then: 13 | 14 | * Fork the repo to your GitHub account 15 | * Clone the version on your account down to your machine from your account, e.g,. `git clone git@github.com:benmarwick/.git` 16 | * Make sure to track progress upstream (i.e., on our version of the package at `benmarwick/`) by doing `git remote add upstream git@github.com:benmarwick/.git`. Each time you go to make changes on your machine, be sure to pull changes in from upstream (aka the ropensci version) by doing either `git fetch upstream` then merge later or `git pull upstream` to fetch and merge in one step 17 | * Make your changes (we prefer if you make changes on a new branch) 18 | * Ideally included in your contributions: 19 | * Well documented code in roxygen docs 20 | * If you add new functions or change functionality, add one or more tests. 21 | * Make sure the package passes `R CMD CHECK` on your machine without errors/warnings 22 | * Push up to your account 23 | * Submit a pull request and participate in the discussion. 24 | 25 | ## Documentation contributions 26 | 27 | Documentation contributions are surely much needed in every project as each could surely use better instructions. If you are editing any files in the repo, follow the above instructions for pull requests to add contributions. However, if you are editing the wiki, then you can just edit the wiki and no need to do git, pull requests, etc. 28 | 29 | All of the function documentation is generated automatically. Please do not edit any of the documentation files in man/ or the NAMESPACE. Instead, construct the appropriate roxygen2 documentation in the function files in R/ themselves. The documentation is then generated by running the document() function from the devtools package. Please consult the Advanced R programming guide if this workflow is unfamiliar to you. Note that functions should include examples in the documentation. Please use \dontrun for examples that take more than a few seconds to execute or require an internet connection. 30 | 31 | Likewise, the README.md file in the base directory should not be edited directly. This file is created automatically from code that runs the examples shown, helping to ensure that they are functioning as advertised and consistent with the package README vignette. Instead, edit the README.Rmd source file in manuscripts and run make to build the README. 32 | 33 | ## Repository structure 34 | 35 | This repository is structured as a standard R package following the conventions outlined in the Writing R extensions manual. A few additional files are provided that are not part of the built R package and are listed in .Rbuildignore, such as .travis.yml, which is used for continuous testing and integration. 36 | 37 | ## Code 38 | 39 | All code for this package is found in R/, (except compiled source code, if used, which is in /src). All functions should be thoroughly documented with roxygen2 notation; see Documentation. 40 | 41 | Bug reports _must_ have a [reproducible example](http://adv-r.had.co.nz/Reproducibility.html) and include the output of `devtools::session_info()` (instead of `sessionInfo()`). We recommend using Hadley Wickham's style guide when writing code (). 42 | 43 | ## Testing 44 | 45 | Any new feature or bug-fix should include a unit-test demonstrating the change. Unit tests follow the testthat framework with files in tests/testthat. Please make sure that the testing suite passes before issuing a pull request. This can be done by running check() from the devtools package, which will also check for consistent documentation, etc. 46 | 47 | This package uses the travis continuous testing mechanism for R to ensure that the test suite is run on each push to Github. An icon at the top of the README.md indicates whether or not the tests are currently passing. 48 | 49 | ## Questions or comments? 50 | 51 | Do not hesistate to open an issue in the issues tracker to raise any questions or comments about the package or these guidelines. 52 | -------------------------------------------------------------------------------- /iris.csv: -------------------------------------------------------------------------------- 1 | "Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species" 2 | 5.1,3.5,1.4,0.2,"setosa" 3 | 4.9,3,1.4,0.2,"setosa" 4 | 4.7,3.2,1.3,0.2,"setosa" 5 | 4.6,3.1,1.5,0.2,"setosa" 6 | 5,3.6,1.4,0.2,"setosa" 7 | 5.4,3.9,1.7,0.4,"setosa" 8 | 4.6,3.4,1.4,0.3,"setosa" 9 | 5,3.4,1.5,0.2,"setosa" 10 | 4.4,2.9,1.4,0.2,"setosa" 11 | 4.9,3.1,1.5,0.1,"setosa" 12 | 5.4,3.7,NA,0.2,"setosa" 13 | 4.8,3.4,1.6,0.2,"setosa" 14 | 4.8,3,1.4,0.1,"setosa" 15 | 4.3,3,1.1,0.1,"setosa" 16 | 5.8,4,1.2,0.2,"setosa" 17 | 5.7,4.4,1.5,0.4,"setosa" 18 | 5.4,3.9,1.3,0.4,"setosa" 19 | 5.1,3.5,1.4,0.3,"setosa" 20 | 5.7,NA,1.7,0.3,"setosa" 21 | 5.1,3.8,1.5,0.3,"setosa" 22 | 5.4,3.4,1.7,0.2,"setosa" 23 | 5.1,3.7,1.5,0.4,"setosa" 24 | 4.6,3.6,1,0.2,"setosa" 25 | 5.1,3.3,1.7,0.5,"setosa" 26 | 4.8,3.4,1.9,0.2,"setosa" 27 | 5,3,1.6,0.2,"setosa" 28 | 5,3.4,1.6,0.4,"setosa" 29 | 5.2,3.5,NA,0.2,"setosa" 30 | 5.2,3.4,1.4,0.2,"setosa" 31 | 4.7,3.2,1.6,0.2,"setosa" 32 | 4.8,3.1,1.6,0.2,"setosa" 33 | 5.4,3.4,1.5,0.4,"setosa" 34 | 5.2,4.1,1.5,0.1,"setosa" 35 | 5.5,4.2,1.4,0.2,"setosa" 36 | 4.9,3.1,1.5,0.2,"setosa" 37 | 5,3.2,1.2,0.2,"setosa" 38 | 5.5,3.5,1.3,0.2,"setosa" 39 | 4.9,3.6,1.4,0.1,"setosa" 40 | 4.4,3,1.3,0.2,"setosa" 41 | 5.1,3.4,1.5,0.2,"setosa" 42 | 5,3.5,1.3,0.3,"setosa" 43 | 4.5,2.3,1.3,0.3,"setosa" 44 | 4.4,3.2,1.3,0.2,"setosa" 45 | 5,3.5,1.6,0.6,"setosa" 46 | 5.1,3.8,1.9,0.4,"setosa" 47 | 4.8,3,1.4,0.3,"setosa" 48 | 5.1,3.8,1.6,0.2,"setosa" 49 | 4.6,3.2,1.4,0.2,"setosa" 50 | 5.3,3.7,1.5,0.2,"setosa" 51 | 5,3.3,1.4,0.2,"setosa" 52 | 7,3.2,4.7,1.4,"versicolor" 53 | 6.4,3.2,4.5,1.5,"versicolor" 54 | 6.9,3.1,4.9,1.5,"versicolor" 55 | 5.5,2.3,4,1.3,"versicolor" 56 | 6.5,2.8,4.6,1.5,"versicolor" 57 | 5.7,2.8,4.5,1.3,"versicolor" 58 | 6.3,3.3,4.7,1.6,"versicolor" 59 | 4.9,2.4,3.3,1,"versicolor" 60 | 6.6,2.9,4.6,1.3,"versicolor" 61 | 5.2,2.7,3.9,1.4,"versicolor" 62 | 5,2,3.5,1,"versicolor" 63 | 5.9,3,4.2,1.5,"versicolor" 64 | 6,2.2,4,1,"versicolor" 65 | 6.1,2.9,4.7,1.4,"versicolor" 66 | 5.6,2.9,3.6,1.3,"versicolor" 67 | 6.7,3.1,4.4,1.4,"versicolor" 68 | 5.6,3,4.5,1.5,"versicolor" 69 | 5.8,2.7,4.1,1,"versicolor" 70 | 6.2,2.2,4.5,1.5,"versicolor" 71 | 5.6,2.5,3.9,1.1,"versicolor" 72 | 5.9,3.2,4.8,1.8,"versicolor" 73 | 6.1,2.8,4,1.3,"versicolor" 74 | 6.3,2.5,4.9,1.5,"versicolor" 75 | 6.1,2.8,4.7,1.2,"versicolor" 76 | 6.4,2.9,4.3,1.3,"versicolor" 77 | 6.6,3,4.4,1.4,"versicolor" 78 | 6.8,2.8,4.8,1.4,"versicolor" 79 | 6.7,3,5,1.7,"versicolor" 80 | 6,2.9,4.5,1.5,"versicolor" 81 | 5.7,2.6,3.5,1,"versicolor" 82 | 5.5,2.4,3.8,1.1,"versicolor" 83 | 5.5,2.4,3.7,1,"versicolor" 84 | 5.8,2.7,3.9,1.2,"versicolor" 85 | 6,2.7,5.1,1.6,"versicolor" 86 | 5.4,3,4.5,1.5,"versicolor" 87 | 6,3.4,4.5,1.6,"versicolor" 88 | 6.7,3.1,4.7,1.5,"versicolor" 89 | 6.3,2.3,4.4,1.3,"versicolor" 90 | 5.6,3,4.1,1.3,"versicolor" 91 | 5.5,2.5,4,1.3,"versicolor" 92 | 5.5,2.6,4.4,1.2,"versicolor" 93 | 6.1,3,4.6,1.4,"versicolor" 94 | 5.8,2.6,4,1.2,"versicolor" 95 | 5,2.3,3.3,1,"versicolor" 96 | 5.6,2.7,4.2,1.3,"versicolor" 97 | 5.7,3,4.2,1.2,"versicolor" 98 | 5.7,2.9,4.2,1.3,"versicolor" 99 | 6.2,2.9,4.3,1.3,"versicolor" 100 | 5.1,2.5,3,1.1,"versicolor" 101 | 5.7,2.8,4.1,1.3,"versicolor" 102 | 6.3,3.3,6,2.5,"virginica" 103 | 5.8,2.7,5.1,1.9,"virginica" 104 | 7.1,3,5.9,2.1,"virginica" 105 | 6.3,2.9,5.6,1.8,"virginica" 106 | 6.5,3,5.8,2.2,"virginica" 107 | 7.6,3,6.6,2.1,"virginica" 108 | 4.9,2.5,4.5,1.7,"virginica" 109 | 7.3,2.9,6.3,1.8,"virginica" 110 | 6.7,2.5,5.8,1.8,"virginica" 111 | 7.2,3.6,6.1,2.5,"virginica" 112 | 6.5,3.2,5.1,2,"virginica" 113 | 6.4,2.7,5.3,1.9,"virginica" 114 | 6.8,3,5.5,2.1,"virginica" 115 | 5.7,2.5,5,2,"virginica" 116 | 5.8,2.8,5.1,2.4,"virginica" 117 | 6.4,3.2,5.3,2.3,"virginica" 118 | 6.5,3,5.5,1.8,"virginica" 119 | 7.7,3.8,6.7,2.2,"virginica" 120 | 7.7,2.6,6.9,2.3,"virginica" 121 | 6,2.2,5,1.5,"virginica" 122 | 6.9,3.2,5.7,2.3,"virginica" 123 | 5.6,2.8,4.9,2,"virginica" 124 | 7.7,2.8,6.7,2,"virginica" 125 | 6.3,2.7,4.9,1.8,"virginica" 126 | 6.7,3.3,5.7,2.1,"virginica" 127 | 7.2,3.2,6,1.8,"virginica" 128 | 6.2,2.8,4.8,1.8,"virginica" 129 | 6.1,3,4.9,1.8,"virginica" 130 | 6.4,2.8,5.6,2.1,"virginica" 131 | 7.2,3,5.8,1.6,"virginica" 132 | 7.4,2.8,6.1,1.9,"virginica" 133 | 7.9,3.8,6.4,2,"virginica" 134 | 6.4,2.8,5.6,2.2,"virginica" 135 | 6.3,2.8,5.1,1.5,"virginica" 136 | 6.1,2.6,5.6,1.4,"virginica" 137 | 7.7,3,6.1,2.3,"virginica" 138 | 6.3,3.4,5.6,2.4,"virginica" 139 | 6.4,3.1,5.5,1.8,"virginica" 140 | 6,3,4.8,1.8,"virginica" 141 | 6.9,3.1,5.4,2.1,"virginica" 142 | 6.7,3.1,5.6,2.4,"virginica" 143 | 6.9,3.1,5.1,2.3,"virginica" 144 | 5.8,2.7,5.1,1.9,"virginica" 145 | 6.8,3.2,5.9,2.3,"virginica" 146 | 6.7,3.3,5.7,2.5,"virginica" 147 | 6.7,3,5.2,2.3,"virginica" 148 | 6.3,2.5,5,1.9,"virginica" 149 | 6.5,3,5.2,2,"virginica" 150 | 6.2,3.4,5.4,2.3,"virginica" 151 | 5.9,3,5.1,1.8,"virginica" 152 | -------------------------------------------------------------------------------- /ui.R: -------------------------------------------------------------------------------- 1 | ui <- bootstrapPage( 2 | mainPanel( 3 | titlePanel("Interactive PCA Explorer"), 4 | 5 | tabsetPanel( 6 | 7 | tabPanel("Data input", 8 | p("Before uploading your data, check that it is clean, especially ensure that the the numeric variables contain only the digits 0-9 or NA (to indicate missing data)."), 9 | p("Rows that contain one or more NAs will be excluded from the PCA."), 10 | p("Columns that contain a mixture of numbers and text will not be included in the computation of the PCA results."), 11 | p("Have a look at the ", a("iris.csv", href = "https://raw.githubusercontent.com/benmarwick/Interactive_PCA_Explorer/master/iris.csv"), " file included with this app to see what a clean CSV file looks like."), 12 | tags$hr(), 13 | p("Select the options that match your CSV file, then upload your file:"), 14 | 15 | 16 | radioButtons(inputId = 'header', 17 | label = 'Header', 18 | choices = c('Columns have headers'='Yes', 19 | 'Columns do not have headers'='No'), 20 | selected = 'Yes'), 21 | 22 | radioButtons('sep', 'Separator', 23 | c(Comma=',', 24 | Semicolon=';', 25 | Tab='\t'), 26 | ','), 27 | 28 | radioButtons('quote', 'Quote', 29 | c(None='', 30 | 'Double Quote'='"', 31 | 'Single Quote'="'"), 32 | '"'), 33 | 34 | tags$hr(), 35 | 36 | fileInput('file1', 'Choose a CSV file to upload:', 37 | accept = c( 38 | 'text/csv', 39 | 'text/comma-separated-values', 40 | 'text/tab-separated-values', 41 | 'text/plain', 42 | '.csv', 43 | '.tsv' 44 | )), 45 | p("After uploading your CSV file, click on the 'Inspect the data' tab") 46 | 47 | ), # end file tab 48 | 49 | tabPanel("Inspect the data", 50 | 51 | p("The tableplot below (it will take a few seconds to appear) may be useful to explore the relationships between the variables, to discover strange data patterns, and to check the occurrence and selectivity of missing values."), 52 | plotOutput("tableplot"), 53 | tags$hr(), 54 | p("Here is a summary of the data"), 55 | tableOutput('summary'), 56 | tags$hr(), 57 | p("Here is the raw data from the CSV file"), 58 | DT::dataTableOutput('contents') 59 | ), # end tab 60 | 61 | 62 | tabPanel("Correlation Plots", 63 | uiOutput("choose_columns_biplot"), 64 | tags$hr(), 65 | p("This plot may take a few moments to appear when analysing large datasets. You may want to exclude highly correlated variables from the PCA."), 66 | 67 | plotOutput("corr_plot"), 68 | tags$hr(), 69 | p("Summary of correlations"), 70 | tableOutput("corr_tables") 71 | ), # end tab 72 | 73 | tabPanel("Diagnostics", 74 | 75 | p("Among SPSS users, these tests are considered to provide some guidelines on the suitability of the data for a principal components analysis. However, they may be safely ignored in favour of common sense. Variables with zero variance are excluded."), 76 | tags$hr(), 77 | p("Here is the output of Bartlett's sphericity test. Bartlett's test of sphericity tests whether the data comes from multivariate normal distribution with zero covariances. If p > 0.05 then PCA may not be very informative"), 78 | verbatimTextOutput("bartlett"), 79 | tags$hr(), 80 | p("Here is the output of the Kaiser-Meyer-Olkin (KMO) index test. The overall measure varies between 0 and 1, and values closer to 1 are better. A value of 0.6 is a suggested minimum. "), 81 | verbatimTextOutput("kmo") 82 | 83 | 84 | 85 | ), # end tab 86 | 87 | tabPanel("Compute PCA", 88 | 89 | p("Choose the columns of your data to include in the PCA."), 90 | p("Only columns containing numeric data are shown here because PCA doesn't work with non-numeric data."), 91 | p("The PCA is automatically re-computed each time you change your selection."), 92 | p("Observations (ie. rows) are automatically removed if they contain any missing values."), 93 | p("Variables with zero variance have been automatically removed because they're not useful in a PCA."), 94 | uiOutput("choose_columns_pca"), 95 | tags$hr(), 96 | p("Select options for the PCA computation (we are using the prcomp function here)"), 97 | radioButtons(inputId = 'center', 98 | label = 'Center', 99 | choices = c('Shift variables to be zero centered'='Yes', 100 | 'Do not shift variables'='No'), 101 | selected = 'Yes'), 102 | 103 | radioButtons('scale.', 'Scale', 104 | choices = c('Scale variables to have unit variance'='Yes', 105 | 'Do not scale variables'='No'), 106 | selected = 'Yes') 107 | 108 | ), # end tab 109 | 110 | 111 | 112 | tabPanel("PC Plots", 113 | h2("Scree plot"), 114 | p("The scree plot shows the variances of each PC, and the cumulative variance explained by each PC (in %) "), 115 | plotOutput("plot2", height = "300px"), 116 | tags$hr(), 117 | h2("PC plot: zoom and select points"), 118 | p("Select the grouping variable."), 119 | p("Only variables where the number of unique values is less than 10% of the total number of observations are shown here (because seeing groups with 1-2 observations is usually not very useful)."), 120 | uiOutput("the_grouping_variable"), 121 | tags$hr(), 122 | p("Select the PCs to plot"), 123 | uiOutput("the_pcs_to_plot_x"), 124 | uiOutput("the_pcs_to_plot_y"), 125 | tags$hr(), 126 | 127 | p("Click and drag on the first plot below to zoom into a region on the plot. Or you can go directly to the second plot below to select points to get more information about them."), 128 | p("Then select points on zoomed plot below to get more information about the points."), 129 | p("You can click on the 'Compute PCA' tab at any time to change the variables included in the PCA, and then come back to this tab and the plots will automatically update."), 130 | plotOutput ("z_plot1", height = 400, 131 | brush = brushOpts( 132 | id = "z_plot1Brush", 133 | resetOnNew = TRUE)), 134 | tags$hr(), 135 | 136 | p("Click and drag on the plot below to select points, and inspect the table of selected points below"), 137 | 138 | plotOutput("z_plot2", height = 400, 139 | brush = brushOpts( 140 | id = "plot_brush_after_zoom", 141 | resetOnNew = TRUE)), 142 | tags$hr(), 143 | p("Details of the brushed points"), 144 | tableOutput("brush_info_after_zoom") 145 | ), # end tab 146 | 147 | 148 | 149 | tabPanel("PCA output", 150 | verbatimTextOutput("pca_details") 151 | 152 | ), # end tab 153 | 154 | tabPanel("Colophon", 155 | p("The code for this Shiny app is online at ", a("https://github.com/benmarwick/Interactive_PCA_Explorer", href = "https://github.com/benmarwick/Interactive_PCA_Explorer"), ". Please post any feedback, question, etc. as an ", a("issue on github", href = "https://github.com/benmarwick/Interactive_PCA_Explorer/issues/new"), "."), 156 | p("The text is licensed ", a("CC-BY", href = "http://creativecommons.org/licenses/by/4.0/"), " and the code ", a(href = "https://opensource.org/licenses/MIT", "MIT"), ".") 157 | 158 | 159 | ) # end tab 160 | 161 | 162 | ))) -------------------------------------------------------------------------------- /server.R: -------------------------------------------------------------------------------- 1 | # global items 2 | 3 | # check if pkgs are installed already, if not, install automatically: 4 | # (http://stackoverflow.com/a/4090208/1036500) 5 | list.of.packages <- c("ggplot2", 6 | "DT", 7 | "GGally", 8 | "psych", 9 | "Hmisc", 10 | "MASS", 11 | "tabplot") 12 | 13 | new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] 14 | if(length(new.packages)) install.packages(new.packages) 15 | 16 | # load all these 17 | lapply(list.of.packages, require, character.only = TRUE) 18 | 19 | server <- function(input, output) { 20 | 21 | # read in the CSV 22 | the_data_fn <- reactive({ 23 | inFile <- input$file1 24 | if (is.null(inFile)) return(NULL) 25 | the_data <- read.csv(inFile$datapath, header = (input$header == "Yes"), 26 | sep = input$sep, quote = input$quote, stringsAsFactors=FALSE) 27 | return(the_data) 28 | }) 29 | 30 | 31 | # tableplot 32 | output$tableplot <- renderPlot({ 33 | if(is.null(the_data_fn())) return() 34 | the_data <- the_data_fn() 35 | tabplot::tableplot(the_data) 36 | 37 | }) 38 | 39 | # display a table of the CSV contents 40 | output$contents <- DT::renderDataTable({ 41 | # 42 | the_data_fn() 43 | }) 44 | 45 | # display a summary of the CSV contents 46 | output$summary <- renderTable({ 47 | the_data <- the_data_fn() 48 | psych::describe(the_data) 49 | }) 50 | 51 | # Check boxes to choose columns 52 | output$choose_columns_biplot <- renderUI({ 53 | 54 | the_data <- the_data_fn() 55 | 56 | colnames <- names(the_data) 57 | 58 | # Create the checkboxes and select them all by default 59 | checkboxGroupInput("columns_biplot", "Choose up to five columns to display on the scatterplot matrix", 60 | choices = colnames, 61 | selected = colnames[1:5]) 62 | }) 63 | 64 | # corr plot 65 | output$corr_plot <- renderPlot({ 66 | the_data <- the_data_fn() 67 | # Keep the selected columns 68 | columns_biplot <- input$columns_biplot 69 | the_data_subset_biplot <- the_data[, columns_biplot, drop = FALSE] 70 | ggpairs(the_data_subset_biplot) 71 | }) 72 | 73 | # corr tables 74 | output$corr_tables <- renderTable({ 75 | the_data <- the_data_fn() 76 | # we only want to show numeric cols 77 | the_data_num <- the_data[,sapply(the_data,is.numeric)] 78 | # exclude cols with zero variance 79 | the_data_num <- the_data_num[,!apply(the_data_num, MARGIN = 2, function(x) max(x, na.rm = TRUE) == min(x, na.rm = TRUE))] 80 | 81 | 82 | res <- Hmisc::rcorr(as.matrix(the_data_num)) 83 | cormat <- res$r 84 | pmat <- res$P 85 | ut <- upper.tri(cormat) 86 | df <- data.frame( 87 | row = rownames(cormat)[row(cormat)[ut]], 88 | column = rownames(cormat)[col(cormat)[ut]], 89 | cor = (cormat)[ut], 90 | p = pmat[ut] 91 | ) 92 | with(df, df[order(-cor), ]) 93 | 94 | }) 95 | 96 | output$bartlett <- renderPrint({ 97 | the_data <- the_data_fn() 98 | the_data_num <- na.omit(the_data[,sapply(the_data,is.numeric)]) 99 | # exclude cols with zero variance 100 | the_data_num <- the_data_num[,!apply(the_data_num, MARGIN = 2, function(x) max(x, na.rm = TRUE) == min(x, na.rm = TRUE))] 101 | 102 | cortest.bartlett(cor(the_data_num), n = nrow(the_data_num)) 103 | }) 104 | 105 | output$kmo <- renderPrint({ 106 | the_data <- the_data_fn() 107 | the_data_num <- the_data[,sapply(the_data,is.numeric)] 108 | # exclude cols with zero variance 109 | the_data_num <- the_data_num[,!apply(the_data_num, MARGIN = 2, function(x) max(x, na.rm = TRUE) == min(x, na.rm = TRUE))] 110 | 111 | # R <- cor(the_data_num) 112 | # KMO(R) 113 | 114 | # http://www.opensubscriber.com/message/r-help@stat.math.ethz.ch/7315408.html 115 | # KMO Kaiser-Meyer-Olkin Measure of Sampling Adequacy 116 | kmo = function( data ){ 117 | 118 | library(MASS) 119 | X <- cor(as.matrix(data)) 120 | iX <- ginv(X) 121 | S2 <- diag(diag((iX^-1))) 122 | AIS <- S2%*%iX%*%S2 # anti-image covariance matrix 123 | IS <- X+AIS-2*S2 # image covariance matrix 124 | Dai <- sqrt(diag(diag(AIS))) 125 | IR <- ginv(Dai)%*%IS%*%ginv(Dai) # image correlation matrix 126 | AIR <- ginv(Dai)%*%AIS%*%ginv(Dai) # anti-image correlation matrix 127 | a <- apply((AIR - diag(diag(AIR)))^2, 2, sum) 128 | AA <- sum(a) 129 | b <- apply((X - diag(nrow(X)))^2, 2, sum) 130 | BB <- sum(b) 131 | MSA <- b/(b+a) # indiv. measures of sampling adequacy 132 | 133 | AIR <- AIR-diag(nrow(AIR))+diag(MSA) # Examine the anti-image of the 134 | # correlation matrix. That is the 135 | # negative of the partial correlations, 136 | # partialling out all other variables. 137 | 138 | kmo <- BB/(AA+BB) # overall KMO statistic 139 | 140 | # Reporting the conclusion 141 | if (kmo >= 0.00 && kmo < 0.50){ 142 | test <- 'The KMO test yields a degree of common variance 143 | unacceptable for FA.' 144 | } else if (kmo >= 0.50 && kmo < 0.60){ 145 | test <- 'The KMO test yields a degree of common variance miserable.' 146 | } else if (kmo >= 0.60 && kmo < 0.70){ 147 | test <- 'The KMO test yields a degree of common variance mediocre.' 148 | } else if (kmo >= 0.70 && kmo < 0.80){ 149 | test <- 'The KMO test yields a degree of common variance middling.' 150 | } else if (kmo >= 0.80 && kmo < 0.90){ 151 | test <- 'The KMO test yields a degree of common variance meritorious.' 152 | } else { 153 | test <- 'The KMO test yields a degree of common variance marvelous.' 154 | } 155 | 156 | ans <- list( overall = kmo, 157 | report = test, 158 | individual = MSA, 159 | AIS = AIS, 160 | AIR = AIR ) 161 | return(ans) 162 | 163 | } # end of kmo() 164 | kmo(na.omit(the_data_num)) 165 | 166 | }) 167 | 168 | 169 | 170 | # Check boxes to choose columns 171 | output$choose_columns_pca <- renderUI({ 172 | 173 | the_data <- the_data_fn() 174 | 175 | # Get the data set with the appropriate name 176 | 177 | # we only want to show numeric cols 178 | the_data_num <- na.omit(the_data[,sapply(the_data,is.numeric)]) 179 | # exclude cols with zero variance 180 | the_data_num <- the_data_num[,!apply(the_data_num, MARGIN = 2, function(x) max(x, na.rm = TRUE) == min(x, na.rm = TRUE))] 181 | 182 | 183 | colnames <- names(the_data_num) 184 | 185 | # Create the checkboxes and select them all by default 186 | checkboxGroupInput("columns", "Choose columns", 187 | choices = colnames, 188 | selected = colnames) 189 | }) 190 | 191 | # choose a grouping variable 192 | output$the_grouping_variable <- renderUI({ 193 | the_data <- the_data_fn() 194 | 195 | 196 | # for grouping we want to see only cols where the number of unique values are less than 197 | # 10% the number of observations 198 | grouping_cols <- sapply(seq(1, ncol(the_data)), function(i) length(unique(the_data[,i])) < nrow(the_data)/10 ) 199 | 200 | the_data_group_cols <- the_data[, grouping_cols, drop = FALSE] 201 | # drop down selection 202 | selectInput(inputId = "the_grouping_variable", 203 | label = "Grouping variable:", 204 | choices=c("None", names(the_data_group_cols))) 205 | 206 | }) 207 | 208 | 209 | pca_objects <- reactive({ 210 | # Keep the selected columns 211 | columns <- input$columns 212 | the_data <- na.omit(the_data_fn()) 213 | the_data_subset <- na.omit(the_data[, columns, drop = FALSE]) 214 | 215 | # from http://rpubs.com/sinhrks/plot_pca 216 | pca_output <- prcomp(na.omit(the_data_subset), 217 | center = (input$center == 'Yes'), 218 | scale. = (input$scale. == 'Yes')) 219 | # data.frame of PCs 220 | pcs_df <- cbind(the_data, pca_output$x) 221 | 222 | return(list(the_data = the_data, 223 | the_data_subset = the_data_subset, 224 | pca_output = pca_output, 225 | pcs_df = pcs_df)) 226 | 227 | }) 228 | 229 | output$the_pcs_to_plot_x <- renderUI({ 230 | pca_output <- pca_objects()$pca_output$x 231 | 232 | # drop down selection 233 | selectInput(inputId = "the_pcs_to_plot_x", 234 | label = "X axis:", 235 | choices= colnames(pca_output), 236 | selected = 'PC1') 237 | }) 238 | 239 | output$the_pcs_to_plot_y <- renderUI({ 240 | pca_output <- pca_objects()$pca_output$x 241 | 242 | # drop down selection 243 | selectInput(inputId = "the_pcs_to_plot_y", 244 | label = "Y axis:", 245 | choices= colnames(pca_output), 246 | selected = 'PC2') 247 | }) 248 | 249 | 250 | 251 | output$plot2 <- renderPlot({ 252 | pca_output <- pca_objects()$pca_output 253 | eig = (pca_output$sdev)^2 254 | variance <- eig*100/sum(eig) 255 | cumvar <- paste(round(cumsum(variance),1), "%") 256 | eig_df <- data.frame(eig = eig, 257 | PCs = colnames(pca_output$x), 258 | cumvar = cumvar) 259 | ggplot(eig_df, aes(reorder(PCs, -eig), eig)) + 260 | geom_bar(stat = "identity", fill = "white", colour = "black") + 261 | geom_text(label = cumvar, size = 4, 262 | vjust=-0.4) + 263 | theme_bw(base_size = 14) + 264 | xlab("PC") + 265 | ylab("Variances") + 266 | ylim(0,(max(eig_df$eig) * 1.1)) 267 | }) 268 | 269 | 270 | # PC plot 271 | pca_biplot <- reactive({ 272 | pcs_df <- pca_objects()$pcs_df 273 | pca_output <- pca_objects()$pca_output 274 | 275 | var_expl_x <- round(100 * pca_output$sdev[as.numeric(gsub("[^0-9]", "", input$the_pcs_to_plot_x))]^2/sum(pca_output$sdev^2), 1) 276 | var_expl_y <- round(100 * pca_output$sdev[as.numeric(gsub("[^0-9]", "", input$the_pcs_to_plot_y))]^2/sum(pca_output$sdev^2), 1) 277 | labels <- rownames(pca_output$x) 278 | grouping <- input$the_grouping_variable 279 | 280 | if(grouping == 'None'){ 281 | # plot without grouping variable 282 | pc_plot_no_groups <- ggplot(pcs_df, 283 | aes_string(input$the_pcs_to_plot_x, 284 | input$the_pcs_to_plot_y 285 | )) + 286 | 287 | 288 | geom_text(aes(label = labels), size = 5) + 289 | theme_bw(base_size = 14) + 290 | coord_equal() + 291 | xlab(paste0(input$the_pcs_to_plot_x, " (", var_expl_x, "% explained variance)")) + 292 | ylab(paste0(input$the_pcs_to_plot_y, " (", var_expl_y, "% explained variance)")) 293 | # the plot 294 | pc_plot_no_groups 295 | 296 | 297 | } else { 298 | # plot with grouping variable 299 | 300 | pcs_df$fill_ <- as.character(pcs_df[, grouping, drop = TRUE]) 301 | pc_plot_groups <- ggplot(pcs_df, aes_string(input$the_pcs_to_plot_x, 302 | input$the_pcs_to_plot_y, 303 | fill = 'fill_', 304 | colour = 'fill_' 305 | )) + 306 | stat_ellipse(geom = "polygon", alpha = 0.1) + 307 | 308 | geom_text(aes(label = labels), size = 5) + 309 | theme_bw(base_size = 14) + 310 | scale_colour_discrete(guide = FALSE) + 311 | guides(fill = guide_legend(title = "groups")) + 312 | theme(legend.position="top") + 313 | coord_equal() + 314 | xlab(paste0(input$the_pcs_to_plot_x, " (", var_expl_x, "% explained variance)")) + 315 | ylab(paste0(input$the_pcs_to_plot_y, " (", var_expl_y, "% explained variance)")) 316 | # the plot 317 | pc_plot_groups 318 | } 319 | 320 | 321 | }) 322 | 323 | output$brush_info <- renderTable({ 324 | # the brushing function 325 | brushedPoints(pca_objects()$pcs_df, input$plot_brush) 326 | }) 327 | 328 | 329 | # for zooming 330 | output$z_plot1 <- renderPlot({ 331 | 332 | pca_biplot() 333 | 334 | }) 335 | 336 | # zoom ranges 337 | zooming <- reactiveValues(x = NULL, y = NULL) 338 | 339 | observe({ 340 | brush <- input$z_plot1Brush 341 | if (!is.null(brush)) { 342 | zooming$x <- c(brush$xmin, brush$xmax) 343 | zooming$y <- c(brush$ymin, brush$ymax) 344 | } 345 | else { 346 | zooming$x <- NULL 347 | zooming$y <- NULL 348 | } 349 | }) 350 | 351 | 352 | # for zooming 353 | output$z_plot2 <- renderPlot({ 354 | 355 | pca_biplot() + coord_cartesian(xlim = zooming$x, ylim = zooming$y) 356 | 357 | 358 | }) 359 | 360 | output$brush_info_after_zoom <- renderTable({ 361 | # the brushing function 362 | brushedPoints(pca_objects()$pcs_df, input$plot_brush_after_zoom) 363 | }) 364 | 365 | output$pca_details <- renderPrint({ 366 | # 367 | print(pca_objects()$pca_output$rotation) 368 | summary(pca_objects()$pca_output) 369 | 370 | }) 371 | 372 | output$Colophon <- renderPrint({ 373 | 374 | 375 | }) 376 | 377 | 378 | } --------------------------------------------------------------------------------