├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ └── R-CMD-check.yaml ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── R ├── get_join_cols.R ├── get_join_count.R ├── get_join_rowindex.R ├── get_join_rows_n.R └── utils-pipe.R ├── README.md ├── joincheckr.Rproj └── man ├── get_join_cols.Rd ├── get_join_count.Rd ├── get_join_rowindex.Rd ├── get_join_rows_n.Rd └── pipe.Rd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | License.md 4 | ^\.github$ 5 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. 
2 | # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | pull_request: 9 | branches: 10 | - main 11 | - master 12 | 13 | name: R-CMD-check 14 | 15 | jobs: 16 | R-CMD-check: 17 | runs-on: macOS-latest 18 | env: 19 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 20 | steps: 21 | - uses: actions/checkout@v2 22 | - uses: r-lib/actions/setup-r@v1 23 | - name: Install dependencies 24 | run: | 25 | install.packages(c("remotes", "rcmdcheck")) 26 | remotes::install_deps(dependencies = TRUE) 27 | shell: Rscript {0} 28 | - name: Check 29 | run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error") 30 | shell: Rscript {0} 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # User-specific files 9 | .Ruserdata 10 | 11 | # Example code in package build process 12 | *-Ex.R 13 | 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | 17 | # Output files from R CMD check 18 | /*.Rcheck/ 19 | 20 | # RStudio files 21 | .Rproj.user/ 22 | 23 | # produced vignettes 24 | vignettes/*.html 25 | vignettes/*.pdf 26 | 27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 28 | .httr-oauth 29 | 30 | # knitr and R markdown default cache directories 31 | *_cache/ 32 | /cache/ 33 | 34 | # Temporary files created by R markdown 35 | *.utf8.md 36 | *.knit.md 37 | 38 | # R Environment Variables 39 | .Renviron 40 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: joincheckr 2 | Type: Package 3 | Title: Get information for left join of two data frames 4 | Version: 0.1.0 5 | Author: Tobias Stalder 6 | Maintainer: Tobias 
Stalder 7 | Description: This package offers simple functions to get information 8 | on the potential result of a left join between two data frames. 9 | For example, calculate how many rows of x will have at 10 | least one join partner in y based on a common join ID. 11 | License: MIT + file LICENSE 12 | Encoding: UTF-8 13 | LazyData: true 14 | RoxygenNote: 7.1.1 15 | Imports: 16 | dplyr (>= 1.0.0.0), 17 | magrittr, 18 | rlang 19 | Suggests: 20 | tibble (>= 3.0.5) 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2021 2 | COPYRIGHT HOLDER: joincheckr authors: Tobias Stalder 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2021 joincheckr authors: Tobias Stalder 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export("%>%") 4 | export(get_join_cols) 5 | export(get_join_count) 6 | export(get_join_rowindex) 7 | export(get_join_rows_n) 8 | importFrom(magrittr,"%>%") 9 | importFrom(rlang,.data) 10 | -------------------------------------------------------------------------------- /R/get_join_cols.R: -------------------------------------------------------------------------------- 1 | #' cols that would be joined and their origin 2 | #' 3 | #' Lists the column names of both data frames in one long table and tags each name with the data frame it comes from. 4 | #' 5 | #' @param data_x data x 6 | #' @param data_y data y (will be joined on x) 7 | #' @return data frame with the columns cols (column name) and origin ("data_x" or "data_y"): one row per column of data_x, followed by one row per column of data_y 8 | #' @export 9 | #' @examples 10 | #' ## get_join_cols(df.x, df.y) 11 | 12 | # function that shows which (names)columns would be joined upon entering the datasets x and y 13 | get_join_cols <- function(data_x, data_y) { 14 | 15 | names_x <- colnames(data_x) 16 | names_y <- colnames(data_y) 17 | 18 | # name the result columns directly instead of renaming data.frame()'s auto-generated header through .data$ (deprecated inside dplyr::rename()); as.character() and rep() keep the constructor working when an input has no column names 19 | data.frame( 20 | cols = as.character(c(names_x, names_y)), 21 | origin = rep(c("data_x", "data_y"), times = c(length(names_x), length(names_y))), 22 | stringsAsFactors = FALSE 23 | ) 24 | 25 | } 26 | 
-------------------------------------------------------------------------------- /R/get_join_count.R: -------------------------------------------------------------------------------- 1 | #' Count Join ID in a dataset 2 | #' 3 | #' Groups the data by the join ID and counts the rows in each group. 4 | #' 5 | #' @param data_x data 6 | #' @param ID_x Join ID in dataset x (an expression such as df.x$ID, or a bare column name of data_x) 7 | #' @return tibble with one row per distinct ID and a column join_count holding how many times that ID appears in the dataset 8 | #' @export 9 | #' @importFrom rlang .data 10 | #' @examples 11 | #' ##get_join_count(df.x, df.x$ID) 12 | 13 | # function that generates a join count for each ID 14 | get_join_count <- function(data_x, ID_x) { 15 | 16 | data_x %>% 17 | dplyr::group_by({{ ID_x }}) %>% 18 | # name the count column right away (no rename through .data$ needed); .groups = "drop" makes the ungrouped result explicit 19 | dplyr::summarise(join_count = dplyr::n(), .groups = "drop") 20 | 21 | } 22 | -------------------------------------------------------------------------------- /R/get_join_rowindex.R: -------------------------------------------------------------------------------- 1 | #' col indicating join partner in y 2 | #' 3 | #' Adds a column join_index to data_x that is 1 where the ID of the row occurs in ID_y and 0 otherwise. 
4 | #' 5 | #' @param data_x data x that the join will be based on 6 | #' @param ID_x Join ID in dataset x (an expression such as df.x$ID, or a bare column name of data_x) 7 | #' @param ID_y Join ID in dataset y (e.g. df.y$ID) 8 | #' @return df x with new column join_index (1 = at least one match in ID_y, 0 = no match) 9 | #' @export 10 | #' @examples 11 | #' ##get_join_rowindex(df.x, df.x$ID, df.y$ID) 12 | 13 | 14 | # create new column that indicates if this row finds at least one match in ID of y 15 | get_join_rowindex <- function(data_x, ID_x, ID_y) { 16 | 17 | # {{ }} evaluates the caller's expressions, so a column of data_x that happens to be called ID_x or ID_y cannot shadow the arguments 18 | # as.numeric() keeps the 0/1 column numeric even when data_x has no rows (ifelse() returns logical for a zero-length test) 19 | data_x %>% 20 | dplyr::mutate(join_index = as.numeric({{ ID_x }} %in% {{ ID_y }})) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /R/get_join_rows_n.R: -------------------------------------------------------------------------------- 1 | #' count of rows with join 2 | #' 3 | #' @param data_x data x that the join will be based on 4 | #' @param ID_x Join ID in dataset x 5 | #' @param 
ID_y Join ID in dataset y 6 | #' @return tibble grouped by join_index with the number of rows of x per value in column n (join_index 1 = ID found in ID_y, 0 = not found) 7 | #' @export 8 | #' @examples 9 | #' ##get_join_rows_n(df.x, df.x$ID, df.y$ID) 10 | 11 | 12 | 13 | # function that summarises how many rows of the total rows of x will have a join match based on entered IDs of x and y 14 | get_join_rows_n <- function(data_x, ID_x, ID_y) { 15 | 16 | # flag the rows of x whose ID occurs in y: {{ }} evaluates the caller's expressions, so a column of data_x called ID_x or ID_y cannot shadow the arguments, and as.numeric() keeps the flag numeric even for zero rows 17 | # then count the rows per flag value; count() on grouped data keeps the grouping by join_index 18 | data_x %>% 19 | dplyr::mutate(join_index = as.numeric({{ ID_x }} %in% {{ ID_y }})) %>% 20 | dplyr::group_by(.data$join_index) %>% 21 | dplyr::count() 22 | 23 | } 24 | -------------------------------------------------------------------------------- /R/utils-pipe.R: -------------------------------------------------------------------------------- 1 | #' Pipe operator 2 | #' 3 | #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 4 | #' 5 | #' @name %>% 6 | #' @rdname pipe 7 | #' @keywords internal 8 | #' @export 9 | #' @importFrom magrittr %>% 10 | #' @usage lhs \%>\% rhs 11 | NULL 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![R-CMD-check](https://github.com/toebR/joincheckr/workflows/R-CMD-check/badge.svg)](https://github.com/toebR/joincheckr/actions) 3 | 4 | 5 | # joincheckr 6 | An R package with functions to get information on a left join between 2 data frames. 7 | Feel free to create pull requests, open issues etc. Contributions are very welcome. 8 | If you are unfamiliar with merging/joining datasets, please visit [this blog](https://thomasadventure.blog/posts/r-merging-datasets/) by thomas neitmann on this topic: 9 | 10 | 11 | 12 | 13 | 14 | 15 | ## Introduction 16 | I often find myself in the situation where I need to join (well, left-join) two data sets which are messy and/or too large for any visual control. 
17 | Some months ago, I started to write small (and honestly, very simple) helper functions to get information on a possible join before I actually join the data. 18 | In this package, I share some of them which might prove useful to others. This is most of all a learning project for writing transferable functions and wrapping them in a documented package, so don't expect too much. 19 | 20 | ## Installation 21 | The package is not yet on CRAN so you can install it from github and load it into your session by running the following line: 22 | 23 | ```r 24 | remotes::install_github("toebR/joincheckr") 25 | library(joincheckr) 26 | ``` 27 | 28 | ## Functions 29 | Regarding nomenclature, the function arguments are distinguished between "data" and "ID". A request for "data_x" is a request for your first data frame i.e. the data frame the join will be based on. "data_y" is subsequently the other. Same principle goes for the "ID" where we enter the common join ID of df x or y. **All functions should be fully pipeable with tidyverse functions**. 30 | 31 | Before we get to the functions, let's prepare 2 example datasets: data1 and data2. 32 | 33 | ```r 34 | datagen <- function(){ 35 | data1 <<- data.frame(ID = c(1,1, 2, 3, 4), 36 | GroupX = c("A", "A", "A", "B", "C"), 37 | VarX = c(1, 10,100, 1000, 10000)) 38 | 39 | data2 <<- data.frame(ID = c(1, 2, 2, 3, 5), 40 | GroupY = c("A", "B", "B", "C", "D"), 41 | VarY = c(2, 20,200, 2000, 20000)) 42 | 43 | 44 | } 45 | datagen() 46 | ``` 47 | Which assigns data1 and data2 to the global environment. 48 | data1: 49 | ```r 50 | > data1 51 | ID GroupX VarX 52 | 1 1 A 1 53 | 2 1 A 10 54 | 3 2 A 100 55 | 4 3 B 1000 56 | 5 4 C 10000 57 | ``` 58 | data2: 59 | ```r 60 | > data2 61 | ID GroupY VarY 62 | 1 1 A 2 63 | 2 2 B 20 64 | 3 2 B 200 65 | 4 3 C 2000 66 | 5 5 D 20000 67 | ``` 68 | 69 | ### get_join_cols() 70 | This function returns a dataframe where columns of two entered dataframes are listed which would be subject to a join. 
71 | 72 | ```r 73 | get_join_cols(data_x = data1, data_y = data2) 74 | ``` 75 | ```r 76 | cols origin 77 | 1 ID data_x 78 | 2 GroupX data_x 79 | 3 VarX data_x 80 | 4 ID data_y 81 | 5 GroupY data_y 82 | 6 VarY data_y 83 | ``` 84 | ### get_join_count() 85 | This function counts how many times an ID is present in a data frame and returns a tibble. 86 | **Note: When passing a column as ID_x, make sure to use the $ operator** 87 | 88 | ```r 89 | > get_join_count(data_x = data1, ID_x = data1$ID) 90 | ``` 91 | ```r 92 | `data1$ID` join_count 93 | 94 | 1 1 2 95 | 2 2 1 96 | 3 3 1 97 | 4 4 1 98 | ``` 99 | 100 | ### get_join_rows_n() 101 | This function returns a tibble showing how many rows of df x have at least one join partner in df y by providing the join ID columns of x and y. 102 | 0 indicates no join, 1 indicates join. 103 | ```r 104 | get_join_rows_n(data_x = data1, ID_x = data1$ID, ID_y = data2$ID) 105 | ``` 106 | ```r 107 | join_index n 108 | 109 | 1 0 1 110 | 2 1 4 111 | ``` 112 | With our example data, we see that out of five rows in data1, 4 rows have a join partner in data2 based on the column "ID". 113 | 114 | ### get_join_rowindex() 115 | This function adds a new column to the first df we enter into the function, which indicates whether the row has at least one join partner in the second df. 0 indicates no join, 1 indicates join. 
116 | ```r 117 | get_join_rowindex(data_x = data1, ID_x = data1$ID, ID_y = data2$ID) 118 | ``` 119 | ```r 120 | ID GroupX VarX join_index 121 | 1 1 A 1 1 122 | 2 1 A 10 1 123 | 3 2 A 100 1 124 | 4 3 B 1000 1 125 | 5 4 C 10000 0 126 | ``` 127 | ## Examples 128 | ### 1) Get all rows out of a dataframe which would have a join partner in another 129 | ```r 130 | library(dplyr) 131 | 132 | get_join_rowindex(data1, data1$ID, data2$ID) %>% 133 | filter(join_index == 1) #added filter function 134 | ``` 135 | Result: 136 | ```r 137 | ID GroupX VarX join_index 138 | 1 1 A 1 1 139 | 2 1 A 10 1 140 | 3 2 A 100 1 141 | 4 3 B 1000 1 142 | ``` 143 | 144 | ### 2) Get nr of ID entries before and after a join 145 | Sometimes, an ID is appearing multiple times in a dataframe. If both dataframes have multiple ID entries, a single ID in df 1 might multiply after the **left** join with df 2 since different rowdata on the same id in df 2 will be joined back to a single entry in df 1. To have a bit of control over this, we can use **get_join_count()** before and after a join to see if the entries multiplied. 146 | 147 | First we run the function on our data1: 148 | ```r 149 | get_join_count(data_x = data1, ID_x = data1$ID) 150 | ``` 151 | ```r 152 | `data1$ID` join_count 153 | 154 | 1 1 2 155 | 2 2 1 156 | 3 3 1 157 | 4 4 1 158 | ``` 159 | This result shows us, that we have 2 entries for ID 1 in our data1. 160 | 161 | Now we perform a dplyr::left_join() between data1 and data2 and then run the function again on the result 162 | ```r 163 | left_join(data1, data2, by = "ID") %>% 164 | get_join_count(ID_x = ID) 165 | ``` 166 | Which yields the following result. 167 | ```r 168 | ID join_count 169 | 170 | 1 1 2 171 | 2 2 2 172 | 3 3 1 173 | 4 4 1 174 | ``` 175 | As we see, we do not only have two entries for ID 1 but now also for ID 2! This means that in data2, we have 2 entries with ID = 2 which are joined to one entry of ID = 2 in data1 consequently leading to 2 entries. 
Let's check this on data2: 176 | ```r 177 | > data2 178 | ID GroupY VarY 179 | 1 1 A 2 180 | 2 2 B 20 181 | 3 2 B 200 182 | 4 3 C 2000 183 | 5 5 D 20000 184 | ``` 185 | The assumption was correct since we see a double entry of ID = 2 but with different variables in the other columns. 186 | 187 | ## Acknowledgements 188 | A big thank you to Joshua de la Bruere (Twitter: @delaBJL) for the advice on how to write pipeable functions and help de-bugging the package! 189 | -------------------------------------------------------------------------------- /joincheckr.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | -------------------------------------------------------------------------------- /man/get_join_cols.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_join_cols.R 3 | \name{get_join_cols} 4 | \alias{get_join_cols} 5 | \title{cols that would be joined and their origin} 6 | \usage{ 7 | get_join_cols(data_x, data_y) 8 | } 9 | \arguments{ 10 | \item{data_x}{data x} 11 | 12 | \item{data_y}{data y (will be joined on x)} 13 | } 14 | \value{ 15 | df with all columns of a join and their origin 16 | } 17 | \description{ 18 | cols that would be joined and their origin 19 | } 20 | \examples{ 21 | ## get_join_cols(df.x, df.y) 22 | } 23 | -------------------------------------------------------------------------------- /man/get_join_count.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_join_count.R 3 | \name{get_join_count} 4 | \alias{get_join_count} 5 | \title{Count Join ID in a dataset} 6 | \usage{ 7 | get_join_count(data_x, ID_x) 8 | } 9 | \arguments{ 10 | \item{data_x}{data} 11 | 12 | \item{ID_x}{Join ID in dataset x} 13 | } 14 | \value{ 15 | df with how many times an ID appears in a dataset 16 | } 17 | \description{ 18 | Count Join ID in a dataset 19 | } 20 | \examples{ 21 | ##get_join_count(df.x, df.x$ID) 22 | } 23 | -------------------------------------------------------------------------------- /man/get_join_rowindex.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_join_rowindex.R 3 | \name{get_join_rowindex} 4 | \alias{get_join_rowindex} 5 | \title{col indicating join partner in y} 6 | \usage{ 7 | get_join_rowindex(data_x, ID_x, ID_y) 8 | } 9 | \arguments{ 10 | \item{data_x}{data x that the join will be based on} 11 | 12 | \item{ID_x}{Join ID in dataset x} 13 | 14 | \item{ID_y}{Join ID in dataset y} 15 | } 16 | \value{ 17 | df x with new column join_index (0 = FALSE) 18 | } 19 | \description{ 20 | col indicating join partner in y 21 | } 22 | \examples{ 23 | ##get_join_rowindex(df.x, df.x$ID, df.y$ID) 24 | } 25 | -------------------------------------------------------------------------------- /man/get_join_rows_n.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_join_rows_n.R 3 | \name{get_join_rows_n} 4 | \alias{get_join_rows_n} 5 | \title{count of rows with join} 6 | \usage{ 7 | get_join_rows_n(data_x, ID_x, ID_y) 8 | } 9 | \arguments{ 10 | \item{data_x}{data x that the join will be based on} 11 | 12 | \item{ID_x}{Join ID 
in dataset x} 13 | 14 | \item{ID_y}{Join ID in dataset y} 15 | } 16 | \value{ 17 | dataframe of n rows that have a join match in x 18 | } 19 | \description{ 20 | count of rows with join 21 | } 22 | \examples{ 23 | ##get_join_rows_n(df.x, df.x$ID, df.y$ID) 24 | } 25 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-pipe.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{Pipe operator} 6 | \usage{ 7 | lhs \%>\% rhs 8 | } 9 | \description{ 10 | See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 11 | } 12 | \keyword{internal} 13 | --------------------------------------------------------------------------------