├── .Rbuildignore
├── .github
    ├── .gitignore
    └── workflows
    │   └── R-CMD-check.yaml
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── R
    ├── get_join_cols.R
    ├── get_join_count.R
    ├── get_join_rowindex.R
    ├── get_join_rows_n.R
    └── utils-pipe.R
├── README.md
├── joincheckr.Rproj
└── man
    ├── get_join_cols.Rd
    ├── get_join_count.Rd
    ├── get_join_rowindex.Rd
    ├── get_join_rows_n.Rd
    └── pipe.Rd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^LICENSE\.md$
4 | ^\.github$
5 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
 1 | # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag.
 2 | # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions
 3 | on:  # events that trigger this workflow
 4 |   push:
 5 |     branches:  # pushes to either of these branch names
 6 |       - main
 7 |       - master
 8 |   pull_request:
 9 |     branches:  # pull requests targeting either of these branch names
10 |       - main
11 |       - master
12 | 
13 | name: R-CMD-check
14 | 
15 | jobs:
16 |   R-CMD-check:
17 |     runs-on: macOS-latest  # the check runs on a single macOS runner only
18 |     env:
19 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}  # workflow token exposed as GITHUB_PAT; presumably used for GitHub API calls during dependency installation - confirm
20 |     steps:
21 |       - uses: actions/checkout@v2  # check out the repository
22 |       - uses: r-lib/actions/setup-r@v1  # set up R on the runner
23 |       - name: Install dependencies  # installs remotes and rcmdcheck, then the package's dependencies
24 |         run: |
25 |           install.packages(c("remotes", "rcmdcheck"))
26 |           remotes::install_deps(dependencies = TRUE)
27 |         shell: Rscript {0}  # the step's script is run with Rscript
28 |       - name: Check  # runs rcmdcheck with --no-manual and error_on = "error"
29 |         run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error")
30 |         shell: Rscript {0}
31 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Session Data files
 6 | .RData
 7 | 
 8 | # User-specific files
 9 | .Ruserdata
10 | 
11 | # Example code in package build process
12 | *-Ex.R
13 | 
14 | # Output files from R CMD build
15 | /*.tar.gz
16 | 
17 | # Output files from R CMD check
18 | /*.Rcheck/
19 | 
20 | # RStudio files
21 | .Rproj.user/
22 | 
23 | # produced vignettes
24 | vignettes/*.html
25 | vignettes/*.pdf
26 | 
27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
28 | .httr-oauth
29 | 
30 | # knitr and R markdown default cache directories
31 | *_cache/
32 | /cache/
33 | 
34 | # Temporary files created by R markdown
35 | *.utf8.md
36 | *.knit.md
37 | 
38 | # R Environment Variables
39 | .Renviron
40 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: joincheckr
 2 | Type: Package
 3 | Title: Get information for left join of two data frames
 4 | Version: 0.1.0
 5 | Author: Tobias Stalder
 6 | Maintainer: Tobias Stalder <tobias.stalder.geo@outlook.com>
 7 | Description: This package offers simple functions to get information
 8 |     on the potential result of a left join between two data frames.
 9 | 	For example, calculate how many rows of x will have at
10 | 	least one join partner in y based on a common join ID.
11 | License: MIT + file LICENSE
12 | Encoding: UTF-8
13 | LazyData: true
14 | RoxygenNote: 7.1.1
15 | Imports:
16 |     dplyr (>= 1.0.0),
17 |     magrittr, 
18 |     rlang
19 | Suggests:
20 |     tibble (>= 3.0.5)
21 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2021
2 | COPYRIGHT HOLDER: joincheckr authors: Tobias Stalder
3 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2021 joincheckr authors: Tobias Stalder
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export("%>%")
 4 | export(get_join_cols)
 5 | export(get_join_count)
 6 | export(get_join_rowindex)
 7 | export(get_join_rows_n)
 8 | importFrom(magrittr,"%>%")
 9 | importFrom(rlang,.data)
10 | 


--------------------------------------------------------------------------------
/R/get_join_cols.R:
--------------------------------------------------------------------------------
 1 | #' cols that would be joined and their origin
 2 | #'
 3 | #' @param data_x data x
 4 | #' @param data_y data y (will be joined on x)
 5 | #' @return df with all columns of a join and their origin
 6 | #' @export
 7 | #' @examples
 8 | #' ## get_join_cols(df.x, df.y)
 9 | 
10 | # List the column names of data_x and data_y with their origin, i.e. all
11 | # columns that would take part in a join of the two data sets.
12 | get_join_cols <- function(data_x, data_y) {
13 | 
14 |   # Built directly with named columns: no reliance on the auto-generated
15 |   # name `colnames.data_x.` nor on `.data` inside dplyr::rename(), which
16 |   # tidyselect has deprecated.
17 |   list_cols <- function(data, origin) {
18 |     col_names <- as.character(colnames(data))
19 |     data.frame(cols = col_names,
20 |                origin = rep(origin, length(col_names)),
21 |                stringsAsFactors = FALSE)
22 |   }
23 | 
24 |   rbind(list_cols(data_x, "data_x"), list_cols(data_y, "data_y"))
25 | }
26 | 


--------------------------------------------------------------------------------
/R/get_join_count.R:
--------------------------------------------------------------------------------
 1 | #' Count Join ID in a dataset
 2 | #'
 3 | #' @param data_x data
 4 | #' @param ID_x Join ID in dataset x
 5 | #' @return df with how many times an ID appears in a dataset
 6 | #' @export
 7 | #' @importFrom rlang .data
 8 | #' @examples
 9 | #' ##get_join_count(df.x, df.x$ID)
10 | 
11 | 
12 | 
13 | # Count how often each value of the join ID occurs in data_x.
14 | # ID_x may be a bare column name or a vector such as data_x$ID; it is
15 | # forwarded to dplyr::group_by() via the embrace operator.
16 | get_join_count <- function(data_x, ID_x) {
17 | 
18 |   # Name the count column in summarise() directly: this avoids the
19 |   # deprecated `.data$n` selection inside dplyr::rename().
20 |   data_x %>%
21 |     dplyr::group_by({{ ID_x }}) %>%
22 |     dplyr::summarise(join_count = dplyr::n())
23 | 
24 | }
25 | 
26 | 


--------------------------------------------------------------------------------
/R/get_join_rowindex.R:
--------------------------------------------------------------------------------
 1 | #' col indicating join partner in y
 2 | #'
 3 | #' @param data_x data x that the join will be based on
 4 | #' @param ID_x Join ID in dataset x
 5 | #' @param ID_y Join ID in dataset y
 6 | #' @return df x with new column join_index (0 = FALSE)
 7 | #' @export
 8 | #' @examples
 9 | #' ##get_join_rowindex(df.x, df.x$ID, df.y$ID)
10 | 
11 | 
12 | # Add a column join_index to data_x: 1 if the row's ID occurs in ID_y, else 0.
13 | # ID_x / ID_y are embraced, so bare column names of data_x work as well as
14 | # vectors such as df.x$ID; as.numeric() keeps the column numeric where
15 | # ifelse() would return logical(0) for a data frame without rows.
16 | get_join_rowindex <- function(data_x, ID_x, ID_y) {
17 | 
18 |   data_x %>%
19 |     dplyr::mutate(join_index = as.numeric({{ ID_x }} %in% {{ ID_y }}))
20 | }
21 | 


--------------------------------------------------------------------------------
/R/get_join_rows_n.R:
--------------------------------------------------------------------------------
 1 | #' count of rows with join
 2 | #'
 3 | #' @param data_x data x that the join will be based on
 4 | #' @param ID_x Join ID in dataset x
 5 | #' @param ID_y Join ID in dataset y
 6 | #' @return dataframe of n rows that have a join match in x
 7 | #' @export
 8 | #' @examples
 9 | #' ##get_join_rows_n(df.x, df.x$ID, df.y$ID)
10 | 
11 | 
12 | 
13 | # Summarise how many rows of data_x have (join_index = 1) or lack
14 | # (join_index = 0) at least one join partner among the IDs of y.
15 | get_join_rows_n <- function(data_x, ID_x, ID_y) {
16 | 
17 |   # ID_x / ID_y are embraced so that bare column names of data_x work as
18 |   # well as vectors such as df.x$ID. as.numeric() keeps join_index numeric
19 |   # even when data_x has no rows, where ifelse() would give logical(0).
20 |   flagged <- data_x %>%
21 |     dplyr::mutate(join_index = as.numeric({{ ID_x }} %in% {{ ID_y }}))
22 | 
23 |   # group_by() + count() is kept so the shape of the result (one row per
24 |   # join_index value with its count n) is unchanged.
25 |   flagged %>%
26 |     dplyr::group_by(.data$join_index) %>%
27 |     dplyr::count()
28 | 
29 | }
30 | 


--------------------------------------------------------------------------------
/R/utils-pipe.R:
--------------------------------------------------------------------------------
 1 | #' Pipe operator
 2 | #'
 3 | #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
 4 | #'
 5 | #' @name %>%
 6 | #' @rdname pipe
 7 | #' @keywords internal
 8 | #' @export
 9 | #' @importFrom magrittr %>%
10 | #' @usage lhs \%>\% rhs
11 | NULL
12 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 |  <!-- badges: start -->
  2 |   [![R-CMD-check](https://github.com/toebR/joincheckr/workflows/R-CMD-check/badge.svg)](https://github.com/toebR/joincheckr/actions)
  3 |   <!-- badges: end -->
  4 | 
  5 | # joincheckr
  6 | An R package with functions to get information on a left join between 2 data frames.
  7 | Feel free to create pull requests, open issues etc. Contributions are very welcome.
  8 | If you are unfamiliar with merging/joining datasets, please read [this blog post](https://thomasadventure.blog/posts/r-merging-datasets/) by Thomas Neitmann on the topic.
  9 | 
 10 | 
 11 | 
 12 | 
 13 | <img src="https://user-images.githubusercontent.com/65813696/105766734-63bda380-5f5a-11eb-94bb-6f78f021e805.png" width="400" height="461" align = "right" />
 14 | 
 15 | ## Introduction
 16 | I often find myself in the situation where I need to join (well, left-join) two data sets which are messy and/or too large for any visual control.
 17 | Some months ago, I started to write small (and honestly, very simple) helper functions to get information on a possible join before I actually join the data.
 18 | In this package, I share some of them which might prove useful to others. This is most of all a learning project for writing transferable functions and wrapping them in a documented package, so don't expect too much.
 19 | 
 20 | ## Installation
 21 | The package is not yet on CRAN, so you can install it from GitHub and load it into your session by running the following lines:
 22 | 
 23 | ```r
 24 | remotes::install_github("toebR/joincheckr")
 25 | library(joincheckr)
 26 | ```
 27 | 
 28 | ## Functions
 29 | Regarding nomenclature, the function arguments are divided into "data" and "ID" arguments. "data_x" is your first data frame, i.e. the data frame the join will be based on; "data_y" is the other one. The same principle applies to the "ID" arguments, where we enter the common join ID of df x or y. **All functions should be fully pipeable with tidyverse functions**.
 30 | 
 31 | Before we get to the functions, let's prepare two example datasets: data1 and data2.
 32 | 
 33 | ```r
 34 | datagen <- function(){
 35 |   data1 <<- data.frame(ID = c(1,1, 2, 3, 4),
 36 |                        GroupX = c("A", "A", "A", "B", "C"),
 37 |                        VarX = c(1, 10,100, 1000, 10000))
 38 |   
 39 |   data2 <<- data.frame(ID = c(1, 2, 2, 3, 5),
 40 |                        GroupY = c("A", "B", "B", "C", "D"),
 41 |                        VarY = c(2, 20,200, 2000, 20000))
 42 |   
 43 |   
 44 | }
 45 | datagen()
 46 | ```
 47 | Which assigns data1 and data2 to the global environment.
 48 | data1:
 49 | ```r
 50 | > data1
 51 |   ID GroupX  VarX
 52 | 1  1      A     1
 53 | 2  1      A    10
 54 | 3  2      A   100
 55 | 4  3      B  1000
 56 | 5  4      C 10000
 57 | ```
 58 | data2:
 59 | ```r
 60 | > data2
 61 |   ID GroupY  VarY
 62 | 1  1      A     2
 63 | 2  2      B    20
 64 | 3  2      B   200
 65 | 4  3      C  2000
 66 | 5  5      D 20000
 67 | ```
 68 | 
 69 | ### get_join_cols()
 70 | This function returns a data frame listing the columns of both input data frames that would be part of a join, together with the data frame each column comes from.
 71 | 
 72 | ```r
 73 | get_join_cols(data_x = data1, data_y = data2)
 74 | ```
 75 | ```r
 76 |     cols origin
 77 | 1     ID data_x
 78 | 2 GroupX data_x
 79 | 3   VarX data_x
 80 | 4     ID data_y
 81 | 5 GroupY data_y
 82 | 6   VarY data_y
 83 | ```
 84 | ### get_join_count()
 85 | This function counts how many times an ID is present in a data frame and returns a tibble.
 86 | **Note: ID_x can be passed with the $ operator (as below) or as a bare column name, e.g. when piping (see Example 2).**
 87 | 
 88 | ```r
 89 | > get_join_count(data_x = data1, ID_x = data1$ID)
 90 | ```
 91 | ```r
 92 |   `data1$ID` join_count
 93 |        <dbl>      <int>
 94 | 1          1          2
 95 | 2          2          1
 96 | 3          3          1
 97 | 4          4          1
 98 | ```
 99 | 
100 | ### get_join_rows_n()
101 | This function returns a tibble showing how many rows of df x have at least one join partner in df y by providing the join ID columns of x and y.
102 | 0 indicates no join, 1 indicates join.
103 | ```r
104 | get_join_rows_n(data_x = data1, ID_x = data1$ID, ID_y = data2$ID)
105 | ```
106 | ```r
107 |   join_index     n
108 |        <dbl> <int>
109 | 1          0     1
110 | 2          1     4
111 | ```
112 | With our example data, we see that out of five rows in data1, 4 rows have a join partner in data2 based on the column "ID".
113 | 
114 | ### get_join_rowindex()
115 | This function adds a new column to the first df we enter into the function, indicating whether each row has at least one join partner in df 2. 0 indicates no join, 1 indicates join.
116 | ```r
117 | get_join_rowindex(data_x = data1, ID_x = data1$ID, ID_y = data2$ID)
118 | ```
119 | ```r
120 |   ID GroupX  VarX join_index
121 | 1  1      A     1          1
122 | 2  1      A    10          1
123 | 3  2      A   100          1
124 | 4  3      B  1000          1
125 | 5  4      C 10000          0
126 | ```
127 | ## Examples
128 | ### 1) Get all rows out of a dataframe which would have a join partner in another
129 | ```r
130 | library(dplyr)
131 | 
132 | get_join_rowindex(data1, data1$ID, data2$ID) %>%
133 |   filter(join_index == 1) #added filter function
134 | ```
135 | Result:
136 | ```r
137 |   ID GroupX VarX join_index
138 | 1  1      A    1          1
139 | 2  1      A   10          1
140 | 3  2      A  100          1
141 | 4  3      B 1000          1
142 | ```
143 | 
144 | ### 2) Get nr of ID entries before and after a join
145 | Sometimes, an ID appears multiple times in a data frame. If both data frames contain repeated IDs, a single row of df 1 can be multiplied by the **left** join with df 2, because every row of df 2 with the same ID is joined to that row of df 1. To keep some control over this, we can use **get_join_count()** before and after a join to see whether the entries multiplied.
146 | 
147 | First we run the function on our data1:
148 | ```r
149 | get_join_count(data_x =  data1, ID_x = data1$ID)
150 | ```
151 | ```r
152 |   `data1$ID` join_count
153 |        <dbl>      <int>
154 | 1          1          2
155 | 2          2          1
156 | 3          3          1
157 | 4          4          1
158 | ```
159 | This result shows us that we have 2 entries for ID 1 in our data1.
160 | 
161 | Now we perform a dplyr::left_join() between data1 and data2 and then run the function again on the result
162 | ```r
163 | left_join(data1, data2, by = "ID") %>%
164 |   get_join_count(ID_x = ID)
165 | ```
166 | Which yields the following result.
167 | ```r
168 |      ID join_count
169 |   <dbl>      <int>
170 | 1     1          2
171 | 2     2          2
172 | 3     3          1
173 | 4     4          1
174 | ```
175 | As we see, we now have two entries not only for ID 1 but also for ID 2! This means that data2 contains 2 entries with ID = 2, which are both joined to the single entry with ID = 2 in data1, resulting in 2 entries. Let's check this in data2:
176 | ```r
177 | > data2
178 |   ID GroupY  VarY
179 | 1  1      A     2
180 | 2  2      B    20
181 | 3  2      B   200
182 | 4  3      C  2000
183 | 5  5      D 20000
184 | ```
185 | The assumption was correct: ID = 2 appears twice in data2, with different values in the other columns.
186 | 
187 | ## Acknowledgements
188 | A big thank you to Joshua de la Bruere (Twitter: @delaBJL) for the advice on how to write pipeable functions and help de-bugging the package!
189 | 


--------------------------------------------------------------------------------
/joincheckr.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | 


--------------------------------------------------------------------------------
/man/get_join_cols.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_join_cols.R
 3 | \name{get_join_cols}
 4 | \alias{get_join_cols}
 5 | \title{cols that would be joined and their origin}
 6 | \usage{
 7 | get_join_cols(data_x, data_y)
 8 | }
 9 | \arguments{
10 | \item{data_x}{data x}
11 | 
12 | \item{data_y}{data y (will be joined on x)}
13 | }
14 | \value{
15 | df with all columns of a join and their origin
16 | }
17 | \description{
18 | cols that would be joined and their origin
19 | }
20 | \examples{
21 | ## get_join_cols(df.x, df.y)
22 | }
23 | 


--------------------------------------------------------------------------------
/man/get_join_count.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_join_count.R
 3 | \name{get_join_count}
 4 | \alias{get_join_count}
 5 | \title{Count Join ID in a dataset}
 6 | \usage{
 7 | get_join_count(data_x, ID_x)
 8 | }
 9 | \arguments{
10 | \item{data_x}{data}
11 | 
12 | \item{ID_x}{Join ID in dataset x}
13 | }
14 | \value{
15 | df with how many times an ID appears in a dataset
16 | }
17 | \description{
18 | Count Join ID in a dataset
19 | }
20 | \examples{
21 | ##get_join_count(df.x, df.x$ID)
22 | }
23 | 


--------------------------------------------------------------------------------
/man/get_join_rowindex.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_join_rowindex.R
 3 | \name{get_join_rowindex}
 4 | \alias{get_join_rowindex}
 5 | \title{col indicating join partner in y}
 6 | \usage{
 7 | get_join_rowindex(data_x, ID_x, ID_y)
 8 | }
 9 | \arguments{
10 | \item{data_x}{data x that the join will be based on}
11 | 
12 | \item{ID_x}{Join ID in dataset x}
13 | 
14 | \item{ID_y}{Join ID in dataset y}
15 | }
16 | \value{
17 | df x with new column join_index (0 = FALSE)
18 | }
19 | \description{
20 | col indicating join partner in y
21 | }
22 | \examples{
23 | ##get_join_rowindex(df.x, df.x$ID, df.y$ID)
24 | }
25 | 


--------------------------------------------------------------------------------
/man/get_join_rows_n.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_join_rows_n.R
 3 | \name{get_join_rows_n}
 4 | \alias{get_join_rows_n}
 5 | \title{count of rows with join}
 6 | \usage{
 7 | get_join_rows_n(data_x, ID_x, ID_y)
 8 | }
 9 | \arguments{
10 | \item{data_x}{data x that the join will be based on}
11 | 
12 | \item{ID_x}{Join ID in dataset x}
13 | 
14 | \item{ID_y}{Join ID in dataset y}
15 | }
16 | \value{
17 | dataframe of n rows that have a join match in x
18 | }
19 | \description{
20 | count of rows with join
21 | }
22 | \examples{
23 | ##get_join_rows_n(df.x, df.x$ID, df.y$ID)
24 | }
25 | 


--------------------------------------------------------------------------------
/man/pipe.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils-pipe.R
 3 | \name{\%>\%}
 4 | \alias{\%>\%}
 5 | \title{Pipe operator}
 6 | \usage{
 7 | lhs \%>\% rhs
 8 | }
 9 | \description{
10 | See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
11 | }
12 | \keyword{internal}
13 | 


--------------------------------------------------------------------------------