├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── check-standard.yaml │ ├── lint.yaml │ ├── pkgdown.yaml │ ├── pr-commands.yaml │ └── test-coverage.yaml ├── .gitignore ├── .lintr ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NEWS.md ├── R ├── dupree.R ├── dupree_classes.R ├── dupree_code_enumeration.R ├── dupree_data_validity.R ├── dups-class.R └── utils.R ├── README.Rmd ├── README.md ├── TODO.md ├── _pkgdown.yml ├── codecov.yml ├── cran-comments.md ├── docs ├── 404.html ├── LICENSE-text.html ├── TODO.html ├── _config.yml ├── authors.html ├── bootstrap-toc.css ├── bootstrap-toc.js ├── docsearch.css ├── docsearch.js ├── index.html ├── link.svg ├── news │ └── index.html ├── pkgdown.css ├── pkgdown.js ├── pkgdown.yml ├── reference │ ├── EnumeratedCodeTable-class.html │ ├── Rplot001.png │ ├── as.data.frame.dups.html │ ├── as_tibble.dups.html │ ├── dupree.html │ ├── dupree_dir.html │ ├── dupree_package.html │ ├── index.html │ ├── print.dups.html │ └── reexports.html └── sitemap.xml ├── dupree.Rproj ├── inst └── extdata │ └── duplicated.R ├── man ├── EnumeratedCodeTable-class.Rd ├── as.data.frame.dups.Rd ├── as_tibble.dups.Rd ├── dupree.Rd ├── dupree_dir.Rd ├── dupree_package.Rd ├── print.dups.Rd └── reexports.Rd ├── presentations ├── cleanish_code.Rmd └── duplication_heavy.R └── tests ├── testthat.R └── testthat ├── helper.R ├── test-dupree_dir_integration.R ├── test-dupree_integration.R ├── test-dupree_package_integration.R ├── test-dups-class.R ├── test_dupree.R ├── test_dupree_classes.R ├── test_dupree_code_enumeration.R └── testdata ├── anRpackage ├── DESCRIPTION ├── NAMESPACE ├── R │ └── anRpackage-internal.R ├── Read-and-delete-me ├── data │ └── ok.rda ├── inst │ └── dir1 │ │ └── R │ │ └── dont_dup_me.R └── man │ ├── anRpackage-package.Rd │ └── ok.Rd ├── comments.R ├── empty.R ├── empty.Rmd ├── header_only.Rmd ├── max_9_symbols.R ├── non_r_blocks.Rmd ├── r_blocks.Rmd └── text_only.Rmd /.Rbuildignore: 
-------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.lintr$ 4 | ^README\.Rmd$ 5 | ^README-.*\.png$ 6 | ^TODO\.md$ 7 | ^codecov\.yml$ 8 | ^presentations$ 9 | ^cran-comments.md$ 10 | ^CRAN-RELEASE$ 11 | ^_pkgdown\.yml$ 12 | ^docs$ 13 | ^pkgdown$ 14 | ^\.github$ 15 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/check-standard.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v3 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: 
r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: lint 10 | 11 | jobs: 12 | lint: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | steps: 17 | - uses: actions/checkout@v3 18 | 19 | - uses: r-lib/actions/setup-r@v2 20 | with: 21 | use-public-rspm: true 22 | 23 | - uses: r-lib/actions/setup-r-dependencies@v2 24 | with: 25 | extra-packages: any::lintr, local::. 26 | needs: lint 27 | 28 | - name: Lint 29 | run: lintr::lint_package() 30 | shell: Rscript {0} 31 | env: 32 | LINTR_ERROR_ON_LINT: true 33 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | steps: 23 | - uses: actions/checkout@v3 24 | 25 | - uses: r-lib/actions/setup-pandoc@v2 26 | 27 | - uses: r-lib/actions/setup-r@v2 28 | with: 29 | use-public-rspm: true 30 | 31 | - uses: r-lib/actions/setup-r-dependencies@v2 32 | with: 33 | extra-packages: any::pkgdown, local::. 34 | needs: website 35 | 36 | - name: Build site 37 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 38 | shell: Rscript {0} 39 | 40 | - name: Deploy to GitHub pages 🚀 41 | if: github.event_name != 'pull_request' 42 | uses: JamesIves/github-pages-deploy-action@v4.4.1 43 | with: 44 | clean: false 45 | branch: gh-pages 46 | folder: docs 47 | -------------------------------------------------------------------------------- /.github/workflows/pr-commands.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | issue_comment: 3 | types: [created] 4 | name: Commands 5 | jobs: 6 | document: 7 | if: startsWith(github.event.comment.body, '/document') 8 | name: document 9 | runs-on: macOS-latest 10 | env: 11 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 12 | steps: 13 | - uses: actions/checkout@v2 14 | - uses: r-lib/actions/pr-fetch@v1 15 | with: 16 | repo-token: ${{ secrets.GITHUB_TOKEN }} 17 | - uses: r-lib/actions/setup-r@v1 18 | - name: Install dependencies 19 | run: Rscript -e 'install.packages(c("remotes", "roxygen2"))' -e 'remotes::install_deps(dependencies = TRUE)' 20 | - name: Document 21 | run: Rscript -e 'roxygen2::roxygenise()' 
22 | - name: commit 23 | run: | 24 | git config --local user.email "actions@github.com" 25 | git config --local user.name "GitHub Actions" 26 | git add man/\* NAMESPACE 27 | git commit -m 'Document' 28 | - uses: r-lib/actions/pr-push@v1 29 | with: 30 | repo-token: ${{ secrets.GITHUB_TOKEN }} 31 | style: 32 | if: startsWith(github.event.comment.body, '/style') 33 | name: style 34 | runs-on: macOS-latest 35 | env: 36 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 37 | steps: 38 | - uses: actions/checkout@v2 39 | - uses: r-lib/actions/pr-fetch@v1 40 | with: 41 | repo-token: ${{ secrets.GITHUB_TOKEN }} 42 | - uses: r-lib/actions/setup-r@v1 43 | - name: Install dependencies 44 | run: Rscript -e 'install.packages("styler")' 45 | - name: Style 46 | run: Rscript -e 'styler::style_pkg()' 47 | - name: commit 48 | run: | 49 | git config --local user.email "actions@github.com" 50 | git config --local user.name "GitHub Actions" 51 | git add \*.R 52 | git commit -m 'Style' 53 | - uses: r-lib/actions/pr-push@v1 54 | with: 55 | repo-token: ${{ secrets.GITHUB_TOKEN }} 56 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | needs: coverage 28 | 29 | - name: Test coverage 30 | run: | 31 | covr::codecov( 32 | quiet = FALSE, 33 | clean = FALSE, 34 | install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package") 35 | ) 36 | shell: Rscript {0} 37 | 38 | - name: Show testthat output 39 | if: always() 40 | run: | 41 | ## -------------------------------------------------------------------- 42 | find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true 43 | shell: bash 44 | 45 | - name: Upload test results 46 | if: failure() 47 | uses: actions/upload-artifact@v3 48 | with: 49 | name: coverage-test-failures 50 | path: ${{ runner.temp }}/package 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | inst/doc 6 | -------------------------------------------------------------------------------- /.lintr: -------------------------------------------------------------------------------- 1 | linters: linters_with_defaults( 2 | commented_code_linter = NULL, 3 | line_length_linter(80), 4 | object_length_linter(40), 5 | object_name_linter(c("snake_case", "CamelCase")), 6 | undesirable_function_linter = undesirable_function_linter(), 7 | undesirable_operator_linter = undesirable_operator_linter() 8 | ) 9 | exclusions: list( 10 | "inst/extdata/duplicated.R", 11 | 
"tests/testthat.R", 12 | "tests/testthat/testdata" 13 | ) 14 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: dupree 2 | Type: Package 3 | Title: Identify Duplicated R Code in a Project 4 | Version: 0.3.0.9000 5 | Author: Russ Hyde 6 | Maintainer: Russ Hyde 7 | Description: Identifies code blocks that have a high level of similarity 8 | within a set of R files. 9 | URL: https://russhyde.github.io/dupree/, https://github.com/russHyde/dupree 10 | BugReports: https://github.com/russHyde/dupree/issues 11 | License: MIT + file LICENSE 12 | Encoding: UTF-8 13 | Language: en-GB 14 | LazyData: true 15 | Suggests: 16 | testthat (>= 2.1.0), 17 | knitr, 18 | rmarkdown, 19 | covr 20 | Imports: 21 | dplyr (>= 1.1.0), 22 | purrr, 23 | tibble, 24 | magrittr, 25 | methods, 26 | stringdist (>= 0.9.5.5), 27 | lintr (>= 3.0.0), 28 | rlang 29 | RoxygenNote: 7.2.2 30 | Collate: 31 | 'utils.R' 32 | 'dupree.R' 33 | 'dupree_classes.R' 34 | 'dupree_data_validity.R' 35 | 'dupree_code_enumeration.R' 36 | 'dups-class.R' 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2018 2 | COPYRIGHT HOLDER: Russ Hyde 3 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(as.data.frame,dups) 4 | S3method(as_tibble,dups) 5 | S3method(print,dups) 6 | export(as_tibble) 7 | export(dupree) 8 | export(dupree_dir) 9 | export(dupree_package) 10 | importFrom(dplyr,arrange) 11 | importFrom(dplyr,bind_rows) 12 | importFrom(dplyr,desc) 13 | importFrom(dplyr,filter) 14 | importFrom(dplyr,group_by) 15 | importFrom(dplyr,mutate) 16 | importFrom(dplyr,n) 17 | 
importFrom(dplyr,select) 18 | importFrom(dplyr,summarise) 19 | importFrom(lintr,get_source_expressions) 20 | importFrom(magrittr,"%>%") 21 | importFrom(methods,callNextMethod) 22 | importFrom(methods,is) 23 | importFrom(methods,new) 24 | importFrom(methods,setMethod) 25 | importFrom(methods,validObject) 26 | importFrom(purrr,keep) 27 | importFrom(purrr,map) 28 | importFrom(purrr,map2) 29 | importFrom(purrr,map_df) 30 | importFrom(rlang,.data) 31 | importFrom(stringdist,seq_sim) 32 | importFrom(tibble,as_tibble) 33 | importFrom(tibble,tibble) 34 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # dupree (development version) 2 | 3 | * Pass print(dups, ...) dots to print(tibble, ...) so that the number of output lines can be 4 | specified in the output (thanks @mikemahoney218) 5 | * Update the CI workflows for pkgdown, test-coverage and R CMD check 6 | * use lintr >= 3, and update .lintr config file 7 | * Fixed linting across package 8 | * Fixed tidyselect warnings 9 | * lint the package in CI 10 | * Convert default git branch to 'main' (#84, @russHyde) 11 | 12 | # dupree 0.3.0 13 | 14 | ## Breaking changes 15 | 16 | * `dupree()`, `dupree_package()` and `dupree_dir()` return an object of class 17 | `dups`, rather than a data-frame. 
Methods to convert to data.frame / tibble 18 | are provided though (#60, @russHyde) 19 | 20 | ## Minor changes and bug-fixes 21 | 22 | * Changed the default `min_block_size` to 40 throughout (#68, @russHyde) 23 | 24 | * Rewrote a test-helper function that compares two list-column-containing 25 | tibbles: necessitated by a change in dplyr=1.0 (#65, @russHyde) 26 | 27 | * `dupree_package()` and `dupree_dir()` fail early when provided a non-existing 28 | path (#67, @russHyde) 29 | 30 | * `dupree_package()` asserts that a path has a DESCRIPTION and an R/ subdir 31 | present (#57, @russHyde) 32 | 33 | # dupree 0.2.0 34 | 35 | * lintr dependence pinned to lintr=2.0.0 so that non-R-codeblocks and empty R 36 | markdown files can be dealt with cleanly 37 | 38 | * Tests that depend on `stringdist::seq_sim` were rewritten to ensure they 39 | consistently pass 40 | 41 | * Dependency on deprecated dplyr verbs removed 42 | 43 | * Code to prevent double-reporting of code-block pairs was 44 | initiated by @Alanocallaghan 45 | 46 | # dupree 0.1.0 47 | 48 | * Added a `NEWS.md` file to track changes to the package. 
--------------------------------------------------------------------------------
/R/dupree.R:
--------------------------------------------------------------------------------
###############################################################################

# Workflow for a set of files:
# - for each file:
#   - run `get_source_expressions` on the file
#   - drop any entry in `expressions` that stores the whole file
#   - join the parsed_content blocks by block-number
# - join the file-level parsed-content blocks by file-name
# - filter the tokens to non-trivial symbols
# - drop any blocks that have fewer non-trivial symbols than some threshold
# - enumerate the symbols
# - construct a vector of enumerated-symbols for each code-block in each file
# - score the similarity between each pair of enumerated code-blocks
# - return the similarity-sorted associations between blocks

# Therefore, need class:
# - `EnumeratedCodeTable`: tibble containing colnames "file", "block",
#   "start_line", "enumerated_code"
#   - the "enumerated_code" column is a list of vectors of integers
#   - methods: find_best_matches() and find_best_matches_of_single_block()

# Also need a way to filter out blocks that have few non-trivial symbols

###############################################################################

#' Detect code duplication between the code-blocks in a set of files
#'
#' This function identifies all code-blocks in a set of files and then computes
#' a similarity score between those code-blocks to help identify functions /
#' classes that have a high level of duplication, and could possibly be
#' refactored.
#'
#' Code-blocks under a size threshold are disregarded before analysis (the size
#' threshold is controlled by \code{min_block_size}); and only top-level code
#' blocks are considered.
#'
#' Every sufficiently large code-block in the input files will be present in
#' the results at least once. If code-block X and code-block Y are present in
#' a row of the results, then either X is the closest match to Y, or Y is the
#' closest match to X (or possibly both) according to the similarity score; as
#' such, some code-blocks may be present multiple times in the results.
#'
#' Similarity between code-blocks is calculated using the
#' longest-common-subsequence (\code{lcs}) measure from the package
#' \code{stringdist}. This measure is applied to a tokenised version of the
#' code-blocks. That is, each function name / operator / variable in the code
#' blocks is converted to a unique integer so that a code-block can be
#' represented as a vector of integers and the \code{lcs} measure is applied to
#' each pair of these vectors.
#'
#' @param files A set of files over which code-duplication
#'   should be measured.
#'
#' @param min_block_size \code{dupree} uses a notion of non-trivial
#'   symbols. These are the symbols / code-words that remain after filtering
#'   out really common symbols like \code{<-}, \code{,}, etc. After filtering
#'   out these symbols from each code-block, only those blocks containing at
#'   least \code{min_block_size} symbols are used in the inter-block
#'   code-duplication measurement.
#'
#' @param ... Unused at present.
#'
#' @return An object of class \code{dups} (convertible with
#'   \code{as.data.frame()} or \code{as_tibble()}). Each row of the underlying
#'   table summarises the comparison between two code-blocks (block 'a' and
#'   block 'b') in the input files. Each code-block in the pair is indicated
#'   by: i) the file (\code{file_a} / \code{file_b}) that contains it; ii) its
#'   position within that file (\code{block_a} / \code{block_b}; 1 being the
#'   first code-block in a given file); and iii) the line where that code-block
#'   starts in that file (\code{line_a} / \code{line_b}). The pairs of
#'   code-blocks are ordered by decreasing similarity. Any match that is
#'   returned is either the top hit for block 'a' or for block 'b' (or both).
#'
#' @importFrom magrittr %>%
#'
#' @examples
#' # To quantify duplication between the top-level code-blocks in a file
#' example_file <- system.file("extdata", "duplicated.R", package = "dupree")
#' dup <- dupree(example_file, min_block_size = 10)
#' dup
#'
#' # For the block-pair with the highest duplication, we print the first four
#' # lines:
#' readLines(example_file)[dup$line_a[1] + c(0:3)]
#' readLines(example_file)[dup$line_b[1] + c(0:3)]
#'
#' # The code-blocks in the example file are rather small, so if
#' # `min_block_size` is too large, none of the code-blocks will be analysed
#' # and the results will be empty:
#' dupree(example_file, min_block_size = 40)
#' @export

dupree <- function(files, min_block_size = 40, ...) {
  # 1. parse the files and keep the sufficiently large, enumerated code blocks
  code_blocks <- preprocess_code_blocks(files, min_block_size)
  # 2. pair every block with its most similar partner
  best_matches <- find_best_matches(code_blocks)
  # 3. wrap the resulting table in the `dups` result class
  as_dups(best_matches)
}

###############################################################################

#' Run duplicate-code detection over all R-files in a directory
#'
#' @inheritParams dupree
#'
#' @param path A directory (By default the current working
#'   directory). All files in this directory that have a ".R", ".r" or ".Rmd"
#'   extension will be checked for code duplication.
#'
#' @param filter A pattern for use in grep - this is used to keep
#'   only particular files: eg, filter = "classes" would compare files with
#'   `classes` in the filename
#'
#' @param ... Further arguments for grep. For example, `filter
#'   = "test", invert = TRUE` would disregard all files with `test` in the
#'   file-path.
#'
#' @param recursive Should we consider files in subdirectories as
#'   well?
#'
#' @seealso dupree
#'
#' @export

dupree_dir <- function(path = ".",
                       min_block_size = 40,
                       filter = NULL,
                       ...,
                       recursive = TRUE) {
  if (!dir.exists(path)) {
    stop("The path ", path, " does not exist")
  }
  # `pattern` is a regular expression: the dot must be escaped, otherwise any
  # file whose name merely ends in "<any character>R" / "<any character>r"
  # (e.g. "bar" or "foo.tar") would be treated as R source.
  files <- dir(
    path,
    pattern = "\\.(R|r|Rmd)$", full.names = TRUE, recursive = recursive
  )
  # Optionally restrict the candidate files using grep(filter, files, ...)
  keep_files <- if (is.null(filter)) {
    files
  } else {
    files[grep(pattern = filter, x = files, ...)]
  }

  dupree(keep_files, min_block_size)
}

###############################################################################

#' Run duplicate-code detection over all files in the `R` directory of a
#' package
#'
#' The function fails if the path does not look like a typical R package (it
#' should have both an R/ subdirectory and a DESCRIPTION file present).
#'
#' @inheritParams dupree
#'
#' @param package The name or path to the package that is to be
#'   checked (By default the current working directory).
#'
#' @seealso dupree
#'
#' @include utils.R
#' @export

dupree_package <- function(package = ".",
                           min_block_size = 40) {
  if (!dir.exists(package)) {
    stop("The path ", package, " does not exist")
  }
  if (!has_description(package)) {
    stop("The path ", package, " is not an R package (no DESCRIPTION)")
  }
  if (!has_r_source_dir(package)) {
    stop("The path ", package, " is not an R package (no R/ subdir)")
  }
  # Scan the package's R/ directory directly. Filtering the whole package tree
  # with the regular expression paste0(package, "/R/") is unsafe: for
  # package = "." the pattern "./R/" also matches paths such as
  # "./inst/dir1/R/x.R", and regex metacharacters in `package` are interpreted
  # rather than matched literally.
  dupree_dir(file.path(package, "R"), min_block_size)
}

###############################################################################

--------------------------------------------------------------------------------
/R/dupree_classes.R:
--------------------------------------------------------------------------------
###############################################################################

# Classes for `dupree`

###############################################################################

# Class definition: `EnumeratedCodeTable`

#' An S4 class to represent the code blocks as strings of integers
#'
#' @name EnumeratedCodeTable-class
#' @slot blocks A tbl_df with columns `file`, `block`, `start_line` and
#'   `enumerated_code`
#'
methods::setClass("EnumeratedCodeTable", slots = list(blocks = "tbl_df"))

###############################################################################

#' `EnumeratedCodeTable` validation
#'
#' Returns `TRUE` when every required column is present in `object@blocks`;
#' otherwise returns one message per missing column.
#'
#' @noRd
#'
.is_enumerated_code_table <- function(object) {
  required_cols <- c("file", "block", "start_line", "enumerated_code")
  observed_cols <- colnames(object@blocks)

  if (
    all(required_cols %in% observed_cols)
  ) {
    TRUE
  } else {
    missing_cols <- setdiff(required_cols, observed_cols)
    paste("Column", missing_cols, "should be in object@blocks")
  }
}

methods::setValidity(
  "EnumeratedCodeTable",
  function(object) .is_enumerated_code_table(object)
)

###############################################################################

#' Initialise an `EnumeratedCodeTable`
#'
#' An `EnumeratedCodeTable` contains a `blocks` table. Each row of this table
#' contains details for a block of R code: the filename, block-id and startline
#' of the block, and a tokenized version of the code within that block.
#'
#' Once initialised, the blocks table is ordered by filename and then block-id.
#'
#' @importFrom methods callNextMethod setMethod validObject
#' @importFrom tibble tibble
#'
#' @noRd
#'
methods::setMethod(
  "initialize",
  "EnumeratedCodeTable",
  function(.Object, blocks = NULL, ...) {
    .Object <- methods::callNextMethod(...)

    # used when no `blocks` table is supplied: zero rows, required columns
    default_code_table <- tibble::tibble(
      file = character(0), block = integer(0), start_line = integer(0),
      enumerated_code = list()
    )

    if (is.null(blocks)) {
      .Object@blocks <- default_code_table
    } else {
      # we ensure that the code blocks are ordered by file and then block
      .Object@blocks <- dplyr::arrange(
        blocks, .data[["file"]], .data[["block"]]
      )
    }

    methods::validObject(.Object)

    .Object
  }
)

###############################################################################

# `find_best_matches`

###############################################################################
# By default we use `lcs` as the sequence-similarity measure
# - for two integer vectors, the lcs-distance is the minimum number of entries
#   that need to be removed from both vectors before identity is reached
# - then the lcs-similarity score is 1 - distance / max_length; where
#   max_length is the sum of the lengths of the two input vectors
#   - d((1, 2, 3, 4), (1, 4, 5, 6)) = 4; s(..., ...) = 1 - 4 / 8
# - we use lcs because it's simple to explain

#' `find_best_matches` between code blocks
#'
#' @noRd
#'

# nocov start
methods::setGeneric("find_best_matches", function(x, ...) {
  methods::standardGeneric("find_best_matches")
})
# nocov end

#' `find_best_matches` between code blocks in an `EnumeratedCodeTable`
#'
#' The code blocks are assumed to be ordered within the
#' `EnumeratedCodeTable`, as such when two code blocks are
#' mutually-best-matches, the results returned by this function only contains
#' a single row for those two code blocks; when this happens we guarantee that
#' `file_a` <= `file_b` and `block_a` <= `block_b`
#'
#' @noRd
#'
methods::setMethod(
  "find_best_matches",
  methods::signature("EnumeratedCodeTable"),
  function(x, ...) {
    blocks <- x@blocks
    enum_codes <- x@blocks$enumerated_code
    index_matches <- find_indexes_of_best_matches(enum_codes, ...)

    # look up file / block / start-line for both members of each matched pair
    details_a <- blocks[index_matches$index_a, ]
    details_b <- blocks[index_matches$index_b, ]

    score <- index_matches$score

    tibble::tibble(
      file_a = details_a$file,
      file_b = details_b$file,
      block_a = details_a$block,
      block_b = details_b$block,
      line_a = details_a$start_line,
      line_b = details_b$start_line,
      score = score
    )
  }
)

###############################################################################

# Related Functions

###############################################################################

#' One against all search
#'
#' Scores the block at `subject_index` against every block in `enum_codes` and
#' returns a list holding the subject index, the index of the highest-scoring
#' other block, and that score.
#'
#' @noRd
#'
.one_against_all <- function(subject_index, enum_codes, sim_func) {
  subject <- enum_codes[subject_index]
  scores <- sim_func(subject, enum_codes)
  # mask the self-comparison so a block is not picked as its own best match
  scores[subject_index] <- -1
  list(
    index_a = subject_index,
    index_b = which.max(scores),
    score = max(scores)
  )
}

#' All against all search
#'
#' @param enum_codes List of vectors of integers. Each `int` is an
#'   enumerated code for some code-symbol (like a conversion of the
#'   code-symbols into a factor).
#' @param method Alignment method for use in
#'   `stringdist::seq_sim`.
#' @param ... Further parameters for passing to
#'   `stringdist::seq_sim`.
#'
#' @importFrom dplyr arrange desc mutate select
#' @importFrom purrr map_df
#' @importFrom stringdist seq_sim
#' @importFrom tibble tibble
#' @importFrom rlang .data
#'
#' @noRd
#'
find_indexes_of_best_matches <- function(enum_codes, method = "lcs", ...) {
  empty_result <- tibble::tibble(
    index_a = integer(0), index_b = integer(0), score = numeric(0)
  )
  # with zero or one code-block there is nothing to compare
  if (length(enum_codes) <= 1) {
    return(empty_result)
  }

  sim_func <- function(x, y) {
    stringdist::seq_sim(x, y, method = method, ...)
  }

  # .one_against_all returns a list (index_a, index_b, score); map_df binds
  # one row per code-block

  # For each code-block we want to identify its closest matching code-block
  #
  # We only return a code-block pair once (ie, if A-B is a pair and B-A is a
  # pair, then we return A-B, but not B-A)
  #
  # When C-A is a pair but the index of C is greater than that of A, we return
  # the pair A-C

  scores <- purrr::map_df(
    seq_along(enum_codes),
    .one_against_all,
    enum_codes,
    sim_func
  ) %>%
    # ensure the index of A is less than the index of B
    dplyr::mutate(
      temp = pmax(.data[["index_a"]], .data[["index_b"]]),
      index_a = pmin(.data[["index_a"]], .data[["index_b"]]),
      index_b = .data[["temp"]]
    ) %>%
    dplyr::select(
      -"temp"
    ) %>%
    # only return each code-block pair once
    unique() %>%
    # order the code-block pairs by decreasing score
    dplyr::arrange(
      dplyr::desc(.data[["score"]]), .data[["index_a"]], .data[["index_b"]]
    )

  scores
}

###############################################################################

--------------------------------------------------------------------------------
/R/dupree_code_enumeration.R:
--------------------------------------------------------------------------------
###############################################################################

# Functions / Classes for extracting / collapsing / filtering parsed-code
# blocks from a set of files
# - All non-trivial symbols in the code blocks are enumerated (converted to an
#   integer) for use in similarity measurement

###############################################################################

#' `.get_default_annotated_parsed_content`
#'
#' Builds a zero-row tibble with the columns of annotated parsed-content
#' (the parse-data columns plus `file`, `block` and `start_line`).
#'
#' @noRd
#'
.get_default_annotated_parsed_content <- function() {
  tibble::tibble(
    line1 = integer(0),
    col1 = integer(0),
    line2 = integer(0),
    col2 = integer(0),
    id = integer(0),
    parent = integer(0),
    token = character(0),
    terminal = logical(0),
    text = character(0),
    file = character(0),
    block = integer(0),
    start_line = integer(0)
  )
}

#' Add filename, block-number and start-line for the parsed-content for each
#' code block in a given file
#'
#' @param parsed_content The parsed-content for a specific code-block
#'   from running get_source_expressions on a file.
#' @param file The filename for the content.
#' @param block The block from which the content came.
#' @param start_line The start-line of the block in the filename.
#'
#' @importFrom dplyr mutate
#' @include dupree_data_validity.R
#'
#' @noRd
#'
annotate_parsed_content <- function(parsed_content, file, block, start_line) {
  # reject anything that does not look like parsed-content
  stopifnot(.is_parsed_content(parsed_content))

  # attach the location of the block to every row of its parse data
  dplyr::mutate(
    parsed_content,
    file = file, block = block, start_line = start_line
  )
}

#' Convert a list of source_expressions to a data-frame that contains the
#' parsed-content from each source expression, and indicates the file,
#' block-number and start-line for that source expression
#'
#' @param source_exprs A list of source-expressions, obtained from
#'   lintr::get_source_expressions.
55 | #' 56 | #' @importFrom dplyr bind_rows 57 | #' @importFrom purrr keep map2 58 | #' @include dupree_data_validity.R 59 | #' 60 | #' @noRd 61 | #' 62 | get_localised_parsed_code_blocks <- function(source_exprs) { 63 | source_blocks <- purrr::keep( 64 | source_exprs[["expressions"]], 65 | .has_parsed_content 66 | ) 67 | 68 | if (length(source_blocks) == 0) { 69 | return(.get_default_annotated_parsed_content()) 70 | } 71 | 72 | parsed_content <- purrr::map2( 73 | source_blocks, 74 | seq_along(source_blocks), 75 | function(x, y) { 76 | annotate_parsed_content(x$parsed_content, x$file, y, x$line) 77 | } 78 | ) 79 | 80 | dplyr::bind_rows(parsed_content) 81 | } 82 | 83 | #' `remove_trivial_code_symbols` 84 | #' 85 | #' @importFrom dplyr filter 86 | #' @importFrom rlang .data 87 | #' 88 | #' @noRd 89 | #' 90 | remove_trivial_code_symbols <- function(df) { 91 | # TODO: check for presence of `token` column 92 | .quote_wrap <- function(x) { 93 | gsub(pattern = "^(.*)$", replacement = "'\\1'", x) 94 | } 95 | 96 | drop_tokens <- c( 97 | .quote_wrap( 98 | c("-", "+", ",", "(", ")", "[", "]", "{", "}", "$", "@", ":") 99 | ), 100 | "AND2", "NS_GET", "expr", "COMMENT", "LEFT_ASSIGN", "LBB", "EQ_SUB" 101 | ) 102 | 103 | df %>% 104 | dplyr::filter(!.data[["token"]] %in% drop_tokens) 105 | } 106 | 107 | #' enumerate_code_symbols 108 | #' 109 | #' @importFrom dplyr mutate 110 | #' @importFrom rlang .data 111 | #' 112 | #' @noRd 113 | #' 114 | enumerate_code_symbols <- function(df) { 115 | # TODO: check for `text` column 116 | df %>% 117 | dplyr::mutate(symbol_enum = as.integer(factor(.data[["text"]]))) 118 | } 119 | 120 | #' summarise_enumerated_blocks 121 | #' 122 | #' @importFrom dplyr group_by summarise n 123 | #' @importFrom rlang .data 124 | #' 125 | #' @noRd 126 | #' 127 | summarise_enumerated_blocks <- function(df) { 128 | grouping_cols <- c("file", "block", "start_line") 129 | df %>% 130 | dplyr::group_by( 131 | dplyr::across( 132 | dplyr::all_of(grouping_cols) 133 | ) 134 
| ) %>% 135 | dplyr::summarise( 136 | enumerated_code = list(c(.data[["symbol_enum"]])), 137 | block_size = dplyr::n() 138 | ) 139 | } 140 | 141 | ############################################################################### 142 | 143 | #' import_parsed_code_blocks_from_one_file 144 | #' 145 | #' @importFrom dplyr filter 146 | #' @importFrom lintr get_source_expressions 147 | #' @importFrom rlang .data 148 | #' 149 | #' @noRd 150 | #' 151 | import_parsed_code_blocks_from_one_file <- function(file) { 152 | file %>% 153 | lintr::get_source_expressions() %>% 154 | get_localised_parsed_code_blocks() %>% 155 | dplyr::filter(!.data[["token"]] %in% "COMMENT") 156 | } 157 | 158 | #' import_parsed_code_blocks 159 | #' 160 | #' @importFrom dplyr bind_rows 161 | #' @importFrom purrr map 162 | #' 163 | #' @noRd 164 | #' 165 | import_parsed_code_blocks <- function(files) { 166 | files %>% 167 | purrr::map(import_parsed_code_blocks_from_one_file) %>% 168 | dplyr::bind_rows() 169 | } 170 | 171 | #' tokenize_code_blocks 172 | #' 173 | #' @noRd 174 | #' 175 | tokenize_code_blocks <- function(block_df) { 176 | block_df %>% 177 | remove_trivial_code_symbols() %>% 178 | enumerate_code_symbols() %>% 179 | summarise_enumerated_blocks() 180 | } 181 | 182 | ############################################################################### 183 | 184 | #' preprocess_code_blocks 185 | #' 186 | #' @param files A set of *.R or *.Rmd files over which dupree is 187 | #' to perform duplicate-identification 188 | #' @param min_block_size An integer >= 1. How many non-trivial symbols 189 | #' must be present in a code-block if that block is to be used in 190 | #' code-duplication detection. 
191 | #' 192 | #' @importFrom dplyr filter 193 | #' @importFrom methods new 194 | #' @include dupree_classes.R 195 | #' 196 | #' @noRd 197 | #' 198 | preprocess_code_blocks <- function(files, min_block_size = 40) { 199 | blocks <- files %>% 200 | import_parsed_code_blocks() %>% 201 | tokenize_code_blocks() %>% 202 | dplyr::filter( 203 | .data[["block_size"]] >= min_block_size 204 | ) 205 | 206 | methods::new("EnumeratedCodeTable", blocks) 207 | } 208 | 209 | ############################################################################### 210 | -------------------------------------------------------------------------------- /R/dupree_data_validity.R: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | 3 | # Data-validity checking for data-structures in `dupree` 4 | 5 | ############################################################################### 6 | 7 | .build_name_checker <- function(label) { 8 | function(x) label %in% names(x) 9 | } 10 | 11 | ############################################################################### 12 | 13 | .has_content <- .build_name_checker("content") 14 | 15 | .has_parsed_content <- .build_name_checker("parsed_content") 16 | 17 | ############################################################################### 18 | 19 | #' Checks if a data-structure conforms to the structure of a `parsed_content` 20 | #' entry as present in a subentry of get_source_expressions...$expressions 21 | #' 22 | #' @noRd 23 | #' 24 | .is_parsed_content <- function(x) { 25 | # data-frame with columns: line1, col1, line2, col2, id, parent, token, 26 | # terminal, text 27 | reqd_columns <- c( 28 | "line1", "col1", "line2", "col2", "id", "parent", "token", 29 | "terminal", "text" 30 | ) 31 | 32 | is.data.frame(x) && 33 | all(reqd_columns %in% colnames(x)) 34 | } 35 | -------------------------------------------------------------------------------- 
/R/dups-class.R: -------------------------------------------------------------------------------- 1 | #' @importFrom methods is 2 | as_dups <- function(x) { 3 | if (!is.data.frame(x) && !methods::is(x, "dups")) { 4 | stop("Can only convert 'data.frame' and 'dups' to 'dups'") 5 | } 6 | if (methods::is(x, "dups")) { 7 | return(x) 8 | } 9 | 10 | dups <- list(dups_df = x) 11 | class(dups) <- "dups" 12 | 13 | dups 14 | } 15 | 16 | #' as.data.frame method for `dups` class 17 | #' 18 | #' @inheritParams base::as.data.frame 19 | #' @export 20 | #' 21 | as.data.frame.dups <- function(x, ...) { 22 | as.data.frame(x[["dups_df"]]) 23 | } 24 | 25 | #' convert a `dups` object to a `tibble` 26 | #' 27 | #' @inheritParams tibble::as_tibble 28 | #' @importFrom tibble as_tibble 29 | #' 30 | #' @exportS3Method 31 | #' 32 | 33 | # nolint start 34 | as_tibble.dups <- function(x, ...) { 35 | tibble::as_tibble(x[["dups_df"]]) 36 | } 37 | # nolint end 38 | 39 | #' @export 40 | tibble::as_tibble 41 | 42 | #' print method for `dups` class 43 | #' 44 | #' @inheritParams base::print 45 | #' @export 46 | #' 47 | 48 | # nocov start 49 | print.dups <- function(x, ...) { 50 | print(as_tibble(x), ...) 
51 | invisible(x) 52 | } 53 | # nocov end 54 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | has_description <- function(path) { 2 | file.exists(file.path(path, "DESCRIPTION")) 3 | } 4 | 5 | has_r_source_dir <- function(path) { 6 | dir.exists(file.path(path, "R")) 7 | } 8 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, echo = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "README-" 12 | ) 13 | ``` 14 | 15 | 16 | [![Codecov test coverage](https://codecov.io/gh/russHyde/dupree/branch/main/graph/badge.svg)](https://codecov.io/gh/russHyde/dupree?branch=main) 17 | [![R-CMD-check](https://github.com/russHyde/dupree/workflows/R-CMD-check/badge.svg)](https://github.com/russHyde/dupree/actions) 18 | 19 | 20 | # dupree 21 | 22 | The goal of `dupree` is to identify chunks / blocks of highly duplicated code 23 | within a set of R scripts. 
24 | 25 | A very lightweight approach is used: 26 | 27 | - The user provides a set of `*.R` and/or `*.Rmd` files; 28 | 29 | - All R-code in the user-provided files is read and code-blocks are identified; 30 | 31 | - The non-trivial symbols from each code-block are retained (for instance, 32 | really common symbols like `<-`, `,`, `+`, `(` are dropped); 33 | 34 | - Similarity between different blocks is calculated using `stringdist::seq_sim` 35 | by longest-common-subsequence (symbol-identity is at whole-word level - so 36 | "my_data", "my_Data", "my.data" and "myData" are not considered to be identical 37 | in the calculation - and all non-trivial symbols have equal weight in the 38 | similarity calculation); 39 | 40 | - Code-block pairs (both between and within the files) are returned in order 41 | of highest similarity. 42 | 43 | To prevent the results being dominated by high-identity blocks containing very 44 | few symbols (eg, `library(dplyr)`) the user can specify a `min_block_size`. Any 45 | code-block containing at least this many non-trivial symbols will be kept. 46 | 47 | ## Installation 48 | 49 | You can install `dupree` from github with: 50 | 51 | ```{r gh-installation, eval = FALSE} 52 | if (!"dupree" %in% installed.packages()) { 53 | # Alternatively: 54 | # install.packages("dupree") 55 | remotes::install_github("russHyde/dupree") 56 | } 57 | ``` 58 | 59 | ## Example 60 | 61 | To run `dupree` over a set of R files, you can use the `dupree()`, 62 | `dupree_dir()` or `dupree_package()` functions. 
For example, to identify 63 | duplication within all of the `.R` and `.Rmd` files for the `dupree` package 64 | you could run the following: 65 | 66 | ```{r example} 67 | ## basic example code 68 | library(dupree) 69 | 70 | files <- dir(pattern = "*.R(md)*$", recursive = TRUE) 71 | 72 | dupree(files) 73 | ``` 74 | 75 | Any top-level code blocks that contain at least 76 | `r formals(dupree)$min_block_size` non-trivial tokens are 77 | included in the above analysis (a token being a function or variable name, an 78 | operator etc; but ignoring comments, white-space and some really common tokens: 79 | `[](){}-+$@:,=`, `<-`, `&&` etc). To be more restrictive, you could consider 80 | larger code-blocks (increase `min_block_size`) within just the `./R/` source 81 | code directory: 82 | 83 | ```{r} 84 | # R-source code files in the ./R/ directory of the dupree package: 85 | source_files <- dir(path = "./R", pattern = "*.R(md)*$", full.names = TRUE) 86 | 87 | # analyse any code blocks that contain at least 50 non-trivial tokens 88 | dupree(source_files, min_block_size = 50) 89 | ``` 90 | 91 | For each (sufficiently big) code block in the provided files, `dupree` will 92 | return the code-block that is most-similar to it (although any given block 93 | may be present in the results multiple times if it is the closest match for 94 | several other code blocks). 95 | 96 | Code block pairs with a higher `score` value are more similar. `score` lies in 97 | the range [0, 1]; and is calculated by the 98 | [`stringdist`](https://github.com/markvanderloo/stringdist) package: matching 99 | occurs at the token level: the token "my_data" is no more similar to the token 100 | "myData" than it is to "x". 101 | 102 | If you find code-block-pairs with a similarity score much greater than 0.5 103 | there is probably some commonality that could be abstracted away. 
104 | 105 | ---- 106 | 107 | Note that you can do something similar using the functions `dupree_dir` and 108 | (if you are analysing a package) `dupree_package`. 109 | 110 | ```{r} 111 | # Analyse all R files in the R/ directory: 112 | dupree_dir(".", filter = "R/") 113 | ``` 114 | 115 | ```{r} 116 | # Analyse all R files except those in the tests / presentations directories: 117 | # `dupree_dir` uses grep-like arguments 118 | dupree_dir( 119 | ".", 120 | filter = "tests|presentations", invert = TRUE 121 | ) 122 | ``` 123 | 124 | ```{r} 125 | # Analyse all R source code in the package (only looking at the ./R/ directory) 126 | dupree_package(".") 127 | ``` 128 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | [![Codecov test 6 | coverage](https://codecov.io/gh/russHyde/dupree/branch/main/graph/badge.svg)](https://codecov.io/gh/russHyde/dupree?branch=main) 7 | [![R-CMD-check](https://github.com/russHyde/dupree/workflows/R-CMD-check/badge.svg)](https://github.com/russHyde/dupree/actions) 8 | 9 | 10 | # dupree 11 | 12 | The goal of `dupree` is to identify chunks / blocks of highly duplicated 13 | code within a set of R scripts. 
14 | 15 | A very lightweight approach is used: 16 | 17 | - The user provides a set of `*.R` and/or `*.Rmd` files; 18 | 19 | - All R-code in the user-provided files is read and code-blocks are 20 | identified; 21 | 22 | - The non-trivial symbols from each code-block are retained (for 23 | instance, really common symbols like `<-`, `,`, `+`, `(` are dropped); 24 | 25 | - Similarity between different blocks is calculated using 26 | `stringdist::seq_sim` by longest-common-subsequence (symbol-identity 27 | is at whole-word level - so “my_data”, “my_Data”, “my.data” and 28 | “myData” are not considered to be identical in the calculation - and 29 | all non-trivial symbols have equal weight in the similarity 30 | calculation); 31 | 32 | - Code-block pairs (both between and within the files) are returned in 33 | order of highest similarity. 34 | 35 | To prevent the results being dominated by high-identity blocks 36 | containing very few symbols (eg, `library(dplyr)`) the user can specify 37 | a `min_block_size`. Any code-block containing at least this many 38 | non-trivial symbols will be kept. 39 | 40 | ## Installation 41 | 42 | You can install `dupree` from github with: 43 | 44 | ``` r 45 | if (!"dupree" %in% installed.packages()) { 46 | # Alternatively: 47 | # install.packages("dupree") 48 | remotes::install_github("russHyde/dupree") 49 | } 50 | ``` 51 | 52 | ## Example 53 | 54 | To run `dupree` over a set of R files, you can use the `dupree()`, 55 | `dupree_dir()` or `dupree_package()` functions. 
For example, to identify 56 | duplication within all of the `.R` and `.Rmd` files for the `dupree` 57 | package you could run the following: 58 | 59 | ``` r 60 | ## basic example code 61 | library(dupree) 62 | 63 | files <- dir(pattern = "*.R(md)*$", recursive = TRUE) 64 | 65 | dupree(files) 66 | #> # A tibble: 14 × 7 67 | #> file_a file_b block_a block_b line_a line_b score 68 | #> 69 | #> 1 R/dupree_classes.R tests… 33 8 57 13 0.296 70 | #> 2 tests/testthat/test_dupree_clas… tests… 8 10 13 118 0.248 71 | #> 3 R/dupree_classes.R R/dup… 33 61 57 117 0.218 72 | #> 4 tests/testthat/test_dupree_clas… tests… 8 11 13 64 0.216 73 | #> 5 R/dupree_classes.R R/dup… 33 88 57 180 0.215 74 | #> 6 tests/testthat/test_dupree_clas… tests… 11 1 64 1 0.185 75 | #> 7 tests/testthat/testdata/anRpack… tests… 2 1 132 1 0.172 76 | #> 8 R/dupree.R R/dup… 111 33 124 57 0.146 77 | #> 9 tests/testthat/test_dupree_clas… tests… 8 6 13 25 0.120 78 | #> 10 R/dupree.R tests… 111 4 124 4 0.114 79 | #> 11 R/dupree_classes.R R/dup… 88 48 180 90 0.111 80 | #> 12 R/dupree_classes.R prese… 61 28 117 316 0.105 81 | #> 13 tests/testthat/test-dupree_dir_… tests… 3 6 11 25 0.0972 82 | #> 14 R/dupree_code_enumeration.R tests… 48 1 90 1 0.00298 83 | ``` 84 | 85 | Any top-level code blocks that contain at least 40 non-trivial tokens 86 | are included in the above analysis (a token being a function or variable 87 | name, an operator etc; but ignoring comments, white-space and some 88 | really common tokens: `[](){}-+$@:,=`, `<-`, `&&` etc). 
To be more 89 | restrictive, you could consider larger code-blocks (increase 90 | `min_block_size`) within just the `./R/` source code directory: 91 | 92 | ``` r 93 | # R-source code files in the ./R/ directory of the dupree package: 94 | source_files <- dir(path = "./R", pattern = "*.R(md)*$", full.names = TRUE) 95 | 96 | # analyse any code blocks that contain at least 50 non-trivial tokens 97 | dupree(source_files, min_block_size = 50) 98 | #> # A tibble: 1 × 7 99 | #> file_a file_b block_a block_b line_a line_b score 100 | #> 101 | #> 1 ./R/dupree_classes.R ./R/dupree_classes.R 61 88 117 180 0.104 102 | ``` 103 | 104 | For each (sufficiently big) code block in the provided files, `dupree` 105 | will return the code-block that is most-similar to it (although any 106 | given block may be present in the results multiple times if it is the 107 | closest match for several other code blocks). 108 | 109 | Code block pairs with a higher `score` value are more similar. `score` 110 | lies in the range \[0, 1\]; and is calculated by the 111 | [`stringdist`](https://github.com/markvanderloo/stringdist) package: 112 | matching occurs at the token level: the token “my_data” is no more 113 | similar to the token “myData” than it is to “x”. 114 | 115 | If you find code-block-pairs with a similarity score much greater than 116 | 0.5 there is probably some commonality that could be abstracted away. 117 | 118 | ------------------------------------------------------------------------ 119 | 120 | Note that you can do something similar using the functions `dupree_dir` 121 | and (if you are analysing a package) `dupree_package`. 
122 | 123 | ``` r 124 | # Analyse all R files in the R/ directory: 125 | dupree_dir(".", filter = "R/") 126 | #> # A tibble: 6 × 7 127 | #> file_a file_b block_a block_b line_a line_b score 128 | #> 129 | #> 1 ./R/dupree_classes.R ./R/d… 33 61 57 117 0.218 130 | #> 2 ./R/dupree_classes.R ./R/d… 33 88 57 180 0.215 131 | #> 3 ./tests/testthat/testdata/anRpac… ./tes… 2 1 132 1 0.172 132 | #> 4 ./R/dupree.R ./R/d… 111 33 124 57 0.146 133 | #> 5 ./R/dupree_classes.R ./R/d… 88 48 180 90 0.111 134 | #> 6 ./R/dupree_code_enumeration.R ./tes… 48 1 90 1 0.00298 135 | ``` 136 | 137 | ``` r 138 | # Analyse all R files except those in the tests / presentations directories: 139 | # `dupree_dir` uses grep-like arguments 140 | dupree_dir( 141 | ".", 142 | filter = "tests|presentations", invert = TRUE 143 | ) 144 | #> # A tibble: 4 × 7 145 | #> file_a file_b block_a block_b line_a line_b score 146 | #> 147 | #> 1 ./R/dupree_classes.R ./R/dupree_classes.R 33 61 57 117 0.218 148 | #> 2 ./R/dupree_classes.R ./R/dupree_classes.R 33 88 57 180 0.215 149 | #> 3 ./R/dupree.R ./R/dupree_classes.R 111 33 124 57 0.146 150 | #> 4 ./R/dupree_classes.R ./R/dupree_code_enum… 88 48 180 90 0.111 151 | ``` 152 | 153 | ``` r 154 | # Analyse all R source code in the package (only looking at the ./R/ directory) 155 | dupree_package(".") 156 | #> # A tibble: 6 × 7 157 | #> file_a file_b block_a block_b line_a line_b score 158 | #> 159 | #> 1 ./R/dupree_classes.R ./R/d… 33 61 57 117 0.218 160 | #> 2 ./R/dupree_classes.R ./R/d… 33 88 57 180 0.215 161 | #> 3 ./tests/testthat/testdata/anRpac… ./tes… 2 1 132 1 0.172 162 | #> 4 ./R/dupree.R ./R/d… 111 33 124 57 0.146 163 | #> 5 ./R/dupree_classes.R ./R/d… 88 48 180 90 0.111 164 | #> 6 ./R/dupree_code_enumeration.R ./tes… 48 1 90 1 0.00298 165 | ``` 166 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # `dupree` TODO notes 2 | 3 | ## 
Functions 4 | 5 | - `dupree_classes`: `find_best_matches_of_single_block` 6 | 7 | ## Tests 8 | 9 | ## Data-structures 10 | 11 | - Suggestion: 12 | 13 | - if score for (`block_a`, `block_b`) is returned, score for (`block_b`, 14 | `block_a`) should not be returned 15 | 16 | ## Quicker implementation 17 | 18 | - Fastest version: 19 | 20 | - convert to frequency-vector of symbols-used (eg, tidytext / `tf_idf` 21 | analysis) and just determine distances between block-contents 22 | 23 | - Alignment version: 24 | 25 | - run (various) distance functions on the vector-pairs (using 26 | `stringdist::seq_sim`; note that stringdist is required for 27 | `lintr/available`) 28 | 29 | ## Visualisation 30 | 31 | - Although `dupree` should not have to depend on any visualisation packages, it 32 | would be nice if it could convert a package or a set of files into a 33 | data structure that could readily be visualised in one of the graph packages 34 | (tidygraph / igraph) 35 | 36 | - Suggest making two different types of graph structures and combining them in 37 | a single image: 38 | 39 | - Sequential connections between blocks of code in files 40 | 41 | - Duplication connections between blocks of code across the files 42 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | destination: docs 2 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /cran-comments.md: 
-------------------------------------------------------------------------------- 1 | ## Submission 2 | 3 | The prospective release of dplyr=1.0 necessitated an update to the CRAN package 4 | {dupree}. The dplyr change made some dupree-tests fail. The dupree-tests have 5 | been fixed in this update. 6 | 7 | A new class has been added to dupree, which introduces a breaking change, hence 8 | the update from v0.2.0 to v0.3.0. 9 | 10 | ## Test environments 11 | 12 | * MacOS via r-hub: x86_64-apple-darwin15.6.0, R 3.6.3 13 | * local Ubuntu R 3.5.1 14 | * Ubuntu 16.04 (on travis-ci.org), R 3.4.4, R 3.5.3, R 3.6.2 and devel 15 | * Windows (on ci.appveyor.com), R 3.6.3 16 | 17 | ## R CMD check results 18 | There were no ERRORS, WARNINGs or NOTEs 19 | 20 | ## Downstream dependencies 21 | There are currently no downstream dependencies for this package 22 | -------------------------------------------------------------------------------- /docs/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Page not found (404) • dupree 9 | 10 | 11 | 12 | 13 | 14 | 15 | 19 | 20 | 21 | 22 | 23 |
24 |
61 | 62 | 63 | 64 | 65 |
66 |
67 | 70 | 71 | Content not found. Please use links in the navbar. 72 | 73 |
74 | 75 | 79 | 80 |
81 | 82 | 83 | 84 |
88 | 89 |
90 |

91 |

Site built with pkgdown 2.0.7.

92 |
93 | 94 |
95 |
96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /docs/LICENSE-text.html: -------------------------------------------------------------------------------- 1 | 2 | License • dupree 6 | 7 | 8 |
9 |
40 | 41 | 42 | 43 |
44 |
45 | 48 | 49 |
YEAR: 2018
50 | COPYRIGHT HOLDER: Russ Hyde
51 | 
52 | 53 |
54 | 55 | 58 | 59 |
60 | 61 | 62 | 63 |
66 | 67 |
68 |

Site built with pkgdown 2.0.7.

69 |
70 | 71 |
72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /docs/TODO.html: -------------------------------------------------------------------------------- 1 | 2 | dupree TODO notes • dupree 6 | 7 | 8 |
9 |
40 | 41 | 42 | 43 |
44 |
45 | 48 | 49 |
50 | 51 |
52 |

Functions

53 |
  • 54 | dupree_classes: find_best_matches_of_single_block 55 |
  • 56 |
57 |
58 |

Tests

59 |
60 |
61 |

Data-structures

62 |
  • 63 |

    Suggestion:

    64 |
    • if score for (block_a, block_b) is returned, score for (block_b, block_a) should not be returned
    • 65 |
  • 66 |
67 |
68 |

Quicker implementation

69 |
  • 70 |

    Fastest version:

    71 |
    • convert to frequency-vector of symbols-used (eg, tidytext / tf_idf analysis) and just determine distances between block-contents
    • 72 |
  • 73 |
  • 74 |

    Alignment version:

    75 |
    • run (various) distance functions on the vector-pairs (using stringdist::seq_sim; note that stringdist is required for lintr/available)
    • 76 |
  • 77 |
78 |
79 |

Visualisation

80 |
  • Although dupree should not have to depend on any visualisation packages, it would be nice if it could convert a package or a set of files into a data structure that could readily be visualised in one of the graph packages (tidygraph / igraph)

  • 81 |
  • 82 |

    Suggest making two different types of graph structures and combining them in a single image:

    83 |
    • Sequential connections between blocks of code in files

    • 84 |
    • Duplication connections between blocks of code across the files

    • 85 |
  • 86 |
87 |
88 | 89 |
90 | 91 | 94 | 95 |
96 | 97 | 98 | 99 |
102 | 103 |
104 |

Site built with pkgdown 2.0.7.

105 |
106 | 107 |
108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /docs/authors.html: -------------------------------------------------------------------------------- 1 | 2 | Authors and Citation • dupree 6 | 7 | 8 |
9 |
40 | 41 | 42 | 43 |
44 |
45 |
46 | 49 | 50 | 51 |
  • 52 |

    Russ Hyde. Maintainer. 53 |

    54 |
  • 55 |
56 |
57 |
58 |

Citation

59 | Source: DESCRIPTION 60 |
61 |
62 | 63 | 64 |

Hyde R, Glasgow Uo (2023). 65 | dupree: Identify Duplicated R Code in a Project. 66 | R package version 0.3.0.9000, https://github.com/russHyde/dupree. 67 |

68 |
@Manual{,
69 |   title = {dupree: Identify Duplicated R Code in a Project},
70 |   author = {Russ Hyde and University of Glasgow},
71 |   year = {2023},
72 |   note = {R package version 0.3.0.9000},
73 |   url = {https://github.com/russHyde/dupree},
74 | }
75 | 76 |
77 | 78 |
79 | 80 | 81 | 82 |
85 | 86 |
87 |

Site built with pkgdown 2.0.7.

88 |
89 | 90 |
91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /docs/bootstrap-toc.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) 3 | * Copyright 2015 Aidan Feldman 4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ 5 | 6 | /* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ 7 | 8 | /* All levels of nav */ 9 | nav[data-toggle='toc'] .nav > li > a { 10 | display: block; 11 | padding: 4px 20px; 12 | font-size: 13px; 13 | font-weight: 500; 14 | color: #767676; 15 | } 16 | nav[data-toggle='toc'] .nav > li > a:hover, 17 | nav[data-toggle='toc'] .nav > li > a:focus { 18 | padding-left: 19px; 19 | color: #563d7c; 20 | text-decoration: none; 21 | background-color: transparent; 22 | border-left: 1px solid #563d7c; 23 | } 24 | nav[data-toggle='toc'] .nav > .active > a, 25 | nav[data-toggle='toc'] .nav > .active:hover > a, 26 | nav[data-toggle='toc'] .nav > .active:focus > a { 27 | padding-left: 18px; 28 | font-weight: bold; 29 | color: #563d7c; 30 | background-color: transparent; 31 | border-left: 2px solid #563d7c; 32 | } 33 | 34 | /* Nav: second level (shown on .active) */ 35 | nav[data-toggle='toc'] .nav .nav { 36 | display: none; /* Hide by default, but at >768px, show it */ 37 | padding-bottom: 10px; 38 | } 39 | nav[data-toggle='toc'] .nav .nav > li > a { 40 | padding-top: 1px; 41 | padding-bottom: 1px; 42 | padding-left: 30px; 43 | font-size: 12px; 44 | font-weight: normal; 45 | } 46 | nav[data-toggle='toc'] .nav .nav > li > a:hover, 47 | nav[data-toggle='toc'] .nav .nav > li > a:focus { 48 | padding-left: 29px; 49 | } 50 | nav[data-toggle='toc'] .nav .nav > .active > a, 51 | nav[data-toggle='toc'] .nav .nav > .active:hover > a, 52 | 
nav[data-toggle='toc'] .nav .nav > .active:focus > a { 53 | padding-left: 28px; 54 | font-weight: 500; 55 | } 56 | 57 | /* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ 58 | nav[data-toggle='toc'] .nav > .active > ul { 59 | display: block; 60 | } 61 | -------------------------------------------------------------------------------- /docs/bootstrap-toc.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) 3 | * Copyright 2015 Aidan Feldman 4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ 5 | (function() { 6 | 'use strict'; 7 | 8 | window.Toc = { 9 | helpers: { 10 | // return all matching elements in the set, or their descendants 11 | findOrFilter: function($el, selector) { 12 | // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ 13 | // http://stackoverflow.com/a/12731439/358804 14 | var $descendants = $el.find(selector); 15 | return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); 16 | }, 17 | 18 | generateUniqueIdBase: function(el) { 19 | var text = $(el).text(); 20 | var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); 21 | return anchor || el.tagName.toLowerCase(); 22 | }, 23 | 24 | generateUniqueId: function(el) { 25 | var anchorBase = this.generateUniqueIdBase(el); 26 | for (var i = 0; ; i++) { 27 | var anchor = anchorBase; 28 | if (i > 0) { 29 | // add suffix 30 | anchor += '-' + i; 31 | } 32 | // check if ID already exists 33 | if (!document.getElementById(anchor)) { 34 | return anchor; 35 | } 36 | } 37 | }, 38 | 39 | generateAnchor: function(el) { 40 | if (el.id) { 41 | return el.id; 42 | } else { 43 | var anchor = this.generateUniqueId(el); 44 | el.id = anchor; 45 | return anchor; 46 | } 47 | }, 48 | 49 | createNavList: function() { 50 | 
return $(''); 51 | }, 52 | 53 | createChildNavList: function($parent) { 54 | var $childList = this.createNavList(); 55 | $parent.append($childList); 56 | return $childList; 57 | }, 58 | 59 | generateNavEl: function(anchor, text) { 60 | var $a = $(''); 61 | $a.attr('href', '#' + anchor); 62 | $a.text(text); 63 | var $li = $('
  • '); 64 | $li.append($a); 65 | return $li; 66 | }, 67 | 68 | generateNavItem: function(headingEl) { 69 | var anchor = this.generateAnchor(headingEl); 70 | var $heading = $(headingEl); 71 | var text = $heading.data('toc-text') || $heading.text(); 72 | return this.generateNavEl(anchor, text); 73 | }, 74 | 75 | // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). 76 | getTopLevel: function($scope) { 77 | for (var i = 1; i <= 6; i++) { 78 | var $headings = this.findOrFilter($scope, 'h' + i); 79 | if ($headings.length > 1) { 80 | return i; 81 | } 82 | } 83 | 84 | return 1; 85 | }, 86 | 87 | // returns the elements for the top level, and the next below it 88 | getHeadings: function($scope, topLevel) { 89 | var topSelector = 'h' + topLevel; 90 | 91 | var secondaryLevel = topLevel + 1; 92 | var secondarySelector = 'h' + secondaryLevel; 93 | 94 | return this.findOrFilter($scope, topSelector + ',' + secondarySelector); 95 | }, 96 | 97 | getNavLevel: function(el) { 98 | return parseInt(el.tagName.charAt(1), 10); 99 | }, 100 | 101 | populateNav: function($topContext, topLevel, $headings) { 102 | var $context = $topContext; 103 | var $prevNav; 104 | 105 | var helpers = this; 106 | $headings.each(function(i, el) { 107 | var $newNav = helpers.generateNavItem(el); 108 | var navLevel = helpers.getNavLevel(el); 109 | 110 | // determine the proper $context 111 | if (navLevel === topLevel) { 112 | // use top level 113 | $context = $topContext; 114 | } else if ($prevNav && $context === $topContext) { 115 | // create a new level of the tree and switch to it 116 | $context = helpers.createChildNavList($prevNav); 117 | } // else use the current $context 118 | 119 | $context.append($newNav); 120 | 121 | $prevNav = $newNav; 122 | }); 123 | }, 124 | 125 | parseOps: function(arg) { 126 | var opts; 127 | if (arg.jquery) { 128 | opts = { 129 | $nav: arg 130 | }; 131 | } else { 132 | opts = arg; 133 | } 134 | opts.$scope = opts.$scope || $(document.body); 135 | return opts; 136 | } 137 | }, 138 | 139 | // accepts a jQuery object, or an options object 140 | init: function(opts) { 141 | opts = this.helpers.parseOps(opts); 142 | 143 | // ensure that the data attribute is in place for styling 144 | opts.$nav.attr('data-toggle', 'toc'); 145 | 146 | var $topContext = this.helpers.createChildNavList(opts.$nav); 147 | var topLevel = 
this.helpers.getTopLevel(opts.$scope); 148 | var $headings = this.helpers.getHeadings(opts.$scope, topLevel); 149 | this.helpers.populateNav($topContext, topLevel, $headings); 150 | } 151 | }; 152 | 153 | $(function() { 154 | $('nav[data-toggle="toc"]').each(function(i, el) { 155 | var $nav = $(el); 156 | Toc.init($nav); 157 | }); 158 | }); 159 | })(); 160 | -------------------------------------------------------------------------------- /docs/docsearch.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | // register a handler to move the focus to the search bar 4 | // upon pressing shift + "/" (i.e. "?") 5 | $(document).on('keydown', function(e) { 6 | if (e.shiftKey && e.keyCode == 191) { 7 | e.preventDefault(); 8 | $("#search-input").focus(); 9 | } 10 | }); 11 | 12 | $(document).ready(function() { 13 | // do keyword highlighting 14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ 15 | var mark = function() { 16 | 17 | var referrer = document.URL ; 18 | var paramKey = "q" ; 19 | 20 | if (referrer.indexOf("?") !== -1) { 21 | var qs = referrer.substr(referrer.indexOf('?') + 1); 22 | var qs_noanchor = qs.split('#')[0]; 23 | var qsa = qs_noanchor.split('&'); 24 | var keyword = ""; 25 | 26 | for (var i = 0; i < qsa.length; i++) { 27 | var currentParam = qsa[i].split('='); 28 | 29 | if (currentParam.length !== 2) { 30 | continue; 31 | } 32 | 33 | if (currentParam[0] == paramKey) { 34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); 35 | } 36 | } 37 | 38 | if (keyword !== "") { 39 | $(".contents").unmark({ 40 | done: function() { 41 | $(".contents").mark(keyword); 42 | } 43 | }); 44 | } 45 | } 46 | }; 47 | 48 | mark(); 49 | }); 50 | }); 51 | 52 | /* Search term highlighting ------------------------------*/ 53 | 54 | function matchedWords(hit) { 55 | var words = []; 56 | 57 | var hierarchy = hit._highlightResult.hierarchy; 58 | // loop to fetch from lvl0, lvl1, etc. 
59 | for (var idx in hierarchy) { 60 | words = words.concat(hierarchy[idx].matchedWords); 61 | } 62 | 63 | var content = hit._highlightResult.content; 64 | if (content) { 65 | words = words.concat(content.matchedWords); 66 | } 67 | 68 | // return unique words 69 | var words_uniq = [...new Set(words)]; 70 | return words_uniq; 71 | } 72 | 73 | function updateHitURL(hit) { 74 | 75 | var words = matchedWords(hit); 76 | var url = ""; 77 | 78 | if (hit.anchor) { 79 | url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; 80 | } else { 81 | url = hit.url + '?q=' + escape(words.join(" ")); 82 | } 83 | 84 | return url; 85 | } 86 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /docs/news/index.html: -------------------------------------------------------------------------------- 1 | 2 | Changelog • dupree 6 | 7 | 8 |
    9 |
    40 | 41 | 42 | 43 |
    44 |
    45 | 49 | 50 |
    51 | 52 |
    • Update the CI workflows for pkgdown, test-coverage and R CMD check
    • 53 |
    • use lintr >= 3, and update .lintr config file
    • 54 |
    • lint the package in CI
    • 55 |
    56 |
    57 | 58 |
    59 |

    Breaking changes

    60 |
    63 |
    64 |

    Minor changes and bug-fixes

    65 |
    70 |
    71 |
    72 | 73 |
    • lintr dependence pinned to lintr=2.0.0 so that non-R-codeblocks and empty R markdown files can be dealt with cleanly

    • 74 |
    • Tests that depend on stringdist::seq_sim were rewritten to ensure they consistently pass

    • 75 |
    • Dependency on deprecated dplyr verbs removed

    • 76 |
    • Code to prevent double-reporting of code-block pairs was initiated by @Alanocallaghan

    • 77 |
    78 |
    79 | 80 |
    • Added a NEWS.md file to track changes to the package.
    • 81 |
    82 |
    83 | 84 | 87 | 88 |
    89 | 90 | 91 |
    94 | 95 |
    96 |

    Site built with pkgdown 2.0.7.

    97 |
    98 | 99 |
    100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /docs/pkgdown.css: -------------------------------------------------------------------------------- 1 | /* Sticky footer */ 2 | 3 | /** 4 | * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ 5 | * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css 6 | * 7 | * .Site -> body > .container 8 | * .Site-content -> body > .container .row 9 | * .footer -> footer 10 | * 11 | * Key idea seems to be to ensure that .container and __all its parents__ 12 | * have height set to 100% 13 | * 14 | */ 15 | 16 | html, body { 17 | height: 100%; 18 | } 19 | 20 | body { 21 | position: relative; 22 | } 23 | 24 | body > .container { 25 | display: flex; 26 | height: 100%; 27 | flex-direction: column; 28 | } 29 | 30 | body > .container .row { 31 | flex: 1 0 auto; 32 | } 33 | 34 | footer { 35 | margin-top: 45px; 36 | padding: 35px 0 36px; 37 | border-top: 1px solid #e5e5e5; 38 | color: #666; 39 | display: flex; 40 | flex-shrink: 0; 41 | } 42 | footer p { 43 | margin-bottom: 0; 44 | } 45 | footer div { 46 | flex: 1; 47 | } 48 | footer .pkgdown { 49 | text-align: right; 50 | } 51 | footer p { 52 | margin-bottom: 0; 53 | } 54 | 55 | img.icon { 56 | float: right; 57 | } 58 | 59 | /* Ensure in-page images don't run outside their container */ 60 | .contents img { 61 | max-width: 100%; 62 | height: auto; 63 | } 64 | 65 | /* Fix bug in bootstrap (only seen in firefox) */ 66 | summary { 67 | display: list-item; 68 | } 69 | 70 | /* Typographic tweaking ---------------------------------*/ 71 | 72 | .contents .page-header { 73 | margin-top: calc(-60px + 1em); 74 | } 75 | 76 | dd { 77 | margin-left: 3em; 78 | } 79 | 80 | /* Section anchors ---------------------------------*/ 81 | 82 | a.anchor { 83 | display: none; 84 | margin-left: 5px; 85 | width: 20px; 86 | height: 20px; 87 | 88 | 
background-image: url(./link.svg); 89 | background-repeat: no-repeat; 90 | background-size: 20px 20px; 91 | background-position: center center; 92 | } 93 | 94 | h1:hover .anchor, 95 | h2:hover .anchor, 96 | h3:hover .anchor, 97 | h4:hover .anchor, 98 | h5:hover .anchor, 99 | h6:hover .anchor { 100 | display: inline-block; 101 | } 102 | 103 | /* Fixes for fixed navbar --------------------------*/ 104 | 105 | .contents h1, .contents h2, .contents h3, .contents h4 { 106 | padding-top: 60px; 107 | margin-top: -40px; 108 | } 109 | 110 | /* Navbar submenu --------------------------*/ 111 | 112 | .dropdown-submenu { 113 | position: relative; 114 | } 115 | 116 | .dropdown-submenu>.dropdown-menu { 117 | top: 0; 118 | left: 100%; 119 | margin-top: -6px; 120 | margin-left: -1px; 121 | border-radius: 0 6px 6px 6px; 122 | } 123 | 124 | .dropdown-submenu:hover>.dropdown-menu { 125 | display: block; 126 | } 127 | 128 | .dropdown-submenu>a:after { 129 | display: block; 130 | content: " "; 131 | float: right; 132 | width: 0; 133 | height: 0; 134 | border-color: transparent; 135 | border-style: solid; 136 | border-width: 5px 0 5px 5px; 137 | border-left-color: #cccccc; 138 | margin-top: 5px; 139 | margin-right: -10px; 140 | } 141 | 142 | .dropdown-submenu:hover>a:after { 143 | border-left-color: #ffffff; 144 | } 145 | 146 | .dropdown-submenu.pull-left { 147 | float: none; 148 | } 149 | 150 | .dropdown-submenu.pull-left>.dropdown-menu { 151 | left: -100%; 152 | margin-left: 10px; 153 | border-radius: 6px 0 6px 6px; 154 | } 155 | 156 | /* Sidebar --------------------------*/ 157 | 158 | #pkgdown-sidebar { 159 | margin-top: 30px; 160 | position: -webkit-sticky; 161 | position: sticky; 162 | top: 70px; 163 | } 164 | 165 | #pkgdown-sidebar h2 { 166 | font-size: 1.5em; 167 | margin-top: 1em; 168 | } 169 | 170 | #pkgdown-sidebar h2:first-child { 171 | margin-top: 0; 172 | } 173 | 174 | #pkgdown-sidebar .list-unstyled li { 175 | margin-bottom: 0.5em; 176 | } 177 | 178 | /* bootstrap-toc 
tweaks ------------------------------------------------------*/ 179 | 180 | /* All levels of nav */ 181 | 182 | nav[data-toggle='toc'] .nav > li > a { 183 | padding: 4px 20px 4px 6px; 184 | font-size: 1.5rem; 185 | font-weight: 400; 186 | color: inherit; 187 | } 188 | 189 | nav[data-toggle='toc'] .nav > li > a:hover, 190 | nav[data-toggle='toc'] .nav > li > a:focus { 191 | padding-left: 5px; 192 | color: inherit; 193 | border-left: 1px solid #878787; 194 | } 195 | 196 | nav[data-toggle='toc'] .nav > .active > a, 197 | nav[data-toggle='toc'] .nav > .active:hover > a, 198 | nav[data-toggle='toc'] .nav > .active:focus > a { 199 | padding-left: 5px; 200 | font-size: 1.5rem; 201 | font-weight: 400; 202 | color: inherit; 203 | border-left: 2px solid #878787; 204 | } 205 | 206 | /* Nav: second level (shown on .active) */ 207 | 208 | nav[data-toggle='toc'] .nav .nav { 209 | display: none; /* Hide by default, but at >768px, show it */ 210 | padding-bottom: 10px; 211 | } 212 | 213 | nav[data-toggle='toc'] .nav .nav > li > a { 214 | padding-left: 16px; 215 | font-size: 1.35rem; 216 | } 217 | 218 | nav[data-toggle='toc'] .nav .nav > li > a:hover, 219 | nav[data-toggle='toc'] .nav .nav > li > a:focus { 220 | padding-left: 15px; 221 | } 222 | 223 | nav[data-toggle='toc'] .nav .nav > .active > a, 224 | nav[data-toggle='toc'] .nav .nav > .active:hover > a, 225 | nav[data-toggle='toc'] .nav .nav > .active:focus > a { 226 | padding-left: 15px; 227 | font-weight: 500; 228 | font-size: 1.35rem; 229 | } 230 | 231 | /* orcid ------------------------------------------------------------------- */ 232 | 233 | .orcid { 234 | font-size: 16px; 235 | color: #A6CE39; 236 | /* margins are required by official ORCID trademark and display guidelines */ 237 | margin-left:4px; 238 | margin-right:4px; 239 | vertical-align: middle; 240 | } 241 | 242 | /* Reference index & topics ----------------------------------------------- */ 243 | 244 | .ref-index th {font-weight: normal;} 245 | 246 | .ref-index 
td {vertical-align: top; min-width: 100px} 247 | .ref-index .icon {width: 40px;} 248 | .ref-index .alias {width: 40%;} 249 | .ref-index-icons .alias {width: calc(40% - 40px);} 250 | .ref-index .title {width: 60%;} 251 | 252 | .ref-arguments th {text-align: right; padding-right: 10px;} 253 | .ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px} 254 | .ref-arguments .name {width: 20%;} 255 | .ref-arguments .desc {width: 80%;} 256 | 257 | /* Nice scrolling for wide elements --------------------------------------- */ 258 | 259 | table { 260 | display: block; 261 | overflow: auto; 262 | } 263 | 264 | /* Syntax highlighting ---------------------------------------------------- */ 265 | 266 | pre, code, pre code { 267 | background-color: #f8f8f8; 268 | color: #333; 269 | } 270 | pre, pre code { 271 | white-space: pre-wrap; 272 | word-break: break-all; 273 | overflow-wrap: break-word; 274 | } 275 | 276 | pre { 277 | border: 1px solid #eee; 278 | } 279 | 280 | pre .img, pre .r-plt { 281 | margin: 5px 0; 282 | } 283 | 284 | pre .img img, pre .r-plt img { 285 | background-color: #fff; 286 | } 287 | 288 | code a, pre a { 289 | color: #375f84; 290 | } 291 | 292 | a.sourceLine:hover { 293 | text-decoration: none; 294 | } 295 | 296 | .fl {color: #1514b5;} 297 | .fu {color: #000000;} /* function */ 298 | .ch,.st {color: #036a07;} /* string */ 299 | .kw {color: #264D66;} /* keyword */ 300 | .co {color: #888888;} /* comment */ 301 | 302 | .error {font-weight: bolder;} 303 | .warning {font-weight: bolder;} 304 | 305 | /* Clipboard --------------------------*/ 306 | 307 | .hasCopyButton { 308 | position: relative; 309 | } 310 | 311 | .btn-copy-ex { 312 | position: absolute; 313 | right: 0; 314 | top: 0; 315 | visibility: hidden; 316 | } 317 | 318 | .hasCopyButton:hover button.btn-copy-ex { 319 | visibility: visible; 320 | } 321 | 322 | /* headroom.js ------------------------ */ 323 | 324 | .headroom { 325 | will-change: transform; 326 | transition: transform 
200ms linear; 327 | } 328 | .headroom--pinned { 329 | transform: translateY(0%); 330 | } 331 | .headroom--unpinned { 332 | transform: translateY(-100%); 333 | } 334 | 335 | /* mark.js ----------------------------*/ 336 | 337 | mark { 338 | background-color: rgba(255, 255, 51, 0.5); 339 | border-bottom: 2px solid rgba(255, 153, 51, 0.3); 340 | padding: 1px; 341 | } 342 | 343 | /* vertical spacing after htmlwidgets */ 344 | .html-widget { 345 | margin-bottom: 10px; 346 | } 347 | 348 | /* fontawesome ------------------------ */ 349 | 350 | .fab { 351 | font-family: "Font Awesome 5 Brands" !important; 352 | } 353 | 354 | /* don't display links in code chunks when printing */ 355 | /* source: https://stackoverflow.com/a/10781533 */ 356 | @media print { 357 | code a:link:after, code a:visited:after { 358 | content: ""; 359 | } 360 | } 361 | 362 | /* Section anchors --------------------------------- 363 | Added in pandoc 2.11: https://github.com/jgm/pandoc-templates/commit/9904bf71 364 | */ 365 | 366 | div.csl-bib-body { } 367 | div.csl-entry { 368 | clear: both; 369 | } 370 | .hanging-indent div.csl-entry { 371 | margin-left:2em; 372 | text-indent:-2em; 373 | } 374 | div.csl-left-margin { 375 | min-width:2em; 376 | float:left; 377 | } 378 | div.csl-right-inline { 379 | margin-left:2em; 380 | padding-left:1em; 381 | } 382 | div.csl-indent { 383 | margin-left: 2em; 384 | } 385 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | /* http://gregfranko.com/blog/jquery-best-practices/ */ 2 | (function($) { 3 | $(function() { 4 | 5 | $('.navbar-fixed-top').headroom(); 6 | 7 | $('body').css('padding-top', $('.navbar').height() + 10); 8 | $(window).resize(function(){ 9 | $('body').css('padding-top', $('.navbar').height() + 10); 10 | }); 11 | 12 | $('[data-toggle="tooltip"]').tooltip(); 13 | 14 | var cur_path = paths(location.pathname); 15 | var 
links = $("#navbar ul li a"); 16 | var max_length = -1; 17 | var pos = -1; 18 | for (var i = 0; i < links.length; i++) { 19 | if (links[i].getAttribute("href") === "#") 20 | continue; 21 | // Ignore external links 22 | if (links[i].host !== location.host) 23 | continue; 24 | 25 | var nav_path = paths(links[i].pathname); 26 | 27 | var length = prefix_length(nav_path, cur_path); 28 | if (length > max_length) { 29 | max_length = length; 30 | pos = i; 31 | } 32 | } 33 | 34 | // Add class to parent
  • , and enclosing
  • if in dropdown 35 | if (pos >= 0) { 36 | var menu_anchor = $(links[pos]); 37 | menu_anchor.parent().addClass("active"); 38 | menu_anchor.closest("li.dropdown").addClass("active"); 39 | } 40 | }); 41 | 42 | function paths(pathname) { 43 | var pieces = pathname.split("/"); 44 | pieces.shift(); // always starts with / 45 | 46 | var end = pieces[pieces.length - 1]; 47 | if (end === "index.html" || end === "") 48 | pieces.pop(); 49 | return(pieces); 50 | } 51 | 52 | // Returns -1 if not found 53 | function prefix_length(needle, haystack) { 54 | if (needle.length > haystack.length) 55 | return(-1); 56 | 57 | // Special case for length-0 haystack, since for loop won't run 58 | if (haystack.length === 0) { 59 | return(needle.length === 0 ? 0 : -1); 60 | } 61 | 62 | for (var i = 0; i < haystack.length; i++) { 63 | if (needle[i] != haystack[i]) 64 | return(i); 65 | } 66 | 67 | return(haystack.length); 68 | } 69 | 70 | /* Clipboard --------------------------*/ 71 | 72 | function changeTooltipMessage(element, msg) { 73 | var tooltipOriginalTitle=element.getAttribute('data-original-title'); 74 | element.setAttribute('data-original-title', msg); 75 | $(element).tooltip('show'); 76 | element.setAttribute('data-original-title', tooltipOriginalTitle); 77 | } 78 | 79 | if(ClipboardJS.isSupported()) { 80 | $(document).ready(function() { 81 | var copyButton = ""; 82 | 83 | $("div.sourceCode").addClass("hasCopyButton"); 84 | 85 | // Insert copy buttons: 86 | $(copyButton).prependTo(".hasCopyButton"); 87 | 88 | // Initialize tooltips: 89 | $('.btn-copy-ex').tooltip({container: 'body'}); 90 | 91 | // Initialize clipboard: 92 | var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { 93 | text: function(trigger) { 94 | return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, ""); 95 | } 96 | }); 97 | 98 | clipboardBtnCopies.on('success', function(e) { 99 | changeTooltipMessage(e.trigger, 'Copied!'); 100 | e.clearSelection(); 101 | }); 102 | 103 | 
clipboardBtnCopies.on('error', function() { 104 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 105 | }); 106 | }); 107 | } 108 | })(window.jQuery || window.$) 109 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 2.19.2 2 | pkgdown: 2.0.7 3 | pkgdown_sha: ~ 4 | articles: {} 5 | last_built: 2023-02-12T15:31Z 6 | 7 | -------------------------------------------------------------------------------- /docs/reference/EnumeratedCodeTable-class.html: -------------------------------------------------------------------------------- 1 | 2 | An S4 class to represent the code blocks as strings of integers — EnumeratedCodeTable-class • dupree 6 | 7 | 8 |
    9 |
    40 | 41 | 42 | 43 |
    44 |
    45 | 50 | 51 |
    52 |

    An S4 class to represent the code blocks as strings of integers

    53 |
    54 | 55 | 56 |
    57 |

    Slots

    58 | 59 | 60 |
    blocks
    61 |

    A tbl_df with columns `file`, `block`, `start_line` and 62 | `enumerated_code`

    63 | 64 | 65 |
    66 | 67 |
    68 | 71 |
    72 | 73 | 74 |
    77 | 78 |
    79 |

    Site built with pkgdown 2.0.7.

    80 |
    81 | 82 |
    83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /docs/reference/Rplot001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/russHyde/dupree/6be55893a839717d36e6b3e21bdd662951a59bf8/docs/reference/Rplot001.png -------------------------------------------------------------------------------- /docs/reference/as.data.frame.dups.html: -------------------------------------------------------------------------------- 1 | 2 | as.data.frame method for `dups` class — as.data.frame.dups • dupree 6 | 7 | 8 |
    9 |
    40 | 41 | 42 | 43 |
    44 |
    45 | 50 | 51 |
    52 |

    as.data.frame method for `dups` class

    53 |
    54 | 55 |
    56 |
    # S3 method for dups
    57 | as.data.frame(x, ...)
    58 |
    59 | 60 |
    61 |

    Arguments

    62 |
    x
    63 |

    any R object.

    64 | 65 | 66 |
    ...
    67 |

    additional arguments to be passed to or from methods.

    68 | 69 |
    70 | 71 |
    72 | 75 |
    76 | 77 | 78 |
    81 | 82 |
    83 |

    Site built with pkgdown 2.0.7.

    84 |
    85 | 86 |
    87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /docs/reference/as_tibble.dups.html: -------------------------------------------------------------------------------- 1 | 2 | convert a `dups` object to a `tibble` — as_tibble.dups • dupree 6 | 7 | 8 |
    9 |
    40 | 41 | 42 | 43 |
    44 |
    45 | 50 | 51 |
    52 |

    convert a `dups` object to a `tibble`

    53 |
    54 | 55 |
    56 |
    # S3 method for dups
    57 | as_tibble(x, ...)
    58 |
    59 | 60 |
    61 |

    Arguments

    62 |
    x
    63 |

    A data frame, list, matrix, or other object that could reasonably be 64 | coerced to a tibble.

    65 | 66 | 67 |
    ...
    68 |

    Unused, for extensibility.

    69 | 70 |
    71 | 72 |
    73 | 76 |
    77 | 78 | 79 |
    82 | 83 |
    84 |

    Site built with pkgdown 2.0.7.

    85 |
    86 | 87 |
    88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /docs/reference/dupree_dir.html: -------------------------------------------------------------------------------- 1 | 2 | Run duplicate-code detection over all R-files in a directory — dupree_dir • dupree 6 | 7 | 8 |
    9 |
    40 | 41 | 42 | 43 |
    44 |
    45 | 50 | 51 |
    52 |

    Run duplicate-code detection over all R-files in a directory

    53 |
    54 | 55 |
    56 |
    dupree_dir(
     57 |   path = ".",
     58 |   min_block_size = 40,
     59 |   filter = NULL,
     60 |   ...,
     61 |   recursive = TRUE
     62 | )
    63 |
    64 | 65 |
    66 |

    Arguments

    67 |
    path
    68 |

    A directory (By default the current working 69 | directory). All files in this directory that have a ".R", ".r" or ".Rmd" 70 | extension will be checked for code duplication.

    71 | 72 | 73 |
    min_block_size
    74 |

    dupree uses a notion of non-trivial 75 | symbols. These are the symbols / code-words that remain after filtering 76 | out really common symbols like <-, ,, etc. After filtering 77 | out these symbols from each code-block, only those blocks containing at 78 | least min_block_size symbols are used in the inter-block 79 | code-duplication measurement.

    80 | 81 | 82 |
    filter
    83 |

    A pattern for use in grep - this is used to keep 84 | only particular files: eg, filter = "classes" would compare files with 85 | `classes` in the filename

    86 | 87 | 88 |
    ...
    89 |

    Further arguments for grep. For example, `filter 90 | = "test", invert = TRUE` would disregard all files with `test` in the 91 | file-path.

    92 | 93 | 94 |
    recursive
    95 |

    Should we consider files in subdirectories as 96 | well?

    97 | 98 |
    99 |
    100 |

    See also

    101 |

    dupree

    102 |
    103 | 104 |
    105 | 108 |
    109 | 110 | 111 |
    114 | 115 |
    116 |

    Site built with pkgdown 2.0.7.

    117 |
    118 | 119 |
    120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /docs/reference/dupree_package.html: -------------------------------------------------------------------------------- 1 | 2 | Run duplicate-code detection over all files in the `R` directory of a 3 | package — dupree_package • dupree 9 | 10 | 11 |
    12 |
    43 | 44 | 45 | 46 |
    47 |
    48 | 54 | 55 |
    56 |

    The function fails if the path does not look like a typical R package (it 57 | should have both an R/ subdirectory and a DESCRIPTION file present).

    58 |
    59 | 60 |
    61 |
    dupree_package(package = ".", min_block_size = 40)
    62 |
    63 | 64 |
    65 |

    Arguments

    66 |
    package
    67 |

    The name or path to the package that is to be 68 | checked (By default the current working directory).

    69 | 70 | 71 |
    min_block_size
    72 |

    dupree uses a notion of non-trivial 73 | symbols. These are the symbols / code-words that remain after filtering 74 | out really common symbols like <-, ,, etc. After filtering 75 | out these symbols from each code-block, only those blocks containing at 76 | least min_block_size symbols are used in the inter-block 77 | code-duplication measurement.

    78 | 79 |
    80 |
    81 |

    See also

    82 |

    dupree

    83 |
    84 | 85 |
    86 | 89 |
    90 | 91 | 92 |
    95 | 96 |
    97 |

    Site built with pkgdown 2.0.7.

    98 |
    99 | 100 |
    101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /docs/reference/index.html: -------------------------------------------------------------------------------- 1 | 2 | Function reference • dupree 6 | 7 | 8 |
    9 |
    40 | 41 | 42 | 43 |
    44 |
    45 | 48 | 49 | 53 | 56 | 57 | 60 | 61 | 64 | 65 | 68 | 69 | 72 | 73 | 76 | 78 | 81 | 82 |
    50 |

    All functions

    51 |

    52 |
    54 |

    EnumeratedCodeTable-class

    55 |

    An S4 class to represent the code blocks as strings of integers

    58 |

    as.data.frame(<dups>)

    59 |

    as.data.frame method for `dups` class

    62 |

    as_tibble(<dups>)

    63 |

    convert a `dups` object to a `tibble`

    66 |

    dupree()

    67 |

    Detect code duplication between the code-blocks in a set of files

    70 |

    dupree_dir()

    71 |

    Run duplicate-code detection over all R-files in a directory

    74 |

    dupree_package()

    75 |

    Run duplicate-code detection over all files in the `R` directory of a 77 | package

    79 |

    print(<dups>)

    80 |

    print method for `dups` class

    83 | 84 | 87 |
    88 | 89 | 90 |
    93 | 94 |
    95 |

    Site built with pkgdown 2.0.7.

    96 |
    97 | 98 |
    99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /docs/reference/print.dups.html: -------------------------------------------------------------------------------- 1 | 2 | print method for `dups` class — print.dups • dupree 6 | 7 | 8 |
    9 |
    40 | 41 | 42 | 43 |
    44 |
    45 | 50 | 51 |
    52 |

    print method for `dups` class

    53 |
    54 | 55 |
    56 |
    # S3 method for dups
    57 | print(x, ...)
    58 |
    59 | 60 |
    61 |

    Arguments

    62 |
    x
    63 |

    an object used to select a method.

    64 | 65 | 66 |
    ...
    67 |

    further arguments passed to or from other methods.

    68 | 69 |
    70 | 71 |
    72 | 75 |
    76 | 77 | 78 |
    81 | 82 |
    83 |

    Site built with pkgdown 2.0.7.

    84 |
    85 | 86 |
    87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /docs/reference/reexports.html: -------------------------------------------------------------------------------- 1 | 2 | Objects exported from other packages — reexports • dupree 13 | 14 | 15 |
    16 |
    47 | 48 | 49 | 50 |
    51 |
    52 | 57 | 58 |
    59 |

    These objects are imported from other packages. Follow the links 60 | below to see their documentation.

    61 |
    tibble
    62 |

    as_tibble

    63 | 64 | 65 |
    66 | 67 | 68 | 69 |
    70 | 73 |
    74 | 75 | 76 |
    79 | 80 |
    81 |

    Site built with pkgdown 2.0.7.

    82 |
    83 | 84 |
    85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /docs/sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | /404.html 5 | 6 | 7 | /LICENSE-text.html 8 | 9 | 10 | /TODO.html 11 | 12 | 13 | /authors.html 14 | 15 | 16 | /index.html 17 | 18 | 19 | /news/index.html 20 | 21 | 22 | /reference/EnumeratedCodeTable-class.html 23 | 24 | 25 | /reference/as.data.frame.dups.html 26 | 27 | 28 | /reference/as_tibble.dups.html 29 | 30 | 31 | /reference/dupree.html 32 | 33 | 34 | /reference/dupree_dir.html 35 | 36 | 37 | /reference/dupree_package.html 38 | 39 | 40 | /reference/index.html 41 | 42 | 43 | /reference/print.dups.html 44 | 45 | 46 | /reference/reexports.html 47 | 48 | 49 | -------------------------------------------------------------------------------- /dupree.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | LineEndingConversion: Posix 18 | 19 | BuildType: Package 20 | PackageUseDevtools: Yes 21 | PackageInstallArgs: --no-multiarch --with-keep.source 22 | PackageRoxygenize: rd,collate,namespace 23 | -------------------------------------------------------------------------------- /inst/extdata/duplicated.R: -------------------------------------------------------------------------------- 1 | # Example script for illustrating code duplication 2 | library(dplyr) 3 | data(diamonds) 4 | 5 | diamonds %>% 6 | filter(clarity %in% c("SI1", "SI2")) %>% 7 | group_by(color) %>% 8 | summarise(m_price = mean(price), sd_price = sd(price)) 9 | 10 | diamonds %>% 11 | filter(cut >= "Very Good") %>% 12 | 
group_by(color) %>% 13 | summarise(m_price = mean(price), sd_price = sd(price)) 14 | 15 | 16 | # note that dupree can't tell that the following code is logically 17 | # the same as the preceding code 18 | summarise( 19 | group_by( 20 | filter(diamonds, cut >= "Very Good"), 21 | color 22 | ), 23 | sd_price = sd(price), 24 | m_price = mean(price) 25 | ) 26 | -------------------------------------------------------------------------------- /man/EnumeratedCodeTable-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dupree_classes.R 3 | \docType{class} 4 | \name{EnumeratedCodeTable-class} 5 | \alias{EnumeratedCodeTable-class} 6 | \title{An S4 class to represent the code blocks as strings of integers} 7 | \description{ 8 | An S4 class to represent the code blocks as strings of integers 9 | } 10 | \section{Slots}{ 11 | 12 | \describe{ 13 | \item{\code{blocks}}{A tbl_df with columns `file`, `block`, `start_line` and 14 | `enumerated_code`} 15 | }} 16 | 17 | -------------------------------------------------------------------------------- /man/as.data.frame.dups.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dups-class.R 3 | \name{as.data.frame.dups} 4 | \alias{as.data.frame.dups} 5 | \title{as.data.frame method for `dups` class} 6 | \usage{ 7 | \method{as.data.frame}{dups}(x, ...) 
8 | } 9 | \arguments{ 10 | \item{x}{any \R object.} 11 | 12 | \item{...}{additional arguments to be passed to or from methods.} 13 | } 14 | \description{ 15 | as.data.frame method for `dups` class 16 | } 17 | -------------------------------------------------------------------------------- /man/as_tibble.dups.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dups-class.R 3 | \name{as_tibble.dups} 4 | \alias{as_tibble.dups} 5 | \title{convert a `dups` object to a `tibble`} 6 | \usage{ 7 | \method{as_tibble}{dups}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{A data frame, list, matrix, or other object that could reasonably be 11 | coerced to a tibble.} 12 | 13 | \item{...}{Unused, for extensibility.} 14 | } 15 | \description{ 16 | convert a `dups` object to a `tibble` 17 | } 18 | -------------------------------------------------------------------------------- /man/dupree.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dupree.R 3 | \name{dupree} 4 | \alias{dupree} 5 | \title{Detect code duplication between the code-blocks in a set of files} 6 | \usage{ 7 | dupree(files, min_block_size = 40, ...) 8 | } 9 | \arguments{ 10 | \item{files}{A set of files over which code-duplication 11 | should be measured.} 12 | 13 | \item{min_block_size}{\code{dupree} uses a notion of non-trivial 14 | symbols. These are the symbols / code-words that remain after filtering 15 | out really common symbols like \code{<-}, \code{,}, etc. After filtering 16 | out these symbols from each code-block, only those blocks containing at 17 | least \code{min_block_size} symbols are used in the inter-block 18 | code-duplication measurement.} 19 | 20 | \item{...}{Unused at present.} 21 | } 22 | \value{ 23 | A \code{tibble}. 
Each row in the table summarises the 24 | comparison between two code-blocks (block 'a' and block 'b') in the input 25 | files. Each code-block in the pair is indicated by: i) the file 26 | (\code{file_a} / \code{file_b}) that contains it; ii) its position within 27 | that file (\code{block_a} / \code{block_b}; 1 being the first code-block in 28 | a given file); and iii) the line where that code-block starts in that file 29 | (\code{line_a} / \code{line_b}). The pairs of code-blocks are ordered by 30 | decreasing similarity. Any match that is returned is either the top hit for 31 | block 'a' or for block 'b' (or both). 32 | } 33 | \description{ 34 | This function identifies all code-blocks in a set of files and then computes 35 | a similarity score between those code-blocks to help identify functions / 36 | classes that have a high level of duplication, and could possibly be 37 | refactored. 38 | } 39 | \details{ 40 | Code-blocks under a size threshold are disregarded before analysis (the size 41 | threshold is controlled by \code{min_block_size}); and only top-level code 42 | blocks are considered. 43 | 44 | Every sufficiently large code-block in the input files will be present in 45 | the results at least once. If code-block X and code-block Y are present in 46 | a row of the resulting data-frame, then either X is the closest match to Y, 47 | or Y is the closest match to X (or possibly both) according to the 48 | similarity score; as such, some code-blocks may be present multiple times in 49 | the results. 50 | 51 | Similarity between code-blocks is calculated using the 52 | longest-common-subsequence (\code{lcs}) measure from the package 53 | \code{stringdist}. This measure is applied to a tokenised version of the 54 | code-blocks. 
That is, each function name / operator / variable in the code 55 | blocks is converted to a unique integer so that a code-block can be 56 | represented as a vector of integers and the \code{lcs} measure is applied to 57 | each pair of these vectors. 58 | } 59 | \examples{ 60 | # To quantify duplication between the top-level code-blocks in a file 61 | example_file <- system.file("extdata", "duplicated.R", package = "dupree") 62 | dup <- dupree(example_file, min_block_size = 10) 63 | dup 64 | 65 | # For the block-pair with the highest duplication, we print the first four 66 | # lines: 67 | readLines(example_file)[dup$line_a[1] + c(0:3)] 68 | readLines(example_file)[dup$line_b[1] + c(0:3)] 69 | 70 | # The code-blocks in the example file are rather small, so if 71 | # `min_block_size` is too large, none of the code-blocks will be analysed 72 | # and the results will be empty: 73 | dupree(example_file, min_block_size = 40) 74 | } 75 | -------------------------------------------------------------------------------- /man/dupree_dir.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dupree.R 3 | \name{dupree_dir} 4 | \alias{dupree_dir} 5 | \title{Run duplicate-code detection over all R-files in a directory} 6 | \usage{ 7 | dupree_dir( 8 | path = ".", 9 | min_block_size = 40, 10 | filter = NULL, 11 | ..., 12 | recursive = TRUE 13 | ) 14 | } 15 | \arguments{ 16 | \item{path}{A directory (By default the current working 17 | directory). All files in this directory that have a ".R", ".r" or ".Rmd" 18 | extension will be checked for code duplication.} 19 | 20 | \item{min_block_size}{\code{dupree} uses a notion of non-trivial 21 | symbols. These are the symbols / code-words that remain after filtering 22 | out really common symbols like \code{<-}, \code{,}, etc. 
After filtering 23 | out these symbols from each code-block, only those blocks containing at 24 | least \code{min_block_size} symbols are used in the inter-block 25 | code-duplication measurement.} 26 | 27 | \item{filter}{A pattern for use in grep - this is used to keep 28 | only particular files: eg, filter = "classes" would compare files with 29 | `classes` in the filename} 30 | 31 | \item{...}{Further arguments for grep. For example, `filter 32 | = "test", invert = TRUE` would disregard all files with `test` in the 33 | file-path.} 34 | 35 | \item{recursive}{Should we consider files in subdirectories as 36 | well?} 37 | } 38 | \description{ 39 | Run duplicate-code detection over all R-files in a directory 40 | } 41 | \seealso{ 42 | dupree 43 | } 44 | -------------------------------------------------------------------------------- /man/dupree_package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dupree.R 3 | \name{dupree_package} 4 | \alias{dupree_package} 5 | \title{Run duplicate-code detection over all files in the `R` directory of a 6 | package} 7 | \usage{ 8 | dupree_package(package = ".", min_block_size = 40) 9 | } 10 | \arguments{ 11 | \item{package}{The name or path to the package that is to be 12 | checked (By default the current working directory).} 13 | 14 | \item{min_block_size}{\code{dupree} uses a notion of non-trivial 15 | symbols. These are the symbols / code-words that remain after filtering 16 | out really common symbols like \code{<-}, \code{,}, etc. After filtering 17 | out these symbols from each code-block, only those blocks containing at 18 | least \code{min_block_size} symbols are used in the inter-block 19 | code-duplication measurement.} 20 | } 21 | \description{ 22 | The function fails if the path does not look like a typical R package (it 23 | should have both an R/ subdirectory and a DESCRIPTION file present). 
24 | } 25 | \seealso{ 26 | dupree 27 | } 28 | -------------------------------------------------------------------------------- /man/print.dups.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dups-class.R 3 | \name{print.dups} 4 | \alias{print.dups} 5 | \title{print method for `dups` class} 6 | \usage{ 7 | \method{print}{dups}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{an object used to select a method.} 11 | 12 | \item{...}{further arguments passed to or from other methods.} 13 | } 14 | \description{ 15 | print method for `dups` class 16 | } 17 | -------------------------------------------------------------------------------- /man/reexports.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dups-class.R 3 | \docType{import} 4 | \name{reexports} 5 | \alias{reexports} 6 | \alias{as_tibble} 7 | \title{Objects exported from other packages} 8 | \keyword{internal} 9 | \description{ 10 | These objects are imported from other packages. Follow the links 11 | below to see their documentation. 
12 | 13 | \describe{ 14 | \item{tibble}{\code{\link[tibble]{as_tibble}}} 15 | }} 16 | 17 | -------------------------------------------------------------------------------- /presentations/cleanish_code.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Clean(ish) Code: `dupree`" 3 | subtitle: "Presented to EdinbR R-Users Group, 2019-07-17" 4 | author: "Russ Hyde, University of Glasgow" 5 | output: 6 | ioslides_presentation: 7 | df_print: paged 8 | --- 9 | 10 | 17 | 18 | 27 | 28 | 29 | 30 | 46 | 47 | 48 | 49 | ```{r, echo = FALSE} 50 | suppressPackageStartupMessages({ 51 | library(dplyr) 52 | library(magrittr) 53 | library(tidygraph) 54 | library(ggraph) 55 | library(stringdist) 56 | }) 57 | ``` 58 | 59 | 60 | 61 | ## Background and Links: 62 | 63 | ### Today's packages 64 | 65 | - [https://github.com/russHyde/dupree](https://github.com/russHyde/dupree) 66 | - [https://github.com/jimhester/lintr](https://github.com/jimhester/lintr) 67 | - [`https://github.com/ropensci/git2r`](https://github.com/ropensci/git2r) 68 | 69 | ### Me 70 | 71 | - [https://twitter.com/haematobot](https://twitter.com/haematobot) 72 | - [https://russ-hyde.rbind.io/](https://russ-hyde.rbind.io/) 73 | - [Paul O'Gorman Leukaemia Research Centre]( 74 | https://www.gla.ac.uk/connect/supportus/medicalfund/paulogormanleukaemiaresearchcentre/ 75 | ) 76 | 77 | ## Preamble 78 | 79 | ```{r} 80 | # Dependencies: 81 | # - lintr, dplyr, purrr, tibble, magrittr, methods, stringdist 82 | if (!"dupree" %in% installed.packages()) { 83 | require("devtools") 84 | devtools::install_github( 85 | repo = "russHyde/dupree", dependencies = FALSE 86 | ) 87 | } 88 | 89 | suppressPackageStartupMessages({ 90 | library(lintr) 91 | library(dupree) 92 | library(git2r) 93 | }) 94 | ``` 95 | 96 | ## Code Smells & Architectural Ideals 97 | 98 | _"The most common design problems result from code that_ 99 | 100 | - _Is duplicated_ 101 | 102 | - _Is unclear_ 103 | 104 | - _Is 
complicated"_ 105 | 106 | Quote: Kerievsky 'Refactoring to Patterns' 107 | 108 | See also 109 | Fowler 'Refactoring', 110 | Martin 'Clean Code' 111 | and Jenny Bryan's talk 'Code smells and feels' 112 | 113 | ## Types of duplication 114 | 115 | ```{r, echo = FALSE} 116 | url <- "https://static.fjcdn.com/large/pictures/9d/b7/9db733_1672275.gif" 117 | ``` 118 | 119 | ![](`r url`){ width=80% } 120 | 121 | - Trivial stuff (`library(dplyr)`) 122 | 123 | - Copy/paste-driven development (similar logic & code) 124 | 125 | - Functional duplication (same logic, different code) 126 | 127 | - ? False duplication (different logic, similar code) 128 | 129 | ## How to detect duplication? 130 | 131 | - Python 132 | - `pylint` (looks for identical _lines_ between files) 133 | 134 | - Java / C++ / C# etc 135 | - lots of choice (code structure / identity) 136 | 137 | - R: nothing for source code (AFAIK) 138 | 139 | - String / Sequence similarity: `stringdist` 140 | - Text analysis: `ropensci:textreuse` 141 | - (But tools like: `goodpractice`, `lintr`, `styler`, `cyclocomp`, `pkgnet`) 142 | 143 | ## `dupree` 144 | 145 | - [https://github.com/russHyde/dupree](https://github.com/russHyde/dupree) 146 | 147 | - All community input is welcome 148 | 149 | - Most data input is welcome: 150 | - sets of files (`dupree()`) 151 | - a directory (`dupree_dir()`) 152 | - or a package (`dupree_package()`) 153 | 154 | 155 | ## Duplication in a script 156 | 157 | ```{r} 158 | # min_block_size: used to prevent dupree analysing really small code blocks 159 | dupree("duplication_heavy.R", min_block_size = 3) %>% 160 | dplyr::select(-file_a, -file_b) 161 | ``` 162 | 163 | ## Duplication in a script (cont.) 
{.smaller} 164 | 165 | ```{r, code = readLines("duplication_heavy.R"), eval = FALSE} 166 | ``` 167 | 168 | 212 | 213 | ## Mechanics 214 | 215 | Longest Common Subsequence 216 | ```{r} 217 | # breakf-a---st 218 | # break-dance-- 219 | stringdist::stringdist("breakfast", "breakdance", method = "lcs") 220 | ``` 221 | Code blocks 222 | 223 | -> Sentences of function / variable names 224 | 225 | -> "Sentences" of integers 226 | 227 | -> Compute similarity score based on longest-common-subsequence 228 | 229 | ## Mechanics (cont.) 230 | 231 | Use `seq_sim` to compute LCS-based similarity between vectors of integers 232 | 233 | ```{r} 234 | to_ints <- function(word){ 235 | as.integer(factor(strsplit(word, "")[[1]], levels = letters)) 236 | } 237 | 238 | to_ints("breakfast") 239 | ``` 240 | 241 | ```{r} 242 | stringdist::seq_sim( 243 | list(to_ints("breakfast")), list(to_ints("breakdance")), method = "lcs" 244 | ) # 1 - |LCS| / (|seq1| + |seq2|) 245 | ``` 246 | 247 | ## Duplication in a package 248 | 249 | Downloaded the source code for `lintr` from github using 250 | [`ropensci/git2r`](https://github.com/ropensci/git2r). 
251 | 252 | ```{r, message = FALSE, results="hide"} 253 | # temporary dir for storing `lintr`'s source code 254 | lintr_path <- file.path(tempdir(), "lintr") 255 | lintr_repo <- git2r::clone( 256 | "https://github.com/jimhester/lintr", 257 | lintr_path 258 | ) 259 | ``` 260 | 261 | ## Duplication in a package (cont) 262 | 263 | Ran dupree on `lintr` 264 | 265 | ```{r} 266 | dups <- dupree::dupree_package( 267 | lintr_path, min_block_size = 40 268 | ) 269 | ``` 270 | 271 | ```{r, fig.height=3, echo = FALSE} 272 | ggplot2::qplot( 273 | x = seq(nrow(dups)), y = dups[["score"]], 274 | xlab = "Index", 275 | ylab = "Similarity score:\nversus closest matching block") 276 | ``` 277 | 278 | ## Duplication in a package (cont) {.smaller} 279 | 280 | ```{r} 281 | dups %>% 282 | dplyr::filter(score > 0.4 & file_a != file_b) %>% 283 | dplyr::mutate_at(c("file_a", "file_b"), basename) %>% 284 | head() 285 | ``` 286 | 287 | [GOTO: `equals_na_lintr.R`](https://github.com/jimhester/lintr/blob/master/R/equals_na_lintr.R) 288 | 289 | ## Visualisation of duplication results 290 | 291 | We make a tidygraph structure from the similarity scores 292 | 293 | ```{r} 294 | dup_graph <- dups %>% 295 | # keep code-block pairs with moderate similarity: 296 | dplyr::filter(score > 0.4) %>% 297 | dplyr::transmute( 298 | # indicate code-blocks by filename and start-line 299 | from = paste(basename(file_a), line_a), 300 | to = paste(basename(file_b), line_b), 301 | type = "duplication", 302 | score = score 303 | ) %>% 304 | tidygraph::as_tbl_graph() %>% 305 | # distinguish the file each code block came from 306 | mutate(filename = gsub("(.*) \\d+$", "\\1", name)) 307 | ``` 308 | 309 | 367 | 368 | ## Visualisation of duplication results (cont) 369 | 370 | ```{r} 371 | graph_image <- dup_graph %>% 372 | ggraph(layout = "gem") + 373 | geom_edge_link( 374 | aes(colour = type, edge_width = score) 375 | ) + 376 | geom_node_point( 377 | aes(colour = filename), size = 4, show.legend = FALSE 378 | ) + 379 | 
theme_graph() 380 | ``` 381 | 382 | ## Visualisation of duplication results (cont) 383 | 384 | ```{r} 385 | graph_image 386 | ``` 387 | 388 | ## Visualisation of duplication results (cont) 389 | 390 | ```{r} 391 | graph_image + 392 | geom_node_text(aes(label = name), repel = TRUE) 393 | ``` 394 | 395 | ## What was `lintr` by the way? 396 | 397 | - style / syntax checker for _R_ 398 | 399 | - configurable 400 | 401 | - can be run 402 | 403 | - in Rstudio / vim / atom etc 404 | 405 | - or on Travis 406 | 407 | - (and dupree uses lintr's file parsers) 408 | 409 | ## refactoRing 410 | 411 | - Improving the structure of code (without modifying its function) 412 | 413 | - The rule of 3 414 | 415 | - Examples 416 | 417 | - Figures: Global theming / %+% 418 | 419 | - Statements: Replace with function call 420 | 421 | - Common functions: Move to a package 422 | 423 | - RMarkdown: Configurable reports / child-stubs 424 | 425 | # Thanks 426 | -------------------------------------------------------------------------------- /presentations/duplication_heavy.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | data(diamonds) 3 | 4 | diamonds %>% 5 | filter(clarity %in% c("SI1", "SI2")) %>% 6 | group_by(color) %>% 7 | summarise(m_price = mean(price), sd_price = sd(price)) 8 | 9 | diamonds %>% 10 | filter(cut >= "Very Good") %>% 11 | group_by(color) %>% 12 | summarise(m_price = mean(price), sd_price = sd(price)) 13 | 14 | 15 | # note that dupree can't tell that the following code is logically 16 | # the same as the preceding code 17 | summarise( 18 | group_by( 19 | filter(diamonds, cut >= "Very Good"), 20 | color 21 | ), 22 | sd_price = sd(price), 23 | m_price = mean(price) 24 | ) 25 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(dupree) 3 | 4 | 
test_check("dupree") 5 | -------------------------------------------------------------------------------- /tests/testthat/helper.R: -------------------------------------------------------------------------------- 1 | #' Since you can't compare two `tbl_df` objects when they contain a list as a 2 | #' column using expect_equal or all.equal 3 | #' 4 | expect_equal_tbl <- function(object, expected, ..., info = NULL) { 5 | act <- testthat::quasi_label(rlang::enquo(object), arg = "object") 6 | exp <- testthat::quasi_label(rlang::enquo(expected), arg = "expected") 7 | 8 | # all.equal.list is slightly problematic: it returns TRUE for match, and 9 | # returns a character vector when differences are observed. We extract 10 | # both a match-indicator and a failure message 11 | 12 | diffs <- all.equal.list(object, expected, ...) 13 | has_diff <- if (is.logical(diffs)) diffs else FALSE 14 | diff_msg <- paste(diffs, collapse = "\n") 15 | 16 | testthat::expect( 17 | has_diff, 18 | failure_message = sprintf( 19 | "%s not equal to %s.\n%s", act$lab, exp$lab, diff_msg 20 | ), 21 | info = info 22 | ) 23 | 24 | invisible(act$val) 25 | } 26 | 27 | expect_equivalent_tbl <- function(object, expected, ..., info = NULL) { 28 | expect_equal_tbl( 29 | object, expected, ..., 30 | check.attributes = FALSE, info = info 31 | ) 32 | } 33 | 34 | get_dups_tbl <- function( 35 | ... 36 | ) { 37 | empty_tbl <- tibble::tibble( 38 | file_a = character(0), 39 | file_b = character(0), 40 | block_a = integer(0), 41 | block_b = integer(0), 42 | line_a = integer(0), 43 | line_b = integer(0), 44 | score = numeric(0) 45 | ) 46 | 47 | user_tbl <- tibble::tibble(...) 
48 | 49 | common_cols <- intersect(colnames(user_tbl), colnames(empty_tbl)) 50 | 51 | if (length(common_cols) == 0) { 52 | return(dplyr::cross_join(user_tbl, empty_tbl)) 53 | } 54 | 55 | dplyr::left_join( 56 | user_tbl, 57 | empty_tbl, 58 | by = common_cols 59 | ) 60 | } 61 | 62 | get_empty_dups_df <- function() { 63 | as.data.frame(get_dups_tbl(), stringsAsFactors = FALSE) 64 | } 65 | 66 | ############################################################################### 67 | -------------------------------------------------------------------------------- /tests/testthat/test-dupree_dir_integration.R: -------------------------------------------------------------------------------- 1 | context("Integration tests for `dupree_dir()`") 2 | 3 | test_that("`dupree_dir` fails early when passed a nonexisting dir", { 4 | expect_error( 5 | dupree_dir(file.path("testdata", "not-a-dir")), 6 | regexp = "does not exist", 7 | info = "dir passed to `dupree_dir` should exist" 8 | ) 9 | }) 10 | 11 | test_that("All .R files in subdirs are assessed by dupree_dir()", { 12 | # the test-package "anRpackage" contains 13 | # - ./R/anRpackage-internal.R 14 | # - and ./inst/dir1/R/dont_dup_me.R 15 | # - both of these files should be included by `dupree_dir()` 16 | package <- file.path("testdata", "anRpackage") 17 | r_content <- c( 18 | file.path("R", "anRpackage-internal.R"), 19 | file.path("inst", "dir1", "R", "dont_dup_me.R") 20 | ) 21 | expect_is( 22 | dupree_dir(package), 23 | "dups", 24 | info = "dupree_dir should return with class `dups`" 25 | ) 26 | expect_silent( 27 | dupree_dir(package) 28 | ) 29 | 30 | dups <- as.data.frame(dupree_dir(package)) 31 | observed_files <- unique(c(dups$file_a, dups$file_b)) 32 | expected_files <- file.path(package, r_content) 33 | expect_equal( 34 | sort(observed_files), 35 | sort(expected_files) 36 | ) 37 | }) 38 | -------------------------------------------------------------------------------- /tests/testthat/test-dupree_integration.R: 
-------------------------------------------------------------------------------- 1 | test_that("multiplication works", { 2 | expect_is( 3 | dupree( 4 | files = file.path("testdata", "anRpackage", "R", "anRpackage-internal.R") 5 | ), 6 | "dups" 7 | ) 8 | }) 9 | -------------------------------------------------------------------------------- /tests/testthat/test-dupree_package_integration.R: -------------------------------------------------------------------------------- 1 | context("Integration tests for `dupree_package`") 2 | 3 | test_that("`dupree_package` results only include files from /R/", { 4 | # the test-package "anRpackage" contains 5 | # - ./R/anRpackage-internal.R 6 | # - and ./inst/dir1/R/dont_dup_me.R 7 | # - the latter should not be included by `dupree_package` by default 8 | expect_is( 9 | dupree_package(file.path("testdata", "anRpackage")), "dups" 10 | ) 11 | expect_silent( 12 | dupree_package(file.path("testdata", "anRpackage")) 13 | ) 14 | 15 | dups <- as.data.frame(dupree_package(file.path("testdata", "anRpackage"))) 16 | files <- unique(c(dups$file_a, dups$file_b)) 17 | expect_equal( 18 | files, 19 | file.path("testdata", "anRpackage", "R", "anRpackage-internal.R") 20 | ) 21 | }) 22 | 23 | test_that("`dupree_package` fails early when passed a nonexisting dir", { 24 | expect_error( 25 | dupree_package(file.path("testdata", "not_a_dir")), 26 | regexp = "does not exist", 27 | info = "dir passed to `dupree_package` should exist" 28 | ) 29 | }) 30 | 31 | test_that("`dupree_package` fails when passed a non-R package structure", { 32 | 33 | # There must be a DESCRIPTION file present 34 | d <- tempfile(pattern = "not_an_r_package") 35 | dir.create(d) 36 | dir.create(file.path(d, "R")) 37 | 38 | expect_error( 39 | dupree_package(d), 40 | regexp = "not an R package", 41 | info = "DESCRIPTION must be present in the path passed to dupree_package" 42 | ) 43 | 44 | # There must be an R/ subdirectory present 45 | d <- tempfile(pattern = "not_an_r_package") 46 | 
dir.create(d) 47 | file.create(file.path(d, "DESCRIPTION")) 48 | 49 | expect_error( 50 | dupree_package(d), 51 | regexp = "not an R package", 52 | info = "R/ subdir must be present in the path passed to dupree_package" 53 | ) 54 | }) 55 | -------------------------------------------------------------------------------- /tests/testthat/test-dups-class.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("`dups` object can be converted to `data.frame`", { 3 | x <- get_empty_dups_df() 4 | dups <- as_dups(x) 5 | 6 | expect_equal( 7 | as.data.frame(dups), 8 | x, 9 | info = "conversion `dups` -> `data.frame`" 10 | ) 11 | 12 | expect_equal( 13 | as_dups(as_dups(x)), 14 | as_dups(x), 15 | info = "dups -> dups conversion is an identity map" 16 | ) 17 | 18 | y <- get_dups_tbl() 19 | dups <- as_dups(y) 20 | 21 | expect_equal( 22 | as_tibble(dups), 23 | y, 24 | info = "conversion `dups` -> `tibble`" 25 | ) 26 | }) 27 | 28 | test_that("non dups/data-frames can't be converted to dups", { 29 | expect_error(as_dups("NOT A data.frame or dups object")) 30 | }) 31 | 32 | describe("printing a 'dups' object", { 33 | dups <- as_dups(get_dups_tbl(file_a = paste0(letters, ".R"))) 34 | 35 | it("includes the first line in the output", { 36 | expect_output(print(dups), regexp = "a\\.R") 37 | }) 38 | 39 | it("respects print(tibble, n = ...)", { 40 | # "z.R" is on the last line of the table, it shouldn't be visible by default 41 | # because `print(tibble)` shows the first 10 lines for large tables 42 | expect_output(print(dups), regexp = "[^z].\\R") 43 | # But when 26 lines of the table are printed, then the file "z.R" should be 44 | # seen 45 | expect_output(print(dups, n = 26), regexp = "z\\.R") 46 | }) 47 | }) 48 | -------------------------------------------------------------------------------- /tests/testthat/test_dupree.R: -------------------------------------------------------------------------------- 1 | 
############################################################################### 2 | 3 | context("Tests duplicated code analysis functions") 4 | 5 | ############################################################################### 6 | 7 | .make_exprs <- function(strs) { 8 | # Convert a vector of strings into an source_expressions structure for use 9 | # in parse_code_blocks 10 | exprs <- setNames( 11 | Map( 12 | function(s) { 13 | list( 14 | content = s, 15 | parsed_content = getParseData(parse(text = s)) 16 | ) 17 | }, 18 | strs 19 | ), 20 | NULL 21 | ) 22 | 23 | list( 24 | expressions = exprs, 25 | error = character(), 26 | lines = list() 27 | ) 28 | } 29 | 30 | 31 | ############################################################################### 32 | -------------------------------------------------------------------------------- /tests/testthat/test_dupree_classes.R: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | 3 | context("Tests for classes in `dupree` package") 4 | 5 | ############################################################################### 6 | 7 | # - Note that we use seq(4) rather than 1:4 to define the integer-sequence for 8 | # alignment by stringdist::seq_sim; there is an issue with lazily-evaluated 9 | # values in tests on stringdist - see NEWS for version 0.9.5.4 of stringdist 10 | 11 | ############################################################################### 12 | 13 | test_that("EnumeratedCodeTable: construction / validity", { 14 | expect_is( 15 | new("EnumeratedCodeTable"), 16 | "EnumeratedCodeTable", 17 | info = "Constructor for EnumeratedCodeTable" 18 | ) 19 | 20 | expect_error( 21 | new("EnumeratedCodeTable", blocks = tibble()), 22 | info = paste( 23 | "EnumeratedCodeTable should have `file`, `block`, `start_line` and", 24 | "`enumerated_code columns`" 25 | ) 26 | ) 27 | 28 | default_blocks <- tibble::tibble( 29 | 
file = character(0), block = integer(0), start_line = integer(0), 30 | enumerated_code = list() 31 | ) 32 | 33 | expect_equal_tbl( 34 | new("EnumeratedCodeTable")@blocks, 35 | default_blocks, 36 | info = paste( 37 | "Default 'blocks' entry should have no rows, and have", 38 | "file|block|start_line|enumerated_code columns" 39 | ) 40 | ) 41 | 42 | my_blocks <- tibble::tibble( 43 | file = "a", 44 | block = 1, 45 | start_line = 1, 46 | enumerated_code = list(as.integer(c(1, 2, 3, 4, 5))) 47 | ) 48 | 49 | expect_equal_tbl( 50 | new("EnumeratedCodeTable", my_blocks)@blocks, 51 | my_blocks, 52 | info = paste( 53 | "'blocks' entry should match the defining data-frame" 54 | ) 55 | ) 56 | }) 57 | 58 | ############################################################################### 59 | 60 | test_that("EnumeratedCodeTable: find_best_match_for_single_block", { 61 | # TODO: 62 | }) 63 | 64 | test_that("EnumeratedCodeTable: find_best_matches", { 65 | # - return at most 1 best-hit for each block 66 | 67 | empty_results <- tibble::tibble( 68 | file_a = character(0), file_b = character(0), block_a = integer(0), 69 | block_b = integer(0), line_a = integer(0), line_b = integer(0), 70 | score = numeric(0) 71 | ) 72 | 73 | # Where block X is the best match for block Y and vice versa, dupree should 74 | # only return a single line: there is no value in reporting 75 | # file_a file_b block_a block_b line_a line_b score 76 | # X X 1 2 10 20 0 77 | # X X 2 1 20 10 0 78 | # # matches the preceding line 79 | # X Y 30 10 350 150 0.5236 80 | # Y X 10 30 150 350 0.5236 81 | # # matches the preceding line (non-matchable score) 82 | 83 | # When results are replicated like this, sort file/block alphanumerically: 84 | # so file_a <= file_b and block_a <= block_b 85 | 86 | # No overlap between the symbols in the two code-blocks 87 | my_blocks <- tibble::tibble( 88 | file = "a", 89 | block = as.integer(c(1, 2)), 90 | start_line = as.integer(c(1, 2)), 91 | enumerated_code = list( 92 | as.integer(c(1, 
2, 3)), 93 | 4L 94 | ) 95 | ) 96 | my_code_table <- new("EnumeratedCodeTable", my_blocks) 97 | 98 | expect_equal( 99 | find_best_matches(my_code_table), 100 | tibble::tibble( 101 | file_a = "a", file_b = "a", block_a = 1L, block_b = 2L, 102 | line_a = 1L, line_b = 2L, score = 0 103 | ), 104 | info = "find_best_matches on two distinct code-blocks (same file)" 105 | ) 106 | expect_equal( 107 | find_best_matches(new("EnumeratedCodeTable", my_blocks[2:1, ])), 108 | find_best_matches(my_code_table), 109 | info = paste( 110 | "when two blocks are mutually-best-matches from the same file,", 111 | "return a single row (we use block_a <= block_b)" 112 | ) 113 | ) 114 | 115 | # Identical code-blocks 116 | identical_blocks <- tibble::tibble( 117 | file = "a", 118 | block = as.integer(c(1, 2)), 119 | start_line = as.integer(c(1, 2)), 120 | enumerated_code = list( 121 | as.integer(c(1, 2, 3)), 122 | as.integer(c(1, 2, 3)) 123 | ) 124 | ) 125 | identical_code_table <- new("EnumeratedCodeTable", identical_blocks) 126 | expect_equal( 127 | find_best_matches(identical_code_table), 128 | tibble::tibble( 129 | file_a = "a", file_b = "a", block_a = 1L, block_b = 2L, 130 | line_a = 1L, line_b = 2L, score = 1 131 | ), 132 | info = "find_best_matches on two identical code-blocks (same file)" 133 | ) 134 | 135 | # Overlapping, non-equal code-blocks, using longest-common-subsequence 136 | 137 | nonequal_blocks <- tibble::tibble( 138 | file = letters[1:2], block = 1L, start_line = 1L, 139 | enumerated_code = list( 140 | as.integer(c(1, 2, 3, 4)), 141 | as.integer(c(3, 4, 5, 6)) 142 | ) 143 | # seq_dist_LCS = 4; length_sum = 8; seq_sim = 1 - dist/len_sum = 0.5 144 | ) 145 | nonequal_blocks_tbl <- new("EnumeratedCodeTable", nonequal_blocks) 146 | expect_equal( 147 | find_best_matches(nonequal_blocks_tbl), 148 | tibble::tibble( 149 | file_a = c("a"), file_b = c("b"), block_a = 1L, block_b = 1L, 150 | line_a = 1L, line_b = 1L, score = 1 / 2 151 | ), 152 | info = "find_best_matches on non-equal 
code-blocks (LCS; different file)" 153 | ) 154 | expect_equal( 155 | find_best_matches(new("EnumeratedCodeTable", nonequal_blocks[2:1, ])), 156 | find_best_matches(nonequal_blocks_tbl), 157 | info = paste( 158 | "when two blocks are mutually-best-matches from different files,", 159 | "return a single row (we use file_a <= file_b alphanumerically)" 160 | ) 161 | ) 162 | 163 | # - if there's 1 or fewer blocks, return an empty data-frame 164 | single_block <- tibble::tibble( 165 | file = "a", 166 | block = 1L, 167 | start_line = 1L, 168 | enumerated_code = list(as.integer(c(1, 2, 3, 4))) 169 | ) 170 | single_code_table <- new("EnumeratedCodeTable", single_block) 171 | expect_equal( 172 | find_best_matches(single_code_table), 173 | empty_results, 174 | info = paste( 175 | "A single code-block can't be compared to anything: results should be", 176 | "empty" 177 | ) 178 | ) 179 | 180 | # - If A-B, B-A and C-A are optimal then B-C shouldn't be in the results 181 | three_blocks <- tibble::tibble( 182 | file = "a", 183 | block = as.integer(c(1, 2, 3)), 184 | start_line = as.integer(c(1, 2, 3)), 185 | # A) 1-2-3-4-5 186 | # B) 1-2-3-6-5 (A-B: 2) 187 | # C) 7-2-3-4-8 (A-C: 4; B-C: 6) 188 | enumerated_code = list( 189 | as.integer(c(1, 2, 3, 4, 5)), 190 | as.integer(c(1, 2, 3, 6, 5)), 191 | as.integer(c(7, 2, 3, 4, 8)) 192 | ) 193 | ) 194 | triple_code_table <- new("EnumeratedCodeTable", three_blocks) 195 | expect_equal( 196 | find_best_matches(triple_code_table), 197 | tibble::tibble( 198 | file_a = "a", file_b = "a", block_a = rep(1L, 2), block_b = 2:3, 199 | line_a = 1L, line_b = 2:3, score = c(1 - 2 / 10, 1 - 4 / 10) 200 | ), 201 | info = "no cycles should be present in the results (by default)" 202 | ) 203 | }) 204 | -------------------------------------------------------------------------------- /tests/testthat/test_dupree_code_enumeration.R: -------------------------------------------------------------------------------- 1 | 
############################################################################### 2 | 3 | context("Tests for code import and code-symbol enumeration in `dupree`") 4 | 5 | ############################################################################### 6 | 7 | test_that("get_localised_parsed_code_blocks", { 8 | expect_equal( 9 | object = nrow(.get_default_annotated_parsed_content()), 10 | expected = 0, 11 | info = "Default annotated-parsed-content has no rows" 12 | ) 13 | expect_equal( 14 | object = nrow(get_localised_parsed_code_blocks(list())), 15 | expected = 0, 16 | info = paste( 17 | "An empty list of source-expressions should return a", 18 | "data-frame of 0 rows" 19 | ) 20 | ) 21 | }) 22 | 23 | ############################################################################### 24 | 25 | test_that("Number of code blocks in imported files", { 26 | # For empty files, an empty data-frame should be returned by 27 | # `import_parsed_code_blocks` 28 | 29 | # Empty .R files 30 | # - No content 31 | empty_file <- file.path("testdata", "empty.R") 32 | expect_true( 33 | file.exists(empty_file), 34 | info = "Just checking the test-files exist" 35 | ) 36 | expect_true( 37 | nrow(import_parsed_code_blocks(empty_file)) == 0, 38 | info = "empty .R file should provide no code-blocks: import function" 39 | ) 40 | expect_true( 41 | nrow(preprocess_code_blocks(empty_file)@blocks) == 0, 42 | info = "empty .R file should provide no code-blocks: preprocess workflow" 43 | ) 44 | # - Only comments 45 | comment_file <- file.path("testdata", "comments.R") 46 | expect_true( 47 | nrow(import_parsed_code_blocks(comment_file)) == 0, 48 | info = paste( 49 | "comment-only .R file should provide no code-blocks: import function" 50 | ) 51 | ) 52 | expect_true( 53 | nrow(preprocess_code_blocks(comment_file)@blocks) == 0, 54 | info = paste( 55 | "comment-only .R file should provide no code-blocks: preprocess workflow" 56 | ) 57 | ) 58 | 59 | # All the following tests fail with lintr<2.0.0 since they 
require parsing 60 | # data from .Rmd files with non-R-codeblocks or with an absence of codeblocks 61 | 62 | # Empty .Rmd files: 63 | # - No content 64 | empty_rmd <- file.path("testdata", "empty.Rmd") 65 | expect_true( 66 | nrow(import_parsed_code_blocks(empty_rmd)) == 0, 67 | info = "empty .Rmd file should import no code-blocks" 68 | ) 69 | 70 | # - Only header 71 | header_rmd <- file.path("testdata", "header_only.Rmd") 72 | expect_true( 73 | nrow(import_parsed_code_blocks(header_rmd)) == 0, 74 | info = "header-only .Rmd file should import no code-blocks" 75 | ) 76 | 77 | # - Only text 78 | text_rmd <- file.path("testdata", "text_only.Rmd") 79 | expect_true( 80 | nrow(import_parsed_code_blocks(text_rmd)) == 0, 81 | info = "text-only .Rmd file should import no code-blocks" 82 | ) 83 | 84 | # - Some non-R blocks 85 | non_r_rmd <- file.path("testdata", "non_r_blocks.Rmd") 86 | expect_true( 87 | nrow(import_parsed_code_blocks(non_r_rmd)) == 0, 88 | info = ".Rmd with only non-R blocks should import not code-blocks" 89 | ) 90 | }) 91 | 92 | ############################################################################### 93 | 94 | test_that("Filtering by the number of symbols in the code-blocks", { 95 | 96 | # If there is less than `N` symbols in each input code-block, and 97 | # `min_block_size` is `N` then every code-block will be disregarded 98 | max_9_symbols <- file.path("testdata", "max_9_symbols.R") 99 | expect_equal( 100 | nrow(preprocess_code_blocks(max_9_symbols, min_block_size = 10)@blocks), 101 | 0, 102 | info = paste( 103 | "If there's less than 10 symbols per code-block, no blocks should", 104 | "return on preprocessing with min_block_size = 10" 105 | ) 106 | ) 107 | expect_equal( 108 | nrow(preprocess_code_blocks(max_9_symbols, min_block_size = 1)@blocks), 109 | 5, 110 | info = paste( 111 | "A file with 5 code-blocks, keeping blocks with >= 1 non-trivial symbol" 112 | ) 113 | ) 114 | }) 115 | 116 | 
############################################################################### 117 | 118 | test_that("summarise_enumerated_blocks", { 119 | input <- tibble::tibble( 120 | line1 = 1L, col1 = 1L, line2 = 1L, col2 = 7L, id = 21L, parent = 24L, 121 | token = "SYMBOL_PACKAGE", terminal = TRUE, text = "methods", 122 | file = "some/file.R", block = 1L, start_line = 1L, symbol_enum = 60L 123 | ) 124 | expected <- tibble::tibble( 125 | file = "some/file.R", block = 1L, start_line = 1L, 126 | enumerated_code = list(c(60L)), block_size = 1L 127 | ) 128 | # some attribute differences may be expected 129 | expect_equivalent_tbl( 130 | object = summarise_enumerated_blocks(input), 131 | expected = expected, 132 | info = "block with a single code symbol" 133 | ) 134 | }) 135 | 136 | ############################################################################### 137 | -------------------------------------------------------------------------------- /tests/testthat/testdata/anRpackage/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: anRpackage 2 | Type: Package 3 | Title: What the package does (short line) 4 | Version: 1.0 5 | Date: 2019-11-20 6 | Author: Who wrote it 7 | Maintainer: Who to complain to 8 | Description: More about what it does (maybe more than one line) 9 | License: What license is it under? 
10 | -------------------------------------------------------------------------------- /tests/testthat/testdata/anRpackage/NAMESPACE: -------------------------------------------------------------------------------- 1 | exportPattern("^[[:alpha:]]+") 2 | -------------------------------------------------------------------------------- /tests/testthat/testdata/anRpackage/R/anRpackage-internal.R: -------------------------------------------------------------------------------- 1 | .Random.seed <- 2 | c( 3 | 403L, 136L, -1741379319L, 1446513467L, -241958134L, 872221420L, 4 | -1863432337L, -1395316571L, -950746532L, -1337855438L, -1218119379L, 5 | 1555202023L, -757938418L, -1671391000L, 80440043L, 137945993L, 6 | -1183777000L, 2037736006L, 1963934369L, -661576397L, 392535202L, 7 | -889334892L, -735720937L, 1520274957L, -1494521052L, 625964810L, 8 | -1223599499L, -1918727729L, 487953110L, 1614296704L, -2103101245L, 9 | -1584957855L, -1936748848L, -189622226L, -1395560999L, 839242219L, 10 | 125326426L, -1223856612L, 1185816351L, -457493643L, -1778861556L, 11 | -1861633054L, 919767229L, -1032177865L, -1899020322L, 688958744L, 12 | 549863547L, -915220967L, -557924696L, -679244714L, 380455121L, 13 | -1486506877L, -706177262L, -1406766428L, 803815015L, -934621251L, 14 | -1021304524L, 1573896858L, -933723931L, 1191248511L, 251735046L, 15 | -960258160L, -1778531789L, -1470506543L, 980061056L, 885450334L, 16 | -1299399255L, 239692891L, 1208816746L, -954651380L, -1883803825L, 17 | -24286203L, -1615842628L, 1553916818L, 785064077L, 501397191L, 18 | 86613934L, 96659976L, 558616075L, 1052362473L, 1349632440L, 1257520742L, 19 | 1933515521L, 1194454419L, 914802242L, 301095412L, -478772233L, 20 | 960706541L, -2047184764L, -1016772950L, 332471317L, -270340945L, 21 | -135912778L, 489447648L, 2072136739L, -251948223L, 507231792L, 22 | -574432178L, -914180935L, -690245429L, 1967693178L, -1469300612L, 23 | 427304127L, 1549595029L, -1691280980L, -286127230L, -611028643L, 24 | 2126868567L, 
1477986110L, 415076216L, 1575412827L, -2048166983L, 25 | 805376520L, 1153734838L, 1025409009L, -469431773L, -2049416590L, 26 | 1293686980L, -892793209L, -65780131L, 1969793108L, 1634417274L, 27 | -10542651L, -2005601505L, -322240858L, -2091749328L, -1804815661L, 28 | 37349617L, -434121696L, 1118712510L, -1238927671L, 1317770363L, 29 | -276477366L, -1677742420L, 1583180207L, 1392279909L, 942771996L, 30 | 2004251506L, 2107142125L, -706691033L, 1185134670L, 2012363944L, 31 | 1059451563L, 1346344905L, -1595487272L, -1928439162L, -875750559L, 32 | 649944819L, -2125995678L, -1381924908L, -1568363817L, 288329933L, 33 | 1319381092L, 1133430346L, -1185631307L, 377859855L, -1559811050L, 34 | 1788999360L, 605778051L, 1044327585L, 360530576L, 961085806L, 35 | -1225588967L, 673199019L, 1055154586L, 591844700L, 2142899295L, 36 | 640586805L, 2059366348L, 1669368610L, 1967953277L, -1501866377L, 37 | 38465438L, 1469091416L, 2065852347L, 1720928601L, 946914024L, 38 | -1620624618L, -1180611439L, -2108309181L, 1398164562L, 160960228L, 39 | 87931175L, 1221482877L, -501968780L, -754675110L, -1433600347L, 40 | 1107725503L, -881111226L, -230363696L, -100265997L, 291772177L, 41 | -915974336L, -245391586L, 1032105961L, -1118419685L, 813784874L, 42 | -31666100L, -724666353L, 1433056069L, -1173605124L, 935174994L, 43 | -409996083L, -428215161L, -1317884050L, 2038178120L, -798702261L, 44 | -298923863L, -1456571912L, 705718566L, -1612203455L, -725060653L, 45 | 775332226L, -1855822412L, -2115038921L, 796349485L, 659480900L, 46 | 796112490L, -1499406123L, -1642738065L, -1803703434L, -305595232L, 47 | 308491619L, 1472804353L, -1826310800L, 774890766L, 1037395833L, 48 | 2003277579L, 2046539834L, 894058524L, 1227334802L, 526945408L, 49 | 1188987924L, 592634664L, -955411398L, 1151233232L, -686689172L, 50 | 1068078772L, 2074255378L, 1208860640L, -1924674612L, -753287712L, 51 | 1808239650L, 463164712L, 1633638668L, -702951876L, -686226190L, 52 | -182978320L, -129419452L, 1650765912L, 2104263226L, 
2050967792L, 53 | 493593276L, 1142327828L, -614847294L, -1365772704L, 1382466940L, 54 | 1204069584L, -439569374L, 508414120L, -1039721076L, 958325180L, 55 | 610158642L, -1233923264L, -1692180812L, -1245982392L, -752291142L, 56 | 252231440L, -735166228L, -871630764L, -1752561902L, 1680531872L, 57 | 997320652L, -1975034016L, -1149113150L, 1713917480L, -1562482260L, 58 | -377674948L, -1633803406L, 29413168L, -1274380252L, -1607516616L, 59 | -89091366L, 1015715728L, 1832989948L, 1244941716L, 1082655554L, 60 | -1458927104L, -1296545732L, 1681785424L, -670662494L, -1908784312L, 61 | 1926117900L, 253663964L, 258011218L, -1237136256L, -1145600172L, 62 | 912474088L, -701287686L, 1828489040L, 225551404L, -1120106124L, 63 | 1180053138L, -1349623200L, 1386147980L, -992046752L, 31480034L, 64 | -1397986136L, 1896863436L, -704357892L, 1278659506L, -129743888L, 65 | 383943108L, 1641659736L, 1285190010L, 1926820272L, -849663556L, 66 | -933183916L, -1993958974L, 341508896L, 270791612L, -1311092656L, 67 | -839902238L, -2013479064L, -31566004L, 623275772L, -238156046L, 68 | -258147072L, -1937982348L, -1172394488L, 1106798010L, 783918544L, 69 | -1054611476L, 1645554580L, 128936146L, -190213024L, 1270762828L, 70 | -369265888L, -1122637182L, -2081811992L, 1601493996L, -2030219716L, 71 | -1993095566L, 356674416L, 449875236L, 1536285240L, -573734502L, 72 | -567210544L, 523503036L, 1847496340L, -1329335678L, 50073024L, 73 | -380348612L, 1351672464L, 1327867682L, 1083950600L, 1752343308L, 74 | 949449628L, 1528855570L, 1263700864L, -834761964L, -417835736L, 75 | 141484090L, -1229851696L, 1636670060L, -12653516L, -469995374L, 76 | -452150944L, 431390028L, -473426464L, 1704049570L, 952662952L, 77 | -1758942068L, 863904700L, 934007922L, 114894832L, -177384380L, 78 | 1590241368L, 526938170L, -513922960L, 127118908L, 1376560916L, 79 | 1374227138L, 1714860768L, 99247228L, -173552432L, 392332194L, 80 | -1667318232L, 855503116L, 1870739388L, 1213727026L, -360212288L, 81 | 409858612L, -1622906168L, 
1878872762L, -980575088L, 907654892L, 82 | -35400108L, -123521390L, -900438240L, 530126796L, -1539659552L, 83 | 396828354L, -947882968L, -1939129428L, 2110388924L, -1352162830L, 84 | -1431333328L, 1206097828L, -449515080L, 1889651546L, 1459037328L, 85 | -1415545988L, 1452273428L, 553119042L, 1295101696L, -997246020L, 86 | 1921290064L, 301969698L, 438848072L, -1135724532L, -415454116L, 87 | -598683182L, -183126528L, -844639916L, 1629540584L, 786329850L, 88 | 1651832528L, 946358572L, 187984116L, -523182702L, 2111498592L, 89 | 205776908L, 1677519584L, -478948766L, 949160616L, 303761996L, 90 | 1585182588L, -354534094L, -1618901392L, -1574651068L, 1357191128L, 91 | 1827377018L, -1991221712L, 609384380L, 1562265172L, -1085999038L, 92 | 243314848L, -38138180L, -15152688L, 1886061410L, 617726696L, 93 | 632377548L, -137776644L, 25467378L, -1434604928L, 1637282516L, 94 | -860338587L, 971625583L, -1613664600L, 479136774L, 1192163635L, 95 | 33908117L, 1023060466L, 2138785344L, -859068503L, 1509542427L, 96 | 1425294316L, -1010260998L, -2032578513L, 1574240313L, -464842162L, 97 | -1585433956L, 1523767309L, -1945353097L, 1091458176L, 584794846L, 98 | -269995605L, 614951597L, -1618202854L, 104493016L, 1765407809L, 99 | -139810061L, -1157108460L, 1036826786L, 44471735L, 433351041L, 100 | 1599180678L, 283303364L, -674202379L, -789162657L, 619780024L, 101 | -2140759402L, 1624130723L, 850574949L, 744352642L, -408598288L, 102 | 83718297L, 1223461003L, -1256377988L, -246517846L, 1277759455L, 103 | 1749957993L, 1425228158L, -1466052244L, 408324541L, 1241053831L, 104 | 141983344L, -1803270898L, 2121353787L, 1079554333L, -226327606L, 105 | 1305607400L, -1209081903L, -550268477L, 1180162756L, 1153948018L, 106 | 1918436423L, -259258351L, -1361030762L, 1000015092L, -532156795L, 107 | -2008053809L, 1249404616L, 1851998054L, 934159763L, 1039046389L, 108 | -59009326L, -779886304L, -1430761207L, -1643080133L, -929363060L, 109 | -1367274982L, -1140434289L, -934885927L, -873330066L, -125593092L, 
110 | -229983379L, -1099299561L, -1078611744L, 969533118L, -1606908789L, 111 | 1456714509L, 645305146L, -1679168648L, 1666603297L, 289042899L, 112 | -34004748L, -289037950L, 1767302167L, -902188319L, 823274534L, 113 | 1321384228L, 902330837L, -1510451841L, 870083608L, -663968970L, 114 | -171043389L, 1159109637L, 54359970L, 317051792L, -1154148551L, 115 | -1360821269L, -89713444L, -2122502902L, 899732479L, -577968567L, 116 | -858516514L, -1996018804L, 946219869L, 2067503975L, -1691004784L, 117 | 696995502L, 1236249435L, -595690691L, -542772822L, 1662869064L, 118 | 122564209L, 1904111587L, 1893448420L, 968262162L, -1861290521L, 119 | 1825186225L, -1083578378L, 1158846996L, -1195956955L, 1904547375L, 120 | 946607080L, -2142821178L, -1019018893L, -2010347051L, -439034446L, 121 | -468601984L, 1988822889L, -476091429L, -663686356L, -1226871494L, 122 | -1320037393L, 1840195193L, 1333436686L, 500875484L, -229206195L, 123 | 1693324983L, 144700480L, 1094651166L, -81352725L, -1426789523L, 124 | -665277990L, 1078821784L, -646915583L, 1968266163L, -427762988L, 125 | -1138824222L, -1726357897L, -881432639L, -108718778L, 1666573444L, 126 | -1190245835L, -936005089L, -1092754056L, -776977322L, -1200572189L, 127 | -2056958299L, -1337785534L, -1182447312L, 1840370377L 128 | ) 129 | 130 | 131 | 132 | another_long_block <- function() { 133 | paste( 134 | 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 135 | 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 136 | 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 137 | 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 138 | 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 139 | 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10 140 | ) 141 | } 142 | -------------------------------------------------------------------------------- /tests/testthat/testdata/anRpackage/Read-and-delete-me: -------------------------------------------------------------------------------- 1 | * Edit the help file skeletons in 'man', possibly combining help files for multiple 2 | functions. 3 | * Edit the exports in 'NAMESPACE', and add necessary imports. 
4 | * Put any C/C++/Fortran code in 'src'. 5 | * If you have compiled code, add a useDynLib() directive to 'NAMESPACE'. 6 | * Run R CMD build to build the package tarball. 7 | * Run R CMD check to check the package tarball. 8 | 9 | Read "Writing R Extensions" for more information. 10 | -------------------------------------------------------------------------------- /tests/testthat/testdata/anRpackage/data/ok.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/russHyde/dupree/6be55893a839717d36e6b3e21bdd662951a59bf8/tests/testthat/testdata/anRpackage/data/ok.rda -------------------------------------------------------------------------------- /tests/testthat/testdata/anRpackage/inst/dir1/R/dont_dup_me.R: -------------------------------------------------------------------------------- 1 | some <- function() { 2 | rnorm(10, 1, 2) 3 | rnorm(10, 1, 2) 4 | rnorm(10, 1, 2) 5 | rnorm(10, 1, 2) 6 | rnorm(10, 1, 2) 7 | rnorm(10, 1, 2) 8 | rnorm(10, 1, 2) 9 | rnorm(10, 1, 2) 10 | rnorm(10, 1, 2) 11 | rnorm(10, 1, 2) 12 | rnorm(10, 1, 2) 13 | rnorm(10, 1, 2) 14 | rnorm(10, 1, 2) 15 | rnorm(10, 1, 2) 16 | rnorm(10, 1, 2) 17 | rnorm(10, 1, 2) 18 | rnorm(10, 1, 2) 19 | rnorm(10, 1, 2) 20 | rnorm(10, 1, 2) 21 | rnorm(10, 1, 2) 22 | rnorm(10, 1, 2) 23 | rnorm(10, 1, 2) 24 | rnorm(10, 1, 2) 25 | rnorm(10, 1, 2) 26 | rnorm(10, 1, 2) 27 | rnorm(10) 28 | rnorm(10) 29 | rnorm(10) 30 | rnorm(10) 31 | rnorm(10) 32 | rnorm(10) 33 | rnorm(10) 34 | rnorm(10) 35 | rnorm(10) 36 | rnorm(10) 37 | rnorm(10) 38 | rnorm(10) 39 | rnorm(10) 40 | rnorm(10) 41 | rnorm(10) 42 | rnorm(10) 43 | rnorm(10) 44 | rnorm(10) 45 | rnorm(10) 46 | rnorm(10) 47 | rnorm(10) 48 | rnorm(10) 49 | rnorm(10) 50 | rnorm(10) 51 | rnorm(10) 52 | } 53 | -------------------------------------------------------------------------------- /tests/testthat/testdata/anRpackage/man/anRpackage-package.Rd: 
-------------------------------------------------------------------------------- 1 | \name{anRpackage-package} 2 | \alias{anRpackage-package} 3 | \alias{anRpackage} 4 | \docType{package} 5 | \title{ 6 | \packageTitle{anRpackage} 7 | } 8 | \description{ 9 | \packageDescription{anRpackage} 10 | } 11 | \details{ 12 | 13 | The DESCRIPTION file: 14 | \packageDESCRIPTION{anRpackage} 15 | \packageIndices{anRpackage} 16 | ~~ An overview of how to use the package, including the most important functions ~~ 17 | } 18 | \author{ 19 | \packageAuthor{anRpackage} 20 | 21 | Maintainer: \packageMaintainer{anRpackage} 22 | } 23 | \references{ 24 | ~~ Literature or other references for background information ~~ 25 | } 26 | ~~ Optionally other standard keywords, one per line, from file KEYWORDS in the R ~~ 27 | ~~ documentation directory ~~ 28 | \keyword{ package } 29 | \seealso{ 30 | ~~ Optional links to other man pages, e.g. ~~ 31 | ~~ \code{\link[<pkg>:<pkg>-package]{<pkg>}} ~~ 32 | } 33 | \examples{ 34 | ~~ simple examples of the most important functions ~~ 35 | } 36 | -------------------------------------------------------------------------------- /tests/testthat/testdata/anRpackage/man/ok.Rd: -------------------------------------------------------------------------------- 1 | \name{ok} 2 | \alias{ok} 3 | \docType{data} 4 | \title{ 5 | %% ~~ data name/kind ... ~~ 6 | } 7 | \description{ 8 | %% ~~ A concise (1-5 lines) description of the dataset. ~~ 9 | } 10 | \usage{data("ok")} 11 | \format{ 12 | The format is: 13 | logi TRUE 14 | } 15 | \details{ 16 | %% ~~ If necessary, more details than the __description__ above ~~ 17 | } 18 | \source{ 19 | %% ~~ reference to a publication or URL from which the data were obtained ~~ 20 | } 21 | \references{ 22 | %% ~~ possibly secondary sources and usages ~~ 23 | } 24 | \examples{ 25 | data(ok) 26 | ## maybe str(ok) ; plot(ok) ... 
27 | } 28 | \keyword{datasets} 29 | -------------------------------------------------------------------------------- /tests/testthat/testdata/comments.R: -------------------------------------------------------------------------------- 1 | # JUST A COMMENT 2 | -------------------------------------------------------------------------------- /tests/testthat/testdata/empty.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/russHyde/dupree/6be55893a839717d36e6b3e21bdd662951a59bf8/tests/testthat/testdata/empty.R -------------------------------------------------------------------------------- /tests/testthat/testdata/empty.Rmd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/russHyde/dupree/6be55893a839717d36e6b3e21bdd662951a59bf8/tests/testthat/testdata/empty.Rmd -------------------------------------------------------------------------------- /tests/testthat/testdata/header_only.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | -------------------------------------------------------------------------------- /tests/testthat/testdata/max_9_symbols.R: -------------------------------------------------------------------------------- 1 | # Each code block (of which there are 5) has at most 9 non-trivial symbols 2 | abc <- 123 3 | 4 | def <- rnorm(10, 2, sd = 1:10) 5 | 6 | nine_symbols <- c(3, 4, 5, 6, 7, 8, 9) 7 | 8 | with_a_comment <- NA # this shouldn't affect the count for this block 9 | 10 | NA 11 | -------------------------------------------------------------------------------- /tests/testthat/testdata/non_r_blocks.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | 4 | This is an Rmarkdown file, but the code blocks it contains are not interpreted 5 | by R. 
6 | 7 | ```{python} 8 | print("Hello") 9 | print([1, 2, 3, 4]) 10 | ``` 11 | -------------------------------------------------------------------------------- /tests/testthat/testdata/r_blocks.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | author: "Russ Hyde" 4 | date: "1 February 2019" 5 | output: pdf_document 6 | --- 7 | 8 | ```{r} 9 | abc <- 123 10 | ``` 11 | 12 | ## R Markdown 13 | 14 | This is an R Markdown document. Markdown is a simple formatting syntax for 15 | authoring HTML, PDF, and MS Word documents. For more details on using R 16 | Markdown see <http://rmarkdown.rstudio.com>. 17 | 18 | When you click the **Knit** button a document will be generated that includes 19 | both content as well as the output of any embedded R code chunks within the 20 | document. You can embed an R code chunk like this: 21 | 22 | ```{r} 23 | abc^2 24 | ``` 25 | -------------------------------------------------------------------------------- /tests/testthat/testdata/text_only.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Some title" 3 | --- 4 | 5 | Just some text. Note there are no code-blocks at all in here. 6 | --------------------------------------------------------------------------------