├── .github
    ├── .gitignore
    ├── CODE_OF_CONDUCT.md
    ├── workflows
    │   └── check-bioc.yml
    ├── ISSUE_TEMPLATE
    │   └── issue_template.md
    ├── SUPPORT.md
    └── CONTRIBUTING.md
├── vignettes
    ├── .gitignore
    ├── bibliography.bib
    └── doubletrouble_vignette.Rmd
├── .Rbuildignore
├── data
    ├── gmax_ks.rda
    ├── fungi_kaks.rda
    ├── yeast_annot.rda
    ├── yeast_seq.rda
    ├── diamond_inter.rda
    ├── diamond_intra.rda
    └── cds_scerevisiae.rda
├── man
    ├── figures
    │   └── logo.png
    ├── cds_scerevisiae.Rd
    ├── diamond_intra.Rd
    ├── yeast_seq.Rd
    ├── yeast_annot.Rd
    ├── diamond_inter.Rd
    ├── gmax_ks.Rd
    ├── fungi_kaks.Rd
    ├── plot_ks_peaks.Rd
    ├── classify_genes.Rd
    ├── get_intron_counts.Rd
    ├── plot_duplicate_freqs.Rd
    ├── duplicates2counts.Rd
    ├── get_segmental.Rd
    ├── plot_rates_by_species.Rd
    ├── plot_ks_distro.Rd
    ├── pairs2kaks.Rd
    ├── split_pairs_by_peak.Rd
    ├── get_tandem_proximal.Rd
    ├── find_ks_peaks.Rd
    ├── get_anchors_list.Rd
    ├── get_transposed_classes.Rd
    ├── get_transposed.Rd
    └── classify_gene_pairs.Rd
├── .gitignore
├── codecov.yml
├── tests
    ├── testthat
    │   ├── test-data_validation.R
    │   ├── test-visualization.R
    │   ├── test-ka_ks_analyses.R
    │   └── test-duplicate_classification.R
    └── testthat.R
├── NEWS.md
├── inst
    ├── _pkgdown.yml
    ├── CITATION
    └── script
    │   └── data_acquisition.md
├── R
    ├── data_validation.R
    ├── data.R
    ├── duplicate_classification.R
    ├── ka_ks_analyses.R
    ├── utils.R
    ├── visualization.R
    └── utils_duplicate_classification.R
├── NAMESPACE
├── DESCRIPTION
├── README.Rmd
└── README.md


/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^\.github$
2 | ^codecov\.yml$
3 | ^.*\.Rproj$
4 | ^\.Rproj\.user$
5 | 


--------------------------------------------------------------------------------
/data/gmax_ks.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/almeidasilvaf/doubletrouble/HEAD/data/gmax_ks.rda


--------------------------------------------------------------------------------
/data/fungi_kaks.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/almeidasilvaf/doubletrouble/HEAD/data/fungi_kaks.rda


--------------------------------------------------------------------------------
/data/yeast_annot.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/almeidasilvaf/doubletrouble/HEAD/data/yeast_annot.rda


--------------------------------------------------------------------------------
/data/yeast_seq.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/almeidasilvaf/doubletrouble/HEAD/data/yeast_seq.rda


--------------------------------------------------------------------------------
/man/figures/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/almeidasilvaf/doubletrouble/HEAD/man/figures/logo.png


--------------------------------------------------------------------------------
/data/diamond_inter.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/almeidasilvaf/doubletrouble/HEAD/data/diamond_inter.rda


--------------------------------------------------------------------------------
/data/diamond_intra.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/almeidasilvaf/doubletrouble/HEAD/data/diamond_intra.rda


--------------------------------------------------------------------------------
/data/cds_scerevisiae.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/almeidasilvaf/doubletrouble/HEAD/data/cds_scerevisiae.rda


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | inst/script/*.Rmd
6 | inst/doc
7 | *.Rproj
8 | *.BiocCheck


--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
 1 | comment: false
 2 | 
 3 | coverage:
 4 |   status:
 5 |     project:
 6 |       default:
 7 |         target: auto
 8 |         threshold: 1%
 9 |         informational: true
10 |     patch:
11 |       default:
12 |         target: auto
13 |         threshold: 1%
14 |         informational: true
15 | 


--------------------------------------------------------------------------------
/tests/testthat/test-data_validation.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # Start tests ----
 3 | test_that("check_geneid_match() flags mismatches between gene sets", {
 4 |     
 5 |     # "gene2A" and "gene4A" have no counterpart in the test set
 6 |     ref_ids <- c("gene1", "gene2A", "gene3", "gene4A")
 7 |     test_ids <- c("gene1", "gene2", "gene3", "gene4")
 8 |     expect_error(check_geneid_match(ref_ids, test_ids))
 9 | })
10 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
 1 | # This file is part of the standard setup for testthat.
 2 | # It is recommended that you do not modify it.
 3 | #
 4 | # Where should you do additional test configuration?
 5 | # Learn more about the roles of various files in:
 6 | # * https://r-pkgs.org/tests.html
 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files
 8 | 
 9 | library(testthat)
10 | library(doubletrouble)
11 | 
12 | test_check("doubletrouble")
13 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # doubletrouble 0.99.0
 3 | 
 4 | NEW FEATURES
 5 | 
 6 | * Added a `NEWS.md` file to track changes to the package.
 7 | 
 8 | 
 9 | # doubletrouble 0.99.2
10 | 
11 | CHANGES
12 | 
13 | * Small change in coding style after Bioconductor peer-review (`m:n` replaced
14 | with `c(m, n)` and `seq(m,n)`)
15 | 
16 | # doubletrouble 0.99.3
17 | 
18 | BUG FIXES
19 | 
20 | * Updated functions (e.g., get_anchors_list(), collinearity2blocks()) after
21 | update in syntenet.
22 | 
23 | 


--------------------------------------------------------------------------------
/man/cds_scerevisiae.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{cds_scerevisiae}
 5 | \alias{cds_scerevisiae}
 6 | \title{Coding sequences (CDS) of S. cerevisiae}
 7 | \format{
 8 | A DNAStringSet object with CDS of S. cerevisiae.
 9 | }
10 | \usage{
11 | data(cds_scerevisiae)
12 | }
13 | \description{
14 | Data were obtained from Ensembl Fungi, and only CDS of primary transcripts
15 | were included.
16 | }
17 | \examples{
18 | data(cds_scerevisiae)
19 | }
20 | \keyword{datasets}
21 | 


--------------------------------------------------------------------------------
/man/diamond_intra.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{diamond_intra}
 5 | \alias{diamond_intra}
 6 | \title{Intraspecies DIAMOND output for S. cerevisiae}
 7 | \format{
 8 | A list of data frames (length 1) containing the whole paranome of
 9 | S. cerevisiae resulting from intragenome similarity searches.
10 | }
11 | \usage{
12 | data(diamond_intra)
13 | }
14 | \description{
15 | List obtained with \code{run_diamond()}.
16 | }
17 | \examples{
18 | data(diamond_intra)
19 | }
20 | \keyword{datasets}
21 | 


--------------------------------------------------------------------------------
/inst/_pkgdown.yml:
--------------------------------------------------------------------------------
 1 | template:
 2 |   bootstrap: 5
 3 |   bslib:
 4 |     preset: "bootstrap"
 5 |     font_scale: 1.0
 6 |     base_font:
 7 |       google: "Atkinson Hyperlegible"           # font for better accessibility
 8 |     code_font:
 9 |       google: "Source Code Pro"
10 |     primary: "#18416a"                          # navbar hover and sidebar: dark blue
11 |     navbar-light-brand-color: "#18416a"         # pkg name: dark blue
12 |     navbar-light-hover-color: "#18416a"         # navbar text on hover: dark blue
13 |     pkgdown-nav-height: 78px
14 | 
15 | navbar:
16 |   type: light
17 | 


--------------------------------------------------------------------------------
/man/yeast_seq.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{yeast_seq}
 5 | \alias{yeast_seq}
 6 | \title{Protein sequences of the yeast species S. cerevisiae and C. glabrata}
 7 | \format{
 8 | A list of AAStringSet objects with the elements
 9 | \strong{Scerevisiae} and \strong{Cglabrata}.
10 | }
11 | \usage{
12 | data(yeast_seq)
13 | }
14 | \description{
15 | Data obtained from Ensembl Fungi. Only translated sequences of primary
16 | transcripts were included.
17 | }
18 | \examples{
19 | data(yeast_seq)
20 | }
21 | \keyword{datasets}
22 | 


--------------------------------------------------------------------------------
/man/yeast_annot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{yeast_annot}
 5 | \alias{yeast_annot}
 6 | \title{Genome annotation of the yeast species S. cerevisiae and C. glabrata}
 7 | \format{
 8 | A CompressedGRangesList containing
 9 | the elements \strong{Scerevisiae} and \strong{Cglabrata}.
10 | }
11 | \usage{
12 | data(yeast_annot)
13 | }
14 | \description{
15 | Data obtained from Ensembl Fungi. Only annotation data for protein-coding
16 | genes (with associated mRNA, exons, CDS, etc) are included.
17 | }
18 | \examples{
19 | data(yeast_annot)
20 | }
21 | \keyword{datasets}
22 | 


--------------------------------------------------------------------------------
/man/diamond_inter.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{diamond_inter}
 5 | \alias{diamond_inter}
 6 | \title{Interspecies DIAMOND output for yeast species}
 7 | \format{
 8 | A list of data frames (length 1) containing the output of a
 9 | DIAMOND search of S. cerevisiae against C. glabrata (outgroup).
10 | }
11 | \usage{
12 | data(diamond_inter)
13 | }
14 | \description{
15 | This list contains a similarity search of S. cerevisiae against
16 | C. glabrata, and it was obtained with \code{run_diamond()}.
17 | }
18 | \examples{
19 | data(diamond_inter)
20 | }
21 | \keyword{datasets}
22 | 


--------------------------------------------------------------------------------
/man/gmax_ks.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{gmax_ks}
 5 | \alias{gmax_ks}
 6 | \title{Duplicate pairs and Ks values for Glycine max}
 7 | \format{
 8 | A data frame with the following variables:
 9 | \describe{
10 | \item{dup1}{Character, duplicated gene 1.}
11 | \item{dup2}{Character, duplicated gene 2.}
12 | \item{Ks}{Numeric, Ks values.}
13 | \item{type}{Factor, duplication mode.}
14 | }
15 | }
16 | \usage{
17 | data(gmax_ks)
18 | }
19 | \description{
20 | This data set was obtained with \code{classify_gene_pairs()} followed
21 | by \code{pairs2kaks()}.
22 | }
23 | \examples{
24 | data(gmax_ks)
25 | }
26 | \keyword{datasets}
27 | 


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citHeader("To cite doubletrouble in publications, use:")
 2 | 
 3 | # bibentry() and person() replace the old-style citEntry(), personList(),
 4 | # and as.person() helpers. Given and family names are passed explicitly so
 5 | # the multi-word family name "Van de Peer" is kept together.
 6 | bibentry(
 7 |   bibtype  = "Article",
 8 |   title    = "doubletrouble: an R/Bioconductor package for the identification, classification, and analysis of gene and genome duplications",
 9 |   author   = c(
10 |       person(given = "Fabricio", family = "Almeida-Silva"),
11 |       person(given = "Yves", family = "Van de Peer")
12 |   ),
13 |   journal  = "Bioinformatics",
14 |   year     = "2025",
15 |   volume   = "41",
16 |   number   = "2",
17 |   pages    = "btaf043",
18 |   url      = "https://academic.oup.com/bioinformatics/article/41/2/btaf043/7979242",
19 |   doi      = "10.1093/bioinformatics/btaf043",
20 |   # paste() joins the pieces with a single space, so each piece carries
21 |   # its own closing punctuation.
22 |   textVersion = paste(
23 |       "Almeida-Silva F, Van de Peer Y.",
24 |       "doubletrouble: an R/Bioconductor package for the identification, classification, and analysis of gene and genome duplications.",
25 |       "Bioinformatics, 41(2), btaf043. (2025). https://doi.org/10.1093/bioinformatics/btaf043"
26 |   )
27 | )


--------------------------------------------------------------------------------
/man/fungi_kaks.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{fungi_kaks}
 5 | \alias{fungi_kaks}
 6 | \title{Duplicate pairs and Ka, Ks, and Ka/Ks values for fungi species}
 7 | \format{
 8 | A list of data frames with elements
 9 | named \strong{saccharomyces_cerevisiae}, \strong{candida_glabrata},
10 | and \strong{schizosaccharomyces_pombe}. Each data frame contains
11 | the following variables:
12 | \describe{
13 | \item{dup1}{Character, duplicated gene 1.}
14 | \item{dup2}{Character, duplicated gene 2.}
15 | \item{Ka}{Numeric, Ka values.}
16 | \item{Ks}{Numeric, Ks values.}
17 | \item{Ka_Ks}{Numeric, Ka/Ks values.}
18 | \item{type}{Character, mode of duplication}
19 | }
20 | }
21 | \usage{
22 | data(fungi_kaks)
23 | }
24 | \description{
25 | This data set was obtained with \code{classify_gene_pairs()} followed
26 | by \code{pairs2kaks()}.
27 | }
28 | \examples{
29 | data(fungi_kaks)
30 | }
31 | \keyword{datasets}
32 | 


--------------------------------------------------------------------------------
/man/plot_ks_peaks.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/visualization.R
 3 | \name{plot_ks_peaks}
 4 | \alias{plot_ks_peaks}
 5 | \title{Plot histogram of Ks distribution with peaks}
 6 | \usage{
 7 | plot_ks_peaks(peaks = NULL, binwidth = 0.05)
 8 | }
 9 | \arguments{
10 | \item{peaks}{A list with elements \strong{mean}, \strong{sd},
11 | \strong{lambda}, and \strong{ks}, as returned by the
12 | function \code{find_ks_peaks()}.}
13 | 
14 | \item{binwidth}{Numeric scalar with binwidth for the histogram.
15 | Default: 0.05.}
16 | }
17 | \value{
18 | A ggplot object with a histogram and lines for each Ks peak.
19 | }
20 | \description{
21 | Plot histogram of Ks distribution with peaks
22 | }
23 | \examples{
24 | data(fungi_kaks)
25 | scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
26 | ks <- scerevisiae_kaks$Ks
27 | 
28 | # Find 2 peaks in Ks distribution
29 | peaks <- find_ks_peaks(ks, npeaks = 2)
30 | 
31 | # Plot
32 | plot_ks_peaks(peaks, binwidth = 0.05)
33 | }
34 | 


--------------------------------------------------------------------------------
/.github/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | The Bioconductor community values
 2 | 
 3 | * an open approach to science that promotes the sharing of ideas, code, and expertise
 4 | * collaboration
 5 | * diversity and inclusivity
 6 | * a kind and welcoming environment
 7 | * community contributions
 8 | 
 9 | In line with these values, Bioconductor is dedicated to providing a welcoming, supportive, collegial, experience free of harassment, intimidation, and bullying regardless of:
10 | 
11 | * identity: gender, gender identity and expression, sexual orientation, disability, physical appearance, ethnicity, body size, race, age, religion, etc.
12 | * intellectual position: approaches to data analysis, software preferences, coding style, scientific perspective, etc.
13 | * stage of career
14 | 
15 | In order to uphold these values, members of the Bioconductor community are required to follow the Code of Conduct. The latest version of the Bioconductor project Code of Conduct is available at http://bioconductor.org/about/code-of-conduct/. Please read the Code of Conduct before contributing to this project.
16 | 
17 | Thank you!
18 | 


--------------------------------------------------------------------------------
/.github/workflows/check-bioc.yml:
--------------------------------------------------------------------------------
 1 | name: rworkflows.devel
 2 | 'on':
 3 |   push:
 4 |     branches: devel
 5 |   pull_request:
 6 |     branches: devel
 7 | jobs:
 8 |   rworkflows:
 9 |     permissions: write-all
10 |     runs-on: ${{ matrix.config.os }}
11 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
12 |     container: ${{ matrix.config.cont }}
13 |     strategy:
14 |       fail-fast: ${{ false }}
15 |       matrix:
16 |         config:
17 |         - os: ubuntu-latest
18 |           bioc: devel
19 |           r: auto
20 |           cont: ghcr.io/bioconductor/bioconductor_docker:devel
21 |           rspm: ~
22 |     steps:
23 |     - uses: neurogenomics/rworkflows@master
24 |       with:
25 |         run_bioccheck: ${{ true }}
26 |         run_rcmdcheck: ${{ true }}
27 |         as_cran: ${{ false }}
28 |         run_vignettes: ${{ false }}
29 |         has_testthat: ${{ true }}
30 |         run_covr: ${{ true }}
31 |         run_pkgdown: ${{ true }}
32 |         has_runit: ${{ false }}
33 |         has_latex: ${{ false }}
34 |         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
35 |         run_docker: ${{ false }}
36 |         DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
37 |         runner_os: ${{ runner.os }}
38 |         cache_version: cache-v1
39 |         docker_registry: ghcr.io


--------------------------------------------------------------------------------
/man/classify_genes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/duplicate_classification.R
 3 | \name{classify_genes}
 4 | \alias{classify_genes}
 5 | \title{Classify genes into unique modes of duplication}
 6 | \usage{
 7 | classify_genes(gene_pairs_list = NULL)
 8 | }
 9 | \arguments{
10 | \item{gene_pairs_list}{List of classified gene pairs as returned
11 | by \code{classify_gene_pairs()}.}
12 | }
13 | \value{
14 | A list of 2-column data frames with variables \strong{gene}
15 | and \strong{type} representing gene ID and duplication type, respectively.
16 | }
17 | \description{
18 | Classify genes into unique modes of duplication
19 | }
20 | \details{
21 | If a gene is present in pairs with different duplication modes, the gene
22 | is classified into a unique mode of duplication following the order
23 | of priority indicated in the levels of the factor \strong{type}.
24 | 
25 | For scheme "binary", the order is SD > SSD.
26 | For scheme "standard", the order is SD > TD > PD > DD.
27 | For scheme "extended", the order is SD > TD > PD > TRD > DD.
28 | For scheme "full", the order is SD > TD > PD > rTRD > dTRD > DD.
29 | }
30 | \examples{
31 | data(fungi_kaks)
32 | scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
33 | 
34 | cols <- c("dup1", "dup2", "type")
35 | gene_pairs_list <- list(Scerevisiae = scerevisiae_kaks[, cols])
36 | 
37 | class_genes <- classify_genes(gene_pairs_list)
38 | }
39 | 


--------------------------------------------------------------------------------
/man/get_intron_counts.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{get_intron_counts}
 4 | \alias{get_intron_counts}
 5 | \title{Get a data frame of intron counts per gene}
 6 | \usage{
 7 | get_intron_counts(txdb)
 8 | }
 9 | \arguments{
10 | \item{txdb}{A \code{TxDb} object with transcript annotations. See details below
11 | for examples on how to create \code{TxDb} objects from different kinds of input.}
12 | }
13 | \value{
14 | A data frame with intron counts per gene, with variables:
15 | \describe{
16 | \item{gene}{Character with gene IDs.}
17 | \item{introns}{Numeric with number of introns per gene.}
18 | }
19 | }
20 | \description{
21 | Get a data frame of intron counts per gene
22 | }
23 | \details{
24 | The family of functions \code{makeTxDbFrom*} from
25 | the \strong{txdbmaker} package can be used to create \code{TxDb} objects
26 | from a variety of input data types. You can create \code{TxDb} objects
27 | from e.g., \code{GRanges} objects (\code{makeTxDbFromGRanges()}),
28 | GFF files (\code{makeTxDbFromGFF()}),
29 | an Ensembl database (\code{makeTxDbFromEnsembl}), and
30 | a Biomart database (\code{makeTxDbFromBiomart}).
31 | }
32 | \examples{
33 | data(yeast_annot)
34 | 
35 | # Create TxDb object from GRanges
36 | library(txdbmaker)
37 | txdb <- txdbmaker::makeTxDbFromGRanges(yeast_annot[[1]])
38 | 
39 | # Get intron counts
40 | intron_counts <- get_intron_counts(txdb)
41 | }
42 | 


--------------------------------------------------------------------------------
/man/plot_duplicate_freqs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/visualization.R
 3 | \name{plot_duplicate_freqs}
 4 | \alias{plot_duplicate_freqs}
 5 | \title{Plot frequency of duplicates per mode for each species}
 6 | \usage{
 7 | plot_duplicate_freqs(dup_counts, plot_type = "facet", remove_zero = TRUE)
 8 | }
 9 | \arguments{
10 | \item{dup_counts}{A data frame in long format with the number of
11 | duplicates per mode for each species, as returned by
12 | the function \code{duplicates2counts}.}
13 | 
14 | \item{plot_type}{Character indicating how to plot frequencies. One of
15 | 'facet' (facets for each level of the variable \strong{type}),
16 | 'stack' (levels of the variable \strong{type} as stacked bars), or
17 | 'stack_percent' (levels of the variable \strong{type} as stacked bars,
18 | with x-axis representing relative frequencies). Default: 'facet'.}
19 | 
20 | \item{remove_zero}{Logical indicating whether or not to remove rows
21 | with zero values. Default: TRUE.}
22 | }
23 | \value{
24 | A ggplot object.
25 | }
26 | \description{
27 | Plot frequency of duplicates per mode for each species
28 | }
29 | \examples{
30 | data(fungi_kaks)
31 | 
32 | # Get unique duplicates
33 | duplicate_list <- classify_genes(fungi_kaks)
34 | 
35 | # Get count table
36 | dup_counts <- duplicates2counts(duplicate_list)
37 | 
38 | # Plot counts
39 | plot_duplicate_freqs(dup_counts, plot_type = "stack_percent")
40 | }
41 | 


--------------------------------------------------------------------------------
/man/duplicates2counts.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{duplicates2counts}
 4 | \alias{duplicates2counts}
 5 | \title{Get a duplicate count matrix for each genome}
 6 | \usage{
 7 | duplicates2counts(duplicate_list, shape = "long")
 8 | }
 9 | \arguments{
10 | \item{duplicate_list}{A list of data frames with the duplicated genes or
11 | gene pairs and their modes of duplication as returned
12 | by \code{classify_gene_pairs()} or \code{classify_genes()}.}
13 | 
14 | \item{shape}{Character specifying the shape of the output data frame.
15 | One of "long" (data frame in the long shape, in the tidyverse sense),
16 | or "wide" (data frame in the wide shape, in the tidyverse sense).
17 | Default: "long".}
18 | }
19 | \value{
20 | If \strong{shape = "wide"}, a count matrix containing the
21 | frequency of duplicated genes (or gene pairs) by mode for each species,
22 | with species in rows and duplication modes in columns.
23 | If \strong{shape = "long"}, a data frame in long format with the following
24 | variables:
25 | \describe{
26 | \item{type}{Factor, type of duplication.}
27 | \item{n}{Numeric, number of duplicates.}
28 | \item{species}{Character, species name}
29 | }
30 | }
31 | \description{
32 | Get a duplicate count matrix for each genome
33 | }
34 | \examples{
35 | data(fungi_kaks)
36 | 
37 | # Get unique duplicates
38 | duplicate_list <- classify_genes(fungi_kaks)
39 | 
40 | # Get count table
41 | counts <- duplicates2counts(duplicate_list)
42 | }
43 | 


--------------------------------------------------------------------------------
/man/get_segmental.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils_duplicate_classification.R
 3 | \name{get_segmental}
 4 | \alias{get_segmental}
 5 | \title{Classify gene pairs derived from segmental duplications}
 6 | \usage{
 7 | get_segmental(anchor_pairs = NULL, pairs = NULL)
 8 | }
 9 | \arguments{
10 | \item{anchor_pairs}{A 2-column data frame with anchor pairs in columns 1
11 | and 2.}
12 | 
13 | \item{pairs}{A 2-column data frame with all duplicate pairs. This
14 | is equivalent to the first 2 columns of the tabular output of BLAST-like
15 | programs.}
16 | }
17 | \value{
18 | A 3-column data frame with the variables:
19 | \describe{
20 | \item{dup1}{Character, duplicated gene 1}
21 | \item{dup2}{Character, duplicated gene 2}
22 | \item{type}{Factor indicating duplication types, with levels
23 | "SD" (segmental duplication) or
24 | "DD" (dispersed duplication).}
25 | }
26 | }
27 | \description{
28 | Classify gene pairs derived from segmental duplications
29 | }
30 | \examples{
31 | data(diamond_intra)
32 | data(yeast_annot)
33 | data(yeast_seq)
34 | blast_list <- diamond_intra
35 | 
36 | # Get processed annotation for S. cerevisiae
37 | annotation <- syntenet::process_input(yeast_seq, yeast_annot)$annotation[1]
38 | 
39 | # Get list of intraspecies anchor pairs
40 | anchor_pairs <- get_anchors_list(blast_list, annotation)
41 | anchor_pairs <- anchor_pairs[[1]][, c(1, 2)]
42 | 
43 | # Get duplicate pairs from DIAMOND output
44 | duplicates <- diamond_intra[[1]][, c(1, 2)]
45 | dups <- get_segmental(anchor_pairs, duplicates)
46 | }
47 | 


--------------------------------------------------------------------------------
/R/data_validation.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #' Check if gene names in set 1 are present in set 2
 3 | #' 
 4 | #' @param ref_ids Character vector of reference gene set.
 5 | #' @param test_ids Character vector of test gene set.
 6 | #' @param setnames Character vector of length 2 with set names
 7 | #' (reference set first, test set second), used only to build the
 8 | #' error message. Default: \code{c("gene pairs", "CDS")}
 9 | #' 
10 | #' @return TRUE if all IDs in \strong{ref_ids} are present
11 | #' in \strong{test_ids} (including when \strong{ref_ids} is empty),
12 | #' otherwise an error is thrown.
13 | #' @importFrom utils head
14 | #' @details 
15 | #' This internal function can be used, for instance, to check if CDS names
16 | #' match gene IDs in the gene pair list.
17 | #' @noRd
18 | check_geneid_match <- function(
19 |         ref_ids, test_ids, setnames = c("gene pairs", "CDS")
20 | ) {
21 |     
22 |     mismatch_ids <- ref_ids[!ref_ids %in% test_ids]
23 |     n_mismatch <- length(mismatch_ids)
24 |     
25 |     # Decide on the raw count, not on the rounded percentage: a handful of
26 |     # mismatches in a large set rounds to 0%, and an empty reference set
27 |     # gives 0/0 = NaN, which cannot be used as an `if` condition.
28 |     if(n_mismatch > 0) {
29 |         mismatch_perc <- round(n_mismatch / length(ref_ids) * 100, 2)
30 |         
31 |         # Avoid reporting "0%" when mismatches exist but round to zero
32 |         if(mismatch_perc == 0) {
33 |             mismatch_perc <- "<0.01"
34 |         }
35 |         
36 |         stop(
37 |             mismatch_perc, "%", " (N=", n_mismatch, ") of the IDs in ", setnames[1], 
38 |             " were not found in ", setnames[2], ".\n", 
39 |             "All gene IDs in ", setnames[1], " must be in ", setnames[2], 
40 |             ". Did you check if gene IDs match?",
41 |             "\n\nHere are some examples of nonmatching IDs (from ", setnames[1], ") :\n",
42 |             paste0(head(mismatch_ids, n = 5), collapse = "\n"),
43 |             "\n\nAnd here are some examples of IDs in ", setnames[2], ":\n",
44 |             paste0(head(test_ids, n = 5), collapse = "\n")
45 |         )
46 |     }
47 |     
48 |     return(TRUE)
49 | }


--------------------------------------------------------------------------------
/man/plot_rates_by_species.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/visualization.R
 3 | \name{plot_rates_by_species}
 4 | \alias{plot_rates_by_species}
 5 | \title{Plot distributions of substitution rates (Ka, Ks, or Ka/Ks) per species}
 6 | \usage{
 7 | plot_rates_by_species(
 8 |   kaks_list,
 9 |   rate_column = "Ks",
10 |   bytype = FALSE,
11 |   range = c(0, 2),
12 |   fill = "deepskyblue3",
13 |   color = "deepskyblue4"
14 | )
15 | }
16 | \arguments{
17 | \item{kaks_list}{A list of data frames with substitution rates per gene
18 | pair in each species as returned by \code{pairs2kaks()}.}
19 | 
20 | \item{rate_column}{Character indicating the name of the column to plot.
21 | Default: "Ks".}
22 | 
23 | \item{bytype}{Logical indicating whether or not to show distributions by
24 | type of duplication. Default: FALSE.}
25 | 
26 | \item{range}{Numeric vector of length 2 indicating the minimum and maximum
27 | values to plot. Default: \code{c(0, 2)}.}
28 | 
29 | \item{fill}{Character with color to use for the fill aesthetic. Ignored
30 | if \strong{bytype = TRUE}. Default: "deepskyblue3".}
31 | 
32 | \item{color}{Character with color to use for the color aesthetic. Ignored
33 | if \strong{bytype = FALSE}. Default: "deepskyblue4".}
34 | }
35 | \value{
36 | A ggplot object.
37 | }
38 | \description{
39 | Plot distributions of substitution rates (Ka, Ks, or Ka/Ks) per species
40 | }
41 | \details{
42 | Data will be plotted using the species order of the list. To change the
43 | order of the species to plot, reorder the list elements
44 | in \strong{kaks_list}.
45 | }
46 | \examples{
47 | data(fungi_kaks)
48 | 
49 | # Plot rates
50 | plot_rates_by_species(fungi_kaks, rate_column = "Ka_Ks") 
51 | }
52 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(classify_gene_pairs)
 4 | export(classify_genes)
 5 | export(duplicates2counts)
 6 | export(find_ks_peaks)
 7 | export(get_anchors_list)
 8 | export(get_intron_counts)
 9 | export(get_segmental)
10 | export(get_tandem_proximal)
11 | export(get_transposed)
12 | export(get_transposed_classes)
13 | export(pairs2kaks)
14 | export(plot_duplicate_freqs)
15 | export(plot_ks_distro)
16 | export(plot_ks_peaks)
17 | export(plot_rates_by_species)
18 | export(split_pairs_by_peak)
19 | importFrom(AnnotationDbi,select)
20 | importFrom(Biostrings,width)
21 | importFrom(GenomicFeatures,intronsByTranscript)
22 | importFrom(GenomicRanges,GRangesList)
23 | importFrom(MSA2dist,indices2kaks)
24 | importFrom(ggplot2,aes)
25 | importFrom(ggplot2,after_stat)
26 | importFrom(ggplot2,element_blank)
27 | importFrom(ggplot2,facet_grid)
28 | importFrom(ggplot2,facet_wrap)
29 | importFrom(ggplot2,geom_bar)
30 | importFrom(ggplot2,geom_boxplot)
31 | importFrom(ggplot2,geom_density)
32 | importFrom(ggplot2,geom_histogram)
33 | importFrom(ggplot2,geom_violin)
34 | importFrom(ggplot2,geom_vline)
35 | importFrom(ggplot2,ggplot)
36 | importFrom(ggplot2,ggplot_build)
37 | importFrom(ggplot2,labs)
38 | importFrom(ggplot2,scale_fill_manual)
39 | importFrom(ggplot2,scale_x_continuous)
40 | importFrom(ggplot2,scale_y_continuous)
41 | importFrom(ggplot2,stat_function)
42 | importFrom(ggplot2,theme)
43 | importFrom(ggplot2,theme_bw)
44 | importFrom(ggplot2,vars)
45 | importFrom(mclust,densityMclust)
46 | importFrom(rlang,.data)
47 | importFrom(stats,density)
48 | importFrom(stats,dnorm)
49 | importFrom(syntenet,interspecies_synteny)
50 | importFrom(syntenet,intraspecies_synteny)
51 | importFrom(utils,head)
52 | importFrom(utils,read.table)
53 | 


--------------------------------------------------------------------------------
/man/plot_ks_distro.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/visualization.R
 3 | \name{plot_ks_distro}
 4 | \alias{plot_ks_distro}
 5 | \title{Plot distribution of synonymous substitution rates (Ks)}
 6 | \usage{
 7 | plot_ks_distro(
 8 |   ks_df,
 9 |   min_ks = 0.01,
10 |   max_ks = 2,
11 |   bytype = FALSE,
12 |   type_levels = NULL,
13 |   plot_type = "histogram",
14 |   binwidth = 0.03
15 | )
16 | }
17 | \arguments{
18 | \item{ks_df}{A data frame with Ks values for each gene pair
19 | as returned by \code{pairs2kaks()}.}
20 | 
21 | \item{min_ks}{Numeric indicating the minimum Ks value to keep.
22 | Default: 0.01.}
23 | 
24 | \item{max_ks}{Numeric indicating the maximum Ks value to keep.
25 | Default: 2.}
26 | 
27 | \item{bytype}{Logical indicating whether or not to plot the distribution
28 | by type of duplication (requires a column named \code{type}).}
29 | 
30 | \item{type_levels}{(Only valid if \strong{bytype = TRUE}) Character
31 | indicating which levels of the duplication type (column \code{type})
32 | should be kept. By default, all levels are kept.}
33 | 
34 | \item{plot_type}{Character indicating the type of plot to create.
35 | If \strong{bytype = TRUE}, possible types are "histogram" or "violin".
36 | If \strong{bytype = FALSE}, possible types are "histogram", "density",
37 | or "density_histogram". Default: "histogram".}
38 | 
39 | \item{binwidth}{(Only valid if \strong{plot_type = "histogram"})
40 | Numeric indicating the bin width. Default: 0.03.}
41 | }
42 | \value{
43 | A ggplot object.
44 | }
45 | \description{
46 | Plot distribution of synonymous substitution rates (Ks)
47 | }
48 | \examples{
49 | data(fungi_kaks)
50 | ks_df <- fungi_kaks$saccharomyces_cerevisiae
51 | 
52 | # Plot distro
53 | plot_ks_distro(ks_df, bytype = TRUE)
54 | }
55 | 


--------------------------------------------------------------------------------
/man/pairs2kaks.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/ka_ks_analyses.R
 3 | \name{pairs2kaks}
 4 | \alias{pairs2kaks}
 5 | \title{Calculate Ka, Ks, and Ka/Ks from duplicate gene pairs}
 6 | \usage{
 7 | pairs2kaks(gene_pairs_list, cds, model = "MYN", threads = 1, verbose = FALSE)
 8 | }
 9 | \arguments{
10 | \item{gene_pairs_list}{List of data frames containing duplicated gene pairs
11 | as returned by \code{classify_gene_pairs()}.}
12 | 
13 | \item{cds}{List of DNAStringSet objects containing the coding sequences
14 | of each gene.}
15 | 
16 | \item{model}{Character scalar indicating which codon model to use.
17 | Possible values are "Li", "NG86", "NG", "LWL", "LPB", "MLWL", "MLPB", "GY",
18 | "YN", "MYN", "MS", "MA", "GNG", "GLWL", "GLPB", "GMLWL", "GMLPB", "GYN",
19 | and "GMYN". Default: "MYN".}
20 | 
21 | \item{threads}{Numeric indicating the number of threads to use. Default: 1.}
22 | 
23 | \item{verbose}{Logical indicating whether progress messages should be
24 | printed on screen. Default: FALSE.}
25 | }
26 | \value{
27 | A list of data frames containing gene pairs and their Ka, Ks,
28 | and Ka/Ks values.
29 | }
30 | \description{
31 | Calculate Ka, Ks, and Ka/Ks from duplicate gene pairs
32 | }
33 | \examples{
34 | data(diamond_intra)
35 | data(diamond_inter)
36 | data(yeast_annot)
37 | data(yeast_seq)
38 | data(cds_scerevisiae)
39 | blast_list <- diamond_intra
40 | blast_inter <- diamond_inter
41 | 
42 | pdata <- syntenet::process_input(yeast_seq, yeast_annot)
43 | annot <- pdata$annotation["Scerevisiae"]
44 | 
45 | # Binary classification scheme
46 | pairs <- classify_gene_pairs(annot, blast_list)
47 | td_pairs <- pairs[[1]][pairs[[1]]$type == "TD", ]
48 | gene_pairs_list <- list(
49 |     Scerevisiae = td_pairs[seq(1, 3, by = 1), ]
50 | )
51 | 
52 | cds <- list(Scerevisiae = cds_scerevisiae)
53 | 
54 | kaks <- pairs2kaks(gene_pairs_list, cds)
55 | 
56 | }
57 | 


--------------------------------------------------------------------------------
/man/split_pairs_by_peak.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/ka_ks_analyses.R
 3 | \name{split_pairs_by_peak}
 4 | \alias{split_pairs_by_peak}
 5 | \title{Split gene pairs based on their Ks peaks}
 6 | \usage{
 7 | split_pairs_by_peak(ks_df, peaks, nsd = 2, binwidth = 0.05)
 8 | }
 9 | \arguments{
10 | \item{ks_df}{A 3-column data frame with gene pairs in columns 1 and 2,
11 | and Ks values for the gene pair in column 3.}
12 | 
13 | \item{peaks}{A list with mean, standard deviation, and amplitude of Ks
14 | peaks as generated by \code{find_ks_peaks}.}
15 | 
16 | \item{nsd}{Numeric with the number of standard deviations to consider
17 | for each peak.}
18 | 
19 | \item{binwidth}{Numeric scalar with binwidth for the histogram.
20 | Default: 0.05.}
21 | }
22 | \value{
23 | A list with the following elements:
24 | \describe{
25 | \item{pairs}{A 4-column data frame with the variables
26 | \strong{dup1} (character), \strong{dup2} (character),
27 | \strong{ks} (numeric), and \strong{peak} (numeric),
28 | representing duplicate gene pair, Ks values, and peak ID,
29 | respectively.}
30 | \item{plot}{A ggplot object with Ks peaks as returned by
31 | \code{plot_ks_peaks}, but with dashed red lines indicating
32 | boundaries for each peak.}
33 | }
34 | }
35 | \description{
36 | The purpose of this function is to classify gene pairs by age when there
37 | are 2+ Ks peaks. This way, newer gene pairs are found within a
38 | certain number of standard deviations from the highest peak,
39 | and older gene pairs are found around smaller peaks.
40 | }
41 | \examples{
42 | data(fungi_kaks)
43 | scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
44 | 
45 | # Create a data frame of duplicate pairs and Ks values
46 | ks_df <- scerevisiae_kaks[, c("dup1", "dup2", "Ks")]
47 | 
48 | # Create list of peaks
49 | peaks <- find_ks_peaks(ks_df$Ks, npeaks = 2)
50 | 
51 | # Split pairs
52 | spairs <- split_pairs_by_peak(ks_df, peaks) 
53 | }
54 | 


--------------------------------------------------------------------------------
/man/get_tandem_proximal.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils_duplicate_classification.R
 3 | \name{get_tandem_proximal}
 4 | \alias{get_tandem_proximal}
 5 | \title{Classify gene pairs derived from tandem and proximal duplications}
 6 | \usage{
 7 | get_tandem_proximal(pairs = NULL, annotation_granges = NULL, proximal_max = 10)
 8 | }
 9 | \arguments{
10 | \item{pairs}{A 3-column data frame with columns \strong{dup1}, \strong{dup2},
11 | and \strong{type} indicating duplicated gene 1, duplicated gene 2, and
12 | the mode of duplication associated with the pair. This data frame
13 | is returned by \code{get_segmental()}.}
14 | 
15 | \item{annotation_granges}{A processed GRanges object as in each element
16 | of the list returned by \code{syntenet::process_input()}.}
17 | 
18 | \item{proximal_max}{Numeric scalar with the maximum distance (in number
19 | of genes) between two genes to consider them as proximal duplicates.
20 | Default: 10.}
21 | }
22 | \value{
23 | A 3-column data frame with the variables:
24 | \describe{
25 | \item{dup1}{Character, duplicated gene 1.}
26 | \item{dup2}{Character, duplicated gene 2.}
27 | \item{type}{Factor of duplication types, with levels
28 | "SD" (segmental duplication),
29 | "TD" (tandem duplication),
30 | "PD" (proximal duplication), and
31 | "DD" (dispersed duplication).}
32 | }
33 | }
34 | \description{
35 | Classify gene pairs derived from tandem and proximal duplications
36 | }
37 | \examples{
38 | data(yeast_annot)
39 | data(yeast_seq)
40 | data(fungi_kaks)
41 | scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
42 | 
43 | # Get processed annotation for S. cerevisiae
44 | pdata <- annotation <- syntenet::process_input(yeast_seq, yeast_annot)
45 | annot <- pdata$annotation[[1]]
46 | 
47 | # Get duplicated pairs
48 | pairs <- scerevisiae_kaks[, c("dup1", "dup2", "type")]
49 | pairs$dup1 <- paste0("Sce_", pairs$dup1)
50 | pairs$dup2 <- paste0("Sce_", pairs$dup2)
51 | 
52 | # Get tandem and proximal duplicates
53 | td_pd_pairs <- get_tandem_proximal(pairs, annot)
54 | 
55 | }
56 | 


--------------------------------------------------------------------------------
/man/find_ks_peaks.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/ka_ks_analyses.R
 3 | \name{find_ks_peaks}
 4 | \alias{find_ks_peaks}
 5 | \title{Find peaks in a Ks distribution with Gaussian Mixture Models}
 6 | \usage{
 7 | find_ks_peaks(ks, npeaks = 2, min_ks = 0.01, max_ks = 4, verbose = FALSE)
 8 | }
 9 | \arguments{
10 | \item{ks}{A numeric vector of Ks values.}
11 | 
12 | \item{npeaks}{Numeric scalar indicating the number of peaks in
13 | the Ks distribution. If you don't know how many peaks there are,
14 | you can include a range of values, and the number of peaks that produces
15 | the lowest BIC (Bayesian Information Criterion) will be selected as the
16 | optimal. Default: 2.}
17 | 
18 | \item{min_ks}{Numeric scalar with the minimum Ks value. Removing
19 | very small Ks values is generally used to avoid the incorporation of allelic
20 | and/or splice variants and to prevent the fitting of a component to infinity.
21 | Default: 0.01.}
22 | 
23 | \item{max_ks}{Numeric scalar indicating the maximum Ks value. Removing
24 | very large Ks values is usually performed to account for Ks saturation.
25 | Default: 4.}
26 | 
27 | \item{verbose}{Logical indicating if messages should be printed on screen.
28 | Default: FALSE.}
29 | }
30 | \value{
31 | A list with the following elements:
32 | \describe{
33 | \item{mean}{Numeric with the estimated means.}
34 | \item{sd}{Numeric with the estimated standard deviations.}
35 | \item{lambda}{Numeric with the estimated mixture weights.}
36 | \item{ks}{Numeric vector of filtered Ks distribution based on
37 | arguments passed to min_ks and max_ks.}
38 | }
39 | }
40 | \description{
41 | Find peaks in a Ks distribution with Gaussian Mixture Models
42 | }
43 | \examples{
44 | data(fungi_kaks)
45 | scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
46 | ks <- scerevisiae_kaks$Ks
47 | 
48 | # Find 2 peaks in Ks distribution
49 | peaks <- find_ks_peaks(ks, npeaks = 2)
50 | 
51 | # From 2 to 4 peaks, verbose = TRUE to show BIC values
52 | peaks <- find_ks_peaks(ks, npeaks = c(2, 3, 4), verbose = TRUE)
53 | }
54 | 


--------------------------------------------------------------------------------
/man/get_anchors_list.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{get_anchors_list}
 4 | \alias{get_anchors_list}
 5 | \title{Get a list of anchor pairs for each species}
 6 | \usage{
 7 | get_anchors_list(
 8 |   blast_list = NULL,
 9 |   annotation = NULL,
10 |   evalue = 1e-10,
11 |   anchors = 5,
12 |   max_gaps = 25,
13 |   collinearity_dir = NULL
14 | )
15 | }
16 | \arguments{
17 | \item{blast_list}{A list of data frames containing BLAST tabular output
18 | for intraspecies comparisons.
19 | Each list element corresponds to the BLAST output for a given species,
20 | and names of list elements must match the names of list elements in
21 | \code{annotation}. BLASTp, DIAMOND or similar programs must be run on processed
22 | sequence data as returned by \code{process_input()}.}
23 | 
24 | \item{annotation}{A processed GRangesList or CompressedGRangesList object as
25 | returned by \code{syntenet::process_input()}.}
26 | 
27 | \item{evalue}{Numeric scalar indicating the E-value threshold.
28 | Default: 1e-10.}
29 | 
30 | \item{anchors}{Numeric indicating the minimum required number of genes
31 | to call a syntenic block, as in \code{syntenet::infer_syntenet}.
32 | Default: 5.}
33 | 
34 | \item{max_gaps}{Numeric indicating the number of upstream and downstream
35 | genes to search for anchors, as in \code{syntenet::infer_syntenet}.
36 | Default: 25.}
37 | 
38 | \item{collinearity_dir}{Character indicating the path to the directory
39 | where .collinearity files will be stored. If NULL, files will
40 | be stored in a subdirectory of \code{tempdir()}. Default: NULL.}
41 | }
42 | \value{
43 | A list of data frames representing intraspecies anchor pairs.
44 | }
45 | \description{
46 | Get a list of anchor pairs for each species
47 | }
48 | \examples{
49 | data(diamond_intra)
50 | data(yeast_annot)
51 | data(yeast_seq)
52 | blast_list <- diamond_intra
53 | 
54 | # Get processed annotation for S. cerevisiae
55 | annotation <- syntenet::process_input(yeast_seq, yeast_annot)$annotation
56 | 
57 | # Get list of intraspecies anchor pairs
58 | anchorpairs <- get_anchors_list(blast_list, annotation)
59 | }
60 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/issue_template.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report or feature request
 3 | about: Describe a bug you've seen or make a case for a new feature
 4 | title: "[BUG] Your bug or feature request"
 5 | labels: ''
 6 | assignees: ''
 7 | ---
 8 | 
 9 | Please briefly describe your problem and what output you expect. If you have a question, please don't use this form. Instead, ask on <https://support.bioconductor.org/> using the appropriate tag(s) including one for this package.
10 | 
11 | ## Context
12 | 
13 | Provide some context for your bug report or feature request. This could be the:
14 | 
15 | * link to raw code, example: https://github.com/lcolladotor/osca_LIIGH_UNAM_2020/blob/master/00-template.Rmd#L24-L28
16 | * link to a commit, example: https://github.com/lcolladotor/osca_LIIGH_UNAM_2020/commit/6aa30b22eda614d932c12997ba611ba582c435d7
17 | * link to a line of code inside a commit, example: https://github.com/lcolladotor/osca_LIIGH_UNAM_2020/commit/6aa30b22eda614d932c12997ba611ba582c435d7#diff-e265269fe4f17929940e81341b92b116R17
18 | * link to code from an R package, example: https://github.com/LieberInstitute/spatialLIBD/blob/master/R/run_app.R#L51-L55
19 | 
20 | ## Code
21 | 
22 | Include the code you ran and comments
23 | 
24 | ```R
25 | ## prompt an error
26 | stop('hola')
27 | 
28 | ## check the error trace
29 | traceback()
30 | ```
31 | 
32 | ## Small reproducible example
33 | 
34 | If you copy the lines of code that lead to your error, you can then run [`reprex::reprex()`](https://reprex.tidyverse.org/reference/reprex.html) which will create a small website with code you can then easily copy-paste here in a way that will be easy to work with later on.
35 | 
36 | ```R
37 | ## prompt an error
38 | stop('hola')
39 | #> Error in eval(expr, envir, enclos): hola
40 | 
41 | ## check the error trace
42 | traceback()
43 | #> No traceback available
44 | ```
45 | 
46 | 
47 | ## R session information
48 | 
49 | Remember to include your full R session information.
50 | 
51 | ```R
52 | options(width = 120)
53 | sessioninfo::session_info()
54 | ```
55 | 
56 | The output of `sessioninfo::session_info()` includes relevant GitHub installation information and other details that are missed by `sessionInfo()`.
57 | 


--------------------------------------------------------------------------------
/man/get_transposed_classes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils_duplicate_classification.R
 3 | \name{get_transposed_classes}
 4 | \alias{get_transposed_classes}
 5 | \title{Classify TRD genes as derived from either DNA transposons or retrotransposons}
 6 | \usage{
 7 | get_transposed_classes(pairs, intron_counts)
 8 | }
 9 | \arguments{
10 | \item{pairs}{A 3-column data frame with columns \strong{dup1}, \strong{dup2},
11 | and \strong{type} indicating duplicated gene 1, duplicated gene 2, and
12 | the mode of duplication associated with the pair. This data frame
13 | is returned by \code{get_transposed()}.}
14 | 
15 | \item{intron_counts}{A 2-column data frame with columns \strong{gene}
16 | and \strong{introns} indicating the number of introns for each gene,
17 | as returned by \code{get_intron_counts}.}
18 | }
19 | \value{
20 | A 3-column data frame with the following variables:
21 | \describe{
22 | \item{dup1}{Character, duplicated gene 1.}
23 | \item{dup2}{Character, duplicated gene 2.}
24 | \item{type}{Factor of duplication types, with levels
25 | "SD" (segmental duplication),
26 | "TD" (tandem duplication),
27 | "PD" (proximal duplication),
28 | "dTRD" (DNA transposon-derived duplication),
29 | "rTRD" (retrotransposon-derived duplication), and
30 | "DD" (dispersed duplication).}
31 | }
32 | }
33 | \description{
34 | Classify TRD genes as derived from either DNA transposons or retrotransposons
35 | }
36 | \examples{
37 | data(diamond_inter)
38 | data(diamond_intra)
39 | data(yeast_seq)
40 | data(yeast_annot)
41 | data(fungi_kaks)
42 | scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
43 | 
44 | # Get processed annotation
45 | pdata <- syntenet::process_input(yeast_seq, yeast_annot)
46 | annotation <- pdata$annotation
47 | 
48 | # Get duplicated pairs
49 | pairs <- scerevisiae_kaks[, c("dup1", "dup2", "type")]
50 | pairs$dup1 <- paste0("Sce_", pairs$dup1)
51 | pairs$dup2 <- paste0("Sce_", pairs$dup2)
52 | 
53 | # Classify pairs
54 | trd <- get_transposed(pairs, diamond_inter, annotation)
55 | 
56 | # Create TxDb object from GRanges
57 | library(txdbmaker)
58 | txdb <- txdbmaker::makeTxDbFromGRanges(yeast_annot[[1]])
59 | 
60 | # Get intron counts
61 | intron_counts <- get_intron_counts(txdb)
62 | 
63 | # Get TRD classes
64 | trd_classes <- get_transposed_classes(trd, intron_counts)
65 | 
66 | }
67 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: doubletrouble
 2 | Title: Identification and classification of duplicated genes
 3 | Version: 1.9.1
 4 | Date: 2024-03-19
 5 | Authors@R: 
 6 |     c(
 7 |     person(given = "Fabrício",
 8 |            family = "Almeida-Silva",
 9 |            role = c("aut", "cre"),
10 |            email = "fabricio_almeidasilva@hotmail.com",
11 |            comment = c(ORCID = "0000-0002-5314-2964")),
12 |     person(given = "Yves",
13 |            family = "Van de Peer",
14 |            role = "aut",
15 |            email = "yves.vandepeer@psb.vib-ugent.be",
16 |            comment = c(ORCID = "0000-0003-4327-3730"))
17 |     )
18 | Description: doubletrouble aims to identify duplicated genes from
19 |     whole-genome protein sequences and classify them based on their modes
20 |     of duplication. The duplication modes are i. segmental duplication (SD);
21 |     ii. tandem duplication (TD);
22 |     iii. proximal duplication (PD);
23 |     iv. transposed duplication (TRD); and
24 |     v. dispersed duplication (DD).
25 |     Transposon-derived duplicates (TRD) can be further subdivided into
26 |     rTRD (retrotransposon-derived duplication) and 
27 |     dTRD (DNA transposon-derived duplication).
28 |     If users want a simpler classification scheme, duplicates can also be
29 |     classified into SD- and SSD-derived (small-scale duplication) gene pairs.
30 |     Besides classifying gene pairs, users can also classify genes, so that
31 |     each gene is assigned a unique mode of duplication. 
32 |     Users can also calculate substitution rates per substitution site (i.e., Ka
33 |     and Ks) from duplicate pairs, find peaks in Ks distributions with Gaussian
34 |     Mixture Models (GMMs), and classify gene pairs into age groups based on Ks
35 |     peaks.
36 | License: GPL-3
37 | URL: https://github.com/almeidasilvaf/doubletrouble
38 | BugReports: https://support.bioconductor.org/t/doubletrouble
39 | biocViews: 
40 |     Software,
41 |     WholeGenome,
42 |     ComparativeGenomics,
43 |     FunctionalGenomics,
44 |     Phylogenetics,
45 |     Network,
46 |     Classification
47 | Encoding: UTF-8
48 | LazyData: false
49 | Roxygen: list(markdown = TRUE)
50 | RoxygenNote: 7.3.2
51 | Imports:
52 |     syntenet,
53 |     GenomicRanges,
54 |     Biostrings,
55 |     mclust,
56 |     MSA2dist (>= 1.1.5),
57 |     ggplot2,
58 |     rlang,
59 |     stats,
60 |     utils,
61 |     AnnotationDbi,
62 |     GenomicFeatures
63 | Depends: 
64 |     R (>= 4.2.0)
65 | Suggests: 
66 |     txdbmaker,
67 |     testthat (>= 3.0.0),
68 |     knitr,
69 |     feature,
70 |     patchwork,
71 |     BiocStyle,
72 |     rmarkdown,
73 |     covr,
74 |     sessioninfo
75 | Config/testthat/edition: 3
76 | VignetteBuilder: knitr
77 | 


--------------------------------------------------------------------------------
/tests/testthat/test-visualization.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # Load the example `fungi_kaks` dataset used by every test in this file ----
 3 | data(fungi_kaks)
 4 | 
 5 | # Start tests ----
 6 | test_that("duplicates2counts() returns counts", {
 7 |     # Classify genes from the example Ka/Ks data
 8 |     dup_genes <- classify_genes(fungi_kaks)
 9 | 
10 |     # Both supported shapes must yield a data frame
11 |     for (shp in c("wide", "long")) {
12 |         counts <- duplicates2counts(dup_genes, shape = shp)
13 |         expect_true(is.data.frame(counts))
14 |     }
15 | 
16 |     # Unsupported shapes are rejected
17 |     expect_error(duplicates2counts(dup_genes, shape = "error"))
18 | })
19 | 
20 | 
21 | test_that("plot_duplicate_freqs() returns a ggplot object", {
22 |     # Build the table of duplicate counts from the example data
23 |     freq_table <- duplicates2counts(classify_genes(fungi_kaks))
24 | 
25 |     # Draw one plot per supported plot type
26 |     plot_types <- c("facet", "stack", "stack_percent")
27 |     plots <- lapply(plot_types, function(pt) {
28 |         plot_duplicate_freqs(freq_table, plot_type = pt)
29 |     })
30 | 
31 |     # Every plot must carry the "ggplot" class
32 |     for (p in plots) {
33 |         expect_true(inherits(p, "ggplot"))
34 |     }
35 | 
36 |     # Unknown plot types must raise an error
37 |     expect_error(plot_duplicate_freqs(freq_table, plot_type = "error"))
38 | })
39 | 
40 | 
41 | test_that("plot_ks_distro() returns a ggplot object", {
42 |     sce <- fungi_kaks[["saccharomyces_cerevisiae"]]
43 | 
44 |     # Plots split by duplication type
45 |     by_type <- list(
46 |         plot_ks_distro(sce, bytype = TRUE),
47 |         plot_ks_distro(
48 |             sce, bytype = TRUE, plot_type = "violin",
49 |             type_levels = c("SD", "All")
50 |         )
51 |     )
52 |     # Plots of the whole distribution, one per supported plot type
53 |     overall <- lapply(
54 |         c("histogram", "density", "density_histogram"),
55 |         function(pt) plot_ks_distro(sce, bytype = FALSE, plot_type = pt)
56 |     )
57 |     for (p in c(by_type, overall)) expect_true(inherits(p, "ggplot"))
58 | 
59 |     expect_error(plot_ks_distro(sce, bytype = TRUE, plot_type = "error"))
60 |     expect_error(plot_ks_distro(sce, bytype = FALSE, plot_type = "error"))
61 | })
62 | 
63 | 
64 | test_that("plot_rates_by_species() returns a ggplot object", {
65 |     # One plot per rate column; Ks is additionally split by duplication type
66 |     plots <- list(
67 |         plot_rates_by_species(fungi_kaks, rate_column = "Ka_Ks"),
68 |         plot_rates_by_species(fungi_kaks, rate_column = "Ks", bytype = TRUE),
69 |         plot_rates_by_species(fungi_kaks, rate_column = "Ka")
70 |     )
71 |     for (p in plots) expect_true(inherits(p, "ggplot"))
72 | 
73 |     # A rate column other than the ones plotted above must raise an error
74 |     expect_error(plot_rates_by_species(fungi_kaks, rate_column = "Kw"))
75 | })
76 | 
77 | 
78 | 
79 | 


--------------------------------------------------------------------------------
/.github/SUPPORT.md:
--------------------------------------------------------------------------------
 1 | # Getting help with `doubletrouble`
 2 | 
 3 | Thanks for using `doubletrouble`!
 4 | Before filing an issue, there are a few places to explore and pieces to put together to make the process as smooth as possible.
 5 | 
 6 | ## Make a reprex
 7 | 
 8 | Start by making a minimal **repr**oducible **ex**ample using the  [reprex](https://reprex.tidyverse.org/) package.
 9 | If you haven't heard of or used reprex before, you're in for a treat!
10 | Seriously, reprex will make all of your R-question-asking endeavors easier (which is a pretty insane ROI for the five to ten minutes it'll take you to learn what it's all about).
11 | For additional reprex pointers, check out the [Get help!](https://www.tidyverse.org/help/) section of the tidyverse site.
12 | 
13 | ## Where to ask?
14 | 
15 | Armed with your reprex, the next step is to figure out [where to ask](https://www.tidyverse.org/help/#where-to-ask). See also the [Bioconductor help](http://bioconductor.org/help/) website.
16 | 
17 | *   If it's a question: start with [community.rstudio.com](https://community.rstudio.com/), and/or StackOverflow. If this is a Bioconductor-related question, please ask it at the [Bioconductor Support Website](https://support.bioconductor.org/) using the [appropriate package tag](https://support.bioconductor.org/t/doubletrouble) (the website will send an automatic email to the package authors). There are more people there to answer questions.
18 | 
19 | *   If it's a bug: you're in the right place, [file an issue](https://github.com/almeidasilvaf/doubletrouble/issues/new).
20 | 
21 | *   If you're not sure: let the community help you figure it out!
22 |     If your problem _is_ a bug or a feature request, you can easily return here and report it.
23 | 
24 | Before opening a new issue, be sure to [search issues and pull requests](https://github.com/almeidasilvaf/doubletrouble/issues) to make sure the bug hasn't been reported and/or already fixed in the development version.
25 | By default, the search will be pre-populated with `is:issue is:open`.
26 | You can [edit the qualifiers](https://help.github.com/articles/searching-issues-and-pull-requests/)  (e.g. `is:pr`, `is:closed`) as needed.
27 | For example, you'd simply remove `is:open` to search _all_ issues in the repo, open or closed.
28 | 
29 | ## What happens next?
30 | 
31 | To be as efficient as possible, development of tidyverse packages tends to be very bursty, so you shouldn't worry if you don't get an immediate response.
32 | Typically we don't look at a repo until a sufficient quantity of issues accumulates, then there’s a burst of intense activity as we focus our efforts.
33 | That makes development more efficient because it avoids expensive context switching between problems, at the cost of taking longer to get back to you.
34 | This process makes a good reprex particularly important because it might be multiple months between your initial report and when we start working on it.
35 | If we can’t reproduce the bug, we can’t fix it!
36 | 


--------------------------------------------------------------------------------
/.github/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to doubletrouble
 2 | 
 3 | This outlines how to propose a change to doubletrouble. 
 4 | For more detailed info about contributing to this, and other tidyverse packages, please see the
 5 | [**development contributing guide**](https://rstd.io/tidy-contrib). 
 6 | 
 7 | ## Fixing typos
 8 | 
 9 | You can fix typos, spelling mistakes, or grammatical errors in the documentation directly using the GitHub web interface, as long as the changes are made in the _source_ file. 
10 | This generally means you'll need to edit [roxygen2 comments](https://roxygen2.r-lib.org/articles/roxygen2.html) in an `.R`, not a `.Rd` file. 
11 | You can find the `.R` file that generates the `.Rd` by reading the comment in the first line.
12 | 
13 | ## Bigger changes
14 | 
15 | If you want to make a bigger change, it's a good idea to first file an issue and make sure someone from the team agrees that it’s needed. 
16 | If you’ve found a bug, please file an issue that illustrates the bug with a minimal 
17 | [reprex](https://www.tidyverse.org/help/#reprex) (this will also help you write a unit test, if needed).
18 | 
19 | ### Pull request process
20 | 
21 | *   Fork the package and clone onto your computer. If you haven't done this before, we recommend using `usethis::create_from_github("almeidasilvaf/doubletrouble", fork = TRUE)`.
22 | 
23 | *   Install all development dependencies with `devtools::install_dev_deps()`, and then make sure the package passes R CMD check by running `devtools::check()`. 
24 |     If R CMD check doesn't pass cleanly, it's a good idea to ask for help before continuing. 
25 | *   Create a Git branch for your pull request (PR). We recommend using `usethis::pr_init("brief-description-of-change")`.
26 | 
27 | *   Make your changes, commit to git, and then create a PR by running `usethis::pr_push()`, and following the prompts in your browser.
28 |     The title of your PR should briefly describe the change.
29 |     The body of your PR should contain `Fixes #issue-number`.
30 | 
31 | *  For user-facing changes, add a bullet to the top of `NEWS.md` (i.e. just below the first header). Follow the style described in <https://style.tidyverse.org/news.html>.
32 | 
33 | ### Code style
34 | 
35 | *   New code should follow the tidyverse [style guide](https://style.tidyverse.org). 
36 |     You can use the [styler](https://CRAN.R-project.org/package=styler) package to apply these styles, but please don't restyle code that has nothing to do with your PR.  
37 | 
38 | *  We use [roxygen2](https://cran.r-project.org/package=roxygen2), with [Markdown syntax](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd-formatting.html), for documentation.  
39 | 
40 | *  We use [testthat](https://cran.r-project.org/package=testthat) for unit tests. 
41 |    Contributions with test cases included are easier to accept.  
42 | 
43 | ## Code of Conduct
44 | 
45 | Please note that the doubletrouble project is released with a
46 | [Contributor Code of Conduct](CODE_OF_CONDUCT.md). By contributing to this
47 | project you agree to abide by its terms.
48 | 


--------------------------------------------------------------------------------
/tests/testthat/test-ka_ks_analyses.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #----Load data------------------------------------------------------------------
  3 | data(fungi_kaks)
  4 | data(diamond_intra)
  5 | data(diamond_inter)
  6 | data(yeast_annot)
  7 | data(yeast_seq)
  8 | data(cds_scerevisiae)
  9 | blast_list <- diamond_intra
 10 | blast_inter <- diamond_inter
 11 | # Duplicate pairs and substitution rates for S. cerevisiae only
 12 | scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
 13 | # Processed annotation, subset with `[` so it stays a one-element list
 14 | pdata <- syntenet::process_input(yeast_seq, yeast_annot)
 15 | annot <- pdata$annotation["Scerevisiae"]
 16 | # Classify pairs, then keep only the first 3 pairs of type "TD"
 17 | pairs <- classify_gene_pairs(annot, blast_list)
 18 | td_pairs <- pairs[[1]][pairs[[1]]$type == "TD", ]
 19 | gene_pairs_list <- list(
 20 |     Scerevisiae = td_pairs[seq(1, 3, by = 1), ]
 21 | )
 22 | # CDS wrapped in a list named like the element of gene_pairs_list
 23 | cds <- list(Scerevisiae = cds_scerevisiae)
 24 | # Ks values used as input by the peak-finding tests
 25 | ks <- scerevisiae_kaks$Ks
 26 | 
 27 | ## Simulate gene with CDS that is not a multiple of 3 (drop last base of Q0055)
 28 | cds2 <- cds
 29 | cds2$Scerevisiae$Q0055 <- Biostrings::subseq(
 30 |     cds2$Scerevisiae$Q0055, 1, length(cds2$Scerevisiae$Q0055) - 1
 31 | )
 32 | 
 33 | #----Start tests----------------------------------------------------------------
 34 | test_that("pairs2kaks() returns a data frame with Ka, Ks, and Ka/Ks", {
 35 |     # Rates from the intact CDS, and from the CDS set with a truncated gene
 36 |     rates_ok <- pairs2kaks(gene_pairs_list, cds, verbose = TRUE)
 37 |     rates_trimmed <- pairs2kaks(gene_pairs_list, cds2)
 38 |     rates_df <- rates_ok[[1]]
 39 |     expect_equal(class(rates_ok), "list")
 40 |     expect_equal(class(rates_df), "data.frame")
 41 |     expect_equal(nrow(rates_df), nrow(gene_pairs_list[[1]]))
 42 |     rate_cols <- c("Ks", "Ka", "Ka_Ks")
 43 |     for (rc in rate_cols) expect_true(rc %in% names(rates_df))
 44 |     # The run on the truncated CDS must still produce a list
 45 |     expect_equal(class(rates_trimmed), "list")
 46 | })
 47 | 
 48 | 
 49 | test_that("find_ks_peaks() returns a list of mean, sd, amplitudes, ks vals", {
 50 |     # With verbose = TRUE, the call is expected to emit a message
 51 |     expect_message(
 52 |         ks_peaks <- find_ks_peaks(ks, npeaks = 1:2, verbose = TRUE)
 53 |     )
 54 |     expected_names <- c("mean", "sd", "lambda", "ks")
 55 |     expect_equal(class(ks_peaks), "list")
 56 |     expect_named(ks_peaks, expected_names)
 57 |     expect_length(ks_peaks[["mean"]], 2)
 58 | })
 59 | 
 60 | 
 61 | test_that("plot_ks_peaks() returns a ggplot object", {
 62 |     # Hard-coded parameters for two peaks, so no model fitting is needed here
 63 |     mixture <- list(
 64 |         mean = c(0.118717925754829, 0.534196999662316),
 65 |         sd = c(0.054568151633283, 0.227909257694474),
 66 |         lambda = c(0.49001230165837, 0.509987698341629)
 67 |     )
 68 |     
 69 |     peaks_figure <- plot_ks_peaks(mixture, binwidth = 0.05)
 70 |     figure_classes <- class(peaks_figure)
 71 |     expect_true("ggplot" %in% figure_classes)
 72 | })
 73 | 
 74 | test_that("find_intersect_mixtures() returns a numeric scalar", {
 75 |     # Fits with 2 peaks and with 1 peak; only the former has an intersection
 76 |     two_peaks <- find_ks_peaks(ks, npeaks = 2)
 77 |     one_peak <- find_ks_peaks(ks, npeaks = 1)
 78 |     crossing <- find_intersect_mixtures(two_peaks)
 79 |     
 80 |     expect_equal(class(crossing), "numeric")
 81 |     expect_length(crossing, 1)
 82 |     
 83 |     # A fit with a single peak is expected to raise an error
 84 |     expect_error(find_intersect_mixtures(one_peak))
 85 |     
 86 | })
 87 | 
 88 | test_that("split_pairs_by_peak() returns a list", {
 89 |     # Input table: duplicate pairs and their Ks values
 90 |     ks_table <- scerevisiae_kaks[, c("dup1", "dup2", "Ks")]
 91 |     two_peaks <- find_ks_peaks(ks_table[["Ks"]], npeaks = 2)
 92 |     one_peak <- find_ks_peaks(ks, npeaks = 1)
 93 |     
 94 |     split_two <- split_pairs_by_peak(ks_table, two_peaks)
 95 |     split_one <- split_pairs_by_peak(ks_table, one_peak)
 96 |     
 97 |     expect_equal(class(split_one), "list")
 98 |     expect_equal(class(split_two), "list")
 99 |     expect_named(split_two, c("pairs", "plot"))
100 |     pairs_out <- split_two[["pairs"]]
101 |     expect_equal(class(pairs_out), "data.frame")
102 |     expect_equal(ncol(pairs_out), 4)
103 |     expect_true("ggplot" %in% class(split_two[["plot"]]))
104 | })
105 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' Protein sequences of the yeast species S. cerevisiae and C. glabrata
  3 | #'
  4 | #' Data obtained from Ensembl Fungi. Only translated sequences of primary
  5 | #' transcripts were included. 
  6 | #' 
  7 | #' @name yeast_seq
  8 | #' @format A list of AAStringSet objects with the elements
  9 | #' \strong{Scerevisiae} and \strong{Cglabrata}.
 10 | #' @examples
 11 | #' data(yeast_seq)
 12 | #' @usage data(yeast_seq)
 13 | "yeast_seq"
 14 | 
 15 | 
 16 | #' Genome annotation of the yeast species S. cerevisiae and C. glabrata
 17 | #'
 18 | #' Data obtained from Ensembl Fungi. Only annotation data for protein-coding
 19 | #' genes (with associated mRNA, exons, CDS, etc.) are included.
 20 | #' 
 21 | #' @name yeast_annot
 22 | #' @format A CompressedGRangesList containing 
 23 | #' the elements \strong{Scerevisiae} and \strong{Cglabrata}.
 24 | #' @examples
 25 | #' data(yeast_annot)
 26 | #' @usage data(yeast_annot)
 27 | "yeast_annot"
 28 | 
 29 | 
 30 | #' Intraspecies DIAMOND output for S. cerevisiae
 31 | #'
 32 | #' List obtained with \code{run_diamond()}.
 33 | #' 
 34 | #' @name diamond_intra
 35 | #' @format A list of data frames (length 1) containing the whole paranome of
 36 | #' S. cerevisiae resulting from intragenome similarity searches.
 37 | #' @examples 
 38 | #' data(diamond_intra)
 39 | #' @usage data(diamond_intra)
 40 | "diamond_intra"
 41 | 
 42 | 
 43 | #' Interspecies DIAMOND output for yeast species
 44 | #' 
 45 | #' This list contains a similarity search of S. cerevisiae against
 46 | #' C. glabrata, and it was obtained with \code{run_diamond()}.
 47 | #' 
 48 | #' @name diamond_inter
 49 | #' @format A list of data frames (length 1) containing the output of a 
 50 | #' DIAMOND search of S. cerevisiae against C. glabrata (outgroup).
 51 | #' @examples 
 52 | #' data(diamond_inter)
 53 | #' @usage data(diamond_inter)
 54 | "diamond_inter"
 55 | 
 56 | 
 57 | #' Coding sequences (CDS) of S. cerevisiae
 58 | #' 
 59 | #' Data were obtained from Ensembl Fungi, and only CDS of primary transcripts
 60 | #' were included.
 61 | #' 
 62 | #' @name cds_scerevisiae 
 63 | #' @format A DNAStringSet object with CDS of S. cerevisiae.
 64 | #' @examples 
 65 | #' data(cds_scerevisiae)
 66 | #' @usage data(cds_scerevisiae)
 67 | "cds_scerevisiae"
 68 | 
 69 | 
 70 | #' Duplicate pairs and Ka, Ks, and Ka/Ks values for fungi species
 71 | #'
 72 | #' This data set was obtained with \code{classify_gene_pairs()} followed
 73 | #' by \code{pairs2kaks()}.
 74 | #' 
 75 | #' @name fungi_kaks
 76 | #' @format A list of data frames with elements 
 77 | #' named \strong{saccharomyces_cerevisiae}, \strong{candida_glabrata},
 78 | #' and \strong{schizosaccharomyces_pombe}. Each data frame contains 
 79 | #' the following variables:
 80 | #' \describe{
 81 | #'   \item{dup1}{Character, duplicated gene 1.}
 82 | #'   \item{dup2}{Character, duplicated gene 2.}
 83 | #'   \item{Ka}{Numeric, Ka values.}
 84 | #'   \item{Ks}{Numeric, Ks values.}
 85 | #'   \item{Ka_Ks}{Numeric, Ka/Ks values.}
 86 | #'   \item{type}{Character, mode of duplication.}
 87 | #' }
 88 | #' @examples 
 89 | #' data(fungi_kaks)
 90 | #' @usage data(fungi_kaks)
 91 | "fungi_kaks"
 92 | 
 93 | 
 94 | #' Duplicate pairs and Ks values for Glycine max
 95 | #'
 96 | #' This data set was obtained with \code{classify_gene_pairs()} followed
 97 | #' by \code{pairs2kaks()}.
 98 | #' 
 99 | #' @name gmax_ks
100 | #' @format A data frame with the following variables:
101 | #' \describe{
102 | #'   \item{dup1}{Character, duplicated gene 1.}
103 | #'   \item{dup2}{Character, duplicated gene 2.}
104 | #'   \item{Ks}{Numeric, Ks values.}
105 | #'   \item{type}{Factor, duplication mode.}
106 | #' }
107 | #' @examples 
108 | #' data(gmax_ks)
109 | #' @usage data(gmax_ks)
110 | "gmax_ks"
111 | 
112 | 
113 | 


--------------------------------------------------------------------------------
/vignettes/bibliography.bib:
--------------------------------------------------------------------------------
  1 | 
  2 | @book{ohno2013evolution,
  3 |   title={Evolution by gene duplication},
  4 |   author={Ohno, Susumu},
  5 |   year={2013},
  6 |   publisher={Springer Science \& Business Media}
  7 | }
  8 | 
  9 | @article{yates2022ensembl,
 10 |   title={Ensembl Genomes 2022: an expanding genome resource for non-vertebrates},
 11 |   author={Yates, Andrew D and Allen, James and Amode, Ridwan M and Azov, Andrey G and Barba, Matthieu and Becerra, Andr{\'e}s and Bhai, Jyothish and Campbell, Lahcen I and Carbajo Martinez, Manuel and Chakiachvili, Marc and others},
 12 |   journal={Nucleic acids research},
 13 |   volume={50},
 14 |   number={D1},
 15 |   pages={D996--D1003},
 16 |   year={2022},
 17 |   publisher={Oxford University Press}
 18 | }
 19 | 
 20 | @article{wang2010kaks_calculator,
 21 |   title={KaKs\_Calculator 2.0: a toolkit incorporating gamma-series methods and sliding window strategies},
 22 |   author={Wang, Dapeng and Zhang, Yubin and Zhang, Zhang and Zhu, Jiang and Yu, Jun},
 23 |   journal={Genomics, proteomics \& bioinformatics},
 24 |   volume={8},
 25 |   number={1},
 26 |   pages={77--80},
 27 |   year={2010},
 28 |   publisher={Elsevier}
 29 | }
 30 | 
 31 | @article{qiao2019gene,
 32 |   title={Gene duplication and evolution in recurring polyploidization--diploidization cycles in plants},
 33 |   author={Qiao, Xin and Li, Qionghou and Yin, Hao and Qi, Kaijie and Li, Leiting and Wang, Runze and Zhang, Shaoling and Paterson, Andrew H},
 34 |   journal={Genome biology},
 35 |   volume={20},
 36 |   number={1},
 37 |   pages={1--23},
 38 |   year={2019},
 39 |   publisher={BioMed Central}
 40 | }
 41 | 
 42 | 
 43 | @article{vanneste2013inference,
 44 |   title={Inference of genome duplications from age distributions revisited},
 45 |   author={Vanneste, Kevin and Van de Peer, Yves and Maere, Steven},
 46 |   journal={Molecular biology and evolution},
 47 |   volume={30},
 48 |   number={1},
 49 |   pages={177--190},
 50 |   year={2013},
 51 |   publisher={Oxford University Press}
 52 | }
 53 | 
 54 | @article{chaudhuri1999sizer,
 55 |   title={SiZer for exploration of structures in curves},
 56 |   author={Chaudhuri, Probal and Marron, James S},
 57 |   journal={Journal of the American Statistical Association},
 58 |   volume={94},
 59 |   number={447},
 60 |   pages={807--823},
 61 |   year={1999},
 62 |   publisher={Taylor \& Francis}
 63 | }
 64 | 
 65 | @article{schmutz2010genome,
 66 |   title={Genome sequence of the palaeopolyploid soybean},
 67 |   author={Schmutz, Jeremy and Cannon, Steven B and Schlueter, Jessica and Ma, Jianxin and Mitros, Therese and Nelson, William and Hyten, David L and Song, Qijian and Thelen, Jay J and Cheng, Jianlin and others},
 68 |   journal={Nature},
 69 |   volume={463},
 70 |   number={7278},
 71 |   pages={178--183},
 72 |   year={2010},
 73 |   publisher={Nature Publishing Group}
 74 | }
 75 | 
 76 | 
 77 | @article{tiley2018assessing,
 78 |   title={Assessing the performance of Ks plots for detecting ancient whole genome duplications},
 79 |   author={Tiley, George P and Barker, Michael S and Burleigh, J Gordon},
 80 |   journal={Genome biology and evolution},
 81 |   volume={10},
 82 |   number={11},
 83 |   pages={2882--2898},
 84 |   year={2018},
 85 |   publisher={Oxford University Press}
 86 | }
 87 | 
 88 | 
 89 | @article{buchfink2021sensitive,
 90 |   title={Sensitive protein alignments at tree-of-life scale using DIAMOND},
 91 |   author={Buchfink, Benjamin and Reuter, Klaus and Drost, Hajk-Georg},
 92 |   journal={Nature methods},
 93 |   volume={18},
 94 |   number={4},
 95 |   pages={366--368},
 96 |   year={2021},
 97 |   publisher={Nature Publishing Group US New York}
 98 | }
 99 | 
100 | 
101 | @article{altschul1997gapped,
102 |   title={Gapped BLAST and PSI-BLAST: a new generation of protein database search programs},
103 |   author={Altschul, Stephen F and Madden, Thomas L and Sch{\"a}ffer, Alejandro A and Zhang, Jinghui and Zhang, Zheng and Miller, Webb and Lipman, David J},
104 |   journal={Nucleic acids research},
105 |   volume={25},
106 |   number={17},
107 |   pages={3389--3402},
108 |   year={1997},
109 |   publisher={Oxford University Press}
110 | }
111 | 


--------------------------------------------------------------------------------
/tests/testthat/test-duplicate_classification.R:
--------------------------------------------------------------------------------
  1 | library(txdbmaker)  # for makeTxDbFromGRanges()
  2 | 
  3 | #----Load data------------------------------------------------------------------
  4 | data(diamond_intra)
  5 | data(fungi_kaks)
  6 | data(diamond_inter)
  7 | data(yeast_annot)
  8 | data(yeast_seq)
  9 | blast_list <- diamond_intra
 10 | blast_inter <- syntenet::collapse_bidirectional_hits(
 11 |     diamond_inter,
 12 |     data.frame("Scerevisiae", "Cglabrata")
 13 | )
 14 | # Duplicate pairs of S. cerevisiae (duplication mode is in column `type`)
 15 | scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
 16 | # Processed annotation: the whole list, and the S. cerevisiae element alone
 17 | pdata <- syntenet::process_input(yeast_seq, yeast_annot)
 18 | annotation <- pdata$annotation
 19 | annotation_granges <- pdata$annotation[["Scerevisiae"]]
 20 | 
 21 | ## Get anchor pairs ("SD") and non-anchor pairs; IDs get an "Sce_" prefix
 22 | all <- scerevisiae_kaks
 23 | all$dup1 <- paste0("Sce_", all$dup1)
 24 | all$dup2 <- paste0("Sce_", all$dup2)
 25 | anchor_pairs <- all[all$type == "SD", 1:2]
 26 | ssd <- all[all$type != "SD", 1:2]
 27 | 
 28 | ## Get duplicate pairs from DIAMOND output (first two columns only)
 29 | duplicates <- diamond_intra[[1]][, 1:2]
 30 | 
 31 | ## Get intron counts from a TxDb built on the first element of yeast_annot
 32 | txdb <- txdbmaker::makeTxDbFromGRanges(yeast_annot[[1]])
 33 | intron_counts <- get_intron_counts(txdb)
 34 | # Intron counts wrapped in a named list, as passed to classify_gene_pairs()
 35 | ic_list <- list(Scerevisiae = intron_counts)
 36 | 
 37 | #----Start tests----------------------------------------------------------------
 38 | test_that("get_* functions returned classified duplicates", {
 39 |     pair_cols <- c("dup1", "dup2", "type")
 40 |     # 1) get_anchors_list()
 41 |     anchors_by_species <- get_anchors_list(blast_list, annotation)
 42 |     first_anchors <- anchors_by_species[[1]]
 43 |     expect_equal(class(anchors_by_species), "list")
 44 |     expect_equal(class(first_anchors), "data.frame")
 45 |     expect_equal(ncol(first_anchors), 2)
 46 |     
 47 |     # 2) get_segmental(), with and without anchor pairs
 48 |     sd_pairs <- get_segmental(anchor_pairs, duplicates)
 49 |     sd_no_anchors <- get_segmental(NULL, duplicates)
 50 |     
 51 |     expect_equal(class(sd_no_anchors), "data.frame")
 52 |     expect_equal(ncol(sd_pairs), 3)
 53 |     expect_named(sd_pairs, pair_cols)
 54 |     expect_length(unique(sd_pairs[["type"]]), 2)
 55 |     
 56 |     # 3) get_tandem_proximal()
 57 |     tandem_proximal <- get_tandem_proximal(sd_pairs, annotation_granges)
 58 |     
 59 |     expect_equal(class(tandem_proximal), "data.frame")
 60 |     expect_equal(ncol(tandem_proximal), 3)
 61 |     expect_named(tandem_proximal, pair_cols)
 62 |     expect_length(unique(tandem_proximal[["type"]]), 4)
 63 |     
 64 |     # 4) get_transposed(); the two-element list below is expected to error
 65 |     transposed <- get_transposed(tandem_proximal, blast_inter, annotation)
 66 |     
 67 |     doubled_inter <- list(e1 = blast_inter[[1]], e2 = blast_inter[[1]])
 68 |     expect_error(get_transposed(tandem_proximal, doubled_inter, annotation))
 69 |     
 70 |     expect_equal(class(transposed), "data.frame")
 71 |     expect_equal(ncol(transposed), 3)
 72 |     expect_named(transposed, pair_cols)
 73 |     expect_length(unique(transposed[["type"]]), 5)
 74 |     
 75 |     # 5) get_transposed_classes()
 76 |     transposed_classes <- get_transposed_classes(transposed, intron_counts)
 77 |     
 78 |     expect_equal(class(transposed_classes), "data.frame")
 79 |     expect_equal(ncol(transposed_classes), 3)
 80 |     expect_named(transposed_classes, pair_cols)
 81 |     expect_length(unique(transposed_classes[["type"]]), 6)
 82 | })
 83 | 
 84 | 
 85 | test_that("classify_gene_pairs() and classify_genes() return a data frame", {
 86 |     
 87 |     # 1) classify_gene_pairs(), "full" and "binary" schemes
 88 |     pairs_full <- classify_gene_pairs(
 89 |         annotation = annotation,
 90 |         blast_list = diamond_intra,
 91 |         scheme = "full",
 92 |         blast_inter = blast_inter,
 93 |         intron_counts = ic_list
 94 |     )
 95 |     
 96 |     pairs_binary <- classify_gene_pairs(
 97 |         annotation = annotation,
 98 |         blast_list = diamond_intra,
 99 |         scheme = "binary"
100 |     )
101 |     
102 |     # Each result: a list whose first element is a 3-column data frame
103 |     for (classified in list(pairs_full, pairs_binary)) {
104 |         expect_equal(class(classified), "list")
105 |         expect_equal(class(classified[[1]]), "data.frame")
106 |         expect_equal(ncol(classified[[1]]), 3)
107 |     }
108 |     
109 |     # 2) classify_genes(), applied to the "full" classification
110 |     gene_classes <- classify_genes(pairs_full)
111 |     gene_table <- gene_classes[[1]]
112 |     
113 |     expect_equal(class(gene_table), "data.frame")
114 |     expect_equal(class(gene_classes), "list")
115 |     expect_equal(ncol(gene_table), 2)
116 | })
117 | 
118 | 


--------------------------------------------------------------------------------
/man/get_transposed.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/utils_duplicate_classification.R
  3 | \name{get_transposed}
  4 | \alias{get_transposed}
  5 | \title{Classify gene pairs originating from transposon-derived duplications}
  6 | \usage{
  7 | get_transposed(
  8 |   pairs,
  9 |   blast_inter,
 10 |   annotation,
 11 |   evalue = 1e-10,
 12 |   anchors = 5,
 13 |   max_gaps = 25,
 14 |   collinearity_dir = NULL,
 15 |   outgroup_coverage = 70
 16 | )
 17 | }
 18 | \arguments{
 19 | \item{pairs}{A 3-column data frame with columns \strong{dup1}, \strong{dup2},
 20 | and \strong{type} indicating duplicated gene 1, duplicated gene 2, and
 21 | the mode of duplication associated with the pair. This data frame
 22 | is returned by \code{get_tandem_proximal()}.}
 23 | 
 24 | \item{blast_inter}{A list of data frames of length 1
 25 | containing BLAST tabular output for the comparison between the target
 26 | species and an outgroup. Names of list elements must match the names of
 27 | list elements in \code{annotation}. BLASTp, DIAMOND or similar programs must
 28 | be run on processed sequence data as returned
 29 | by \code{syntenet::process_input()}.}
 30 | 
 31 | \item{annotation}{A processed GRangesList or CompressedGRangesList object as
 32 | returned by \code{syntenet::process_input()}.}
 33 | 
 34 | \item{evalue}{Numeric scalar indicating the E-value threshold.
 35 | Default: 1e-10.}
 36 | 
 37 | \item{anchors}{Numeric indicating the minimum required number of genes
 38 | to call a syntenic block, as in \code{syntenet::infer_syntenet}.
 39 | Default: 5.}
 40 | 
 41 | \item{max_gaps}{Numeric indicating the number of upstream and downstream
 42 | genes to search for anchors, as in \code{syntenet::infer_syntenet}.
 43 | Default: 25.}
 44 | 
 45 | \item{collinearity_dir}{Character indicating the path to the directory
 46 | where .collinearity files will be stored. If NULL, files will
 47 | be stored in a subdirectory of \code{tempdir()}. Default: NULL.}
 48 | 
 49 | \item{outgroup_coverage}{Numeric indicating the minimum percentage of
 50 | outgroup species to use to consider genes as transposed duplicates. Only
 51 | valid if multiple outgroup species are present (see details below). Values
 52 | should range from 0 to 100. Default: 70.}
 53 | }
 54 | \value{
 55 | A 3-column data frame with the following variables:
 56 | \describe{
 57 | \item{dup1}{Character, duplicated gene 1.}
 58 | \item{dup2}{Character, duplicated gene 2.}
 59 | \item{type}{Factor of duplication types, with levels
 60 | "SD" (segmental duplication),
 61 | "TD" (tandem duplication),
 62 | "PD" (proximal duplication),
 63 | "TRD" (transposon-derived duplication), and
 64 | "DD" (dispersed duplication).}
 65 | }
 66 | }
 67 | \description{
 68 | Classify gene pairs originating from transposon-derived duplications
 69 | }
 70 | \details{
 71 | If the list of interspecies DIAMOND tables contains comparisons of the
 72 | same species to multiple outgroups (e.g.,
 73 | 'speciesA_speciesB', 'speciesA_speciesC'), this function will check if
 74 | gene pairs are classified as transposed (i.e.,
 75 | only one gene is an ancestral locus) in each of the outgroup species,
 76 | and then calculate the percentage of outgroup species in which each pair
 77 | is considered 'transposed'. For instance, gene pair 1 is transposed based on
 78 | 30\\% of the outgroup species, gene pair 2 is considered as transposed based
 79 | on 100\\% of the outgroup species, gene pair 3 is considered as transposed
 80 | based on 0\\% of the outgroup species, and so on.
 81 | Parameter \strong{outgroup_coverage} lets you choose a minimum percentage
 82 | cut-off to classify pairs as transposed.
 83 | }
 84 | \examples{
 85 | # Load example data
 86 | data(diamond_inter)
 87 | data(yeast_seq)
 88 | data(yeast_annot)
 89 | data(fungi_kaks)
 90 | scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
 91 | 
 92 | # Get processed annotation
 93 | pdata <- syntenet::process_input(yeast_seq, yeast_annot)
 94 | annotation <- pdata$annotation
 95 | 
 96 | # Get duplicated pairs
 97 | pairs <- scerevisiae_kaks[, c("dup1", "dup2", "type")]
 98 | pairs$dup1 <- paste0("Sce_", pairs$dup1)
 99 | pairs$dup2 <- paste0("Sce_", pairs$dup2)
100 | 
101 | # Collapse bidirectional hits
102 | compare <- data.frame(target = "Scerevisiae", outgroup = "Cglabrata")
103 | blast_inter <- syntenet::collapse_bidirectional_hits(diamond_inter, compare)
104 | 
105 | # Classify pairs
106 | trd <- get_transposed(pairs, blast_inter, annotation)
107 | 
108 | }
109 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: github_document
  3 | ---
  4 | 
  5 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  6 | 
  7 | ```{r, include = FALSE}
  8 | knitr::opts_chunk$set(
  9 |     collapse = TRUE,
 10 |     comment = "#>",
 11 |     fig.path = "man/figures/README-",
 12 |     out.width = "100%"
 13 | )
 14 | ```
 15 | 
 16 | # doubletrouble <img src="man/figures/logo.png" align="right" height="139" />
 17 | 
 18 | <!-- badges: start -->
 19 | [![GitHub issues](https://img.shields.io/github/issues/almeidasilvaf/doubletrouble)](https://github.com/almeidasilvaf/doubletrouble/issues)
 20 | [![Lifecycle: stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html#stable)
 21 | [![R-CMD-check-bioc](https://github.com/almeidasilvaf/doubletrouble/workflows/R-CMD-check-bioc/badge.svg)](https://github.com/almeidasilvaf/doubletrouble/actions)
 22 | [![Codecov test
 23 | coverage](https://codecov.io/gh/almeidasilvaf/doubletrouble/branch/devel/graph/badge.svg)](https://codecov.io/gh/almeidasilvaf/doubletrouble?branch=devel)
 24 | <!-- badges: end -->
 25 | 
 26 | The major goal of __doubletrouble__ is to identify duplicated genes from
 27 | whole-genome protein sequences and classify them based on their modes
 28 | of duplication. Duplicates can be classified using four different 
 29 | classification schemes, which increase in complexity and level of detail
 30 | in a stepwise manner. The classification schemes and the duplication modes
 31 | they can classify are:
 32 | 
 33 | 
 34 | | Scheme   | Duplication modes           |
 35 | |:---------|:----------------------------|
 36 | | binary   | SD, SSD                     |
 37 | | standard | SD, TD, PD, DD              |
 38 | | extended | SD, TD, PD, TRD, DD         |
 39 | | full     | SD, TD, PD, rTRD, dTRD, DD  |
 40 | 
 41 | *Legend:* **SD**, segmental duplication. **SSD**, small-scale duplication.
 42 | **TD**, tandem duplication. **PD**, proximal duplication. 
 43 | **TRD**, transposon-derived duplication. 
 44 | **rTRD**, retrotransposon-derived duplication.
 45 | **dTRD**, DNA transposon-derived duplication. **DD**, dispersed duplication.
 46 | 
 47 | 
 48 | Besides classifying gene pairs, users can also classify genes, so that
 49 | each gene is assigned to a unique mode of duplication.
 50 | 
 51 | Users can also calculate substitution rates per substitution site (i.e., 
 52 | $K_a$, $K_s$ and their ratios $\frac{K_a}{K_s}$) from duplicate pairs, 
 53 | find peaks in Ks distributions with Gaussian Mixture Models (GMMs), 
 54 | and classify gene pairs into age groups based on Ks peaks.
 55 | 
 56 | ## Installation instructions
 57 | 
 58 | Get the latest stable `R` release from [CRAN](http://cran.r-project.org/). 
 59 | Then install __doubletrouble__ from [Bioconductor](http://bioconductor.org/) 
 60 | using the following code:
 61 | 
 62 | ```{r 'install', eval = FALSE}
 63 | if (!requireNamespace("BiocManager", quietly = TRUE)) {
 64 |     install.packages("BiocManager")
 65 | }
 66 | 
 67 | BiocManager::install("doubletrouble")
 68 | ```
 69 | 
 70 | And the development version from [GitHub](https://github.com/almeidasilvaf/doubletrouble) with:
 71 | 
 72 | ```{r 'install_dev', eval = FALSE}
 73 | BiocManager::install("almeidasilvaf/doubletrouble")
 74 | ```
 75 | 
 76 | ## Citation
 77 | 
 78 | Below is the citation output from using `citation('doubletrouble')` in R. Please
 79 | run this yourself to check for any updates on how to cite __doubletrouble__.
 80 | 
 81 | ```{r 'citation', eval = requireNamespace('doubletrouble')}
 82 | print(citation('doubletrouble'), bibtex = TRUE)
 83 | ```
 84 | 
 85 | Please note that __doubletrouble__ was only made possible thanks to many other R and bioinformatics software authors, who are cited in the vignettes and/or the paper(s) describing this package.
 86 | 
 87 | ## Code of Conduct
 88 | 
 89 | Please note that the __doubletrouble__ project is released with 
 90 | a [Contributor Code of Conduct](http://bioconductor.org/about/code-of-conduct/). 
 91 | By contributing to this project, you agree to abide by its terms.
 92 | 
 93 | ## Development tools
 94 | 
 95 | * Continuous code testing is possible thanks to [GitHub actions](https://www.tidyverse.org/blog/2020/04/usethis-1-6-0/)  through `r BiocStyle::CRANpkg('usethis')`, `r BiocStyle::CRANpkg('remotes')`, and `r BiocStyle::CRANpkg('rcmdcheck')` customized to use [Bioconductor's docker containers](https://www.bioconductor.org/help/docker/) and `r BiocStyle::Biocpkg('BiocCheck')`.
 96 | * Code coverage assessment is possible thanks to [codecov](https://codecov.io/gh) and `r BiocStyle::CRANpkg('covr')`.
 97 | * The [documentation website](http://almeidasilvaf.github.io/doubletrouble) is automatically updated thanks to `r BiocStyle::CRANpkg('pkgdown')`.
 98 | * The code is styled automatically thanks to `r BiocStyle::CRANpkg('styler')`.
 99 | * The documentation is formatted thanks to `r BiocStyle::CRANpkg('devtools')` and `r BiocStyle::CRANpkg('roxygen2')`.
100 | 
101 | For more details, check the `dev` directory.
102 | 
103 | This package was developed using `r BiocStyle::Biocpkg('biocthis')`.
104 | 
105 | 
106 | 


--------------------------------------------------------------------------------
/man/classify_gene_pairs.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/duplicate_classification.R
  3 | \name{classify_gene_pairs}
  4 | \alias{classify_gene_pairs}
  5 | \title{Classify duplicate gene pairs based on their modes of duplication}
  6 | \usage{
  7 | classify_gene_pairs(
  8 |   annotation = NULL,
  9 |   blast_list = NULL,
 10 |   scheme = "standard",
 11 |   blast_inter = NULL,
 12 |   intron_counts,
 13 |   evalue = 1e-10,
 14 |   anchors = 5,
 15 |   max_gaps = 25,
 16 |   proximal_max = 10,
 17 |   collinearity_dir = NULL,
 18 |   outgroup_coverage = 70
 19 | )
 20 | }
 21 | \arguments{
 22 | \item{annotation}{A processed GRangesList or CompressedGRangesList object as
 23 | returned by \code{syntenet::process_input()}.}
 24 | 
 25 | \item{blast_list}{A list of data frames containing BLAST tabular output
 26 | for intraspecies comparisons.
 27 | Each list element corresponds to the BLAST output for a given species,
 28 | and names of list elements must match the names of list elements in
 29 | \strong{annotation}. BLASTp, DIAMOND or similar programs must be run
 30 | on processed sequence data as returned by \code{process_input()}.}
 31 | 
 32 | \item{scheme}{Character indicating which classification scheme to use.
 33 | One of "binary", "standard", "extended", or "full". See details below
 34 | for information on what each scheme means. Default: "standard".}
 35 | 
 36 | \item{blast_inter}{(Only valid if \code{scheme == "extended" or "full"}).
 37 | A list of data frames containing BLAST tabular output
 38 | for the comparison between target species and outgroups.
 39 | Names of list elements must match the names of
 40 | list elements in \code{annotation}. BLASTp, DIAMOND or similar programs must
 41 | be run on processed sequence data as returned by \code{process_input()}.}
 42 | 
 43 | \item{intron_counts}{(Only valid if \code{scheme == "full"}).
 44 | A list of 2-column data frames with the number of
 45 | introns per gene as returned by \code{get_intron_counts()}. Names
 46 | of list elements must match names of \strong{annotation}.}
 47 | 
 48 | \item{evalue}{Numeric scalar indicating the E-value threshold.
 49 | Default: 1e-10.}
 50 | 
 51 | \item{anchors}{Numeric indicating the minimum required number of genes
 52 | to call a syntenic block, as in \code{syntenet::infer_syntenet}.
 53 | Default: 5.}
 54 | 
 55 | \item{max_gaps}{Numeric indicating the number of upstream and downstream
 56 | genes to search for anchors, as in \code{syntenet::infer_syntenet}.
 57 | Default: 25.}
 58 | 
 59 | \item{proximal_max}{Numeric scalar with the maximum distance (in number
 60 | of genes) between two genes to consider them as proximal duplicates.
 61 | Default: 10.}
 62 | 
 63 | \item{collinearity_dir}{Character indicating the path to the directory
 64 | where .collinearity files will be stored. If NULL, files will
 65 | be stored in a subdirectory of \code{tempdir()}. Default: NULL.}
 66 | 
 67 | \item{outgroup_coverage}{Numeric indicating the minimum percentage of
 68 | outgroup species to use to consider genes as transposed duplicates. Only
 69 | valid if multiple outgroup species are present (see details below). Values
 70 | should range from 0 to 100. Default: 70.}
 71 | }
 72 | \value{
 73 | A list of 3-column data frames of duplicated gene pairs
 74 | (columns 1 and 2), and their modes of duplication (column 3).
 75 | }
 76 | \description{
 77 | Classify duplicate gene pairs based on their modes of duplication
 78 | }
 79 | \details{
 80 | The classification schemes increase in complexity (number of classes)
 81 | in the order 'binary', 'standard', 'extended', and 'full'.
 82 | 
 83 | For classification scheme "binary", duplicates are classified into
 84 | one of 'SD' (segmental duplications) or 'SSD' (small-scale duplications).
 85 | 
 86 | For classification scheme "standard" (default), duplicates are
 87 | classified into 'SD' (segmental duplication), 'TD' (tandem duplication),
 88 | 'PD' (proximal duplication), and 'DD' (dispersed duplication).
 89 | 
 90 | For classification scheme "extended", duplicates are classified into
 91 | 'SD' (segmental duplication), 'TD' (tandem duplication),
 92 | 'PD' (proximal duplication), 'TRD' (transposon-derived duplication),
 93 | and 'DD' (dispersed duplication).
 94 | 
 95 | Finally, for classification scheme "full", duplicates are classified into
 96 | 'SD' (segmental duplication), 'TD' (tandem duplication),
 97 | 'PD' (proximal duplication), 'rTRD' (retrotransposon-derived duplication),
 98 | 'dTRD' (DNA transposon-derived duplication), and
 99 | 'DD' (dispersed duplication).
100 | }
101 | \examples{
102 | # Load example data
103 | data(diamond_intra)
104 | data(diamond_inter)
105 | data(yeast_annot)
106 | data(yeast_seq)
107 | 
108 | # Get processed annotation data
109 | annotation <- syntenet::process_input(yeast_seq, yeast_annot)$annotation
110 | 
111 | # Get collapsed DIAMOND inter
112 | blast_inter <- syntenet::collapse_bidirectional_hits(
113 |     diamond_inter,
114 |     data.frame("Scerevisiae", "Cglabrata")
115 | )
116 | 
117 | # Get list of intron counts
118 | library(txdbmaker)
119 | txdb_list <- lapply(yeast_annot, txdbmaker::makeTxDbFromGRanges)
120 | intron_counts <- lapply(txdb_list, get_intron_counts)
121 | 
122 | # Classify duplicates - full scheme
123 | dup_class <- classify_gene_pairs(
124 |     annotation = annotation, 
125 |     blast_list = diamond_intra, 
126 |     scheme = "full",
127 |     blast_inter = blast_inter, 
128 |     intron_counts = intron_counts
129 | )
130 | 
131 | # Check number of gene pairs per class
132 | table(dup_class$Scerevisiae$type)
133 | 
134 | }
135 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  3 | 
  4 | # doubletrouble <img src="man/figures/logo.png" align="right" height="139" />
  5 | 
  6 | <!-- badges: start -->
  7 | 
  8 | [![GitHub
  9 | issues](https://img.shields.io/github/issues/almeidasilvaf/doubletrouble)](https://github.com/almeidasilvaf/doubletrouble/issues)
 10 | [![Lifecycle:
 11 | stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html#stable)
 12 | [![R-CMD-check-bioc](https://github.com/almeidasilvaf/doubletrouble/workflows/R-CMD-check-bioc/badge.svg)](https://github.com/almeidasilvaf/doubletrouble/actions)
 13 | [![Codecov test
 14 | coverage](https://codecov.io/gh/almeidasilvaf/doubletrouble/branch/devel/graph/badge.svg)](https://codecov.io/gh/almeidasilvaf/doubletrouble?branch=devel)
 15 | <!-- badges: end -->
 16 | 
 17 | The major goal of **doubletrouble** is to identify duplicated genes from
 18 | whole-genome protein sequences and classify them based on their modes of
 19 | duplication. Duplicates can be classified using four different
 20 | classification schemes, which increase the complexity and level of
 21 | details in a stepwise manner. The classification schemes and the
 22 | duplication modes they can classify are:
 23 | 
 24 | | Scheme   | Duplication modes          |
 25 | |:---------|:---------------------------|
 26 | | binary   | SD, SSD                    |
 27 | | standard | SD, TD, PD, DD             |
 28 | | extended | SD, TD, PD, TRD, DD        |
 29 | | full     | SD, TD, PD, rTRD, dTRD, DD |
 30 | 
 31 | *Legend:* **SD**, segmental duplication. **SSD**, small-scale
 32 | duplication. **TD**, tandem duplication. **PD**, proximal duplication.
 33 | **TRD**, transposon-derived duplication. **rTRD**,
 34 | retrotransposon-derived duplication. **dTRD**, DNA transposon-derived
 35 | duplication. **DD**, dispersed duplication.
 36 | 
 37 | Besides classifying gene pairs, users can also classify genes, so that
 38 | each gene is assigned to a unique mode of duplication.
 39 | 
 40 | Users can also calculate substitution rates per substitution site (i.e.,
 41 | $K_a$, $K_s$ and their ratios $\frac{K_a}{K_s}$) from duplicate pairs,
 42 | find peaks in Ks distributions with Gaussian Mixture Models (GMMs), and
 43 | classify gene pairs into age groups based on Ks peaks.
 44 | 
 45 | ## Installation instructions
 46 | 
 47 | Get the latest stable `R` release from
 48 | [CRAN](http://cran.r-project.org/). Then install **doubletrouble** from
 49 | [Bioconductor](http://bioconductor.org/) using the following code:
 50 | 
 51 | ``` r
 52 | if (!requireNamespace("BiocManager", quietly = TRUE)) {
 53 |     install.packages("BiocManager")
 54 | }
 55 | 
 56 | BiocManager::install("doubletrouble")
 57 | ```
 58 | 
 59 | And the development version from
 60 | [GitHub](https://github.com/almeidasilvaf/doubletrouble) with:
 61 | 
 62 | ``` r
 63 | BiocManager::install("almeidasilvaf/doubletrouble")
 64 | ```
 65 | 
 66 | ## Citation
 67 | 
 68 | Below is the citation output from using `citation('doubletrouble')` in
 69 | R. Please run this yourself to check for any updates on how to cite
 70 | **doubletrouble**.
 71 | 
 72 | ``` r
 73 | print(citation('doubletrouble'), bibtex = TRUE)
 74 | #> To cite doubletrouble in publications, use:
 75 | #> 
 76 | #>   Almeida-Silva F, Van de Peer Y doubletrouble: an R/Bioconductor
 77 | #>   package for the identification, classification, and analysis of gene
 78 | #>   and genome duplications. Bioinformatics, 41(2), btaf043. (2025).
 79 | #>   https://doi.org/10.1093/bioinformatics/btaf043
 80 | #> 
 81 | #> A BibTeX entry for LaTeX users is
 82 | #> 
 83 | #>   @Article{,
 84 | #>     title = {doubletrouble: an R/Bioconductor package for the identification, classification, and analysis of gene and genome duplications},
 85 | #>     author = {Fabricio Almeida-Silva and Yves {Van de Peer}},
 86 | #>     journal = {Bioinformatics},
 87 | #>     year = {2025},
 88 | #>     volume = {41},
 89 | #>     number = {2},
 90 | #>     pages = {btaf043},
 91 | #>     url = {https://academic.oup.com/bioinformatics/article/41/2/btaf043/7979242},
 92 | #>     doi = {10.1093/bioinformatics/btaf043},
 93 | #>   }
 94 | ```
 95 | 
 96 | Please note that **doubletrouble** was only made possible thanks to
 97 | many other R and bioinformatics software authors, who are cited
 98 | in the vignettes and/or the paper(s) describing this package.
 99 | 
100 | ## Code of Conduct
101 | 
102 | Please note that the **doubletrouble** project is released with a
103 | [Contributor Code of
104 | Conduct](http://bioconductor.org/about/code-of-conduct/). By
105 | contributing to this project, you agree to abide by its terms.
106 | 
107 | ## Development tools
108 | 
109 | - Continuous code testing is possible thanks to [GitHub
110 |   actions](https://www.tidyverse.org/blog/2020/04/usethis-1-6-0/)
111 |   through *[usethis](https://CRAN.R-project.org/package=usethis)*,
112 |   *[remotes](https://CRAN.R-project.org/package=remotes)*, and
113 |   *[rcmdcheck](https://CRAN.R-project.org/package=rcmdcheck)* customized
114 |   to use [Bioconductor’s docker
115 |   containers](https://www.bioconductor.org/help/docker/) and
116 |   *[BiocCheck](https://bioconductor.org/packages/3.19/BiocCheck)*.
117 | - Code coverage assessment is possible thanks to
118 |   [codecov](https://codecov.io/gh) and
119 |   *[covr](https://CRAN.R-project.org/package=covr)*.
120 | - The [documentation
121 |   website](http://almeidasilvaf.github.io/doubletrouble) is
122 |   automatically updated thanks to
123 |   *[pkgdown](https://CRAN.R-project.org/package=pkgdown)*.
124 | - The code is styled automatically thanks to
125 |   *[styler](https://CRAN.R-project.org/package=styler)*.
126 | - The documentation is formatted thanks to
127 |   *[devtools](https://CRAN.R-project.org/package=devtools)* and
128 |   *[roxygen2](https://CRAN.R-project.org/package=roxygen2)*.
129 | 
130 | For more details, check the `dev` directory.
131 | 
132 | This package was developed using
133 | *[biocthis](https://bioconductor.org/packages/3.19/biocthis)*.
134 | 


--------------------------------------------------------------------------------
/R/duplicate_classification.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #' Classify duplicate gene pairs based on their modes of duplication
  4 | #'
  5 | #' @param annotation A processed GRangesList or CompressedGRangesList object as
  6 | #' returned by \code{syntenet::process_input()}.
  7 | #' @param blast_list A list of data frames containing BLAST tabular output
  8 | #' for intraspecies comparisons.
  9 | #' Each list element corresponds to the BLAST output for a given species,
 10 | #' and names of list elements must match the names of list elements in
 11 | #' \strong{annotation}. BLASTp, DIAMOND or similar programs must be run 
 12 | #' on processed sequence data as returned by \code{process_input()}.
 13 | #' @param scheme Character indicating which classification scheme to use.
 14 | #' One of "binary", "standard", "extended", or "full". See details below
 15 | #' for information on what each scheme means. Default: "standard".
 16 | #' @param blast_inter (Only valid if \code{scheme == "extended" or "full"}).
 17 | #' A list of data frames containing BLAST tabular output 
 18 | #' for the comparison between target species and outgroups. 
 19 | #' Names of list elements must match the names of 
 20 | #' list elements in `annotation`. BLASTp, DIAMOND or similar programs must 
 21 | #' be run on processed sequence data as returned by \code{process_input()}.
 22 | #' @param intron_counts (Only valid if \code{scheme == "full"}). 
 23 | #' A list of 2-column data frames with the number of
 24 | #' introns per gene as returned by \code{get_intron_counts()}. Names
 25 | #' of list elements must match names of \strong{annotation}.
 26 | #' @param evalue Numeric scalar indicating the E-value threshold. 
 27 | #' Default: 1e-10.
 28 | #' @param anchors Numeric indicating the minimum required number of genes
 29 | #' to call a syntenic block, as in \code{syntenet::infer_syntenet}. 
 30 | #' Default: 5.
 31 | #' @param max_gaps Numeric indicating the number of upstream and downstream
 32 | #' genes to search for anchors, as in \code{syntenet::infer_syntenet}. 
 33 | #' Default: 25.
 34 | #' @param proximal_max Numeric scalar with the maximum distance (in number
 35 | #' of genes) between two genes to consider them as proximal duplicates.
 36 | #' Default: 10.
 37 | #' @param collinearity_dir Character indicating the path to the directory
 38 | #' where .collinearity files will be stored. If NULL, files will
 39 | #' be stored in a subdirectory of \code{tempdir()}. Default: NULL.
 40 | #' @param outgroup_coverage Numeric indicating the minimum percentage of 
 41 | #' outgroup species to use to consider genes as transposed duplicates. Only
 42 | #' valid if multiple outgroup species are present (see details below). Values
 43 | #' should range from 0 to 100. Default: 70.
 44 | #'
 45 | #'  
 46 | #' @return A list of 3-column data frames of duplicated gene pairs 
 47 | #' (columns 1 and 2), and their modes of duplication (column 3).
 48 | #' 
 49 | #' @details
 50 | #' The classification schemes increase in complexity (number of classes)
 51 | #' in the order 'binary', 'standard', 'extended', and 'full'.
 52 | #' 
 53 | #' For classification scheme "binary", duplicates are classified into
 54 | #' one of 'SD' (segmental duplications) or 'SSD' (small-scale duplications).
 55 | #' 
 56 | #' For classification scheme "standard" (default), duplicates are
 57 | #' classified into 'SD' (segmental duplication), 'TD' (tandem duplication),
 58 | #' 'PD' (proximal duplication), and 'DD' (dispersed duplication).
 59 | #' 
 60 | #' For classification scheme "extended", duplicates are classified into
 61 | #' 'SD' (segmental duplication), 'TD' (tandem duplication), 
 62 | #' 'PD' (proximal duplication), 'TRD' (transposon-derived duplication), 
 63 | #' and 'DD' (dispersed duplication).
 64 | #' 
 65 | #' Finally, for classification scheme "full", duplicates are classified into
 66 | #' 'SD' (segmental duplication), 'TD' (tandem duplication), 
 67 | #' 'PD' (proximal duplication), 'rTRD' (retrotransposon-derived duplication), 
 68 | #' 'dTRD' (DNA transposon-derived duplication), and 
 69 | #' 'DD' (dispersed duplication).
 70 | #' 
 71 | #' @export
 72 | #' @rdname classify_gene_pairs
 73 | #' @examples 
 74 | #' # Load example data
 75 | #' data(diamond_intra)
 76 | #' data(diamond_inter)
 77 | #' data(yeast_annot)
 78 | #' data(yeast_seq)
 79 | #' 
 80 | #' # Get processed annotation data
 81 | #' annotation <- syntenet::process_input(yeast_seq, yeast_annot)$annotation
 82 | #' 
 83 | #' # Get collapsed DIAMOND inter
 84 | #' blast_inter <- syntenet::collapse_bidirectional_hits(
 85 | #'     diamond_inter,
 86 | #'     data.frame("Scerevisiae", "Cglabrata")
 87 | #' )
 88 | #' 
 89 | #' # Get list of intron counts
 90 | #' library(txdbmaker)
 91 | #' txdb_list <- lapply(yeast_annot, txdbmaker::makeTxDbFromGRanges)
 92 | #' intron_counts <- lapply(txdb_list, get_intron_counts)
 93 | #' 
 94 | #' # Classify duplicates - full scheme
 95 | #' dup_class <- classify_gene_pairs(
 96 | #'     annotation = annotation, 
 97 | #'     blast_list = diamond_intra, 
 98 | #'     scheme = "full",
 99 | #'     blast_inter = blast_inter, 
100 | #'     intron_counts = intron_counts
101 | #' )
102 | #' 
103 | #' # Check number of gene pairs per class
104 | #' table(dup_class$Scerevisiae$type)
105 | #' 
106 | classify_gene_pairs <- function(
107 |         annotation = NULL, blast_list = NULL, scheme = "standard",
108 |         blast_inter = NULL, intron_counts,
109 |         evalue = 1e-10, anchors = 5, max_gaps = 25, proximal_max = 10,
110 |         collinearity_dir = NULL, outgroup_coverage = 70
111 | ) {
112 |     
113 |     # Reject unknown schemes up front: they used to be silently handled
114 |     # like "standard", hiding typos such as scheme = "extend"
115 |     valid_schemes <- c("binary", "standard", "extended", "full")
116 |     is_valid <- is.character(scheme) && length(scheme) == 1 &&
117 |         scheme %in% valid_schemes
118 |     if(!is_valid) {
119 |         stop(
120 |             "'scheme' must be one of ",
121 |             paste0("'", valid_schemes, "'", collapse = ", "), "."
122 |         )
123 |     }
124 |     
125 |     anchorp <- get_anchors_list(
126 |         blast_list, annotation, evalue, anchors, max_gaps, collinearity_dir
127 |     )
128 |     
129 |     # Get duplicate pairs and filter duplicate entries
130 |     pairs <- lapply(blast_list, function(x) {
131 |         # Keep hits passing the E-value threshold and drop self-hits
132 |         fpair <- x[x$evalue <= evalue, c(1, 2)]
133 |         fpair <- fpair[fpair[, 1] != fpair[, 2], ]
134 |         # Sort IDs within each row so that A-B and B-A count as duplicates
135 |         fpair <- fpair[!duplicated(t(apply(fpair, 1, sort))), ]
136 |         names(fpair) <- c("dup1", "dup2")
137 |         return(fpair)
138 |     })
139 |     
140 |     dup_list <- lapply(seq_along(anchorp), function(x) {
141 |         sp <- names(anchorp)[x]
142 |         
143 |         # Locate pairs for this species with a literal suffix match (the
144 |         # species name is not a regex), and require an unambiguous hit
145 |         idx <- which(endsWith(names(pairs), sp))
146 |         if(length(idx) != 1) {
147 |             stop(
148 |                 "Expected exactly one element of 'blast_list' with a name ",
149 |                 "ending in '", sp, "', but found ", length(idx), "."
150 |             )
151 |         }
152 |         p <- pairs[[idx]]
153 |         
154 |         # 1) Get segmental duplicates
155 |         dups <- get_segmental(anchorp[[x]], p)
156 |         if(scheme == "binary") {
157 |             # Binary scheme: everything that is not SD is relabeled as SSD
158 |             dups$type <- gsub("DD", "SSD", dups$type)
159 |             dups$type <- factor(dups$type, levels = c("SD", "SSD"))
160 |         } else {
161 |             # 2) Get tandem and proximal duplicates
162 |             dups <- get_tandem_proximal(
163 |                 dups, annotation_granges = annotation[[sp]], 
164 |                 proximal_max = proximal_max
165 |             )
166 |             
167 |             if(scheme %in% c("extended", "full")) {
168 |                 # 3) Get transposed duplicates
169 |                 # Outgroup comparisons are the elements of `blast_inter`
170 |                 # whose names start with "<species>_"
171 |                 binter <- blast_inter[startsWith(names(blast_inter), paste0(sp, "_"))]
172 |                 if(length(binter) == 0) {
173 |                     message(
174 |                         "Could not find outgroup for species '", sp, 
175 |                         "'. Skipping identification of TRD duplicates..."
176 |                     )
177 |                 } else {
178 |                     dups <- get_transposed(
179 |                         pairs = dups, 
180 |                         blast_inter = binter, 
181 |                         annotation = annotation, 
182 |                         evalue = evalue,
183 |                         anchors = anchors, max_gaps = max_gaps,
184 |                         collinearity_dir = collinearity_dir,
185 |                         outgroup_coverage = outgroup_coverage
186 |                     )
187 |                     
188 |                     if(scheme == "full") {
189 |                         # 4) Get TRD classes (rTRD and dTRD)
190 |                         dups <- get_transposed_classes(dups, intron_counts[[sp]])
191 |                     }
192 |                 }
193 |             }
194 |         }
195 |         
196 |         return(dups)
197 |     })
198 |     names(dup_list) <- names(anchorp)
199 |     
200 |     return(dup_list)
201 | }
175 | 
176 | 
177 | #' Classify genes into unique modes of duplication
178 | #'
179 | #' @param gene_pairs_list List of classified gene pairs as returned 
180 | #' by \code{classify_gene_pairs()}.
181 | #' 
182 | #' @return A list of 2-column data frames with variables \strong{gene} 
183 | #' and \strong{type} representing gene ID and duplication type, respectively.
184 | #' 
185 | #' @details
186 | #' If a gene is present in pairs with different duplication modes, the gene
187 | #' is classified into a unique mode of duplication following the order
188 | #' of priority indicated in the levels of the factor \strong{type}.
189 | #' 
190 | #' For scheme "binary", the order is SD > SSD.
191 | #' For scheme "standard", the order is SD > TD > PD > DD.
192 | #' For scheme "extended", the order is SD > TD > PD > TRD > DD.
193 | #' For scheme "full", the order is SD > TD > PD > rTRD > dTRD > DD.
194 | #'
195 | #' @rdname classify_genes
196 | #' @export
197 | #' @importFrom GenomicRanges GRangesList
198 | #' @examples
199 | #' data(fungi_kaks)
200 | #' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
201 | #' 
202 | #' cols <- c("dup1", "dup2", "type")
203 | #' gene_pairs_list <- list(Scerevisiae = scerevisiae_kaks[, cols])
204 | #' 
205 | #' class_genes <- classify_genes(gene_pairs_list)
206 | classify_genes <- function(gene_pairs_list = NULL) {
207 |     
208 |     class_genes <- lapply(gene_pairs_list, function(x) {
209 |         
210 |         # One group of pairs per duplication mode; when `type` is a factor,
211 |         # split() returns the groups in the order of its levels
212 |         pairs_by_type <- split(x, x$type)
213 |         gene_type <- Reduce(rbind, lapply(pairs_by_type, function(y) {
214 |             
215 |             # Modes without any pair contribute nothing
216 |             if(nrow(y) == 0) { return(NULL) }
217 |             
218 |             genes <- unique(c(y$dup1, y$dup2))
219 |             return(data.frame(gene = genes, type = y$type[1]))
220 |         }))
221 |         
222 |         # No classified pairs at all: return an empty data frame rather
223 |         # than NULL, so that every list element has the same structure
224 |         if(is.null(gene_type)) {
225 |             return(data.frame(gene = character(0), type = x$type[0]))
226 |         }
227 |         
228 |         # Rank genes by mode priority (levels of `type`) and keep only
229 |         # the first, i.e., highest-priority, mode of each gene
230 |         ref <- levels(x$type)
231 |         gene_type <- gene_type[order(match(gene_type$type, ref)), ]
232 |         gene_type <- gene_type[!duplicated(gene_type$gene), ]
233 |         
234 |         return(gene_type)
235 |     })
236 |     return(class_genes)
237 | }
228 | 
229 | 
230 | 
231 | 
232 | 
233 | 
234 | 
235 | 


--------------------------------------------------------------------------------
/inst/script/data_acquisition.md:
--------------------------------------------------------------------------------
  1 | Data acquisition
  2 | ================
  3 | 
  4 | # Data in data/
  5 | 
  6 | Here, we will use genome data for two yeast species:
  7 | 
  8 | - *Saccharomyces cerevisiae*
  9 | - *Candida glabrata*
 10 | 
 11 | Data will be obtained from Ensembl Fungi.
 12 | 
 13 | First of all, let’s obtain a list of only protein-coding genes for each
 14 | species.
 15 | 
 16 | ``` r
 17 | library(tidyverse)
 18 | 
 19 | # Get character vector of protein coding gene IDs
 20 | ## S. cerevisiae
 21 | scerevisiae_coding <- as.data.frame(read_delim(
 22 |     "ftp://ftp.psb.ugent.be/pub/plaza/plaza_pico_03/Annotation/annotation.selected_transcript.sac.csv.gz", skip = 8, delim = ";", show_col_types = FALSE
 23 | ))
 24 | scerevisiae_coding <- scerevisiae_coding[scerevisiae_coding$type == "coding", 1]
 25 | 
 26 | ## C. glabrata
 27 | cglabrata_coding <- rtracklayer::import(
 28 |     "http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-54/fungi/gff3/candida_glabrata/Candida_glabrata.GCA000002545v2.54.gff3.gz"
 29 | )
 30 | cglabrata_coding <- cglabrata_coding[cglabrata_coding$type == "gene", ]
 31 | cglabrata_coding <- cglabrata_coding[cglabrata_coding$biotype == "protein_coding", ]
 32 | ```
 33 | 
 34 | ## yeast_annot.rda
 35 | 
 36 | The object `yeast_annot` is a `GRangesList` object with elements
 37 | *Scerevisiae* and *Cglabrata*. Only ranges for protein-coding genes are
 38 | included.
 39 | 
 40 | ``` r
 41 | library(rtracklayer)
 42 | 
 43 | # Get gene ranges
 44 | scerevisiae_annot <- import(
 45 |     "http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-54/fungi/gff3/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.54.gff3.gz"
 46 | )
 47 | cglabrata_annot <- import(
 48 |     "http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-54/fungi/gff3/candida_glabrata/Candida_glabrata.GCA000002545v2.54.gff3.gz"
 49 | )
 50 | 
 51 | # Filter GRanges (include only protein-coding genes and relevant metadata)
 52 | ## Combine GRanges objects in a list
 53 | yeast_annot <- list(
 54 |     Scerevisiae = scerevisiae_annot,
 55 |     Cglabrata = cglabrata_annot
 56 | )
 57 | 
 58 | ## Filter data
 59 | yeast_annot <- lapply(yeast_annot, function(x) {
 60 |     
 61 |     # Get ranges for coding genes, and use them to extract exons, mRNA, etc.
 62 |     gene_ranges <- x[x$biotype == "protein_coding" & x$type == "gene"]
 63 |     cranges <- subsetByOverlaps(x, gene_ranges)
 64 |     
 65 |     # Remove exons for TEs (to avoid warnings when building TxDb)
 66 |     te_tx <- cranges[cranges$type == "transposable_element", ]$transcript_id
 67 |     if(length(te_tx) > 0) {
 68 |         te_exonid <- paste0(rep(te_tx, each = 9), paste0("-E", 1:9))
 69 |         cranges <- cranges[-which(cranges$Name %in% te_exonid)]
 70 |     }
 71 | 
 72 |     # Remove unnecessary columns (for package size issues)
 73 |     cols <- c(
 74 |         "type", "phase", "ID", "Parent", "Name", 
 75 |         "gene_id", "transcript_id", "exon_id", "protein_id"
 76 |     )
 77 |     cranges <- cranges[, cols]
 78 |     
 79 |     return(cranges)
 80 | })
 81 | 
 82 | yeast_annot <- GenomicRanges::GRangesList(yeast_annot)
 83 | 
 84 | # Save data
 85 | usethis::use_data(yeast_annot, compress = "xz", overwrite = TRUE)
 86 | ```
 87 | 
 88 | ## yeast_seq.rda
 89 | 
 90 | The object `yeast_seq` is a list of `AAStringSet` objects with elements
 91 | *Scerevisiae* and *Cglabrata*. Only translated sequences for primary
 92 | transcripts (protein-coding only) are included.
 93 | 
 94 | ``` r
 95 | library(Biostrings)
 96 | 
 97 | # Define small function to keep only longest isoform
 98 | ensembl_longest_isoform <- function(proteome = NULL) {
 99 |     # Gene ID: text after the last "gene:" tag, up to the first space
100 |     gene_ids <- gsub(" .*", "", gsub(".*gene:", "", names(proteome)))
101 |     names(proteome) <- gene_ids
102 | 
103 |     # Sort longest-first so duplicated() keeps the longest entry per gene
104 |     by_length <- order(Biostrings::width(proteome), decreasing = TRUE)
105 |     longest_first <- proteome[by_length]
106 |     return(longest_first[!duplicated(names(longest_first))])
107 | }
108 | 
109 | # Get proteome data
110 | scerevisiae_proteome <- readAAStringSet(
111 |     "http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-54/fungi/fasta/saccharomyces_cerevisiae/pep/Saccharomyces_cerevisiae.R64-1-1.pep.all.fa.gz"
112 | ) |> ensembl_longest_isoform()
113 | 
114 | cglabrata_proteome <- readAAStringSet(
115 |     "http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-54/fungi/fasta/candida_glabrata/pep/Candida_glabrata.GCA000002545v2.pep.all.fa.gz"
116 | ) |> ensembl_longest_isoform()
117 | 
118 | # Remove non-coding genes
119 | scerevisiae_proteome <- scerevisiae_proteome[names(scerevisiae_proteome) %in%
120 |                                                  scerevisiae_annot$gene_id, ]
121 | 
122 | cglabrata_proteome <- cglabrata_proteome[names(cglabrata_proteome) %in% 
123 |                                              cglabrata_annot$gene_id]
124 | 
125 | # Store AAStringSet objects in a list
126 | yeast_seq <- list(
127 |     Scerevisiae = scerevisiae_proteome,
128 |     Cglabrata = cglabrata_proteome
129 | )
130 | 
131 | # Save object
132 | usethis::use_data(yeast_seq, compress = "xz", overwrite = TRUE)
133 | ```
134 | 
135 | ## diamond_intra.rda and diamond_inter.rda
136 | 
137 | The object `diamond_intra` is a list of DIAMOND data frames for
138 | intraspecies comparisons of *S. cerevisiae*, while `diamond_inter`
139 | contains the DIAMOND output of a comparison between *S. cerevisiae* and
140 | *C. glabrata*.
141 | 
142 | ``` r
143 | # Load and process data
144 | data(yeast_seq)
145 | data(yeast_annot)
146 | 
147 | pdata <- process_input(yeast_seq, yeast_annot)
148 | 
149 | # Intraspecies DIAMOND
150 | diamond_intra <- run_diamond(
151 |     seq = pdata$seq["Scerevisiae"],
152 |     compare = "intraspecies", 
153 |     outdir = file.path(tempdir(), "diamond_intra_data"),
154 |     ... = "--sensitive"
155 | )
156 | 
157 | # Interspecies DIAMOND
158 | comparisons <- data.frame(
159 |     species = "Scerevisiae",
160 |     outgroup = "Cglabrata"
161 | )
162 | 
163 | diamond_inter <- run_diamond(
164 |     seq = pdata$seq,
165 |     compare = comparisons,
166 |     outdir = file.path(tempdir(), "diamond_inter_data"),
167 |     ... = "--sensitive"
168 | )
169 | 
170 | # Save data
171 | usethis::use_data(diamond_intra, compress = "xz", overwrite = TRUE)
172 | usethis::use_data(diamond_inter, compress = "xz", overwrite = TRUE)
173 | ```
174 | 
175 | ## cds_scerevisiae.rda
176 | 
177 | This is a `DNAStringSet` object containing the CDS of duplicated genes
178 | in the *S. cerevisiae* genome.
179 | 
180 | ``` r
181 | library(Biostrings)
182 | 
183 | # Load and process data
184 | data("yeast_seq")
185 | data("yeast_annot")
186 | pdata <- syntenet::process_input(yeast_seq, yeast_annot)
187 | 
188 | data(diamond_intra)
189 | 
190 | # Classify gene pairs
191 | c_standard <- classify_gene_pairs(
192 |     annotation = pdata$annotation,
193 |     blast_list = diamond_intra,
194 |     scheme = "standard"
195 | )
196 | 
197 | # Get TD-derived pairs
198 | td_pairs <- c_standard$Scerevisiae |>
199 |     dplyr::filter(type == "TD")
200 | td_pairs <- unique(c(td_pairs$dup1, td_pairs$dup2))
201 | td_pairs <- gsub(".*_", "", td_pairs)
202 | 
203 | # Get CDS and keep only longest isoform
204 | cds_scerevisiae_full <- readDNAStringSet(
205 |     "http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-54/fungi/fasta/saccharomyces_cerevisiae/cds/Saccharomyces_cerevisiae.R64-1-1.cds.all.fa.gz"
206 | ) |> ensembl_longest_isoform()
207 | 
208 | # Keep only duplicated genes
209 | cds_scerevisiae <- cds_scerevisiae_full[names(cds_scerevisiae_full) %in% 
210 |                                             td_pairs]
211 | 
212 | # Write, read, and export file
213 | out <- tempfile(fileext = ".fa")
214 | writeXStringSet(cds_scerevisiae, filepath = out)
215 | 
216 | cds_scerevisiae <- Biostrings::readDNAStringSet(out)
217 | 
218 | usethis::use_data(cds_scerevisiae, compress = "xz", overwrite = TRUE)
219 | ```
220 | 
221 | ## scerevisiae_kaks.rda
222 | 
223 | This object is a data frame of duplicate pairs and their Ks values.
224 | 
225 | ``` r
226 | # Get all duplicated gene pairs
227 | library(Biostrings)
228 | 
229 | data(yeast_seq)
230 | data(yeast_annot)
231 | data(diamond_intra)
232 | data(diamond_inter)
233 | pdata <- syntenet::process_input(yeast_seq, yeast_annot)
234 | 
235 | # Classify genes into the extended scheme
236 | c_extended <- classify_gene_pairs(
237 |     blast_list = diamond_intra,
238 |     annotation = pdata$annotation,
239 |     scheme = "extended",
240 |     blast_inter = diamond_inter
241 | )
242 | 
243 | # Get CDS (full set created above as `cds_scerevisiae_full`)
244 | cds <- list(Scerevisiae = cds_scerevisiae_full)
245 | 
246 | # Calculate Ks values
247 | scerevisiae_kaks_list <- pairs2kaks(c_extended, cds)
248 | scerevisiae_kaks <- scerevisiae_kaks_list$Scerevisiae
249 | 
250 | fungi_kaks2 <- fungi_kaks
251 | fungi_kaks2 <- lapply(fungi_kaks2, function(x) {
252 |     
253 |     x$Ka <- signif(x$Ka, 3)
254 |     x$Ks <- signif(x$Ks, 3)
255 |     x$Ka_Ks <- signif(x$Ka_Ks, 3)
256 |     
257 |     return(x)
258 | })
259 | 
260 | usethis::use_data(scerevisiae_kaks, compress = "xz", overwrite = TRUE)
261 | ```
262 | 
263 | ## gmax_ks.rda
264 | 
265 | This object is a 3-column data frame of duplicate pairs and their Ks
266 | values for *Glycine max* (soybean).
267 | 
268 | ``` r
269 | # Get data
270 | annot <- rtracklayer::import(
271 |     "http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-53/plants/gtf/glycine_max/Glycine_max.Glycine_max_v2.1.53.gtf.gz"
272 | )
273 | annot <- list(Gmax = annot)
274 | 
275 | seq <- Biostrings::readAAStringSet(
276 |     "http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-53/plants/fasta/glycine_max/pep/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz"
277 | ) |> ensembl_longest_isoform()
278 | seq <- list(Gmax = seq)
279 | 
280 | cds <- Biostrings::readDNAStringSet(
281 |     "http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-53/plants/fasta/glycine_max/cds/Glycine_max.Glycine_max_v2.1.cds.all.fa.gz"
282 | ) |> ensembl_longest_isoform()
283 | 
284 | # Process data
285 | pdata <- syntenet::process_input(seq, annot)
286 | 
287 | # Intraspecies comparison
288 | diamond_intra <- run_diamond(
289 |     seq = pdata$seq["Gmax"],
290 |     compare = "intraspecies", 
291 |     outdir = file.path(tempdir(), "diamond_intra_data"),
292 |     ... = "--sensitive"
293 | )
294 | 
295 | # Binary classification (SD vs. SSD), selected via the `scheme` argument
296 | c_binary <- classify_gene_pairs(
297 |     blast_list = diamond_intra,
298 |     annotation = pdata$annotation,
299 |     scheme = "binary"
300 | )
301 | 
302 | cds <- list(Gmax = cds)
303 | 
304 | # Calculate Ks values
305 | gmax_kaks_list <- pairs2kaks(c_binary, cds)
306 | gmax_ks <- gmax_kaks_list$Gmax
307 | gmax_ks <- gmax_ks[, c("dup1", "dup2", "Ks", "type")]
308 | 
309 | gmax_ks <- gmax_ks[gmax_ks$Ks <= 2, ]
310 | gmax_ks <- gmax_ks[!is.na(gmax_ks$Ks), ]
311 | 
312 | gmax_ks$Ks <- signif(gmax_ks$Ks, 3) # to reduce object size
313 | 
314 | usethis::use_data(gmax_ks, compress = "xz", overwrite = TRUE)
315 | ```
316 | 


--------------------------------------------------------------------------------
/R/ka_ks_analyses.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #' Calculate Ka, Ks, and Ka/Ks from duplicate gene pairs
  4 | #'
  5 | #' @param gene_pairs_list List of data frames containing duplicated gene pairs
  6 | #' as returned by \code{classify_gene_pairs()}.
  7 | #' @param cds List of DNAStringSet objects containing the coding sequences 
  8 | #' of each gene.
  9 | #' @param model Character scalar indicating which codon model to use.
 10 | #' Possible values are "Li", "NG86", "NG", "LWL", "LPB", "MLWL", "MLPB", "GY", 
 11 | #' "YN", "MYN", "MS", "MA", "GNG", "GLWL", "GLPB", "GMLWL", "GMLPB", "GYN", 
 12 | #' and "GMYN". Default: "MYN".
 13 | #' @param threads Numeric indicating the number of threads to use. Default: 1.
 14 | #' @param verbose Logical indicating whether progress messages should be 
 15 | #' printed on screen. Default: FALSE.
 16 | #' 
 17 | #' @return A list of data frames containing gene pairs and their Ka, Ks,
 18 | #' and Ka/Ks values.
 19 | #' 
 20 | #' @importFrom MSA2dist indices2kaks
 21 | #' @importFrom Biostrings width
 22 | #' @export
 23 | #' @rdname pairs2kaks
 24 | #' @examples 
 25 | #' data(diamond_intra)
 26 | #' data(diamond_inter)
 27 | #' data(yeast_annot)
 28 | #' data(yeast_seq)
 29 | #' data(cds_scerevisiae)
 30 | #' blast_list <- diamond_intra
 31 | #' blast_inter <- diamond_inter
 32 | #' 
 33 | #' pdata <- syntenet::process_input(yeast_seq, yeast_annot)
 34 | #' annot <- pdata$annotation["Scerevisiae"]
 35 | #' 
 36 | #' # Standard classification scheme (default)
 37 | #' pairs <- classify_gene_pairs(annot, blast_list)
 38 | #' td_pairs <- pairs[[1]][pairs[[1]]$type == "TD", ]
 39 | #' gene_pairs_list <- list(
 40 | #'     Scerevisiae = td_pairs[seq(1, 3, by = 1), ]
 41 | #' )
 42 | #' 
 43 | #' cds <- list(Scerevisiae = cds_scerevisiae)
 44 | #' 
 45 | #' kaks <- pairs2kaks(gene_pairs_list, cds)
 46 | #' 
 47 | pairs2kaks <- function(
 48 |         gene_pairs_list, cds, model = "MYN", threads = 1, verbose = FALSE
 49 | ) {
 50 |     
 51 |     kaks_list <- lapply(seq_along(gene_pairs_list), function(x) {
 52 |         
 53 |         # Get pairs and CDS for species x
 54 |         species <- names(gene_pairs_list)[x]
 55 |         if(verbose) { message("Calculating rates for species '", species, "'") }
 56 |         
 57 |         pairs <- gene_pairs_list[[x]]
 58 |         names(pairs)[c(1, 2)] <- c("dup1", "dup2")
 59 |         pairs$dup1 <- gsub("^[a-zA-Z]{2,5}_", "", pairs$dup1)
 60 |         pairs$dup2 <- gsub("^[a-zA-Z]{2,5}_", "", pairs$dup2)
 61 |         fcds <- cds[[species]]
 62 |         
 63 |         # Check if IDs in pairs are all present in CDS
 64 |         c1 <- check_geneid_match(unique(c(pairs$dup1, pairs$dup2)), names(fcds))
 65 |         
 66 |         # Remove CDS that are not multiple of 3
 67 |         fcds <- cds[[species]]
 68 |         m3 <- Biostrings::width(fcds) %% 3
 69 |         remove <- which(m3 != 0)
 70 |         if(length(remove) != 0) {
 71 |             message(
 72 |                 "For species ", species, ", the lengths of ", length(remove), 
 73 |                 " CDS are not multiples of 3. Removing them..."
 74 |             )
 75 |             pairs <- pairs[!pairs$dup1 %in% names(fcds)[remove], ]
 76 |             pairs <- pairs[!pairs$dup2 %in% names(fcds)[remove], ]
 77 |             fcds <- fcds[-remove]
 78 |         }
 79 |         
 80 |         # Create a list of indices - vectors of length 2
 81 |         idx_df <- data.frame(gene = names(fcds), idx = seq_along(fcds))
 82 |         pairs_idx <- lapply(seq_len(nrow(pairs)), function(p) {
 83 |             idx <- c(
 84 |                 idx_df$idx[idx_df$gene == pairs[p, 1]],
 85 |                 idx_df$idx[idx_df$gene == pairs[p, 2]]
 86 |             )
 87 |             
 88 |             return(idx)
 89 |         })
 90 |         
 91 |         # Calculate rates
 92 |         rates <- MSA2dist::indices2kaks(
 93 |             cds = fcds, indices = pairs_idx,
 94 |             model = model, 
 95 |             threads = threads,
 96 |             isMSA = FALSE, 
 97 |             verbose = FALSE
 98 |         )
 99 |         
100 |         rates <- data.frame(
101 |             dup1 = rates$seq1,
102 |             dup2 = rates$seq2,
103 |             Ka = as.numeric(ifelse(rates$Ka == "NA", NA, rates$Ka)),
104 |             Ks = as.numeric(ifelse(rates$Ks == "NA", NA, rates$Ks)),
105 |             Ka_Ks = as.numeric(ifelse(rates[["Ka/Ks"]] == "NA", NA, rates[["Ka/Ks"]]))
106 |         )
107 |         if("type" %in% names(pairs)) {
108 |             rates$type <- pairs$type
109 |         }
110 |         
111 |         return(rates)
112 |     })
113 |     names(kaks_list) <- names(gene_pairs_list)
114 |     
115 |     return(kaks_list)
116 | }
117 | 
118 | 
119 | #' Find peaks in a Ks distribution with Gaussian Mixture Models
120 | #'
121 | #' @param ks A numeric vector of Ks values.
122 | #' @param npeaks Numeric scalar indicating the number of peaks in 
123 | #' the Ks distribution. If you don't know how many peaks there are, 
124 | #' you can include a range of values, and the number of peaks that produces
125 | #' the lowest BIC (Bayesian Information Criterion) will be selected as the
126 | #' optimal. Default: 2.
127 | #' @param min_ks Numeric scalar with the minimum Ks value. Removing
128 | #' very small Ks values is generally used to avoid the incorporation of allelic 
129 | #' and/or splice variants and to prevent the fitting of a component to infinity.
130 | #' Default: 0.01.
131 | #' @param max_ks Numeric scalar indicating the maximum Ks value. Removing
132 | #' very large Ks values is usually performed to account for Ks saturation.
133 | #' Default: 4.
134 | #' @param verbose Logical indicating if messages should be printed on screen.
135 | #' Default: FALSE.
136 | #' 
137 | #' @return A list with the following elements:
138 | #' \describe{
139 | #'   \item{mean}{Numeric with the estimated means.}
140 | #'   \item{sd}{Numeric with the estimated standard deviations.}
141 | #'   \item{lambda}{Numeric with the estimated mixture weights.}
142 | #'   \item{ks}{Numeric vector of filtered Ks distribution based on
143 | #'             arguments passed to min_ks and max_ks.}
144 | #' }
145 | #' @importFrom mclust densityMclust
146 | #' @export
147 | #' @rdname find_ks_peaks
148 | #' @examples 
149 | #' data(fungi_kaks)
150 | #' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
151 | #' ks <- scerevisiae_kaks$Ks
152 | #' 
153 | #' # Find 2 peaks in Ks distribution
154 | #' peaks <- find_ks_peaks(ks, npeaks = 2)
155 | #' 
156 | #' # From 2 to 4 peaks, verbose = TRUE to show BIC values
157 | #' peaks <- find_ks_peaks(ks, npeaks = c(2, 3, 4), verbose = TRUE)
find_ks_peaks <- function(ks, npeaks = 2, min_ks = 0.01, max_ks = 4,
                          verbose = FALSE) {
    
    # Data preprocessing: drop missing values, keep min_ks <= Ks <= max_ks
    ks <- ks[!is.na(ks)]
    fks <- ks[ks >= min_ks & ks <= max_ks]
    if(length(fks) == 0) {
        stop("No Ks values left after filtering with 'min_ks' and 'max_ks'.")
    }
    
    # Find peaks (if `npeaks` has 2+ values, mclust selects among them)
    peaks <- mclust::densityMclust(
        fks, G = npeaks, verbose = FALSE, plot = FALSE
    )
    
    # Scalar condition: use short-circuit `&&` rather than elementwise `&`
    if(verbose && length(npeaks) > 1) {
        message("Optimal number of peaks: ", peaks$G)
        print(peaks$BIC)
    }
    
    # With an equal-variance model, mclust stores a single shared variance;
    # recycle it so that `sd` always has one value per component, as
    # downstream code indexes `sd` by peak
    means <- peaks$parameters$mean
    sds <- sqrt(peaks$parameters$variance$sigmasq)
    if(length(sds) != length(means)) {
        sds <- rep(sds, length.out = length(means))
    }
    
    # Create result list
    peak_list <- list(
        mean = means, 
        sd = sds, 
        lambda = peaks$parameters$pro,
        ks = as.numeric(peaks$data[, 1])
    )
    return(peak_list)
}
185 | 
186 | 
187 | 
188 | 
189 | 
190 | #' Split gene pairs based on their Ks peaks
191 | #' 
192 | #' The purpose of this function is to classify gene pairs by age when there
193 | #' are 2+ Ks peaks. This way, newer gene pairs are found within a 
194 | #' certain number of standard deviations from the highest peak, 
195 | #' and older genes are found close within smaller peaks.
196 | #'
197 | #' @param ks_df A 3-column data frame with gene pairs in columns 1 and 2,
198 | #' and Ks values for the gene pair in column 3.
199 | #' @param peaks A list with mean, standard deviation, and amplitude of Ks
200 | #' peaks as generated by \code{find_ks_peaks}.
201 | #' @param nsd Numeric with the number of standard deviations to consider
202 | #' for each peak.
203 | #' @param binwidth Numeric scalar with binwidth for the histogram.
204 | #' Default: 0.05.
205 | #'
206 | #' @return A list with the following elements:
207 | #' \describe{
208 | #'   \item{pairs}{A 4-column data frame with the variables 
209 | #'                \strong{dup1} (character), \strong{dup2} (character), 
210 | #'                \strong{ks} (numeric), and \strong{peak} (numeric),
211 | #'                representing duplicate gene pair, Ks values, and peak ID,
212 | #'                respectively.}
213 | #'   \item{plot}{A ggplot object with Ks peaks as returned by 
214 | #'               \code{plot_ks_peaks}, but with dashed red lines indicating
215 | #'               boundaries for each peak.}
216 | #' }
217 | #' 
218 | #' @importFrom ggplot2 geom_vline
219 | #' @export
220 | #' @rdname split_pairs_by_peak
221 | #' @examples
222 | #' data(fungi_kaks)
223 | #' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
224 | #'
225 | #' # Create a data frame of duplicate pairs and Ks values
226 | #' ks_df <- scerevisiae_kaks[, c("dup1", "dup2", "Ks")]
227 | #'
228 | #' # Create list of peaks
229 | #' peaks <- find_ks_peaks(ks_df$Ks, npeaks = 2)
230 | #' 
231 | #' # Split pairs
232 | #' spairs <- split_pairs_by_peak(ks_df, peaks) 
split_pairs_by_peak <- function(ks_df, peaks, nsd = 2, binwidth = 0.05) {
    
    names(ks_df) <- c("dup1", "dup2", "ks")
    npeaks <- length(peaks$mean)
    
    # Filter Ks data frame as done in find_ks_peaks()
    max_ks <- max(peaks$ks)
    min_ks <- min(peaks$ks)
    ks_df <- ks_df[!is.na(ks_df$ks), ]
    ks_df <- ks_df[ks_df$ks >= min_ks & ks_df$ks <= max_ks, ]
    
    # Get minimum, intersection points, and maximum:
    # outer bounds are `nsd` standard deviations away from the first and
    # last peaks, clamped to [0, max observed Ks]
    min_boun <- peaks$mean[1] - nsd * peaks$sd[1]
    if(min_boun < 0) { min_boun <- 0 }
    max_boun <- peaks$mean[npeaks] + nsd * peaks$sd[npeaks]
    if(max_boun > max(ks_df$ks)) { max_boun <- max(ks_df$ks) }
    
    if(npeaks == 1) {
        cutpoints <- c(min_boun, max_boun)
    } else {
        inter <- find_intersect_mixtures(peaks)
        cutpoints <- c(min_boun, inter, max_boun)
    }

    # Plot histogram with cutpoints in "brown2" dashed lines
    p <- plot_ks_peaks(peaks, binwidth = binwidth)
    for(i in seq_along(cutpoints)) {
        p <- p + geom_vline(xintercept = cutpoints[i],
                            linetype = "dashed", color = "brown2")
    }
    
    # Assign pairs to intervals between consecutive cutpoints.
    # Intervals are [lower, upper), except for the last one, which is closed
    # on the right so that pairs lying exactly on the upper bound (e.g., the
    # maximum Ks when the bound was clamped to it) are not dropped.
    n_intervals <- length(cutpoints) - 1
    split_pairs <- Reduce(rbind, lapply(seq_len(n_intervals), function(x) {
        lower <- cutpoints[x]
        upper <- cutpoints[x + 1]
        if(x == n_intervals) {
            below_upper <- ks_df$ks <= upper
        } else {
            below_upper <- ks_df$ks < upper
        }
        pairs <- ks_df[ks_df$ks >= lower & below_upper, ]
        # rep() keeps the assignment valid for intervals with zero pairs
        pairs$peak <- rep(x, nrow(pairs))
        return(pairs)
    }))
    
    result_list <- list(pairs = split_pairs, plot = p)
    return(result_list)
}
279 | 
280 | 
281 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' Get a list of anchor pairs for each species
  3 | #'
  4 | #' @param blast_list A list of data frames containing BLAST tabular output
  5 | #' for intraspecies comparisons.
  6 | #' Each list element corresponds to the BLAST output for a given species,
  7 | #' and names of list elements must match the names of list elements in
  8 | #' `annotation`. BLASTp, DIAMOND or similar programs must be run on processed
  9 | #' sequence data as returned by \code{process_input()}.
 10 | #' @param annotation A processed GRangesList or CompressedGRangesList object as
 11 | #' returned by \code{syntenet::process_input()}.
 12 | #' @param evalue Numeric scalar indicating the E-value threshold. 
 13 | #' Default: 1e-10.
 14 | #' @param anchors Numeric indicating the minimum required number of genes
 15 | #' to call a syntenic block, as in \code{syntenet::infer_syntenet}. 
 16 | #' Default: 5.
 17 | #' @param max_gaps Numeric indicating the number of upstream and downstream
 18 | #' genes to search for anchors, as in \code{syntenet::infer_syntenet}. 
 19 | #' Default: 25.
 20 | #' @param collinearity_dir Character indicating the path to the directory
 21 | #' where .collinearity files will be stored. If NULL, files will
 22 | #' be stored in a subdirectory of \code{tempdir()}. Default: NULL.
 23 | #' 
 24 | #' @return A list of data frames representing intraspecies anchor pairs.
 25 | #' @importFrom syntenet intraspecies_synteny
 26 | #' @export
 27 | #' @rdname get_anchors_list 
 28 | #' @examples 
 29 | #' data(diamond_intra)
 30 | #' data(yeast_annot)
 31 | #' data(yeast_seq)
 32 | #' blast_list <- diamond_intra
 33 | #' 
 34 | #' # Get processed annotation for S. cerevisiae
 35 | #' annotation <- syntenet::process_input(yeast_seq, yeast_annot)$annotation
 36 | #' 
 37 | #' # Get list of intraspecies anchor pairs
 38 | #' anchorpairs <- get_anchors_list(blast_list, annotation)
 39 | get_anchors_list <- function(
 40 |         blast_list = NULL, annotation = NULL,
 41 |         evalue = 1e-10, anchors = 5, max_gaps = 25,
 42 |         collinearity_dir = NULL
 43 | ) {
 44 |     
 45 |     # Create output directory
 46 |     intradir <- collinearity_dir
 47 |     if(is.null(intradir)) {
 48 |         daytime <- format(Sys.time(), "%d_%b_%Y_%Hh%M")
 49 |         intradir <- file.path(tempdir(), paste0("intra_", daytime))
 50 |     }
 51 | 
 52 |     # Filter DIAMOND list by e-value
 53 |     fblast <- lapply(blast_list, function(x) return(x[x$evalue <= evalue, ]))
 54 |     
 55 |     # Get .collinearity files for intragenome comparisons
 56 |     col_files <- syntenet::intraspecies_synteny(
 57 |         blast_intra = fblast, 
 58 |         annotation = annotation,
 59 |         intra_dir = intradir, 
 60 |         anchors = anchors, 
 61 |         max_gaps = max_gaps
 62 |     )
 63 |     
 64 |     # Parse files
 65 |     anchors <- lapply(col_files, syntenet::parse_collinearity)
 66 |     names(anchors) <- gsub("\\.collinearity", "", basename(col_files))
 67 |     
 68 |     return(anchors)
 69 | }
 70 | 
 71 | 
 72 | #' Parse .collinearity files into a data frame of syntenic blocks
 73 | #'
 74 | #' @param collinearity_paths Character vector of paths to .collinearity files.
 75 | #'
 76 | #' @return A 3-column data frame with the variables:
 77 | #' \describe{
 78 | #'   \item{block}{Syntenic block}
 79 | #'   \item{anchor1}{Anchor pair 1}
 80 | #'   \item{anchor2}{Anchor pair 2}
 81 | #' }
 82 | #'
 83 | #' @importFrom utils read.table
 84 | #' @noRd
 85 | collinearity2blocks <- function(collinearity_paths = NULL) {
 86 |     
 87 |     fname <- gsub("\\.collinearity", "", basename(collinearity_paths))
 88 |     names(collinearity_paths) <- fname
 89 |     
 90 |     blocks <- lapply(seq_along(collinearity_paths), function(x) {
 91 |         lines <- readLines(collinearity_paths[x])
 92 |         nlines <- length(lines[!startsWith(lines, "#")])
 93 |         
 94 |         df <- NULL
 95 |         if(nlines > 0) {
 96 |             df <- read.table(
 97 |                 collinearity_paths[x], sep = "\t", comment.char = "#"
 98 |             )
 99 |             df <- df[, c(1, 2, 3)]
100 |             
101 |             # Reorder columns based on original order (not alphabetically)
102 |             spp1 <- unlist(strsplit(names(collinearity_paths[x]), "_"))[1]
103 |             id1 <- gsub("_.*", "", df$V2[1])
104 |             new_names <- c("anchor1", "anchor2") 
105 |             if(!startsWith(spp1, id1)) {
106 |                 new_names <- c("anchor2", "anchor1")
107 |             }
108 |             names(df)[c(2, 3)] <- new_names
109 |             
110 |             # Get syntenic block IDs
111 |             df$V1 <- gsub(":", "", df$V1)
112 |             block_ids <- strsplit(df$V1, "-")
113 |             blocks <- lapply(block_ids, function(x) return(as.numeric(x[1])))
114 |             
115 |             # Add syntenic block IDs to data frame
116 |             df$block <- unlist(blocks)
117 |             df <- df[, c("block", "anchor1", "anchor2")]
118 |         }
119 |         return(df)
120 |     })
121 |     blocks <- Reduce(rbind, blocks)
122 |     return(blocks)
123 | }
124 | 
125 | 
126 | #' Get a data frame of intron counts per gene
127 | #' 
128 | #' @param txdb A `TxDb` object with transcript annotations. See details below 
129 | #' for examples on how to create `TxDb` objects from different kinds of input.
130 | #' 
131 | #' 
132 | #' @return A data frame with intron counts per gene, with variables:
133 | #' \describe{
134 | #'   \item{gene}{Character with gene IDs.}
135 | #'   \item{introns}{Numeric with number of introns per gene.}
136 | #' }
137 | #' 
138 | #' @details
139 | #' The family of functions \code{makeTxDbFrom*} from 
140 | #' the \strong{txdbmaker} package can be used to create `TxDb` objects
141 | #' from a variety of input data types. You can create `TxDb` objects
142 | #' from e.g., `GRanges` objects (\code{makeTxDbFromGRanges()}),
143 | #' GFF files (\code{makeTxDbFromGFF()}), 
144 | #' an Ensembl database (\code{makeTxDbFromEnsembl}), and
145 | #' a Biomart database (\code{makeTxDbFromBiomart}).
146 | #' 
147 | #' @rdname get_intron_counts
148 | #' @export
149 | #' @importFrom GenomicFeatures intronsByTranscript
150 | #' @importFrom AnnotationDbi select
151 | #'
152 | #' @examples
153 | #' data(yeast_annot)
154 | #' 
155 | #' # Create TxDb object from GRanges
156 | #' library(txdbmaker)
157 | #' txdb <- txdbmaker::makeTxDbFromGRanges(yeast_annot[[1]])
158 | #'
159 | #' # Get intron counts
160 | #' intron_counts <- get_intron_counts(txdb)
get_intron_counts <- function(txdb) {
    
    # Number of introns for every transcript in the TxDb
    tx_introns <- intronsByTranscript(txdb, use.names = TRUE)
    per_tx <- data.frame(
        tx = names(tx_introns),
        introns = lengths(tx_introns)
    )
    
    # Map transcript names to gene IDs (messages from select() are silenced)
    suppressMessages({
        tx_gene_map <- AnnotationDbi::select(
            txdb, 
            keys = unique(per_tx$tx),
            columns = "GENEID", 
            keytype = "TXNAME"
        )
    })
    names(tx_gene_map) <- c("tx", "gene")
    
    # Join counts with genes, then keep a single row per gene: sorting by
    # decreasing intron count first means the retained row is the one with
    # the highest count among the gene's transcripts
    per_gene <- merge(per_tx, tx_gene_map)[, c("gene", "introns")]
    by_count <- order(-per_gene$introns)
    per_gene <- per_gene[by_count, ]
    per_gene <- per_gene[!duplicated(per_gene$gene), ]
    rownames(per_gene) <- NULL
    
    per_gene
}
190 | 
191 | 
192 | 
193 | #' Find line intersect between pairs of Gaussian mixtures
194 | #'
195 | #' This function finds x-axis coordinate of n-1 intersections between lines 
196 | #' of n Gaussian mixtures. Thus, it will find 1 intersection for Ks distros
197 | #' with 2 peaks, 2 intersections for distros with 3 peaks, and so on.
198 | #' 
199 | #' @param peaks A list with elements \strong{mean}, \strong{sd}, 
200 | #' \strong{lambda}, and \strong{ks}, as returned by the 
201 | #' function \code{find_ks_peaks()}.
202 | #'
203 | #' @return A numeric scalar or vector with the x-axis coordinates of the 
204 | #' intersections.
205 | #' @importFrom ggplot2 ggplot_build
206 | #' @noRd
207 | #' @rdname find_intersect_mixtures
208 | #' @examples
209 | #' data(fungi_kaks)
210 | #' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
211 | #' ks <- scerevisiae_kaks$Ks
212 | #' 
213 | #' # Find 2 peaks in Ks distribution
214 | #' peaks <- find_ks_peaks(ks, npeaks = 2)
215 | #'
216 | #' # Get intersects
217 | #' inter <- find_intersect_mixtures(peaks)
find_intersect_mixtures <- function(peaks) {
    
    # Validate before doing any plotting work
    npeaks <- length(peaks$mean)
    if(npeaks < 2) {
        stop("Cannot find intersect of peaks with only 1 peak.")
    }
    
    # Build the plot once and reuse the computed layer data.
    # NOTE(review): indexing assumes layer 1 of plot_ks_peaks() is the
    # histogram and layers 2..(npeaks + 1) are the component density lines,
    # all evaluated on the same x grid - confirm against plot_ks_peaks()
    layer_data <- ggplot_build(plot_ks_peaks(peaks))$data
    
    # Get intersection between density line i and density line i+1
    # (layer indices are derived from `npeaks`, so any number of peaks works)
    ints <- unlist(lapply(seq_len(npeaks - 1), function(i) {
        line1 <- layer_data[[i + 1]]
        line2 <- layer_data[[i + 2]]
        
        # Vertical distance between lines along the shared x grid
        delta <- line1$y - line2$y
        
        # Local minima of |delta|: the slope sign flips from -1 to +1
        # (second difference of the sign == 2); +1 realigns the index
        # after the two diff() calls
        local_min <- which(diff(sign(diff(abs(delta)))) == 2) + 1
        return(line1$x[local_min])
    }))
    return(ints)
}
250 | 
251 | 
252 | #' Get a duplicate count matrix for each genome
253 | #'
254 | #' @param duplicate_list A list of data frames with the duplicated genes or
255 | #' gene pairs and their modes of duplication as returned 
256 | #' by \code{classify_gene_pairs()} or \code{classify_genes()}.
257 | #' @param shape Character specifying the shape of the output data frame.
258 | #' One of "long" (data frame in the long shape, in the tidyverse sense),
259 | #' or "wide" (data frame in the wide shape, in the tidyverse sense).
260 | #' Default: "long".
261 | #' 
262 | #' @return If \strong{shape = "wide"}, a count matrix containing the 
263 | #' frequency of duplicated genes (or gene pairs) by mode for each species, 
264 | #' with species in rows and duplication modes in columns.
265 | #' If \strong{shape = "long"}, a data frame in long format with the following
266 | #' variables:
267 | #' \describe{
268 | #'   \item{type}{Factor, type of duplication.}
269 | #'   \item{n}{Numeric, number of duplicates.}
270 | #'   \item{species}{Character, species name}
271 | #' }
272 | #' 
273 | #' @export
274 | #' @rdname duplicates2counts
275 | #' @examples
276 | #' data(fungi_kaks)
277 | #' 
278 | #' # Get unique duplicates
279 | #' duplicate_list <- classify_genes(fungi_kaks)
280 | #' 
281 | #' # Get count table
282 | #' counts <- duplicates2counts(duplicate_list)
duplicates2counts <- function(duplicate_list, shape = "long") {
    
    # Validate `shape` once, up front (the message names the real argument)
    if(!(length(shape) == 1 && shape %in% c("long", "wide"))) {
        stop("Argument 'shape' must be one of 'long' or 'wide'.")
    }
    
    # Get factor levels for variable `type`.
    # Non-factor columns have no levels, so fall back to their sorted unique
    # values. Level sets are combined starting from the longest one: when
    # the longest set already contains all types, its order is used as is;
    # otherwise, types found only in other species are appended instead of
    # being silently dropped from the counts.
    level_sets <- lapply(duplicate_list, function(x) {
        if(is.factor(x$type)) {
            return(levels(x$type))
        }
        return(sort(unique(as.character(x$type))))
    })
    level_sets <- level_sets[order(-lengths(level_sets))]
    tlevels <- unique(unlist(level_sets, use.names = FALSE))
    
    counts <- Reduce(rbind, lapply(seq_along(duplicate_list), function(x) {
        
        species <- names(duplicate_list)[x]
        
        # Frequency of each duplication mode (zero counts included)
        dup_table <- duplicate_list[[x]]
        dup_table$type <- factor(dup_table$type, levels = tlevels)
        type_counts <- table(dup_table$type)
        
        if(shape == "long") {
            # One row per mode
            final_dups <- as.data.frame(type_counts)
            names(final_dups) <- c("type", "n")
            final_dups$species <- species
        } else {
            # One row per species, one column per mode
            final_dups <- t(as.matrix(type_counts))
            final_dups <- cbind(species, as.data.frame(final_dups))
        }
        
        return(final_dups)
    }))
    
    return(counts)
}
312 | 
313 | 


--------------------------------------------------------------------------------
/R/visualization.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' Create a named vector with a color palette for duplication modes
  3 | #' 
  4 | #' @return A named character vector with colors for each duplication mode.
  5 | #' @noRd
  6 | #' 
  7 | dup_palette <- function() {
  8 |     
  9 |     pal <- c(
 10 |         All = "gray20",
 11 |         SD = "#EFC000FF",
 12 |         TD = "#CD534CFF",
 13 |         PD = "#79AF97FF",
 14 |         TRD = "#7AA6DCFF",
 15 |         rTRD = "#7AA6DCFF",
 16 |         dTRD = "#003C67FF",
 17 |         DD = "#6A6599FF",
 18 |         SSD = "#7AA6DCFF"
 19 |     )
 20 |     
 21 |     return(pal)
 22 | }
 23 | 
 24 | 
 25 | #' Plot frequency of duplicates per mode for each species
 26 | #'
 27 | #' @param dup_counts A data frame in long format with the number of
 28 | #' duplicates per mode for each species, as returned by 
 29 | #' the function \code{duplicates2counts}.
 30 | #' @param plot_type Character indicating how to plot frequencies. One of
 31 | #' 'facet' (facets for each level of the variable \strong{type}),
 32 | #' 'stack' (levels of the variable \strong{type} as stacked bars), or
 33 | #' 'stack_percent' (levels of the variable \strong{type} as stacked bars,
 34 | #' with x-axis representing relative frequencies). Default: 'facet'.
 35 | #' @param remove_zero Logical indicating whether or not to remove rows
 36 | #' with zero values. Default: TRUE.
 37 | #' 
 38 | #' @return A ggplot object.
 39 | #' 
 40 | #' @importFrom ggplot2 ggplot aes geom_bar facet_wrap theme_bw theme labs
 41 | #' scale_fill_manual element_blank
 42 | #' @importFrom rlang .data
 43 | #' @export
 44 | #' @rdname plot_duplicate_freqs
 45 | #' @examples
 46 | #' data(fungi_kaks)
 47 | #' 
 48 | #' # Get unique duplicates
 49 | #' duplicate_list <- classify_genes(fungi_kaks)
 50 | #' 
 51 | #' # Get count table
 52 | #' dup_counts <- duplicates2counts(duplicate_list)
 53 | #' 
 54 | #' # Plot counts
 55 | #' plot_duplicate_freqs(dup_counts, plot_type = "stack_percent")
 56 | plot_duplicate_freqs <- function(
 57 |         dup_counts, plot_type = "facet", remove_zero = TRUE
 58 | ) {
 59 |     
 60 |     # Define palette
 61 |     pal <- dup_palette()
 62 |     
 63 |     # Remove zeros
 64 |     if(remove_zero) { dup_counts <- dup_counts[dup_counts$n != 0, ] }
 65 |     
 66 |     if(plot_type == "facet") {
 67 |         p <- ggplot(dup_counts, aes(x = .data$n, y = .data$species)) +
 68 |             geom_bar(
 69 |                 aes(fill = .data$type), stat = "identity", color = "grey20",
 70 |                 show.legend = FALSE
 71 |             ) +
 72 |             facet_wrap("type", nrow = 1, scales = "free_x") +
 73 |             labs(y = "", x = "Absolute frequency")
 74 |         
 75 |     } else if(plot_type == "stack") {
 76 |         p <- ggplot(
 77 |             dup_counts, aes(x = .data$n, y = .data$species, fill = .data$type)
 78 |         ) +
 79 |             geom_bar(color = "gray20", position = "stack", stat = "identity") +
 80 |             labs(fill = "Type", y = "", x = "Absolute frequency")
 81 |         
 82 |     } else if(plot_type == "stack_percent") {
 83 |         p <- ggplot(
 84 |             dup_counts, aes(x = .data$n, y = .data$species, fill = .data$type)
 85 |         ) +
 86 |             geom_bar(position = "fill", stat = "identity", color = "gray20") +
 87 |             labs(fill = "Type", y = "", x = "Relative frequency")
 88 |         
 89 |     } else {
 90 |         stop("Input to argument 'plot_type' must be one of 'facet', 'stack', or 'stack_percent'.")
 91 |     }
 92 |     
 93 |     p <- p + 
 94 |         scale_fill_manual(values = pal) +
 95 |         theme_bw() +
 96 |         theme(panel.grid = element_blank())
 97 | 
 98 |     return(p)
 99 | }
100 | 
101 | 
102 | #' Plot distribution of synonymous substitution rates (Ks)
103 | #' 
104 | #' @param ks_df A data frame with Ks values for each gene pair
105 | #' as returned by \code{pairs2kaks()}.
106 | #' @param min_ks Numeric indicating the minimum Ks value to keep. 
107 | #' Default: 0.01.
108 | #' @param max_ks Numeric indicating the maximum Ks value to keep.
109 | #' Default: 2.
110 | #' @param bytype Logical indicating whether or not to plot the distribution
111 | #' by type of duplication (requires a column named `type`).
112 | #' @param type_levels (Only valid if \strong{bytype = TRUE}) Character
113 | #' indicating which levels of the variable \strong{type} should be kept.
114 | #' By default, all levels are kept.
115 | #' @param plot_type Character indicating the type of plot to create. 
116 | #' If \strong{bytype = TRUE}, possible types are "histogram" or "violin".
117 | #' If \strong{bytype = FALSE}, possible types are "histogram", "density",
118 | #' or "density_histogram". Default: "histogram".
119 | #' @param binwidth (Only valid if \strong{plot_type} is "histogram" or
120 | #' "density_histogram") Numeric indicating the bin width. Default: 0.03.
121 | #' 
122 | #' @return A ggplot object.
123 | #'
124 | #' @importFrom ggplot2 ggplot aes geom_density geom_histogram facet_grid
125 | #' theme theme_bw geom_violin geom_boxplot scale_x_continuous vars
126 | #' scale_y_continuous after_stat
127 | #' @importFrom stats density
128 | #' @export
129 | #' @rdname plot_ks_distro
130 | #' @examples
131 | #' data(fungi_kaks)
132 | #' ks_df <- fungi_kaks$saccharomyces_cerevisiae
133 | #' 
134 | #' # Plot distro
135 | #' plot_ks_distro(ks_df, bytype = TRUE)
plot_ks_distro <- function(
        ks_df, min_ks = 0.01, max_ks = 2,
        bytype = FALSE, type_levels = NULL,
        plot_type = "histogram",
        binwidth = 0.03
) {
    
    mode_colors <- dup_palette()
    
    # Keep rows with non-missing Ks in [min_ks, max_ks]; which() discards
    # comparisons that evaluate to NA
    in_range <- which(ks_df$Ks >= min_ks & ks_df$Ks <= max_ks)
    ks_kept <- ks_df[in_range, ]
    
    if(bytype) {
        # Prepend a copy of all pairs labelled "All", so the combined
        # distribution is shown next to the per-mode ones
        combined <- ks_kept
        combined$type <- factor("All", levels = "All")
        ks_kept <- rbind(combined, ks_kept)
        
        # Optionally restrict to the requested modes
        if(!is.null(type_levels)) {
            ks_kept <- ks_kept[ks_kept$type %in% type_levels, ]
            ks_kept$type <- droplevels(ks_kept$type)
        }
        
        if(plot_type == "histogram") {
            # One histogram per mode, stacked in rows with free y-axes
            bars <- geom_histogram(
                aes(fill = .data$type),
                color = "gray30", binwidth = binwidth, show.legend = FALSE
            )
            fig <- ggplot(ks_kept, aes(x = .data$Ks)) + 
                bars +
                scale_fill_manual(values = mode_colors) +
                facet_grid(rows = vars(.data$type), scales = "free_y") +
                labs(y = "Count")
        } else if(plot_type == "violin") {
            # One violin per mode
            fig <- ggplot(ks_kept, aes(x = .data$Ks, y = .data$type)) +
                geom_violin(aes(fill = .data$type), show.legend = FALSE) +
                scale_fill_manual(values = mode_colors) +
                labs(y = "Type")
        } else {
            stop("When plotting by groups, `plot_type` must be either 'histogram' or 'violin'.")
        }
        fig <- fig + labs(title = "Ks distribution for gene pairs by mode")
        
    } else {
        
        canvas <- ggplot(ks_kept, aes(x = .data$Ks))
        if(plot_type == "histogram") {
            fig <- canvas + 
                geom_histogram(
                    fill = "#9196ca", color = "#3e57a7", binwidth = binwidth
                ) +
                labs(y = "Count")
        } else if(plot_type == "density") {
            fig <- canvas +
                geom_density(fill = "#9196ca", color = "#3e57a7") +
                labs(y = "Density")
        } else if(plot_type == "density_histogram") {
            # Histogram on the density scale with a density line on top
            fig <- canvas + 
                geom_histogram(
                    aes(y = after_stat(density)), alpha = 0.5,
                    fill = "#9196ca", color = "#3e57a7", binwidth = binwidth
                ) +
                geom_density(color = "gray30", linewidth = 1) +
                labs(y = "Density")
        } else {
            stop("Without groups, `plot_type` must be one of 'histogram', 'density', or 'density_histogram'.")
        }
        fig <- fig + 
            scale_y_continuous(expand = c(1e-2, 1e-2)) +
            labs(title = "Ks distribution for gene pairs")
    }
    
    # Polish plot: theme, x-axis padding, and axis label
    fig + list(
        theme_bw(),
        theme(panel.grid = element_blank()),
        scale_x_continuous(expand = c(1e-2, 1e-2)),
        labs(x = expression(K[s]))
    )
}
218 | 
219 | 
220 | #' Plot distributions of substitution rates (Ka, Ks, or Ka/Ks) per species
221 | #'
222 | #' @param kaks_list A list of data frames with substitution rates per gene
223 | #' pair in each species as returned by \code{pairs2kaks()}.
224 | #' @param rate_column Character indicating the name of the column to plot.
225 | #' Default: "Ks".
226 | #' @param bytype Logical indicating whether or not to show distributions by
227 | #' type of duplication. Default: FALSE.
228 | #' @param range Numeric vector of length 2 indicating the minimum and maximum
229 | #' values to plot. Default: \code{c(0, 2)}.
230 | #' @param fill Character with color to use for the fill aesthetic. Ignored
231 | #' if \strong{bytype = TRUE}. Default: "deepskyblue3".
232 | #' @param color Character with color to use for the color aesthetic. Ignored
233 | #' if \strong{bytype = FALSE}. Default: "deepskyblue4".
234 | #' 
235 | #' @return A ggplot object.
236 | #'
237 | #' @details
238 | #' Data will be plotted using the species order of the list. To change the
239 | #' order of the species to plot, reorder the list elements 
240 | #' in \strong{kaks_list}.
241 | #' 
242 | #' @importFrom ggplot2 geom_violin scale_fill_manual facet_wrap
243 | #' @export
244 | #' @rdname plot_rates_by_species
245 | #' @examples
246 | #' data(fungi_kaks)
247 | #'
248 | #' # Plot rates
249 | #' plot_rates_by_species(fungi_kaks, rate_column = "Ka_Ks") 
250 | plot_rates_by_species <- function(
251 |         kaks_list, rate_column = "Ks", bytype = FALSE, range = c(0, 2),
252 |         fill = "deepskyblue3", color = "deepskyblue4"
253 | ) {
254 |     # Axis label; `switch()` also rejects unsupported rate columns
255 |     xl <- switch(
256 |         rate_column,
257 |         "Ks" = expression(K[s]),
258 |         "Ka" = expression(K[a]),
259 |         "Ka_Ks" = expression(K[a] / K[s]),
260 |         stop("Input to 'rate_column' must be one of 'Ka', 'Ks', or 'Ka_Ks'.")
261 |     )
262 |     stopifnot(is.numeric(range), length(range) == 2)
263 |     
264 |     # Unnamed lists would silently drop the `species` column: use indices
265 |     species <- names(kaks_list)
266 |     if(is.null(species)) { species <- as.character(seq_along(kaks_list)) }
267 |     
268 |     # From list to data frame (one rbind call instead of pairwise Reduce)
269 |     rate_df <- do.call(rbind, lapply(seq_along(kaks_list), function(x) {
270 |         df <- kaks_list[[x]]
271 |         df$species <- rep(species[x], nrow(df)) # safe for 0-row data frames
272 |         return(df)
273 |     }))
274 |     if(bytype && !"type" %in% names(rate_df)) {
275 |         stop("Column 'type' is required in all data frames if 'bytype = TRUE'.")
276 |     }
277 |     rate <- rate_df[[rate_column]]
278 |     rate_df <- rate_df[!is.na(rate) & rate >= range[1] & rate <= range[2], ]
279 |     rate_df$species <- factor(rate_df$species, levels = species)
280 |     
281 |     # Create plot: one violin per species, optionally split by duplication type
282 |     p <- ggplot(rate_df, aes(x = .data[[rate_column]], y = .data$species))
283 |     if(bytype) {
284 |         p <- p + geom_violin(aes(fill = .data$type), show.legend = FALSE) +
285 |             scale_fill_manual(values = dup_palette()) +
286 |             facet_wrap("type", nrow = 1)
287 |     } else {
288 |         p <- p + geom_violin(fill = fill, color = color)
289 |     }
290 |     
291 |     # Polish the plot
292 |     p <- p + theme_bw() + labs(x = xl, y = "") +
293 |         theme(panel.grid = element_blank())
294 |     return(p)
295 | }
296 | 
297 | 
298 | #' Plot histogram of Ks distribution with peaks
299 | #'
300 | #' @param peaks A list with elements \strong{mean}, \strong{sd}, 
301 | #' \strong{lambda}, and \strong{ks}, as returned by the 
302 | #' function \code{find_ks_peaks()}.
303 | #' @param binwidth Numeric scalar with binwidth for the histogram.
304 | #' Default: 0.05.
305 | #'
306 | #' @return A ggplot object with a histogram and lines for each Ks peak.
307 | #'
308 | #' @importFrom ggplot2 ggplot aes geom_histogram ggplot stat_function
309 | #' labs theme_bw
310 | #' @importFrom stats dnorm
311 | #' @importFrom rlang .data
312 | #' @rdname plot_ks_peaks
313 | #' @export
314 | #' @examples 
315 | #' data(fungi_kaks)
316 | #' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
317 | #' ks <- scerevisiae_kaks$Ks
318 | #' 
319 | #' # Find 2 peaks in Ks distribution
320 | #' peaks <- find_ks_peaks(ks, npeaks = 2)
321 | #'
322 | #' # Plot
323 | #' plot_ks_peaks(peaks, binwidth = 0.05)
324 | plot_ks_peaks <- function(peaks = NULL, binwidth = 0.05) {
325 |     
326 |     if(!is.list(peaks) || !all(c("mean", "sd", "lambda", "ks") %in% names(peaks))) {
327 |         stop("`peaks` must be a list with elements 'mean', 'sd', 'lambda', and 'ks'.")
328 |     }
329 |     ks_df <- data.frame(ks = peaks$ks)
330 |     
331 |     # Define color palette (recycled if there are more peaks than colors)
332 |     pal <- c(
333 |         "#6A6599FF", "#79AF97FF", "#B24745FF", "#00A1D5FF", 
334 |         "#DF8F44FF", "#374E55FF", "#F39B7FFF", "#3C5488FF"
335 |     )
336 |     pal <- rev(rep_len(pal, length(peaks$mean)))
337 |     
338 |     # One Gaussian per peak, rescaled from density to histogram counts
339 |     curves <- Map(function(mean, sd, lambda, color) {
340 |         stat_function(geom = "line", fun = function(x) {
341 |             dnorm(x, mean = mean, sd = sd) * nrow(ks_df) * binwidth * lambda
342 |         }, color = color, linewidth = 1.5)
343 |     }, peaks$mean, peaks$sd, peaks$lambda, pal)
344 |     
345 |     # Plot (`Map()` always returns a list, which `+` adds layer by layer)
346 |     p <- ggplot(ks_df, aes(x = .data$ks)) +
347 |         geom_histogram(binwidth = binwidth, color = "black", fill = "grey80") +
348 |         curves + theme_bw() + theme(panel.grid = element_blank()) +
349 |         labs(title = "Ks distribution with peaks", y = "Frequency",
350 |              x = "Ks values")
351 |     
352 |     return(p)
353 | }


--------------------------------------------------------------------------------
/R/utils_duplicate_classification.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #' Classify gene pairs derived from segmental duplications
  4 | #'
  5 | #' @param anchor_pairs A 2-column data frame with anchor pairs in columns 1
  6 | #' and 2.
  7 | #' @param pairs A 2-column data frame with all duplicate pairs. This
  8 | #' is equivalent to the first 2 columns of the tabular output of BLAST-like
  9 | #' programs.
 10 | #'
 11 | #' @return A 3-column data frame with the variables:
 12 | #' \describe{
 13 | #'   \item{dup1}{Character, duplicated gene 1}
 14 | #'   \item{dup2}{Character, duplicated gene 2}
 15 | #'   \item{type}{Factor indicating duplication types, with levels
 16 | #'               "SD" (segmental duplication) or 
 17 | #'               "DD" (dispersed duplication).}
 18 | #' }
 19 | #' @rdname get_segmental
 20 | #' @export
 21 | #' @examples
 22 | #' data(diamond_intra)
 23 | #' data(yeast_annot)
 24 | #' data(yeast_seq)
 25 | #' blast_list <- diamond_intra
 26 | #' 
 27 | #' # Get processed annotation for S. cerevisiae
 28 | #' annotation <- syntenet::process_input(yeast_seq, yeast_annot)$annotation[1]
 29 | #' 
 30 | #' # Get list of intraspecies anchor pairs
 31 | #' anchor_pairs <- get_anchors_list(blast_list, annotation)
 32 | #' anchor_pairs <- anchor_pairs[[1]][, c(1, 2)]
 33 | #' 
 34 | #' # Get duplicate pairs from DIAMOND output
 35 | #' duplicates <- diamond_intra[[1]][, c(1, 2)]
 36 | #' dups <- get_segmental(anchor_pairs, duplicates)
 37 | get_segmental <- function(anchor_pairs = NULL, pairs = NULL) {
 38 |     
 39 |     if(!is.data.frame(pairs) || ncol(pairs) < 2) {
 40 |         stop("`pairs` must be a data frame with gene pairs in columns 1 and 2.")
 41 |     }
 42 |     names(pairs)[c(1, 2)] <- c("dup1", "dup2")
 43 |     
 44 |     # Remove self-hits; remaining pairs are "DD" unless they are anchor pairs
 45 |     self_hit <- as.character(pairs$dup1) == as.character(pairs$dup2)
 46 |     duplicates <- pairs[!self_hit, ]
 47 |     duplicates$type <- rep("DD", nrow(duplicates)) # rep() handles 0 rows
 48 |     
 49 |     if(!is.null(anchor_pairs)) {
 50 |         a1 <- as.character(anchor_pairs[[1]])
 51 |         a2 <- as.character(anchor_pairs[[2]])
 52 |         
 53 |         # Tab-separated keys (IDs from tab-delimited tables cannot contain
 54 |         # tabs); plain concatenation would confuse "g1"+"23" with "g12"+"3"
 55 |         pair_keys <- paste(duplicates$dup1, duplicates$dup2, sep = "\t")
 56 |         anchor_keys <- c(paste(a1, a2, sep = "\t"), paste(a2, a1, sep = "\t"))
 57 |         # Anchor pairs (in either orientation) are segmental duplicates
 58 |         duplicates$type <- ifelse(pair_keys %in% anchor_keys, "SD", "DD")
 59 |     }
 60 |     duplicates$type <- factor(duplicates$type, levels = c("SD", "DD"))
 61 |     
 62 |     return(duplicates)
 63 | }
 64 | 
 65 | 
 66 | #' Classify gene pairs derived from tandem and proximal duplications
 67 | #' 
 68 | #' @param pairs A 3-column data frame with columns \strong{dup1}, \strong{dup2},
 69 | #' and \strong{type} indicating duplicated gene 1, duplicated gene 2, and
 70 | #' the mode of duplication associated with the pair. This data frame
 71 | #' is returned by \code{get_segmental()}.
 72 | #' @param annotation_granges A processed GRanges object as in each element 
 73 | #' of the list returned by \code{syntenet::process_input()}.
 74 | #' @param proximal_max Numeric scalar with the maximum distance (in number
 75 | #' of genes) between two genes to consider them as proximal duplicates.
 76 | #' Default: 10.
 77 | #'
 78 | #' @return A 3-column data frame with the variables:
 79 | #' \describe{
 80 | #'   \item{dup1}{Character, duplicated gene 1.}
 81 | #'   \item{dup2}{Character, duplicated gene 2.}
 82 | #'   \item{type}{Factor of duplication types, with levels
 83 | #'               "SD" (segmental duplication),
 84 | #'               "TD" (tandem duplication), 
 85 | #'               "PD" (proximal duplication), and
 86 | #'               "DD" (dispersed duplication).}
 87 | #' }
 88 | #' @rdname get_tandem_proximal
 89 | #' @export
 90 | #' @examples
 91 | #' data(yeast_annot)
 92 | #' data(yeast_seq)
 93 | #' data(fungi_kaks)
 94 | #' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
 95 | #' 
 96 | #' # Get processed annotation for S. cerevisiae
 97 | #' pdata <- annotation <- syntenet::process_input(yeast_seq, yeast_annot)
 98 | #' annot <- pdata$annotation[[1]]
 99 | #' 
100 | #' # Get duplicated pairs
101 | #' pairs <- scerevisiae_kaks[, c("dup1", "dup2", "type")]
102 | #' pairs$dup1 <- paste0("Sce_", pairs$dup1)
103 | #' pairs$dup2 <- paste0("Sce_", pairs$dup2)
104 | #' 
105 | #' # Get tandem and proximal duplicates
106 | #' td_pd_pairs <- get_tandem_proximal(pairs, annot)
107 | #' 
108 | get_tandem_proximal <- function(
109 |         pairs = NULL, annotation_granges = NULL, proximal_max = 10
110 | ) {
111 |     
112 |     annot <- as.data.frame(annotation_granges)
113 |     annot <- annot[, c("seqnames", "gene", "start", "end")]
114 |     pairs$type <- as.character(pairs$type)
115 |     pairs <- pairs[, c("dup1", "dup2", "type")] # ignore any extra columns
116 |     is_dd <- pairs$type == "DD"
117 |     
118 |     # Rank genes by start position within each chromosome. After sorting,
119 |     # chromosomes are contiguous runs, so no split/rbind loop is needed
120 |     annot <- annot[order(annot$seqnames, annot$start), ]
121 |     annot$order <- sequence(rle(as.character(annot$seqnames))$lengths)
122 |     
123 |     # Position tables for each member of the pair
124 |     pos1 <- pos2 <- annot[, c("seqnames", "gene", "order")]
125 |     names(pos1) <- c("chr_dup1", "dup1", "order_dup1")
126 |     names(pos2) <- c("chr_dup2", "dup2", "order_dup2")
127 |     
128 |     # Add chromosome and gene order to DD pairs (pairs with genes that are
129 |     # absent from the annotation are dropped by the inner joins)
130 |     ssd_pos <- merge(pairs[is_dd, ], pos1, by = "dup1")
131 |     ssd_pos <- merge(ssd_pos, pos2, by = "dup2", sort = FALSE)
132 |     
133 |     # Find tandem (adjacent) and proximal (<= `proximal_max` genes apart)
134 |     # duplicates; `which()` drops NA comparisons
135 |     same_chr <- ssd_pos$chr_dup1 == ssd_pos$chr_dup2
136 |     dist <- abs(ssd_pos$order_dup1 - ssd_pos$order_dup2)
137 |     ssd_pos$type[which(same_chr & dist == 1)] <- "TD"
138 |     ssd_pos$type[which(same_chr & dist > 1 & dist <= proximal_max)] <- "PD"
139 |     
140 |     duplicates <- rbind(
141 |         pairs[!is_dd, ], ssd_pos[, c("dup1", "dup2", "type")]
142 |     )
143 |     l <- c("SD", "TD", "PD", "DD")
144 |     duplicates$type <- factor(duplicates$type, levels = l)
145 |     
146 |     return(duplicates)
147 | }
148 | 
149 | 
150 | #' Get syntenic block ID for each gene in a gene pair
151 | #' 
152 | #' @param pairs A 2-column data frame with gene pairs.
153 | #' @param syn_df A 2-column data frame with gene ID in column 1, and synteny
154 | #' block ID in column 2.
155 | #'
156 | #' @return A data frame of 4 columns as below:
157 | #' \describe{
158 | #'   \item{dup1}{Character, ID of duplicated gene 1.}
159 | #'   \item{dup2}{Character, ID of duplicated gene 2.}
160 | #'   \item{block1}{Numeric, syntenic block ID of gene 1.}
161 | #'   \item{block2}{Numeric, syntenic block ID of gene 2.}
162 | #' }
163 | #'
164 | #' @noRd
165 | pairs_and_synblocks <- function(pairs, syn_df) {
166 |     
167 |     # Standardize names of the key columns in both tables
168 |     colnames(pairs)[c(1, 2)] <- c("dup1", "dup2")
169 |     colnames(syn_df)[c(1, 2)] <- c("anchor", "block")
170 |     gene_pairs <- pairs[, c("dup1", "dup2")]
171 |     
172 |     # Left joins: genes that are not in `syn_df` get NA as block ID
173 |     with_block1 <- merge(
174 |         gene_pairs, syn_df, by.x = "dup1", by.y = "anchor", all.x = TRUE
175 |     )
176 |     with_blocks <- merge(
177 |         with_block1, syn_df, by.x = "dup2", by.y = "anchor", all.x = TRUE
178 |     )
179 |     colnames(with_blocks)[3:4] <- c("block1", "block2")
180 |     return(with_blocks)
181 | }
182 | 
183 | 
184 | #' Classify gene pairs originating from transposon-derived duplications
185 | #'
186 | #' @param pairs A 3-column data frame with columns \strong{dup1}, \strong{dup2},
187 | #' and \strong{type} indicating duplicated gene 1, duplicated gene 2, and
188 | #' the mode of duplication associated with the pair. This data frame
189 | #' is returned by \code{get_tandem_proximal()}.
190 | #' @param blast_inter A list of data frames of length 1 
191 | #' containing BLAST tabular output for the comparison between the target
192 | #' species and an outgroup. Names of list elements must match the names of 
193 | #' list elements in `annotation`. BLASTp, DIAMOND or similar programs must 
194 | #' be run on processed sequence data as returned 
195 | #' by \code{syntenet::process_input()}.
196 | #' @param annotation A processed GRangesList or CompressedGRangesList object as
197 | #' returned by \code{syntenet::process_input()}.
198 | #' @param evalue Numeric scalar indicating the E-value threshold. 
199 | #' Default: 1e-10.
200 | #' @param anchors Numeric indicating the minimum required number of genes
201 | #' to call a syntenic block, as in \code{syntenet::infer_syntenet}. 
202 | #' Default: 5.
203 | #' @param max_gaps Numeric indicating the number of upstream and downstream
204 | #' genes to search for anchors, as in \code{syntenet::infer_syntenet}. 
205 | #' Default: 25.
206 | #' @param collinearity_dir Character indicating the path to the directory
207 | #' where .collinearity files will be stored. If NULL, files will
208 | #' be stored in a subdirectory of \code{tempdir()}. Default: NULL.
209 | #' @param outgroup_coverage Numeric indicating the minimum percentage of 
210 | #' outgroup species to use to consider genes as transposed duplicates. Only
211 | #' valid if multiple outgroup species are present (see details below). Values
212 | #' should range from 0 to 100. Default: 70.
213 | #' 
214 | #'
215 | #' @return A 3-column data frame with the following variables:
216 | #' \describe{
217 | #'   \item{dup1}{Character, duplicated gene 1.}
218 | #'   \item{dup2}{Character, duplicated gene 2.}
219 | #'   \item{type}{Factor of duplication types, with levels
220 | #'               "SD" (segmental duplication),
221 | #'               "TD" (tandem duplication), 
222 | #'               "PD" (proximal duplication), 
223 | #'               "TRD" (transposon-derived duplication), and
224 | #'               "DD" (dispersed duplication).}
225 | #' }
226 | #' 
227 | #' @details 
228 | #' If the list of interspecies DIAMOND tables contain comparisons of the
229 | #' same species to multiple outgroups (e.g., 
230 | #' 'speciesA_speciesB', 'speciesA_speciesC'), this function will check if
231 | #' gene pairs are classified as transposed (i.e.,
232 | #' only one gene is an ancestral locus) in each of the outgroup species,
233 | #' and then calculate the percentage of outgroup species in which each pair
234 | #' is considered 'transposed'. For instance, gene pair 1 is transposed based on
235 | #' 30\% of the outgroup species, gene pair 2 is considered as transposed based 
236 | #' on 100\% of the outgroup species, gene pair 3 is considered as transposed
237 | #' based on 0\% of the outgroup species, and so on. 
238 | #' Parameter \strong{outgroup_coverage} lets you choose a minimum percentage 
239 | #' cut-off to classify pairs as transposed.
240 | #' 
241 | #' @importFrom syntenet interspecies_synteny
242 | #' @export
243 | #' @rdname get_transposed
244 | #' @examples 
245 | #' # Load example data
246 | #' data(diamond_inter)
247 | #' data(yeast_seq)
248 | #' data(yeast_annot)
249 | #' data(fungi_kaks)
250 | #' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
251 | #' 
252 | #' # Get processed annotation
253 | #' pdata <- syntenet::process_input(yeast_seq, yeast_annot)
254 | #' annotation <- pdata$annotation
255 | #' 
256 | #' # Get duplicated pairs
257 | #' pairs <- scerevisiae_kaks[, c("dup1", "dup2", "type")]
258 | #' pairs$dup1 <- paste0("Sce_", pairs$dup1)
259 | #' pairs$dup2 <- paste0("Sce_", pairs$dup2)
260 | #' 
261 | #' # Collapse bidirectional hits
262 | #' compare <- data.frame(target = "Scerevisiae", outgroup = "Cglabrata")
263 | #' blast_inter <- syntenet::collapse_bidirectional_hits(diamond_inter, compare)
264 | #' 
265 | #' # Classify pairs
266 | #' trd <- get_transposed(pairs, blast_inter, annotation)
267 | #' 
268 | get_transposed <- function(
269 |         pairs, blast_inter, annotation, 
270 |         evalue = 1e-10, anchors = 5, max_gaps = 25,
271 |         collinearity_dir = NULL, outgroup_coverage = 70
272 | ) {
273 |     
274 |     blast_inter <- lapply(blast_inter, function(x) return(x[x$evalue <= evalue, ]))
275 |     pairs$type <- as.character(pairs$type)
276 |     pairs_dd <- pairs[pairs$type == "DD", ]
277 |     
278 |     # Define directory where interspecies .collinearity files will be stored
279 |     interdir <- collinearity_dir
280 |     if(is.null(interdir)) {
281 |         daytime <- format(Sys.time(), "%d_%b_%Y_%Hh%M")
282 |         interdir <- file.path(tempdir(), paste0("inter_", daytime))
283 |     }
284 |     
285 |     # Get target and outgroup species of each comparison by matching full
286 |     # 'target_outgroup' names. Unlike separate regex searches for prefixes and
287 |     # suffixes, this keeps species aligned with the order of `blast_inter`
288 |     # and is not confused by species names that contain one another
289 |     sp <- names(annotation)
290 |     combos <- expand.grid(target = sp, outgroup = sp, stringsAsFactors = FALSE)
291 |     idx <- match(names(blast_inter), paste0(combos$target, "_", combos$outgroup))
292 |     blast_inter <- blast_inter[!is.na(idx)]
293 |     combos <- combos[idx[!is.na(idx)], , drop = FALSE]
294 |     if(nrow(combos) == 0) {
295 |         stop("Names of `blast_inter` must be 'target_outgroup', with species as in `annotation`.")
296 |     }
297 |     # For each outgroup, get data frame indicating if pair is transposed
298 |     trd_df <- lapply(seq_len(nrow(combos)), function(n) {
299 |         # Detect syntenic regions between `target` and `outgroup`
300 |         syn <- syntenet::interspecies_synteny(
301 |             blast_inter[n],
302 |             annotation = annotation[c(combos$target[n], combos$outgroup[n])],
303 |             inter_dir = interdir, anchors = anchors, max_gaps = max_gaps
304 |         )
305 |         
306 |         # Default if no syntenic blocks are available: nothing is ancestral
307 |         pairs_ancestral <- pairs_dd[, c(1, 2)]
308 |         pairs_ancestral$ancestral <- rep(FALSE, nrow(pairs_ancestral))
309 |         
310 |         # Read and parse interspecies synteny results. NULL is checked before
311 |         # subsetting, because `NULL[, cols]` is an error
312 |         blocks <- collinearity2blocks(syn)
313 |         if(!is.null(blocks)) {
314 |             parsed_syn <- blocks[, c("anchor1", "block")]
315 |             parsed_syn <- parsed_syn[!duplicated(parsed_syn$anchor1), ]
316 |             
317 |             # Find TRD-derived genes (only one member of pair in ancestral loci)
318 |             pairs_ancestral <- pairs_and_synblocks(pairs_dd, parsed_syn)
319 |             nas <- rowSums(is.na(pairs_ancestral[, c("block1", "block2")]))
320 |             pairs_ancestral$ancestral <- nas == 1
321 |             pairs_ancestral <- pairs_ancestral[, c("dup1", "dup2", "ancestral")]
322 |         }
323 |         
324 |         # Unique column name per outgroup, so merging never duplicates names
325 |         names(pairs_ancestral)[3] <- paste0("ancestral", n)
326 |         return(pairs_ancestral)
327 |     })
328 |     nout <- length(trd_df)
329 |     trd_df <- Reduce(function(x, y) merge(x, y, by = c("dup1", "dup2"), all = TRUE), trd_df)
330 |     
331 |     # Calculate percentage of outgroups in which pair is classified as transposed
332 |     perc_trd <- (rowSums(trd_df[, -c(1,2), drop = FALSE]) / nout) * 100
333 |     
334 |     # Classify pairs as TRD if percentage >= outgroup_coverage
335 |     trd_df$type <- ifelse(perc_trd >= outgroup_coverage, "TRD", "DD")
336 |     final <- rbind(
337 |         pairs[pairs$type != "DD", ],
338 |         trd_df[, c("dup1", "dup2", "type")]
339 |     )
340 |     final$type <- factor(final$type, levels = c("SD", "TD", "PD", "TRD", "DD"))
341 |     
342 |     return(final)
343 | }
344 | 
345 | 
346 | 
347 | #' Classify TRD genes as derived from either DNA transposons or retrotransposons
348 | #'
349 | #' @param pairs A 3-column data frame with columns \strong{dup1}, \strong{dup2},
350 | #' and \strong{type} indicating duplicated gene 1, duplicated gene 2, and
351 | #' the mode of duplication associated with the pair. This data frame
352 | #' is returned by \code{get_transposed()}.
353 | #' @param intron_counts A 2-column data frame with columns \strong{gene}
354 | #' and \strong{introns} indicating the number of introns for each gene,
355 | #' as returned by \code{get_intron_counts}.
356 | #'
357 | #' @return A 3-column data frame with the following variables:
358 | #' \describe{
359 | #'   \item{dup1}{Character, duplicated gene 1.}
360 | #'   \item{dup2}{Character, duplicated gene 2.}
361 | #'   \item{type}{Factor of duplication types, with levels
362 | #'               "SD" (segmental duplication),
363 | #'               "TD" (tandem duplication), 
364 | #'               "PD" (proximal duplication), 
365 | #'               "dTRD" (DNA transposon-derived duplication),
366 | #'               "rTRD" (retrotransposon-derived duplication), and
367 | #'               "DD" (dispersed duplication).}
368 | #' }
369 | #' 
370 | #' @rdname get_transposed_classes
371 | #' @export
372 | #' @examples
373 | #' data(diamond_inter)
374 | #' data(diamond_intra)
375 | #' data(yeast_seq)
376 | #' data(yeast_annot)
377 | #' data(fungi_kaks)
378 | #' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
379 | #' 
380 | #' # Get processed annotation
381 | #' pdata <- syntenet::process_input(yeast_seq, yeast_annot)
382 | #' annotation <- pdata$annotation
383 | #' 
384 | #' # Get duplicated pairs
385 | #' pairs <- scerevisiae_kaks[, c("dup1", "dup2", "type")]
386 | #' pairs$dup1 <- paste0("Sce_", pairs$dup1)
387 | #' pairs$dup2 <- paste0("Sce_", pairs$dup2)
388 | #' 
389 | #' # Classify pairs
390 | #' trd <- get_transposed(pairs, diamond_inter, annotation)
391 | #' 
392 | #' # Create TxDb object from GRanges
393 | #' library(txdbmaker)
394 | #' txdb <- txdbmaker::makeTxDbFromGRanges(yeast_annot[[1]])
395 | #'
396 | #' # Get intron counts
397 | #' intron_counts <- get_intron_counts(txdb)
398 | #'
399 | #' # Get TRD classes
400 | #' trd_classes <- get_transposed_classes(trd, intron_counts)
401 | #'
402 | get_transposed_classes <- function(pairs, intron_counts) {
403 |     
404 |     # Get TRD pairs (`%in%` never returns NA, unlike `==`)
405 |     pairs$type <- as.character(pairs$type)
406 |     is_trd <- pairs$type %in% "TRD"
407 |     tpairs <- pairs[is_trd, ]
408 |     
409 |     final_pairs <- pairs
410 |     if(nrow(tpairs) > 0) {
411 |         # Fail early if genes cannot be matched to intron counts
412 |         if(!"gene" %in% names(intron_counts) || ncol(intron_counts) < 2) {
413 |             stop("`intron_counts` must have a column 'gene' and a column with intron counts.")
414 |         }
415 |         
416 |         # Remove species IDs only to look genes up in `intron_counts`.
417 |         # Original IDs are returned untouched, so pairs from several species
418 |         # (or with IDs lacking the expected prefix) are never relabeled
419 |         key1 <- gsub("^[a-zA-Z]{2,5}_", "", tpairs$dup1)
420 |         key2 <- gsub("^[a-zA-Z]{2,5}_", "", tpairs$dup2)
421 |         
422 |         # Intron counts are in the first column other than `gene`;
423 |         # genes that are absent from `intron_counts` get NA
424 |         n_introns <- intron_counts[[setdiff(names(intron_counts), "gene")[1]]]
425 |         introns1 <- n_introns[match(key1, intron_counts$gene)]
426 |         introns2 <- n_introns[match(key2, intron_counts$gene)]
427 |         
428 |         # Number of genes in pair with 0 introns (NA if a count is unknown)
429 |         count <- (introns1 == 0) + (introns2 == 0)
430 |         
431 |         # 'rTRD' if only one gene has no introns, as 'dTRD' otherwise
432 |         # (pairs keep NA as type if an intron count is unknown)
433 |         tpairs$type <- ifelse(count == 1, "rTRD", "dTRD")
434 |         
435 |         # Input order is kept: non-TRD pairs first, then reclassified pairs
436 |         final_pairs <- rbind(pairs[!is_trd, ], tpairs)
437 |     }
438 |     
439 |     l <- c("SD", "TD", "PD", "rTRD", "dTRD", "DD")
440 |     final_pairs$type <- factor(final_pairs$type, levels = l)
441 |     
442 |     return(final_pairs)
443 | }
444 | 
445 | 


--------------------------------------------------------------------------------
/vignettes/doubletrouble_vignette.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Identification and classification of duplicated genes"
  3 | author: 
  4 |   - name: Fabricio Almeida-Silva
  5 |     affiliation: |
  6 |       VIB-UGent Center for Plant Systems Biology, Ghent University, 
  7 |       Ghent, Belgium
  8 |   - name: Yves Van de Peer
  9 |     affiliation: |
 10 |       VIB-UGent Center for Plant Systems Biology, Ghent University, 
 11 |       Ghent, Belgium
 12 | output: 
 13 |   BiocStyle::html_document:
 14 |     toc: true
 15 |     number_sections: yes
 16 | bibliography: bibliography.bib
 17 | vignette: >
 18 |   %\VignetteIndexEntry{Identification and classification of duplicated genes}
 19 |   %\VignetteEngine{knitr::rmarkdown}
 20 |   %\VignetteEncoding{UTF-8}  
 21 | ---
 22 | 
 23 | ```{r setup, include = FALSE}
 24 | knitr::opts_chunk$set(
 25 |     collapse = TRUE,
 26 |     comment = "#>",
 27 |     crop = NULL
 28 | )
 29 | ```
 30 | 
 31 | # Introduction
 32 | 
 33 | Gene and genome duplications are a source of raw genetic material for evolution
 34 | [@ohno2013evolution]. However, whole-genome duplications (WGD) and small-scale
 35 | duplications (SSD) contribute to genome evolution in different manners. To
 36 | help you explore the different contributions of WGD and SSD to evolution, we
 37 | developed `r BiocStyle::Biocpkg("doubletrouble")`, a package that can be
 38 | used to identify and classify duplicated genes from whole-genome 
 39 | protein sequences, calculate substitution rates per substitution site (i.e.,
 40 | $K_a$ and $K_s$) for gene pairs, find peaks in $K_s$ distributions, and classify
 41 | gene pairs by age groups.
 42 | 
 43 | # Installation
 44 | 
 45 | You can install `r BiocStyle::Biocpkg("doubletrouble")` from Bioconductor
 46 | with the following code:
 47 | 
 48 | ```{r installation, eval = FALSE}
 49 | if(!requireNamespace("BiocManager", quietly = TRUE)) {
 50 |     install.packages("BiocManager")
 51 | }
 52 | 
 53 | BiocManager::install("doubletrouble")
 54 | 
 55 | ## Check that you have a valid Bioconductor installation
 56 | BiocManager::valid()
 57 | ```
 58 | 
 59 | Then, you can load the package:
 60 | 
 61 | ```{r load_package}
 62 | library(doubletrouble)
 63 | ```
 64 | 
 65 | # Input data
 66 | 
 67 | To identify and classify duplicated gene pairs, users need two mandatory types
 68 | of input data, plus an optional third one:
 69 | 
 70 | 1. Whole-genome protein sequences (a.k.a. "proteome"), with only one protein 
 71 | sequence per gene (i.e., translated sequence of the primary transcript). 
 72 | These are typically stored in *.fasta* files.
 73 | 
 74 | 2. Gene annotation, with genomic coordinates of all features (i.e., genes, exons, 
 75 | etc). These are typically stored in *.gff3/.gff/.gtf* files.
 76 | 
 77 | 3. (Optional) Coding sequences (CDS), with only one DNA sequence per 
 78 | gene. These are only required for users who want to calculate 
 79 | substitution rates (i.e., $K_a$, $K_s$, and their ratio $K_a/K_s$), 
 80 | and they are typically stored in *.fasta* files.
 81 | 
 82 | In the Bioconductor ecosystem, sequences and ranges are stored in 
 83 | standardized S4 classes named 
 84 | `XStringSet` (`AAStringSet` for proteins, `DNAStringSet` for DNA) and `GRanges`, 
 85 | respectively. This ensures seamless interoperability across packages, which
 86 | is important for both users and package developers. 
 87 | Thus, `r BiocStyle::Biocpkg("doubletrouble")` expects 
 88 | proteomes in lists of `AAStringSet` objects, and annotations in lists of
 89 | `GRanges` objects. Below you can find a summary of input data types, their
 90 | typical file formats, and Bioconductor class.
 91 | 
 92 | | Input data | File format | Bioconductor class | Requirement |
 93 | |:-----------|:------------|:-------------------|:------------|
 94 | | Proteome   | FASTA       | `AAStringSet`      | Mandatory   |
 95 | | Annotation | GFF3/GTF    | `GRanges`          | Mandatory   |
 96 | | CDS        | FASTA       | `DNAStringSet`     | Optional    |
 97 | 
 98 | Names of list elements represent species identifiers
 99 | (e.g., name, abbreviations, taxonomy IDs, or anything you like), and **must**
100 | be consistent across different lists, so correspondence can be made. 
101 | For instance, suppose you have an object named `seqs` with a list of 
102 | `AAStringSet` objects (proteomes for each species) 
103 | named *Athaliana*, *Alyrata*, and *Bnapus*. You also have an object 
104 | named `annotation` with a list of `GRanges` objects (gene annotation for
105 | each species). In this example, list names in `annotation` must also be
106 | *Athaliana*, *Alyrata*, and *Bnapus* (not necessarily in that order), so that
107 | `r BiocStyle::Biocpkg("doubletrouble")` knows that element *Athaliana* 
108 | in `seqs` corresponds to element *Athaliana* in `annotation`. You can check
109 | that with:
110 | 
111 | ```{r eval=FALSE}
112 | # Checking if names of lists match
113 | setequal(names(seqs), names(annotation)) # should return TRUE
114 | ```
115 | 
116 | **IMPORTANT:** If you have protein sequences as FASTA files in a directory,
117 | you can read them into a list of `AAStringSet` objects with the function
118 | `fasta2AAStringSetlist()` from the Bioconductor package
119 | `r BiocStyle::Biocpkg("syntenet")`. Likewise, you can get a `GRangesList`
120 | object from GFF/GTF files with the function `gff2GRangesList()`, also
121 | from `r BiocStyle::Biocpkg("syntenet")`.
122 | 
123 | 
124 | # Getting to know the example data sets
125 | 
126 | In this vignette, we will use data (proteome, gene annotations, and CDS) from 
127 | the yeast species *Saccharomyces cerevisiae* and *Candida glabrata*, 
128 | since their genomes are relatively small (and, hence, great for 
129 | demonstration purposes). Our goal here is to identify and classify duplicated 
130 | genes in the *S. cerevisiae* genome. The *C. glabrata* genome will be used as an outgroup to identify transposed duplicates later in this vignette.
131 | 
132 | 
133 | Data were obtained from Ensembl Fungi release 54 [@yates2022ensembl].
134 | While you can download these data manually from the Ensembl Fungi webpage (or
135 | through another repository such as NCBI RefSeq), here we will demonstrate how
136 | you can get data from Ensembl using the `r BiocStyle::Biocpkg("biomartr")` 
137 | package.
138 | 
139 | ```{r eval=FALSE}
140 | species <- c("Saccharomyces cerevisiae", "Candida glabrata")
141 | 
142 | # Download data from Ensembl with {biomartr}
143 | ## Whole-genome protein sequences (.fa)
144 | fasta_dir <- file.path(tempdir(), "proteomes")
145 | fasta_files <- biomartr::getProteomeSet(
146 |     db = "ensembl", organisms = species, path = fasta_dir
147 | )
148 | 
149 | ## Gene annotation (.gff3)
150 | gff_dir <- file.path(tempdir(), "annotation")
151 | gff_files <- biomartr::getGFFSet(
152 |     db = "ensembl", organisms = species, path = gff_dir
153 | )
154 | 
155 | ## CDS (.fa)
156 | cds_dir <- file.path(tempdir(), "CDS")
157 | cds_files <- biomartr::getCDSSet(
158 |     db = "ensembl", organisms = species, path = cds_dir
159 | )
160 | 
161 | # Import data to the R session
162 | ## Read .fa files with proteomes as a list of AAStringSet + clean names
163 | seq <- syntenet::fasta2AAStringSetlist(fasta_dir)
164 | names(seq) <- gsub("\\..*", "", names(seq))
165 | 
166 | ## Read .gff3 files as a list of GRanges
167 | annot <- syntenet::gff2GRangesList(gff_dir)
168 | names(annot) <- gsub("\\..*", "", names(annot))
169 | 
170 | ## Read .fa files with CDS as a list of DNAStringSet objects
171 | cds <- lapply(cds_files, Biostrings::readDNAStringSet)
172 | names(cds) <- gsub("\\..*", "", basename(cds_files))
173 | 
174 | # Process data
175 | ## Keep ranges for protein-coding genes only
176 | yeast_annot <- lapply(annot, function(x) {
177 |     gene_ranges <- x[x$biotype == "protein_coding" & x$type == "gene"]
178 |     gene_ranges <- IRanges::subsetByOverlaps(x, gene_ranges)
179 |     return(gene_ranges)
180 | })
181 | 
182 | ## Keep only longest sequence for each protein-coding gene (no isoforms)
183 | yeast_seq <- lapply(seq, function(x) {
184 |     # Keep only protein-coding genes
185 |     x <- x[grep("protein_coding", names(x))]
186 |     
187 |     # Leave only gene IDs in sequence names
188 |     names(x) <- gsub(".*gene:| .*", "", names(x))
189 |     
190 |     # If isoforms are present (same gene ID multiple times), keep the longest
191 |     x <- x[order(Biostrings::width(x), decreasing = TRUE)]
192 |     x <- x[!duplicated(names(x))]
193 |     
194 |     return(x)
195 | })
196 | ```
197 | 
198 | Note that processing might differ depending on the data source. For instance, 
199 | Ensembl adds gene 'biotypes' (e.g., protein-coding, pseudogene, etc) in sequence
200 | names and in a field named *biotype* in annotation files. Other databases
201 | might add this information elsewhere.
202 | 
203 | To avoid problems building this vignette (due to no/slow/unstable internet
204 | connection), the code chunk above is not executed. Instead, we ran such code
205 | and saved data in the following objects:
206 | 
207 | - **yeast_seq:** A list of `AAStringSet` objects with elements
208 | named *Scerevisiae* and *Cglabrata*.
209 | 
210 | - **yeast_annot:** A `GRangesList` object with elements 
211 | named *Scerevisiae* and *Cglabrata*.
212 | 
213 | Let's take a look at them.
214 | 
215 | ```{r example_data}
216 | # Load example data
217 | data(yeast_seq)
218 | data(yeast_annot)
219 | 
220 | yeast_seq
221 | yeast_annot
222 | ```
223 | 
224 | # Data preparation
225 | 
226 | First of all, we need to process the list of protein sequences and gene ranges
227 | to detect synteny with `r BiocStyle::Biocpkg("syntenet")`. We will do that
228 | using the function `process_input()` from 
229 | the `r BiocStyle::Biocpkg("syntenet")` package.
230 | 
231 | ```{r process_input}
232 | library(syntenet)
233 | 
234 | # Process input data
235 | pdata <- process_input(yeast_seq, yeast_annot)
236 | 
237 | # Inspect the output
238 | names(pdata)
239 | pdata$seq
240 | pdata$annotation
241 | ```
242 | 
243 | The processed data are represented as a list with the elements `seq` and
244 | `annotation`, each containing the protein sequences and gene ranges for
245 | each species, respectively.
246 | 
247 | Finally, we need to perform pairwise sequence similarity searches to
248 | identify the whole set of paralogous gene pairs. We can do this 
249 | using the function `run_diamond()` from the `r BiocStyle::Biocpkg("syntenet")`
250 | package [^1], setting `compare = "intraspecies"` to perform only intraspecies
251 | comparisons.
252 | 
253 | [^1]: **Note:** you need to have DIAMOND installed on your machine to run
254 | this function. If you don't have it, you can use 
255 | the `r BiocStyle::Biocpkg("Herper")` package to install DIAMOND in a Conda
256 | environment and run DIAMOND from this virtual environment.
257 | 
258 | ```{r run_diamond_intraspecies}
259 | data(diamond_intra)
260 | 
261 | # Run DIAMOND in sensitive mode for S. cerevisiae only
262 | if(diamond_is_installed()) {
263 |     diamond_intra <- run_diamond(
264 |         seq = pdata$seq["Scerevisiae"],
265 |         compare = "intraspecies", 
266 |         outdir = file.path(tempdir(), "diamond_intra"),
267 |         ... = "--sensitive"
268 |     )
269 | }
270 | 
271 | # Inspect output
272 | names(diamond_intra)
273 | head(diamond_intra$Scerevisiae_Scerevisiae)
274 | ```
275 | 
276 | And voilà! Now that we have the DIAMOND output and the processed annotation,
277 | you can classify the duplicated genes.
278 | 
279 | # Classifying duplicated gene pairs and genes
280 | 
281 | To classify duplicated gene pairs based on their modes of duplication,
282 | you will use the function `classify_gene_pairs()`. This function offers
283 | four different classification schemes, depending on how much detail you
284 | want. The classification schemes, along with the duplication modes
285 | they identify and their required input, are summarized in the table below:
286 | 
287 | 
288 | | Scheme   | Duplication modes           | Required input                                     |
289 | |:---------|:----------------------------|:---------------------------------------------------|
290 | | binary   | SD, SSD                     | `blast_list`, `annotation`                         |
291 | | standard | SD, TD, PD, DD              | `blast_list`, `annotation`                         |
292 | | extended | SD, TD, PD, TRD, DD         | `blast_list`, `annotation`, `blast_inter`          |
293 | | full     | SD, TD, PD, rTRD, dTRD, DD  | `blast_list`, `annotation`, `blast_inter`, `intron_counts` |
294 | 
295 | **Legend:** SD, segmental duplication. SSD, small-scale duplication.
296 | TD, tandem duplication. PD, proximal duplication. 
297 | TRD, transposon-derived duplication. rTRD, retrotransposon-derived duplication.
298 | dTRD, DNA transposon-derived duplication. DD, dispersed duplication.
299 | 
300 | 
301 | As shown in the table, the minimal input objects are:
302 | 
303 | - **blast_list**: A list of data frames with DIAMOND (or BLASTp, etc.) tabular 
304 | output for intraspecies comparisons as returned 
305 | by `syntenet::run_diamond(..., compare = 'intraspecies')`.
306 | - **annotation**: The processed annotation list (a `GRangesList` object) 
307 | as returned by `syntenet::process_input()`.
308 | 
309 | 
310 | However, if you also want to identify transposon-derived duplicates (TRD)
311 | and further classify them as retrotransposon-derived duplicates (rTRD) or 
312 | DNA transposon-derived duplicates (dTRD), you will need the following objects:
313 | 
314 | - **blast_inter**: A list of data frames with DIAMOND (or BLASTp, etc.) tabular 
315 | output for interspecies comparisons (target species vs an outgroup) as returned 
316 | by `syntenet::run_diamond(..., compare = <comparison_data_frame>)`.
317 | - **intron_counts**: A list of data frames with the number of introns per gene
318 | for each species, as returned by `get_intron_counts()`.
319 | 
320 | 
321 | Below, we demonstrate each classification scheme with examples.
322 | 
323 | ## The *binary* scheme (SD vs SSD)
324 | 
325 | The binary scheme classifies duplicates as derived from either 
326 | segmental duplications (SD) or small-scale duplications (SSD).
327 | To identify segmental duplicates, the function `classify_gene_pairs()` 
328 | performs intragenome synteny detection scans 
329 | with `r BiocStyle::Biocpkg("syntenet")` and classifies any detected anchor
330 | pairs as segmental duplicates. The remaining pairs are classified as
331 | originating from small-scale duplications.
332 | 
333 | This scheme can be used by specifying `scheme = "binary"` in the
334 | function `classify_gene_pairs()`. 
335 | 
336 | ```{r binary_classification}
337 | # Binary scheme
338 | c_binary <- classify_gene_pairs(
339 |     annotation = pdata$annotation,
340 |     blast_list = diamond_intra,
341 |     scheme = "binary"
342 | )
343 | 
344 | # Inspecting the output
345 | names(c_binary)
346 | head(c_binary$Scerevisiae)
347 | 
348 | # How many pairs are there for each duplication mode?
349 | table(c_binary$Scerevisiae$type)
350 | ```
351 | 
352 | The function returns a list of data frames, each containing the duplicated
353 | gene pairs and their modes of duplication for each species (here, because
354 | we have only one species, this is a list of length 1).
355 | 
356 | ## The *standard* scheme (SSD &rarr; TD, PD, DD)
357 | 
358 | Gene pairs derived from small-scale duplications can be further classified
359 | as originating from tandem duplications (TD, genes are adjacent to each other),
360 | proximal duplications (PD, genes are separated by only a few genes), or
361 | dispersed duplications (DD, duplicates that do not fit in any of the previous
362 | categories).
363 | 
364 | This is the default classification scheme in `classify_gene_pairs()`,
365 | and it can be specified by setting `scheme = "standard"`.
366 | 
367 | ```{r expanded_classification}
368 | # Standard scheme
369 | c_standard <- classify_gene_pairs(
370 |     annotation = pdata$annotation,
371 |     blast_list = diamond_intra,
372 |     scheme = "standard"
373 | )
374 | 
375 | # Inspecting the output
376 | names(c_standard)
377 | head(c_standard$Scerevisiae)
378 | 
379 | # How many pairs are there for each duplication mode?
380 | table(c_standard$Scerevisiae$type)
381 | ```
382 | 
383 | ## The *extended* scheme (SSD &rarr; TD, PD, TRD, DD)
384 | 
385 | To find transposon-derived duplicates (TRD), the 
386 | function `classify_gene_pairs()` detects syntenic regions between a target
387 | species and an outgroup species. Genes in the target species that are in 
388 | syntenic regions with the outgroup are treated as *ancestral loci*. Then,
389 | if only one gene of the duplicate pair is an ancestral locus, this 
390 | duplicate pair is classified as originating from transposon-derived
391 | duplications. 
392 | 
393 | 
394 | Since finding transposon-derived duplicates requires detecting syntenic regions
395 | between a target species and an outgroup species, you will first need to 
396 | perform similarity searches with DIAMOND [@buchfink2021sensitive], 
397 | BLAST [@altschul1997gapped], or similar programs. This can be done with
398 | `syntenet::run_diamond(seq, compare = compare_df)`. For the parameter `compare`,
399 | you will pass a 2-column data frame specifying the comparisons to be made. [^3]
400 | Importantly, for a more accurate detection of interspecies synteny, you need to
401 | perform bidirectional similarity searches for each comparison. For instance,
402 | if you want to use `speciesA` as target species and `speciesB` as outgroup,
403 | you need to perform similarity searches in both directions: 
404 | `speciesA_speciesB` and `speciesB_speciesA`.
405 | 
406 | [^3]: **Pro tip:** If you want to identify and classify duplicated genes for
407 | multiple species in batch, you must include the outgroup for each of them
408 | in the comparisons data frame.
409 | 
410 | Here, we will identify duplicated gene pairs for *Saccharomyces cerevisiae*
411 | using *Candida glabrata* as an outgroup. To create a data frame with the 
412 | bidirectional comparisons to be made, we will use the helper function `make_bidirectional()` from the `r BiocStyle::Biocpkg("syntenet")` package.
413 | 
414 | ```{r make_bidirectional_comparisons}
415 | # Create a data frame of species and outgroups for `syntenet::run_diamond()`
416 | spp_outgroup <- data.frame(
417 |     species = "Scerevisiae",
418 |     outgroup = "Cglabrata"
419 | )
420 | spp_outgroup
421 | 
422 | # Expand the data frame to make bidirectional comparisons
423 | comparisons <- syntenet::make_bidirectional(spp_outgroup)
424 | comparisons
425 | ```
426 | 
427 | Now that we have a data frame with our desired comparisons, we can pass it
428 | to `syntenet::run_diamond()`.
429 | 
430 | ```{r blast_interspecies}
431 | data(diamond_inter) # load pre-computed output in case DIAMOND is not installed
432 | 
433 | # Run DIAMOND for the comparisons we specified
434 | if(diamond_is_installed()) {
435 |     diamond_inter <- run_diamond(
436 |         seq = pdata$seq,
437 |         compare = comparisons,
438 |         outdir = file.path(tempdir(), "diamond_inter"),
439 |         ... = "--sensitive"
440 |     )
441 | }
442 | 
443 | names(diamond_inter)
444 | head(diamond_inter$Scerevisiae_Cglabrata)
445 | ```
446 | 
447 | As you can see, for each species-outgroup pair, `diamond_inter` contains two
448 | data frames: one named `Scerevisiae_Cglabrata`, and one named 
449 | `Cglabrata_Scerevisiae`. Before actually classifying gene pairs, we will
450 | need to collapse these data frames so that we have 
451 | **only one data frame per species-outgroup pair**. This can be easily done
452 | with the function `collapse_bidirectional_hits()` from 
453 | `r BiocStyle::Biocpkg("syntenet")`. As input, this function takes a 
454 | list of interspecies DIAMOND data frames, and a 2-column data frame indicating
455 | the target species and the outgroup species (columns 1 and 2, respectively;
456 | double-check the order of the columns!).
457 | 
458 | ```{r collapse_hits}
459 | # For each species-outgroup pair, collapse bidirectional hits in one data frame
460 | diamond_inter <- syntenet::collapse_bidirectional_hits(
461 |     diamond_inter, compare = spp_outgroup
462 | )
463 | names(diamond_inter)
464 | ```
465 | 
466 | Then, we can pass this interspecies DIAMOND output as an argument to 
467 | the parameter `blast_inter` of `classify_gene_pairs()`.
468 | 
469 | ```{r full_classification}
470 | # Extended scheme
471 | c_extended <- classify_gene_pairs(
472 |     annotation = pdata$annotation,
473 |     blast_list = diamond_intra,
474 |     scheme = "extended",
475 |     blast_inter = diamond_inter
476 | )
477 | 
478 | # Inspecting the output
479 | names(c_extended)
480 | head(c_extended$Scerevisiae)
481 | 
482 | # How many pairs are there for each duplication mode?
483 | table(c_extended$Scerevisiae$type)
484 | ```
485 | 
486 | In the example above, we used only one outgroup species (*C. glabrata*). 
487 | However, since results might change depending on the chosen outgroup, 
488 | you can also use multiple outgroups in the comparisons data frame, and then
489 | run interspecies DIAMOND searches as above. For instance, suppose you want
490 | to use *speciesB*, *speciesC*, and *speciesD* as outgroups to *speciesA*.
491 | In this case, your data frame of comparisons (to be passed to the `compare`
492 | argument of `syntenet::run_diamond()`) would look like the following:
493 | 
494 | ```{r}
495 | # Example: multiple outgroups for the same species
496 | spp_outgroup_many <- data.frame(
497 |     species = rep("speciesA", 3),
498 |     outgroup = c("speciesB", "speciesC", "speciesD")
499 | )
500 | 
501 | comparisons_many <- syntenet::make_bidirectional(spp_outgroup_many)
502 | comparisons_many
503 | ```
504 | 
505 | When multiple outgroups are present, `classify_gene_pairs()` will check if
506 | gene pairs are classified as transposed (i.e., only one gene is an ancestral 
507 | locus) in each of the outgroup species, and then calculate the percentage of 
508 | outgroup species in which each pair is considered 'transposed'. For instance, 
509 | you could have gene pair 1 as transposed based on 30\% of the outgroup species, 
510 | gene pair 2 as transposed based on 100\% of the outgroup species, 
511 | gene pair 3 based on 0\% of the outgroup species, and so on. By default, 
512 | pairs are considered 'transposed' if they are classified as such 
513 | in >70% of the outgroups, but you can choose a different minimum percentage 
514 | cut-off using parameter `outgroup_coverage`.
515 | 
516 | ## The *full* scheme (SSD &rarr; TD, PD, rTRD, dTRD, DD)
517 | 
518 | Finally, the full scheme consists of classifying transposon-derived
519 | duplicates (TRD) further as originating from retrotransposons (rTRD) or
520 | DNA transposons (dTRD). To do that, the function `classify_gene_pairs()`
521 | uses the number of introns per gene to find TRD pairs for which
522 | one gene has at least 1 intron, and the other gene has no introns; if that
523 | is the case, the pair is classified as originating from the activity
524 | of retrotransposons (rTRD, i.e., the transposed gene without introns is
525 | a processed transcript that was retrotransposed back to the genome). All the 
526 | other TRD pairs are classified as DNA transposon-derived duplicates (dTRD).
527 | 
528 | 
529 | To classify duplicates using this scheme, you will first need to create a list
530 | of data frames with the number of introns per gene for each species. This
531 | can be done with the function `get_intron_counts()`, which takes a `TxDb` 
532 | object as input. `TxDb` objects store transcript annotations, and they 
533 | can be created with a family of functions
534 | named `makeTxDbFrom*` from the `r BiocStyle::Biocpkg("txdbmaker")`
535 | package (see `?get_intron_counts()` for a summary of all functions).
536 | 
537 | 
538 | Here, we will create a list of `TxDb` objects from a list of `GRanges` objects
539 | using the function `makeTxDbFromGRanges()` 
540 | from `r BiocStyle::Biocpkg("txdbmaker")`. Importantly, to create
541 | a `TxDb` from a `GRanges`, the `GRanges` object must contain genomic coordinates
542 | for all features, including transcripts, exons, etc. Because of that, we
543 | will use annotation from the example data set `yeast_annot`,
544 | which was not processed with `syntenet::process_input()`.
545 | 
546 | ```{r message=FALSE}
547 | library(txdbmaker)
548 | 
549 | # Create a list of `TxDb` objects from a list of `GRanges` objects
550 | txdb_list <- lapply(yeast_annot, txdbmaker::makeTxDbFromGRanges)
551 | txdb_list
552 | ```
553 | 
554 | Once we have the `TxDb` objects, we can get intron counts per gene with
555 | `get_intron_counts()`.
556 | 
557 | ```{r}
558 | # Get a list of data frames with intron counts per gene for each species
559 | intron_counts <- lapply(txdb_list, get_intron_counts)
560 | 
561 | # Inspecting the list
562 | names(intron_counts)
563 | head(intron_counts$Scerevisiae)
564 | ```
565 | 
566 | Finally, we can use this list to classify duplicates using the full scheme
567 | as follows:
568 | 
569 | ```{r}
570 | # Full scheme
571 | c_full <- classify_gene_pairs(
572 |     annotation = pdata$annotation,
573 |     blast_list = diamond_intra,
574 |     scheme = "full",
575 |     blast_inter = diamond_inter,
576 |     intron_counts = intron_counts
577 | )
578 | 
579 | # Inspecting the output
580 | names(c_full)
581 | head(c_full$Scerevisiae)
582 | 
583 | # How many pairs are there for each duplication mode?
584 | table(c_full$Scerevisiae$type)
585 | ```
586 | 
587 | 
588 | 
589 | # Classifying genes into unique modes of duplication
590 | 
591 | If you look carefully at the output of `classify_gene_pairs()`, you will notice
592 | that some genes appear in more than one duplicate pair, and these pairs can
593 | have different duplication modes assigned. There's nothing wrong with it.
594 | Consider, for example, a gene that originated from a segmental duplication
595 | some 60 million years ago, and then it underwent a tandem duplication
596 | 5 million years ago. In the output of `classify_gene_pairs()`, you'd see
597 | this gene in two pairs, one with **SD** in the `type` column, and one
598 | with **TD**.
599 | 
600 | If you want to assign each gene to a unique mode of duplication, you can
601 | use the function `classify_genes()`. This function assigns duplication modes
602 | hierarchically using factor levels in column `type` as the priority order.
603 | The priority orders for each classification scheme are:
604 | 
605 | 1. **Binary:** SD > SSD.
606 | 2. **Standard:** SD > TD > PD > DD.
607 | 3. **Extended:** SD > TD > PD > TRD > DD. 
608 | 4. **Full:** SD > TD > PD > rTRD > dTRD > DD.
609 | 
610 | The input for `classify_genes()` is the list of gene pairs returned by
611 | `classify_gene_pairs()`.
612 | 
613 | ```{r classify_genes}
614 | # Classify genes into unique modes of duplication
615 | c_genes <- classify_genes(c_full)
616 | 
617 | # Inspecting the output
618 | names(c_genes)
619 | head(c_genes$Scerevisiae)
620 | 
621 | # Number of genes per mode
622 | table(c_genes$Scerevisiae$type)
623 | ```
624 | 
625 | # Calculating substitution rates for duplicated gene pairs
626 | 
627 | You can use the function `pairs2kaks()` to calculate rates of nonsynonymous 
628 | substitutions per nonsynonymous site ($K_a$), synonymous substitutions per
629 | synonymous site ($K_s$), and their ratios ($K_a/K_s$). These rates are calculated
630 | using the Bioconductor package `r BiocStyle::Biocpkg("MSA2dist")`, which
631 | implements all codon models in KaKs_Calculator 2.0 [@wang2010kaks_calculator].
632 | 
633 | 
634 | For the purpose of demonstration, we will only calculate $K_a$, $K_s$, 
635 | and $K_a/K_s$ for 5 TD-derived gene pairs. The CDS for TD-derived 
636 | genes were obtained from Ensembl Fungi [@yates2022ensembl], and 
637 | they are stored in `cds_scerevisiae`.
638 | 
639 | ```{r kaks_calculation}
640 | data(cds_scerevisiae)
641 | head(cds_scerevisiae)
642 | 
643 | # Store DNAStringSet object in a list
644 | cds_list <- list(Scerevisiae = cds_scerevisiae)
645 | 
646 | # Keep only top five TD-derived gene pairs for demonstration purposes
647 | td_pairs <- c_full$Scerevisiae[c_full$Scerevisiae$type == "TD", ]
648 | gene_pairs <- list(Scerevisiae = td_pairs[seq(1, 5, by = 1), ])
649 | 
650 | # Calculate Ka, Ks, and Ka/Ks
651 | kaks <- pairs2kaks(gene_pairs, cds_list)
652 | 
653 | # Inspect the output
654 | head(kaks)
655 | ```
656 | 
657 | Importantly, `pairs2kaks()` expects all genes in the gene pairs to be present
658 | in the CDS, with matching names. Species abbreviations in gene pairs (added
659 | by `r BiocStyle::Biocpkg("syntenet")`) are automatically removed, so you should
660 | not add them to the sequence names of your CDS.
661 | 
662 | # Identifying and visualizing $K_s$ peaks
663 | 
664 | Peaks in $K_s$ distributions typically indicate whole-genome duplication (WGD) 
665 | events, and they can be identified by fitting Gaussian mixture models (GMMs) to 
666 | $K_s$ distributions. In `r BiocStyle::Biocpkg("doubletrouble")`, this can be 
667 | performed with the function `find_ks_peaks()`.
668 | 
669 | 
670 | However, because of saturation at higher $K_s$ values, only **recent WGD**
671 | events can be reliably identified from $K_s$ 
672 | distributions [@vanneste2013inference]. Recent WGD events are commonly found 
673 | in plant species, such as maize, soybean, apple, etc.
674 | Although the genomes of yeast species have signatures of WGD,
675 | these events are ancient, so it is very hard to find evidence for them
676 | using $K_s$ distributions. [^4] 
677 | 
678 | [^4]: **Tip:** You might be asking yourself: "How does one identify ancient
679 | WGD, then?". A common approach is to look for syntenic blocks (i.e.,
680 | regions with conserved gene content and order) within genomes. This is what
681 | `classify_gene_pairs()` does under the hood to find SD-derived gene pairs.
682 | 
683 | To demonstrate how you can find peaks in $K_s$ distributions
684 | with `find_ks_peaks()`, we will use a data frame containing $K_s$ values for
685 | duplicate pairs in the soybean (*Glycine max*) genome, which has undergone 
686 | 2 WGD events ~13 and ~58 million years ago [@schmutz2010genome]. 
687 | Then, we will visualize $K_s$ distributions with peaks using the function
688 | `plot_ks_peaks()`.
689 | 
690 | First of all, let's look at the data and have a quick look at the distribution
691 | with the function `plot_ks_distro()` (more details on this function in the
692 | data visualization section).
693 | 
694 | ```{r ks_eda}
695 | # Load data and inspect it
696 | data(gmax_ks)
697 | head(gmax_ks)
698 | 
699 | # Plot distribution
700 | plot_ks_distro(gmax_ks)
701 | ```
702 | 
703 | By visual inspection, we can see 2 or 3 peaks. Based on our prior knowledge,
704 | we know that 2 WGD events have occurred in the ancestor of the *Glycine* genus
705 | and in the ancestor of all Fabaceae, which seem to correspond to the
706 | peaks we see at $K_s$ values around 0.1 and 0.5, respectively. There could be
707 | a third, flattened peak at around 1.6, which would represent the WGD shared
708 | by all eudicots. Let's test which number of peaks has more support: 2 or 3.
709 | 
710 | ```{r find_ks_peaks}
711 | # Find 2 and 3 peaks and test which one has more support
712 | peaks <- find_ks_peaks(gmax_ks$Ks, npeaks = c(2, 3), verbose = TRUE)
713 | names(peaks)
714 | str(peaks)
715 | 
716 | # Visualize Ks distribution
717 | plot_ks_peaks(peaks)
718 | ```
719 | 
720 | As we can see, the presence of 3 peaks is more supported (lowest BIC). The
721 | function returns a list with the mean, variance and amplitude 
722 | of mixture components (i.e., peaks), as well as the $K_s$ distribution itself.
723 | 
724 | Now, suppose you just want to get the first 2 peaks. You can do that by
725 | explicitly telling `find_ks_peaks()` how many peaks there are. 
726 | 
727 | ```{r find_peaks_explicit}
728 | # Find 2 peaks ignoring Ks values > 1
729 | peaks <- find_ks_peaks(gmax_ks$Ks, npeaks = 2, max_ks = 1)
730 | plot_ks_peaks(peaks)
731 | ```
732 | 
733 | **Important consideration on GMMs and $K_s$ distributions:**
734 | Peaks identified with GMMs should not be blindly regarded as "the truth".
735 | Using GMMs to find peaks in $K_s$ distributions can lead to problems such as
736 | overfitting and overclustering [@tiley2018assessing]. Some general 
737 | recommendations are:
738 | 
739 | 1. Use your prior knowledge. If you know how many peaks there are (e.g.,
740 | based on literature evidence), just tell the number to `find_ks_peaks()`.
741 | Likewise, if you are not sure about how many peaks there are, but you know
742 | the maximum number of peaks is N, don't test for the presence of >N peaks.
743 | GMMs can incorrectly identify more peaks than the actual number.
744 | 
745 | 2. Test the significance of each peak with SiZer (Significant ZERo crossings
746 | of derivatives) maps [@chaudhuri1999sizer].
747 | This can be done with the function `SiZer()` from 
748 | the R package `r BiocStyle::CRANpkg("feature")`.
749 | 
750 | As an example of a SiZer map, let's use `feature::SiZer()` to assess
751 | the significance of the 2 peaks we found previously.
752 | 
753 | ```{r sizer}
754 | # Get numeric vector of Ks values <= 1
755 | ks <- gmax_ks$Ks[gmax_ks$Ks <= 1]
756 | 
757 | # Get SiZer map
758 | feature::SiZer(ks)
759 | ```
760 | 
761 | The blue regions in the SiZer map indicate significantly increasing regions
762 | of the curve, which support the 2 peaks we found. 
763 | 
764 | # Classifying genes by age groups
765 | 
766 | Finally, you can use the peaks you obtained before to classify gene pairs
767 | by age group. Age groups are defined based on the $K_s$ peak to which pairs belong.
768 | This is useful if you want to analyze duplicate pairs 
769 | from a specific WGD event, for instance. You can do this with
770 | the function `split_pairs_by_peak()`. This function returns a list containing
771 | the classified pairs in a data frame, and a ggplot object with the 
772 | age boundaries highlighted in the histogram of $K_s$ values.
773 | 
774 | ```{r split_by_peak}
775 | # Gene pairs without age-based classification
776 | head(gmax_ks)
777 | 
778 | # Classify gene pairs by age group
779 | pairs_age_group <- split_pairs_by_peak(gmax_ks[, c(1,2,3)], peaks)
780 | 
781 | # Inspecting the output
782 | names(pairs_age_group)
783 | 
784 | # Take a look at the classified gene pairs
785 | head(pairs_age_group$pairs)
786 | 
787 | # Visualize Ks distro with age boundaries
788 | pairs_age_group$plot
789 | ```
790 | 
791 | Age groups can also be used to identify SD gene pairs that likely originated
792 | from whole-genome duplications. The rationale here is that segmental duplicates
793 | with $K_s$ values near $K_s$ peaks (indicating WGD events) were likely
794 | created by such WGDs. In a similar logic, SD pairs with $K_s$ values that
795 | are too distant from $K_s$ peaks (e.g., >2 standard deviations away from
796 | the mean) were likely created by duplications of large genomic segments, but 
797 | not duplications of the entire genome. 
798 | 
799 | As an example, to find gene pairs in the soybean genome that likely originated 
800 | from the WGD event shared by all legumes (at ~58 million years ago), 
801 | you'd need to extract SD pairs in age group 2 using the following code:
802 | 
803 | ```{r}
804 | # Get all pairs in age group 2
805 | pairs_ag2 <- pairs_age_group$pairs[pairs_age_group$pairs$peak == 2, c(1,2)]
806 | 
807 | # Get all SD pairs
808 | sd_pairs <- gmax_ks[gmax_ks$type == "SD", c(1,2)]
809 | 
810 | # Merge tables
811 | pairs_wgd_legumes <- merge(pairs_ag2, sd_pairs)
812 | 
813 | head(pairs_wgd_legumes)
814 | ```
815 | 
816 | # Data visualization
817 | 
818 | Last but not least, `r BiocStyle::Biocpkg("doubletrouble")` provides users
819 | with graphical functions to produce publication-ready plots from the output
820 | of `classify_gene_pairs()`, `classify_genes()`, and `pairs2kaks()`.
821 | Let's take a look at them one by one.
822 | 
823 | ## Visualizing the frequency of duplicates per mode
824 | 
825 | To visualize the frequency of duplicated gene pairs or genes by duplication
826 | type (as returned by `classify_gene_pairs()` and `classify_genes()`, 
827 | respectively), you will first need to create a data frame of counts with
828 | `duplicates2counts()`. To demonstrate how this works, we will use an
829 | example data set with duplicate pairs for 3 fungi species (and substitution
830 | rates, which will be ignored by `duplicates2counts()`).
831 | 
832 | ```{r}
833 | # Load data set with pre-computed duplicates for 3 fungi species
834 | data(fungi_kaks)
835 | names(fungi_kaks)
836 | head(fungi_kaks$saccharomyces_cerevisiae)
837 | 
838 | # Get a data frame of counts per mode in all species
839 | counts_table <- duplicates2counts(fungi_kaks |> classify_genes())
840 | 
841 | counts_table
842 | ```
843 | 
844 | Now, let's visualize the frequency of duplicate gene pairs by duplication
845 | type with the function `plot_duplicate_freqs()`. You can visualize frequencies
846 | in three different ways, as demonstrated below.
847 | 
848 | ```{r}
849 | # A) Facets
850 | p1 <- plot_duplicate_freqs(counts_table)
851 | 
852 | # B) Stacked barplot, absolute frequencies
853 | p2 <- plot_duplicate_freqs(counts_table, plot_type = "stack")
854 | 
855 | # C) Stacked barplot, relative frequencies
856 | p3 <- plot_duplicate_freqs(counts_table, plot_type = "stack_percent")
857 | 
858 | # Combine plots, one per row
859 | patchwork::wrap_plots(p1, p2, p3, nrow = 3) + 
860 |     patchwork::plot_annotation(tag_levels = "A")
861 | ```
862 | 
863 | If you want to visualize the frequency of duplicated **genes** (not gene pairs),
864 | you'd first need to classify genes into unique modes of duplication
865 | with `classify_genes()`, and then repeat the code above. For example:
866 | 
867 | ```{r fig.height = 3, fig.width = 8}
868 | # Frequency of duplicated genes by mode
869 | classify_genes(fungi_kaks) |>   # classify genes into unique duplication types
870 |     duplicates2counts() |>      # get a data frame of counts (long format)
871 |     plot_duplicate_freqs()      # plot frequencies
872 | ```
873 | 
874 | ## Visualizing $K_s$ distributions
875 | 
876 | As briefly demonstrated before, to plot a $K_s$ distribution for the
877 | whole paranome, you will use the function `plot_ks_distro()`.
878 | 
879 | ```{r fig.height=3, fig.width=9}
880 | ks_df <- fungi_kaks$saccharomyces_cerevisiae
881 | 
882 | # A) Histogram, whole paranome
883 | p1 <- plot_ks_distro(ks_df, plot_type = "histogram")
884 | 
885 | # B) Density, whole paranome
886 | p2 <- plot_ks_distro(ks_df, plot_type = "density") 
887 | 
888 | # C) Histogram with density lines, whole paranome
889 | p3 <- plot_ks_distro(ks_df, plot_type = "density_histogram")
890 | 
891 | # Combine plots side by side
892 | patchwork::wrap_plots(p1, p2, p3, nrow = 1) +
893 |     patchwork::plot_annotation(tag_levels = "A")
894 | ```
895 | 
896 | However, visualizing the distribution for the whole paranome can mask patterns
897 | that only happen for duplicates originating from particular duplication types.
898 | For instance, when looking for evidence of WGD events,
899 | visualizing the $K_s$ distribution for SD-derived pairs only can reveal
900 | whether syntenic genes cluster together, suggesting the presence of WGD history.
901 | To visualize the distribution by duplication type, use `bytype = TRUE` in
902 | `plot_ks_distro()`.
903 | 
904 | ```{r fig.width = 8, fig.height=4}
905 | # A) Duplicates by type, histogram
906 | p1 <- plot_ks_distro(ks_df, bytype = TRUE, plot_type = "histogram")
907 | 
908 | # B) Duplicates by type, violin
909 | p2 <- plot_ks_distro(ks_df, bytype = TRUE, plot_type = "violin")
910 | 
911 | # Combine plots side by side
912 | patchwork::wrap_plots(p1, p2) +
913 |     patchwork::plot_annotation(tag_levels = "A")
914 | ```
915 | 
916 | ## Visualizing substitution rates by species
917 | 
918 | The function `plot_rates_by_species()` can be used to show distributions of
919 | substitution rates ($K_s$, $K_a$, or their ratio $K_a/K_s$) by species.
920 | You can choose which rate you want to visualize, and whether or not to
921 | group gene pairs by duplication mode, as demonstrated below.
922 | 
923 | ```{r fig.width = 6, fig.height = 4}
924 | # A) Ks for each species
925 | p1 <- plot_rates_by_species(fungi_kaks)
926 | 
927 | # B) Ka/Ks by duplication type for each species
928 | p2 <- plot_rates_by_species(fungi_kaks, rate_column = "Ka_Ks", bytype = TRUE)
929 | 
930 | # Combine plots - one per row
931 | patchwork::wrap_plots(p1, p2, nrow = 2) +
932 |     patchwork::plot_annotation(tag_levels = "A")
933 | ```
934 | 
935 | # Session information {.unnumbered}
936 | 
937 | This document was created under the following conditions:
938 | 
939 | ```{r session_info}
940 | sessioninfo::session_info()
941 | ```
942 | 
943 | # References {.unnumbered}
944 | 
945 | 


--------------------------------------------------------------------------------