├── _pkgdown.yml
├── .github
├── .gitignore
└── workflows
│ ├── test-coverage.yaml
│ ├── pkgdown.yaml
│ ├── R-CMD-check.yaml
│ └── format-code.yml
├── vignettes
├── .gitignore
└── figures
│ ├── interaction_2hwg.png
│ ├── peptide_map_1zmr.png
│ ├── peptide_map_2hwg.png
│ ├── peptide_map_1zmr_score.png
│ └── peptide_map_2hwg_score.png
├── revdep
├── failures.md
├── problems.md
├── .gitignore
├── cran.md
└── README.md
├── data
├── metal_list.rda
├── ptsi_pgk.rda
├── mako_colours.rda
├── protti_colours.rda
├── rapamycin_10uM.rda
├── viridis_colours.rda
├── metal_chebi_uniprot.rda
├── metal_go_slim_subset.rda
└── rapamycin_dose_response.rda
├── tests
├── testthat.R
└── testthat
│ ├── test_import.csv
│ └── test-queue_functions.R
├── LICENSE
├── man
├── figures
│ ├── logo.png
│ ├── README-volcano-1.png
│ ├── lifecycle-stable.svg
│ ├── lifecycle-defunct.svg
│ ├── lifecycle-archived.svg
│ ├── lifecycle-maturing.svg
│ ├── lifecycle-deprecated.svg
│ ├── lifecycle-superseded.svg
│ ├── lifecycle-experimental.svg
│ └── lifecycle-questioning.svg
├── protti_colours.Rd
├── mako_colours.Rd
├── viridis_colours.Rd
├── metal_list.Rd
├── plot_peptide_profiles.Rd
├── peptide_type.Rd
├── plot_pval_distribution.Rd
├── sequence_coverage.Rd
├── split_metal_name.Rd
├── volcano_protti.Rd
├── plot_drc_4p.Rd
├── median_normalisation.Rd
├── kegg_enrichment.Rd
├── network_analysis.Rd
├── fetch_go.Rd
├── metal_chebi_uniprot.Rd
├── replace_identified_by_x.Rd
├── metal_go_slim_subset.Rd
├── fetch_kegg.Rd
├── treatment_enrichment.Rd
├── read_protti.Rd
├── go_enrichment.Rd
├── rapamycin_10uM.Rd
├── scale_protti.Rd
├── rapamycin_dose_response.Rd
├── fetch_chebi.Rd
├── calculate_sequence_coverage.Rd
├── find_chebis.Rd
├── find_peptide.Rd
├── drc_4p.Rd
├── ttest_protti.Rd
├── normalise.Rd
├── pval_distribution_plot.Rd
├── anova_protti.Rd
├── fetch_uniprot_proteome.Rd
├── fetch_mobidb.Rd
├── assign_peptide_type.Rd
├── ptsi_pgk.Rd
├── try_query.Rd
├── find_all_subs.Rd
├── qc_sequence_coverage.Rd
├── qc_median_intensities.Rd
├── fetch_uniprot.Rd
├── qc_contaminants.Rd
├── qc_intensity_distribution.Rd
├── qc_proteome_coverage.Rd
├── calculate_imputation.Rd
├── qc_data_completeness.Rd
├── fetch_alphafold_aligned_error.Rd
├── qc_sample_correlation.Rd
├── randomise_queue.Rd
├── qc_cvs.Rd
├── fetch_eco.Rd
├── filter_cv.Rd
├── qc_ranked_intensities.Rd
├── calculate_aa_scores.Rd
├── qc_peak_width.Rd
├── qc_ids.Rd
├── qc_pca.Rd
├── qc_peptide_type.Rd
├── qc_charge_states.Rd
├── predict_alphafold_domain.Rd
├── diff_abundance.Rd
├── fetch_quickgo.Rd
├── fetch_pdb.Rd
├── qc_missed_cleavages.Rd
├── barcode_plot.Rd
└── calculate_kegg_enrichment.Rd
├── pkgdown
└── favicon
│ ├── favicon.ico
│ ├── favicon-16x16.png
│ ├── favicon-32x32.png
│ ├── apple-touch-icon.png
│ ├── apple-touch-icon-120x120.png
│ ├── apple-touch-icon-152x152.png
│ ├── apple-touch-icon-180x180.png
│ ├── apple-touch-icon-60x60.png
│ └── apple-touch-icon-76x76.png
├── .Rbuildignore
├── codecov.yml
├── protti.Rproj
├── R
├── zzz.R
├── read_protti.R
├── scale_protti.R
├── replace_identified_by_x.R
├── ttest_protti.R
├── fetch_go.R
├── find_chebis.R
├── find_peptide.R
├── drc_4p.R
├── anova_protti.R
├── fetch_kegg.R
├── find_all_subs.R
├── normalise.R
├── fetch_uniprot_proteome.R
├── calculate_sequence_coverage.R
├── pval_distribution_plot.R
├── assign_peptide_type.R
├── qc_median_intensities.R
├── calculate_imputation.R
└── calculate_aa_scores.R
├── inst
└── CITATION
├── cran-comments.md
├── .gitignore
├── LICENSE.md
├── data-raw
├── rapamycin_10uM.R
├── rapamycin_dose_response.R
├── ptsi_pgk.R
└── protti_colours.R
└── DESCRIPTION
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 |
--------------------------------------------------------------------------------
/revdep/failures.md:
--------------------------------------------------------------------------------
1 | *Wow, no problems at all. :)*
--------------------------------------------------------------------------------
/revdep/problems.md:
--------------------------------------------------------------------------------
1 | *Wow, no problems at all. :)*
--------------------------------------------------------------------------------
/data/metal_list.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/data/metal_list.rda
--------------------------------------------------------------------------------
/data/ptsi_pgk.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/data/ptsi_pgk.rda
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(protti)
3 |
4 | test_check("protti") # run the testthat test suite for the protti package
5 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2021
2 | COPYRIGHT HOLDER: ETH Zurich, Jan-Philipp Quast, Dina Schuster
3 |
--------------------------------------------------------------------------------
/data/mako_colours.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/data/mako_colours.rda
--------------------------------------------------------------------------------
/man/figures/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/man/figures/logo.png
--------------------------------------------------------------------------------
/data/protti_colours.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/data/protti_colours.rda
--------------------------------------------------------------------------------
/data/rapamycin_10uM.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/data/rapamycin_10uM.rda
--------------------------------------------------------------------------------
/data/viridis_colours.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/data/viridis_colours.rda
--------------------------------------------------------------------------------
/data/metal_chebi_uniprot.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/data/metal_chebi_uniprot.rda
--------------------------------------------------------------------------------
/data/metal_go_slim_subset.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/data/metal_go_slim_subset.rda
--------------------------------------------------------------------------------
/pkgdown/favicon/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/favicon.ico
--------------------------------------------------------------------------------
/data/rapamycin_dose_response.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/data/rapamycin_dose_response.rda
--------------------------------------------------------------------------------
/man/figures/README-volcano-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/man/figures/README-volcano-1.png
--------------------------------------------------------------------------------
/pkgdown/favicon/favicon-16x16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/favicon-16x16.png
--------------------------------------------------------------------------------
/pkgdown/favicon/favicon-32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/favicon-32x32.png
--------------------------------------------------------------------------------
/tests/testthat/test_import.csv:
--------------------------------------------------------------------------------
1 | Test.column,TestColumn,Test_column
2 | 10.1,_ABC_,1
3 | 11.3,_ABC_,2
4 | 14.1,_ABC_,3
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/apple-touch-icon.png
--------------------------------------------------------------------------------
/vignettes/figures/interaction_2hwg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/vignettes/figures/interaction_2hwg.png
--------------------------------------------------------------------------------
/vignettes/figures/peptide_map_1zmr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/vignettes/figures/peptide_map_1zmr.png
--------------------------------------------------------------------------------
/vignettes/figures/peptide_map_2hwg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/vignettes/figures/peptide_map_2hwg.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-120x120.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/apple-touch-icon-120x120.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-152x152.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/apple-touch-icon-152x152.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-180x180.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/apple-touch-icon-180x180.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-60x60.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/apple-touch-icon-60x60.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-76x76.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/apple-touch-icon-76x76.png
--------------------------------------------------------------------------------
/vignettes/figures/peptide_map_1zmr_score.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/vignettes/figures/peptide_map_1zmr_score.png
--------------------------------------------------------------------------------
/vignettes/figures/peptide_map_2hwg_score.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpquast/protti/HEAD/vignettes/figures/peptide_map_2hwg_score.png
--------------------------------------------------------------------------------
/revdep/.gitignore:
--------------------------------------------------------------------------------
1 | checks
2 | library
3 | checks.noindex
4 | library.noindex
5 | data.sqlite
6 | *.html
7 | download
8 | lib
9 | cloud.noindex
--------------------------------------------------------------------------------
/revdep/cran.md:
--------------------------------------------------------------------------------
1 | ## revdepcheck results
2 |
3 | We checked 1 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package.
4 |
5 | * We saw 0 new problems
6 | * We failed to check 0 packages
7 |
8 |
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^protti\.Rproj$
2 | ^\.Rproj\.user$
3 | ^LICENSE\.md$
4 | ^README\.Rmd$
5 | ^data-raw$
6 | ^\.travis\.yml$
7 | ^\.github$
8 | ^codecov\.yml$
9 | ^doc$
10 | ^Meta$
11 | ^_pkgdown\.yml$
12 | ^docs$
13 | ^pkgdown$
14 | ^cran-comments\.md$
15 | ^CRAN-RELEASE$
16 | ^CRAN-SUBMISSION$
17 | ^revdep$
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | comment: false
2 |
3 | coverage:
4 | status:
5 | project:
6 | default:
7 | target: auto
8 | threshold: 1%
9 | informational: true
10 | patch:
11 | default:
12 | target: auto
13 | threshold: 1%
14 | informational: true
15 |
--------------------------------------------------------------------------------
/man/protti_colours.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{protti_colours}
5 | \alias{protti_colours}
6 | \title{Colour scheme for protti}
7 | \format{
8 | A vector containing 100 colours
9 | }
10 | \source{
11 | Dina's imagination.
12 | }
13 | \usage{
14 | protti_colours
15 | }
16 | \description{
17 | A colour scheme for protti that contains 100 colours.
18 | }
19 | \keyword{datasets}
20 |
--------------------------------------------------------------------------------
/protti.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: No
4 | SaveWorkspace: No
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace
22 |
--------------------------------------------------------------------------------
/man/mako_colours.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{mako_colours}
5 | \alias{mako_colours}
6 | \title{Mako colour scheme}
7 | \format{
8 | A vector containing 256 colours
9 | }
10 | \source{
11 | created for the Seaborn statistical data visualization package for Python
12 | }
13 | \usage{
14 | mako_colours
15 | }
16 | \description{
17 | A perceptually uniform colour scheme originally created for the Seaborn python package.
18 | }
19 | \keyword{datasets}
20 |
--------------------------------------------------------------------------------
/man/viridis_colours.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{viridis_colours}
5 | \alias{viridis_colours}
6 | \title{Viridis colour scheme}
7 | \format{
8 | A vector containing 256 colours
9 | }
10 | \source{
11 | viridis R package, created by Stéfan van der Walt (stefanv) and Nathaniel Smith (njsmith)
12 | }
13 | \usage{
14 | viridis_colours
15 | }
16 | \description{
17 | The viridis colour scheme, taken from the viridis R package.
18 | }
19 | \keyword{datasets}
20 |
--------------------------------------------------------------------------------
/man/metal_list.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{metal_list}
5 | \alias{metal_list}
6 | \title{List of metals}
7 | \format{
8 | A data.frame containing the columns \code{atomic_number}, \code{symbol}, \code{name},
9 | \code{type}, \code{chebi_id}.
10 | }
11 | \source{
12 | https://en.wikipedia.org/wiki/Metal and https://en.wikipedia.org/wiki/Metalloid
13 | }
14 | \usage{
15 | metal_list
16 | }
17 | \description{
18 | A list of all metals and metalloids in the periodic table.
19 | }
20 | \keyword{datasets}
21 |
--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 | .onAttach <- function(libname, pkgname) { # attach hook: emits a welcome message with the protti version
2 | if (.Platform$OS.type == "unix") { # unix-alikes: greeting decorated with emoji escape sequences
3 | packageStartupMessage(
4 | "\U1F469\U1F3FD\U200D\U1F52C Welcome to protti version ",
5 | utils::packageVersion("protti"),
6 | "! \U1F468\U1F3FC\U200D\U1F4BB
7 | \n\U1F52C Have fun analysing your data! \U1F4BB"
8 | )
9 | }
10 | if (.Platform$OS.type == "windows") { # windows: same greeting as plain text, without emoji
11 | packageStartupMessage(
12 | "Welcome to protti version ",
13 | utils::packageVersion("protti"), "!
14 | \nHave fun analysing your data!"
15 | )
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
1 | citHeader("To cite protti in publications, please use:") # header text for the citation entry below
2 |
3 | bibentry(bibtype = "article", # article entry for the protti paper
4 | textVersion = "Quast, J.P., Schuster, D., Picotti, P. (2022). protti: an R package for comprehensive data analysis of peptide- and protein-centric bottom-up proteomics data. Bioinformatics Advances, 2(1).",
5 | author = "Jan-Philipp Quast, Dina Schuster, Paola Picotti",
6 | title = "protti: an R package for comprehensive data analysis of peptide- and protein-centric bottom-up proteomics data",
7 | journal = "Bioinformatics Advances",
8 | year = "2022",
9 | volume = "2",
10 | number = "1", # NOTE(review): trailing comma leaves an empty final argument - confirm this is intended
11 | )
12 |
--------------------------------------------------------------------------------
/man/plot_peptide_profiles.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/peptide_profile_plot.R
3 | \name{plot_peptide_profiles}
4 | \alias{plot_peptide_profiles}
5 | \title{Peptide abundance profile plot}
6 | \usage{
7 | plot_peptide_profiles(...)
8 | }
9 | \value{
10 | A list of peptide profile plots.
11 | }
12 | \description{
13 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}
14 | This function was deprecated due to its name changing to \code{peptide_profile_plot()}.
15 | }
16 | \keyword{internal}
17 |
--------------------------------------------------------------------------------
/man/peptide_type.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/assign_peptide_type.R
3 | \name{peptide_type}
4 | \alias{peptide_type}
5 | \title{Assign peptide type}
6 | \usage{
7 | peptide_type(...)
8 | }
9 | \value{
10 | A data frame that contains the input data and an additional column with the peptide
11 | type information.
12 | }
13 | \description{
14 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}
15 | This function was deprecated due to its name changing to \code{assign_peptide_type()}.
16 | }
17 | \keyword{internal}
18 |
--------------------------------------------------------------------------------
/man/plot_pval_distribution.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/pval_distribution_plot.R
3 | \name{plot_pval_distribution}
4 | \alias{plot_pval_distribution}
5 | \title{Plot histogram of p-value distribution}
6 | \usage{
7 | plot_pval_distribution(...)
8 | }
9 | \value{
10 | A histogram plot that shows the p-value distribution.
11 | }
12 | \description{
13 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}
14 | This function was deprecated due to its name changing to \code{pval_distribution_plot()}.
15 | }
16 | \keyword{internal}
17 |
--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
1 | ## Submission
2 |
3 | * We specifically addressed and fixed the issue raised by Prof. Brian Ripley:
4 | * We updated `try_query()` to also successfully handle errors that are unrelated to the request.
5 |
6 | ## Test environments
7 | * macOS-latest (on GitHub actions), R 4.4.1
8 | * windows-latest (on GitHub actions), R 4.4.1
9 | * ubuntu-20.04 (on GitHub actions), R 4.4.1
10 | * ubuntu-20.04 (on GitHub actions), r-devel
11 | * windows-ix86+x86_64 (win-builder), r-devel
12 | * fedora-clang-devel (R-hub), r-devel
13 | * windows-x86_64-devel (R-hub), r-devel
14 | * Ubuntu Linux 20.04.1 LTS (R-hub), r-release
15 |
16 | ## R CMD check results
17 |
18 | 0 errors ✓ | 0 warnings ✓ | 0 notes ✓
19 |
20 |
--------------------------------------------------------------------------------
/man/sequence_coverage.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/calculate_sequence_coverage.R
3 | \name{sequence_coverage}
4 | \alias{sequence_coverage}
5 | \title{Protein sequence coverage}
6 | \usage{
7 | sequence_coverage(...)
8 | }
9 | \value{
10 | A new column in the \code{data} data frame containing the calculated sequence coverage
11 | for each identified protein
12 | }
13 | \description{
14 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}
15 | This function was deprecated due to its name changing to \code{calculate_sequence_coverage()}.
16 | }
17 | \keyword{internal}
18 |
--------------------------------------------------------------------------------
/man/split_metal_name.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/split_metal_name.R
3 | \name{split_metal_name}
4 | \alias{split_metal_name}
5 | \title{Convert metal names to search pattern}
6 | \usage{
7 | split_metal_name(metal_names)
8 | }
9 | \arguments{
10 | \item{metal_names}{a character vector containing names of metals and metal-containing molecules.}
11 | }
12 | \value{
13 | A character vector with metal name search patterns.
14 | }
15 | \description{
16 | Converts a vector of metal names extracted from the \code{ft_metal} column
17 | obtained with \code{fetch_uniprot} to a pattern that can be used to search for corresponding
18 | ChEBI IDs. This is used as a helper function for other functions.
19 | }
20 |
--------------------------------------------------------------------------------
/man/volcano_protti.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/volcano_plot.R
3 | \name{volcano_protti}
4 | \alias{volcano_protti}
5 | \title{Volcano plot}
6 | \usage{
7 | volcano_protti(...)
8 | }
9 | \value{
10 | Depending on the method used a volcano plot with either highlighted targets
11 | (\code{method = "target"}) or highlighted significant proteins (\code{method = "significant"})
12 | is returned.
13 | }
14 | \description{
15 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}
16 | This function was deprecated due to its name changing to \code{volcano_plot()}.
17 | }
18 | \keyword{internal}
19 |
--------------------------------------------------------------------------------
/man/plot_drc_4p.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/drc_4p_plot.R
3 | \name{plot_drc_4p}
4 | \alias{plot_drc_4p}
5 | \title{Plotting of four-parameter dose response curves}
6 | \usage{
7 | plot_drc_4p(...)
8 | }
9 | \value{
10 | If \code{targets = "all"} a list containing plots for every unique identifier in the
11 | \code{grouping} variable is created. Otherwise a plot for the specified targets is created with
12 | maximally 20 facets.
13 | }
14 | \description{
15 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}
16 | This function was deprecated due to its name changing to \code{drc_4p_plot()}.
17 | }
18 | \keyword{internal}
19 |
--------------------------------------------------------------------------------
/man/median_normalisation.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/normalise.R
3 | \name{median_normalisation}
4 | \alias{median_normalisation}
5 | \title{Intensity normalisation}
6 | \usage{
7 | median_normalisation(...)
8 | }
9 | \value{
10 | A data frame with a column called \code{normalised_intensity_log2} containing the
11 | normalised intensity values.
12 | }
13 | \description{
14 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}
15 | This function was deprecated due to its name changing to \code{normalise()}.
16 | The normalisation method in the new function needs to be provided as an argument.
17 | }
18 | \keyword{internal}
19 |
--------------------------------------------------------------------------------
/man/kegg_enrichment.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/calculate_kegg_enrichment.R
3 | \name{kegg_enrichment}
4 | \alias{kegg_enrichment}
5 | \title{Perform KEGG pathway enrichment analysis}
6 | \usage{
7 | kegg_enrichment(...)
8 | }
9 | \value{
10 | A bar plot displaying negative log10 adjusted p-values for the top 10 enriched pathways.
11 | Bars are coloured according to the direction of the enrichment. If \code{plot = FALSE}, a data
12 | frame is returned.
13 | }
14 | \description{
15 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}
16 | This function was deprecated due to its name changing to \code{calculate_kegg_enrichment()}.
17 | }
18 | \keyword{internal}
19 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 | .Rapp.history
4 |
5 | # Session Data files
6 | .RData
7 |
8 | # User-specific files
9 | .Ruserdata
10 |
11 | # Example code in package build process
12 | *-Ex.R
13 |
14 | # Output files from R CMD build
15 | /*.tar.gz
16 |
17 | # Output files from R CMD check
18 | /*.Rcheck/
19 |
20 | # RStudio files
21 | .Rproj.user/
22 |
23 | # produced vignettes
24 | vignettes/*.html
25 | vignettes/*.pdf
26 |
27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
28 | .httr-oauth
29 |
30 | # knitr and R markdown default cache directories
31 | *_cache/
32 | /cache/
33 |
34 | # Temporary files created by R markdown
35 | *.utf8.md
36 | *.knit.md
37 |
38 | # R Environment Variables
39 | .Renviron
40 |
41 | .DS_Store
42 | inst/doc
43 | doc
44 | Meta
45 | docs
46 | /doc/
47 | /Meta/
48 |
--------------------------------------------------------------------------------
/man/network_analysis.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/analyse_functional_network.R
3 | \name{network_analysis}
4 | \alias{network_analysis}
5 | \title{Analyse protein interaction network for significant hits}
6 | \usage{
7 | network_analysis(...)
8 | }
9 | \value{
10 | A network plot displaying interactions of the provided proteins. If
11 | \code{binds_treatment} was provided halos around the proteins show which proteins interact with
12 | the treatment. If \code{plot = FALSE} a data frame with interaction information is returned.
13 | }
14 | \description{
15 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}
16 | This function was deprecated due to its name changing to \code{analyse_functional_network()}.
17 | }
18 | \keyword{internal}
19 |
--------------------------------------------------------------------------------
/man/fetch_go.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fetch_go.R
3 | \name{fetch_go}
4 | \alias{fetch_go}
5 | \title{Fetch gene ontology information from geneontology.org}
6 | \usage{
7 | fetch_go(organism_id)
8 | }
9 | \arguments{
10 | \item{organism_id}{a character value that specifies the NCBI taxonomy identifier of an organism (TaxId).
11 | Possible inputs include only: "9606" (Human), "559292" (Yeast) and "83333" (E. coli).}
12 | }
13 | \value{
14 | A data frame that contains gene ontology mappings to UniProt or SGD IDs. The original
15 | file is a .GAF file. A detailed description of all columns can be found here:
16 | http://geneontology.org/docs/go-annotation-file-gaf-format-2.1/
17 | }
18 | \description{
19 | Fetches gene ontology data from geneontology.org for the provided organism ID.
20 | }
21 | \examples{
22 | \donttest{
23 | go <- fetch_go("9606")
24 |
25 | head(go)
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/man/metal_chebi_uniprot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{metal_chebi_uniprot}
5 | \alias{metal_chebi_uniprot}
6 | \title{List of metal-related ChEBI IDs in UniProt}
7 | \format{
8 | A data.frame containing information retrieved from ChEBI using \code{fetch_chebi(stars = c(2, 3))},
9 | filtered using symbols in the \code{metal_list} and manual annotation of metal related ChEBI IDs that do not
10 | contain a formula.
11 | }
12 | \source{
13 | UniProt (cc_cofactor, cc_catalytic_activity, ft_binding) and ChEBI
14 | }
15 | \usage{
16 | metal_chebi_uniprot
17 | }
18 | \description{
19 | A list that contains all ChEBI IDs that appear in UniProt and that contain either a metal atom
20 | in their formula or that do not have a formula but the ChEBI term is related to metals.
21 | This was last updated on the 19/02/24.
22 | }
23 | \keyword{datasets}
24 |
--------------------------------------------------------------------------------
/man/replace_identified_by_x.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/replace_identified_by_x.R
3 | \name{replace_identified_by_x}
4 | \alias{replace_identified_by_x}
5 | \title{Replace identified positions in protein sequence by "x"}
6 | \usage{
7 | replace_identified_by_x(sequence, positions_start, positions_end)
8 | }
9 | \arguments{
10 | \item{sequence}{a character value that contains the protein sequence.}
11 |
12 | \item{positions_start}{a numeric vector of start positions of the identified peptides.}
13 |
14 | \item{positions_end}{a numeric vector of end positions of the identified peptides.}
15 | }
16 | \value{
17 | A character vector that contains the modified protein sequence with each identified
18 | position replaced by "x".
19 | }
20 | \description{
21 | Helper function for the calculation of sequence coverage, replaces identified positions with an
22 | "x" within the protein sequence.
23 | }
24 |
--------------------------------------------------------------------------------
/man/figures/lifecycle-stable.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/man/figures/lifecycle-defunct.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/man/metal_go_slim_subset.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{metal_go_slim_subset}
5 | \alias{metal_go_slim_subset}
6 | \title{Molecular function gene ontology metal subset}
7 | \format{
8 | A data.frame containing a slim subset of molecular function gene ontology terms
9 | that are related to metal binding. The \code{slims_from_id} column contains all IDs relevant
10 | in this subset while the \code{slims_to_ids} column contains the starting IDs. If ChEBI IDs
11 | have been annotated manually this is indicated in the \code{database} column.
12 | }
13 | \source{
14 | QuickGO and ChEBI
15 | }
16 | \usage{
17 | metal_go_slim_subset
18 | }
19 | \description{
20 | A subset of molecular function gene ontology terms related to metals that was created
21 | using the slimming process provided by the QuickGO EBI database.
22 | This was last updated on the 19/02/24.
23 | }
24 | \keyword{datasets}
25 |
--------------------------------------------------------------------------------
/man/figures/lifecycle-archived.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/man/figures/lifecycle-maturing.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/man/figures/lifecycle-deprecated.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/man/figures/lifecycle-superseded.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/man/figures/lifecycle-experimental.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/man/figures/lifecycle-questioning.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/man/fetch_kegg.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fetch_kegg.R
3 | \name{fetch_kegg}
4 | \alias{fetch_kegg}
5 | \title{Fetch KEGG pathway data from KEGG}
6 | \usage{
7 | fetch_kegg(species)
8 | }
9 | \arguments{
10 | \item{species}{a character value providing an abbreviated species name. "hsa" for human, "eco"
11 | for E. coli and "sce" for S. cerevisiae. Additional possible names can be found for
12 | \href{https://www.genome.jp/kegg-bin/show_organism?category=Eukaryotes}{eukaryotes} and for
13 | \href{https://www.genome.jp/kegg-bin/show_organism?category=Prokaryotes}{prokaryotes}.}
14 | }
15 | \value{
16 | A data frame that contains gene IDs with corresponding pathway IDs and names for a
17 | selected organism.
18 | }
19 | \description{
20 | Fetches gene IDs and corresponding pathway IDs and names for the provided organism.
21 | }
22 | \examples{
23 | \donttest{
24 | kegg <- fetch_kegg(species = "hsa")
25 |
26 | head(kegg)
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/man/treatment_enrichment.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/calculate_treatment_enrichment.R
3 | \name{treatment_enrichment}
4 | \alias{treatment_enrichment}
5 | \title{Check treatment enrichment}
6 | \usage{
7 | treatment_enrichment(...)
8 | }
9 | \value{
10 | A bar plot displaying the percentage of all detected proteins and all significant proteins
11 | that bind to the treatment. A Fisher's exact test is performed to calculate the significance of
12 | the enrichment in significant proteins compared to all proteins. The result is reported as a
13 | p-value. If \code{plot = FALSE} a contingency table in long format is returned.
14 | }
15 | \description{
16 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}
17 | This function was deprecated due to its name changing to \code{calculate_treatment_enrichment()}.
18 | }
19 | \keyword{internal}
20 |
--------------------------------------------------------------------------------
/man/read_protti.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/read_protti.R
3 | \name{read_protti}
4 | \alias{read_protti}
5 | \title{Read, clean and convert}
6 | \usage{
7 | read_protti(filename, ...)
8 | }
9 | \arguments{
10 | \item{filename}{a character value that specifies the path to the file.}
11 |
12 | \item{...}{additional arguments for the fread function.}
13 | }
14 | \value{
15 | A data frame (with class tibble) that contains the content of the specified file.
16 | }
17 | \description{
18 | The function uses the very fast \code{fread} function from the \code{data.table} package. The
19 | column names of the resulting data table are made more r-friendly using \code{clean_names} from
20 | the \code{janitor} package. It replaces "." and " " with "_" and converts names to lower case
21 | which is also known as snake_case. In the end the data table is converted to a tibble.
22 | }
23 | \examples{
24 | \dontrun{
25 | read_protti("folder\\\\filename")
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/man/go_enrichment.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/calculate_go_enrichment.R
3 | \name{go_enrichment}
4 | \alias{go_enrichment}
5 | \title{Perform gene ontology enrichment analysis}
6 | \usage{
7 | go_enrichment(...)
8 | }
9 | \value{
10 | A bar plot displaying negative log10 adjusted p-values for the top 10 enriched or
11 | depleted gene ontology terms. Alternatively, plot cutoffs can be chosen individually with the
12 | \code{plot_cutoff} argument. Bars are colored according to the direction of the enrichment
13 | (enriched or deenriched). If \code{plot = FALSE}, a data frame is returned. P-values are
14 | adjusted with Benjamini-Hochberg.
15 | }
16 | \description{
17 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}
18 | This function was deprecated due to its name changing to \code{calculate_go_enrichment()}.
19 | }
20 | \keyword{internal}
21 |
--------------------------------------------------------------------------------
/man/rapamycin_10uM.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{rapamycin_10uM}
5 | \alias{rapamycin_10uM}
6 | \title{Rapamycin 10 uM example data}
7 | \format{
8 | A data frame containing peptide level data from a Spectronaut report.
9 | }
10 | \source{
11 | Piazza, I., Beaton, N., Bruderer, R. et al. A machine learning-based chemoproteomic
12 | approach to identify drug targets and binding sites in complex proteomes. Nat Commun 11, 4200
13 | (2020). \doi{10.1038/s41467-020-18071-x}
14 | }
15 | \usage{
16 | rapamycin_10uM
17 | }
18 | \description{
19 | Rapamycin example data used for the vignette about binary control/treated data. The data was
20 | obtained from \href{https://www.nature.com/articles/s41467-020-18071-x}{Piazza 2020}
21 | and corresponds to experiment 18. FKBP1A the rapamycin binding protein and 49 other randomly
22 | sampled proteins were used for this example dataset. Furthermore, only the DMSO control and the
23 | 10 uM condition were used.
24 | }
25 | \keyword{datasets}
26 |
--------------------------------------------------------------------------------
/R/read_protti.R:
--------------------------------------------------------------------------------
1 | #' Read, clean and convert
2 | #'
3 | #' The function uses the very fast \code{fread} function from the \code{data.table} package. The
4 | #' column names of the resulting data table are made more r-friendly using \code{clean_names} from
5 | #' the \code{janitor} package. It replaces "." and " " with "_" and converts names to lower case
6 | #' which is also known as snake_case. In the end the data table is converted to a tibble.
7 | #'
8 | #' @param filename a character value that specifies the path to the file.
9 | #' @param ... additional arguments for the fread function.
10 | #'
11 | #' @importFrom data.table fread
12 | #' @importFrom janitor clean_names
13 | #' @importFrom magrittr %>%
14 | #'
15 | #' @return A data frame (with class tibble) that contains the content of the specified file.
16 | #' @export
17 | #'
18 | #' @examples
19 | #' \dontrun{
20 | #' read_protti("folder\\filename")
21 | #' }
22 | read_protti <- function(filename, ...) {
23 |   # Parse the file with data.table's reader, forwarding all extra arguments.
24 |   raw_table <- data.table::fread(filename, ...)
25 |   # Clean the column names with janitor, then hand the result back as a tibble.
26 |   tibble::as_tibble(janitor::clean_names(raw_table))
27 | }
28 |
--------------------------------------------------------------------------------
/man/scale_protti.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/scale_protti.R
3 | \name{scale_protti}
4 | \alias{scale_protti}
5 | \title{Scaling a vector}
6 | \usage{
7 | scale_protti(x, method)
8 | }
9 | \arguments{
10 | \item{x}{a numeric vector}
11 |
12 | \item{method}{a character value that specifies the method to be used for scaling. "01" scales
13 | the vector between 0 and 1. "center" scales the vector equal to \code{base::scale} around a
14 | center. This is done by subtracting the mean from every value and then dividing them by the
15 | standard deviation.}
16 | }
17 | \value{
18 | A scaled numeric vector.
19 | }
20 | \description{
21 | \code{scale_protti} is used to scale a numeric vector either between 0 and 1 or around a
22 | centered value using the standard deviation. If a vector containing only one value or
23 | repeatedly the same value is provided, 1 is returned as the scaled value for \code{method = "01"}
24 | and 0 is returned for \code{method = "center"}.
25 | }
26 | \examples{
27 | scale_protti(c(1, 2, 1, 4, 6, 8), method = "01")
28 | }
29 |
--------------------------------------------------------------------------------
/man/rapamycin_dose_response.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{rapamycin_dose_response}
5 | \alias{rapamycin_dose_response}
6 | \title{Rapamycin dose response example data}
7 | \format{
8 | A data frame containing peptide level data from a Spectronaut report.
9 | }
10 | \source{
11 | Piazza, I., Beaton, N., Bruderer, R. et al. A machine learning-based chemoproteomic
12 | approach to identify drug targets and binding sites in complex proteomes. Nat Commun 11, 4200
13 | (2020). \doi{10.1038/s41467-020-18071-x}
14 | }
15 | \usage{
16 | rapamycin_dose_response
17 | }
18 | \description{
19 | Rapamycin example data used for the vignette about dose response data. The data was obtained
20 | from \href{https://www.nature.com/articles/s41467-020-18071-x}{Piazza 2020} and corresponds
21 | to experiment 18. FKBP1A the rapamycin binding protein and 39 other randomly sampled proteins
22 | were used for this example dataset. The concentration range includes the following points:
23 | 0 (DMSO control), 10 pM, 100 pM, 1 nM, 10 nM, 100 nM, 1 uM, 10 uM and 100 uM.
24 | }
25 | \keyword{datasets}
26 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | Copyright (c) 2021 ETH Zurich, Jan-Philipp Quast, Dina Schuster
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/man/fetch_chebi.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fetch_chebi.R
3 | \name{fetch_chebi}
4 | \alias{fetch_chebi}
5 | \title{Fetch ChEBI database information}
6 | \usage{
7 | fetch_chebi(relation = FALSE, stars = c(3), timeout = 60)
8 | }
9 | \arguments{
10 | \item{relation}{a logical value that indicates if ChEBI Ontology data will be returned instead
11 | of the main compound data. This data can be used to check the relations of ChEBI IDs to each other.
12 | Default is FALSE.}
13 |
14 | \item{stars}{a numeric vector indicating the "star" level (confidence) for which entries should
15 | be retrieved (Possible levels are 1, 2 and 3). Default is \code{c(3)} retrieving only "3-star"
16 | entries, which are manually annotated by the ChEBI curator team.}
17 |
18 | \item{timeout}{a numeric value specifying the time in seconds until the download of an organism
19 | archive times out. The default is 60 seconds.}
20 | }
21 | \value{
22 | A data frame that contains information about each molecule in the ChEBI database.
23 | }
24 | \description{
25 | Fetches information from the ChEBI database.
26 | }
27 | \examples{
28 | \donttest{
29 | chebi <- fetch_chebi()
30 |
31 | head(chebi)
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/data-raw/rapamycin_10uM.R:
--------------------------------------------------------------------------------
1 | # library(tidyverse)
2 | # library(protti)
3 | #
4 | # set.seed(1234)
5 | #
6 | # # Source: Piazza, I., Beaton, N., Bruderer, R. et al. A machine learning-based chemoproteomic approach to identify drug targets and binding sites in complex proteomes. Nat Commun 11, 4200 (2020). https://doi.org/10.1038/s41467-020-18071-x
7 | #
8 | # rapa <- read_protti("rapamycin_dose_response.csv")
9 | #
10 | # # filter to only retain DMSO control and 10 uM concentration
11 | #
12 | # rapa_filtered <- rapa %>%
13 | # distinct(r_file_name, r_condition, pep_stripped_sequence, eg_precursor_id, pg_protein_accessions, fg_quantity, pep_is_proteotypic, eg_is_decoy) %>%
14 | # filter(r_condition == 0 | r_condition == 7) %>%
15 | # mutate(r_condition = ifelse(r_condition == 0, "control", "rapamycin")) %>%
16 | # mutate(r_file_name = paste0(r_condition, "_", str_sub(r_file_name, start = 35, end = 36)))
17 | #
18 | # all_proteins <- unique(rapa_filtered$pg_protein_accessions)
19 | #
20 | # all_proteins_wo_FKBP1A <- all_proteins[all_proteins != "P62942"]
21 | #
22 | # sampled_bg <- sample(all_proteins_wo_FKBP1A, size = 49)
23 | #
24 | # rapamycin_10uM <- rapa_filtered %>%
25 | # filter(pg_protein_accessions %in% c(sampled_bg, "P62942"))
26 | #
27 | # usethis::use_data(rapamycin_10uM, overwrite = TRUE)
28 |
--------------------------------------------------------------------------------
/man/calculate_sequence_coverage.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/calculate_sequence_coverage.R
3 | \name{calculate_sequence_coverage}
4 | \alias{calculate_sequence_coverage}
5 | \title{Protein sequence coverage}
6 | \usage{
7 | calculate_sequence_coverage(data, protein_sequence, peptides)
8 | }
9 | \arguments{
10 | \item{data}{a data frame containing at least the protein sequence and the identified peptides
11 | as columns.}
12 |
13 | \item{protein_sequence}{a character column in the \code{data} data frame that contains protein
14 | sequences. Can be obtained by using the function \code{fetch_uniprot()}}
15 |
16 | \item{peptides}{a character column in the \code{data} data frame that contains the identified
17 | peptides.}
18 | }
19 | \value{
20 | A new column in the \code{data} data frame containing the calculated sequence coverage
21 | for each identified protein
22 | }
23 | \description{
24 | Calculate sequence coverage for each identified protein.
25 | }
26 | \examples{
27 | data <- data.frame(
28 | protein_sequence = c("abcdefghijklmnop", "abcdefghijklmnop"),
29 | pep_stripped_sequence = c("abc", "jklmn")
30 | )
31 |
32 | calculate_sequence_coverage(
33 | data,
34 | protein_sequence = protein_sequence,
35 | peptides = pep_stripped_sequence
36 | )
37 | }
38 |
--------------------------------------------------------------------------------
/man/find_chebis.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/find_chebis.R
3 | \name{find_chebis}
4 | \alias{find_chebis}
5 | \title{Find ChEBI IDs for name patterns}
6 | \usage{
7 | find_chebis(chebi_data, pattern)
8 | }
9 | \arguments{
10 | \item{chebi_data}{a data frame that contains at least information on ChEBI IDs (id) and their
11 | names (name). This data frame can be obtained by calling \code{fetch_chebi()}. Ideally this
12 | should be subsetted to only contain molecules of a specific type e.g. metals. This can be
13 | achieved by calling \code{find_all_subs} with a general ID such as "25213" (Metal cation) and
14 | then subset the complete ChEBI database to only include the returned sub-IDs. Using a subsetted
15 | database ensures better search results. This is a helper function for other functions.}
16 |
17 | \item{pattern}{a character vector that contains names or name patterns of molecules. Name
18 | patterns can be for example obtained with the \code{split_metal_name} function.}
19 | }
20 | \value{
21 | A list of character vectors containing ChEBI IDs that have a name matching the supplied
22 | pattern. It contains one element per pattern.
23 | }
24 | \description{
25 | Search for chebi IDs that match a specific name pattern. A list of corresponding ChEBI IDs is
26 | returned.
27 | }
28 |
--------------------------------------------------------------------------------
/man/find_peptide.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/find_peptide.R
3 | \name{find_peptide}
4 | \alias{find_peptide}
5 | \title{Find peptide location}
6 | \usage{
7 | find_peptide(data, protein_sequence, peptide_sequence)
8 | }
9 | \arguments{
10 | \item{data}{a data frame that contains at least the protein and peptide sequence.}
11 |
12 | \item{protein_sequence}{a character column in the \code{data} data frame that contains the
13 | protein sequence.}
14 |
15 | \item{peptide_sequence}{a character column in the \code{data} data frame that contains the
16 | peptide sequence.}
17 | }
18 | \value{
19 | A data frame that contains the input data and four additional columns with peptide
20 | start and end position, the last amino acid and the amino acid before the peptide.
21 | }
22 | \description{
23 | The position of the given peptide sequence is searched within the given protein sequence. In
24 | addition the last amino acid of the peptide and the amino acid right before are reported.
25 | }
26 | \examples{
27 | # Create example data
28 | data <- data.frame(
29 | protein_sequence = c("abcdefg"),
30 | peptide_sequence = c("cde")
31 | )
32 |
33 | # Find peptide
34 | find_peptide(
35 | data = data,
36 | protein_sequence = protein_sequence,
37 | peptide_sequence = peptide_sequence
38 | )
39 | }
40 |
--------------------------------------------------------------------------------
/man/drc_4p.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/drc_4p.R
3 | \name{drc_4p}
4 | \alias{drc_4p}
5 | \title{Dose response curve helper function}
6 | \usage{
7 | drc_4p(data, response, dose, log_logarithmic = TRUE, pb = NULL)
8 | }
9 | \arguments{
10 | \item{data}{a data frame that contains at least the dose and response column the model should
11 | be fitted to.}
12 |
13 | \item{response}{a numeric column that contains the response values.}
14 |
15 | \item{dose}{a numeric column that contains the dose values.}
16 |
17 | \item{log_logarithmic}{a logical value indicating if a logarithmic or log-logarithmic model is
18 | fitted. If response values form a symmetric curve for non-log transformed dose values, a
19 | logarithmic model instead of a log-logarithmic model should be used. Usually biological dose
20 | response data has a log-logarithmic distribution, which is the reason this is the default.
21 | Log-logarithmic models are symmetric if dose values are log transformed.}
22 |
23 | \item{pb}{progress bar object. This is only necessary if the function is used in an iteration.}
24 | }
25 | \value{
26 | An object of class \code{drc}. If no fit was performed a character vector with content
27 | "no_fit".
28 | }
29 | \description{
30 | This function performs the four-parameter dose response curve fit. It is the helper function
31 | for the fit in the \code{fit_drc_4p} function.
32 | }
33 |
--------------------------------------------------------------------------------
/man/ttest_protti.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/ttest_protti.R
3 | \name{ttest_protti}
4 | \alias{ttest_protti}
5 | \title{Perform Welch's t-test}
6 | \usage{
7 | ttest_protti(mean1, mean2, sd1, sd2, n1, n2, log_values = TRUE)
8 | }
9 | \arguments{
10 | \item{mean1}{a numeric vector that contains the means of group1.}
11 |
12 | \item{mean2}{a numeric vector that contains the means of group2.}
13 |
14 | \item{sd1}{a numeric vector that contains the standard deviations of group1.}
15 |
16 | \item{sd2}{a numeric vector that contains the standard deviations of group2.}
17 |
18 | \item{n1}{a numeric vector that contains the number of replicates used for the calculation of
19 | each mean and standard deviation of group1.}
20 |
21 | \item{n2}{a numeric vector that contains the number of replicates used for the calculation of
22 | each mean and standard deviation of group2.}
23 |
24 | \item{log_values}{a logical value that indicates if values are log transformed. This determines
25 | how fold changes are calculated. Default is \code{log_values = TRUE}.}
26 | }
27 | \value{
28 | A data frame that contains the calculated differences of means, standard error, t
29 | statistic and p-values.
30 | }
31 | \description{
32 | Performs a Welch's t-test and calculates p-values between two groups.
33 | }
34 | \examples{
35 | ttest_protti(
36 | mean1 = 10,
37 | mean2 = 15.5,
38 | sd1 = 1,
39 | sd2 = 0.5,
40 | n1 = 3,
41 | n2 = 3
42 | )
43 | }
44 |
--------------------------------------------------------------------------------
/man/normalise.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/normalise.R
3 | \name{normalise}
4 | \alias{normalise}
5 | \title{Intensity normalisation}
6 | \usage{
7 | normalise(data, sample, intensity_log2, method = "median")
8 | }
9 | \arguments{
10 | \item{data}{a data frame containing at least sample names and intensity values. Please note that if the
11 | data frame is grouped, the normalisation will be computed by group.}
12 |
13 | \item{sample}{a character column in the \code{data} data frame that contains the sample names.}
14 |
15 | \item{intensity_log2}{a numeric column in the \code{data} data frame that contains the log2 transformed
16 | intensity values to be normalised.}
17 |
18 | \item{method}{a character value specifying the method to be used for normalisation. Default
19 | is "median".}
20 | }
21 | \value{
22 | A data frame with a column called \code{normalised_intensity_log2} containing the
23 | normalised intensity values.
24 | }
25 | \description{
26 | Performs normalisation on intensities. For median normalisation the normalised intensity is the
27 | original intensity minus the run median plus the global median. This is also the way it is
28 | implemented in the Spectronaut search engine.
29 | }
30 | \examples{
31 | data <- data.frame(
32 | r_file_name = c("s1", "s2", "s3", "s1", "s2", "s3"),
33 | intensity_log2 = c(18, 19, 17, 20, 21, 19)
34 | )
35 |
36 | normalise(data,
37 | sample = r_file_name,
38 | intensity_log2 = intensity_log2,
39 | method = "median"
40 | )
41 | }
42 |
--------------------------------------------------------------------------------
/data-raw/rapamycin_dose_response.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 | library(protti)
3 | # Fix the seed so that the random background protein sample below is reproducible.
4 | set.seed(123)
5 |
6 | # Source: Piazza, I., Beaton, N., Bruderer, R. et al. A machine learning-based chemoproteomic approach to identify drug targets and binding sites in complex proteomes. Nat Commun 11, 4200 (2020). https://doi.org/10.1038/s41467-020-18071-x
7 |
8 | rapa <- read_protti("rapamycin_dose_response.csv")
9 |
10 | # Filter to only contain necessary columns. Simplify file names. Annotate conditions with concentrations in pM.
11 |
12 | rapa_filtered <- rapa %>%
13 |   distinct(r_file_name, r_condition, eg_precursor_id, pg_protein_accessions, fg_quantity, pep_is_proteotypic, eg_is_decoy) %>%
14 |   mutate(r_file_name = paste0("sample_", str_sub(r_file_name, start = 35, end = 36))) %>%
15 |   mutate(r_condition = case_when(
16 |     r_condition == 0 ~ 0,
17 |     r_condition == 1 ~ 10,
18 |     r_condition == 2 ~ 100,
19 |     r_condition == 3 ~ 1000,
20 |     r_condition == 4 ~ 10000,
21 |     r_condition == 5 ~ 100000,
22 |     r_condition == 6 ~ 1000000,
23 |     r_condition == 7 ~ 10000000,
24 |     r_condition == 8 ~ 100000000,
25 |   ))
26 |
27 | all_proteins <- unique(rapa_filtered$pg_protein_accessions)
28 |
29 | all_proteins_wo_FKBP1A <- all_proteins[all_proteins != "P62942"]
30 | # Draw 39 random background proteins; FKBP1A (P62942) is added back below.
31 | sampled_bg <- sample(all_proteins_wo_FKBP1A, size = 39)
32 |
33 | rapamycin_dose_response <- rapa_filtered %>%
34 |   filter(pg_protein_accessions %in% c(sampled_bg, "P62942"))
35 |
36 | usethis::use_data(rapamycin_dose_response, overwrite = TRUE)
37 |
--------------------------------------------------------------------------------
/R/scale_protti.R:
--------------------------------------------------------------------------------
1 | #' Scaling a vector
2 | #'
3 | #' \code{scale_protti} is used to scale a numeric vector either between 0 and 1 or around a
4 | #' centered value using the standard deviation. If a vector containing only one value or
5 | #' repeatedly the same value is provided, 1 is returned as the scaled value for \code{method = "01"}
6 | #' and 0 is returned for \code{method = "center"}.
7 | #'
8 | #' @param x a numeric vector
9 | #' @param method a character value that specifies the method to be used for scaling. "01" scales
10 | #' the vector between 0 and 1. "center" scales the vector equal to \code{base::scale} around a
11 | #' center. This is done by subtracting the mean from every value and then dividing them by the
12 | #' standard deviation.
13 | #'
14 | #' @return A scaled numeric vector.
15 | #' @export
16 | #'
17 | #' @examples
18 | #' scale_protti(c(1, 2, 1, 4, 6, 8), method = "01")
19 | scale_protti <- function(x, method) {
20 |   if (!is.numeric(x)) {
21 |     stop("x is a ", typeof(x), " vector but needs to be a numeric vector!")
22 |   }
23 |   # Fail early with a clear message instead of "object 'result' not found".
24 |   if (!isTRUE(method %in% c("01", "center"))) {
25 |     stop("method needs to be either \"01\" or \"center\"!")
26 |   }
27 |   if (method == "01") {
28 |     bounds <- range(x, na.rm = TRUE)
29 |     span <- bounds[2] - bounds[1]
30 |     # A vector without any spread is scaled to 1 by convention.
31 |     return(if (span == 0) rep(1, length(x)) else (x - bounds[1]) / span)
32 |   }
33 |   spread <- stats::sd(x, na.rm = TRUE)
34 |   # sd() is NA for a single observed value, which made a plain `sd == 0` check
35 |   # fail with an error; a single value or a constant vector is scaled to 0.
36 |   if (sum(!is.na(x)) == 1 || isTRUE(spread == 0)) return(rep(0, length(x)))
37 |   (x - mean(x, na.rm = TRUE)) / spread
38 | }
39 |
--------------------------------------------------------------------------------
/R/replace_identified_by_x.R:
--------------------------------------------------------------------------------
#' Replace identified positions in protein sequence by "x"
#'
#' Helper function for the calculation of sequence coverage, replaces identified positions with an
#' "x" within the protein sequence.
#'
#' @param sequence a character value that contains the protein sequence. If a vector is provided
#' it needs to contain only one unique sequence.
#' @param positions_start a numeric vector of start positions of the identified peptides.
#' @param positions_end a numeric vector of end positions of the identified peptides.
#'
#' @return A character value that contains the modified protein sequence with each identified
#' position replaced by "x". If no valid (non-missing) positions are provided the sequence is
#' returned unchanged. \code{NA} is returned if the sequence is empty or missing.
#' @importFrom purrr map2
#' @importFrom stringr str_sub
replace_identified_by_x <-
  function(sequence, positions_start, positions_end) {
    sequence <- unique(sequence)
    if (length(sequence) != 1) {
      stop("sequence needs to contain exactly one unique protein sequence!")
    }
    # is.na() is checked first so that the scalar || never evaluates NA == "".
    if (is.na(sequence) || sequence == "") {
      return(NA)
    }
    # Only keep peptides for which both the start and the end position are known.
    keep <- !is.na(positions_start) & !is.na(positions_end)
    positions_start <- positions_start[keep]
    positions_end <- positions_end[keep]

    # Mask one peptide after the other. If no positions are left the loop does not run
    # and the unmodified sequence is returned.
    for (i in seq_along(positions_start)) {
      n_residues <- positions_end[i] - positions_start[i] + 1
      stringr::str_sub(
        sequence,
        start = positions_start[i],
        end = positions_end[i]
      ) <- strrep("x", n_residues)
    }
    sequence
  }
35 |
--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
# Runs the test suite with covr and uploads the coverage report to Codecov
# on every push and pull request targeting main/master.
on:
  push:
    branches:
      - main
      - master
  pull_request:
    branches:
      - main
      - master

name: test-coverage

jobs:
  test-coverage:
    runs-on: macOS-latest
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    steps:
      # checkout@v2 runs on a retired Node runtime; v4 matches the pkgdown workflow.
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-r@v2

      - uses: r-lib/actions/setup-pandoc@v2

      # Writes the dependency list and the R version to files that are hashed for the cache key.
      - name: Query dependencies
        run: |
          install.packages('remotes')
          saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
          writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
        shell: Rscript {0}

      # actions/cache v1-v2 have been retired by GitHub and fail at runtime; use v4.
      - name: Cache R packages
        uses: actions/cache@v4
        with:
          path: ${{ env.R_LIBS_USER }}
          key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
          restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-

      - name: Install dependencies
        run: |
          install.packages(c("remotes"))
          remotes::install_deps(dependencies = TRUE)
          remotes::install_cran("covr")
        shell: Rscript {0}

      - name: Test coverage
        env:
          TEST_PROTTI: true
          BUILD_VIGNETTE: true
        run: covr::codecov()
        shell: Rscript {0}
52 |
--------------------------------------------------------------------------------
/man/pval_distribution_plot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/pval_distribution_plot.R
3 | \name{pval_distribution_plot}
4 | \alias{pval_distribution_plot}
5 | \title{Plot histogram of p-value distribution}
6 | \usage{
7 | pval_distribution_plot(data, grouping, pval, facet_by = NULL)
8 | }
9 | \arguments{
10 | \item{data}{a data frame that contains at least grouping identifiers (precursor, peptide or
11 | protein) and p-values derived from any statistical test.}
12 |
13 | \item{grouping}{a character column in the \code{data} data frame that contains either precursor,
14 | peptide or protein identifiers. For each entry in this column there should be one unique p-value.
15 | That means the statistical test that created the p-value should have been performed on the
16 | level of the content of this column.}
17 |
18 | \item{pval}{a numeric column in the \code{data} data frame that contains p-values.}
19 |
20 | \item{facet_by}{optional, a character column that contains information by which the data should
21 | be faceted into multiple plots.}
22 | }
23 | \value{
24 | A histogram plot that shows the p-value distribution.
25 | }
26 | \description{
27 | Plots the distribution of p-values derived from any statistical test as a histogram.
28 | }
29 | \examples{
30 | set.seed(123) # Makes example reproducible
31 |
32 | # Create example data
33 | data <- data.frame(
34 | peptide = paste0("peptide", 1:1000),
35 | pval = runif(n = 1000)
36 | )
37 |
38 | # Plot p-values
39 | pval_distribution_plot(
40 | data = data,
41 | grouping = peptide,
42 | pval = pval
43 | )
44 | }
45 |
--------------------------------------------------------------------------------
/tests/testthat/test-queue_functions.R:
--------------------------------------------------------------------------------
# Tests for create_queue() and randomise_queue().
# context() is deprecated in testthat (the file name is used instead) and expect_is() has been
# superseded by the more specific expect_s3_class().

# Queue that is shared by both tests below.
queue <- create_queue(
  date = c("200722"),
  instrument = c("EX1"),
  user = c("username"),
  measurement_type = c("DIA"),
  experiment_name = c("N01"),
  digestion = c("LiP", "tryptic control"),
  treatment_type_1 = c("EDTA", "H2O"),
  treatment_type_2 = c("Zeba", "unfiltered"),
  treatment_dose_1 = c(10, 30, 60),
  treatment_unit_1 = c("min"),
  n_replicates = 4,
  number_runs = FALSE,
  organism = c("E. coli"),
  exclude_combinations = list(list(
    treatment_type_1 = c("H2O"),
    treatment_type_2 = c("Zeba", "unfiltered"),
    treatment_dose_1 = c(10, 30)
  )),
  inj_vol = c(2),
  data_path = "D:\\2007_Data",
  method_path = "C:\\Xcalibur\\methods\\username\\DIA_120min_41var_AGC200",
  position_row = c("A", "B", "C", "D", "E", "F"),
  position_column = 8,
  blank_every_n = 4,
  blank_position = "1-V1",
  blank_method_path = "C:\\Xcalibur\\methods\\blank",
  export = FALSE
)

test_that("create_queue works", {
  expect_s3_class(queue, "data.frame")
  expect_equal(ncol(queue), 21)
  expect_equal(nrow(queue), 80)
})

test_that("randomise_queue works", {
  # The seed makes the randomised order of the selected rows reproducible.
  set.seed(123)
  randomised_queue <- randomise_queue(data = queue, rows = 71:80)
  expect_s3_class(randomised_queue, "data.frame")
  expect_equal(ncol(randomised_queue), 21)
  expect_equal(nrow(randomised_queue), 80)
  expect_equal(
    randomised_queue$Position[71:80],
    c("1-V1", "B8", "B5", "B3", "B6", "1-V1", "B7", "B4", "B1", "B2")
  )
})
47 |
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
# Builds the pkgdown site and deploys it to the gh-pages branch on every push to main/master.
on:
  push:
    branches:
      - main
      - master

name: pkgdown

jobs:
  pkgdown:
    runs-on: macOS-latest
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
      TEST_PROTTI: true
      BUILD_VIGNETTE: true
    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-r@v2

      - uses: r-lib/actions/setup-pandoc@v2

      # Writes the dependency list and the R version to files that are hashed for the cache key.
      - name: Query dependencies
        run: |
          install.packages('remotes')
          saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
          writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
        shell: Rscript {0}

      # actions/cache v1-v2 have been retired by GitHub and fail at runtime; use v4.
      - name: Cache R packages
        uses: actions/cache@v4
        with:
          path: ${{ env.R_LIBS_USER }}
          key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
          restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-

      - name: Install dependencies
        run: |
          remotes::install_deps(dependencies = TRUE)
          install.packages("pkgdown", type = "binary")
        shell: Rscript {0}

      - name: Install package
        run: R CMD INSTALL .

      - name: Deploy package
        run: |
          git config --local user.email "actions@github.com"
          git config --local user.name "GitHub Actions"
          Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)'
51 |
--------------------------------------------------------------------------------
/man/anova_protti.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/anova_protti.R
3 | \name{anova_protti}
4 | \alias{anova_protti}
5 | \title{Perform ANOVA}
6 | \usage{
7 | anova_protti(data, grouping, condition, mean_ratio, sd, n)
8 | }
9 | \arguments{
10 | \item{data}{a data frame containing at least the input variables.}
11 |
12 | \item{grouping}{a character column in the \code{data} data frame that contains precursor or
13 | peptide identifiers.}
14 |
15 | \item{condition}{a character or numeric column in the \code{data} data frame that contains the
16 | conditions.}
17 |
18 | \item{mean_ratio}{a numeric column in the \code{data} data frame that contains mean intensities
19 | or mean intensity ratios.}
20 |
21 | \item{sd}{a numeric column in the \code{data} data frame that contains the standard deviation
22 | corresponding to the mean.}
23 |
24 | \item{n}{a numeric column in the \code{data} data frame that contains the number of replicates
25 | for which the corresponding mean was calculated.}
26 | }
27 | \value{
28 | a data frame that contains the between group error (\code{ms_group}) and the within
29 | group error (\code{ms_error}), f statistic and p-values.
30 | }
31 | \description{
32 | Performs an ANOVA statistical test
33 | }
34 | \examples{
35 | data <- data.frame(
36 | precursor = c("A", "A", "A", "B", "B", "B"),
37 | condition = c("C1", "C2", "C3", "C1", "C2", "C3"),
38 | mean = c(10, 12, 20, 11, 12, 8),
39 | sd = c(2, 1, 1.5, 1, 2, 4),
40 | n = c(4, 4, 4, 4, 4, 4)
41 | )
42 |
43 | anova_protti(
44 | data,
45 | grouping = precursor,
46 | condition = condition,
47 | mean_ratio = mean,
48 | sd = sd,
49 | n = n
50 | )
51 | }
52 |
--------------------------------------------------------------------------------
/man/fetch_uniprot_proteome.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fetch_uniprot_proteome.R
3 | \name{fetch_uniprot_proteome}
4 | \alias{fetch_uniprot_proteome}
5 | \title{Fetch proteome data from UniProt}
6 | \usage{
7 | fetch_uniprot_proteome(
8 | organism_id,
9 | columns = c("accession"),
10 | reviewed = TRUE,
11 | timeout = 120,
12 | max_tries = 5
13 | )
14 | }
15 | \arguments{
16 | \item{organism_id}{a numeric value that specifies the NCBI taxonomy identifier (TaxId) for an
17 | organism.}
18 |
19 | \item{columns}{a character vector of metadata columns that should be imported from UniProt (all
20 | possible columns can be found \href{https://www.uniprot.org/help/return_fields}{here}. For
21 | cross-referenced database provide the database name with the prefix "xref_", e.g. \code{"xref_pdb"}).
22 | Note: Not more than one or two columns should be selected otherwise the function will not be
23 | able to efficiently retrieve the information. If more information is needed, \code{fetch_uniprot()}
24 | can be used with the IDs retrieved by this function.}
25 |
26 | \item{reviewed}{a logical value that determines if only reviewed protein entries will be retrieved.}
27 |
28 | \item{timeout}{a numeric value specifying the time in seconds until the download times out.
29 | The default is 120 seconds.}
30 |
31 | \item{max_tries}{a numeric value that specifies the number of times the function tries to download
32 | the data in case an error occurs. The default is 5.}
33 | }
34 | \value{
35 | A data frame that contains all protein metadata specified in \code{columns} for the
36 | organism of choice.
37 | }
38 | \description{
39 | Fetches proteome data from UniProt for the provided organism ID.
40 | }
41 | \examples{
42 | \donttest{
43 | head(fetch_uniprot_proteome(9606))
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/man/fetch_mobidb.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fetch_mobidb.R
3 | \name{fetch_mobidb}
4 | \alias{fetch_mobidb}
5 | \title{Fetch protein disorder and mobility information from MobiDB}
6 | \usage{
7 | fetch_mobidb(
8 | uniprot_ids = NULL,
9 | organism_id = NULL,
10 | show_progress = TRUE,
11 | timeout = 60,
12 | max_tries = 2
13 | )
14 | }
15 | \arguments{
16 | \item{uniprot_ids}{optional, a character vector of UniProt identifiers for which information
17 | should be fetched. This argument is mutually exclusive to the \code{organism_id} argument.}
18 |
19 | \item{organism_id}{optional, a character value providing the NCBI taxonomy identifier (TaxId)
20 | of an organism for which all available information should be retrieved. This
21 | argument is mutually exclusive to the \code{uniprot_ids} argument.}
22 |
23 | \item{show_progress}{a logical value; if \code{TRUE} a progress bar will be shown.
24 | Default is \code{TRUE}.}
25 |
26 | \item{timeout}{a numeric value specifying the time in seconds until the download of an organism
27 | archive times out. The default is 60 seconds.}
28 |
29 | \item{max_tries}{a numeric value that specifies the number of times the function tries to download
30 | the data in case an error occurs. The default is 2.}
31 | }
32 | \value{
33 | A data frame that contains start and end positions for disordered and flexible protein
34 | regions. The \code{feature} column contains information on the source of this
35 | annotation. More information on the source can be found
36 | \href{https://mobidb.org/about/mobidb}{here}.
37 | }
38 | \description{
39 | Fetches information about disordered and flexible protein regions from MobiDB.
40 | }
41 | \examples{
42 | \donttest{
43 | fetch_mobidb(
44 | uniprot_ids = c("P0A799", "P62707")
45 | )
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/man/assign_peptide_type.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/assign_peptide_type.R
3 | \name{assign_peptide_type}
4 | \alias{assign_peptide_type}
5 | \title{Assign peptide type}
6 | \usage{
7 | assign_peptide_type(
8 | data,
9 | aa_before = aa_before,
10 | last_aa = last_aa,
11 | aa_after = aa_after
12 | )
13 | }
14 | \arguments{
15 | \item{data}{a data frame containing at least information about the preceding and C-terminal
16 | amino acids of peptides.}
17 |
18 | \item{aa_before}{a character column in the \code{data} data frame that contains the preceding amino
19 | acid as one letter code.}
20 |
21 | \item{last_aa}{a character column in the \code{data} data frame that contains the C-terminal amino
22 | acid as one letter code.}
23 |
24 | \item{aa_after}{a character column in the \code{data} data frame that contains the following amino
25 | acid as one letter code.}
26 | }
27 | \value{
28 | A data frame that contains the input data and an additional column with the peptide
29 | type information.
30 | }
31 | \description{
32 | Based on preceding and C-terminal amino acid, the peptide type of a given peptide is assigned.
33 | Peptides with preceding and C-terminal lysine or arginine are considered fully-tryptic. If a
34 | peptide is located at the N- or C-terminus of a protein and fulfills the criterion to be
35 | fully-tryptic otherwise, it is also considered as fully-tryptic. Peptides that only fulfill the
36 | criterion on one terminus are semi-tryptic peptides. Lastly, peptides that are not fulfilling
37 | the criteria for both termini are non-tryptic peptides.
38 | }
39 | \examples{
40 | data <- data.frame(
41 | aa_before = c("K", "S", "T"),
42 | last_aa = c("R", "K", "Y"),
43 | aa_after = c("T", "R", "T")
44 | )
45 |
46 | assign_peptide_type(data, aa_before, last_aa, aa_after)
47 | }
48 |
--------------------------------------------------------------------------------
/man/ptsi_pgk.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{ptsi_pgk}
5 | \alias{ptsi_pgk}
6 | \title{Structural analysis example data}
7 | \format{
8 | A data frame containing differential abundances and adjusted p-values for
9 | peptides/precursors of two proteins.
10 | }
11 | \source{
12 | Cappelletti V, Hauser T, Piazza I, Pepelnjak M, Malinovska L, Fuhrer T, Li Y, Dörig C,
13 | Boersema P, Gillet L, Grossbach J, Dugourd A, Saez-Rodriguez J, Beyer A, Zamboni N, Caflisch A,
14 | de Souza N, Picotti P. Dynamic 3D proteomes reveal protein functional alterations at high
15 | resolution in situ. Cell. 2021 Jan 21;184(2):545-559.e22. \doi{10.1016/j.cell.2020.12.021}.
16 | Epub 2020 Dec 23. PMID: 33357446; PMCID: PMC7836100.
17 | }
18 | \usage{
19 | ptsi_pgk
20 | }
21 | \description{
22 | Example data used for the vignette about structural analysis. The data was obtained from
23 | Cappelletti et al. 2021 (\doi{10.1016/j.cell.2020.12.021})
24 | and corresponds to two separate experiments. Both experiments were limited proteolysis coupled to
25 | mass spectrometry (LiP-MS) experiments conducted on purified proteins. The first protein is
26 | phosphoglycerate kinase 1 (pgk) and it was treated with 25mM 3-phosphoglyceric acid (3PG).
27 | The second protein is phosphoenolpyruvate-protein phosphotransferase (ptsI) and it was treated
28 | with 25mM fructose 1,6-bisphosphate (FBP). From both experiments only peptides belonging to
29 | either protein were used for this data set. The ptsI data set contains precursor level data
30 | while the pgk data set contains peptide level data. The pgk data can be obtained from
31 | supplementary table 3 from the tab named "pgk+3PG". The ptsI data is only included as raw data
32 | and was analysed using the functions of this package.
33 | }
34 | \keyword{datasets}
35 |
--------------------------------------------------------------------------------
/man/try_query.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/try_query.R
3 | \name{try_query}
4 | \alias{try_query}
5 | \title{Query from URL}
6 | \usage{
7 | try_query(
8 | url,
9 | max_tries = 5,
10 | silent = TRUE,
11 | type = "text/tab-separated-values",
12 | timeout = 60,
13 | accept = NULL,
14 | ...
15 | )
16 | }
17 | \arguments{
18 | \item{url}{a character value of a URL to the website that contains the table that should be
19 | downloaded.}
20 |
21 | \item{max_tries}{a numeric value that specifies the number of times the function tries to download
22 | the data in case an error occurs. Default is 5.}
23 |
24 | \item{silent}{a logical value that specifies if individual messages are printed after each try
25 | that failed.}
26 |
27 | \item{type}{a character value that specifies the type of data at the target URL. Options are
28 | all options that can be supplied to httr::content, these include e.g.
29 | "text/tab-separated-values", "application/json" and "text/csv". Default is "text/tab-separated-values".}
30 |
31 | \item{timeout}{a numeric value that specifies the maximum request time. Default is 60 seconds.}
32 |
33 | \item{accept}{a character value that specifies the type of data that should be sent by the API if
34 | it uses content negotiation. The default is NULL and it should only be set for APIs that use
35 | content negotiation.}
36 |
37 | \item{...}{other parameters supplied to the parsing function used by httr::content.}
38 | }
39 | \value{
40 | A data frame that contains the table from the url.
41 | }
42 | \description{
43 | Downloads data table from URL. If an error occurs during the query (for example due to no
44 | connection) the function waits 3 seconds and tries again. If no result could be obtained
45 | after the given number of tries a message indicating the problem is returned.
46 | }
47 |
--------------------------------------------------------------------------------
/man/find_all_subs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/find_all_subs.R
3 | \name{find_all_subs}
4 | \alias{find_all_subs}
5 | \title{Find all sub IDs of an ID in a network}
6 | \usage{
7 | find_all_subs(
8 | data,
9 | ids,
10 | main_id = id,
11 | type = type,
12 | accepted_types = "is_a",
13 | exclude_parent_id = FALSE
14 | )
15 | }
16 | \arguments{
17 | \item{data}{a data frame that contains relational information on IDs (main_id), their sub
18 | IDs (sub_id) and their relationship (type). For ChEBI this data frame can be obtained by calling
19 | \code{fetch_chebi(relation = TRUE)}. For ECO data it can be obtained by calling \code{fetch_eco(relation = TRUE)}.}
20 |
21 | \item{ids}{a character vector of IDs for which sub IDs should be searched.}
22 |
23 | \item{main_id}{a character or integer column containing IDs. Default is \code{id} for ChEBI IDs.}
24 |
25 | \item{type}{a character column that contains the type of interactions. Default is \code{type} for ChEBI IDs.}
26 |
27 | \item{accepted_types}{a character vector containing the accepted_types of relationships that should be considered
28 | for the search. It is possible to use "all" relationships. The default type is "is_a". A list of
29 | possible relationships for e.g. ChEBI IDs can be found
30 | \href{https://docs.google.com/document/d/1_w-DwBdCCOh1gMeeP6yqGzcnkpbHYOa3AGSODe5epcg/edit#heading=h.hnsqoqu978s5}{here}.}
31 |
32 | \item{exclude_parent_id}{a logical value that specifies if the parent ID should be excluded from
33 | the returned list.}
34 | }
35 | \value{
36 | A list of character vectors containing the provided ID and all of its sub IDs. It
37 | contains one element per input ID.
38 | }
39 | \description{
40 | For a given ID, find all sub IDs and their sub IDs etc. The type of
41 | relationship can be selected too. This is a helper function for other functions.
42 | }
43 |
--------------------------------------------------------------------------------
/man/qc_sequence_coverage.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/qc_sequence_coverage.R
3 | \name{qc_sequence_coverage}
4 | \alias{qc_sequence_coverage}
5 | \title{Protein coverage distribution}
6 | \usage{
7 | qc_sequence_coverage(
8 | data,
9 | protein_identifier,
10 | coverage,
11 | sample = NULL,
12 | interactive = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{data}{a data frame that contains at least the input variables.}
17 |
18 | \item{protein_identifier}{a character column in the \code{data} data frame that contains protein
19 | identifiers.}
20 |
21 | \item{coverage}{a numeric column in the \code{data} data frame that contains protein coverage
22 | in percent. This information can be obtained using the \code{\link{sequence_coverage}} function.}
23 |
24 | \item{sample}{optional, a character or factor column in the \code{data} data frame that contains sample names.
25 | Please only provide this argument if you want to facet the distribution plot by sample
26 | otherwise do not provide this argument.}
27 |
28 | \item{interactive}{a logical value that specifies whether the plot should be interactive
29 | (default is FALSE).}
30 | }
31 | \value{
32 | A protein coverage histogram with 5 percent binwidth. The vertical dotted line
33 | indicates the median.
34 | }
35 | \description{
36 | Plots the distribution of protein coverages in a histogram.
37 | }
38 | \examples{
39 | set.seed(123) # Makes example reproducible
40 |
41 | # Create example data
42 | data <- create_synthetic_data(
43 | n_proteins = 100,
44 | frac_change = 0.05,
45 | n_replicates = 3,
46 | n_conditions = 2,
47 | method = "effect_random"
48 | )
49 |
50 | # Plot sequence coverage
51 | qc_sequence_coverage(
52 | data = data,
53 | protein_identifier = protein,
54 | coverage = coverage
55 | )
56 | }
57 | \seealso{
58 | \code{\link{sequence_coverage}}
59 | }
60 |
--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
#
# Runs R CMD check on macOS, Windows and Ubuntu (devel, release, oldrel-1) for pushes to
# main/master and for every pull request.
on:
  push:
    branches: [main, master]
  pull_request:
    branches:
      - '*'

name: R-CMD-check

jobs:
  R-CMD-check:
    runs-on: ${{ matrix.config.os }}

    name: ${{ matrix.config.os }} (${{ matrix.config.r }})

    strategy:
      # Let the remaining matrix jobs finish even if one configuration fails.
      fail-fast: false
      matrix:
        config:
          - {os: macos-latest, r: 'release'}
          - {os: windows-latest, r: 'release'}
          - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
          - {os: ubuntu-latest, r: 'release'}
          - {os: ubuntu-latest, r: 'oldrel-1'}

    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
      R_KEEP_PKG_SOURCE: yes
      TEST_PROTTI: true
      BUILD_VIGNETTE: true

    steps:
      # v4 matches the checkout version already used by the pkgdown workflow.
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-pandoc@v2

      - uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ matrix.config.r }}
          http-user-agent: ${{ matrix.config.http-user-agent }}
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages:
            any::rcmdcheck
          needs: check

      # run: |
      # - name: Install remotes and lme4 package
      # Rscript -e 'install.packages("remotes", lib=Sys.getenv("R_LIB_FOR_PAK"))'
      # Rscript -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always")'

      - uses: r-lib/actions/check-r-package@v2
        with:
          upload-snapshots: true
59 |
--------------------------------------------------------------------------------
/R/ttest_protti.R:
--------------------------------------------------------------------------------
#' Perform Welch's t-test
#'
#' Performs a Welch's t-test and calculates p-values between two groups.
#'
#' @param mean1 a numeric vector that contains the means of group1.
#' @param mean2 a numeric vector that contains the means of group2.
#' @param sd1 a numeric vector that contains the standard deviations of group1.
#' @param sd2 a numeric vector that contains the standard deviations of group2.
#' @param n1 a numeric vector that contains the number of replicates used for the calculation of
#' each mean and standard deviation of group1.
#' @param n2 a numeric vector that contains the number of replicates used for the calculation of
#' each mean and standard deviation of group2.
#' @param log_values a logical value that indicates if values are log transformed. This determines
#' how fold changes are calculated: as the difference of the means for log transformed values and
#' as the ratio of the means otherwise. The t statistic and p-value are always based on the
#' difference of the means. Default is \code{log_values = TRUE}.
#'
#' @return A data frame that contains the calculated differences of means, standard error, t
#' statistic and p-values.
#' @importFrom stats pt
#' @export
#'
#' @examples
#' ttest_protti(
#'   mean1 = 10,
#'   mean2 = 15.5,
#'   sd1 = 1,
#'   sd2 = 0.5,
#'   n1 = 3,
#'   n2 = 3
#' )
ttest_protti <- function(mean1, mean2, sd1, sd2, n1, n2, log_values = TRUE) {
  # Squared standard errors of the two group means.
  se_sq1 <- sd1^2 / n1
  se_sq2 <- sd2^2 / n2
  std_error <- sqrt(se_sq1 + se_sq2)
  # Welch-Satterthwaite equation to estimate the degrees of freedom
  df <- (se_sq1 + se_sq2)^2 / (se_sq1^2 / (n1 - 1) + se_sq2^2 / (n2 - 1))
  mean_difference <- mean1 - mean2
  # fold change calculation
  if (log_values) {
    diff <- mean_difference
  } else {
    diff <- mean1 / mean2
  }
  # t statistic calculation: Welch's t is defined on the difference of the means. Using the
  # ratio that is reported as fold change for non-log values would e.g. yield a non-zero
  # t statistic for two identical means.
  t <- mean_difference / std_error
  # Two-sided p-value from the t distribution.
  result <- data.frame(cbind(diff, std_error, t, 2 * stats::pt(-abs(t), df)))
  colnames(result) <- c("diff", "std_error", "t_statistic", "pval")
  result
}
46 |
--------------------------------------------------------------------------------
/data-raw/ptsi_pgk.R:
--------------------------------------------------------------------------------
1 | # library(tidyverse)
2 | # library(protti)
3 | #
4 | # # Source: Cappelletti V, Hauser T, Piazza I, Pepelnjak M, Malinovska L, Fuhrer T, Li Y, Dörig C, Boersema P, Gillet L, Grossbach J, Dugourd A, Saez-Rodriguez J, Beyer A, Zamboni N, Caflisch A, de Souza N, Picotti P. Dynamic 3D proteomes reveal protein functional alterations at high resolution in situ. Cell. 2021 Jan 21;184(2):545-559.e22. doi: 10.1016/j.cell.2020.12.021. Epub 2020 Dec 23. PMID: 33357446; PMCID: PMC7836100.
5 | #
6 | # # The pgk data set is from supplementary table 3, the tab is called "pgk+3PG". The data does not contain precursor level data since charge states are
7 | # # missing from peptides.
8 | # pgk <- read_protti("pgk.csv")
9 | #
10 | # # The ptsI data set is not part of the supplementary tables. The raw data is included in the PRIDE repository. We exported the Spectronaut report
11 | # # and analysed that data using protti's standard pipeline.
12 | # ptsi <- read_protti("ptsi.csv")
13 | #
14 | # # pgk data tidying
15 | #
16 | # pgk_tidy <- pgk %>%
17 | # filter(concentration == "25mM") %>% # filter to only retain the 25 mM concentration
18 | # rename(eg_precursor_id = peptide_sequence,
19 | # pg_protein_accessions = uniprot_id,
20 | # diff = log2fc,
21 | # adj_pval = qvalue) %>%
22 | # distinct(eg_precursor_id,
23 | # diff,
24 | # adj_pval,
25 | # pg_protein_accessions) %>%
26 | # mutate(pep_stripped_sequence = str_remove_all(eg_precursor_id, pattern = "(?<=\\[)[\\w\\(\\)\\s\\-]+(?=\\])")) %>% # removes "[Carbamidomethyl]" from peptides.
27 | # mutate(pep_stripped_sequence = str_remove_all(pep_stripped_sequence, pattern = "[\\[\\]]"))
28 | #
29 | # # ptsi data tidying
30 | #
31 | # ptsi_tidy <- ptsi %>%
32 | # rename(eg_precursor_id = precursor_id)
33 | #
34 | # # combining data
35 | #
36 | # ptsi_pgk <- pgk_tidy %>%
37 | # bind_rows(ptsi_tidy)
38 | #
39 | # usethis::use_data(ptsi_pgk, overwrite = TRUE)
40 |
--------------------------------------------------------------------------------
/data-raw/protti_colours.R:
--------------------------------------------------------------------------------
# Colour palette of the package: a character vector of 100 hex colour codes.
# The order of the colours is part of the data set and must not be changed.
protti_colours <- c(
  "#5680C1", "#B96DAD", "#64CACA", "#81ABE9", "#F6B8D1",
  "#99F1E4", "#9AD1FF", "#548BDF", "#A55098", "#3EB6B6",
  "#87AEE8", "#CA91C1", "#A4E0E0", "#1D4F9A", "#D7ACD2",
  "#49C1C1", "#00A2D9", "#6B77BF", "#00C2D4", "#816DB8",
  "#00DCB5", "#9561AD", "#95EF8C", "#A6549C", "#F9F871",
  "#B44688", "#65D8C2", "#40B4D5", "#7AE4B2", "#529AD4",
  "#9DEE9C", "#7B7BC0", "#C8F585", "#995997", "#7368B8",
  "#A03960", "#DA5D8C", "#077AC1", "#C793BD", "#0086B3",
  "#FFE6FF", "#00C897", "#B8A6B4", "#8292B3", "#B38DAC",
  "#9CCDCD", "#A7B6D2", "#E4CBD4", "#C8EDE7", "#C1D5E9",
  "#899BC4", "#A6739D", "#76BFBF", "#ABB9D3", "#C3A9BE",
  "#C7E0E0", "#4667AC", "#D0BECE", "#87C7C7", "#3BB1E7",
  "#888CAF", "#12CEE1", "#8F87AB", "#12E6BD", "#9980A7",
  "#C2EABF", "#A5779F", "#F8F7BB", "#AF7092", "#A2D8CC",
  "#85BCD1", "#B4E1C9", "#8BA6C5", "#C7E9C7", "#9191B1",
  "#DBF2C1", "#9E789C", "#8682A9", "#AA5C76", "#C4899B",
  "#428DD1", "#C1AABC", "#039ACD", "#F7EDF7", "#02D5A1",
  "#BDB5BB", "#516C9A", "#9B5C91", "#4BAAAA", "#6F8FC0",
  "#D397B0", "#7BCABE", "#7EAFD7", "#4C75B8", "#844A7B",
  "#3E9898", "#7492C0", "#A97AA1", "#87BBBB", "#1E4381"
)

# Store the palette as package data, replacing any previously saved version.
usethis::use_data(protti_colours, overwrite = TRUE)
105 |
--------------------------------------------------------------------------------
/R/fetch_go.R:
--------------------------------------------------------------------------------
1 | #' Fetch gene ontology information from geneontology.org
2 | #'
3 | #' Fetches gene ontology data from geneontology.org for the provided organism ID.
4 | #'
5 | #' @param organism_id a character value NCBI taxonomy identifier of an organism (TaxId).
6 | #' Possible inputs include only: "9606" (Human), "559292" (Yeast) and "83333" (E. coli).
7 | #' Numeric values are converted to character before they are matched.
8 | #'
9 | #' @return A data frame that contains gene ontology mappings to UniProt or SGD IDs. The original
10 | #' file is a .GAF file. A detailed description of all columns can be found here:
11 | #' http://geneontology.org/docs/go-annotation-file-gaf-format-2.1/
12 | #'
13 | #' If there is no internet connection or the download fails, a message is shown and
14 | #' \code{NULL} is returned invisibly.
15 | #' @export
16 | #'
17 | #' @examples
18 | #' \donttest{
19 | #' go <- fetch_go("9606")
20 | #'
21 | #' head(go)
22 | #' }
23 | fetch_go <- function(organism_id) {
24 |   if (!curl::has_internet()) {
25 |     message("No internet connection.")
26 |     return(invisible(NULL))
27 |   }
28 |
29 |   # match.arg() only accepts character input, so numeric TaxIds are converted first
30 |   if (is.numeric(organism_id)) {
31 |     organism_id <- as.character(organism_id)
32 |   }
33 |   organism_id <- match.arg(organism_id, c("9606", "559292", "83333"))
34 |
35 |   organism_url <- switch(organism_id,
36 |     "9606" = "http://current.geneontology.org/annotations/goa_human.gaf.gz",
37 |     "559292" = "http://current.geneontology.org/annotations/sgd.gaf.gz",
38 |     "83333" = "http://current.geneontology.org/annotations/ecocyc.gaf.gz"
39 |   )
40 |
41 |   # Errors and warnings raised during the download are kept as condition objects so that
42 |   # a failure is recognised directly instead of by the shape of the parsed table
43 |   go_download <- tryCatch(readLines(gzcon(url(organism_url))),
44 |     error = function(e) e,
45 |     warning = function(w) w
46 |   )
47 |   if (inherits(go_download, "condition")) {
48 |     message(conditionMessage(go_download))
49 |     return(invisible(NULL))
50 |   }
51 |
52 |   # The text connection is already open, so read.delim() does not close it for us
53 |   go_text <- textConnection(go_download)
54 |   on.exit(close(go_text), add = TRUE)
55 |   go <- utils::read.delim(go_text,
56 |     quote = "",
57 |     stringsAsFactors = FALSE,
58 |     comment.char = "!",
59 |     header = FALSE
60 |   )
61 |   colnames(go) <- c(
62 |     "db", "db_id", "symbol", "qualifier", "go_id", "db_reference",
63 |     "evidence", "with_from", "ontology", "name", "synonyme",
64 |     "type", "taxon", "date", "assigned_by", "annotation_extension",
65 |     "gene_product_form_id"
66 |   )
67 |   go
68 | }
69 |
--------------------------------------------------------------------------------
/R/find_chebis.R:
--------------------------------------------------------------------------------
1 | #' Find ChEBI IDs for name patterns
2 | #'
3 | #' Search for chebi IDs that match a specific name pattern. A list of corresponding ChEBI IDs is
4 | #' returned.
5 | #'
6 | #' @param chebi_data a data frame that contains at least information on ChEBI IDs (id) and their
7 | #' names (name). This data frame can be obtained by calling \code{fetch_chebi()}. Ideally this
8 | #' should be subsetted to only contain molecules of a specific type e.g. metals. This can be
9 | #' achieved by calling \code{find_all_subs} with a general ID such as "25213" (Metal cation) and
10 | #' then subset the complete ChEBI database to only include the returned sub-IDs. Using a subsetted
11 | #' database ensures better search results. This is a helper function for other functions.
12 | #' @param pattern a character vector that contains names or name patterns of molecules. Name
13 | #' patterns can be for example obtained with the \code{split_metal_name} function.
14 | #'
15 | #' @return A list of character vectors containing ChEBI IDs that have a name matching the supplied
16 | #' pattern. It contains one element per pattern. If the \code{stringi} package is not installed,
17 | #' a message is shown and \code{NULL} is returned invisibly.
18 | #' @importFrom dplyr distinct
19 | #' @importFrom magrittr %>%
20 | #' @importFrom purrr map
21 | #' @importFrom stringr str_detect regex
22 | #' @importFrom rlang .data
23 | #' @importFrom stats na.omit
24 | find_chebis <- function(chebi_data, pattern) {
25 |   if (!requireNamespace("stringi", quietly = TRUE)) {
26 |     # message() has no `call.` argument; passing one appends "FALSE" to the displayed text
27 |     message("Package \"stringi\" is needed for this function to work. Please install it.")
28 |     return(invisible(NULL))
29 |   }
30 |   data <- chebi_data %>%
31 |     dplyr::distinct(.data$id, .data$name)
32 |
33 |   purrr::map(pattern, function(x) {
34 |     # IDs of all names that match the pattern (case-insensitive); non-matches become ""
35 |     matching_ids <- ifelse(
36 |       stringr::str_detect(data$name,
37 |         pattern = stringr::regex(
38 |           x,
39 |           ignore_case = TRUE
40 |         )
41 |       ),
42 |       data$id,
43 |       ""
44 |     )
45 |     # Remove duplicates, missing values and the "" placeholders of non-matching names
46 |     stringi::stri_remove_empty(stats::na.omit(unique(matching_ids)))
47 |   })
48 | }
49 |
--------------------------------------------------------------------------------
/man/qc_median_intensities.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/qc_median_intensities.R
3 | \name{qc_median_intensities}
4 | \alias{qc_median_intensities}
5 | \title{Median run intensities}
6 | \usage{
7 | qc_median_intensities(
8 | data,
9 | sample,
10 | grouping,
11 | intensity,
12 | plot = TRUE,
13 | interactive = FALSE
14 | )
15 | }
16 | \arguments{
17 | \item{data}{a data frame that contains at least the input variables.}
18 |
19 | \item{sample}{a character or factor column in the \code{data} data frame that contains the sample name.}
20 |
21 | \item{grouping}{a character column in the \code{data} data frame that contains either precursor or
22 | peptide identifiers.}
23 |
24 | \item{intensity}{a numeric column in the \code{data} data frame that contains intensity values.
25 | The intensity should be ideally log2 transformed, but also non-transformed values can be used.}
26 |
27 | \item{plot}{a logical value that indicates whether the result should be plotted.}
28 |
29 | \item{interactive}{a logical value that specifies whether the plot should be interactive
30 | (default is FALSE).}
31 | }
32 | \value{
33 | A plot that displays median intensity over all samples. If \code{plot = FALSE} a data
34 | frame containing median intensities is returned.
35 | }
36 | \description{
37 | Median intensities per run are returned either as a plot or a table.
38 | }
39 | \examples{
40 | set.seed(123) # Makes example reproducible
41 |
42 | # Create example data
43 | data <- create_synthetic_data(
44 | n_proteins = 100,
45 | frac_change = 0.05,
46 | n_replicates = 3,
47 | n_conditions = 2,
48 | method = "effect_random"
49 | )
50 |
51 | # Calculate median intensities
52 | qc_median_intensities(
53 | data = data,
54 | sample = sample,
55 | grouping = peptide,
56 | intensity = peptide_intensity_missing,
57 | plot = FALSE
58 | )
59 |
60 | # Plot median intensities
61 | qc_median_intensities(
62 | data = data,
63 | sample = sample,
64 | grouping = peptide,
65 | intensity = peptide_intensity_missing,
66 | plot = TRUE
67 | )
68 | }
69 |
--------------------------------------------------------------------------------
/revdep/README.md:
--------------------------------------------------------------------------------
1 | # Platform
2 |
3 | |field |value |
4 | |:--------|:------------------------------------------|
5 | |version |R version 4.3.1 (2023-06-16) |
6 | |os |macOS Sonoma 14.2.1 |
7 | |system |aarch64, darwin20 |
8 | |ui |RStudio |
9 | |language |(EN) |
10 | |collate |en_US.UTF-8 |
11 | |ctype |en_US.UTF-8 |
12 | |tz |Europe/Zurich |
13 | |date |2024-03-27 |
14 | |rstudio |2023.06.1+524 Mountain Hydrangea (desktop) |
15 | |pandoc |NA |
16 |
17 | # Dependencies
18 |
19 | |package |old |new |Δ |
20 | |:-----------|:-----|:------|:--|
21 | |protti |0.7.0 |0.8.0 |* |
22 | |bslib |NA |0.6.2 |* |
23 | |crosstalk |NA |1.2.1 |* |
24 | |curl |NA |5.2.1 |* |
25 | |data.table |NA |1.15.2 |* |
26 | |digest |NA |0.6.35 |* |
27 | |dplyr |NA |1.1.4 |* |
28 | |fontawesome |NA |0.5.2 |* |
29 | |ggplot2 |NA |3.5.0 |* |
30 | |ggrepel |NA |0.9.5 |* |
31 | |gtable |NA |0.3.4 |* |
32 | |htmltools |NA |0.5.8 |* |
33 | |htmlwidgets |NA |1.6.4 |* |
34 | |labeling |NA |0.4.3 |* |
35 | |later |NA |1.3.2 |* |
36 | |lubridate |NA |1.9.3 |* |
37 | |plotly |NA |4.10.4 |* |
38 | |R.oo |NA |1.26.0 |* |
39 | |R.utils |NA |2.12.3 |* |
40 | |Rcpp |NA |1.0.12 |* |
41 | |readr |NA |2.1.5 |* |
42 | |rmarkdown |NA |2.26 |* |
43 | |sass |NA |0.4.9 |* |
44 | |scales |NA |1.3.0 |* |
45 | |snakecase |NA |0.11.1 |* |
46 | |stringi |NA |1.8.3 |* |
47 | |stringr |NA |1.5.1 |* |
48 | |tidyr |NA |1.3.1 |* |
49 | |tidyselect |NA |1.2.1 |* |
50 | |timechange |NA |0.3.0 |* |
51 | |tinytex |NA |0.50 |* |
52 | |vroom |NA |1.6.5 |* |
53 | |xfun |NA |0.43 |* |
54 |
55 | # Revdeps
56 |
57 |
--------------------------------------------------------------------------------
/man/fetch_uniprot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fetch_uniprot.R
3 | \name{fetch_uniprot}
4 | \alias{fetch_uniprot}
5 | \title{Fetch protein data from UniProt}
6 | \usage{
7 | fetch_uniprot(
8 | uniprot_ids,
9 | columns = c("protein_name", "length", "sequence", "gene_names", "xref_geneid",
10 | "xref_string", "go_f", "go_p", "go_c", "cc_interaction", "ft_act_site", "ft_binding",
11 | "cc_cofactor", "cc_catalytic_activity", "xref_pdb"),
12 | batchsize = 200,
13 | max_tries = 10,
14 | timeout = 20,
15 | show_progress = TRUE
16 | )
17 | }
18 | \arguments{
19 | \item{uniprot_ids}{a character vector of UniProt accession numbers.}
20 |
21 | \item{columns}{a character vector of metadata columns that should be imported from UniProt (all
22 | possible columns can be found \href{https://www.uniprot.org/help/return_fields}{here}. For
23 | cross-referenced database provide the database name with the prefix "xref_", e.g. \code{"xref_pdb"})}
24 |
25 | \item{batchsize}{a numeric value that specifies the number of proteins processed in a
26 | single query. Default and max value is 200.}
27 |
28 | \item{max_tries}{a numeric value that specifies the number of times the function tries to download
29 | the data in case an error occurs.}
30 |
31 | \item{timeout}{a numeric value that specifies the maximum request time per try. Default is 20 seconds.}
32 |
33 | \item{show_progress}{a logical value that determines if a progress bar will be shown. Default
34 | is TRUE.}
35 | }
36 | \value{
37 | A data frame that contains all protein metadata specified in \code{columns} for the
38 | proteins provided. The \code{input_id} column contains the provided UniProt IDs. If an invalid ID
39 | was provided that contains a valid UniProt ID, the valid portion of the ID is still fetched and
40 | present in the \code{accession} column, while the \code{input_id} column contains the original not completely
41 | valid ID.
42 | }
43 | \description{
44 | Fetches protein metadata from UniProt.
45 | }
46 | \examples{
47 | \donttest{
48 | fetch_uniprot(c("P36578", "O43324", "Q00796"))
49 |
50 | # Not completely valid ID
51 | fetch_uniprot(c("P02545", "P02545;P20700"))
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: protti
2 | Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools
3 | Version: 0.9.1
4 | Authors@R:
5 | c(person(given = "Jan-Philipp",
6 | family = "Quast",
7 | role = c("aut", "cre"),
8 | email = "quast@imsb.biol.ethz.ch",
9 | comment = c(ORCID = "0000-0003-2713-778X")),
10 | person(given = "Dina",
11 | family = "Schuster",
12 | role = c("aut"),
13 | email = "dschuster@ethz.ch",
14 | comment = c(ORCID = "0000-0001-6611-8237")),
15 | person(given = "ETH Zurich",
16 | role = c("cph", "fnd")))
17 | Description: Useful functions and workflows for proteomics quality control and data analysis of both limited proteolysis-coupled mass spectrometry (LiP-MS) (Feng et al. (2014) <doi:10.1038/nbt.2999>) and regular bottom-up proteomics experiments. Data generated with search tools such as 'Spectronaut', 'MaxQuant' and 'Proteome Discoverer' can be easily used due to flexibility of functions.
18 | License: MIT + file LICENSE
19 | Encoding: UTF-8
20 | LazyData: true
21 | biocViews:
22 | Imports:
23 | rlang,
24 | dplyr,
25 | stringr,
26 | magrittr,
27 | data.table,
28 | janitor,
29 | progress,
30 | purrr,
31 | tidyr,
32 | ggplot2,
33 | forcats,
34 | tibble,
35 | plotly,
36 | ggrepel,
37 | utils,
38 | grDevices,
39 | curl,
40 | readr,
41 | lifecycle,
42 | httr,
43 | methods,
44 | R.utils,
45 | stats
46 | RoxygenNote: 7.3.2
47 | Suggests:
48 | testthat,
49 | covr,
50 | knitr,
51 | rmarkdown,
52 | shiny,
53 | r3dmol,
54 | proDA,
55 | limma,
56 | dendextend,
57 | pheatmap,
58 | heatmaply,
59 | furrr,
60 | future,
61 | parallel,
62 | seriation,
63 | drc,
64 | igraph,
65 | stringi,
66 | STRINGdb,
67 | iq,
68 | scales,
69 | farver,
70 | ggforce,
71 | xml2,
72 | jsonlite
73 | Depends:
74 | R (>= 4.0)
75 | URL: https://github.com/jpquast/protti, https://jpquast.github.io/protti/
76 | BugReports: https://github.com/jpquast/protti/issues
77 | VignetteBuilder: knitr
78 | Roxygen: list(markdown = TRUE)
79 |
--------------------------------------------------------------------------------
/man/qc_contaminants.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/qc_contaminants.R
3 | \name{qc_contaminants}
4 | \alias{qc_contaminants}
5 | \title{Percentage of contaminants per sample}
6 | \usage{
7 | qc_contaminants(
8 | data,
9 | sample,
10 | protein,
11 | is_contaminant,
12 | intensity,
13 | n_contaminants = 5,
14 | plot = TRUE,
15 | interactive = FALSE
16 | )
17 | }
18 | \arguments{
19 | \item{data}{a data frame that contains at least the input variables.}
20 |
21 | \item{sample}{a character or factor column in the \code{data} data frame that contains the sample names.}
22 |
23 | \item{protein}{a character column in the \code{data} data frame that contains protein IDs or
24 | protein names.}
25 |
26 | \item{is_contaminant}{a logical column that indicates if the protein is a contaminant.}
27 |
28 | \item{intensity}{a numeric column in the \code{data} data frame that contains the corresponding
29 | raw or normalised intensity values (not log2).}
30 |
31 | \item{n_contaminants}{a numeric value that indicates how many contaminants should be displayed
32 | individually. The rest is combined to a group called "other". The default is 5.}
33 |
34 | \item{plot}{a logical value that indicates if a plot is returned. If FALSE a table is returned.}
35 |
36 | \item{interactive}{a logical value that indicates if the plot is made interactive using the R
37 | package \code{plotly}.}
38 | }
39 | \value{
40 | A bar plot that displays the percentage of contaminating proteins over all samples.
41 | If \code{plot = FALSE} a data frame is returned.
42 | }
43 | \description{
44 | Calculates the percentage of contaminating proteins as the share of total intensity.
45 | }
46 | \examples{
47 | data <- data.frame(
48 | sample = c(rep("sample_1", 10), rep("sample_2", 10)),
49 | leading_razor_protein = c(rep(c("P1", "P1", "P1", "P2", "P2", "P2", "P2", "P3", "P3", "P3"), 2)),
50 | potential_contaminant = c(rep(c(rep(TRUE, 7), rep(FALSE, 3)), 2)),
51 | intensity = c(rep(1, 2), rep(4, 4), rep(6, 4), rep(2, 3), rep(3, 5), rep(4, 2))
52 | )
53 |
54 | qc_contaminants(
55 | data,
56 | sample = sample,
57 | protein = leading_razor_protein,
58 | is_contaminant = potential_contaminant,
59 | intensity = intensity
60 | )
61 | }
62 |
--------------------------------------------------------------------------------
/man/qc_intensity_distribution.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/qc_intensity_distribution.R
3 | \name{qc_intensity_distribution}
4 | \alias{qc_intensity_distribution}
5 | \title{Check intensity distribution per sample and overall}
6 | \usage{
7 | qc_intensity_distribution(
8 | data,
9 | sample = NULL,
10 | grouping,
11 | intensity_log2,
12 | plot_style
13 | )
14 | }
15 | \arguments{
16 | \item{data}{a data frame that contains at least sample names, grouping identifiers (precursor,
17 | peptide or protein) and log2 transformed intensities for each grouping identifier.}
18 |
19 | \item{sample}{an optional character or factor column in the \code{data} data frame that contains the
20 | sample name. If the sample column is of type factor, the ordering is based on the factor
21 | levels. NOTE: If the overall distribution should be returned please do not provide the name of the
22 | sample column.}
23 |
24 | \item{grouping}{a character column in the \code{data} data frame that contains the grouping
25 | variables (e.g. peptides, precursors or proteins).}
26 |
27 | \item{intensity_log2}{a numeric column in the \code{data} data frame that contains the log2
28 | transformed intensities of each grouping identifier sample combination.}
29 |
30 | \item{plot_style}{a character value that indicates the plot type. This can be either
31 | "histogram", "boxplot" or "violin". Plot style "boxplot" and "violin" can only be used if a
32 | sample column is provided.}
33 | }
34 | \value{
35 | A histogram or boxplot that shows the intensity distribution over all samples or by
36 | sample.
37 | }
38 | \description{
39 | Plots the overall or sample-wise distribution of all peptide intensities as a boxplot or
40 | histogram.
41 | }
42 | \examples{
43 | set.seed(123) # Makes example reproducible
44 |
45 | # Create example data
46 | data <- create_synthetic_data(
47 | n_proteins = 100,
48 | frac_change = 0.05,
49 | n_replicates = 3,
50 | n_conditions = 2,
51 | method = "effect_random"
52 | )
53 |
54 | # Plot intensity distribution
55 | # The plot style can be changed
56 | qc_intensity_distribution(
57 | data = data,
58 | sample = sample,
59 | grouping = peptide,
60 | intensity_log2 = peptide_intensity_missing,
61 | plot_style = "boxplot"
62 | )
63 | }
64 |
--------------------------------------------------------------------------------
/R/find_peptide.R:
--------------------------------------------------------------------------------
1 | #' Find peptide location
2 | #'
3 | #' Searches for the position of a peptide sequence within its protein sequence. In addition to
4 | #' the start and end position, the amino acid before the peptide, the last amino acid of the
5 | #' peptide and the amino acid after the peptide are reported.
6 | #'
7 | #' @param data a data frame that contains at least the protein and peptide sequence.
8 | #' @param protein_sequence a character column in the \code{data} data frame that contains the
9 | #' protein sequence.
10 | #' @param peptide_sequence a character column in the \code{data} data frame that contains the
11 | #' peptide sequence.
12 | #'
13 | #' @return A data frame that contains the input data and five additional columns: the peptide
14 | #' start and end position (\code{start}, \code{end}), the amino acid before the peptide
15 | #' (\code{aa_before}), the last amino acid of the peptide (\code{last_aa}) and the amino acid
16 | #' after the peptide (\code{aa_after}).
17 | #' @import dplyr
18 | #' @import stringr
19 | #' @importFrom magrittr %>%
20 | #' @importFrom rlang .data
21 | #' @export
22 | #'
23 | #' @examples
24 | #' # Create example data
25 | #' data <- data.frame(
26 | #'   protein_sequence = c("abcdefg"),
27 | #'   peptide_sequence = c("cde")
28 | #' )
29 | #'
30 | #' # Find peptide
31 | #' find_peptide(
32 | #'   data = data,
33 | #'   protein_sequence = protein_sequence,
34 | #'   peptide_sequence = peptide_sequence
35 | #' )
36 | find_peptide <-
37 |   function(data, protein_sequence, peptide_sequence) {
38 |     # Names of the two sequence columns, used as keys when joining the result back
39 |     join_columns <- c(
40 |       rlang::as_name(rlang::enquo(protein_sequence)),
41 |       rlang::as_name(rlang::enquo(peptide_sequence))
42 |     )
43 |
44 |     # Every unique protein/peptide combination only needs to be located once
45 |     peptide_positions <- data %>%
46 |       dplyr::ungroup() %>%
47 |       dplyr::distinct({{ protein_sequence }}, {{ peptide_sequence }}) %>%
48 |       dplyr::mutate(
49 |         start = stringr::str_locate({{ protein_sequence }}, {{ peptide_sequence }})[, "start"],
50 |         end = stringr::str_locate({{ protein_sequence }}, {{ peptide_sequence }})[, "end"],
51 |         # Single residues are extracted relative to the located peptide boundaries
52 |         aa_before = stringr::str_sub({{ protein_sequence }},
53 |           start = .data$start - 1,
54 |           end = .data$start - 1
55 |         ),
56 |         last_aa = stringr::str_sub({{ protein_sequence }},
57 |           start = .data$end,
58 |           end = .data$end
59 |         ),
60 |         aa_after = stringr::str_sub({{ protein_sequence }},
61 |           start = .data$end + 1,
62 |           end = .data$end + 1
63 |         )
64 |       )
65 |
66 |     dplyr::left_join(data, peptide_positions, by = join_columns)
67 |   }
68 |
--------------------------------------------------------------------------------
/.github/workflows/format-code.yml:
--------------------------------------------------------------------------------
1 | on:
2 | push:
3 | paths: ["**.[rR]", "**.[qrR]md", "**.[rR]markdown", "**.[rR]nw", "**.[rR]profile"]
4 |
5 | name: Style
6 | env:
7 | GITHUB_ACTOR: "actions-user"
8 |
9 | jobs:
10 | style:
11 | runs-on: ubuntu-latest
12 | permissions:
13 | contents: write
14 | env:
15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 | steps:
17 | - name: Checkout repo
18 | uses: actions/checkout@v4
19 | with:
20 | fetch-depth: 0
21 |
22 | - name: Setup R
23 | uses: r-lib/actions/setup-r@v2
24 | with:
25 | use-public-rspm: true
26 |
27 | - name: Install dependencies
28 | uses: r-lib/actions/setup-r-dependencies@v2
29 | with:
30 | extra-packages: any::styler, any::roxygen2
31 | needs: styler
32 |
33 | - name: Enable styler cache
34 | run: styler::cache_activate()
35 | shell: Rscript {0}
36 |
37 | - name: Determine cache location
38 | id: styler-location
39 | run: |
40 | cat(
41 | "location=",
42 | styler::cache_info(format = "tabular")$location,
43 | "\n",
44 | file = Sys.getenv("GITHUB_OUTPUT"),
45 | append = TRUE,
46 | sep = ""
47 | )
48 | shell: Rscript {0}
49 |
50 | - name: Cache styler
51 | uses: actions/cache@v4
52 | with:
53 | path: ${{ steps.styler-location.outputs.location }}
54 | key: ${{ runner.os }}-styler-${{ github.sha }}
55 | restore-keys: |
56 | ${{ runner.os }}-styler-
57 | ${{ runner.os }}-
58 |
59 | - name: Style
60 | run: styler::style_pkg()
61 | shell: Rscript {0}
62 |
63 | - name: Commit and push changes
64 | run: |
65 | if FILES_TO_COMMIT=($(git diff-index --name-only ${{ github.sha }} \
66 | | egrep --ignore-case '\.(R|[qR]md|Rmarkdown|Rnw|Rprofile)$'))
67 | then
68 | git config --local user.name "$GITHUB_ACTOR"
69 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
70 | git commit ${FILES_TO_COMMIT[*]} -m "Style code (GHA)"
71 | git pull --ff-only
72 | git push origin
73 | else
74 | echo "No changes to commit."
75 | fi
76 |
--------------------------------------------------------------------------------
/man/qc_proteome_coverage.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/qc_proteome_coverage.R
3 | \name{qc_proteome_coverage}
4 | \alias{qc_proteome_coverage}
5 | \title{Proteome coverage per sample and total}
6 | \usage{
7 | qc_proteome_coverage(
8 | data,
9 | sample,
10 | protein_id,
11 | organism_id,
12 | reviewed = TRUE,
13 | plot = TRUE,
14 | interactive = FALSE
15 | )
16 | }
17 | \arguments{
18 | \item{data}{a data frame that contains at least sample names and protein IDs.}
19 |
20 | \item{sample}{a character column in the \code{data} data frame that contains the sample name.}
21 |
22 | \item{protein_id}{a character or numeric column in the \code{data} data frame that contains
23 | protein identifiers such as UniProt accessions.}
24 |
25 | \item{organism_id}{a numeric value that specifies a NCBI taxonomy identifier (TaxId) of the
26 | organism used. Human: 9606, S. cerevisiae: 559292, E. coli: 83333.}
27 |
28 | \item{reviewed}{a logical value that determines if only reviewed protein entries will be considered
29 | as the full proteome. Default is TRUE.}
30 |
31 | \item{plot}{a logical value that specifies whether the result should be plotted.}
32 |
33 | \item{interactive}{a logical value that indicates whether the plot should be interactive
34 | (default is FALSE).}
35 | }
36 | \value{
37 | A bar plot showing the percentage of the proteome detected and undetected in total
38 | and for each sample. If \code{plot = FALSE} a data frame containing the numbers is returned.
39 | }
40 | \description{
41 | Calculates the proteome coverage for each sample and for all samples combined. In other words
42 | the fraction of detected proteins to all proteins in the proteome is calculated.
43 | }
44 | \examples{
45 | \donttest{
46 | # Create example data
47 | proteome <- data.frame(id = 1:4518)
48 | data <- data.frame(
49 | sample = c(rep("A", 101), rep("B", 1000), rep("C", 1000)),
50 | protein_id = c(proteome$id[1:100], proteome$id[1:1000], proteome$id[1000:2000])
51 | )
52 |
53 | # Calculate proteome coverage
54 | qc_proteome_coverage(
55 | data = data,
56 | sample = sample,
57 | protein_id = protein_id,
58 | organism_id = 83333,
59 | plot = FALSE
60 | )
61 |
62 | # Plot proteome coverage
63 | qc_proteome_coverage(
64 | data = data,
65 | sample = sample,
66 | protein_id = protein_id,
67 | organism_id = 83333,
68 | plot = TRUE
69 | )
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/man/calculate_imputation.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/calculate_imputation.R
3 | \name{calculate_imputation}
4 | \alias{calculate_imputation}
5 | \title{Sampling of values for imputation}
6 | \usage{
7 | calculate_imputation(
8 | min = NULL,
9 | noise = NULL,
10 | mean = NULL,
11 | sd,
12 | missingness = c("MNAR", "MAR"),
13 | method = c("ludovic", "noise"),
14 | skip_log2_transform_error = FALSE
15 | )
16 | }
17 | \arguments{
18 | \item{min}{a numeric value specifying the minimal intensity value of the precursor/peptide.
19 | Is only required if \code{method = "ludovic"} and \code{missingness = "MNAR"}.}
20 |
21 | \item{noise}{a numeric value specifying a noise value for the precursor/peptide. Is only
22 | required if \code{method = "noise"} and \code{missingness = "MNAR"}.}
23 |
24 | \item{mean}{a numeric value specifying the mean intensity value of the condition with missing
25 | values for a given precursor/peptide. Is only required if \code{missingness = "MAR"}.}
26 |
27 | \item{sd}{a numeric value specifying the mean of the standard deviation of all conditions for
28 | a given precursor/peptide.}
29 |
30 | \item{missingness}{a character value specifying the missingness type of the data, which determines
31 | how values for imputation are sampled. This can be \code{"MAR"} or \code{"MNAR"}.}
32 |
33 | \item{method}{a character value specifying the method to be used for imputation. For
34 | \code{method = "ludovic"}, MNAR missingness is sampled around a value that is three lower
35 | (log2) than the lowest intensity value recorded for the precursor/peptide. For
36 | \code{method = "noise"}, MNAR missingness is sampled around the noise value for the
37 | precursor/peptide.}
38 |
39 | \item{skip_log2_transform_error}{a logical value, if FALSE a check is performed to validate that
40 | input values are log2 transformed. If input values are > 40 the test is failed and an error is
41 | returned.}
42 | }
43 | \value{
44 | A value sampled from a normal distribution with the input parameters. Method specifics
45 | are applied to input parameters prior to sampling.
46 | }
47 | \description{
48 | \code{calculate_imputation} is a helper function that is used in the \code{impute} function.
49 | Depending on the type of missingness and method, it samples values from a normal distribution
50 | that can be used for the imputation. Note: The input intensities should be log2 transformed.
51 | }
52 |
--------------------------------------------------------------------------------
/man/qc_data_completeness.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/qc_data_completeness.R
3 | \name{qc_data_completeness}
4 | \alias{qc_data_completeness}
5 | \title{Data completeness}
6 | \usage{
7 | qc_data_completeness(
8 | data,
9 | sample,
10 | grouping,
11 | intensity,
12 | digestion = NULL,
13 | plot = TRUE,
14 | interactive = FALSE
15 | )
16 | }
17 | \arguments{
18 | \item{data}{a data frame containing at least the input variables.}
19 |
20 | \item{sample}{a character or factor column in the \code{data} data frame that contains the sample names.}
21 |
22 | \item{grouping}{a character column in the \code{data} data frame that contains either precursor
23 | or peptide identifiers.}
24 |
25 | \item{intensity}{a numeric column in the \code{data} data frame that contains any
26 | intensity values that missingness should be determined for.}
27 |
28 | \item{digestion}{optional, a character column in the \code{data} data frame that indicates the
29 | mode of digestion (limited proteolysis or tryptic digest). Alternatively, any other variable
30 | by which the data should be split can be provided.}
31 |
32 | \item{plot}{a logical value that indicates whether the result should be plotted.}
33 |
34 | \item{interactive}{a logical value that specifies whether the plot should be interactive
35 | (default is FALSE).}
36 | }
37 | \value{
38 | A bar plot that displays the percentage of data completeness over all samples.
39 | If \code{plot = FALSE} a data frame is returned. If \code{interactive = TRUE}, the plot is
40 | interactive.
41 | }
42 | \description{
43 | Calculates the percentage of data completeness. That means, what percentage of all detected
44 | precursors is present in each sample.
45 | }
46 | \examples{
47 | set.seed(123) # Makes example reproducible
48 |
49 | # Create example data
50 | data <- create_synthetic_data(
51 | n_proteins = 100,
52 | frac_change = 0.05,
53 | n_replicates = 3,
54 | n_conditions = 2,
55 | method = "effect_random"
56 | )
57 |
58 | # Determine data completeness
59 | qc_data_completeness(
60 | data = data,
61 | sample = sample,
62 | grouping = peptide,
63 | intensity = peptide_intensity_missing,
64 | plot = FALSE
65 | )
66 |
67 | # Plot data completeness
68 | qc_data_completeness(
69 | data = data,
70 | sample = sample,
71 | grouping = peptide,
72 | intensity = peptide_intensity_missing,
73 | plot = TRUE
74 | )
75 | }
76 |
--------------------------------------------------------------------------------
/R/drc_4p.R:
--------------------------------------------------------------------------------
1 | #' Dose response curve helper function
2 | #'
3 | #' This function performs the four-parameter dose response curve fit. It is the helper function
4 | #' for the fit in the \code{fit_drc_4p} function.
5 | #'
6 | #' @param data a data frame that contains at least the dose and response column the model should
7 | #' be fitted to.
8 | #' @param response a numeric column that contains the response values.
9 | #' @param dose a numeric column that contains the dose values.
10 | #' @param log_logarithmic a logical value indicating if a logarithmic or log-logarithmic model is
11 | #' fitted. If response values form a symmetric curve for non-log transformed dose values, a
12 | #' logarithmic model instead of a log-logarithmic model should be used. Usually biological dose
13 | #' response data has a log-logarithmic distribution, which is the reason this is the default.
14 | #' Log-logarithmic models are symmetric if dose values are log transformed.
15 | #' @param pb progress bar object. This is only necessary if the function is used in an iteration.
16 | #'
17 | #' @return An object of class \code{drc}. If no fit was performed a character vector with content
18 | #' "no_fit". If the \code{drc} package is not installed, a message is shown and \code{NULL} is
19 | #' returned invisibly.
20 | drc_4p <- function(data, response, dose, log_logarithmic = TRUE, pb = NULL) {
21 |   if (!requireNamespace("drc", quietly = TRUE)) {
22 |     # message() has no `call.` argument; passing one appends "FALSE" to the displayed text
23 |     message("Package \"drc\" is needed for this function to work. Please install it.")
24 |     return(invisible(NULL))
25 |   }
26 |   if (!is.null(pb)) pb$tick()
27 |
28 |   # Both branches fit the same formula and only differ in the model function
29 |   # (drc::LL.4 for the default, drc::L.4 otherwise). Any error raised while fitting is
30 |   # turned into the "no_fit" marker instead of being propagated.
31 |   if (log_logarithmic) {
32 |     result <- tryCatch(
33 |       {
34 |         suppressWarnings(drc::drm(
35 |           stats::as.formula(paste(rlang::ensym(response), "~", rlang::ensym(dose))),
36 |           data = data,
37 |           fct = drc::LL.4(names = c("hill", "min_value", "max_value", "ec_50")),
38 |           control = drc::drmc(otrace = TRUE)
39 |         ))
40 |       },
41 |       error = function(error) {
42 |         c("no_fit")
43 |       }
44 |     )
45 |   } else {
46 |     result <- tryCatch(
47 |       {
48 |         suppressWarnings(drc::drm(
49 |           stats::as.formula(paste(rlang::ensym(response), "~", rlang::ensym(dose))),
50 |           data = data,
51 |           fct = drc::L.4(names = c("hill", "min_value", "max_value", "ec_50")),
52 |           control = drc::drmc(otrace = TRUE)
53 |         ))
54 |       },
55 |       error = function(error) {
56 |         c("no_fit")
57 |       }
58 |     )
59 |   }
60 |   result
61 | }
62 |
--------------------------------------------------------------------------------
/R/anova_protti.R:
--------------------------------------------------------------------------------
1 | #' Perform ANOVA
2 | #'
3 | #' Performs a one-way ANOVA statistical test from summary statistics (mean, standard deviation
4 | #' and number of replicates per condition).
5 | #'
6 | #' @param data a data frame containing at least the input variables.
7 | #' @param grouping a character column in the \code{data} data frame that contains precursor or
8 | #' peptide identifiers.
9 | #' @param condition a character or numeric column in the \code{data} data frame that contains the
10 | #' conditions.
11 | #' @param mean_ratio a numeric column in the \code{data} data frame that contains mean intensities
12 | #' or mean intensity ratios.
13 | #' @param sd a numeric column in the \code{data} data frame that contains the standard deviation
14 | #' corresponding to the mean.
15 | #' @param n a numeric column in the \code{data} data frame that contains the number of replicates
16 | #' for which the corresponding mean was calculated. Rows with \code{n == 0} are ignored.
17 | #'
18 | #' @return a data frame that contains the between group mean square (\code{ms_group}), the within
19 | #' group mean square (\code{ms_error}), the f statistic (\code{f}) and p-values (\code{pval}) for
20 | #' each element of \code{grouping}.
21 | #' @import dplyr
22 | #' @export
23 | #'
24 | #' @examples
25 | #' data <- data.frame(
26 | #'   precursor = c("A", "A", "A", "B", "B", "B"),
27 | #'   condition = c("C1", "C2", "C3", "C1", "C2", "C3"),
28 | #'   mean = c(10, 12, 20, 11, 12, 8),
29 | #'   sd = c(2, 1, 1.5, 1, 2, 4),
30 | #'   n = c(4, 4, 4, 4, 4, 4)
31 | #' )
32 | #'
33 | #' anova_protti(
34 | #'   data,
35 | #'   grouping = precursor,
36 | #'   condition = condition,
37 | #'   mean_ratio = mean,
38 | #'   sd = sd,
39 | #'   n = n
40 | #' )
41 | anova_protti <- function(data, grouping, condition, mean_ratio, sd, n) {
42 |   result <- data %>%
43 |     dplyr::distinct({{ grouping }}, {{ condition }}, {{ mean_ratio }}, {{ sd }}, {{ n }}) %>%
44 |     dplyr::group_by({{ grouping }}) %>%
45 |     dplyr::filter({{ n }} != 0) %>%
46 |     dplyr::mutate(n_groups = dplyr::n_distinct(!!ensym(condition))) %>%
47 |     dplyr::mutate(total_n = sum({{ n }})) %>%
48 |     # The grand mean is the mean over all observations, i.e. the condition means weighted by
49 |     # their replicate counts. An unweighted mean of the condition means is only correct for
50 |     # balanced designs and inflates the between group sum of squares otherwise.
51 |     dplyr::mutate(grand_mean = sum({{ mean_ratio }} * {{ n }}) / .data$total_n) %>%
52 |     # Between group mean square: weighted squared deviations of condition means, df = k - 1
53 |     dplyr::mutate(ms_group = sum(({{ mean_ratio }} - .data$grand_mean)^2 * {{ n }}) / (.data$n_groups - 1)) %>%
54 |     # Within group mean square: pooled variance, df = N - k
55 |     dplyr::mutate(ms_error = sum({{ sd }}^2 * ({{ n }} - 1)) / (.data$total_n - .data$n_groups)) %>%
56 |     dplyr::mutate(f = .data$ms_group / .data$ms_error) %>%
57 |     dplyr::mutate(pval = stats::pf(.data$f, .data$n_groups - 1, .data$total_n - .data$n_groups, lower.tail = FALSE)) %>%
58 |     dplyr::distinct({{ grouping }}, .data$ms_group, .data$ms_error, .data$f, .data$pval) %>%
59 |     dplyr::ungroup()
60 |
61 |   result
62 | }
63 |
--------------------------------------------------------------------------------
/man/fetch_alphafold_aligned_error.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fetch_alphafold_aligned_error.R
3 | \name{fetch_alphafold_aligned_error}
4 | \alias{fetch_alphafold_aligned_error}
5 | \title{Fetch AlphaFold aligned error}
6 | \usage{
7 | fetch_alphafold_aligned_error(
8 | uniprot_ids = NULL,
9 | error_cutoff = 20,
10 | timeout = 30,
11 | max_tries = 1,
12 | return_data_frame = FALSE,
13 | show_progress = TRUE
14 | )
15 | }
16 | \arguments{
17 | \item{uniprot_ids}{a character vector of UniProt identifiers for which predictions
18 | should be fetched.}
19 |
20 | \item{error_cutoff}{a numeric value specifying the maximum position error (in Angstroms) that should be retained.
21 | Setting this value to a low number reduces the size of the retrieved data. Default is 20.}
22 |
23 | \item{timeout}{a numeric value specifying the time in seconds until the download times out.
24 | The default is 30 seconds.}
25 |
26 | \item{max_tries}{a numeric value that specifies the number of times the function tries to download
27 | the data in case an error occurs. The default is 1.}
28 |
29 | \item{return_data_frame}{a logical value; if \code{TRUE} a data frame instead of a list
30 | is returned. It is recommended to only use this if information for few proteins is retrieved.
31 | Default is \code{FALSE}.}
32 |
33 | \item{show_progress}{a logical value; if \code{TRUE} a progress bar will be shown.
34 | Default is \code{TRUE}.}
35 | }
36 | \value{
37 | A list that contains aligned errors for AlphaFold predictions. If return_data_frame is
38 | TRUE, a data frame with this information is returned instead. The data frame contains the
39 | following columns:
40 | \itemize{
41 | \item scored_residue: The error for this position is calculated based on the alignment to the
42 | aligned residue.
43 | \item aligned_residue: The residue that is aligned for the calculation of the error of the scored
44 | residue
45 | \item error: The predicted aligned error computed by alpha fold.
46 | \item accession: The UniProt protein identifier.
47 | }
48 | }
49 | \description{
50 | Fetches the aligned error for AlphaFold predictions for provided proteins.
51 | The aligned error is useful for assessing inter-domain accuracy. In detail it
52 | represents the expected position error at residue x (scored residue), when
53 | the predicted and true structures are aligned on residue y (aligned residue).
54 | }
55 | \examples{
56 | \donttest{
57 | aligned_error <- fetch_alphafold_aligned_error(
58 | uniprot_ids = c("F4HVG8", "O15552"),
59 | error_cutoff = 5,
60 | return_data_frame = TRUE
61 | )
62 |
63 | head(aligned_error, n = 10)
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/man/qc_sample_correlation.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/qc_sample_correlation.R
3 | \name{qc_sample_correlation}
4 | \alias{qc_sample_correlation}
5 | \title{Correlation based hierarchical clustering of samples}
6 | \usage{
7 | qc_sample_correlation(
8 | data,
9 | sample,
10 | grouping,
11 | intensity_log2,
12 | condition,
13 | digestion = NULL,
14 | run_order = NULL,
15 | method = "spearman",
16 | interactive = FALSE
17 | )
18 | }
19 | \arguments{
20 | \item{data}{a data frame that contains at least the input variables.}
21 |
22 | \item{sample}{a character column in the \code{data} data frame that contains the sample names.}
23 |
24 | \item{grouping}{a character column in the \code{data} data frame that contains precursor or
25 | peptide identifiers.}
26 |
27 | \item{intensity_log2}{a numeric column in the \code{data} data frame that contains log2
28 | intensity values.}
29 |
30 | \item{condition}{a character or numeric column in the \code{data} data frame that contains the
31 | conditions.}
32 |
33 | \item{digestion}{optional, a character column in the \code{data} data frame that contains
34 | information about the digestion method used. e.g. "LiP" or "tryptic control".}
35 |
36 | \item{run_order}{optional, a character or numeric column in the \code{data} data frame that
37 | contains the order in which samples were measured. Useful to investigate batch effects due to
38 | run order.}
39 |
40 | \item{method}{a character value that specifies the method to be used for correlation.
41 | \code{"spearman"} is the default but can be changed to \code{"pearson"} or \code{"kendall"}.}
42 |
43 | \item{interactive}{a logical value that specifies whether the plot should be interactive.
44 | Determines if an interactive or static heatmap should be created using \code{heatmaply} or
45 | \code{pheatmap}, respectively.}
46 | }
47 | \value{
48 | A correlation heatmap that compares each sample. The dendrogram is sorted by optimal
49 | leaf ordering.
50 | }
51 | \description{
52 | A correlation heatmap is created that uses hierarchical clustering to determine sample similarity.
53 | }
54 | \examples{
55 | \donttest{
56 | set.seed(123) # Makes example reproducible
57 |
58 | # Create example data
59 | data <- create_synthetic_data(
60 | n_proteins = 100,
61 | frac_change = 0.05,
62 | n_replicates = 3,
63 | n_conditions = 2,
64 | method = "effect_random"
65 | )
66 |
67 | # Create sample correlation heatmap
68 | qc_sample_correlation(
69 | data = data,
70 | sample = sample,
71 | grouping = peptide,
72 | intensity_log2 = peptide_intensity_missing,
73 | condition = condition
74 | )
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/man/randomise_queue.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/randomise_queue.R
3 | \name{randomise_queue}
4 | \alias{randomise_queue}
5 | \title{Randomise samples in MS queue}
6 | \usage{
7 | randomise_queue(data = NULL, rows = NULL, export = FALSE)
8 | }
9 | \arguments{
10 | \item{data}{optional, a data frame that contains a queue. If not provided a queue file can be
11 | chosen interactively.}
12 |
13 | \item{rows}{optional, a numeric vector that specifies a range of rows for which samples
14 | should be randomized.}
15 |
16 | \item{export}{a logical value that determines if a \code{"randomised_queue.csv"} file will be
17 | saved in the working directory. If FALSE a data frame will be returned.}
18 | }
19 | \value{
20 | If \code{export = TRUE} a \code{"randomised_queue.csv"} file will be saved in the
21 | working directory. If \code{export = FALSE} a data frame that contains the randomised queue
22 | is returned.
23 | }
24 | \description{
25 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
26 | This function randomises the order of samples in an MS queue. QC and Blank samples are left in
27 | place. It is also possible to randomise only parts of the queue. Before running this make sure
28 | to set a specific seed with the \code{set.seed()} function. This ensures that the randomisation
29 | of the result is consistent if the function is run again.
30 | }
31 | \examples{
32 | queue <- create_queue(
33 | date = c("200722"),
34 | instrument = c("EX1"),
35 | user = c("jquast"),
36 | measurement_type = c("DIA"),
37 | experiment_name = c("JPQ031"),
38 | digestion = c("LiP", "tryptic control"),
39 | treatment_type_1 = c("EDTA", "H2O"),
40 | treatment_type_2 = c("Zeba", "unfiltered"),
41 | treatment_dose_1 = c(10, 30, 60),
42 | treatment_unit_1 = c("min"),
43 | n_replicates = 4,
44 | number_runs = FALSE,
45 | organism = c("E. coli"),
46 | exclude_combinations = list(list(
47 | treatment_type_1 = c("H2O"),
48 | treatment_type_2 = c("Zeba", "unfiltered"),
49 | treatment_dose_1 = c(10, 30)
50 | )),
51 | inj_vol = c(2),
52 | data_path = "D:\\\\2007_Data",
53 | method_path = "C:\\\\Xcalibur\\\\methods\\\\DIA_120min",
54 | position_row = c("A", "B", "C", "D", "E", "F"),
55 | position_column = 8,
56 | blank_every_n = 4,
57 | blank_position = "1-V1",
58 | blank_method_path = "C:\\\\Xcalibur\\\\methods\\\\blank"
59 | )
60 |
61 | head(queue, n = 20)
62 |
63 | randomised_queue <- randomise_queue(
64 | data = queue,
65 | export = FALSE
66 | )
67 |
68 | head(randomised_queue, n = 20)
69 | }
70 |
--------------------------------------------------------------------------------
/man/qc_cvs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/qc_cvs.R
3 | \name{qc_cvs}
4 | \alias{qc_cvs}
5 | \title{Check CV distribution}
6 | \usage{
7 | qc_cvs(
8 | data,
9 | grouping,
10 | condition,
11 | intensity,
12 | plot = TRUE,
13 | plot_style = "density",
14 | max_cv = 200
15 | )
16 | }
17 | \arguments{
18 | \item{data}{a data frame containing at least peptide, precursor or protein identifiers,
19 | information on conditions and intensity values for each peptide, precursor or protein.}
20 |
21 | \item{grouping}{a character column in the \code{data} data frame that contains the grouping
22 | variables (e.g. peptides, precursors or proteins).}
23 |
24 | \item{condition}{a character or factor column in the \code{data} data frame that contains condition information
25 | (e.g. "treated" and "control").}
26 |
27 | \item{intensity}{a numeric column in the \code{data} data frame that contains the corresponding
28 | raw or untransformed normalised intensity values for each peptide or precursor.}
29 |
30 | \item{plot}{a logical value that indicates whether the result should be plotted.}
31 |
32 | \item{plot_style}{a character value that indicates the plotting style. \code{plot_style = "boxplot"}
33 | plots a boxplot, whereas \code{plot_style = "density"} plots the CV density distribution.
34 | \code{plot_style = "violin"} returns a violin plot. Default is \code{plot_style = "density"}.}
35 |
36 | \item{max_cv}{a numeric value that specifies the maximum percentage of CVs that should be included
37 | in the returned plot. The default value is \code{max_cv = 200}.}
38 | }
39 | \value{
40 | Either a data frame with the median CVs in \% or a plot showing the distribution of the CVs
41 | is returned.
42 | }
43 | \description{
44 | Calculates and plots the coefficients of variation for the selected grouping.
45 | }
46 | \examples{
47 | # Load libraries
48 | library(dplyr)
49 |
50 | set.seed(123) # Makes example reproducible
51 |
52 | # Create example data
53 | data <- create_synthetic_data(
54 | n_proteins = 100,
55 | frac_change = 0.05,
56 | n_replicates = 3,
57 | n_conditions = 2,
58 | method = "effect_random"
59 | ) \%>\%
60 | mutate(intensity_non_log2 = 2^peptide_intensity_missing)
61 |
62 | # Calculate coefficients of variation
63 | qc_cvs(
64 | data = data,
65 | grouping = peptide,
66 | condition = condition,
67 | intensity = intensity_non_log2,
68 | plot = FALSE
69 | )
70 |
71 | # Plot coefficients of variation
72 | # Different plot styles are available
73 | qc_cvs(
74 | data = data,
75 | grouping = peptide,
76 | condition = condition,
77 | intensity = intensity_non_log2,
78 | plot = TRUE,
79 | plot_style = "violin"
80 | )
81 | }
82 |
--------------------------------------------------------------------------------
/man/fetch_eco.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fetch_eco.R
3 | \name{fetch_eco}
4 | \alias{fetch_eco}
5 | \title{Fetch evidence & conclusion ontology}
6 | \usage{
7 | fetch_eco(
8 | return_relation = FALSE,
9 | return_history = FALSE,
10 | show_progress = TRUE
11 | )
12 | }
13 | \arguments{
14 | \item{return_relation}{a logical value that indicates if relational information should be returned instead
15 | of the main descriptive information. This data can be used to check the relations of ECO terms to each other.
16 | Default is FALSE.}
17 |
18 | \item{return_history}{a logical value that indicates if the entry history of an ECO term should be
19 | returned instead of the main descriptive information.
20 | Default is FALSE.}
21 |
22 | \item{show_progress}{a logical value that indicates if a progress bar will be shown.
23 | Default is TRUE.}
24 | }
25 | \value{
26 | A data frame that contains descriptive information about each ECO term in the EBI database.
27 | If either \code{return_relation} or \code{return_history} is set to \code{TRUE}, the respective information is
28 | returned instead of the usual output.
29 | }
30 | \description{
31 | Fetches all evidence & conclusion ontology (ECO) information from the QuickGO EBI database. The ECO project is
32 | maintained through a public \href{https://github.com/evidenceontology/evidenceontology}{GitHub repository}.
33 | }
34 | \details{
35 | According to the GitHub repository ECO is defined as follows:
36 |
37 | "The Evidence & Conclusion Ontology (ECO) describes types of scientific evidence within the
38 | biological research domain that arise from laboratory experiments, computational methods,
39 | literature curation, or other means. Researchers use evidence to support conclusions
40 | that arise out of scientific research. Documenting evidence during scientific research
41 | is essential, because evidence gives us a sense of why we believe what we think we know.
42 | Conclusions are asserted as statements about things that are believed to be true, for
43 | example that a protein has a particular function (i.e. a protein functional annotation) or
44 | that a disease is associated with a particular gene variant (i.e. a phenotype-gene association).
45 | A systematic and structured (i.e. ontological) classification of evidence allows us to store,
46 | retrieve, share, and compare data associated with that evidence using computers, which are
47 | essential to navigating the ever-growing (in size and complexity) corpus of scientific
48 | information."
49 |
50 | More information can be found in their publication (\doi{10.1093/nar/gky1036}).
51 | }
52 | \examples{
53 | \donttest{
54 | eco <- fetch_eco()
55 |
56 | head(eco)
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/man/filter_cv.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/filter_cv.R
3 | \name{filter_cv}
4 | \alias{filter_cv}
5 | \title{Data filtering based on coefficients of variation (CV)}
6 | \usage{
7 | filter_cv(
8 | data,
9 | grouping,
10 | condition,
11 | log2_intensity,
12 | cv_limit = 0.25,
13 | min_conditions,
14 | silent = FALSE
15 | )
16 | }
17 | \arguments{
18 | \item{data}{a data frame that contains at least the input variables.}
19 |
20 | \item{grouping}{a character column in the \code{data} data frame that contains the grouping
21 | variable that can be either precursors, peptides or proteins.}
22 |
23 | \item{condition}{a character or numeric column in the \code{data} data frame that contains
24 | information on the sample condition.}
25 |
26 | \item{log2_intensity}{a numeric column in the \code{data} data frame that contains log2
27 | transformed intensities.}
28 |
29 | \item{cv_limit}{optional, a numeric value that specifies the CV cutoff that will be applied.
30 | Default is 0.25.}
31 |
32 | \item{min_conditions}{a numeric value that specifies the minimum number of conditions for
33 | which grouping CVs should be below the cutoff.}
34 |
35 | \item{silent}{a logical value that specifies if a message with the number of filtered out
36 | conditions should be returned. Default is FALSE.}
37 | }
38 | \value{
39 | The CV filtered data frame.
40 | }
41 | \description{
42 | Filters the input data based on precursor, peptide or protein intensity coefficients of variation.
43 | The function should be used to ensure that only robust measurements and quantifications are used for
44 | data analysis. It is advised to use the function after inspection of raw values (quality control)
45 | and median normalisation. Generally, the function calculates CVs of each peptide, precursor or
46 | protein for each condition and removes peptides, precursors or proteins that have a CV below
47 | the cutoff in fewer than the (user-defined) required number of conditions. Since the user-defined
48 | cutoff is fixed and does not depend on the number of conditions that have detected values, the
49 | function might bias for data completeness.
50 | }
51 | \examples{
52 | set.seed(123) # Makes example reproducible
53 |
54 | # Create synthetic data
55 | data <- create_synthetic_data(
56 | n_proteins = 50,
57 | frac_change = 0.05,
58 | n_replicates = 3,
59 | n_conditions = 2,
60 | method = "effect_random",
61 | additional_metadata = FALSE
62 | )
63 |
64 | # Filter coefficients of variation
65 | data_filtered <- filter_cv(
66 | data = data,
67 | grouping = peptide,
68 | condition = condition,
69 | log2_intensity = peptide_intensity_missing,
70 | cv_limit = 0.25,
71 | min_conditions = 2
72 | )
73 | }
74 |
--------------------------------------------------------------------------------
/R/fetch_kegg.R:
--------------------------------------------------------------------------------
1 | #' Fetch KEGG pathway data from KEGG
2 | #'
3 | #' Fetches gene IDs and corresponding pathway IDs and names for the provided organism.
4 | #' UniProt IDs matching the KEGG gene IDs are added as well.
5 | #'
6 | #' @param species a character value providing an abbreviated species name. "hsa" for human, "eco"
7 | #' for E. coli and "sce" for S. cerevisiae. Additional possible names can be found for
8 | #' \href{https://www.genome.jp/kegg-bin/show_organism?category=Eukaryotes}{eukaryotes} and for
9 | #' \href{https://www.genome.jp/kegg-bin/show_organism?category=Prokaryotes}{prokaryotes}.
10 | #'
11 | #' @return A data frame that contains gene IDs with corresponding pathway IDs and names for a
12 | #' selected organism. If there is no internet connection or one of the queries returns a
13 | #' character value instead of data, that value is shown as a message and \code{NULL} is
14 | #' returned invisibly.
15 | #' @importFrom dplyr left_join
16 | #' @importFrom stringr str_replace_all
17 | #' @importFrom magrittr %>%
18 | #' @importFrom curl has_internet
19 | #' @export
20 | #'
21 | #' @examples
22 | #' \donttest{
23 | #' kegg <- fetch_kegg(species = "hsa")
24 | #'
25 | #' head(kegg)
26 | #' }
27 | fetch_kegg <- function(species) {
28 |   if (!curl::has_internet()) {
29 |     message("No internet connection.")
30 |     return(invisible(NULL))
31 |   }
32 |
33 |   # Step 1: links between KEGG gene IDs and pathway IDs
34 |   gene_pathway <- try_query(
35 |     paste0("https://rest.kegg.jp/link/pathway/", species),
36 |     col_names = FALSE,
37 |     progress = FALSE,
38 |     show_col_types = FALSE
39 |   )
40 |   # A character result is reported as a message instead of being processed
41 |   if (methods::is(gene_pathway, "character")) {
42 |     message(gene_pathway)
43 |     return(invisible(NULL))
44 |   }
45 |   colnames(gene_pathway) <- c("kegg_id", "pathway_id")
46 |   gene_pathway$pathway_id <- stringr::str_replace_all(
47 |     gene_pathway$pathway_id,
48 |     pattern = "path:",
49 |     replacement = ""
50 |   )
51 |
52 |   # Step 2: names of the pathway IDs
53 |   pathway_names <- try_query(
54 |     paste0("https://rest.kegg.jp/list/pathway/", species),
55 |     col_names = FALSE,
56 |     progress = FALSE,
57 |     show_col_types = FALSE
58 |   )
59 |   if (methods::is(pathway_names, "character")) {
60 |     message(pathway_names)
61 |     return(invisible(NULL))
62 |   }
63 |   colnames(pathway_names) <- c("pathway_id", "pathway_name")
64 |
65 |   # Step 3: conversion of KEGG gene IDs to UniProt IDs
66 |   kegg_to_uniprot <- try_query(
67 |     paste0("https://rest.kegg.jp/conv/uniprot/", species),
68 |     col_names = FALSE,
69 |     progress = FALSE,
70 |     show_col_types = FALSE
71 |   )
72 |   if (methods::is(kegg_to_uniprot, "character")) {
73 |     message(kegg_to_uniprot)
74 |     return(invisible(NULL))
75 |   }
76 |   colnames(kegg_to_uniprot) <- c("kegg_id", "uniprot_id")
77 |   kegg_to_uniprot$uniprot_id <- stringr::str_replace_all(
78 |     kegg_to_uniprot$uniprot_id,
79 |     pattern = "up:",
80 |     replacement = ""
81 |   )
82 |
83 |   # Step 4: attach pathway names, then UniProt IDs, to the gene/pathway links
84 |   with_names <- dplyr::left_join(gene_pathway, pathway_names, by = "pathway_id")
85 |   dplyr::left_join(
86 |     with_names,
87 |     kegg_to_uniprot,
88 |     by = "kegg_id",
89 |     relationship = "many-to-many"
90 |   )
91 | }
92 |
--------------------------------------------------------------------------------
/man/qc_ranked_intensities.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/qc_ranked_intensities.R
3 | \name{qc_ranked_intensities}
4 | \alias{qc_ranked_intensities}
5 | \title{Check ranked intensities}
6 | \usage{
7 | qc_ranked_intensities(
8 | data,
9 | sample,
10 | grouping,
11 | intensity_log2,
12 | facet = FALSE,
13 | plot = FALSE,
14 | y_axis_transformation = "log10",
15 | interactive = FALSE
16 | )
17 | }
18 | \arguments{
19 | \item{data}{a data frame that contains at least sample names, grouping identifiers (precursor,
20 | peptide or protein) and log2 transformed intensities for each grouping identifier.}
21 |
22 | \item{sample}{a character column in the \code{data} data frame that contains the sample names.}
23 |
24 | \item{grouping}{a character column in the \code{data} data frame that contains protein, precursor,
25 | or peptide identifiers.}
26 |
27 | \item{intensity_log2}{a numeric column in the \code{data} data frame that contains the log2
28 | transformed intensities of the selected grouping variable.}
29 |
30 | \item{facet}{a logical value that specifies whether the calculation should be done group wise by
31 | sample and if the resulting plot should be faceted by sample. (default is \code{FALSE}).
32 | If \code{facet = FALSE} the median of each protein intensity will be returned.}
33 |
34 | \item{plot}{a logical value that specifies whether the result should be plotted (default is \code{FALSE}).}
35 |
36 | \item{y_axis_transformation}{a character value that determines the y-axis transformation. The
37 | value is either "log2" or "log10" (default is "log10").}
38 |
39 | \item{interactive}{a logical value that specifies whether the plot should be interactive
40 | (default is \code{FALSE}).}
41 | }
42 | \value{
43 | A data frame containing the ranked intensities is returned. If \code{plot = TRUE} a plot
44 | is returned. The intensities are log10 transformed for the plot.
45 | }
46 | \description{
47 | Calculates and plots ranked intensities for proteins, peptides or precursors.
48 | }
49 | \examples{
50 | set.seed(123) # Makes example reproducible
51 |
52 | # Create synthetic data
53 | data <- create_synthetic_data(
54 | n_proteins = 50,
55 | frac_change = 0.05,
56 | n_replicates = 4,
57 | n_conditions = 3,
58 | method = "effect_random",
59 | additional_metadata = FALSE
60 | )
61 |
62 | # Plot ranked intensities for all samples combined
63 | qc_ranked_intensities(
64 | data = data,
65 | sample = sample,
66 | grouping = peptide,
67 | intensity_log2 = peptide_intensity,
68 | plot = TRUE,
69 | )
70 |
71 | # Plot ranked intensities for each sample separately
72 | qc_ranked_intensities(
73 | data = data,
74 | sample = sample,
75 | grouping = peptide,
76 | intensity_log2 = peptide_intensity,
77 | plot = TRUE,
78 | facet = TRUE
79 | )
80 |
81 | }
82 |
--------------------------------------------------------------------------------
/R/find_all_subs.R:
--------------------------------------------------------------------------------
1 | #' Find all sub IDs of an ID in a network
2 | #'
3 | #' For a given ID, find all sub IDs and their sub IDs etc. The type of
4 | #' relationship can be selected too. This is a helper function for other functions.
5 | #'
6 | #' @param data a data frame that contains relational information on IDs (main_id) their sub
7 | #' IDs (sub_id) and their relationship (type). For ChEBI this data frame can be obtained by calling
8 | #' \code{fetch_chebi(relation = TRUE)}. For ECO data it can be obtained by calling
9 | #' \code{fetch_eco(return_relation = TRUE)}.
10 | #' @param ids a character vector of IDs for which sub IDs should be searched.
11 | #' @param main_id a character or integer column containing IDs. Default is \code{id} for ChEBI IDs.
12 | #' @param type a character column that contains the type of interactions. Default is \code{type} for ChEBI IDs.
13 | #' @param accepted_types a character vector containing the accepted_types of relationships that should be considered
14 | #' for the search. It is possible to use "all" relationships. The default type is "is_a". A list of
15 | #' possible relationships for e.g. ChEBI IDs can be found
16 | #' \href{https://docs.google.com/document/d/1_w-DwBdCCOh1gMeeP6yqGzcnkpbHYOa3AGSODe5epcg/edit#heading=h.hnsqoqu978s5}{here}.
17 | #' @param exclude_parent_id a logical value that specifies if the parent ID should be excluded from
18 | #' the returned list. Default is \code{FALSE}.
19 | #'
20 | #' @return A list of character vectors containing the provided ID and all of its sub IDs. It
21 | #' contains one element per input ID. The element is \code{NULL} for an ID that is not found in
22 | #' the \code{main_id} column after filtering for \code{accepted_types}. If the \code{igraph}
23 | #' package is not installed, a message is shown and \code{NULL} is returned invisibly.
24 | #' @importFrom dplyr select filter pull
25 | #' @importFrom magrittr %>%
26 | #' @importFrom purrr map
27 | #' @importFrom rlang .data
28 | find_all_subs <- function(data,
29 |                           ids,
30 |                           main_id = id,
31 |                           type = type,
32 |                           accepted_types = "is_a",
33 |                           exclude_parent_id = FALSE) {
34 |   if (!requireNamespace("igraph", quietly = TRUE)) {
35 |     # message() has no `call.` argument: passing `call. = FALSE` would be pasted
36 |     # into the text and the message would end in "FALSE".
37 |     message("Package \"igraph\" is needed for this function to work. Please install it.")
38 |     return(invisible(NULL))
39 |   }
40 |   # Only the single value "all" keeps every relationship type; anything else is
41 |   # treated as the set of relationship types to keep.
42 |   use_all_types <- length(accepted_types) == 1 && accepted_types == "all"
43 |   if (!use_all_types) {
44 |     data <- data %>%
45 |       dplyr::filter({{ type }} %in% accepted_types)
46 |   }
47 |   # The relationship column is dropped before the remaining columns are turned into a graph
48 |   data <- data %>%
49 |     dplyr::select(-{{ type }})
50 |
51 |   # Generate graph
52 |   g <- igraph::graph_from_data_frame(data, directed = TRUE)
53 |
54 |   # IDs available as search starting points; extracted once instead of once per searched ID
55 |   known_ids <- dplyr::pull(data, {{ main_id }})
56 |
57 |   result <- purrr::map(ids, function(x) {
58 |     if (!(x %in% known_ids)) {
59 |       return(NULL)
60 |     }
61 |     # All vertices reachable from x along outgoing edges (x itself included)
62 |     r <- igraph::subcomponent(g, match(x, igraph::V(g)$name), "out")$name
63 |     if (exclude_parent_id) {
64 |       r <- r[r != x]
65 |     }
66 |
67 |     r
68 |   })
69 |   result
70 | }
71 |
--------------------------------------------------------------------------------
/man/calculate_aa_scores.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/calculate_aa_scores.R
3 | \name{calculate_aa_scores}
4 | \alias{calculate_aa_scores}
5 | \title{Calculate scores for each amino acid position in a protein sequence}
6 | \usage{
7 | calculate_aa_scores(
8 | data,
9 | protein,
10 | diff = diff,
11 | adj_pval = adj_pval,
12 | start_position,
13 | end_position,
14 | retain_columns = NULL
15 | )
16 | }
17 | \arguments{
18 | \item{data}{a data frame containing at least the input columns.}
19 |
20 | \item{protein}{a character column in the data frame containing the protein identifier or name.}
21 |
22 | \item{diff}{a numeric column in the \code{data} data frame containing the log2 fold change.}
23 |
24 | \item{adj_pval}{a numeric column in the \code{data} data frame containing the adjusted p-value.}
25 |
26 | \item{start_position}{a numeric column in the \code{data} data frame containing the start position
27 | of a peptide or precursor.}
28 |
29 | \item{end_position}{a numeric column in the data frame containing the end position of a peptide or
30 | precursor.}
31 |
32 | \item{retain_columns}{a vector indicating if certain columns should be retained from the input
33 | data frame. Default is not retaining additional columns \code{retain_columns = NULL}. Specific
34 | columns can be retained by providing their names (not in quotations marks, just like other
35 | column names, but in a vector).}
36 | }
37 | \value{
38 | A data frame that contains the aggregated scores per amino acid position, enabling to
39 | draw fingerprints for each individual protein.
40 | }
41 | \description{
42 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
43 | Calculate a score for each amino acid position in a protein sequence based on the product of the
44 | -log10(adjusted p-value) and the absolute log2(fold change) per peptide covering this amino acid. In detail, all the
45 | peptides are aligned along the sequence of the corresponding protein, and the average score per
46 | amino acid position is computed. In a limited proteolysis coupled to mass spectrometry (LiP-MS)
47 | experiment, the score helps to prioritize and narrow down structurally affected regions.
48 | }
49 | \examples{
50 |
51 | data <- data.frame(
52 | pg_protein_accessions = c(rep("protein_1", 10)),
53 | diff = c(2, -3, 1, 2, 3, -3, 5, 1, -0.5, 2),
54 | adj_pval = c(0.001, 0.01, 0.2, 0.05, 0.002, 0.5, 0.4, 0.7, 0.001, 0.02),
55 | start = c(1, 3, 5, 10, 15, 25, 28, 30, 41, 51),
56 | end = c(6, 8, 10, 16, 23, 35, 35, 35, 48, 55)
57 | )
58 | calculate_aa_scores(
59 | data,
60 | protein = pg_protein_accessions,
61 | diff = diff,
62 | adj_pval = adj_pval,
63 | start_position = start,
64 | end_position = end
65 | )
66 | }
67 | \author{
68 | Patrick Stalder
69 | }
70 |
--------------------------------------------------------------------------------
/man/qc_peak_width.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/qc_peak_width.R
3 | \name{qc_peak_width}
4 | \alias{qc_peak_width}
5 | \title{Peak width over retention time}
6 | \usage{
7 | qc_peak_width(
8 | data,
9 | sample,
10 | intensity,
11 | retention_time,
12 | peak_width = NULL,
13 | retention_time_start = NULL,
14 | retention_time_end = NULL,
15 | remove_na_intensities = TRUE,
16 | interactive = FALSE
17 | )
18 | }
19 | \arguments{
20 | \item{data}{a data frame containing at least sample names and protein IDs.}
21 |
22 | \item{sample}{a character column in the \code{data} data frame that contains the sample names.}
23 |
24 | \item{intensity}{a numeric column in the \code{data} data frame that contains intensities. If
25 | \code{remove_na_intensities = FALSE}, this argument is not required.}
26 |
27 | \item{retention_time}{a numeric column in the \code{data} data frame that contains retention
28 | times of precursors.}
29 |
30 | \item{peak_width}{a numeric column in the \code{data} data frame that contains peak width
31 | information. It is not required if \code{retention_time_start} and \code{retention_time_end}
32 | columns are provided.}
33 |
34 | \item{retention_time_start}{a numeric column in the \code{data} data frame that contains the
35 | start time of the precursor elution peak. It is not required if the \code{peak_width} column
36 | is provided.}
37 |
38 | \item{retention_time_end}{a numeric column in the \code{data} data frame that contains the end
39 | time of the precursor elution peak. It is not required if the \code{peak_width} column is
40 | provided.}
41 |
42 | \item{remove_na_intensities}{a logical value that specifies if sample/grouping combinations
43 | with intensities that are NA (not quantified IDs) should be dropped from the data frame.
44 | Default is TRUE since we are usually interested in the peak width of quantifiable data.}
45 |
46 | \item{interactive}{a logical value that specifies whether the plot should be interactive
47 | (default is FALSE).}
48 | }
49 | \value{
50 | A line plot displaying one minute binned median precursor elution peak width over
51 | retention time for each sample.
52 | }
53 | \description{
54 | Plots one minute binned median precursor elution peak width over retention time for each sample.
55 | }
56 | \examples{
57 |
58 | data <- data.frame(
59 | r_file_name = c(rep("sample_1", 10), rep("sample2", 10)),
60 | fg_quantity = c(rep(2000, 20)),
61 | eg_mean_apex_rt = c(rep(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 2)),
62 | eg_start_rt = c(0.5, 1, 3, 4, 5, 6, 7, 7.5, 8, 9, 1, 2, 2, 3, 4, 5, 5, 8, 9, 9),
63 | eg_end_rt = c(
64 | 1.5, 2, 3.1, 4.5, 5.8, 6.6, 8, 8, 8.4,
65 | 9.1, 3, 2.2, 4, 3.4, 4.5, 5.5, 5.6, 8.3, 10, 12
66 | )
67 | )
68 | qc_peak_width(
69 | data,
70 | sample = r_file_name,
71 | intensity = fg_quantity,
72 | retention_time = eg_mean_apex_rt,
73 | retention_time_start = eg_start_rt,
74 | retention_time_end = eg_end_rt
75 | )
76 | }
77 |
--------------------------------------------------------------------------------
/man/qc_ids.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/qc_ids.R
3 | \name{qc_ids}
4 | \alias{qc_ids}
5 | \title{Check number of precursor, peptide or protein IDs}
6 | \usage{
7 | qc_ids(
8 | data,
9 | sample,
10 | grouping,
11 | intensity,
12 | remove_na_intensities = TRUE,
13 | condition = NULL,
14 | title = "ID count per sample",
15 | plot = TRUE,
16 | interactive = FALSE
17 | )
18 | }
19 | \arguments{
20 | \item{data}{a data frame containing at least sample names and precursor/peptide/protein IDs.}
21 |
22 | \item{sample}{a character or factor column in the \code{data} data frame that contains the sample name.}
23 |
24 | \item{grouping}{a character column in the \code{data} data frame that contains either precursor or
25 | peptide identifiers.}
26 |
27 | \item{intensity}{a numeric column in the \code{data} data frame that contains raw or log2
28 | transformed intensities. If \code{remove_na_intensities = FALSE}, this argument is optional.}
29 |
30 | \item{remove_na_intensities}{a logical value that specifies if sample/grouping combinations with
31 | intensities that are NA (not quantified IDs) should be dropped from the data frame. Default is
32 | TRUE since we are usually interested in the number of quantifiable IDs.}
33 |
34 | \item{condition}{optional, a column in the \code{data} data frame that contains condition information
35 | (e.g. "treated" and "control"). If this column is provided, the bars in the plot will be coloured
36 | according to the condition.}
37 |
38 | \item{title}{optional, a character value that specifies the plot title (default is "ID count
39 | per sample").}
40 |
41 | \item{plot}{a logical value that indicates whether the result should be plotted.}
42 |
43 | \item{interactive}{a logical value that specifies whether the plot should be interactive
44 | (default is FALSE).}
45 | }
46 | \value{
47 | A bar plot with the height corresponding to the number of IDs, each bar represents one
48 | sample (if \code{plot = TRUE}). If \code{plot = FALSE} a table with ID counts is returned.
49 | }
50 | \description{
51 | Returns a plot or table of the number of IDs for each sample. The default settings remove
52 | grouping variables without quantitative information (intensity is NA). These will not be
53 | counted as IDs.
54 | }
55 | \examples{
56 | set.seed(123) # Makes example reproducible
57 |
58 | # Create example data
59 | data <- create_synthetic_data(
60 | n_proteins = 100,
61 | frac_change = 0.05,
62 | n_replicates = 3,
63 | n_conditions = 2,
64 | method = "effect_random"
65 | )
66 |
67 | # Calculate number of identifications
68 | qc_ids(
69 | data = data,
70 | sample = sample,
71 | grouping = peptide,
72 | intensity = peptide_intensity_missing,
73 | condition = condition,
74 | plot = FALSE
75 | )
76 |
77 | # Plot number of identifications
78 | qc_ids(
79 | data = data,
80 | sample = sample,
81 | grouping = peptide,
82 | intensity = peptide_intensity_missing,
83 | condition = condition,
84 | plot = TRUE
85 | )
86 | }
87 |
--------------------------------------------------------------------------------
/man/qc_pca.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/qc_pca.R
3 | \name{qc_pca}
4 | \alias{qc_pca}
5 | \title{Plot principal component analysis}
6 | \usage{
7 | qc_pca(
8 | data,
9 | sample,
10 | grouping,
11 | intensity,
12 | condition,
13 | components = c("PC1", "PC2"),
14 | digestion = NULL,
15 | plot_style = "pca"
16 | )
17 | }
18 | \arguments{
19 | \item{data}{a data frame that contains sample names, peptide or precursor identifiers,
20 | corresponding intensities and a condition column indicating e.g. the treatment.}
21 |
22 | \item{sample}{a character column in the \code{data} data frame that contains the sample name.}
23 |
24 | \item{grouping}{a character column in the \code{data} data frame that contains either precursor
25 | or peptide identifiers.}
26 |
27 | \item{intensity}{a numeric column in the \code{data} data frame that contains the corresponding
28 | intensity values for each peptide or precursor.}
29 |
30 | \item{condition}{a numeric or character column in the \code{data} data frame that contains condition information
31 | (e.g. "treated" and "control").}
32 |
33 | \item{components}{a character vector indicating the two components that should be displayed in
34 | the plot. By default these are PC1 and PC2. You can provide these using a character vector of
35 | the form c("PC1", "PC2").}
36 |
37 | \item{digestion}{optional, a character column in the \code{data} data frame that indicates the
38 | mode of digestion (limited proteolysis or tryptic digest). Alternatively, any other variable
39 | by which the data should be split can be provided.}
40 |
41 | \item{plot_style}{a character value that specifies what plot should be returned. If
42 | \code{plot_style = "pca"} is selected the two PCA components supplied with the \code{components} argument
43 | are plotted against each other. This is the default. \code{plot_style = "scree"} returns a scree
44 | plot that displays the variance explained by each principal component in percent. The scree is
45 | useful for checking if any other than the default first two components should be plotted.}
46 | }
47 | \value{
48 | A principal component analysis plot showing PC1 and PC2. If \code{plot_style = "scree"}, a
49 | scree plot for all dimensions is returned.
50 | }
51 | \description{
52 | Plots a principal component analysis based on peptide or precursor intensities.
53 | }
54 | \examples{
55 | set.seed(123) # Makes example reproducible
56 |
57 | # Create example data
58 | data <- create_synthetic_data(
59 | n_proteins = 100,
60 | frac_change = 0.05,
61 | n_replicates = 3,
62 | n_conditions = 2,
63 | )
64 |
65 | # Plot scree plot
66 | qc_pca(
67 | data = data,
68 | sample = sample,
69 | grouping = peptide,
70 | intensity = peptide_intensity_missing,
71 | condition = condition,
72 | plot_style = "scree"
73 | )
74 |
75 | # Plot principal components
76 | qc_pca(
77 | data = data,
78 | sample = sample,
79 | grouping = peptide,
80 | intensity = peptide_intensity_missing,
81 | condition = condition
82 | )
83 | }
84 |
--------------------------------------------------------------------------------
/R/normalise.R:
--------------------------------------------------------------------------------
1 | #' Intensity normalisation
2 | #'
3 | #' `r lifecycle::badge('deprecated')`
4 | #' This function was deprecated due to its name changing to `normalise()`.
5 | #' The new function selects the normalisation method through its `method` argument.
6 | #' @param ... arguments forwarded unchanged to `normalise()`.
7 | #' @return A data frame with a column called \code{normalised_intensity_log2} containing the
8 | #' normalised intensity values.
9 | #' @keywords internal
10 | #' @export
11 | median_normalisation <- function(...) {
12 | # Signal the deprecation (since version 0.2.0) first, then delegate all arguments to normalise().
13 | lifecycle::deprecate_warn("0.2.0",
14 | "median_normalisation()",
15 | "normalise()",
16 | details = "This function has been renamed."
17 | )
18 |
19 | normalise(...)
20 | }
21 | #' Intensity normalisation
22 | #'
23 | #' Performs normalisation on intensities. For median normalisation the normalised intensity is the
24 | #' original intensity minus the run median plus the global median. This is also the way it is
25 | #' implemented in the Spectronaut search engine.
26 | #'
27 | #' @param data a data frame containing at least sample names and intensity values. Please note that if the
28 | #' data frame is grouped, the normalisation will be computed by group.
29 | #' @param sample a character column in the \code{data} data frame that contains the sample names.
30 | #' @param intensity_log2 a numeric column in the \code{data} data frame that contains the log2 transformed
31 | #' intensity values to be normalised.
32 | #' @param method a character value specifying the method to be used for normalisation. Default
33 | #' is "median", which is currently the only available method; any other value raises an error.
34 | #'
35 | #' @return The distinct rows of \code{data} with an additional column called
36 | #' \code{normalised_intensity_log2} containing the normalised intensity values.
37 | #' @import dplyr
38 | #' @importFrom magrittr %>%
39 | #' @importFrom rlang .data
40 | #' @importFrom stats median
41 | #' @export
42 | #'
43 | #' @examples
44 | #' data <- data.frame(
45 | #'   r_file_name = c("s1", "s2", "s3", "s1", "s2", "s3"),
46 | #'   intensity_log2 = c(18, 19, 17, 20, 21, 19)
47 | #' )
48 | #'
49 | #' normalise(data,
50 | #'   sample = r_file_name,
51 | #'   intensity_log2 = intensity_log2,
52 | #'   method = "median"
53 | #' )
54 | normalise <-
55 |   function(data,
56 |            sample,
57 |            intensity_log2,
58 |            method = "median") {
59 |     # Ensure method is valid
60 |     if (!(method %in% c("median"))) {
61 |       stop("Invalid method. Available methods: median")
62 |     }
63 |
64 |     if (method == "median") {
65 |       median_normalised <- data %>%
66 |         dplyr::distinct() %>% # exact duplicate rows are removed before any median is computed
67 |         dplyr::mutate(global_median = stats::median({{ intensity_log2 }}, na.rm = TRUE)) %>%
68 |         dplyr::group_by({{ sample }}, .add = TRUE) %>% # .add = TRUE keeps groups already set on data
69 |         dplyr::mutate(run_median = stats::median({{ intensity_log2 }}, na.rm = TRUE)) %>%
70 |         dplyr::ungroup({{ sample }}) %>% # only the sample grouping is removed again
71 |         dplyr::mutate(normalised_intensity_log2 = {{ intensity_log2 }} - .data$run_median + .data$global_median) %>%
72 |         dplyr::select(-c("run_median", "global_median")) # names as strings: `.data$` is deprecated in select()
73 |
74 |       return(median_normalised)
75 |     }
76 |   }
77 |
--------------------------------------------------------------------------------
/R/fetch_uniprot_proteome.R:
--------------------------------------------------------------------------------
1 | #' Fetch proteome data from UniProt
2 | #'
3 | #' Fetches proteome data from UniProt for the provided organism ID.
4 | #'
5 | #' @param organism_id a numeric value that specifies the NCBI taxonomy identifier (TaxId) for an
6 | #' organism.
7 | #' @param columns a character vector of metadata columns that should be imported from UniProt (all
8 | #' possible columns can be found \href{https://www.uniprot.org/help/return_fields}{here}. For
9 | #' cross-referenced database provide the database name with the prefix "xref_", e.g. \code{"xref_pdb"}).
10 | #' Note: Not more than one or two columns should be selected otherwise the function will not be
11 | #' able to efficiently retrieve the information. If more information is needed, \code{fetch_uniprot()}
12 | #' can be used with the IDs retrieved by this function. More than four columns trigger a warning.
13 | #' @param reviewed a logical value that determines if only reviewed protein entries will be retrieved.
14 | #' @param timeout a numeric value specifying the time in seconds until the download times out.
15 | #' The default is 120 seconds.
16 | #' @param max_tries a numeric value that specifies the number of times the function tries to download
17 | #' the data in case an error occurs. The default is 5.
18 | #'
19 | #' @return A data frame that contains all protein metadata specified in \code{columns} for the
20 | #' organism of choice. Invisibly returns \code{NULL} without internet, or the non-data-frame query result on failure.
21 | #' @importFrom janitor make_clean_names
22 | #' @export
23 | #'
24 | #' @examples
25 | #' \donttest{
26 | #' head(fetch_uniprot_proteome(9606))
27 | #' }
28 | fetch_uniprot_proteome <-
29 |   function(organism_id,
30 |            columns = c("accession"),
31 |            reviewed = TRUE,
32 |            timeout = 120,
33 |            max_tries = 5) {
34 |     if (!curl::has_internet()) {
35 |       message("No internet connection.")
36 |       return(invisible(NULL))
37 |     }
38 |
39 |     if (length(organism_id) == 0) {
40 |       stop("No valid organism ID found.")
41 |     }
42 |     if (length(columns) > 4) {
43 |       warning(strwrap("We suggest to use the fetch_uniprot function to fetch more than four columns.",
44 |         prefix = "\n", initial = ""
45 |       ))
46 |     }
47 |     url <- "https://rest.uniprot.org/uniprotkb/stream?query=" # HTTPS instead of plain HTTP
48 |     column_names <- janitor::make_clean_names(columns) # later assigned as the result's column names
49 |     collapsed_columns <- paste(columns, collapse = ",")
50 |     reviewed <- paste0("reviewed:", ifelse(reviewed == TRUE, "true", "false"))
51 |     organism_id <- paste0("organism_id:", organism_id)
52 |     query_url <-
53 |       utils::URLencode(paste0(
54 |         url,
55 |         reviewed,
56 |         "+AND+",
57 |         organism_id,
58 |         "&format=tsv&fields=",
59 |         collapsed_columns
60 |       ))
61 |     result <- try_query(query_url, timeout = timeout, max_tries = max_tries, silent = FALSE, progress = FALSE, show_col_types = FALSE)
62 |     # result can either be a data.frame or it is a character string with the error message
63 |     if (!methods::is(result, "data.frame")) {
64 |       if (isTRUE(any(stringr::str_detect(result, pattern = "Timeout")))) { # safe for NULL, NA or length > 1
65 |         message('The data retrieval timed out. Consider increasing the "timeout" or "max_tries" argument. \n')
66 |       }
67 |       return(invisible(result))
68 |     }
69 |     colnames(result) <- column_names
70 |     result
71 |   }
72 |
--------------------------------------------------------------------------------
/R/calculate_sequence_coverage.R:
--------------------------------------------------------------------------------
1 | #' Protein sequence coverage
2 | #'
3 | #' `r lifecycle::badge('deprecated')`
4 | #' This function was deprecated due to its name changing to `calculate_sequence_coverage()`.
5 | #' @param ... arguments forwarded unchanged to `calculate_sequence_coverage()`.
6 | #' @return A new column in the \code{data} data frame containing the calculated sequence coverage
7 | #' for each identified protein
8 | #' @keywords internal
9 | #' @export
10 | sequence_coverage <- function(...) {
11 | # Signal the deprecation (since version 0.2.0) first, then delegate all arguments to the renamed function.
12 | lifecycle::deprecate_warn("0.2.0",
13 | "sequence_coverage()",
14 | "calculate_sequence_coverage()",
15 | details = "This function has been renamed."
16 | )
17 | calculate_sequence_coverage(...)
18 | }
19 | #' Protein sequence coverage
20 | #'
21 | #' Calculate sequence coverage for each identified protein.
22 | #'
23 | #' @param data a data frame containing at least the protein sequence and the identified peptides
24 | #' as columns.
25 | #' @param protein_sequence a character column in the \code{data} data frame that contains protein
26 | #' sequences. Can be obtained by using the function \code{fetch_uniprot()}
27 | #' @param peptides a character column in the \code{data} data frame that contains the identified
28 | #' peptides.
29 | #'
30 | #' @return The \code{data} data frame with an additional column called \code{coverage} containing the
31 | #' calculated sequence coverage in percent for each identified protein.
32 | #' @import dplyr
33 | #' @importFrom magrittr %>%
34 | #' @importFrom stringr str_count
35 | #' @importFrom rlang .data as_name enquo
36 | #' @importFrom tidyr drop_na
37 | #' @export
38 | #'
39 | #' @examples
40 | #' data <- data.frame(
41 | #'   protein_sequence = c("abcdefghijklmnop", "abcdefghijklmnop"),
42 | #'   pep_stripped_sequence = c("abc", "jklmn")
43 | #' )
44 | #'
45 | #' calculate_sequence_coverage(
46 | #'   data,
47 | #'   protein_sequence = protein_sequence,
48 | #'   peptides = pep_stripped_sequence
49 | #' )
50 | calculate_sequence_coverage <-
51 |   function(data, protein_sequence, peptides) {
52 |     groups <- dplyr::group_vars(data) # existing grouping columns are reused as join keys below
53 |
54 |     result <- data %>%
55 |       # drop_na prevents function from failing if a protein group contains only NA peptide sequences.
56 |       tidyr::drop_na({{ peptides }}) %>%
57 |       dplyr::distinct({{ protein_sequence }}, {{ peptides }}) %>%
58 |       dplyr::group_by({{ protein_sequence }}, .add = TRUE) %>%
59 |       find_peptide({{ protein_sequence }}, {{ peptides }}) %>%
60 |       dplyr::mutate(sequence_length = nchar({{ protein_sequence }})) %>%
61 |       dplyr::mutate(modified_sequence = replace_identified_by_x({{ protein_sequence }}, .data$start, .data$end)) %>%
62 |       dplyr::mutate(covered = stringr::str_count(.data$modified_sequence, "x")) %>% # count of "x" characters
63 |       dplyr::mutate(coverage = .data$covered / .data$sequence_length * 100) %>%
64 |       dplyr::select(-c( # names as strings: `.data$` is deprecated in select()
65 |         "sequence_length",
66 |         "modified_sequence",
67 |         "covered",
68 |         "start",
69 |         "end",
70 |         "aa_before",
71 |         "last_aa",
72 |         "aa_after",
73 |         {{ peptides }}
74 |       )) %>%
75 |       dplyr::distinct() %>%
76 |       dplyr::ungroup()
77 |
78 |     data %>%
79 |       dplyr::left_join(result, by = c(rlang::as_name(rlang::enquo(protein_sequence)), groups))
80 |   }
81 |
--------------------------------------------------------------------------------
/R/pval_distribution_plot.R:
--------------------------------------------------------------------------------
1 | #' Plot histogram of p-value distribution
2 | #'
3 | #' `r lifecycle::badge('deprecated')`
4 | #' This function was deprecated due to its name changing to `pval_distribution_plot()`.
5 | #' @param ... arguments forwarded unchanged to `pval_distribution_plot()`.
6 | #' @return A histogram plot that shows the p-value distribution.
7 | #' @keywords internal
8 | #' @export
9 | plot_pval_distribution <- function(...) {
10 | # Signal the deprecation (since version 0.2.0) first, then delegate all arguments to the renamed function.
11 | lifecycle::deprecate_warn("0.2.0",
12 | "plot_pval_distribution()",
13 | "pval_distribution_plot()",
14 | details = "This function has been renamed."
15 | )
16 |
17 | pval_distribution_plot(...)
18 | }
19 | #' Plot histogram of p-value distribution
20 | #'
21 | #' Plots the distribution of p-values derived from any statistical test as a histogram.
22 | #'
23 | #' @param data a data frame that contains at least grouping identifiers (precursor, peptide or
24 | #' protein) and p-values derived from any statistical test.
25 | #' @param grouping a character column in the \code{data} data frame that contains either precursor,
26 | #' peptide or protein identifiers. For each entry in this column there should be one unique p-value.
27 | #' That means the statistical test that created the p-value should have been performed on the
28 | #' level of the content of this column.
29 | #' @param pval a numeric column in the \code{data} data frame that contains p-values.
30 | #' @param facet_by optional, a character column that contains information by which the data should
31 | #' be faceted into multiple plots.
32 | #'
33 | #' @return A histogram plot (bin width 0.05) that shows the p-value distribution.
34 | #' @import ggplot2
35 | #' @importFrom magrittr %>%
36 | #' @importFrom dplyr distinct
37 | #' @importFrom tidyr drop_na
38 | #' @export
39 | #'
40 | #' @examples
41 | #' set.seed(123) # Makes example reproducible
42 | #'
43 | #' # Create example data
44 | #' data <- data.frame(
45 | #'   peptide = paste0("peptide", 1:1000),
46 | #'   pval = runif(n = 1000)
47 | #' )
48 | #'
49 | #' # Plot p-values
50 | #' pval_distribution_plot(
51 | #'   data = data,
52 | #'   grouping = peptide,
53 | #'   pval = pval
54 | #' )
55 | pval_distribution_plot <- function(data, grouping, pval, facet_by = NULL) {
56 |   input <- data %>% # one row per distinct grouping / p-value / facet combination, rows with NA dropped
57 |     dplyr::distinct({{ grouping }}, {{ pval }}, {{ facet_by }}) %>%
58 |     tidyr::drop_na()
59 |
60 |   plot <- input %>%
61 |     ggplot2::ggplot(ggplot2::aes(x = {{ pval }})) +
62 |     ggplot2::geom_histogram(
63 |       binwidth = 0.05,
64 |       boundary = 0,
65 |       color = "black",
66 |       fill = "#5680C1",
67 |       size = 1 # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` here - confirm minimum version before switching
68 |     ) +
69 |     ggplot2::labs(title = "P-Value Distribution", x = "P-Value", y = "Frequency") +
70 |     { # facet_wrap() is only added when facet_by was supplied; otherwise the braces yield NULL
71 |       if (!missing(facet_by)) {
72 |         ggplot2::facet_wrap(rlang::new_formula(NULL, rlang::enquo(facet_by)),
73 |           scales = "fixed"
74 |         )
75 |       }
76 |     } +
77 |     ggplot2::theme_bw() +
78 |     ggplot2::theme(
79 |       plot.title = ggplot2::element_text(size = 20),
80 |       axis.title.x = ggplot2::element_text(size = 15),
81 |       axis.text.y = ggplot2::element_text(size = 15),
82 |       axis.text.x = ggplot2::element_text(size = 15),
83 |       axis.title.y = ggplot2::element_text(size = 15),
84 |       strip.text = ggplot2::element_text(size = 15),
85 |       strip.background = ggplot2::element_blank()
86 |     )
87 |   plot
88 | }
89 |
--------------------------------------------------------------------------------
/man/qc_peptide_type.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/qc_peptide_type.R
3 | \name{qc_peptide_type}
4 | \alias{qc_peptide_type}
5 | \title{Check peptide type percentage share}
6 | \usage{
7 | qc_peptide_type(
8 | data,
9 | sample,
10 | peptide,
11 | pep_type,
12 | intensity,
13 | remove_na_intensities = TRUE,
14 | method = "count",
15 | plot = FALSE,
16 | interactive = FALSE
17 | )
18 | }
19 | \arguments{
20 | \item{data}{a data frame that contains at least the input columns.}
21 |
22 | \item{sample}{a character or factor column in the \code{data} data frame that contains the sample names.}
23 |
24 | \item{peptide}{a character column in the \code{data} data frame that contains the peptide
25 | sequence.}
26 |
27 | \item{pep_type}{a character column in the \code{data} data frame that contains the peptide
28 | type. Can be obtained using the \code{find_peptide} and \code{assign_peptide_type} function
29 | together.}
30 |
31 | \item{intensity}{a numeric column in the \code{data} data frame that contains the corresponding
32 | raw or normalised intensity values (not log2) for each peptide or precursor. Required when
33 | "intensity" is chosen as the method.}
34 |
35 | \item{remove_na_intensities}{a logical value that specifies if sample/peptide combinations with
36 | intensities that are NA (not quantified IDs) should be dropped from the data frame for analysis
37 | of peptide type distributions. Default is TRUE since we are usually interested in the peptide
38 | type distribution of quantifiable IDs. This is only relevant for method = "count".}
39 |
40 | \item{method}{a character value that indicates the method used for evaluation.
41 | \code{method = "intensity"} calculates the peptide type percentage by intensity, whereas
42 | \code{method = "count"} calculates the percentage by peptide ID count. Default is
43 | \code{method = "count"}.}
44 |
45 | \item{plot}{a logical value that indicates whether the result should be plotted.}
46 |
47 | \item{interactive}{a logical value that indicates whether the plot should be interactive.}
48 | }
49 | \value{
50 | A data frame that contains the calculated percentage shares of each peptide type per
51 | sample. The \code{count} column contains the number of peptides with a specific type. The
52 | \code{peptide_type_percent} column contains the percentage share of a specific peptide type.
53 | }
54 | \description{
55 | Calculates the percentage share of each peptide type (fully-tryptic, semi-tryptic,
56 | non-tryptic) for each sample.
57 | }
58 | \examples{
59 | # Load libraries
60 | library(dplyr)
61 |
62 | set.seed(123) # Makes example reproducible
63 |
64 | # Create example data
65 | data <- create_synthetic_data(
66 | n_proteins = 100,
67 | frac_change = 0.05,
68 | n_replicates = 3,
69 | n_conditions = 2,
70 | method = "effect_random"
71 | ) \%>\%
72 | mutate(intensity_non_log2 = 2^peptide_intensity_missing)
73 |
74 | # Determine peptide type percentages
75 | qc_peptide_type(
76 | data = data,
77 | sample = sample,
78 | peptide = peptide,
79 | pep_type = pep_type,
80 | intensity = intensity_non_log2,
81 | method = "intensity",
82 | plot = FALSE
83 | )
84 |
85 | # Plot peptide type
86 | qc_peptide_type(
87 | data = data,
88 | sample = sample,
89 | peptide = peptide,
90 | pep_type = pep_type,
91 | intensity = intensity_non_log2,
92 | method = "intensity",
93 | plot = TRUE
94 | )
95 | }
96 |
--------------------------------------------------------------------------------
/man/qc_charge_states.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/qc_charge_states.R
3 | \name{qc_charge_states}
4 | \alias{qc_charge_states}
5 | \title{Check charge state distribution}
6 | \usage{
7 | qc_charge_states(
8 | data,
9 | sample,
10 | grouping,
11 | charge_states,
12 | intensity = NULL,
13 | remove_na_intensities = TRUE,
14 | method = "count",
15 | plot = FALSE,
16 | interactive = FALSE
17 | )
18 | }
19 | \arguments{
20 | \item{data}{a data frame that contains at least sample names, peptide or precursor identifiers
21 | and charge states for each peptide or precursor.}
22 |
23 | \item{sample}{a character or factor column in the \code{data} data frame that contains the sample name.}
24 |
25 | \item{grouping}{a character column in the \code{data} data frame that contains either precursor or
26 | peptide identifiers.}
27 |
28 | \item{charge_states}{a character or numeric column in the \code{data} data frame that contains
29 | the different charge states assigned to the precursor or peptide.}
30 |
31 | \item{intensity}{a numeric column in the \code{data} data frame that contains the corresponding
32 | raw or normalised intensity values (not log2) for each peptide or precursor. Required when
33 | "intensity" is chosen as the method.}
34 |
35 | \item{remove_na_intensities}{a logical value that specifies if sample/grouping combinations with
36 | intensities that are NA (not quantified IDs) should be dropped from the data frame for analysis
37 | of charge states. Default is TRUE since we are usually interested in quantifiable peptides.
38 | This is only relevant for method = "count".}
39 |
40 | \item{method}{a character value that indicates the method used for evaluation. "count"
41 | calculates the charge state distribution based on counts of the corresponding peptides or
42 | precursors in the charge state group, "intensity" calculates the percentage of precursors or
43 | peptides in each charge state group based on the corresponding intensity values.}
44 |
45 | \item{plot}{a logical value that indicates whether the result should be plotted.}
46 |
47 | \item{interactive}{a logical value that specifies whether the plot should be interactive
48 | (default is FALSE).}
49 | }
50 | \value{
51 | A data frame that contains the calculated percentage made up by the sum of either
52 | all counts or intensities of peptides or precursors of the corresponding charge state
53 | (depending on which method is chosen).
54 | }
55 | \description{
56 | Calculates the charge state distribution for each sample (by count or intensity).
57 | }
58 | \examples{
59 | # Load libraries
60 | library(dplyr)
61 |
62 | set.seed(123) # Makes example reproducible
63 |
64 | # Create example data
65 | data <- create_synthetic_data(
66 | n_proteins = 100,
67 | frac_change = 0.05,
68 | n_replicates = 3,
69 | n_conditions = 2,
70 | method = "effect_random"
71 | ) \%>\%
72 | mutate(intensity_non_log2 = 2^peptide_intensity_missing)
73 |
74 | # Calculate charge percentages
75 | qc_charge_states(
76 | data = data,
77 | sample = sample,
78 | grouping = peptide,
79 | charge_states = charge,
80 | intensity = intensity_non_log2,
81 | method = "intensity",
82 | plot = FALSE
83 | )
84 |
85 | # Plot charge states
86 | qc_charge_states(
87 | data = data,
88 | sample = sample,
89 | grouping = peptide,
90 | charge_states = charge,
91 | intensity = intensity_non_log2,
92 | method = "intensity",
93 | plot = TRUE
94 | )
95 | }
96 |
--------------------------------------------------------------------------------
/man/predict_alphafold_domain.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/predict_alphafold_domain.R
3 | \name{predict_alphafold_domain}
4 | \alias{predict_alphafold_domain}
5 | \title{Predict protein domains of AlphaFold predictions}
6 | \usage{
7 | predict_alphafold_domain(
8 | pae_list,
9 | pae_power = 1,
10 | pae_cutoff = 5,
11 | graph_resolution = 1,
12 | return_data_frame = FALSE,
13 | show_progress = TRUE
14 | )
15 | }
16 | \arguments{
17 | \item{pae_list}{a list of proteins that contains aligned errors for their AlphaFold predictions.
18 | This list can be retrieved with the \code{fetch_alphafold_aligned_error()} function. It should contain a
19 | column containing the scored residue (\code{scored_residue}), the aligned residue (\code{aligned_residue}) and
20 | the predicted aligned error (\code{error}).}
21 |
22 | \item{pae_power}{a numeric value, each edge in the graph will be weighted proportional to (\code{1 / pae^pae_power}).
23 | Default is \code{1}.}
24 |
25 | \item{pae_cutoff}{a numeric value, graph edges will only be created for residue pairs with \code{pae < pae_cutoff}.
26 | Default is \code{5}.}
27 |
28 | \item{graph_resolution}{a numeric value that regulates how aggressive the clustering algorithm is. Smaller values
29 | lead to larger clusters. Value should be larger than zero, and values larger than 5 are unlikely to be useful.
30 | Higher values lead to stricter (i.e. smaller) clusters. The value is provided to the Leiden clustering algorithm
31 | of the \code{igraph} package as \code{graph_resolution / 100}. Default is \code{1}.}
32 |
33 | \item{return_data_frame}{a logical value; if \code{TRUE} a data frame instead of a list
34 | is returned. It is recommended to only use this if information for few proteins is retrieved.
35 | Default is \code{FALSE}.}
36 |
37 | \item{show_progress}{a logical value that specifies if a progress bar will be shown. Default
38 | is \code{TRUE}.}
39 | }
40 | \value{
41 | A list of the provided proteins that contains domain assignments for each residue. If \code{return_data_frame} is
42 | \code{TRUE}, a data frame with this information is returned instead. The data frame contains the
43 | following columns:
44 | \itemize{
45 | \item residue: The protein residue number.
46 | \item domain: A numeric value representing a distinct predicted domain in the protein.
47 | \item accession: The UniProt protein identifier.
48 | }
49 | }
50 | \description{
51 | Uses the predicted aligned error (PAE) of AlphaFold predictions to find possible protein domains.
52 | A graph-based community clustering algorithm (Leiden clustering) is used on the predicted error
53 | (distance) between residues of a protein in order to infer pseudo-rigid groups in the protein. This is
54 | for example useful in order to know which parts of protein predictions are likely in a fixed relative
55 | position towards each other and which might have varying distances.
56 | This function is based on python code written by Tristan Croll. The original code can be found on his
57 | \href{https://github.com/tristanic/pae_to_domains}{GitHub page}.
58 | }
59 | \examples{
60 | \donttest{
61 | # Fetch aligned errors
62 | aligned_error <- fetch_alphafold_aligned_error(
63 | uniprot_ids = c("F4HVG8", "O15552"),
64 | error_cutoff = 4
65 | )
66 |
67 | # Predict protein domains
68 | af_domains <- predict_alphafold_domain(
69 | pae_list = aligned_error,
70 | return_data_frame = TRUE
71 | )
72 |
73 | head(af_domains, n = 10)
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/R/assign_peptide_type.R:
--------------------------------------------------------------------------------
1 | #' Assign peptide type
2 | #'
3 | #' `r lifecycle::badge('deprecated')`
4 | #' This function was deprecated due to its name changing to `assign_peptide_type()`.
5 | #'
6 | #' @return A data frame that contains the input data and an additional column with the peptide
7 | #' type information.
8 | #' @keywords internal
9 | #' @export
10 | peptide_type <- function(...) {
11 |   # Deprecated alias: warn about the rename, then delegate to the new function.
12 |   lifecycle::deprecate_warn(
13 |     when = "0.2.0",
14 |     what = "peptide_type()",
15 |     with = "assign_peptide_type()",
16 |     details = "This function has been renamed."
17 |   )
18 |   assign_peptide_type(...)
19 | }
20 | #' Assign peptide type
21 | #'
22 | #' Based on preceding and C-terminal amino acid, the peptide type of a given peptide is assigned.
23 | #' Peptides with preceding and C-terminal lysine or arginine are considered fully-tryptic. If a
24 | #' peptide is located at the N- or C-terminus of a protein and fulfills the criterion to be
25 | #' fully-tryptic otherwise, it is also considered as fully-tryptic. Peptides that only fulfill the
26 | #' criterion on one terminus are semi-tryptic peptides. Lastly, peptides that do not fulfill
27 | #' the criterion on either terminus are non-tryptic peptides.
28 | #'
29 | #' @param data a data frame containing at least information about the preceding and C-terminal
30 | #' amino acids of peptides.
31 | #' @param aa_before a character column in the \code{data} data frame that contains the preceding amino
32 | #' acid as one letter code.
33 | #' @param last_aa a character column in the \code{data} data frame that contains the C-terminal amino
34 | #' acid as one letter code.
35 | #' @param aa_after a character column in the \code{data} data frame that contains the following amino
36 | #' acid as one letter code.
37 | #'
38 | #' @return A data frame that contains the input data and an additional column with the peptide
39 | #' type information.
40 | #' @import dplyr
41 | #' @importFrom magrittr %>%
42 | #' @importFrom rlang .data
43 | #' @export
44 | #'
45 | #' @examples
46 | #' data <- data.frame(
47 | #' aa_before = c("K", "S", "T"),
48 | #' last_aa = c("R", "K", "Y"),
49 | #' aa_after = c("T", "R", "T")
50 | #' )
51 | #'
52 | #' assign_peptide_type(data, aa_before, last_aa, aa_after)
53 | assign_peptide_type <- function(data,
54 |                                 aa_before = aa_before,
55 |                                 last_aa = last_aa,
56 |                                 aa_after = aa_after) {
57 |   # Classify each unique amino acid combination once, then join it back to all rows.
58 |   data %>%
59 |     dplyr::distinct({{ aa_before }}, {{ last_aa }}, {{ aa_after }}) %>%
60 |     dplyr::mutate(
61 |       # Tryptic N-terminus: preceded by K/R, or nothing precedes the peptide ("").
62 |       N_term_tryp = {{ aa_before }} == "" |
63 |         {{ aa_before }} == "K" |
64 |         {{ aa_before }} == "R",
65 |       # Tryptic C-terminus: ends with K/R, or nothing follows the peptide ("").
66 |       C_term_tryp = {{ last_aa }} == "K" |
67 |         {{ last_aa }} == "R" |
68 |         {{ aa_after }} == ""
69 |     ) %>%
70 |     dplyr::mutate(pep_type = dplyr::case_when(
71 |       .data$N_term_tryp + .data$C_term_tryp == 2 ~ "fully-tryptic",
72 |       .data$N_term_tryp + .data$C_term_tryp == 1 ~ "semi-tryptic",
73 |       .data$N_term_tryp + .data$C_term_tryp == 0 ~ "non-tryptic"
74 |     )) %>%
75 |     # Select by name: the `.data` pronoun is deprecated in tidyselect contexts.
76 |     dplyr::select(-c("N_term_tryp", "C_term_tryp")) %>%
77 |     dplyr::right_join(data, by = c(
78 |       rlang::as_name(rlang::enquo(aa_before)),
79 |       rlang::as_name(rlang::enquo(last_aa)),
80 |       rlang::as_name(rlang::enquo(aa_after))
81 |     ))
82 | }
83 |
--------------------------------------------------------------------------------
/man/diff_abundance.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/calculate_diff_abundance.R
3 | \name{diff_abundance}
4 | \alias{diff_abundance}
5 | \title{Calculate differential abundance between conditions}
6 | \usage{
7 | diff_abundance(...)
8 | }
9 | \value{
10 | A data frame that contains differential abundances (\code{diff}), p-values (\code{pval})
11 | and adjusted p-values (\code{adj_pval}) for each protein, peptide or precursor (depending on
12 | the \code{grouping} variable) and the associated treatment/reference pair. Depending on the
13 | method the data frame contains additional columns:
14 | \itemize{
15 | \item "t-test": The \code{std_error} column contains the standard error of the differential
16 | abundances. \code{n_obs} contains the number of observations for the specific protein, peptide
17 | or precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair.
18 | \item "t-test_mean_sd": Columns labeled as control refer to the second condition of the
19 | comparison pairs. Treated refers to the first condition. \code{mean_control} and \code{mean_treated}
20 | columns contain the means for the reference and treatment condition, respectively. \code{sd_control}
21 | and \code{sd_treated} columns contain the standard deviations for the reference and treatment
22 | condition, respectively. \code{n_control} and \code{n_treated} columns contain the numbers of
23 | samples for the reference and treatment condition, respectively. The \code{std_error} column
24 | contains the standard error of the differential abundances. \code{t_statistic} contains the
25 | t_statistic for the t-test.
26 | \item "moderated_t-test": \code{CI_2.5} and \code{CI_97.5} contain the 2.5\% and 97.5\%
27 | confidence interval borders for differential abundances. \code{avg_abundance} contains average
28 | abundances for treatment/reference pairs (mean of the two group means). \code{t_statistic}
29 | contains the t_statistic for the t-test. \code{B} The B-statistic is the log-odds that the
30 | protein, peptide or precursor (depending on \code{grouping}) has a differential abundance
31 | between the two groups. Suppose B=1.5. The odds of differential abundance is exp(1.5)=4.48, i.e,
32 | about four and a half to one. The probability that there is a differential abundance is
33 | 4.48/(1+4.48)=0.82, i.e., the probability is about 82\% that this group is differentially
34 | abundant. A B-statistic of zero corresponds to a 50-50 chance that the group is differentially
35 | abundant. \code{n_obs} contains the number of observations for the specific protein, peptide or
36 | precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair.
37 | \item "proDA": The \code{std_error} column contains the standard error of the differential
38 | abundances. \code{avg_abundance} contains average abundances for treatment/reference pairs
39 | (mean of the two group means). \code{t_statistic} contains the t_statistic for the t-test.
40 | \code{n_obs} contains the number of observations for the specific protein, peptide or precursor
41 | (depending on the \code{grouping} variable) and the associated treatment/reference pair.
42 | }
43 | }
44 | \description{
45 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}
46 | This function was deprecated due to its name changing to \code{calculate_diff_abundance()}.
47 | }
48 | \keyword{internal}
49 |
--------------------------------------------------------------------------------
/man/fetch_quickgo.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fetch_quickgo.R
3 | \name{fetch_quickgo}
4 | \alias{fetch_quickgo}
5 | \title{Fetch information from the QuickGO API}
6 | \usage{
7 | fetch_quickgo(
8 | type = "annotations",
9 | id_annotations = NULL,
10 | taxon_id_annotations = NULL,
11 | ontology_annotations = "all",
12 | go_id_slims = NULL,
13 | relations_slims = c("is_a", "part_of", "regulates", "occurs_in"),
14 | timeout = 1200,
15 | max_tries = 2,
16 | show_progress = TRUE
17 | )
18 | }
19 | \arguments{
20 | \item{type}{a character value that indicates if gene ontology terms, annotations or slims
21 | should be retrieved. The possible values therefore include "annotations", "terms" and "slims".
22 | If annotations are retrieved, the maximum number of results is 2,000,000.}
23 |
24 | \item{id_annotations}{an optional character vector that specifies UniProt IDs for which GO annotations
25 | should be retrieved. This argument should only be provided if annotations are retrieved.}
26 |
27 | \item{taxon_id_annotations}{an optional character value that specifies the NCBI taxonomy identifier (TaxId)
28 | for an organism for which GO annotations should be retrieved.
29 | This argument should only be provided if annotations are retrieved.}
30 |
31 | \item{ontology_annotations}{an optional character value that specifies the ontology that should be retrieved.
32 | This can either have the values "all", "molecular_function", "biological_process" or
33 | "cellular_component". This argument should only be provided if annotations are retrieved.}
34 |
35 | \item{go_id_slims}{an optional character vector that specifies gene ontology IDs (e.g. GO:0046872) for which
36 | a slim go set should be generated. This argument should only be provided if slims are retrieved.}
37 |
38 | \item{relations_slims}{an optional character vector that specifies the relations of GO IDs that should be
39 | considered for the generation of the slim dataset. This argument should only be provided if slims are retrieved.}
40 |
41 | \item{timeout}{a numeric value specifying the time in seconds until the download times out.
42 | The default is 1200 seconds.}
43 |
44 | \item{max_tries}{a numeric value that specifies the number of times the function tries to download
45 | the data in case an error occurs. The default is 2.}
46 |
47 | \item{show_progress}{a logical value that indicates if a progress bar will be shown.
48 | Default is TRUE.}
49 | }
50 | \value{
51 | A data frame that contains descriptive information about gene ontology annotations, terms or slims
52 | depending on what the input "type" was.
53 | }
54 | \description{
55 | Fetches gene ontology (GO) annotations, terms or slims from the QuickGO EBI database.
56 | Annotations can be retrieved for specific UniProt IDs or NCBI taxonomy identifiers. When
57 | terms are retrieved, a complete list of all GO terms is returned. For the generation of
58 | a slim dataset you can provide GO IDs that should be considered. A slim dataset is a subset
59 | GO dataset that considers all child terms of the supplied IDs.
60 | }
61 | \examples{
62 | \donttest{
63 | # Annotations
64 | annotations <- fetch_quickgo(
65 | type = "annotations",
66 | id_annotations = c("P63328", "Q4FFP4"),
67 | ontology_annotations = "molecular_function"
68 | )
69 |
70 | head(annotations)
71 |
72 | # Terms
73 | terms <- fetch_quickgo(type = "terms")
74 |
75 | head(terms)
76 |
77 | # Slims
78 | slims <- fetch_quickgo(
79 | type = "slims",
80 | go_id_slims = c("GO:0046872", "GO:0051540")
81 | )
82 |
83 | head(slims)
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/man/fetch_pdb.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fetch_pdb.R
3 | \name{fetch_pdb}
4 | \alias{fetch_pdb}
5 | \title{Fetch structure information from RCSB}
6 | \usage{
7 | fetch_pdb(pdb_ids, batchsize = 100, show_progress = TRUE)
8 | }
9 | \arguments{
10 | \item{pdb_ids}{a character vector of PDB identifiers.}
11 |
12 | \item{batchsize}{a numeric value that specifies the number of structures to be processed in a
13 | single query. Default is 100.}
14 |
15 | \item{show_progress}{a logical value that indicates if a progress bar will be shown. Default is
16 | TRUE.}
17 | }
18 | \value{
19 | A data frame that contains structure metadata for the PDB IDs provided. The data frame
20 | contains some columns that might not be self explanatory.
21 | \itemize{
22 | \item auth_asym_id: Chain identifier provided by the author of the structure in order to
23 | match the identification used in the publication that describes the structure.
24 | \item label_asym_id: Chain identifier following the standardised convention for mmCIF files.
25 | \item entity_beg_seq_id, ref_beg_seq_id, length, pdb_sequence: \code{entity_beg_seq_id} is a
26 | position in the structure sequence (\code{pdb_sequence}) that matches the position given in
27 | \code{ref_beg_seq_id}, which is a position within the protein sequence (not included in the
28 | data frame). \code{length} identifies the stretch of sequence for which positions match
29 | accordingly between structure and protein sequence. \code{entity_beg_seq_id} is a residue ID
30 | based on the standardised convention for mmCIF files.
31 | \item auth_seq_id: Residue identifier provided by the author of the structure in order to
32 | match the identification used in the publication that describes the structure. This character
33 | vector has the same length as the \code{pdb_sequence} and each position is the identifier for
34 | the matching amino acid position in \code{pdb_sequence}. The contained values are not
35 | necessarily numbers and the values do not have to be positive.
36 | \item modified_monomer: Is composed of first the composition ID of the modification, followed
37 | by the \code{label_seq_id} position. In parenthesis are the parent monomer identifiers as
38 | they appear in the sequence.
39 | \item ligand_*: Any column starting with the \code{ligand_*} prefix contains information about
40 | the position, identity and donors for ligand binding sites. If there are multiple entities of
41 | ligands they are separated by "|". Specific donor level information is separated by ";".
42 | \item secondary_structure: Contains information about helix and sheet secondary structure elements.
43 | Individual regions are separated by ";".
44 | \item unmodeled_structure: Contains information about unmodeled or partially modeled regions in
45 | the model. Individual regions are separated by ";".
46 | \item auth_seq_id_original: In some cases the sequence positions do not match the number of residues
47 | in the sequence either because positions are missing or duplicated. This always coincides with modified
48 | residues, however does not always occur when there is a modified residue in the sequence. This column
49 | contains the original \code{auth_seq_id} information that does not have these positions corrected.
50 | }
51 | }
52 | \description{
53 | Fetches structure metadata from RCSB. If you want to retrieve atom data such as positions, use
54 | the function \code{fetch_pdb_structure()}.
55 | }
56 | \examples{
57 | \donttest{
58 | pdb <- fetch_pdb(pdb_ids = c("6HG1", "1E9I", "6D3Q", "4JHW"))
59 |
60 | head(pdb)
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/man/qc_missed_cleavages.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/qc_missed_cleavages.R
3 | \name{qc_missed_cleavages}
4 | \alias{qc_missed_cleavages}
5 | \title{Check missed cleavages}
6 | \usage{
7 | qc_missed_cleavages(
8 | data,
9 | sample,
10 | grouping,
11 | missed_cleavages,
12 | intensity,
13 | remove_na_intensities = TRUE,
14 | method = "count",
15 | plot = FALSE,
16 | interactive = FALSE
17 | )
18 | }
19 | \arguments{
20 | \item{data}{a data frame containing at least sample names, peptide or precursor identifiers
21 | and missed cleavage counts for each peptide or precursor.}
22 |
23 | \item{sample}{a character or factor column in the \code{data} data frame that contains the sample name.}
24 |
25 | \item{grouping}{a character column in the \code{data} data frame that contains either precursor or
26 | peptide identifiers.}
27 |
28 | \item{missed_cleavages}{a numeric column in the \code{data} data frame that contains the counts
29 | of missed cleavages per peptide or precursor.}
30 |
31 | \item{intensity}{a numeric column in the \code{data} data frame that contains the corresponding
32 | raw or normalised intensity values (not log2) for each peptide or precursor. Required when
33 | "intensity" is chosen as the method.}
34 |
35 | \item{remove_na_intensities}{a logical value that specifies if sample/grouping combinations with
36 | intensities that are NA (not quantified IDs) should be dropped from the data frame for analysis
37 | of missed cleavages. Default is TRUE since we are usually interested in quantifiable peptides.
38 | This is only relevant for method = "count".}
39 |
40 | \item{method}{a character value that indicates the method used for evaluation. "count"
41 | calculates the percentage of missed cleavages based on counts of the corresponding peptide or
42 | precursor, "intensity" calculates the percentage of missed cleavages by intensity of the
43 | corresponding peptide or precursor.}
44 |
45 | \item{plot}{a logical value that indicates whether the result should be plotted.}
46 |
47 | \item{interactive}{a logical value that specifies whether the plot should be interactive
48 | (default is FALSE).}
49 | }
50 | \value{
51 | A data frame that contains the calculated percentage made up by the sum of all peptides
52 | or precursors containing the corresponding amount of missed cleavages.
53 | }
54 | \description{
55 | Calculates the percentage of missed cleavages for each sample (by count or intensity). The
56 | default settings remove grouping variables without quantitative information (intensity is NA).
57 | These will not be used for the calculation of missed cleavage percentages.
58 | }
59 | \examples{
60 | library(dplyr)
61 |
62 | set.seed(123) # Makes example reproducible
63 |
64 | # Create example data
65 | data <- create_synthetic_data(
66 | n_proteins = 100,
67 | frac_change = 0.05,
68 | n_replicates = 3,
69 | n_conditions = 2,
70 | method = "effect_random"
71 | ) \%>\%
72 | mutate(intensity_non_log2 = 2^peptide_intensity_missing)
73 |
74 | # Calculate missed cleavage percentages
75 | qc_missed_cleavages(
76 | data = data,
77 | sample = sample,
78 | grouping = peptide,
79 | missed_cleavages = n_missed_cleavage,
80 | intensity = intensity_non_log2,
81 | method = "intensity",
82 | plot = FALSE
83 | )
84 |
85 | # Plot missed cleavages
86 | qc_missed_cleavages(
87 | data = data,
88 | sample = sample,
89 | grouping = peptide,
90 | missed_cleavages = n_missed_cleavage,
91 | intensity = intensity_non_log2,
92 | method = "intensity",
93 | plot = TRUE
94 | )
95 | }
96 |
--------------------------------------------------------------------------------
/R/qc_median_intensities.R:
--------------------------------------------------------------------------------
1 | #' Median run intensities
2 | #'
3 | #' Median intensities per run are returned either as a plot or a table.
4 | #'
5 | #' @param data a data frame that contains at least the input variables.
6 | #' @param sample a character or factor column in the \code{data} data frame that contains the sample name.
7 | #' @param grouping a character column in the \code{data} data frame that contains either precursor or
8 | #' peptide identifiers.
9 | #' @param intensity a numeric column in the \code{data} data frame that contains intensity values.
10 | #' The intensity should be ideally log2 transformed, but also non-transformed values can be used.
11 | #' @param plot a logical value that indicates whether the result should be plotted.
12 | #' @param interactive a logical value that specifies whether the plot should be interactive
13 | #' (default is FALSE).
14 | #'
15 | #' @return A plot that displays median intensity over all samples. If \code{plot = FALSE} a data
16 | #' frame containing median intensities is returned.
17 | #' @import dplyr
18 | #' @import ggplot2
19 | #' @importFrom plotly ggplotly
20 | #' @importFrom magrittr %>%
21 | #' @importFrom rlang .data
22 | #' @importFrom stringr str_sort
23 | #' @export
24 | #'
25 | #' @examples
26 | #' set.seed(123) # Makes example reproducible
27 | #'
28 | #' # Create example data
29 | #' data <- create_synthetic_data(
30 | #' n_proteins = 100,
31 | #' frac_change = 0.05,
32 | #' n_replicates = 3,
33 | #' n_conditions = 2,
34 | #' method = "effect_random"
35 | #' )
36 | #'
37 | #' # Calculate median intensities
38 | #' qc_median_intensities(
39 | #' data = data,
40 | #' sample = sample,
41 | #' grouping = peptide,
42 | #' intensity = peptide_intensity_missing,
43 | #' plot = FALSE
44 | #' )
45 | #'
46 | #' # Plot median intensities
47 | #' qc_median_intensities(
48 | #' data = data,
49 | #' sample = sample,
50 | #' grouping = peptide,
51 | #' intensity = peptide_intensity_missing,
52 | #' plot = TRUE
53 | #' )
54 | qc_median_intensities <- function(data,
55 |                                   sample,
56 |                                   grouping,
57 |                                   intensity,
58 |                                   plot = TRUE,
59 |                                   interactive = FALSE) {
60 |   # Keep one row per unique sample/grouping/intensity combination before taking the median.
61 |   median_table <- data %>%
62 |     dplyr::distinct({{ sample }}, {{ grouping }}, {{ intensity }}) %>%
63 |     dplyr::group_by({{ sample }}) %>%
64 |     dplyr::summarize(
65 |       median_intensity = stats::median({{ intensity }}, na.rm = TRUE),
66 |       .groups = "drop"
67 |     )
68 |
69 |   if (plot == FALSE) {
70 |     return(median_table)
71 |   }
72 |   # Character sample names become a factor with numeric-aware ordered levels for the x-axis.
73 |   if (is.character(dplyr::pull(median_table, {{ sample }}))) {
74 |     median_table <- median_table %>%
75 |       dplyr::mutate({{ sample }} := factor({{ sample }},
76 |         levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE))
77 |       ))
78 |   }
79 |
80 |   median_plot <- median_table %>%
81 |     ggplot2::ggplot(ggplot2::aes({{ sample }}, .data$median_intensity, group = 1)) +
82 |     ggplot2::geom_line(size = 1) +
83 |     ggplot2::labs(title = "Medians of run intensities", x = "", y = "Intensity") +
84 |     ggplot2::theme_bw() +
85 |     ggplot2::theme(
86 |       plot.title = ggplot2::element_text(size = 20),
87 |       axis.title.x = ggplot2::element_text(size = 15),
88 |       axis.text.y = ggplot2::element_text(size = 15),
89 |       axis.text.x = ggplot2::element_text(size = 12, angle = 75, hjust = 1),
90 |       axis.title.y = ggplot2::element_text(size = 15)
91 |     )
92 |
93 |   if (interactive == FALSE) {
94 |     return(median_plot)
95 |   }
96 |   suppressWarnings(plotly::ggplotly(median_plot))
97 | }
98 |
--------------------------------------------------------------------------------
/man/barcode_plot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/barcode_plot.R
3 | \name{barcode_plot}
4 | \alias{barcode_plot}
5 | \title{Barcode plot}
6 | \usage{
7 | barcode_plot(
8 | data,
9 | start_position,
10 | end_position,
11 | protein_length,
12 | coverage = NULL,
13 | colouring = NULL,
14 | fill_colour_gradient = protti::mako_colours,
15 | fill_colour_discrete = c("#999999", protti::protti_colours),
16 | protein_id = NULL,
17 | facet = NULL,
18 | facet_n_col = 4,
19 | cutoffs = NULL
20 | )
21 | }
22 | \arguments{
23 | \item{data}{a data frame containing differential abundance, start and end peptide or precursor positions and protein length.}
24 |
25 | \item{start_position}{a numeric column in the data frame containing the start positions for each peptide or precursor.}
26 |
27 | \item{end_position}{a numeric column in the data frame containing the end positions for each peptide or precursor.}
28 |
29 | \item{protein_length}{a numeric column in the data frame containing the length of the protein.}
30 |
31 | \item{coverage}{optional, numeric column in the data frame containing coverage in percent. Will appear in the title of the barcode if provided.}
32 |
33 | \item{colouring}{optional, column in the data frame containing information by which peptide or precursors should
34 | be colored.}
35 |
36 | \item{fill_colour_gradient}{a vector that contains colours that should be used to create a colour gradient
37 | for the barcode plot bars if the \code{colouring} argument is continuous. Default is \code{mako_colours}.}
38 |
39 | \item{fill_colour_discrete}{a vector that contains colours that should be used to fill the barcode plot bars
40 | if the \code{colouring} argument is discrete. Default is \code{protti_colours}.}
41 |
42 | \item{protein_id}{optional, column in the data frame containing protein identifiers. Required if only one protein
43 | should be plotted and the data frame contains only information for this protein.}
44 |
45 | \item{facet}{optional, column in the data frame containing information by which data should be faceted. This can be
46 | protein identifiers. Only 20 proteins are plotted at a time, the rest is ignored. If more should be plotted, a mapper over a
47 | subsetted data frame should be created.}
48 |
49 | \item{facet_n_col}{a numeric value that specifies the number of columns the faceted plot should have
50 | if a column name is provided to \code{facet}. The default is 4.}
51 |
52 | \item{cutoffs}{optional argument specifying the log2 fold change and significance cutoffs used for highlighting peptides.
53 | If this argument is provided colouring information will be overwritten with peptides that fulfill this condition.
54 | The cutoff should be provided in a vector of the form c(diff = 2, pval = 0.05). The name of the cutoff should reflect the
55 | column name that contains this information (log2 fold changes, p-values or adjusted p-values).}
56 | }
57 | \value{
58 | A barcode plot is returned.
59 | }
60 | \description{
61 | Plots a "barcode plot" - a vertical line for each identified peptide. Peptides can be colored based on an additional variable. Also differential
62 | abundance can be displayed.
63 | }
64 | \examples{
65 |
66 | data <- data.frame(
67 | start = c(5, 40, 55, 130, 181, 195),
68 | end = c(11, 51, 60, 145, 187, 200),
69 | length = rep(200, 6),
70 | pg_protein_accessions = rep("Protein 1", 6),
71 | diff = c(1, 2, 5, 2, 1, 1),
72 | pval = c(0.1, 0.01, 0.01, 0.2, 0.2, 0.01)
73 | )
74 |
75 | barcode_plot(
76 | data,
77 | start_position = start,
78 | end_position = end,
79 | protein_length = length,
80 | facet = pg_protein_accessions,
81 | cutoffs = c(diff = 2, pval = 0.05)
82 | )
83 | }
84 |
--------------------------------------------------------------------------------
/man/calculate_kegg_enrichment.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/calculate_kegg_enrichment.R
3 | \name{calculate_kegg_enrichment}
4 | \alias{calculate_kegg_enrichment}
5 | \title{Perform KEGG pathway enrichment analysis}
6 | \usage{
7 | calculate_kegg_enrichment(
8 | data,
9 | protein_id,
10 | is_significant,
11 | pathway_id = pathway_id,
12 | pathway_name = pathway_name,
13 | plot = TRUE,
14 | plot_cutoff = "adj_pval top10"
15 | )
16 | }
17 | \arguments{
18 | \item{data}{a data frame that contains at least the input variables.}
19 |
20 | \item{protein_id}{a character column in the \code{data} data frame that contains the protein
21 | accession numbers.}
22 |
23 | \item{is_significant}{a logical column in the \code{data} data frame that indicates if the
24 | corresponding protein has a significantly changing peptide. The input data frame may contain
25 | peptide level information with significance information. The function is able to extract
26 | protein level information from this.}
27 |
28 | \item{pathway_id}{a character column in the \code{data} data frame that contains KEGG pathway
29 | identifiers. These can be obtained from KEGG using \code{fetch_kegg}.}
30 |
31 | \item{pathway_name}{a character column in the \code{data} data frame that contains KEGG pathway
32 | names. These can be obtained from KEGG using \code{fetch_kegg}.}
33 |
34 | \item{plot}{a logical value indicating whether the result should be plotted or returned as a
35 | table.}
36 |
37 | \item{plot_cutoff}{a character value indicating if the plot should contain the top 10 most
38 | significant pathways (p-value or adjusted p-value), or if a significance cutoff should be used
39 | to determine the number of pathways in the plot. This information should be provided with the
40 | type first followed by the threshold separated by a space. Examples are
41 | \code{plot_cutoff = "adj_pval top10"}, \code{plot_cutoff = "pval 0.05"} or
42 | \code{plot_cutoff = "adj_pval 0.01"}. The threshold can be chosen freely.}
43 | }
44 | \value{
45 | A bar plot displaying negative log10 adjusted p-values for the top 10 enriched pathways.
46 | Bars are coloured according to the direction of the enrichment. If \code{plot = FALSE}, a data
47 | frame is returned.
48 | }
49 | \description{
50 | Analyses enrichment of KEGG pathways associated with proteins in the fraction of significant
51 | proteins compared to all detected proteins. A Fisher's exact test is performed to test
52 | significance of enrichment.
53 | }
54 | \examples{
55 | \donttest{
56 | # Load libraries
57 | library(dplyr)
58 |
59 | set.seed(123) # Makes example reproducible
60 |
61 | # Create example data
62 | kegg_data <- fetch_kegg(species = "eco")
63 |
64 | if (!is.null(kegg_data)) { # only proceed if information was retrieved
65 | data <- kegg_data \%>\%
66 | group_by(uniprot_id) \%>\%
67 | mutate(significant = rep(
68 | sample(
69 | x = c(TRUE, FALSE),
70 | size = 1,
71 | replace = TRUE,
72 | prob = c(0.2, 0.8)
73 | ),
74 | n = n()
75 | ))
76 |
77 | # Plot KEGG enrichment
78 | calculate_kegg_enrichment(
79 | data,
80 | protein_id = uniprot_id,
81 | is_significant = significant,
82 | pathway_id = pathway_id,
83 | pathway_name = pathway_name,
84 | plot = TRUE,
85 | plot_cutoff = "pval 0.05"
86 | )
87 |
88 | # Calculate KEGG enrichment
89 | kegg <- calculate_kegg_enrichment(
90 | data,
91 | protein_id = uniprot_id,
92 | is_significant = significant,
93 | pathway_id = pathway_id,
94 | pathway_name = pathway_name,
95 | plot = FALSE
96 | )
97 |
98 | head(kegg, n = 10)
99 | }
100 | }
101 | }
102 |
--------------------------------------------------------------------------------
/R/calculate_imputation.R:
--------------------------------------------------------------------------------
#' Sampling of values for imputation
#'
#' \code{calculate_imputation} is a helper function that is used in the \code{impute} function.
#' Depending on the type of missingness and method, it samples a value from a normal distribution
#' that can be used for the imputation. Note: The input intensities should be log2 transformed.
#'
#' @param min a numeric value specifying the minimal intensity value of the precursor/peptide.
#' Is only required if \code{method = "ludovic"} and \code{missingness = "MNAR"}.
#' @param noise a numeric value specifying a noise value for the precursor/peptide. Is only
#' required if \code{method = "noise"} and \code{missingness = "MNAR"}.
#' @param mean a numeric value specifying the mean intensity value of the condition with missing
#' values for a given precursor/peptide. Is only required if \code{missingness = "MAR"}.
#' @param sd a numeric value specifying the mean of the standard deviation of all conditions for
#' a given precursor/peptide.
#' @param missingness a character value specifying the missingness type of the data, which
#' determines how values for imputation are sampled. This can be \code{"MAR"} or \code{"MNAR"}.
#' If the argument is left at its default, \code{"MNAR"} is used. For any other value no
#' sampling is performed and \code{NA} is returned.
#' @param method a character value specifying the method to be used for imputation. For
#' \code{method = "ludovic"}, MNAR missingness is sampled around a value that is three lower
#' (log2) than the lowest intensity value recorded for the precursor/peptide. For
#' \code{method = "noise"}, MNAR missingness is sampled around the noise value for the
#' precursor/peptide. If the argument is left at its default, \code{"ludovic"} is used.
#' @param skip_log2_transform_error a logical value, if FALSE a check is performed to validate that
#' input values are log2 transformed. If input values are > 40 the test is failed and an error is
#' returned.
#'
#' @return A value sampled from a normal distribution with the input parameters. Method specifics
#' are applied to input parameters prior to sampling. \code{NA} is returned if \code{missingness}
#' is neither \code{"MNAR"} nor \code{"MAR"}.
calculate_imputation <- function(min = NULL,
                                 noise = NULL,
                                 mean = NULL,
                                 sd,
                                 missingness = c("MNAR", "MAR"),
                                 method = c("ludovic", "noise"),
                                 skip_log2_transform_error = FALSE) {
  # A supplied value above 40 is taken as a sign that the input was not log2
  # transformed. Absent (NULL) and missing (NA) values never trigger the check.
  exceeds_log2_limit <- function(value) {
    !is.null(value) && isTRUE(value[1] > 40)
  }

  not_log2 <- exceeds_log2_limit(min) ||
    exceeds_log2_limit(mean) ||
    exceeds_log2_limit(noise)

  if (not_log2 && !skip_log2_transform_error) {
    stop(strwrap("Input intensities seem not to be log2 transformed. If they are and you want
                 to proceed set the skip_log2_transform_error argument to TRUE. Notice that
                 this function does not give correct results for non-log2 transformed data.",
      prefix = "\n", initial = ""
    ))
  }

  # Resolve the untouched default vector to its first choice (as match.arg()
  # would). match.arg() itself is not used here because values other than
  # "MNAR"/"MAR" must yield NA instead of an error.
  if (identical(missingness, c("MNAR", "MAR"))) {
    missingness <- "MNAR"
  }

  if (!(missingness %in% c("MNAR", "MAR"))) {
    return(NA)
  }

  # Validated only after the early return above, so that rows that are not
  # imputed are unaffected by the method argument.
  method <- match.arg(method, c("ludovic", "noise"))

  # MAR is sampled around the condition mean for both methods; MNAR is sampled
  # around a method specific centre.
  centre <- if (missingness == "MAR") {
    mean
  } else if (method == "ludovic") {
    min - 3
  } else {
    noise
  }

  # Warnings (e.g. NA produced for an NA standard deviation) are suppressed on
  # purpose; the resulting NA is passed on to the caller.
  suppressWarnings(stats::rnorm(1, mean = centre, sd = sd))
}
76 |
--------------------------------------------------------------------------------
/R/calculate_aa_scores.R:
--------------------------------------------------------------------------------
#' Calculate scores for each amino acid position in a protein sequence
#'
#' `r lifecycle::badge("experimental")`
#' Calculate a score for each amino acid position in a protein sequence based on the product of the
#' -log10(adjusted p-value) and the absolute log2(fold change) per peptide covering this amino acid.
#' In detail, all the peptides are aligned along the sequence of the corresponding protein, and the
#' average score per amino acid position is computed. In a limited proteolysis coupled to mass
#' spectrometry (LiP-MS) experiment, the score allows to prioritize and narrow down structurally
#' affected regions.
#'
#' @param data a data frame containing at least the input columns. Any existing grouping is
#' ignored.
#' @param adj_pval a numeric column in the \code{data} data frame containing the adjusted p-value.
#' @param diff a numeric column in the \code{data} data frame containing the log2 fold change.
#' @param start_position a numeric column in the \code{data} data frame containing the start
#' position of a peptide or precursor.
#' @param end_position a numeric column in the \code{data} data frame containing the end position
#' of a peptide or precursor.
#' @param protein a character column in the \code{data} data frame containing the protein
#' identifier or name.
#' @param retain_columns a vector indicating if certain columns should be retained from the input
#' data frame. Default is not retaining additional columns \code{retain_columns = NULL}. Specific
#' columns can be retained by providing their names (not in quotation marks, just like other
#' column names, but in a vector).
#'
#' @return An ungrouped data frame that contains the aggregated scores per amino acid position,
#' enabling to draw fingerprints for each individual protein. It contains the protein column, a
#' \code{residue} column, an \code{amino_acid_score} column and, if requested, the retained
#' columns.
#'
#' @author Patrick Stalder
#' @import dplyr
#' @import tidyr
#' @export
#'
#' @examples
#'
#' data <- data.frame(
#'   pg_protein_accessions = c(rep("protein_1", 10)),
#'   diff = c(2, -3, 1, 2, 3, -3, 5, 1, -0.5, 2),
#'   adj_pval = c(0.001, 0.01, 0.2, 0.05, 0.002, 0.5, 0.4, 0.7, 0.001, 0.02),
#'   start = c(1, 3, 5, 10, 15, 25, 28, 30, 41, 51),
#'   end = c(6, 8, 10, 16, 23, 35, 35, 35, 48, 55)
#' )
#' calculate_aa_scores(
#'   data,
#'   protein = pg_protein_accessions,
#'   diff = diff,
#'   adj_pval = adj_pval,
#'   start_position = start,
#'   end_position = end
#' )
calculate_aa_scores <- function(data,
                                protein,
                                diff = diff,
                                adj_pval = adj_pval,
                                start_position,
                                end_position,
                                retain_columns = NULL) {
  # One score per unique peptide: -log10(adjusted p-value) * |log2 fold change|.
  # Peptides without a fold change or adjusted p-value cannot be scored.
  peptide_scores <- data %>%
    dplyr::ungroup() %>%
    dplyr::distinct(
      {{ protein }},
      {{ diff }},
      {{ adj_pval }},
      {{ start_position }},
      {{ end_position }}
    ) %>%
    tidyr::drop_na({{ diff }}, {{ adj_pval }}) %>%
    dplyr::mutate(score = -log10({{ adj_pval }}) * abs({{ diff }}))

  # Expand every peptide to the residues it covers and average the peptide
  # scores per residue. mutate() + distinct() is used instead of summarise()
  # to keep the rows in order of first appearance.
  output <- peptide_scores %>%
    dplyr::rowwise() %>%
    dplyr::mutate(residue = list(seq({{ start_position }}, {{ end_position }}))) %>%
    tidyr::unnest("residue") %>%
    dplyr::group_by({{ protein }}, .data$residue) %>%
    dplyr::mutate(amino_acid_score = mean(.data$score)) %>%
    dplyr::distinct({{ protein }}, .data$residue, .data$amino_acid_score) %>%
    # Do not leak the per-residue grouping to the caller; the result is
    # ungrouped regardless of whether columns are retained.
    dplyr::ungroup()

  if (!missing(retain_columns)) {
    # Every output column except the two computed ones identifies the protein
    # and serves as the join key.
    key_columns <- setdiff(colnames(output), c("residue", "amino_acid_score"))

    output <- data %>%
      # Ungroup first, otherwise select() silently re-adds grouping variables
      # of a grouped input, which adds columns and can duplicate rows in the
      # join.
      dplyr::ungroup() %>%
      dplyr::select({{ retain_columns }}, dplyr::all_of(key_columns)) %>%
      dplyr::distinct() %>%
      dplyr::right_join(output, by = key_columns)
  }

  output
}
83 |
--------------------------------------------------------------------------------