├── _pkgdown.yml ├── .github ├── .gitignore └── workflows │ ├── test-coverage.yaml │ ├── pkgdown.yaml │ ├── R-CMD-check.yaml │ └── format-code.yml ├── vignettes ├── .gitignore └── figures │ ├── interaction_2hwg.png │ ├── peptide_map_1zmr.png │ ├── peptide_map_2hwg.png │ ├── peptide_map_1zmr_score.png │ └── peptide_map_2hwg_score.png ├── revdep ├── failures.md ├── problems.md ├── .gitignore ├── cran.md └── README.md ├── data ├── metal_list.rda ├── ptsi_pgk.rda ├── mako_colours.rda ├── protti_colours.rda ├── rapamycin_10uM.rda ├── viridis_colours.rda ├── metal_chebi_uniprot.rda ├── metal_go_slim_subset.rda └── rapamycin_dose_response.rda ├── tests ├── testthat.R └── testthat │ ├── test_import.csv │ └── test-queue_functions.R ├── LICENSE ├── man ├── figures │ ├── logo.png │ ├── README-volcano-1.png │ ├── lifecycle-stable.svg │ ├── lifecycle-defunct.svg │ ├── lifecycle-archived.svg │ ├── lifecycle-maturing.svg │ ├── lifecycle-deprecated.svg │ ├── lifecycle-superseded.svg │ ├── lifecycle-experimental.svg │ └── lifecycle-questioning.svg ├── protti_colours.Rd ├── mako_colours.Rd ├── viridis_colours.Rd ├── metal_list.Rd ├── plot_peptide_profiles.Rd ├── peptide_type.Rd ├── plot_pval_distribution.Rd ├── sequence_coverage.Rd ├── split_metal_name.Rd ├── volcano_protti.Rd ├── plot_drc_4p.Rd ├── median_normalisation.Rd ├── kegg_enrichment.Rd ├── network_analysis.Rd ├── fetch_go.Rd ├── metal_chebi_uniprot.Rd ├── replace_identified_by_x.Rd ├── metal_go_slim_subset.Rd ├── fetch_kegg.Rd ├── treatment_enrichment.Rd ├── read_protti.Rd ├── go_enrichment.Rd ├── rapamycin_10uM.Rd ├── scale_protti.Rd ├── rapamycin_dose_response.Rd ├── fetch_chebi.Rd ├── calculate_sequence_coverage.Rd ├── find_chebis.Rd ├── find_peptide.Rd ├── drc_4p.Rd ├── ttest_protti.Rd ├── normalise.Rd ├── pval_distribution_plot.Rd ├── anova_protti.Rd ├── fetch_uniprot_proteome.Rd ├── fetch_mobidb.Rd ├── assign_peptide_type.Rd ├── ptsi_pgk.Rd ├── try_query.Rd ├── find_all_subs.Rd ├── qc_sequence_coverage.Rd ├── 
qc_median_intensities.Rd ├── fetch_uniprot.Rd ├── qc_contaminants.Rd ├── qc_intensity_distribution.Rd ├── qc_proteome_coverage.Rd ├── calculate_imputation.Rd ├── qc_data_completeness.Rd ├── fetch_alphafold_aligned_error.Rd ├── qc_sample_correlation.Rd ├── randomise_queue.Rd ├── qc_cvs.Rd ├── fetch_eco.Rd ├── filter_cv.Rd ├── qc_ranked_intensities.Rd ├── calculate_aa_scores.Rd ├── qc_peak_width.Rd ├── qc_ids.Rd ├── qc_pca.Rd ├── qc_peptide_type.Rd ├── qc_charge_states.Rd ├── predict_alphafold_domain.Rd ├── diff_abundance.Rd ├── fetch_quickgo.Rd ├── fetch_pdb.Rd ├── qc_missed_cleavages.Rd ├── barcode_plot.Rd └── calculate_kegg_enrichment.Rd ├── pkgdown └── favicon │ ├── favicon.ico │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── apple-touch-icon.png │ ├── apple-touch-icon-120x120.png │ ├── apple-touch-icon-152x152.png │ ├── apple-touch-icon-180x180.png │ ├── apple-touch-icon-60x60.png │ └── apple-touch-icon-76x76.png ├── .Rbuildignore ├── codecov.yml ├── protti.Rproj ├── R ├── zzz.R ├── read_protti.R ├── scale_protti.R ├── replace_identified_by_x.R ├── ttest_protti.R ├── fetch_go.R ├── find_chebis.R ├── find_peptide.R ├── drc_4p.R ├── anova_protti.R ├── fetch_kegg.R ├── find_all_subs.R ├── normalise.R ├── fetch_uniprot_proteome.R ├── calculate_sequence_coverage.R ├── pval_distribution_plot.R ├── assign_peptide_type.R ├── qc_median_intensities.R ├── calculate_imputation.R └── calculate_aa_scores.R ├── inst └── CITATION ├── cran-comments.md ├── .gitignore ├── LICENSE.md ├── data-raw ├── rapamycin_10uM.R ├── rapamycin_dose_response.R ├── ptsi_pgk.R └── protti_colours.R └── DESCRIPTION /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- 
/vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /revdep/failures.md: -------------------------------------------------------------------------------- 1 | *Wow, no problems at all. :)* -------------------------------------------------------------------------------- /revdep/problems.md: -------------------------------------------------------------------------------- 1 | *Wow, no problems at all. :)* -------------------------------------------------------------------------------- /data/metal_list.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/data/metal_list.rda -------------------------------------------------------------------------------- /data/ptsi_pgk.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/data/ptsi_pgk.rda -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(protti) 3 | 4 | test_check("protti") 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2021 2 | COPYRIGHT HOLDER: ETH Zurich, Jan-Philipp Quast, Dina Schuster 3 | -------------------------------------------------------------------------------- /data/mako_colours.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/data/mako_colours.rda -------------------------------------------------------------------------------- /man/figures/logo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/man/figures/logo.png -------------------------------------------------------------------------------- /data/protti_colours.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/data/protti_colours.rda -------------------------------------------------------------------------------- /data/rapamycin_10uM.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/data/rapamycin_10uM.rda -------------------------------------------------------------------------------- /data/viridis_colours.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/data/viridis_colours.rda -------------------------------------------------------------------------------- /data/metal_chebi_uniprot.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/data/metal_chebi_uniprot.rda -------------------------------------------------------------------------------- /data/metal_go_slim_subset.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/data/metal_go_slim_subset.rda -------------------------------------------------------------------------------- /pkgdown/favicon/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/favicon.ico -------------------------------------------------------------------------------- /data/rapamycin_dose_response.rda: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jpquast/protti/HEAD/data/rapamycin_dose_response.rda -------------------------------------------------------------------------------- /man/figures/README-volcano-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/man/figures/README-volcano-1.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/favicon-16x16.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/favicon-32x32.png -------------------------------------------------------------------------------- /tests/testthat/test_import.csv: -------------------------------------------------------------------------------- 1 | Test.column,TestColumn,Test_column 2 | 10.1,_ABC_,1 3 | 11.3,_ABC_,2 4 | 14.1,_ABC_,3 -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/apple-touch-icon.png -------------------------------------------------------------------------------- /vignettes/figures/interaction_2hwg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/vignettes/figures/interaction_2hwg.png -------------------------------------------------------------------------------- /vignettes/figures/peptide_map_1zmr.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/vignettes/figures/peptide_map_1zmr.png -------------------------------------------------------------------------------- /vignettes/figures/peptide_map_2hwg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/vignettes/figures/peptide_map_2hwg.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-120x120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/apple-touch-icon-120x120.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-152x152.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/apple-touch-icon-152x152.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-180x180.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/apple-touch-icon-180x180.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-60x60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/apple-touch-icon-60x60.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-76x76.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jpquast/protti/HEAD/pkgdown/favicon/apple-touch-icon-76x76.png -------------------------------------------------------------------------------- /vignettes/figures/peptide_map_1zmr_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/vignettes/figures/peptide_map_1zmr_score.png -------------------------------------------------------------------------------- /vignettes/figures/peptide_map_2hwg_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpquast/protti/HEAD/vignettes/figures/peptide_map_2hwg_score.png -------------------------------------------------------------------------------- /revdep/.gitignore: -------------------------------------------------------------------------------- 1 | checks 2 | library 3 | checks.noindex 4 | library.noindex 5 | data.sqlite 6 | *.html 7 | download 8 | lib 9 | cloud.noindex -------------------------------------------------------------------------------- /revdep/cran.md: -------------------------------------------------------------------------------- 1 | ## revdepcheck results 2 | 3 | We checked 1 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package. 
4 | 5 | * We saw 0 new problems 6 | * We failed to check 0 packages 7 | 8 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^protti\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^README\.Rmd$ 5 | ^data-raw$ 6 | ^\.travis\.yml$ 7 | ^\.github$ 8 | ^codecov\.yml$ 9 | ^doc$ 10 | ^Meta$ 11 | ^_pkgdown\.yml$ 12 | ^docs$ 13 | ^pkgdown$ 14 | ^cran-comments\.md$ 15 | ^CRAN-RELEASE$ 16 | ^CRAN-SUBMISSION$ 17 | ^revdep$ -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /man/protti_colours.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{protti_colours} 5 | \alias{protti_colours} 6 | \title{Colour scheme for protti} 7 | \format{ 8 | A vector containing 100 colours 9 | } 10 | \source{ 11 | Dina's imagination. 12 | } 13 | \usage{ 14 | protti_colours 15 | } 16 | \description{ 17 | A colour scheme for protti that contains 100 colours. 
18 | } 19 | \keyword{datasets} 20 | -------------------------------------------------------------------------------- /protti.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /man/mako_colours.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{mako_colours} 5 | \alias{mako_colours} 6 | \title{Mako colour scheme} 7 | \format{ 8 | A vector containing 256 colours 9 | } 10 | \source{ 11 | created for the Seaborn statistical data visualization package for Python 12 | } 13 | \usage{ 14 | mako_colours 15 | } 16 | \description{ 17 | A perceptually uniform colour scheme originally created for the Seaborn python package. 
18 | } 19 | \keyword{datasets} 20 | -------------------------------------------------------------------------------- /man/viridis_colours.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{viridis_colours} 5 | \alias{viridis_colours} 6 | \title{Viridis colour scheme} 7 | \format{ 8 | A vector containing 256 colours 9 | } 10 | \source{ 11 | viridis R package, created by Stéfan van der Walt (stefanv) and Nathaniel Smith (njsmith) 12 | } 13 | \usage{ 14 | viridis_colours 15 | } 16 | \description{ 17 | The viridis colour scheme, taken from the viridis R package. 18 | } 19 | \keyword{datasets} 20 | -------------------------------------------------------------------------------- /man/metal_list.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{metal_list} 5 | \alias{metal_list} 6 | \title{List of metals} 7 | \format{ 8 | A data.frame containing the columns \code{atomic_number}, \code{symbol}, \code{name}, 9 | \code{type}, \code{chebi_id}. 10 | } 11 | \source{ 12 | https://en.wikipedia.org/wiki/Metal and https://en.wikipedia.org/wiki/Metalloid 13 | } 14 | \usage{ 15 | metal_list 16 | } 17 | \description{ 18 | A list of all metals and metalloids in the periodic table. 19 | } 20 | \keyword{datasets} 21 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onAttach <- function(libname, pkgname) { 2 | if (.Platform$OS.type == "unix") { 3 | packageStartupMessage( 4 | "\U1F469\U1F3FD\U200D\U1F52C Welcome to protti version ", 5 | utils::packageVersion("protti"), 6 | "! 
\U1F468\U1F3FC\U200D\U1F4BB 7 | \n\U1F52C Have fun analysing your data! \U1F4BB" 8 | ) 9 | } 10 | if (.Platform$OS.type == "windows") { 11 | packageStartupMessage( 12 | "Welcome to protti version ", 13 | utils::packageVersion("protti"), "! 14 | \nHave fun analysing your data!" 15 | ) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite protti in publications, please use:") 2 | 3 | bibentry(bibtype = "article", 4 | textVersion = "Quast, J.P., Schuster, D., Picotti, P. (2022). protti: an R package for comprehensive data analysis of peptide- and protein-centric bottom-up proteomics data. Bioinformatics Advances, 2(1).", 5 | author = "Jan-Philipp Quast, Dina Schuster, Paola Picotti", 6 | title = "protti: an R package for comprehensive data analysis of peptide- and protein-centric bottom-up proteomics data", 7 | journal = "Bioinformatics Advances", 8 | year = "2022", 9 | volume = "2", 10 | number = "1", 11 | ) 12 | -------------------------------------------------------------------------------- /man/plot_peptide_profiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/peptide_profile_plot.R 3 | \name{plot_peptide_profiles} 4 | \alias{plot_peptide_profiles} 5 | \title{Peptide abundance profile plot} 6 | \usage{ 7 | plot_peptide_profiles(...) 8 | } 9 | \value{ 10 | A list of peptide profile plots. 11 | } 12 | \description{ 13 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} 14 | This function was deprecated due to its name changing to \code{peptide_profile_plot()}. 
15 | } 16 | \keyword{internal} 17 | -------------------------------------------------------------------------------- /man/peptide_type.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/assign_peptide_type.R 3 | \name{peptide_type} 4 | \alias{peptide_type} 5 | \title{Assign peptide type} 6 | \usage{ 7 | peptide_type(...) 8 | } 9 | \value{ 10 | A data frame that contains the input data and an additional column with the peptide 11 | type information. 12 | } 13 | \description{ 14 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} 15 | This function was deprecated due to its name changing to \code{assign_peptide_type()}. 16 | } 17 | \keyword{internal} 18 | -------------------------------------------------------------------------------- /man/plot_pval_distribution.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pval_distribution_plot.R 3 | \name{plot_pval_distribution} 4 | \alias{plot_pval_distribution} 5 | \title{Plot histogram of p-value distribution} 6 | \usage{ 7 | plot_pval_distribution(...) 8 | } 9 | \value{ 10 | A histogram plot that shows the p-value distribution. 11 | } 12 | \description{ 13 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} 14 | This function was deprecated due to its name changing to \code{pval_distribution_plot()}. 
15 | } 16 | \keyword{internal} 17 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Submission 2 | 3 | * We specifically addressed and fixed the issue raised by Prof. Brian Ripley: 4 | * We updated `try_query()` to also handle request unrelated errors successfully. 5 | 6 | ## Test environments 7 | * macOS-latest (on GitHub actions), R 4.4.1 8 | * windows-latest (on GitHub actions), R 4.4.1 9 | * ubuntu-20.04 (on GitHub actions), R 4.4.1 10 | * ubuntu-20.04 (on GitHub actions), r-devel 11 | * windows-ix86+x86_64 (win-builder), r-devel 12 | * fedora-clang-devel (R-hub), r-devel 13 | * windows-x86_64-devel (R-hub), r-devel 14 | * Ubuntu Linux 20.04.1 LTS (R-hub), r-release 15 | 16 | ## R CMD check results 17 | 18 | 0 errors ✓ | 0 warnings ✓ | 0 notes ✓ 19 | 20 | -------------------------------------------------------------------------------- /man/sequence_coverage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculate_sequence_coverage.R 3 | \name{sequence_coverage} 4 | \alias{sequence_coverage} 5 | \title{Protein sequence coverage} 6 | \usage{ 7 | sequence_coverage(...) 8 | } 9 | \value{ 10 | A new column in the \code{data} data frame containing the calculated sequence coverage 11 | for each identified protein 12 | } 13 | \description{ 14 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} 15 | This function was deprecated due to its name changing to \code{calculate_sequence_coverage()}. 
16 | } 17 | \keyword{internal} 18 | -------------------------------------------------------------------------------- /man/split_metal_name.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/split_metal_name.R 3 | \name{split_metal_name} 4 | \alias{split_metal_name} 5 | \title{Convert metal names to search pattern} 6 | \usage{ 7 | split_metal_name(metal_names) 8 | } 9 | \arguments{ 10 | \item{metal_names}{a character vector containing names of metals and metal containing molecules.} 11 | } 12 | \value{ 13 | A character vector with metal name search patterns. 14 | } 15 | \description{ 16 | Converts a vector of metal names extracted from the \code{ft_metal} column 17 | obtained with \code{fetch_uniprot} to a pattern that can be used to search for corresponding 18 | ChEBI IDs. This is used as a helper function for other functions. 19 | } 20 | -------------------------------------------------------------------------------- /man/volcano_protti.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/volcano_plot.R 3 | \name{volcano_protti} 4 | \alias{volcano_protti} 5 | \title{Volcano plot} 6 | \usage{ 7 | volcano_protti(...) 8 | } 9 | \value{ 10 | Depending on the method used a volcano plot with either highlighted targets 11 | (\code{method = "target"}) or highlighted significant proteins (\code{method = "significant"}) 12 | is returned. 13 | } 14 | \description{ 15 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} 16 | This function was deprecated due to its name changing to \code{volcano_plot()}. 
17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/plot_drc_4p.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/drc_4p_plot.R 3 | \name{plot_drc_4p} 4 | \alias{plot_drc_4p} 5 | \title{Plotting of four-parameter dose response curves} 6 | \usage{ 7 | plot_drc_4p(...) 8 | } 9 | \value{ 10 | If \code{targets = "all"} a list containing plots for every unique identifier in the 11 | \code{grouping} variable is created. Otherwise a plot for the specified targets is created with 12 | maximally 20 facets. 13 | } 14 | \description{ 15 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} 16 | This function was deprecated due to its name changing to \code{drc_4p_plot()}. 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/median_normalisation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/normalise.R 3 | \name{median_normalisation} 4 | \alias{median_normalisation} 5 | \title{Intensity normalisation} 6 | \usage{ 7 | median_normalisation(...) 8 | } 9 | \value{ 10 | A data frame with a column called \code{normalised_intensity_log2} containing the 11 | normalised intensity values. 12 | } 13 | \description{ 14 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} 15 | This function was deprecated due to its name changing to \code{normalise()}. 16 | The normalisation method in the new function needs to be provided as an argument. 
17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/kegg_enrichment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculate_kegg_enrichment.R 3 | \name{kegg_enrichment} 4 | \alias{kegg_enrichment} 5 | \title{Perform KEGG pathway enrichment analysis} 6 | \usage{ 7 | kegg_enrichment(...) 8 | } 9 | \value{ 10 | A bar plot displaying negative log10 adjusted p-values for the top 10 enriched pathways. 11 | Bars are coloured according to the direction of the enrichment. If \code{plot = FALSE}, a data 12 | frame is returned. 13 | } 14 | \description{ 15 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} 16 | This function was deprecated due to its name changing to \code{calculate_kegg_enrichment()}. 
17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # User-specific files 9 | .Ruserdata 10 | 11 | # Example code in package build process 12 | *-Ex.R 13 | 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | 17 | # Output files from R CMD check 18 | /*.Rcheck/ 19 | 20 | # RStudio files 21 | .Rproj.user/ 22 | 23 | # produced vignettes 24 | vignettes/*.html 25 | vignettes/*.pdf 26 | 27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 28 | .httr-oauth 29 | 30 | # knitr and R markdown default cache directories 31 | *_cache/ 32 | /cache/ 33 | 34 | # Temporary files created by R markdown 35 | *.utf8.md 36 | *.knit.md 37 | 38 | # R Environment Variables 39 | .Renviron 40 | 41 | .DS_Store 42 | inst/doc 43 | doc 44 | Meta 45 | docs 46 | /doc/ 47 | /Meta/ 48 | -------------------------------------------------------------------------------- /man/network_analysis.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/analyse_functional_network.R 3 | \name{network_analysis} 4 | \alias{network_analysis} 5 | \title{Analyse protein interaction network for significant hits} 6 | \usage{ 7 | network_analysis(...) 8 | } 9 | \value{ 10 | A network plot displaying interactions of the provided proteins. If 11 | \code{binds_treatment} was provided halos around the proteins show which proteins interact with 12 | the treatment. If \code{plot = FALSE} a data frame with interaction information is returned. 
13 | } 14 | \description{ 15 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} 16 | This function was deprecated due to its name changing to \code{analyse_functional_network()}. 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/fetch_go.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fetch_go.R 3 | \name{fetch_go} 4 | \alias{fetch_go} 5 | \title{Fetch gene ontology information from geneontology.org} 6 | \usage{ 7 | fetch_go(organism_id) 8 | } 9 | \arguments{ 10 | \item{organism_id}{a character value that specifies the NCBI taxonomy identifier of an organism (TaxId). 11 | Possible inputs include only: "9606" (Human), "559292" (Yeast) and "83333" (E. coli).} 12 | } 13 | \value{ 14 | A data frame that contains gene ontology mappings to UniProt or SGD IDs. The original 15 | file is a .GAF file. A detailed description of all columns can be found here: 16 | http://geneontology.org/docs/go-annotation-file-gaf-format-2.1/ 17 | } 18 | \description{ 19 | Fetches gene ontology data from geneontology.org for the provided organism ID. 
20 | } 21 | \examples{ 22 | \donttest{ 23 | go <- fetch_go("9606") 24 | 25 | head(go) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /man/metal_chebi_uniprot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{metal_chebi_uniprot} 5 | \alias{metal_chebi_uniprot} 6 | \title{List of metal-related ChEBI IDs in UniProt} 7 | \format{ 8 | A data.frame containing information retrieved from ChEBI using \code{fetch_chebi(stars = c(2, 3))}, 9 | filtered using symbols in the \code{metal_list} and manual annotation of metal related ChEBI IDs that do not 10 | contain a formula. 11 | } 12 | \source{ 13 | UniProt (cc_cofactor, cc_catalytic_activity, ft_binding) and ChEBI 14 | } 15 | \usage{ 16 | metal_chebi_uniprot 17 | } 18 | \description{ 19 | A list that contains all ChEBI IDs that appear in UniProt and that contain either a metal atom 20 | in their formula or that do not have a formula but the ChEBI term is related to metals. 21 | This was last updated on the 19/02/24. 
22 | } 23 | \keyword{datasets} 24 | -------------------------------------------------------------------------------- /man/replace_identified_by_x.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/replace_identified_by_x.R 3 | \name{replace_identified_by_x} 4 | \alias{replace_identified_by_x} 5 | \title{Replace identified positions in protein sequence by "x"} 6 | \usage{ 7 | replace_identified_by_x(sequence, positions_start, positions_end) 8 | } 9 | \arguments{ 10 | \item{sequence}{a character value that contains the protein sequence.} 11 | 12 | \item{positions_start}{a numeric vector of start positions of the identified peptides.} 13 | 14 | \item{positions_end}{a numeric vector of end positions of the identified peptides.} 15 | } 16 | \value{ 17 | A character vector that contains the modified protein sequence with each identified 18 | position replaced by "x". 19 | } 20 | \description{ 21 | Helper function for the calculation of sequence coverage, replaces identified positions with an 22 | "x" within the protein sequence. 
23 | } 24 | -------------------------------------------------------------------------------- /man/figures/lifecycle-stable.svg: -------------------------------------------------------------------------------- 1 | lifecyclelifecyclestablestable -------------------------------------------------------------------------------- /man/figures/lifecycle-defunct.svg: -------------------------------------------------------------------------------- 1 | lifecyclelifecycledefunctdefunct -------------------------------------------------------------------------------- /man/metal_go_slim_subset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{metal_go_slim_subset} 5 | \alias{metal_go_slim_subset} 6 | \title{Molecular function gene ontology metal subset} 7 | \format{ 8 | A data.frame containing a slim subset of molecular function gene ontology terms 9 | that are related to metal binding. The \code{slims_from_id} column contains all IDs relevant 10 | in this subset while the \code{slims_to_ids} column contains the starting IDs. If ChEBI IDs 11 | have been annotated manually this is indicated in the \code{database} column. 12 | } 13 | \source{ 14 | QuickGO and ChEBI 15 | } 16 | \usage{ 17 | metal_go_slim_subset 18 | } 19 | \description{ 20 | A subset of molecular function gene ontology terms related to metals that was created 21 | using the slimming process provided by the QuickGO EBI database. 22 | This was last updated on the 19/02/24. 
23 | } 24 | \keyword{datasets} 25 | -------------------------------------------------------------------------------- /man/figures/lifecycle-archived.svg: -------------------------------------------------------------------------------- 1 | lifecyclelifecyclearchivedarchived -------------------------------------------------------------------------------- /man/figures/lifecycle-maturing.svg: -------------------------------------------------------------------------------- 1 | lifecyclelifecyclematuringmaturing -------------------------------------------------------------------------------- /man/figures/lifecycle-deprecated.svg: -------------------------------------------------------------------------------- 1 | lifecyclelifecycledeprecateddeprecated -------------------------------------------------------------------------------- /man/figures/lifecycle-superseded.svg: -------------------------------------------------------------------------------- 1 | lifecyclelifecyclesupersededsuperseded -------------------------------------------------------------------------------- /man/figures/lifecycle-experimental.svg: -------------------------------------------------------------------------------- 1 | lifecyclelifecycleexperimentalexperimental -------------------------------------------------------------------------------- /man/figures/lifecycle-questioning.svg: -------------------------------------------------------------------------------- 1 | lifecyclelifecyclequestioningquestioning -------------------------------------------------------------------------------- /man/fetch_kegg.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fetch_kegg.R 3 | \name{fetch_kegg} 4 | \alias{fetch_kegg} 5 | \title{Fetch KEGG pathway data from KEGG} 6 | \usage{ 7 | fetch_kegg(species) 8 | } 9 | \arguments{ 10 | \item{species}{a character value providing an abreviated species 
name. "hsa" for human, "eco" 11 | for E. coli and "sce" for S. cerevisiae. Additional possible names can be found for 12 | \href{https://www.genome.jp/kegg-bin/show_organism?category=Eukaryotes}{eukaryotes} and for 13 | \href{https://www.genome.jp/kegg-bin/show_organism?category=Prokaryotes}{prokaryotes}.} 14 | } 15 | \value{ 16 | A data frame that contains gene IDs with corresponding pathway IDs and names for a 17 | selected organism. 18 | } 19 | \description{ 20 | Fetches gene IDs and corresponding pathway IDs and names for the provided organism. 21 | } 22 | \examples{ 23 | \donttest{ 24 | kegg <- fetch_kegg(species = "hsa") 25 | 26 | head(kegg) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /man/treatment_enrichment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculate_treatment_enrichment.R 3 | \name{treatment_enrichment} 4 | \alias{treatment_enrichment} 5 | \title{Check treatment enrichment} 6 | \usage{ 7 | treatment_enrichment(...) 8 | } 9 | \value{ 10 | A bar plot displaying the percentage of all detect proteins and all significant proteins 11 | that bind to the treatment. A Fisher's exact test is performed to calculate the significance of 12 | the enrichment in significant proteins compared to all proteins. The result is reported as a 13 | p-value. If \code{plot = FALSE} a contingency table in long format is returned. 14 | } 15 | \description{ 16 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} 17 | This function was deprecated due to its name changing to \code{calculate_treatment_enrichment()}. 
18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/read_protti.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read_protti.R 3 | \name{read_protti} 4 | \alias{read_protti} 5 | \title{Read, clean and convert} 6 | \usage{ 7 | read_protti(filename, ...) 8 | } 9 | \arguments{ 10 | \item{filename}{a character value that specifies the path to the file.} 11 | 12 | \item{...}{additional arguments for the fread function.} 13 | } 14 | \value{ 15 | A data frame (with class tibble) that contains the content of the specified file. 16 | } 17 | \description{ 18 | The function uses the very fast \code{fread} function from the \code{data.table} package. The 19 | column names of the resulting data table are made more r-friendly using \code{clean_names} from 20 | the \code{janitor} package. It replaces "." and " " with "_" and converts names to lower case 21 | which is also known as snake_case. In the end the data table is converted to a tibble. 22 | } 23 | \examples{ 24 | \dontrun{ 25 | read_protti("folder\\\\filename") 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /man/go_enrichment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculate_go_enrichment.R 3 | \name{go_enrichment} 4 | \alias{go_enrichment} 5 | \title{Perform gene ontology enrichment analysis} 6 | \usage{ 7 | go_enrichment(...) 8 | } 9 | \value{ 10 | A bar plot displaying negative log10 adjusted p-values for the top 10 enriched or 11 | depleted gene ontology terms. Alternatively, plot cutoffs can be chosen individually with the 12 | \code{plot_cutoff} argument. Bars are colored according to the direction of the enrichment 13 | (enriched or deenriched). 
If \code{plot = FALSE}, a data frame is returned. P-values are 14 | adjusted with Benjamini-Hochberg. 15 | } 16 | \description{ 17 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} 18 | This function was deprecated due to its name changing to \code{calculate_go_enrichment()}. 19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /man/rapamycin_10uM.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{rapamycin_10uM} 5 | \alias{rapamycin_10uM} 6 | \title{Rapamycin 10 uM example data} 7 | \format{ 8 | A data frame containing peptide level data from a Spectronaut report. 9 | } 10 | \source{ 11 | Piazza, I., Beaton, N., Bruderer, R. et al. A machine learning-based chemoproteomic 12 | approach to identify drug targets and binding sites in complex proteomes. Nat Commun 11, 4200 13 | (2020). \doi{10.1038/s41467-020-18071-x} 14 | } 15 | \usage{ 16 | rapamycin_10uM 17 | } 18 | \description{ 19 | Rapamycin example data used for the vignette about binary control/treated data. The data was 20 | obtained from \href{https://www.nature.com/articles/s41467-020-18071-x}{Piazza 2020} 21 | and corresponds to experiment 18. FKBP1A the rapamycin binding protein and 49 other randomly 22 | sampled proteins were used for this example dataset. Furthermore, only the DMSO control and the 23 | 10 uM condition were used. 
#' Read, clean and convert
#'
#' Reads a file with the very fast \code{fread} function from the \code{data.table} package. The
#' column names of the resulting data table are then made more r-friendly using \code{clean_names}
#' from the \code{janitor} package. It replaces "." and " " with "_" and converts names to lower
#' case which is also known as snake_case. In the end the data table is converted to a tibble.
#'
#' @param filename a character value that specifies the path to the file.
#' @param ... additional arguments for the fread function.
#'
#' @importFrom data.table fread
#' @importFrom janitor clean_names
#' @importFrom magrittr %>%
#'
#' @return A data frame (with class tibble) that contains the content of the specified file.
#' @export
#'
#' @examples
#' \dontrun{
#' read_protti("folder\\filename")
#' }
read_protti <- function(filename, ...) {
  # Step 1: read the file; everything passed via `...` is forwarded to fread.
  raw_table <- data.table::fread(filename, ...)

  # Step 2: convert the column names to snake_case.
  cleaned_table <- janitor::clean_names(raw_table)

  # Step 3: hand the result back as a tibble.
  tibble::as_tibble(cleaned_table)
}
This is done by subtracting the mean from every value and then dividing them by the 15 | standard deviation.} 16 | } 17 | \value{ 18 | A scaled numeric vector. 19 | } 20 | \description{ 21 | \code{scale_protti} is used to scale a numeric vector either between 0 and 1 or around a 22 | centered value using the standard deviation. If a vector containing only one value or 23 | repeatedly the same value is provided, 1 is returned as the scaled value for \code{method = "01"} 24 | and 0 is returned for \code{method = "center"}. 25 | } 26 | \examples{ 27 | scale_protti(c(1, 2, 1, 4, 6, 8), method = "01") 28 | } 29 | -------------------------------------------------------------------------------- /man/rapamycin_dose_response.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{rapamycin_dose_response} 5 | \alias{rapamycin_dose_response} 6 | \title{Rapamycin dose response example data} 7 | \format{ 8 | A data frame containing peptide level data from a Spectronaut report. 9 | } 10 | \source{ 11 | Piazza, I., Beaton, N., Bruderer, R. et al. A machine learning-based chemoproteomic 12 | approach to identify drug targets and binding sites in complex proteomes. Nat Commun 11, 4200 13 | (2020). \doi{10.1038/s41467-020-18071-x} 14 | } 15 | \usage{ 16 | rapamycin_dose_response 17 | } 18 | \description{ 19 | Rapamycin example data used for the vignette about dose response data. The data was obtained 20 | from \href{https://www.nature.com/articles/s41467-020-18071-x}{Piazza 2020} and corresponds 21 | to experiment 18. FKBP1A the rapamycin binding protein and 39 other randomly sampled proteins 22 | were used for this example dataset. The concentration range includes the following points: 23 | 0 (DMSO control), 10 pM, 100 pM, 1 nM, 10 nM, 100 nM, 1 uM, 10 uM and 100 uM. 
24 | } 25 | \keyword{datasets} 26 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2021 ETH Zurich, Jan-Philipp Quast, Dina Schuster 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /man/fetch_chebi.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fetch_chebi.R 3 | \name{fetch_chebi} 4 | \alias{fetch_chebi} 5 | \title{Fetch ChEBI database information} 6 | \usage{ 7 | fetch_chebi(relation = FALSE, stars = c(3), timeout = 60) 8 | } 9 | \arguments{ 10 | \item{relation}{a logical value that indicates if ChEBI Ontology data will be returned instead 11 | the main compound data. This data can be used to check the relations of ChEBI ID's to each other. 12 | Default is FALSE.} 13 | 14 | \item{stars}{a numeric vector indicating the "star" level (confidence) for which entries should 15 | be retrieved (Possible levels are 1, 2 and 3). Default is \code{c(3)} retrieving only "3-star" 16 | entries, which are manually annotated by the ChEBI curator team.} 17 | 18 | \item{timeout}{a numeric value specifying the time in seconds until the download of an organism 19 | archive times out. The default is 60 seconds.} 20 | } 21 | \value{ 22 | A data frame that contains information about each molecule in the ChEBI database. 23 | } 24 | \description{ 25 | Fetches information from the ChEBI database. 26 | } 27 | \examples{ 28 | \donttest{ 29 | chebi <- fetch_chebi() 30 | 31 | head(chebi) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /data-raw/rapamycin_10uM.R: -------------------------------------------------------------------------------- 1 | # library(tidyverse) 2 | # library(protti) 3 | # 4 | # set.seed(1234) 5 | # 6 | # # Source: Piazza, I., Beaton, N., Bruderer, R. et al. A machine learning-based chemoproteomic approach to identify drug targets and binding sites in complex proteomes. Nat Commun 11, 4200 (2020). 
https://doi.org/10.1038/s41467-020-18071-x 7 | # 8 | # rapa <- read_protti("rapamycin_dose_response.csv") 9 | # 10 | # # filter to only retain DMSO control and 10 uM concentration 11 | # 12 | # rapa_filtered <- rapa %>% 13 | # distinct(r_file_name, r_condition, pep_stripped_sequence, eg_precursor_id, pg_protein_accessions, fg_quantity, pep_is_proteotypic, eg_is_decoy) %>% 14 | # filter(r_condition == 0 | r_condition == 7) %>% 15 | # mutate(r_condition = ifelse(r_condition == 0, "control", "rapamycin")) %>% 16 | # mutate(r_file_name = paste0(r_condition, "_", str_sub(r_file_name, start = 35, end = 36))) 17 | # 18 | # all_proteins <- unique(rapa_filtered$pg_protein_accessions) 19 | # 20 | # all_proteins_wo_FKBP1A <- all_proteins[all_proteins != "P62942"] 21 | # 22 | # sampled_bg <- sample(all_proteins_wo_FKBP1A, size = 49) 23 | # 24 | # rapamycin_10uM <- rapa_filtered %>% 25 | # filter(pg_protein_accessions %in% c(sampled_bg, "P62942")) 26 | # 27 | # usethis::use_data(rapamycin_10uM, overwrite = TRUE) 28 | -------------------------------------------------------------------------------- /man/calculate_sequence_coverage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculate_sequence_coverage.R 3 | \name{calculate_sequence_coverage} 4 | \alias{calculate_sequence_coverage} 5 | \title{Protein sequence coverage} 6 | \usage{ 7 | calculate_sequence_coverage(data, protein_sequence, peptides) 8 | } 9 | \arguments{ 10 | \item{data}{a data frame containing at least the protein sequence and the identified peptides 11 | as columns.} 12 | 13 | \item{protein_sequence}{a character column in the \code{data} data frame that contains protein 14 | sequences. 
Can be obtained by using the function \code{fetch_uniprot()}} 15 | 16 | \item{peptides}{a character column in the \code{data} data frame that contains the identified 17 | peptides.} 18 | } 19 | \value{ 20 | A new column in the \code{data} data frame containing the calculated sequence coverage 21 | for each identified protein 22 | } 23 | \description{ 24 | Calculate sequence coverage for each identified protein. 25 | } 26 | \examples{ 27 | data <- data.frame( 28 | protein_sequence = c("abcdefghijklmnop", "abcdefghijklmnop"), 29 | pep_stripped_sequence = c("abc", "jklmn") 30 | ) 31 | 32 | calculate_sequence_coverage( 33 | data, 34 | protein_sequence = protein_sequence, 35 | peptides = pep_stripped_sequence 36 | ) 37 | } 38 | -------------------------------------------------------------------------------- /man/find_chebis.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/find_chebis.R 3 | \name{find_chebis} 4 | \alias{find_chebis} 5 | \title{Find ChEBI IDs for name patterns} 6 | \usage{ 7 | find_chebis(chebi_data, pattern) 8 | } 9 | \arguments{ 10 | \item{chebi_data}{a data frame that contains at least information on ChEBI IDs (id) and their 11 | names (name). This data frame can be obtained by calling \code{fetch_chebi()}. Ideally this 12 | should be subsetted to only contain molecules of a specific type e.g. metals. This can be 13 | achieved by calling \code{find_all_subs} with a general ID such as "25213" (Metal cation) and 14 | then subset the complete ChEBI database to only include the returned sub-IDs. Using a subsetted 15 | database ensures better search results. This is a helper function for other functions.} 16 | 17 | \item{pattern}{a character vector that contains names or name patterns of molecules. 
Name 18 | patterns can be for example obtained with the \code{split_metal_name} function.} 19 | } 20 | \value{ 21 | A list of character vectors containing ChEBI IDs that have a name matching the supplied 22 | pattern. It contains one element per pattern. 23 | } 24 | \description{ 25 | Search for chebi IDs that match a specific name pattern. A list of corresponding ChEBI IDs is 26 | returned. 27 | } 28 | -------------------------------------------------------------------------------- /man/find_peptide.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/find_peptide.R 3 | \name{find_peptide} 4 | \alias{find_peptide} 5 | \title{Find peptide location} 6 | \usage{ 7 | find_peptide(data, protein_sequence, peptide_sequence) 8 | } 9 | \arguments{ 10 | \item{data}{a data frame that contains at least the protein and peptide sequence.} 11 | 12 | \item{protein_sequence}{a character column in the \code{data} data frame that contains the 13 | protein sequence.} 14 | 15 | \item{peptide_sequence}{a character column in the \code{data} data frame that contains the 16 | peptide sequence.} 17 | } 18 | \value{ 19 | A data frame that contains the input data and four additional columns with peptide 20 | start and end position, the last amino acid and the amino acid before the peptide. 21 | } 22 | \description{ 23 | The position of the given peptide sequence is searched within the given protein sequence. In 24 | addition the last amino acid of the peptide and the amino acid right before are reported. 
25 | } 26 | \examples{ 27 | # Create example data 28 | data <- data.frame( 29 | protein_sequence = c("abcdefg"), 30 | peptide_sequence = c("cde") 31 | ) 32 | 33 | # Find peptide 34 | find_peptide( 35 | data = data, 36 | protein_sequence = protein_sequence, 37 | peptide_sequence = peptide_sequence 38 | ) 39 | } 40 | -------------------------------------------------------------------------------- /man/drc_4p.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/drc_4p.R 3 | \name{drc_4p} 4 | \alias{drc_4p} 5 | \title{Dose response curve helper function} 6 | \usage{ 7 | drc_4p(data, response, dose, log_logarithmic = TRUE, pb = NULL) 8 | } 9 | \arguments{ 10 | \item{data}{a data frame that contains at least the dose and response column the model should 11 | be fitted to.} 12 | 13 | \item{response}{a numeric column that contains the response values.} 14 | 15 | \item{dose}{a numeric column that contains the dose values.} 16 | 17 | \item{log_logarithmic}{a logical value indicating if a logarithmic or log-logarithmic model is 18 | fitted. If response values form a symmetric curve for non-log transformed dose values, a 19 | logarithmic model instead of a log-logarithmic model should be used. Usually biological dose 20 | response data has a log-logarithmic distribution, which is the reason this is the default. 21 | Log-logarithmic models are symmetric if dose values are log transformed.} 22 | 23 | \item{pb}{progress bar object. This is only necessary if the function is used in an iteration.} 24 | } 25 | \value{ 26 | An object of class \code{drc}. If no fit was performed a character vector with content 27 | "no_fit". 28 | } 29 | \description{ 30 | This function performs the four-parameter dose response curve fit. It is the helper function 31 | for the fit in the \code{fit_drc_4p} function. 
32 | } 33 | -------------------------------------------------------------------------------- /man/ttest_protti.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ttest_protti.R 3 | \name{ttest_protti} 4 | \alias{ttest_protti} 5 | \title{Perform Welch's t-test} 6 | \usage{ 7 | ttest_protti(mean1, mean2, sd1, sd2, n1, n2, log_values = TRUE) 8 | } 9 | \arguments{ 10 | \item{mean1}{a numeric vector that contains the means of group1.} 11 | 12 | \item{mean2}{a numeric vector that contains the means of group2.} 13 | 14 | \item{sd1}{a numeric vector that contains the standard deviations of group1.} 15 | 16 | \item{sd2}{a numeric vector that contains the standard deviations of group2.} 17 | 18 | \item{n1}{a numeric vector that contains the number of replicates used for the calculation of 19 | each mean and standard deviation of group1.} 20 | 21 | \item{n2}{a numeric vector that contains the number of replicates used for the calculation of 22 | each mean and standard deviation of group2.} 23 | 24 | \item{log_values}{a logical value that indicates if values are log transformed. This determines 25 | how fold changes are calculated. Default is \code{log_values = TRUE}.} 26 | } 27 | \value{ 28 | A data frame that contains the calculated differences of means, standard error, t 29 | statistic and p-values. 30 | } 31 | \description{ 32 | Performs a Welch's t-test and calculates p-values between two groups. 
33 | } 34 | \examples{ 35 | ttest_protti( 36 | mean1 = 10, 37 | mean2 = 15.5, 38 | sd1 = 1, 39 | sd2 = 0.5, 40 | n1 = 3, 41 | n2 = 3 42 | ) 43 | } 44 | -------------------------------------------------------------------------------- /man/normalise.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/normalise.R 3 | \name{normalise} 4 | \alias{normalise} 5 | \title{Intensity normalisation} 6 | \usage{ 7 | normalise(data, sample, intensity_log2, method = "median") 8 | } 9 | \arguments{ 10 | \item{data}{a data frame containing at least sample names and intensity values. Please note that if the 11 | data frame is grouped, the normalisation will be computed by group.} 12 | 13 | \item{sample}{a character column in the \code{data} data frame that contains the sample names.} 14 | 15 | \item{intensity_log2}{a numeric column in the \code{data} data frame that contains the log2 transformed 16 | intensity values to be normalised.} 17 | 18 | \item{method}{a character value specifying the method to be used for normalisation. Default 19 | is "median".} 20 | } 21 | \value{ 22 | A data frame with a column called \code{normalised_intensity_log2} containing the 23 | normalised intensity values. 24 | } 25 | \description{ 26 | Performs normalisation on intensities. For median normalisation the normalised intensity is the 27 | original intensity minus the run median plus the global median. This is also the way it is 28 | implemented in the Spectronaut search engine. 
# Script that creates the `rapamycin_dose_response` example data set.
# It reads the full Spectronaut report, keeps the columns needed for the example,
# and retains FKBP1A (P62942) plus 39 randomly sampled background proteins.

# The verbs used below (%>%, distinct, mutate, case_when, filter, str_sub) come from the
# tidyverse and read_protti() comes from protti, so both packages have to be attached.
library(tidyverse)
library(protti)

# Fix the random seed so that the background proteins drawn by sample() are reproducible.
set.seed(123)

# Source: Piazza, I., Beaton, N., Bruderer, R. et al. A machine learning-based chemoproteomic approach to identify drug targets and binding sites in complex proteomes. Nat Commun 11, 4200 (2020). https://doi.org/10.1038/s41467-020-18071-x

rapa <- read_protti("rapamycin_dose_response.csv")

# Filter to only contain necessary columns. Simplify file names. Annotate conditions with concentrations in pM.

rapa_filtered <- rapa %>%
  distinct(r_file_name, r_condition, eg_precursor_id, pg_protein_accessions, fg_quantity, pep_is_proteotypic, eg_is_decoy) %>%
  mutate(r_file_name = paste0("sample_", str_sub(r_file_name, start = 35, end = 36))) %>%
  mutate(r_condition = case_when(
    r_condition == 0 ~ 0,
    r_condition == 1 ~ 10,
    r_condition == 2 ~ 100,
    r_condition == 3 ~ 1000,
    r_condition == 4 ~ 10000,
    r_condition == 5 ~ 100000,
    r_condition == 6 ~ 1000000,
    r_condition == 7 ~ 10000000,
    r_condition == 8 ~ 100000000
  ))

all_proteins <- unique(rapa_filtered$pg_protein_accessions)

# P62942 is excluded from the sampling pool and added back explicitly below, so it is
# always part of the final data set.
all_proteins_wo_FKBP1A <- all_proteins[all_proteins != "P62942"]

sampled_bg <- sample(all_proteins_wo_FKBP1A, size = 39)

rapamycin_dose_response <- rapa_filtered %>%
  filter(pg_protein_accessions %in% c(sampled_bg, "P62942"))

usethis::use_data(rapamycin_dose_response, overwrite = TRUE)
#' Scaling a vector
#'
#' \code{scale_protti} is used to scale a numeric vector either between 0 and 1 or around a
#' centered value using the standard deviation. If a vector containing only one value or
#' repeatedly the same value is provided, 1 is returned as the scaled value for \code{method = "01"}
#' and 0 is returned for \code{method = "center"}.
#'
#' @param x a numeric vector
#' @param method a character value that specifies the method to be used for scaling. "01" scales
#' the vector between 0 and 1. "center" scales the vector equal to \code{base::scale} around a
#' center. This is done by subtracting the mean from every value and then dividing them by the
#' standard deviation.
#'
#' @return A scaled numeric vector.
#' @export
#'
#' @examples
#' scale_protti(c(1, 2, 1, 4, 6, 8), method = "01")
scale_protti <- function(x, method) {
  if (!is.numeric(x)) {
    stop("x is a ", typeof(x), " vector but needs to be a numeric vector!")
  }
  # Fail early with a clear message instead of the "object 'result' not found" error that an
  # unsupported method would otherwise trigger at the end of the function.
  if (!is.character(method) || length(method) != 1 || !method %in% c("01", "center")) {
    stop("method needs to be either \"01\" or \"center\"!")
  }

  if (method == "01") {
    lower <- min(x, na.rm = TRUE)
    value_range <- max(x, na.rm = TRUE) - lower

    if (!is.na(value_range) && value_range == 0) {
      # All non-missing values are identical: division by zero is avoided by returning 1
      # for every element.
      result <- rep(1, length(x))
    } else {
      result <- (x - lower) / value_range
    }
  } else {
    spread <- stats::sd(x, na.rm = TRUE)

    # sd() is NA when fewer than two non-missing values are available (e.g. a single value).
    # This case is treated like a constant vector, for which 0 is returned for every element.
    if (is.na(spread) || spread == 0) {
      result <- rep(0, length(x))
    } else {
      result <- (x - mean(x, na.rm = TRUE)) / spread
    }
  }
  result
}
#' Replace identified positions in protein sequence by "x"
#'
#' Helper function for the calculation of sequence coverage, replaces identified positions with an
#' "x" within the protein sequence.
#'
#' @param sequence a character value that contains the protein sequence. If a vector is provided
#' it needs to contain exactly one unique sequence.
#' @param positions_start a numeric vector of start positions of the identified peptides.
#' @param positions_end a numeric vector of end positions of the identified peptides.
#'
#' @return A character vector that contains the modified protein sequence with each identified
#' position replaced by "x". \code{NA} is returned if the sequence is empty or missing. If no
#' complete pair of start and end positions is provided, the sequence is returned unchanged.
#' @importFrom purrr map2
#' @importFrom stringr str_sub
replace_identified_by_x <-
  function(sequence, positions_start, positions_end) {
    # The @importFrom purrr tag above is kept on purpose so the generated NAMESPACE does not
    # change, even though the replacement below no longer needs purrr.
    sequence <- unique(sequence)
    if (length(sequence) != 1) {
      stop("sequence needs to contain exactly one unique protein sequence!")
    }
    if (is.na(sequence) || sequence == "") {
      return(NA)
    }

    # Only keep pairs for which both the start and the end position are known.
    complete_pair <- !is.na(positions_start) & !is.na(positions_end)
    positions_start <- positions_start[complete_pair]
    positions_end <- positions_end[complete_pair]

    # Replace one identified stretch after the other. With no remaining pairs the loop does
    # not run and the unmodified sequence is returned.
    for (i in seq_along(positions_start)) {
      start <- positions_start[i]
      end <- positions_end[i]
      stringr::str_sub(sequence, start = start, end = end) <- strrep("x", end - start + 1)
    }
    sequence
  }
\title{Plot histogram of p-value distribution} 6 | \usage{ 7 | pval_distribution_plot(data, grouping, pval, facet_by = NULL) 8 | } 9 | \arguments{ 10 | \item{data}{a data frame that contains at least grouping identifiers (precursor, peptide or 11 | protein) and p-values derived from any statistical test.} 12 | 13 | \item{grouping}{a character column in the \code{data} data frame that contains either precursor, 14 | peptide or protein identifiers. For each entry in this column there should be one unique p-value. 15 | That means the statistical test that created the p-value should have been performed on the 16 | level of the content of this column.} 17 | 18 | \item{pval}{a numeric column in the \code{data} data frame that contains p-values.} 19 | 20 | \item{facet_by}{optional, a character column that contains information by which the data should 21 | be faceted into multiple plots.} 22 | } 23 | \value{ 24 | A histogram plot that shows the p-value distribution. 25 | } 26 | \description{ 27 | Plots the distribution of p-values derived from any statistical test as a histogram. 
28 | } 29 | \examples{ 30 | set.seed(123) # Makes example reproducible 31 | 32 | # Create example data 33 | data <- data.frame( 34 | peptide = paste0("peptide", 1:1000), 35 | pval = runif(n = 1000) 36 | ) 37 | 38 | # Plot p-values 39 | pval_distribution_plot( 40 | data = data, 41 | grouping = peptide, 42 | pval = pval 43 | ) 44 | } 45 | -------------------------------------------------------------------------------- /tests/testthat/test-queue_functions.R: -------------------------------------------------------------------------------- 1 | context("test-queue_functions") 2 | 3 | queue <- create_queue( 4 | date = c("200722"), 5 | instrument = c("EX1"), 6 | user = c("username"), 7 | measurement_type = c("DIA"), 8 | experiment_name = c("N01"), 9 | digestion = c("LiP", "tryptic control"), 10 | treatment_type_1 = c("EDTA", "H2O"), 11 | treatment_type_2 = c("Zeba", "unfiltered"), 12 | treatment_dose_1 = c(10, 30, 60), 13 | treatment_unit_1 = c("min"), 14 | n_replicates = 4, 15 | number_runs = FALSE, 16 | organism = c("E. 
coli"), 17 | exclude_combinations = list(list( 18 | treatment_type_1 = c("H2O"), 19 | treatment_type_2 = c("Zeba", "unfiltered"), 20 | treatment_dose_1 = c(10, 30) 21 | )), 22 | inj_vol = c(2), 23 | data_path = "D:\\2007_Data", 24 | method_path = "C:\\Xcalibur\\methods\\username\\DIA_120min_41var_AGC200", 25 | position_row = c("A", "B", "C", "D", "E", "F"), 26 | position_column = 8, 27 | blank_every_n = 4, 28 | blank_position = "1-V1", 29 | blank_method_path = "C:\\Xcalibur\\methods\\blank", 30 | export = FALSE 31 | ) 32 | 33 | test_that("create_queue works", { 34 | expect_is(queue, "data.frame") 35 | expect_equal(ncol(queue), 21) 36 | expect_equal(nrow(queue), 80) 37 | }) 38 | 39 | test_that("randomise_queue works", { 40 | set.seed(123) 41 | randomised_queue <- randomise_queue(data = queue, rows = 71:80) 42 | expect_is(randomised_queue, "data.frame") 43 | expect_equal(ncol(randomised_queue), 21) 44 | expect_equal(nrow(randomised_queue), 80) 45 | expect_equal(randomised_queue$Position[71:80], c("1-V1", "B8", "B5", "B3", "B6", "1-V1", "B7", "B4", "B1", "B2")) 46 | }) 47 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | - master 6 | 7 | name: pkgdown 8 | 9 | jobs: 10 | pkgdown: 11 | runs-on: macOS-latest 12 | env: 13 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 14 | TEST_PROTTI: true 15 | BUILD_VIGNETTE: true 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - uses: r-lib/actions/setup-r@v2 20 | 21 | - uses: r-lib/actions/setup-pandoc@v2 22 | 23 | - name: Query dependencies 24 | run: | 25 | install.packages('remotes') 26 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 27 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 28 | shell: Rscript {0} 29 | 30 | - name: Cache R packages 31 | 
uses: actions/cache@v2 32 | with: 33 | path: ${{ env.R_LIBS_USER }} 34 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 35 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 36 | 37 | - name: Install dependencies 38 | run: | 39 | remotes::install_deps(dependencies = TRUE) 40 | install.packages("pkgdown", type = "binary") 41 | shell: Rscript {0} 42 | 43 | - name: Install package 44 | run: R CMD INSTALL . 45 | 46 | - name: Deploy package 47 | run: | 48 | git config --local user.email "actions@github.com" 49 | git config --local user.name "GitHub Actions" 50 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' 51 | -------------------------------------------------------------------------------- /man/anova_protti.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/anova_protti.R 3 | \name{anova_protti} 4 | \alias{anova_protti} 5 | \title{Perform ANOVA} 6 | \usage{ 7 | anova_protti(data, grouping, condition, mean_ratio, sd, n) 8 | } 9 | \arguments{ 10 | \item{data}{a data frame containing at least the input variables.} 11 | 12 | \item{grouping}{a character column in the \code{data} data frame that contains precursor or 13 | peptide identifiers.} 14 | 15 | \item{condition}{a character or numeric column in the \code{data} data frame that contains the 16 | conditions.} 17 | 18 | \item{mean_ratio}{a numeric column in the \code{data} data frame that contains mean intensities 19 | or mean intensity ratios.} 20 | 21 | \item{sd}{a numeric column in the \code{data} data frame that contains the standard deviation 22 | corresponding to the mean.} 23 | 24 | \item{n}{a numeric column in the \code{data} data frame that contains the number of replicates 25 | for which the corresponding mean was calculated.} 26 | } 27 | \value{ 28 | a data frame that contains the within group error 
(\code{ms_group}) and the between 29 | group error (\code{ms_error}), f statistic and p-values. 30 | } 31 | \description{ 32 | Performs an ANOVA statistical test 33 | } 34 | \examples{ 35 | data <- data.frame( 36 | precursor = c("A", "A", "A", "B", "B", "B"), 37 | condition = c("C1", "C2", "C3", "C1", "C2", "C3"), 38 | mean = c(10, 12, 20, 11, 12, 8), 39 | sd = c(2, 1, 1.5, 1, 2, 4), 40 | n = c(4, 4, 4, 4, 4, 4) 41 | ) 42 | 43 | anova_protti( 44 | data, 45 | grouping = precursor, 46 | condition = condition, 47 | mean = mean, 48 | sd = sd, 49 | n = n 50 | ) 51 | } 52 | -------------------------------------------------------------------------------- /man/fetch_uniprot_proteome.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fetch_uniprot_proteome.R 3 | \name{fetch_uniprot_proteome} 4 | \alias{fetch_uniprot_proteome} 5 | \title{Fetch proteome data from UniProt} 6 | \usage{ 7 | fetch_uniprot_proteome( 8 | organism_id, 9 | columns = c("accession"), 10 | reviewed = TRUE, 11 | timeout = 120, 12 | max_tries = 5 13 | ) 14 | } 15 | \arguments{ 16 | \item{organism_id}{a numeric value that specifies the NCBI taxonomy identifier (TaxId) for an 17 | organism.} 18 | 19 | \item{columns}{a character vector of metadata columns that should be imported from UniProt (all 20 | possible columns can be found \href{https://www.uniprot.org/help/return_fields}{here}. For 21 | cross-referenced database provide the database name with the prefix "xref_", e.g. \code{"xref_pdb"}). 22 | Note: Not more than one or two columns should be selected otherwise the function will not be 23 | able to efficiently retrieve the information. 
If more information is needed, \code{fetch_uniprot()} 24 | can be used with the IDs retrieved by this function.} 25 | 26 | \item{reviewed}{a logical value that determines if only reviewed protein entries will be retrieved.} 27 | 28 | \item{timeout}{a numeric value specifying the time in seconds until the download times out. 29 | The default is 120 seconds.} 30 | 31 | \item{max_tries}{a numeric value that specifies the number of times the function tries to download 32 | the data in case an error occurs. The default is 5.} 33 | } 34 | \value{ 35 | A data frame that contains all protein metadata specified in \code{columns} for the 36 | organism of choice. 37 | } 38 | \description{ 39 | Fetches proteome data from UniProt for the provided organism ID. 40 | } 41 | \examples{ 42 | \donttest{ 43 | head(fetch_uniprot_proteome(9606)) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /man/fetch_mobidb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fetch_mobidb.R 3 | \name{fetch_mobidb} 4 | \alias{fetch_mobidb} 5 | \title{Fetch protein disorder and mobility information from MobiDB} 6 | \usage{ 7 | fetch_mobidb( 8 | uniprot_ids = NULL, 9 | organism_id = NULL, 10 | show_progress = TRUE, 11 | timeout = 60, 12 | max_tries = 2 13 | ) 14 | } 15 | \arguments{ 16 | \item{uniprot_ids}{optional, a character vector of UniProt identifiers for which information 17 | should be fetched. This argument is mutually exclusive to the \code{organism_id} argument.} 18 | 19 | \item{organism_id}{optional, a character value providing the NCBI taxonomy identifier of an organism 20 | (TaxId) for which all available information should be retrieved. This 21 | argument is mutually exclusive to the \code{uniprot_ids} argument.} 22 | 23 | \item{show_progress}{a logical value; if \code{TRUE} a progress bar will be shown. 
24 | Default is \code{TRUE}.} 25 | 26 | \item{timeout}{a numeric value specifying the time in seconds until the download of an organism 27 | archive times out. The default is 60 seconds.} 28 | 29 | \item{max_tries}{a numeric value that specifies the number of times the function tries to download 30 | the data in case an error occurs. The default is 2.} 31 | } 32 | \value{ 33 | A data frame that contains start and end positions for disordered and flexible protein 34 | regions. The \code{feature} column contains information on the source of this 35 | annotation. More information on the source can be found 36 | \href{https://mobidb.org/about/mobidb}{here}. 37 | } 38 | \description{ 39 | Fetches information about disordered and flexible protein regions from MobiDB. 40 | } 41 | \examples{ 42 | \donttest{ 43 | fetch_mobidb( 44 | uniprot_ids = c("P0A799", "P62707") 45 | ) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /man/assign_peptide_type.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/assign_peptide_type.R 3 | \name{assign_peptide_type} 4 | \alias{assign_peptide_type} 5 | \title{Assign peptide type} 6 | \usage{ 7 | assign_peptide_type( 8 | data, 9 | aa_before = aa_before, 10 | last_aa = last_aa, 11 | aa_after = aa_after 12 | ) 13 | } 14 | \arguments{ 15 | \item{data}{a data frame containing at least information about the preceding and C-terminal 16 | amino acids of peptides.} 17 | 18 | \item{aa_before}{a character column in the \code{data} data frame that contains the preceding amino 19 | acid as one letter code.} 20 | 21 | \item{last_aa}{a character column in the \code{data} data frame that contains the C-terminal amino 22 | acid as one letter code.} 23 | 24 | \item{aa_after}{a character column in the \code{data} data frame that contains the following amino 25 | acid as one letter code.} 26 | 
} 27 | \value{ 28 | A data frame that contains the input data and an additional column with the peptide 29 | type information. 30 | } 31 | \description{ 32 | Based on preceding and C-terminal amino acid, the peptide type of a given peptide is assigned. 33 | Peptides with preceeding and C-terminal lysine or arginine are considered fully-tryptic. If a 34 | peptide is located at the N- or C-terminus of a protein and fulfills the criterium to be 35 | fully-tryptic otherwise, it is also considered as fully-tryptic. Peptides that only fulfill the 36 | criterium on one terminus are semi-tryptic peptides. Lastly, peptides that are not fulfilling 37 | the criteria for both termini are non-tryptic peptides. 38 | } 39 | \examples{ 40 | data <- data.frame( 41 | aa_before = c("K", "S", "T"), 42 | last_aa = c("R", "K", "Y"), 43 | aa_after = c("T", "R", "T") 44 | ) 45 | 46 | assign_peptide_type(data, aa_before, last_aa, aa_after) 47 | } 48 | -------------------------------------------------------------------------------- /man/ptsi_pgk.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{ptsi_pgk} 5 | \alias{ptsi_pgk} 6 | \title{Structural analysis example data} 7 | \format{ 8 | A data frame containing differential abundances and adjusted p-values for 9 | peptides/precursors of two proteins. 10 | } 11 | \source{ 12 | Cappelletti V, Hauser T, Piazza I, Pepelnjak M, Malinovska L, Fuhrer T, Li Y, Dörig C, 13 | Boersema P, Gillet L, Grossbach J, Dugourd A, Saez-Rodriguez J, Beyer A, Zamboni N, Caflisch A, 14 | de Souza N, Picotti P. Dynamic 3D proteomes reveal protein functional alterations at high 15 | resolution in situ. Cell. 2021 Jan 21;184(2):545-559.e22. \doi{10.1016/j.cell.2020.12.021}. 16 | Epub 2020 Dec 23. PMID: 33357446; PMCID: PMC7836100. 
17 | } 18 | \usage{ 19 | ptsi_pgk 20 | } 21 | \description{ 22 | Example data used for the vignette about structural analysis. The data was obtained from 23 | Cappelletti et al. 2021 (\doi{10.1016/j.cell.2020.12.021}) 24 | and corresponds to two separate experiments. Both experiments were limited proteolysis coupled to 25 | mass spectrometry (LiP-MS) experiments conducted on purified proteins. The first protein is 26 | phosphoglycerate kinase 1 (pgk) and it was treated with 25mM 3-phosphoglyceric acid (3PG). 27 | The second protein is phosphoenolpyruvate-protein phosphotransferase (ptsI) and it was treated 28 | with 25mM fructose 1,6-bisphosphate (FBP). From both experiments only peptides belonging to 29 | either protein were used for this data set. The ptsI data set contains precursor level data 30 | while the pgk data set contains peptide level data. The pgk data can be obtained from 31 | supplementary table 3 from the tab named "pgk+3PG". The ptsI data is only included as raw data 32 | and was analysed using the functions of this package. 33 | } 34 | \keyword{datasets} 35 | -------------------------------------------------------------------------------- /man/try_query.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/try_query.R 3 | \name{try_query} 4 | \alias{try_query} 5 | \title{Query from URL} 6 | \usage{ 7 | try_query( 8 | url, 9 | max_tries = 5, 10 | silent = TRUE, 11 | type = "text/tab-separated-values", 12 | timeout = 60, 13 | accept = NULL, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{url}{a character value of an URL to the website that contains the table that should be 19 | downloaded.} 20 | 21 | \item{max_tries}{a numeric value that specifies the number of times the function tries to download 22 | the data in case an error occurs. 
Default is 5.} 23 | 24 | \item{silent}{a logical value that specifies if individual messages are printed after each try 25 | that failed.} 26 | 27 | \item{type}{a character value that specifies the type of data at the target URL. Options are 28 | all options that can be supplied to httr::content, these include e.g. 29 | "text/tab-separated-values", "application/json" and "txt/csv". Default is "text/tab-separated-values".} 30 | 31 | \item{timeout}{a numeric value that specifies the maximum request time. Default is 60 seconds.} 32 | 33 | \item{accept}{a character value that specifies the type of data that should be sent by the API if 34 | it uses content negotiation. The default is NULL and it should only be set for APIs that use 35 | content negotiation.} 36 | 37 | \item{...}{other parameters supplied to the parsing function used by httr::content.} 38 | } 39 | \value{ 40 | A data frame that contains the table from the url. 41 | } 42 | \description{ 43 | Downloads data table from URL. If an error occurs during the query (for example due to no 44 | connection) the function waits 3 seconds and tries again. If no result could be obtained 45 | after the given number of tries a message indicating the problem is returned. 46 | } 47 | -------------------------------------------------------------------------------- /man/find_all_subs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/find_all_subs.R 3 | \name{find_all_subs} 4 | \alias{find_all_subs} 5 | \title{Find all sub IDs of an ID in a network} 6 | \usage{ 7 | find_all_subs( 8 | data, 9 | ids, 10 | main_id = id, 11 | type = type, 12 | accepted_types = "is_a", 13 | exclude_parent_id = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{data}{a data frame that contains relational information on IDs (main_id) their sub 18 | IDs (sub_id) and their relationship (type). 
For ChEBI this data frame can be obtained by calling 19 | \code{fetch_chebi(relation = TRUE)}. For ECO data it can be obtained by calling fetch_eco(relation = TRUE).} 20 | 21 | \item{ids}{a character vector of IDs for which sub IDs should be searched.} 22 | 23 | \item{main_id}{a character or integer column containing IDs. Default is \code{id} for ChEBI IDs.} 24 | 25 | \item{type}{a character column that contains the type of interactions. Default is \code{type} for ChEBI IDs.} 26 | 27 | \item{accepted_types}{a character vector containing the accepted_types of relationships that should be considered 28 | for the search. It is possible to use "all" relationships. The default type is "is_a". A list of 29 | possible relationships for e.g. ChEBI IDs can be found 30 | \href{https://docs.google.com/document/d/1_w-DwBdCCOh1gMeeP6yqGzcnkpbHYOa3AGSODe5epcg/edit#heading=h.hnsqoqu978s5}{here}.} 31 | 32 | \item{exclude_parent_id}{a logical value that specifies if the parent ID should be included in 33 | the returned list.} 34 | } 35 | \value{ 36 | A list of character vectors containing the provided ID and all of its sub IDs. It 37 | contains one element per input ID. 38 | } 39 | \description{ 40 | For a given ID, find all sub IDs and their sub IDs etc. The type of 41 | relationship can be selected too. This is a helper function for other functions. 
42 | } 43 | -------------------------------------------------------------------------------- /man/qc_sequence_coverage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qc_sequence_coverage.R 3 | \name{qc_sequence_coverage} 4 | \alias{qc_sequence_coverage} 5 | \title{Protein coverage distribution} 6 | \usage{ 7 | qc_sequence_coverage( 8 | data, 9 | protein_identifier, 10 | coverage, 11 | sample = NULL, 12 | interactive = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{data}{a data frame that contains at least the input variables.} 17 | 18 | \item{protein_identifier}{a character column in the \code{data} data frame that contains protein 19 | identifiers.} 20 | 21 | \item{coverage}{a numeric column in the \code{data} data frame that contains protein coverage 22 | in percent. This information can be obtained using the \code{\link{sequence_coverage}} function.} 23 | 24 | \item{sample}{optional, a character or factor column in the \code{data} data frame that contains sample names. 25 | Please only provide this argument if you want to facet the distribution plot by sample 26 | otherwise do not provide this argument.} 27 | 28 | \item{interactive}{a logical value that specifies whether the plot should be interactive 29 | (default is FALSE).} 30 | } 31 | \value{ 32 | A protein coverage histogram with 5 percent binwidth. The vertical dotted line 33 | indicates the median. 34 | } 35 | \description{ 36 | Plots the distribution of protein coverages in a histogram. 
37 | } 38 | \examples{ 39 | set.seed(123) # Makes example reproducible 40 | 41 | # Create example data 42 | data <- create_synthetic_data( 43 | n_proteins = 100, 44 | frac_change = 0.05, 45 | n_replicates = 3, 46 | n_conditions = 2, 47 | method = "effect_random" 48 | ) 49 | 50 | # Plot sequence coverage 51 | qc_sequence_coverage( 52 | data = data, 53 | protein_identifier = protein, 54 | coverage = coverage 55 | ) 56 | } 57 | \seealso{ 58 | \code{\link{sequence_coverage}} 59 | } 60 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: 8 | - '*' 9 | 10 | name: R-CMD-check 11 | 12 | jobs: 13 | R-CMD-check: 14 | runs-on: ${{ matrix.config.os }} 15 | 16 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 17 | 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | config: 22 | - {os: macos-latest, r: 'release'} 23 | - {os: windows-latest, r: 'release'} 24 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 25 | - {os: ubuntu-latest, r: 'release'} 26 | - {os: ubuntu-latest, r: 'oldrel-1'} 27 | 28 | env: 29 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 30 | R_KEEP_PKG_SOURCE: yes 31 | TEST_PROTTI: true 32 | BUILD_VIGNETTE: true 33 | 34 | steps: 35 | - uses: actions/checkout@v3 36 | 37 | - uses: r-lib/actions/setup-pandoc@v2 38 | 39 | - uses: r-lib/actions/setup-r@v2 40 | with: 41 | r-version: ${{ matrix.config.r }} 42 | http-user-agent: ${{ matrix.config.http-user-agent }} 43 | use-public-rspm: true 44 | 45 | - uses: r-lib/actions/setup-r-dependencies@v2 46 | with: 47 | extra-packages: 48 | any::rcmdcheck 49 | needs: check 50 | 51 | # run: | 52 | # - name: 
Install remotes and lme4 package 53 | # Rscript -e 'install.packages("remotes", lib=Sys.getenv("R_LIB_FOR_PAK"))' 54 | # Rscript -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always")' 55 | 56 | - uses: r-lib/actions/check-r-package@v2 57 | with: 58 | upload-snapshots: true 59 | -------------------------------------------------------------------------------- /R/ttest_protti.R: -------------------------------------------------------------------------------- 1 | #' Perform Welch's t-test 2 | #' 3 | #' Performs a Welch's t-test and calculates p-values between two groups. 4 | #' 5 | #' @param mean1 a numeric vector that contains the means of group1. 6 | #' @param mean2 a numeric vector that contains the means of group2. 7 | #' @param sd1 a numeric vector that contains the standard deviations of group1. 8 | #' @param sd2 a numeric vector that contains the standard deviations of group2. 9 | #' @param n1 a numeric vector that contains the number of replicates used for the calculation of 10 | #' each mean and standard deviation of group1. 11 | #' @param n2 a numeric vector that contains the number of replicates used for the calculation of 12 | #' each mean and standard deviation of group2. 13 | #' @param log_values a logical value that indicates if values are log transformed. This determines 14 | #' how fold changes are calculated. Default is \code{log_values = TRUE}. 15 | #' 16 | #' @return A data frame that contains the calculated differences of means (ratios of means if 17 | #' \code{log_values = FALSE}), standard error, t statistic and p-values. 
18 | #' @importFrom stats pt 19 | #' @export 20 | #' 21 | #' @examples 22 | #' ttest_protti( 23 | #' mean1 = 10, 24 | #' mean2 = 15.5, 25 | #' sd1 = 1, 26 | #' sd2 = 0.5, 27 | #' n1 = 3, 28 | #' n2 = 3 29 | #' ) 30 | ttest_protti <- function(mean1, mean2, sd1, sd2, n1, n2, log_values = TRUE) { 31 | std_error <- sqrt((sd1^2 / n1) + (sd2^2 / n2)) 32 | # Welch-Satterthwaite equation to estimate the degrees of freedom 33 | deg_freedom <- ((sd1^2 / n1) + (sd2^2 / n2))^2 / (sd1^4 / (n1^2 * (n1 - 1)) + sd2^4 / (n2^2 * (n2 - 1))) 34 | # fold change calculation: difference for log transformed values, ratio otherwise 35 | if (log_values == TRUE) { 36 | diff <- mean1 - mean2 37 | } else { 38 | diff <- mean1 / mean2 39 | } 40 | # The t statistic is always based on the difference of the means. The reported fold change 41 | # (a ratio for non-log values) must not be used here. 42 | t_statistic <- (mean1 - mean2) / std_error 43 | result <- data.frame(cbind(diff, std_error, t_statistic, 2 * pt(-abs(t_statistic), deg_freedom))) 44 | colnames(result) <- c("diff", "std_error", "t_statistic", "pval") 45 | return(result) 46 | } 47 | -------------------------------------------------------------------------------- /data-raw/ptsi_pgk.R: -------------------------------------------------------------------------------- 1 | # library(tidyverse) 2 | # library(protti) 3 | # 4 | # # Source: Cappelletti V, Hauser T, Piazza I, Pepelnjak M, Malinovska L, Fuhrer T, Li Y, Dörig C, Boersema P, Gillet L, Grossbach J, Dugourd A, Saez-Rodriguez J, Beyer A, Zamboni N, Caflisch A, de Souza N, Picotti P. Dynamic 3D proteomes reveal protein functional alterations at high resolution in situ. Cell. 2021 Jan 21;184(2):545-559.e22. doi: 10.1016/j.cell.2020.12.021. Epub 2020 Dec 23. PMID: 33357446; PMCID: PMC7836100. 5 | # 6 | # # The pgk data set is from supplementary table 3, the tab is called "pgk+3PG". The data does not contain precursor level data since charge states are 7 | # # missing from peptides. 8 | # pgk <- read_protti("pgk.csv") 9 | # 10 | # # The ptsI data set is not part of the supplementary tables. The raw data is included in the PRIDE repository. 
We exported the Spectronaut report 11 | # # and analysed that data using prottis standard pipeline. 12 | # ptsi <- read_protti("ptsi.csv") 13 | # 14 | # # pgk data tidying 15 | # 16 | # pgk_tidy <- pgk %>% 17 | # filter(concentration == "25mM") %>% # filter to only retain the 25 mM concentration 18 | # rename(eg_precursor_id = peptide_sequence, 19 | # pg_protein_accessions = uniprot_id, 20 | # diff = log2fc, 21 | # adj_pval = qvalue) %>% 22 | # distinct(eg_precursor_id, 23 | # diff, 24 | # adj_pval, 25 | # pg_protein_accessions) %>% 26 | # mutate(pep_stripped_sequence = str_remove_all(eg_precursor_id, pattern = "(?<=\\[)[\\w\\(\\)\\s\\-]+(?=\\])")) %>% # removes "[Carbamidomethyl]" from peptides. 27 | # mutate(pep_stripped_sequence = str_remove_all(pep_stripped_sequence, pattern = "[\\[\\]]")) 28 | # 29 | # # ptsi data tidying 30 | # 31 | # ptsi_tidy <- ptsi %>% 32 | # rename(eg_precursor_id = precursor_id) 33 | # 34 | # # combining data 35 | # 36 | # ptsi_pgk <- pgk_tidy %>% 37 | # bind_rows(ptsi_tidy) 38 | # 39 | # usethis::use_data(ptsi_pgk, overwrite = TRUE) 40 | -------------------------------------------------------------------------------- /data-raw/protti_colours.R: -------------------------------------------------------------------------------- 1 | protti_colours <- c( 2 | "#5680C1", 3 | "#B96DAD", 4 | "#64CACA", 5 | "#81ABE9", 6 | "#F6B8D1", 7 | "#99F1E4", 8 | "#9AD1FF", 9 | "#548BDF", 10 | "#A55098", 11 | "#3EB6B6", 12 | "#87AEE8", 13 | "#CA91C1", 14 | "#A4E0E0", 15 | "#1D4F9A", 16 | "#D7ACD2", 17 | "#49C1C1", 18 | "#00A2D9", 19 | "#6B77BF", 20 | "#00C2D4", 21 | "#816DB8", 22 | "#00DCB5", 23 | "#9561AD", 24 | "#95EF8C", 25 | "#A6549C", 26 | "#F9F871", 27 | "#B44688", 28 | "#65D8C2", 29 | "#40B4D5", 30 | "#7AE4B2", 31 | "#529AD4", 32 | "#9DEE9C", 33 | "#7B7BC0", 34 | "#C8F585", 35 | "#995997", 36 | "#7368B8", 37 | "#A03960", 38 | "#DA5D8C", 39 | "#077AC1", 40 | "#C793BD", 41 | "#0086B3", 42 | "#FFE6FF", 43 | "#00C897", 44 | "#B8A6B4", 45 | "#8292B3", 46 | 
"#B38DAC", 47 | "#9CCDCD", 48 | "#A7B6D2", 49 | "#E4CBD4", 50 | "#C8EDE7", 51 | "#C1D5E9", 52 | "#899BC4", 53 | "#A6739D", 54 | "#76BFBF", 55 | "#ABB9D3", 56 | "#C3A9BE", 57 | "#C7E0E0", 58 | "#4667AC", 59 | "#D0BECE", 60 | "#87C7C7", 61 | "#3BB1E7", 62 | "#888CAF", 63 | "#12CEE1", 64 | "#8F87AB", 65 | "#12E6BD", 66 | "#9980A7", 67 | "#C2EABF", 68 | "#A5779F", 69 | "#F8F7BB", 70 | "#AF7092", 71 | "#A2D8CC", 72 | "#85BCD1", 73 | "#B4E1C9", 74 | "#8BA6C5", 75 | "#C7E9C7", 76 | "#9191B1", 77 | "#DBF2C1", 78 | "#9E789C", 79 | "#8682A9", 80 | "#AA5C76", 81 | "#C4899B", 82 | "#428DD1", 83 | "#C1AABC", 84 | "#039ACD", 85 | "#F7EDF7", 86 | "#02D5A1", 87 | "#BDB5BB", 88 | "#516C9A", 89 | "#9B5C91", 90 | "#4BAAAA", 91 | "#6F8FC0", 92 | "#D397B0", 93 | "#7BCABE", 94 | "#7EAFD7", 95 | "#4C75B8", 96 | "#844A7B", 97 | "#3E9898", 98 | "#7492C0", 99 | "#A97AA1", 100 | "#87BBBB", 101 | "#1E4381" 102 | ) 103 | 104 | usethis::use_data(protti_colours, overwrite = TRUE) 105 | -------------------------------------------------------------------------------- /R/fetch_go.R: -------------------------------------------------------------------------------- 1 | #' Fetch gene ontology information from geneontology.org 2 | #' 3 | #' Fetches gene ontology data from geneontology.org for the provided organism ID. 4 | #' 5 | #' @param organism_id a character value NCBI taxonomy identifier of an organism (TaxId). 6 | #' Possible inputs inlude only: "9606" (Human), "559292" (Yeast) and "83333" (E. coli). 7 | #' 8 | #' @return A data frame that contains gene ontology mappings to UniProt or SGD IDs. The original 9 | #' file is a .GAF file. 
A detailed description of all columns can be found here: 10 | #' http://geneontology.org/docs/go-annotation-file-gaf-format-2.1/ 11 | #' @export 12 | #' 13 | #' @examples 14 | #' \donttest{ 15 | #' go <- fetch_go("9606") 16 | #' 17 | #' head(go) 18 | #' } 19 | fetch_go <- function(organism_id) { 20 | if (!curl::has_internet()) { 21 | message("No internet connection.") 22 | return(invisible(NULL)) 23 | } 24 | 25 | # match.arg() only accepts character input, so numeric taxonomy IDs are converted first 26 | if (is.numeric(organism_id)) { 27 | organism_id <- as.character(organism_id) 28 | } 29 | organism_id <- match.arg(organism_id, c("9606", "559292", "83333")) 30 | 31 | organism_url <- switch(organism_id, 32 | "9606" = "http://current.geneontology.org/annotations/goa_human.gaf.gz", 33 | "559292" = "http://current.geneontology.org/annotations/sgd.gaf.gz", 34 | "83333" = "http://current.geneontology.org/annotations/ecocyc.gaf.gz" 35 | ) 36 | 37 | # Keep a handle on the connection so that it is closed when the function exits 38 | go_connection <- gzcon(url(organism_url)) 39 | on.exit(try(close(go_connection), silent = TRUE), add = TRUE) 40 | 41 | # Errors and warnings abort the download and are returned as condition objects 42 | go_download <- tryCatch(readLines(go_connection), 43 | error = function(e) e, 44 | warning = function(w) w 45 | ) 46 | if (inherits(go_download, "condition")) { 47 | message(conditionMessage(go_download)) 48 | return(invisible(NULL)) 49 | } 50 | 51 | # text = lets read.delim() create and close its own text connection 52 | go <- utils::read.delim( 53 | text = go_download, 54 | quote = "", 55 | stringsAsFactors = FALSE, 56 | comment.char = "!", 57 | header = FALSE 58 | ) 59 | colnames(go) <- c( 60 | "db", "db_id", "symbol", "qualifier", "go_id", "db_reference", 61 | "evidence", "with_from", "ontology", "name", "synonyme", 62 | "type", "taxon", "date", "assigned_by", "annotation_extension", 63 | "gene_product_form_id" 64 | ) 65 | return(go) 66 | } 67 | -------------------------------------------------------------------------------- /R/find_chebis.R: -------------------------------------------------------------------------------- 1 | #' Find ChEBI IDs for name patterns 2 | #' 3 | #' Search for chebi IDs that match a specific name pattern. A list of corresponding ChEBI IDs is 4 | #' returned. 5 | #' 6 | #' @param chebi_data a data frame that contains at least information on ChEBI IDs (id) and their 7 | #' names (name). This data frame can be obtained by calling \code{fetch_chebi()}. 
Ideally this 8 | #' should be subsetted to only contain molecules of a specific type e.g. metals. This can be 9 | #' achieved by calling \code{find_all_subs} with a general ID such as "25213" (Metal cation) and 10 | #' then subset the complete ChEBI database to only include the returned sub-IDs. Using a subsetted 11 | #' database ensures better search results. This is a helper function for other functions. 12 | #' @param pattern a character vector that contains names or name patterns of molecules. Name 13 | #' patterns can be for example obtained with the \code{split_metal_name} function. 14 | #' 15 | #' @return A list of character vectors containing ChEBI IDs that have a name matching the supplied 16 | #' pattern. It contains one element per pattern. 17 | #' @importFrom dplyr distinct 18 | #' @importFrom magrittr %>% 19 | #' @importFrom purrr map 20 | #' @importFrom stringr str_detect regex 21 | #' @importFrom rlang .data 22 | #' @importFrom stats na.omit 23 | find_chebis <- function(chebi_data, pattern) { 24 | # Each ID/name pair only needs to be searched once. 25 | data <- chebi_data %>% 26 | dplyr::distinct(.data$id, .data$name) 27 | 28 | purrr::map(pattern, function(x) { 29 | # Case-insensitive regular expression match of the pattern against all names. 30 | is_match <- stringr::str_detect(data$name, 31 | pattern = stringr::regex(x, ignore_case = TRUE) 32 | ) 33 | # which() skips comparisons that are NA (missing name or missing pattern). 34 | ids <- unique(as.character(data$id[which(is_match)])) 35 | # Missing and empty IDs are never returned. 36 | ids[!is.na(ids) & ids != ""] 37 | }) 38 | } 39 |
-------------------------------------------------------------------------------- /man/qc_median_intensities.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qc_median_intensities.R 3 | \name{qc_median_intensities} 4 | \alias{qc_median_intensities} 5 | \title{Median run intensities} 6 | \usage{ 7 | qc_median_intensities( 8 | data, 9 | sample, 10 | grouping, 11 | intensity, 12 | plot = TRUE, 13 | interactive = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{data}{a data frame that contains at least the input variables.} 18 | 19 | \item{sample}{a character or factor column in the \code{data} data frame that contains the sample name.} 20 | 21 | \item{grouping}{a character column in the \code{data} data frame that contains either precursor or 22 | peptide identifiers.} 23 | 24 | \item{intensity}{a numeric column in the \code{data} data frame that contains intensity values. 25 | The intensity should be ideally log2 transformed, but also non-transformed values can be used.} 26 | 27 | \item{plot}{a logical value that indicates whether the result should be plotted.} 28 | 29 | \item{interactive}{a logical value that specifies whether the plot should be interactive 30 | (default is FALSE).} 31 | } 32 | \value{ 33 | A plot that displays median intensity over all samples. If \code{plot = FALSE} a data 34 | frame containing median intensities is returned. 35 | } 36 | \description{ 37 | Median intensities per run are returned either as a plot or a table.
38 | } 39 | \examples{ 40 | set.seed(123) # Makes example reproducible 41 | 42 | # Create example data 43 | data <- create_synthetic_data( 44 | n_proteins = 100, 45 | frac_change = 0.05, 46 | n_replicates = 3, 47 | n_conditions = 2, 48 | method = "effect_random" 49 | ) 50 | 51 | # Calculate median intensities 52 | qc_median_intensities( 53 | data = data, 54 | sample = sample, 55 | grouping = peptide, 56 | intensity = peptide_intensity_missing, 57 | plot = FALSE 58 | ) 59 | 60 | # Plot median intensities 61 | qc_median_intensities( 62 | data = data, 63 | sample = sample, 64 | grouping = peptide, 65 | intensity = peptide_intensity_missing, 66 | plot = TRUE 67 | ) 68 | } 69 | -------------------------------------------------------------------------------- /revdep/README.md: -------------------------------------------------------------------------------- 1 | # Platform 2 | 3 | |field |value | 4 | |:--------|:------------------------------------------| 5 | |version |R version 4.3.1 (2023-06-16) | 6 | |os |macOS Sonoma 14.2.1 | 7 | |system |aarch64, darwin20 | 8 | |ui |RStudio | 9 | |language |(EN) | 10 | |collate |en_US.UTF-8 | 11 | |ctype |en_US.UTF-8 | 12 | |tz |Europe/Zurich | 13 | |date |2024-03-27 | 14 | |rstudio |2023.06.1+524 Mountain Hydrangea (desktop) | 15 | |pandoc |NA | 16 | 17 | # Dependencies 18 | 19 | |package |old |new |Δ | 20 | |:-----------|:-----|:------|:--| 21 | |protti |0.7.0 |0.8.0 |* | 22 | |bslib |NA |0.6.2 |* | 23 | |crosstalk |NA |1.2.1 |* | 24 | |curl |NA |5.2.1 |* | 25 | |data.table |NA |1.15.2 |* | 26 | |digest |NA |0.6.35 |* | 27 | |dplyr |NA |1.1.4 |* | 28 | |fontawesome |NA |0.5.2 |* | 29 | |ggplot2 |NA |3.5.0 |* | 30 | |ggrepel |NA |0.9.5 |* | 31 | |gtable |NA |0.3.4 |* | 32 | |htmltools |NA |0.5.8 |* | 33 | |htmlwidgets |NA |1.6.4 |* | 34 | |labeling |NA |0.4.3 |* | 35 | |later |NA |1.3.2 |* | 36 | |lubridate |NA |1.9.3 |* | 37 | |plotly |NA |4.10.4 |* | 38 | |R.oo |NA |1.26.0 |* | 39 | |R.utils |NA |2.12.3 |* | 40 | |Rcpp |NA |1.0.12 
|* | 41 | |readr |NA |2.1.5 |* | 42 | |rmarkdown |NA |2.26 |* | 43 | |sass |NA |0.4.9 |* | 44 | |scales |NA |1.3.0 |* | 45 | |snakecase |NA |0.11.1 |* | 46 | |stringi |NA |1.8.3 |* | 47 | |stringr |NA |1.5.1 |* | 48 | |tidyr |NA |1.3.1 |* | 49 | |tidyselect |NA |1.2.1 |* | 50 | |timechange |NA |0.3.0 |* | 51 | |tinytex |NA |0.50 |* | 52 | |vroom |NA |1.6.5 |* | 53 | |xfun |NA |0.43 |* | 54 | 55 | # Revdeps 56 | 57 | -------------------------------------------------------------------------------- /man/fetch_uniprot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fetch_uniprot.R 3 | \name{fetch_uniprot} 4 | \alias{fetch_uniprot} 5 | \title{Fetch protein data from UniProt} 6 | \usage{ 7 | fetch_uniprot( 8 | uniprot_ids, 9 | columns = c("protein_name", "length", "sequence", "gene_names", "xref_geneid", 10 | "xref_string", "go_f", "go_p", "go_c", "cc_interaction", "ft_act_site", "ft_binding", 11 | "cc_cofactor", "cc_catalytic_activity", "xref_pdb"), 12 | batchsize = 200, 13 | max_tries = 10, 14 | timeout = 20, 15 | show_progress = TRUE 16 | ) 17 | } 18 | \arguments{ 19 | \item{uniprot_ids}{a character vector of UniProt accession numbers.} 20 | 21 | \item{columns}{a character vector of metadata columns that should be imported from UniProt (all 22 | possible columns can be found \href{https://www.uniprot.org/help/return_fields}{here}. For 23 | cross-referenced databases provide the database name with the prefix "xref_", e.g. \code{"xref_pdb"})} 24 | 25 | \item{batchsize}{a numeric value that specifies the number of proteins processed in a single 26 | query. Default and max value is 200.} 27 | 28 | \item{max_tries}{a numeric value that specifies the number of times the function tries to download 29 | the data in case an error occurs.} 30 | 31 | \item{timeout}{a numeric value that specifies the maximum request time per try.
Default is 20 seconds.} 32 | 33 | \item{show_progress}{a logical value that determines if a progress bar will be shown. Default 34 | is TRUE.} 35 | } 36 | \value{ 37 | A data frame that contains all protein metadata specified in \code{columns} for the 38 | proteins provided. The \code{input_id} column contains the provided UniProt IDs. If an invalid ID 39 | was provided that contains a valid UniProt ID, the valid portion of the ID is still fetched and 40 | present in the \code{accession} column, while the \code{input_id} column contains the original not completely 41 | valid ID. 42 | } 43 | \description{ 44 | Fetches protein metadata from UniProt. 45 | } 46 | \examples{ 47 | \donttest{ 48 | fetch_uniprot(c("P36578", "O43324", "Q00796")) 49 | 50 | # Not completely valid ID 51 | fetch_uniprot(c("P02545", "P02545;P20700")) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: protti 2 | Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools 3 | Version: 0.9.1 4 | Authors@R: 5 | c(person(given = "Jan-Philipp", 6 | family = "Quast", 7 | role = c("aut", "cre"), 8 | email = "quast@imsb.biol.ethz.ch", 9 | comment = c(ORCID = "0000-0003-2713-778X")), 10 | person(given = "Dina", 11 | family = "Schuster", 12 | role = c("aut"), 13 | email = "dschuster@ethz.ch", 14 | comment = c(ORCID = "0000-0001-6611-8237")), 15 | person(given = "ETH Zurich", 16 | role = c("cph", "fnd"))) 17 | Description: Useful functions and workflows for proteomics quality control and data analysis of both limited proteolysis-coupled mass spectrometry (LiP-MS) (Feng et al. (2014) <doi:10.1038/nbt.2999>) and regular bottom-up proteomics experiments. Data generated with search tools such as 'Spectronaut', 'MaxQuant' and 'Proteome Discoverer' can be easily used due to flexibility of functions.
18 | License: MIT + file LICENSE 19 | Encoding: UTF-8 20 | LazyData: true 21 | biocViews: 22 | Imports: 23 | rlang, 24 | dplyr, 25 | stringr, 26 | magrittr, 27 | data.table, 28 | janitor, 29 | progress, 30 | purrr, 31 | tidyr, 32 | ggplot2, 33 | forcats, 34 | tibble, 35 | plotly, 36 | ggrepel, 37 | utils, 38 | grDevices, 39 | curl, 40 | readr, 41 | lifecycle, 42 | httr, 43 | methods, 44 | R.utils, 45 | stats 46 | RoxygenNote: 7.3.2 47 | Suggests: 48 | testthat, 49 | covr, 50 | knitr, 51 | rmarkdown, 52 | shiny, 53 | r3dmol, 54 | proDA, 55 | limma, 56 | dendextend, 57 | pheatmap, 58 | heatmaply, 59 | furrr, 60 | future, 61 | parallel, 62 | seriation, 63 | drc, 64 | igraph, 65 | stringi, 66 | STRINGdb, 67 | iq, 68 | scales, 69 | farver, 70 | ggforce, 71 | xml2, 72 | jsonlite 73 | Depends: 74 | R (>= 4.0) 75 | URL: https://github.com/jpquast/protti, https://jpquast.github.io/protti/ 76 | BugReports: https://github.com/jpquast/protti/issues 77 | VignetteBuilder: knitr 78 | Roxygen: list(markdown = TRUE) 79 | -------------------------------------------------------------------------------- /man/qc_contaminants.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qc_contaminants.R 3 | \name{qc_contaminants} 4 | \alias{qc_contaminants} 5 | \title{Percentage of contaminants per sample} 6 | \usage{ 7 | qc_contaminants( 8 | data, 9 | sample, 10 | protein, 11 | is_contaminant, 12 | intensity, 13 | n_contaminants = 5, 14 | plot = TRUE, 15 | interactive = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{data}{a data frame that contains at least the input variables.} 20 | 21 | \item{sample}{a character or factor column in the \code{data} data frame that contains the sample names.} 22 | 23 | \item{protein}{a character column in the \code{data} data frame that contains protein IDs or 24 | protein names.} 25 | 26 | \item{is_contaminant}{a logical column that indicates 
if the protein is a contaminant.} 27 | 28 | \item{intensity}{a numeric column in the \code{data} data frame that contains the corresponding 29 | raw or normalised intensity values (not log2).} 30 | 31 | \item{n_contaminants}{a numeric value that indicates how many contaminants should be displayed 32 | individually. The rest is combined to a group called "other". The default is 5.} 33 | 34 | \item{plot}{a logical value that indicates if a plot is returned. If FALSE a table is returned.} 35 | 36 | \item{interactive}{a logical value that indicates if the plot is made interactive using the r 37 | package \code{plotly}.} 38 | } 39 | \value{ 40 | A bar plot that displays the percentage of contaminating proteins over all samples. 41 | If \code{plot = FALSE} a data frame is returned. 42 | } 43 | \description{ 44 | Calculates the percentage of contaminating proteins as the share of total intensity. 45 | } 46 | \examples{ 47 | data <- data.frame( 48 | sample = c(rep("sample_1", 10), rep("sample_2", 10)), 49 | leading_razor_protein = c(rep(c("P1", "P1", "P1", "P2", "P2", "P2", "P2", "P3", "P3", "P3"), 2)), 50 | potential_contaminant = c(rep(c(rep(TRUE, 7), rep(FALSE, 3)), 2)), 51 | intensity = c(rep(1, 2), rep(4, 4), rep(6, 4), rep(2, 3), rep(3, 5), rep(4, 2)) 52 | ) 53 | 54 | qc_contaminants( 55 | data, 56 | sample = sample, 57 | protein = leading_razor_protein, 58 | is_contaminant = potential_contaminant, 59 | intensity = intensity 60 | ) 61 | } 62 | -------------------------------------------------------------------------------- /man/qc_intensity_distribution.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qc_intensity_distribution.R 3 | \name{qc_intensity_distribution} 4 | \alias{qc_intensity_distribution} 5 | \title{Check intensity distribution per sample and overall} 6 | \usage{ 7 | qc_intensity_distribution( 8 | data, 9 | sample = NULL, 10 | 
grouping, 11 | intensity_log2, 12 | plot_style 13 | ) 14 | } 15 | \arguments{ 16 | \item{data}{a data frame that contains at least sample names, grouping identifiers (precursor, 17 | peptide or protein) and log2 transformed intensities for each grouping identifier.} 18 | 19 | \item{sample}{an optional character or factor column in the \code{data} data frame that contains the 20 | sample name. If the sample column is of type factor, the ordering is based on the factor 21 | levels. NOTE: If the overall distribution should be returned please do not provide the name of the 22 | sample column.} 23 | 24 | \item{grouping}{a character column in the \code{data} data frame that contains the grouping 25 | variables (e.g. peptides, precursors or proteins).} 26 | 27 | \item{intensity_log2}{a numeric column in the \code{data} data frame that contains the log2 28 | transformed intensities of each grouping identifier sample combination.} 29 | 30 | \item{plot_style}{a character value that indicates the plot type. This can be either 31 | "histogram", "boxplot" or "violin". Plot style "boxplot" and "violin" can only be used if a 32 | sample column is provided.} 33 | } 34 | \value{ 35 | A histogram or boxplot that shows the intensity distribution over all samples or by 36 | sample. 37 | } 38 | \description{ 39 | Plots the overall or sample-wise distribution of all peptide intensities as a boxplot or 40 | histogram. 
41 | } 42 | \examples{ 43 | set.seed(123) # Makes example reproducible 44 | 45 | # Create example data 46 | data <- create_synthetic_data( 47 | n_proteins = 100, 48 | frac_change = 0.05, 49 | n_replicates = 3, 50 | n_conditions = 2, 51 | method = "effect_random" 52 | ) 53 | 54 | # Plot intensity distribution 55 | # The plot style can be changed 56 | qc_intensity_distribution( 57 | data = data, 58 | sample = sample, 59 | grouping = peptide, 60 | intensity_log2 = peptide_intensity_missing, 61 | plot_style = "boxplot" 62 | ) 63 | } 64 | -------------------------------------------------------------------------------- /R/find_peptide.R: -------------------------------------------------------------------------------- 1 | #' Find peptide location 2 | #' 3 | #' The position of the given peptide sequence is searched within the given protein sequence. In 4 | #' addition the last amino acid of the peptide and the amino acids right before and after it are reported. 5 | #' 6 | #' @param data a data frame that contains at least the protein and peptide sequence. 7 | #' @param protein_sequence a character column in the \code{data} data frame that contains the 8 | #' protein sequence. 9 | #' @param peptide_sequence a character column in the \code{data} data frame that contains the 10 | #' peptide sequence. 11 | #' 12 | #' @return A data frame that contains the input data and five additional columns with peptide 13 | #' start and end position, the last amino acid and the amino acids before and after the peptide.
14 | #' @import dplyr 15 | #' @import stringr 16 | #' @importFrom magrittr %>% 17 | #' @importFrom rlang .data 18 | #' @export 19 | #' 20 | #' @examples 21 | #' # Create example data 22 | #' data <- data.frame( 23 | #' protein_sequence = c("abcdefg"), 24 | #' peptide_sequence = c("cde") 25 | #' ) 26 | #' 27 | #' # Find peptide 28 | #' find_peptide( 29 | #' data = data, 30 | #' protein_sequence = protein_sequence, 31 | #' peptide_sequence = peptide_sequence 32 | #' ) 33 | find_peptide <- 34 | function(data, protein_sequence, peptide_sequence) { 35 | # Every protein/peptide pair only needs to be located once. 36 | sequence_pairs <- data %>% 37 | dplyr::ungroup() %>% 38 | dplyr::distinct({{ protein_sequence }}, {{ peptide_sequence }}) 39 | 40 | # Locate each peptide a single time. fixed() matches the peptide literally, so characters 41 | # such as "(", "[" or "." are not interpreted as regular expression syntax. 42 | position <- stringr::str_locate( 43 | dplyr::pull(sequence_pairs, {{ protein_sequence }}), 44 | stringr::fixed(dplyr::pull(sequence_pairs, {{ peptide_sequence }})) 45 | ) 46 | 47 | result <- sequence_pairs %>% 48 | dplyr::mutate( 49 | # `!!` injects the values directly so that they cannot be masked by a column of the data. 50 | start = !!position[, 1], 51 | end = !!position[, 2], 52 | aa_before = stringr::str_sub({{ protein_sequence }}, start = .data$start - 1, end = .data$start - 1), 53 | last_aa = stringr::str_sub({{ protein_sequence }}, start = .data$end, end = .data$end), 54 | aa_after = stringr::str_sub({{ protein_sequence }}, start = .data$end + 1, end = .data$end + 1) 55 | ) 56 | 57 | # Add the positions back to the complete input data. 58 | data %>% dplyr::left_join(result, c( 59 | rlang::as_name(rlang::enquo(protein_sequence)), 60 | rlang::as_name(rlang::enquo(peptide_sequence)) 61 | )) 62 | } 63 | -------------------------------------------------------------------------------- /.github/workflows/format-code.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | paths: ["**.[rR]", "**.[qrR]md", "**.[rR]markdown", "**.[rR]nw", "**.[rR]profile"] 4 | 5 | name: Style 6 | env: 7 | GITHUB_ACTOR: "actions-user" 8 | 9 | jobs: 10 | style: 11 | runs-on: ubuntu-latest 12 | permissions: 13 | contents: write 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | steps: 17 | - name: Checkout repo
18 | uses: actions/checkout@v4 19 | with: 20 | fetch-depth: 0 21 | 22 | - name: Setup R 23 | uses: r-lib/actions/setup-r@v2 24 | with: 25 | use-public-rspm: true 26 | 27 | - name: Install dependencies 28 | uses: r-lib/actions/setup-r-dependencies@v2 29 | with: 30 | extra-packages: any::styler, any::roxygen2 31 | needs: styler 32 | 33 | - name: Enable styler cache 34 | run: styler::cache_activate() 35 | shell: Rscript {0} 36 | 37 | - name: Determine cache location 38 | id: styler-location 39 | run: | 40 | cat( 41 | "location=", 42 | styler::cache_info(format = "tabular")$location, 43 | "\n", 44 | file = Sys.getenv("GITHUB_OUTPUT"), 45 | append = TRUE, 46 | sep = "" 47 | ) 48 | shell: Rscript {0} 49 | 50 | - name: Cache styler 51 | uses: actions/cache@v4 52 | with: 53 | path: ${{ steps.styler-location.outputs.location }} 54 | key: ${{ runner.os }}-styler-${{ github.sha }} 55 | restore-keys: | 56 | ${{ runner.os }}-styler- 57 | ${{ runner.os }}- 58 | 59 | - name: Style 60 | run: styler::style_pkg() 61 | shell: Rscript {0} 62 | 63 | - name: Commit and push changes 64 | run: | 65 | if FILES_TO_COMMIT=($(git diff-index --name-only ${{ github.sha }} \ 66 | | egrep --ignore-case '\.(R|[qR]md|Rmarkdown|Rnw|Rprofile)$')) 67 | then 68 | git config --local user.name "$GITHUB_ACTOR" 69 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 70 | git commit ${FILES_TO_COMMIT[*]} -m "Style code (GHA)" 71 | git pull --ff-only 72 | git push origin 73 | else 74 | echo "No changes to commit." 
75 | fi 76 | -------------------------------------------------------------------------------- /man/qc_proteome_coverage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qc_proteome_coverage.R 3 | \name{qc_proteome_coverage} 4 | \alias{qc_proteome_coverage} 5 | \title{Proteome coverage per sample and total} 6 | \usage{ 7 | qc_proteome_coverage( 8 | data, 9 | sample, 10 | protein_id, 11 | organism_id, 12 | reviewed = TRUE, 13 | plot = TRUE, 14 | interactive = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{data}{a data frame that contains at least sample names and protein IDs.} 19 | 20 | \item{sample}{a character column in the \code{data} data frame that contains the sample name.} 21 | 22 | \item{protein_id}{a character or numeric column in the \code{data} data frame that contains 23 | protein identifiers such as UniProt accessions.} 24 | 25 | \item{organism_id}{a numeric value that specifies a NCBI taxonomy identifier (TaxId) of the 26 | organism used. Human: 9606, S. cerevisiae: 559292, E. coli: 83333.} 27 | 28 | \item{reviewed}{a logical value that determines if only reviewed protein entries will be considered 29 | as the full proteome. Default is TRUE.} 30 | 31 | \item{plot}{a logical value that specifies whether the result should be plotted.} 32 | 33 | \item{interactive}{a logical value that indicates whether the plot should be interactive 34 | (default is FALSE).} 35 | } 36 | \value{ 37 | A bar plot showing the percentage of the proteome detected and undetected in total 38 | and for each sample. If \code{plot = FALSE} a data frame containing the numbers is returned. 39 | } 40 | \description{ 41 | Calculates the proteome coverage for each sample and for all samples combined. In other words, 42 | the fraction of detected proteins to all proteins in the proteome is calculated.
43 | } 44 | \examples{ 45 | \donttest{ 46 | # Create example data 47 | proteome <- data.frame(id = 1:4518) 48 | data <- data.frame( 49 | sample = c(rep("A", 101), rep("B", 1000), rep("C", 1000)), 50 | protein_id = c(proteome$id[1:100], proteome$id[1:1000], proteome$id[1000:2000]) 51 | ) 52 | 53 | # Calculate proteome coverage 54 | qc_proteome_coverage( 55 | data = data, 56 | sample = sample, 57 | protein_id = protein_id, 58 | organism_id = 83333, 59 | plot = FALSE 60 | ) 61 | 62 | # Plot proteome coverage 63 | qc_proteome_coverage( 64 | data = data, 65 | sample = sample, 66 | protein_id = protein_id, 67 | organism_id = 83333, 68 | plot = TRUE 69 | ) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /man/calculate_imputation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculate_imputation.R 3 | \name{calculate_imputation} 4 | \alias{calculate_imputation} 5 | \title{Sampling of values for imputation} 6 | \usage{ 7 | calculate_imputation( 8 | min = NULL, 9 | noise = NULL, 10 | mean = NULL, 11 | sd, 12 | missingness = c("MNAR", "MAR"), 13 | method = c("ludovic", "noise"), 14 | skip_log2_transform_error = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{min}{a numeric value specifying the minimal intensity value of the precursor/peptide. 19 | Is only required if \code{method = "ludovic"} and \code{missingness = "MNAR"}.} 20 | 21 | \item{noise}{a numeric value specifying a noise value for the precursor/peptide. Is only 22 | required if \code{method = "noise"} and \code{missingness = "MNAR"}.} 23 | 24 | \item{mean}{a numeric value specifying the mean intensity value of the condition with missing 25 | values for a given precursor/peptide. 
Is only required if \code{missingness = "MAR"}.} 26 | 27 | \item{sd}{a numeric value specifying the mean of the standard deviation of all conditions for 28 | a given precursor/peptide.} 29 | 30 | \item{missingness}{a character value specifying the missingness type of the data determines 31 | how values for imputation are sampled. This can be \code{"MAR"} or \code{"MNAR"}.} 32 | 33 | \item{method}{a character value specifying the method to be used for imputation. For 34 | \code{method = "ludovic"}, MNAR missingness is sampled around a value that is three lower 35 | (log2) than the lowest intensity value recorded for the precursor/peptide. For 36 | \code{method = "noise"}, MNAR missingness is sampled around the noise value for the 37 | precursor/peptide.} 38 | 39 | \item{skip_log2_transform_error}{a logical value, if FALSE a check is performed to validate that 40 | input values are log2 transformed. If input values are > 40 the test is failed and an error is 41 | returned.} 42 | } 43 | \value{ 44 | A value sampled from a normal distribution with the input parameters. Method specifics 45 | are applied to input parameters prior to sampling. 46 | } 47 | \description{ 48 | \code{calculate_imputation} is a helper function that is used in the \code{impute} function. 49 | Depending on the type of missingness and method, it samples values from a normal distribution 50 | that can be used for the imputation. Note: The input intensities should be log2 transformed. 
51 | } 52 | -------------------------------------------------------------------------------- /man/qc_data_completeness.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qc_data_completeness.R 3 | \name{qc_data_completeness} 4 | \alias{qc_data_completeness} 5 | \title{Data completeness} 6 | \usage{ 7 | qc_data_completeness( 8 | data, 9 | sample, 10 | grouping, 11 | intensity, 12 | digestion = NULL, 13 | plot = TRUE, 14 | interactive = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{data}{a data frame containing at least the input variables.} 19 | 20 | \item{sample}{a character or factor column in the \code{data} data frame that contains the sample names.} 21 | 22 | \item{grouping}{a character column in the \code{data} data frame that contains either precursor 23 | or peptide identifiers.} 24 | 25 | \item{intensity}{a numeric column in the \code{data} data frame that contains the 26 | intensity values that missingness should be determined for.} 27 | 28 | \item{digestion}{optional, a character column in the \code{data} data frame that indicates the 29 | mode of digestion (limited proteolysis or tryptic digest). Alternatively, any other variable 30 | by which the data should be split can be provided.} 31 | 32 | \item{plot}{a logical value that indicates whether the result should be plotted.} 33 | 34 | \item{interactive}{a logical value that specifies whether the plot should be interactive 35 | (default is FALSE).} 36 | } 37 | \value{ 38 | A bar plot that displays the percentage of data completeness over all samples. 39 | If \code{plot = FALSE} a data frame is returned. If \code{interactive = TRUE}, the plot is 40 | interactive. 41 | } 42 | \description{ 43 | Calculates the percentage of data completeness. That means, what percentage of all detected 44 | precursors is present in each sample.
45 | } 46 | \examples{ 47 | set.seed(123) # Makes example reproducible 48 | 49 | # Create example data 50 | data <- create_synthetic_data( 51 | n_proteins = 100, 52 | frac_change = 0.05, 53 | n_replicates = 3, 54 | n_conditions = 2, 55 | method = "effect_random" 56 | ) 57 | 58 | # Determine data completeness 59 | qc_data_completeness( 60 | data = data, 61 | sample = sample, 62 | grouping = peptide, 63 | intensity = peptide_intensity_missing, 64 | plot = FALSE 65 | ) 66 | 67 | # Plot data completeness 68 | qc_data_completeness( 69 | data = data, 70 | sample = sample, 71 | grouping = peptide, 72 | intensity = peptide_intensity_missing, 73 | plot = TRUE 74 | ) 75 | } 76 | -------------------------------------------------------------------------------- /R/drc_4p.R: -------------------------------------------------------------------------------- 1 | #' Dose response curve helper function 2 | #' 3 | #' This function performs the four-parameter dose response curve fit. It is the helper function 4 | #' for the fit in the \code{fit_drc_4p} function. 5 | #' 6 | #' @param data a data frame that contains at least the dose and response column the model should 7 | #' be fitted to. 8 | #' @param response a numeric column that contains the response values. 9 | #' @param dose a numeric column that contains the dose values. 10 | #' @param log_logarithmic a logical value indicating if a logarithmic or log-logarithmic model is 11 | #' fitted. If response values form a symmetric curve for non-log transformed dose values, a 12 | #' logarithmic model instead of a log-logarithmic model should be used. Usually biological dose 13 | #' response data has a log-logarithmic distribution, which is the reason this is the default. 14 | #' Log-logarithmic models are symmetric if dose values are log transformed. 15 | #' @param pb progress bar object. This is only necessary if the function is used in an iteration. 16 | #' 17 | #' @return An object of class \code{drc}.
If no fit was performed a character vector with content 18 | #' "no_fit" is returned. If the \code{drc} package is not installed, \code{NULL} is returned invisibly. 19 | drc_4p <- function(data, response, dose, log_logarithmic = TRUE, pb = NULL) { 20 | if (!requireNamespace("drc", quietly = TRUE)) { 21 | # message() has no `call.` argument; passing it would append "FALSE" to the message text. 22 | message("Package \"drc\" is needed for this function to work. Please install it.") 23 | return(invisible(NULL)) 24 | } 25 | if (!is.null(pb)) pb$tick() 26 | 27 | # drc::LL.4 is used for the log-logarithmic model, drc::L.4 for the logarithmic model. 28 | model_function <- if (log_logarithmic) drc::LL.4 else drc::L.4 29 | 30 | # Any error raised while building the formula or fitting the model results in "no_fit". 31 | tryCatch( 32 | { 33 | suppressWarnings(drc::drm( 34 | stats::as.formula(paste(rlang::ensym(response), "~", rlang::ensym(dose))), 35 | data = data, 36 | fct = model_function(names = c("hill", "min_value", "max_value", "ec_50")), 37 | control = drc::drmc(otrace = TRUE) 38 | )) 39 | }, 40 | error = function(error) { 41 | "no_fit" 42 | } 43 | ) 44 | } 45 | -------------------------------------------------------------------------------- /R/anova_protti.R: -------------------------------------------------------------------------------- 1 | #' Perform ANOVA 2 | #' 3 | #' Performs an ANOVA statistical test 4 | #' 5 | #' @param data a data frame containing at least the input variables. 6 | #' @param grouping a character column in the \code{data} data frame that contains precursor or 7 | #' peptide identifiers. 8 | #' @param condition a character or numeric column in the \code{data} data frame that contains the 9 | #' conditions. 10 | #' @param mean_ratio a numeric column in the \code{data} data frame that contains mean intensities 11 | #' or mean intensity ratios.
12 | #' @param sd a numeric column in the \code{data} data frame that contains the standard deviation 13 | #' corresponding to the mean. 14 | #' @param n a numeric column in the \code{data} data frame that contains the number of replicates 15 | #' for which the corresponding mean was calculated. 16 | #' 17 | #' @return a data frame that contains the between group mean square (\code{ms_group}), the within 18 | #' group mean square (\code{ms_error}), the f statistic and p-values. 19 | #' @import dplyr 20 | #' @export 21 | #' 22 | #' @examples 23 | #' data <- data.frame( 24 | #' precursor = c("A", "A", "A", "B", "B", "B"), 25 | #' condition = c("C1", "C2", "C3", "C1", "C2", "C3"), 26 | #' mean = c(10, 12, 20, 11, 12, 8), 27 | #' sd = c(2, 1, 1.5, 1, 2, 4), 28 | #' n = c(4, 4, 4, 4, 4, 4) 29 | #' ) 30 | #' 31 | #' anova_protti( 32 | #' data, 33 | #' grouping = precursor, 34 | #' condition = condition, 35 | #' mean_ratio = mean, 36 | #' sd = sd, 37 | #' n = n 38 | #' ) 39 | anova_protti <- function(data, grouping, condition, mean_ratio, sd, n) { 40 | data %>% 41 | dplyr::distinct({{ grouping }}, {{ condition }}, {{ mean_ratio }}, {{ sd }}, {{ n }}) %>% 42 | # Conditions without any replicate do not contribute to the test. 43 | dplyr::filter({{ n }} != 0) %>% 44 | dplyr::group_by({{ grouping }}) %>% 45 | dplyr::mutate( 46 | n_groups = dplyr::n_distinct({{ condition }}), 47 | total_n = sum({{ n }}), 48 | # The grand mean weights every condition mean by its number of replicates. It equals the 49 | # plain mean of the condition means only if all conditions have the same n. 50 | grand_mean = sum({{ mean_ratio }} * {{ n }}) / .data$total_n, 51 | # Between group mean square. 52 | ms_group = sum(({{ mean_ratio }} - .data$grand_mean)^2 * {{ n }}) / (.data$n_groups - 1), 53 | # Within group mean square, pooled from the standard deviations of the conditions. 54 | ms_error = sum({{ sd }}^2 * ({{ n }} - 1)) / (.data$total_n - .data$n_groups), 55 | f = .data$ms_group / .data$ms_error, 56 | pval = stats::pf(.data$f, .data$n_groups - 1, .data$total_n - .data$n_groups, lower.tail = FALSE) 57 | ) %>% 58 | dplyr::distinct({{ grouping }}, .data$ms_group, .data$ms_error, .data$f, .data$pval) %>% 59 | dplyr::ungroup() 60 | } 61 | 
-------------------------------------------------------------------------------- /man/fetch_alphafold_aligned_error.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fetch_alphafold_aligned_error.R 3 | \name{fetch_alphafold_aligned_error} 4 | \alias{fetch_alphafold_aligned_error} 5 | \title{Fetch AlphaFold aligned error} 6 | \usage{ 7 | fetch_alphafold_aligned_error( 8 | uniprot_ids = NULL, 9 | error_cutoff = 20, 10 | timeout = 30, 11 | max_tries = 1, 12 | return_data_frame = FALSE, 13 | show_progress = TRUE 14 | ) 15 | } 16 | \arguments{ 17 | \item{uniprot_ids}{a character vector of UniProt identifiers for which predictions 18 | should be fetched.} 19 | 20 | \item{error_cutoff}{a numeric value specifying the maximum position error (in Angstroms) that should be retained. 21 | setting this value to a low number reduces the size of the retrieved data. Default is 20.} 22 | 23 | \item{timeout}{a numeric value specifying the time in seconds until the download times out. 24 | The default is 30 seconds.} 25 | 26 | \item{max_tries}{a numeric value that specifies the number of times the function tries to download 27 | the data in case an error occurs. The default is 1.} 28 | 29 | \item{return_data_frame}{a logical value; if \code{TRUE} a data frame instead of a list 30 | is returned. It is recommended to only use this if information for few proteins is retrieved. 31 | Default is \code{FALSE}.} 32 | 33 | \item{show_progress}{a logical value; if \code{TRUE} a progress bar will be shown. 34 | Default is \code{TRUE}.} 35 | } 36 | \value{ 37 | A list that contains aligned errors for AlphaFold predictions. If return_data_frame is 38 | TRUE, a data frame with this information is returned instead. 
The data frame contains the 39 | following columns: 40 | \itemize{ 41 | \item scored_residue: The error for this position is calculated based on the alignment to the 42 | aligned residue. 43 | \item aligned_residue: The residue that is aligned for the calculation of the error of the scored 44 | residue 45 | \item error: The predicted aligned error computed by alpha fold. 46 | \item accession: The UniProt protein identifier. 47 | } 48 | } 49 | \description{ 50 | Fetches the aligned error for AlphaFold predictions for provided proteins. 51 | The aligned error is useful for assessing inter-domain accuracy. In detail it 52 | represents the expected position error at residue x (scored residue), when 53 | the predicted and true structures are aligned on residue y (aligned residue). 54 | } 55 | \examples{ 56 | \donttest{ 57 | aligned_error <- fetch_alphafold_aligned_error( 58 | uniprot_ids = c("F4HVG8", "O15552"), 59 | error_cutoff = 5, 60 | return_data_frame = TRUE 61 | ) 62 | 63 | head(aligned_error, n = 10) 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /man/qc_sample_correlation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qc_sample_correlation.R 3 | \name{qc_sample_correlation} 4 | \alias{qc_sample_correlation} 5 | \title{Correlation based hirachical clustering of samples} 6 | \usage{ 7 | qc_sample_correlation( 8 | data, 9 | sample, 10 | grouping, 11 | intensity_log2, 12 | condition, 13 | digestion = NULL, 14 | run_order = NULL, 15 | method = "spearman", 16 | interactive = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{data}{a data frame that contains at least the input variables.} 21 | 22 | \item{sample}{a character column in the \code{data} data frame that contains the sample names.} 23 | 24 | \item{grouping}{a character column in the \code{data} data frame that contains precursor or 25 | 
peptide identifiers.} 26 | 27 | \item{intensity_log2}{a numeric column in the \code{data} data frame that contains log2 28 | intensity values.} 29 | 30 | \item{condition}{a character or numeric column in the \code{data} data frame that contains the 31 | conditions.} 32 | 33 | \item{digestion}{optional, a character column in the \code{data} data frame that contains 34 | information about the digestion method used. e.g. "LiP" or "tryptic control".} 35 | 36 | \item{run_order}{optional, a character or numeric column in the \code{data} data frame that 37 | contains the order in which samples were measured. Useful to investigate batch effects due to 38 | run order.} 39 | 40 | \item{method}{a character value that specifies the method to be used for correlation. 41 | \code{"spearman"} is the default but can be changed to \code{"pearson"} or \code{"kendall"}.} 42 | 43 | \item{interactive}{a logical value that specifies whether the plot should be interactive. 44 | Determines if an interactive or static heatmap should be created using \code{heatmaply} or 45 | \code{pheatmap}, respectively.} 46 | } 47 | \value{ 48 | A correlation heatmap that compares each sample. The dendrogram is sorted by optimal 49 | leaf ordering. 50 | } 51 | \description{ 52 | A correlation heatmap is created that uses hirachical clustering to determine sample similarity. 
53 | } 54 | \examples{ 55 | \donttest{ 56 | set.seed(123) # Makes example reproducible 57 | 58 | # Create example data 59 | data <- create_synthetic_data( 60 | n_proteins = 100, 61 | frac_change = 0.05, 62 | n_replicates = 3, 63 | n_conditions = 2, 64 | method = "effect_random" 65 | ) 66 | 67 | # Create sample correlation heatmap 68 | qc_sample_correlation( 69 | data = data, 70 | sample = sample, 71 | grouping = peptide, 72 | intensity_log2 = peptide_intensity_missing, 73 | condition = condition 74 | ) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /man/randomise_queue.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/randomise_queue.R 3 | \name{randomise_queue} 4 | \alias{randomise_queue} 5 | \title{Randomise samples in MS queue} 6 | \usage{ 7 | randomise_queue(data = NULL, rows = NULL, export = FALSE) 8 | } 9 | \arguments{ 10 | \item{data}{optional, a data frame that contains a queue. If not provided a queue file can be 11 | chosen interactively.} 12 | 13 | \item{rows}{optional, a numeric vector that specifies a range of rows in for which samples 14 | should be randomized.} 15 | 16 | \item{export}{a logical value that determines if a \code{"randomised_queue.csv"} file will be 17 | saved in the working directory. If FALSE a data frame will be returned.} 18 | } 19 | \value{ 20 | If \code{export = TRUE} a \code{"randomised_queue.csv"} file will be saved in the 21 | working directory. If \code{export = FALSE} a data frame that contains the randomised queue 22 | is returned. 23 | } 24 | \description{ 25 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} 26 | This function randomises the order of samples in an MS queue. QC and Blank samples are left in 27 | place. 
It is also possible to randomise only parts of the queue. Before running this make sure 28 | to set a specific seed with the \code{set.seed()} function. This ensures that the randomisation 29 | of the result is consistent if the function is run again. 30 | } 31 | \examples{ 32 | queue <- create_queue( 33 | date = c("200722"), 34 | instrument = c("EX1"), 35 | user = c("jquast"), 36 | measurement_type = c("DIA"), 37 | experiment_name = c("JPQ031"), 38 | digestion = c("LiP", "tryptic control"), 39 | treatment_type_1 = c("EDTA", "H2O"), 40 | treatment_type_2 = c("Zeba", "unfiltered"), 41 | treatment_dose_1 = c(10, 30, 60), 42 | treatment_unit_1 = c("min"), 43 | n_replicates = 4, 44 | number_runs = FALSE, 45 | organism = c("E. coli"), 46 | exclude_combinations = list(list( 47 | treatment_type_1 = c("H2O"), 48 | treatment_type_2 = c("Zeba", "unfiltered"), 49 | treatment_dose_1 = c(10, 30) 50 | )), 51 | inj_vol = c(2), 52 | data_path = "D:\\\\2007_Data", 53 | method_path = "C:\\\\Xcalibur\\\\methods\\\\DIA_120min", 54 | position_row = c("A", "B", "C", "D", "E", "F"), 55 | position_column = 8, 56 | blank_every_n = 4, 57 | blank_position = "1-V1", 58 | blank_method_path = "C:\\\\Xcalibur\\\\methods\\\\blank" 59 | ) 60 | 61 | head(queue, n = 20) 62 | 63 | randomised_queue <- randomise_queue( 64 | data = queue, 65 | export = FALSE 66 | ) 67 | 68 | head(randomised_queue, n = 20) 69 | } 70 | -------------------------------------------------------------------------------- /man/qc_cvs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qc_cvs.R 3 | \name{qc_cvs} 4 | \alias{qc_cvs} 5 | \title{Check CV distribution} 6 | \usage{ 7 | qc_cvs( 8 | data, 9 | grouping, 10 | condition, 11 | intensity, 12 | plot = TRUE, 13 | plot_style = "density", 14 | max_cv = 200 15 | ) 16 | } 17 | \arguments{ 18 | \item{data}{a data frame containing at least peptide, precursor or 
protein identifiers, 19 | information on conditions and intensity values for each peptide, precursor or protein.} 20 | 21 | \item{grouping}{a character column in the \code{data} data frame that contains the grouping 22 | variables (e.g. peptides, precursors or proteins).} 23 | 24 | \item{condition}{a character or factor column in the \code{data} data frame that contains condition information 25 | (e.g. "treated" and "control").} 26 | 27 | \item{intensity}{a numeric column in the \code{data} data frame that contains the corresponding 28 | raw or untransformed normalised intensity values for each peptide or precursor.} 29 | 30 | \item{plot}{a logical value that indicates whether the result should be plotted.} 31 | 32 | \item{plot_style}{a character value that indicates the plotting style. \code{plot_style = "boxplot"} 33 | plots a boxplot, whereas \code{plot_style = "density"} plots the CV density distribution. 34 | \code{plot_style = "violin"} returns a violin plot. Default is \code{plot_style = "density"}.} 35 | 36 | \item{max_cv}{a numeric value that specifies the maximum percentage of CVs that should be included 37 | in the returned plot. The default value is \code{max_cv = 200}.} 38 | } 39 | \value{ 40 | Either a data frame with the median CVs in \% or a plot showing the distribution of the CVs 41 | is returned. 42 | } 43 | \description{ 44 | Calculates and plots the coefficients of variation for the selected grouping. 
45 | } 46 | \examples{ 47 | # Load libraries 48 | library(dplyr) 49 | 50 | set.seed(123) # Makes example reproducible 51 | 52 | # Create example data 53 | data <- create_synthetic_data( 54 | n_proteins = 100, 55 | frac_change = 0.05, 56 | n_replicates = 3, 57 | n_conditions = 2, 58 | method = "effect_random" 59 | ) \%>\% 60 | mutate(intensity_non_log2 = 2^peptide_intensity_missing) 61 | 62 | # Calculate coefficients of variation 63 | qc_cvs( 64 | data = data, 65 | grouping = peptide, 66 | condition = condition, 67 | intensity = intensity_non_log2, 68 | plot = FALSE 69 | ) 70 | 71 | # Plot coefficients of variation 72 | # Different plot styles are available 73 | qc_cvs( 74 | data = data, 75 | grouping = peptide, 76 | condition = condition, 77 | intensity = intensity_non_log2, 78 | plot = TRUE, 79 | plot_style = "violin" 80 | ) 81 | } 82 | -------------------------------------------------------------------------------- /man/fetch_eco.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fetch_eco.R 3 | \name{fetch_eco} 4 | \alias{fetch_eco} 5 | \title{Fetch evidence & conclusion ontology} 6 | \usage{ 7 | fetch_eco( 8 | return_relation = FALSE, 9 | return_history = FALSE, 10 | show_progress = TRUE 11 | ) 12 | } 13 | \arguments{ 14 | \item{return_relation}{a logical value that indicates if relational information should be returned instead 15 | the main descriptive information. This data can be used to check the relations of ECO terms to each other. 16 | Default is FALSE.} 17 | 18 | \item{return_history}{a logical value that indicates if the entry history of an ECO term should be 19 | returned instead the main descriptive information. 20 | Default is FALSE.} 21 | 22 | \item{show_progress}{a logical value that indicates if a progress bar will be shown. 
23 | Default is TRUE.} 24 | } 25 | \value{ 26 | A data frame that contains descriptive information about each ECO term in the EBI database. 27 | If either \code{return_relation} or \code{return_history} is set to \code{TRUE}, the respective information is 28 | returned instead of the usual output. 29 | } 30 | \description{ 31 | Fetches all evidence & conclusion ontology (ECO) information from the QuickGO EBI database. The ECO project is 32 | maintained through a public \href{https://github.com/evidenceontology/evidenceontology}{GitHub repository}. 33 | } 34 | \details{ 35 | According to the GitHub repository ECO is defined as follows: 36 | 37 | "The Evidence & Conclusion Ontology (ECO) describes types of scientific evidence within the 38 | biological research domain that arise from laboratory experiments, computational methods, 39 | literature curation, or other means. Researchers use evidence to support conclusions 40 | that arise out of scientific research. Documenting evidence during scientific research 41 | is essential, because evidence gives us a sense of why we believe what we think we know. 42 | Conclusions are asserted as statements about things that are believed to be true, for 43 | example that a protein has a particular function (i.e. a protein functional annotation) or 44 | that a disease is associated with a particular gene variant (i.e. a phenotype-gene association). 45 | A systematic and structured (i.e. ontological) classification of evidence allows us to store, 46 | retreive, share, and compare data associated with that evidence using computers, which are 47 | essential to navigating the ever-growing (in size and complexity) corpus of scientific 48 | information." 49 | 50 | More information can be found in their publication (\doi{10.1093/nar/gky1036}). 
51 | } 52 | \examples{ 53 | \donttest{ 54 | eco <- fetch_eco() 55 | 56 | head(eco) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /man/filter_cv.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/filter_cv.R 3 | \name{filter_cv} 4 | \alias{filter_cv} 5 | \title{Data filtering based on coefficients of variation (CV)} 6 | \usage{ 7 | filter_cv( 8 | data, 9 | grouping, 10 | condition, 11 | log2_intensity, 12 | cv_limit = 0.25, 13 | min_conditions, 14 | silent = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{data}{a data frame that contains at least the input variables.} 19 | 20 | \item{grouping}{a character column in the \code{data} data frame that contains the grouping 21 | variable that can be either precursors, peptides or proteins.} 22 | 23 | \item{condition}{a character or numeric column in the \code{data} data frame that contains 24 | information on the sample condition.} 25 | 26 | \item{log2_intensity}{a numeric column in the \code{data} data frame that contains log2 27 | transformed intensities.} 28 | 29 | \item{cv_limit}{optional, a numeric value that specifies the CV cutoff that will be applied. 30 | Default is 0.25.} 31 | 32 | \item{min_conditions}{a numeric value that specifies the minimum number of conditions for 33 | which grouping CVs should be below the cutoff.} 34 | 35 | \item{silent}{a logical value that specifies if a message with the number of filtered out 36 | conditions should be returned. Default is FALSE.} 37 | } 38 | \value{ 39 | The CV filtered data frame. 40 | } 41 | \description{ 42 | Filters the input data based on precursor, peptide or protein intensity coefficients of variation. 43 | The function should be used to ensure that only robust measurements and quantifications are used for 44 | data analysis. 
It is advised to use the function after inspection of raw values (quality control) 45 | and median normalisation. Generally, the function calculates CVs of each peptide, precursor or 46 | protein for each condition and removes peptides, precursors or proteins that have a CV above 47 | the cutoff in less than the (user-defined) required number of conditions. Since the user-defined 48 | cutoff is fixed and does not depend on the number of conditions that have detected values, the 49 | function might bias for data completeness. 50 | } 51 | \examples{ 52 | set.seed(123) # Makes example reproducible 53 | 54 | # Create synthetic data 55 | data <- create_synthetic_data( 56 | n_proteins = 50, 57 | frac_change = 0.05, 58 | n_replicates = 3, 59 | n_conditions = 2, 60 | method = "effect_random", 61 | additional_metadata = FALSE 62 | ) 63 | 64 | # Filter coefficients of variation 65 | data_filtered <- filter_cv( 66 | data = data, 67 | grouping = peptide, 68 | condition = condition, 69 | log2_intensity = peptide_intensity_missing, 70 | cv_limit = 0.25, 71 | min_conditions = 2 72 | ) 73 | } 74 | -------------------------------------------------------------------------------- /R/fetch_kegg.R: -------------------------------------------------------------------------------- 1 | #' Fetch KEGG pathway data from KEGG 2 | #' 3 | #' Fetches gene IDs and corresponding pathway IDs and names for the provided organism. 4 | #' 5 | #' @param species a character value providing an abreviated species name. "hsa" for human, "eco" 6 | #' for E. coli and "sce" for S. cerevisiae. Additional possible names can be found for 7 | #' \href{https://www.genome.jp/kegg-bin/show_organism?category=Eukaryotes}{eukaryotes} and for 8 | #' \href{https://www.genome.jp/kegg-bin/show_organism?category=Prokaryotes}{prokaryotes}. 9 | #' 10 | #' @return A data frame that contains gene IDs with corresponding pathway IDs and names for a 11 | #' selected organism. 
#' @importFrom dplyr left_join
#' @importFrom stringr str_replace_all
#' @importFrom magrittr %>%
#' @importFrom curl has_internet
#' @export
#'
#' @examples
#' \donttest{
#' kegg <- fetch_kegg(species = "hsa")
#'
#' head(kegg)
#' }
fetch_kegg <- function(species) {
  if (!curl::has_internet()) {
    message("No internet connection.")
    return(invisible(NULL))
  }

  # Runs one KEGG REST operation for `species` and assigns `column_names` to
  # the returned table. When try_query() hands back a character value it is
  # shown as a message and NULL is returned to signal the failure.
  download_table <- function(operation, column_names) {
    endpoint <- paste0("https://rest.kegg.jp/", operation, "/", species)
    response <- try_query(
      endpoint,
      col_names = FALSE,
      progress = FALSE,
      show_col_types = FALSE
    )
    if (methods::is(response, "character")) {
      message(response)
      return(NULL)
    }
    colnames(response) <- column_names
    response
  }

  # KEGG gene ID -> pathway ID; the "path:" prefix is stripped from the IDs.
  gene_pathway <- download_table("link/pathway", c("kegg_id", "pathway_id"))
  if (is.null(gene_pathway)) {
    return(invisible(NULL))
  }
  gene_pathway$pathway_id <- stringr::str_replace_all(
    gene_pathway$pathway_id,
    pattern = "path:",
    replacement = ""
  )

  # Pathway ID -> pathway name.
  pathway_names <- download_table("list/pathway", c("pathway_id", "pathway_name"))
  if (is.null(pathway_names)) {
    return(invisible(NULL))
  }

  # KEGG gene ID -> UniProt ID; the "up:" prefix is stripped from the IDs.
  gene_uniprot <- download_table("conv/uniprot", c("kegg_id", "uniprot_id"))
  if (is.null(gene_uniprot)) {
    return(invisible(NULL))
  }
  gene_uniprot$uniprot_id <- stringr::str_replace_all(
    gene_uniprot$uniprot_id,
    pattern = "up:",
    replacement = ""
  )

  # Attach pathway names and UniProt IDs to the gene/pathway links.
  gene_pathway %>%
    dplyr::left_join(pathway_names, by = "pathway_id") %>%
    dplyr::left_join(gene_uniprot, by = "kegg_id", relationship = "many-to-many")
}
-------------------------------------------------------------------------------- /man/qc_ranked_intensities.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qc_ranked_intensities.R 3 | \name{qc_ranked_intensities} 4 | \alias{qc_ranked_intensities} 5 | \title{Check ranked intensities} 6 | \usage{ 7 | qc_ranked_intensities( 8 | data, 9 | sample, 10 | grouping, 11 | intensity_log2, 12 | facet = FALSE, 13 | plot = FALSE, 14 | y_axis_transformation = "log10", 15 | interactive = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{data}{a data frame that contains at least sample names, grouping identifiers (precursor, 20 | peptide or protein) and log2 transformed intensities for each grouping identifier.} 21 | 22 | \item{sample}{a character column in the \code{data} data frame that contains the sample names.} 23 | 24 | \item{grouping}{a character column in the \code{data} data frame that contains protein, precursor, 25 | or peptide identifiers.} 26 | 27 | \item{intensity_log2}{a numeric column in the \code{data} data frame that contains the log2 28 | transformed intensities of the selected grouping variable.} 29 | 30 | \item{facet}{a logical value that specifies whether the calculation should be done group wise by 31 | sample and if the resulting plot should be faceted by sample. (default is \code{FALSE}). 32 | If \code{facet = FALSE} the median of each protein intensity will be returned.} 33 | 34 | \item{plot}{a logical value that specifies whether the result should be plotted (default is \code{FALSE}).} 35 | 36 | \item{y_axis_transformation}{a character value that determines that y-axis transformation.
The 37 | value is either "log2" or "log10" (default is "log10").} 38 | 39 | \item{interactive}{a logical value that specifies whether the plot should be interactive 40 | (default is \code{FALSE}).} 41 | } 42 | \value{ 43 | A data frame containing the ranked intensities is returned. If \code{plot = TRUE} a plot 44 | is returned. The intensities are log10 transformed for the plot. 45 | } 46 | \description{ 47 | Calculates and plots ranked intensities for proteins, peptides or precursors. 48 | } 49 | \examples{ 50 | set.seed(123) # Makes example reproducible 51 | 52 | # Create synthetic data 53 | data <- create_synthetic_data( 54 | n_proteins = 50, 55 | frac_change = 0.05, 56 | n_replicates = 4, 57 | n_conditions = 3, 58 | method = "effect_random", 59 | additional_metadata = FALSE 60 | ) 61 | 62 | # Plot ranked intensities for all samples combined 63 | qc_ranked_intensities( 64 | data = data, 65 | sample = sample, 66 | grouping = peptide, 67 | intensity_log2 = peptide_intensity, 68 | plot = TRUE, 69 | ) 70 | 71 | # Plot ranked intensities for each sample separately 72 | qc_ranked_intensities( 73 | data = data, 74 | sample = sample, 75 | grouping = peptide, 76 | intensity_log2 = peptide_intensity, 77 | plot = TRUE, 78 | facet = TRUE 79 | ) 80 | 81 | } 82 | -------------------------------------------------------------------------------- /R/find_all_subs.R: -------------------------------------------------------------------------------- 1 | #' Find all sub IDs of an ID in a network 2 | #' 3 | #' For a given ID, find all sub IDs and their sub IDs etc. The type of 4 | #' relationship can be selected too. This is a helper function for other functions. 5 | #' 6 | #' @param data a data frame that contains relational information on IDs (main_id) their sub 7 | #' IDs (sub_id) and their relationship (type). For ChEBI this data frame can be obtained by calling 8 | #' \code{fetch_chebi(relation = TRUE)}. For ECO data it can be obtained by calling fetch_eco(relation = TRUE). 
#' @param ids a character vector of IDs for which sub IDs should be searched.
#' @param main_id a character or integer column containing IDs. Default is \code{id} for ChEBI IDs.
#' @param type a character column that contains the type of interactions. Default is \code{type} for ChEBI IDs.
#' @param accepted_types a character vector containing the accepted_types of relationships that should be considered
#' for the search. It is possible to use "all" relationships. The default type is "is_a". A list of
#' possible relationships for e.g. ChEBI IDs can be found
#' \href{https://docs.google.com/document/d/1_w-DwBdCCOh1gMeeP6yqGzcnkpbHYOa3AGSODe5epcg/edit#heading=h.hnsqoqu978s5}{here}.
#' @param exclude_parent_id a logical value that specifies if the parent ID should be excluded from
#' the returned list. Default is \code{FALSE}.
#'
#' @return A list of character vectors containing the provided ID (unless
#' \code{exclude_parent_id = TRUE}) and all of its sub IDs. It contains one element per input ID.
#' The element is \code{NULL} for IDs that do not occur in \code{main_id}.
#' @importFrom dplyr select filter pull
#' @importFrom magrittr %>%
#' @importFrom purrr map
#' @importFrom rlang .data
find_all_subs <- function(data,
                          ids,
                          main_id = id,
                          type = type,
                          accepted_types = "is_a",
                          exclude_parent_id = FALSE) {
  if (!requireNamespace("igraph", quietly = TRUE)) {
    # message() has no `call.` argument; every extra argument is pasted into
    # the text, so only the message itself is passed here.
    message("Package \"igraph\" is needed for this function to work. Please install it.")
    return(invisible(NULL))
  }

  # Only the single value "all" switches off relationship filtering.
  use_all_types <- length(accepted_types) == 1 && accepted_types == "all"
  if (!use_all_types) {
    data <- dplyr::filter(data, {{ type }} %in% accepted_types)
  }
  # The relationship column is dropped before the graph is built, because
  # graph_from_data_frame() reads the edge list from the leading columns.
  data <- dplyr::select(data, -{{ type }})

  # Loop invariants: the set of known main IDs, the graph and its vertex names.
  known_ids <- dplyr::pull(data, {{ main_id }})
  graph <- igraph::graph_from_data_frame(data, directed = TRUE)
  vertex_names <- igraph::V(graph)$name

  purrr::map(ids, function(current_id) {
    # IDs without an entry in the main ID column have nothing to traverse.
    if (!(current_id %in% known_ids)) {
      return(NULL)
    }
    # All vertices reachable from the ID along outgoing edges; the result
    # includes the starting vertex itself.
    sub_ids <- igraph::subcomponent(
      graph,
      match(current_id, vertex_names),
      "out"
    )$name
    if (exclude_parent_id) {
      sub_ids <- sub_ids[sub_ids != current_id]
    }

    sub_ids
  })
}
-------------------------------------------------------------------------------- /man/calculate_aa_scores.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculate_aa_scores.R 3 | \name{calculate_aa_scores} 4 | \alias{calculate_aa_scores} 5 | \title{Calculate scores for each amino acid position in a protein sequence} 6 | \usage{ 7 | calculate_aa_scores( 8 | data, 9 | protein, 10 | diff = diff, 11 | adj_pval = adj_pval, 12 | start_position, 13 | end_position, 14 | retain_columns = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{data}{a data frame containing at least the input columns.} 19 | 20 | \item{protein}{a character column in the data frame containing the protein identifier or name.} 21 | 22 | \item{diff}{a numeric column in the \code{data} data frame containing the log2 fold change.} 23 | 24 | \item{adj_pval}{a numeric column in the \code{data} data frame containing the adjusted p-value.} 25 | 26 | \item{start_position}{a numeric column \code{data} in the data frame containing the start position 27 | of a peptide or precursor.} 28 | 29 | \item{end_position}{a numeric column in the data frame containing the end position of a peptide or 30 | precursor.}
31 | 32 | \item{retain_columns}{a vector indicating if certain columns should be retained from the input 33 | data frame. Default is not retaining additional columns \code{retain_columns = NULL}. Specific 34 | columns can be retained by providing their names (not in quotations marks, just like other 35 | column names, but in a vector).} 36 | } 37 | \value{ 38 | A data frame that contains the aggregated scores per amino acid position, enabling to 39 | draw fingerprints for each individual protein. 40 | } 41 | \description{ 42 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} 43 | Calculate a score for each amino acid position in a protein sequence based on the product of the 44 | -log10(adjusted p-value) and the absolute log2(fold change) per peptide covering this amino acid. In detail, all the 45 | peptides are aligned along the sequence of the corresponding protein, and the average score per 46 | amino acid position is computed. In a limited proteolysis coupled to mass spectrometry (LiP-MS) 47 | experiment, the score allows to prioritize and narrow down structurally affected regions. 
48 | } 49 | \examples{ 50 | 51 | data <- data.frame( 52 | pg_protein_accessions = c(rep("protein_1", 10)), 53 | diff = c(2, -3, 1, 2, 3, -3, 5, 1, -0.5, 2), 54 | adj_pval = c(0.001, 0.01, 0.2, 0.05, 0.002, 0.5, 0.4, 0.7, 0.001, 0.02), 55 | start = c(1, 3, 5, 10, 15, 25, 28, 30, 41, 51), 56 | end = c(6, 8, 10, 16, 23, 35, 35, 35, 48, 55) 57 | ) 58 | calculate_aa_scores( 59 | data, 60 | protein = pg_protein_accessions, 61 | diff = diff, 62 | adj_pval = adj_pval, 63 | start_position = start, 64 | end_position = end 65 | ) 66 | } 67 | \author{ 68 | Patrick Stalder 69 | } 70 | -------------------------------------------------------------------------------- /man/qc_peak_width.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qc_peak_width.R 3 | \name{qc_peak_width} 4 | \alias{qc_peak_width} 5 | \title{Peak width over retention time} 6 | \usage{ 7 | qc_peak_width( 8 | data, 9 | sample, 10 | intensity, 11 | retention_time, 12 | peak_width = NULL, 13 | retention_time_start = NULL, 14 | retention_time_end = NULL, 15 | remove_na_intensities = TRUE, 16 | interactive = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{data}{a data frame containing at least sample names and protein IDs.} 21 | 22 | \item{sample}{a character column in the \code{data} data frame that contains the sample names.} 23 | 24 | \item{intensity}{a numeric column in the \code{data} data frame that contains intensities. If 25 | \code{remove_na_intensities = FALSE}, this argument is not required.} 26 | 27 | \item{retention_time}{a numeric column in the \code{data} data frame that contains retention 28 | times of precursors.} 29 | 30 | \item{peak_width}{a numeric column in the \code{data} data frame that contains peak width 31 | information. 
It is not required if \code{retention_time_start} and \code{retention_time_end} 32 | columns are provided.} 33 | 34 | \item{retention_time_start}{a numeric column in the \code{data} data frame that contains the 35 | start time of the precursor elution peak. It is not required if the \code{peak_width} column 36 | is provided.} 37 | 38 | \item{retention_time_end}{a numeric column in the \code{data} data frame that contains the end 39 | time of the precursor elution peak. It is not required if the \code{peak_width} column is 40 | provided.} 41 | 42 | \item{remove_na_intensities}{a logical value that specifies if sample/grouping combinations 43 | with intensities that are NA (not quantified IDs) should be dropped from the data frame. 44 | Default is TRUE since we are usually interested in the peak width of quantifiable data.} 45 | 46 | \item{interactive}{a logical value that specifies whether the plot should be interactive 47 | (default is FALSE).} 48 | } 49 | \value{ 50 | A line plot displaying one minute binned median precursor elution peak width over 51 | retention time for each sample. 52 | } 53 | \description{ 54 | Plots one minute binned median precursor elution peak width over retention time for each sample. 
55 | } 56 | \examples{ 57 | 58 | data <- data.frame( 59 | r_file_name = c(rep("sample_1", 10), rep("sample2", 10)), 60 | fg_quantity = c(rep(2000, 20)), 61 | eg_mean_apex_rt = c(rep(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 2)), 62 | eg_start_rt = c(0.5, 1, 3, 4, 5, 6, 7, 7.5, 8, 9, 1, 2, 2, 3, 4, 5, 5, 8, 9, 9), 63 | eg_end_rt = c( 64 | 1.5, 2, 3.1, 4.5, 5.8, 6.6, 8, 8, 8.4, 65 | 9.1, 3, 2.2, 4, 3.4, 4.5, 5.5, 5.6, 8.3, 10, 12 66 | ) 67 | ) 68 | qc_peak_width( 69 | data, 70 | sample = r_file_name, 71 | intensity = fg_quantity, 72 | retention_time = eg_mean_apex_rt, 73 | retention_time_start = eg_start_rt, 74 | retention_time_end = eg_end_rt 75 | ) 76 | } 77 | -------------------------------------------------------------------------------- /man/qc_ids.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qc_ids.R 3 | \name{qc_ids} 4 | \alias{qc_ids} 5 | \title{Check number of precursor, peptide or protein IDs} 6 | \usage{ 7 | qc_ids( 8 | data, 9 | sample, 10 | grouping, 11 | intensity, 12 | remove_na_intensities = TRUE, 13 | condition = NULL, 14 | title = "ID count per sample", 15 | plot = TRUE, 16 | interactive = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{data}{a data frame containing at least sample names and precursor/peptide/protein IDs.} 21 | 22 | \item{sample}{a character or factor column in the \code{data} data frame that contains the sample name.} 23 | 24 | \item{grouping}{a character column in the \code{data} data frame that contains either precursor or 25 | peptide identifiers.} 26 | 27 | \item{intensity}{a character column in the \code{data} data frame that contains raw or log2 28 | transformed intensities. 
If \code{remove_na_intensities = FALSE}, this argument is optional.} 29 | 30 | \item{remove_na_intensities}{a logical value that specifies if sample/grouping combinations with 31 | intensities that are NA (not quantified IDs) should be dropped from the data frame. Default is 32 | TRUE since we are usually interested in the number of quantifiable IDs.} 33 | 34 | \item{condition}{optional, a column in the \code{data} data frame that contains condition information 35 | (e.g. "treated" and "control"). If this column is provided, the bars in the plot will be coloured 36 | according to the condition.} 37 | 38 | \item{title}{optional, a character value that specifies the plot title (default is "ID count 39 | per sample").} 40 | 41 | \item{plot}{a logical value that indicates whether the result should be plotted.} 42 | 43 | \item{interactive}{a logical value that specifies whether the plot should be interactive 44 | (default is FALSE).} 45 | } 46 | \value{ 47 | A bar plot with the height corresponding to the number of IDs, each bar represents one 48 | sample (if \code{plot = TRUE}). If \code{plot = FALSE} a table with ID counts is returned. 49 | } 50 | \description{ 51 | Returns a plot or table of the number of IDs for each sample. The default settings remove 52 | grouping variables without quantitative information (intensity is NA). These will not be 53 | counted as IDs. 
54 | } 55 | \examples{ 56 | set.seed(123) # Makes example reproducible 57 | 58 | # Create example data 59 | data <- create_synthetic_data( 60 | n_proteins = 100, 61 | frac_change = 0.05, 62 | n_replicates = 3, 63 | n_conditions = 2, 64 | method = "effect_random" 65 | ) 66 | 67 | # Calculate number of identifications 68 | qc_ids( 69 | data = data, 70 | sample = sample, 71 | grouping = peptide, 72 | intensity = peptide_intensity_missing, 73 | condition = condition, 74 | plot = FALSE 75 | ) 76 | 77 | # Plot number of identifications 78 | qc_ids( 79 | data = data, 80 | sample = sample, 81 | grouping = peptide, 82 | intensity = peptide_intensity_missing, 83 | condition = condition, 84 | plot = TRUE 85 | ) 86 | } 87 | -------------------------------------------------------------------------------- /man/qc_pca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qc_pca.R 3 | \name{qc_pca} 4 | \alias{qc_pca} 5 | \title{Plot principal component analysis} 6 | \usage{ 7 | qc_pca( 8 | data, 9 | sample, 10 | grouping, 11 | intensity, 12 | condition, 13 | components = c("PC1", "PC2"), 14 | digestion = NULL, 15 | plot_style = "pca" 16 | ) 17 | } 18 | \arguments{ 19 | \item{data}{a data frame that contains sample names, peptide or precursor identifiers, 20 | corresponding intensities and a condition column indicating e.g. the treatment.} 21 | 22 | \item{sample}{a character column in the \code{data} data frame that contains the sample name.} 23 | 24 | \item{grouping}{a character column in the \code{data} data frame that contains either precursor 25 | or peptide identifiers.} 26 | 27 | \item{intensity}{a numeric column in the \code{data} data frame that contains the corresponding 28 | intensity values for each peptide or precursor.} 29 | 30 | \item{condition}{a numeric or character column in the \code{data} data frame that contains condition information 31 | (e.g.
"treated" and "control").} 32 | 33 | \item{components}{a character vector indicating the two components that should be displayed in 34 | the plot. By default these are PC1 and PC2. You can provide these using a character vector of 35 | the form c("PC1", "PC2").} 36 | 37 | \item{digestion}{optional, a character column in the \code{data} data frame that indicates the 38 | mode of digestion (limited proteolysis or tryptic digest). Alternatively, any other variable 39 | by which the data should be split can be provided.} 40 | 41 | \item{plot_style}{a character value that specifies what plot should be returned. If 42 | \code{plot_style = "pca"} is selected the two PCA components supplied with the \code{components} argument 43 | are plotted against each other. This is the default. \code{plot_style = "scree"} returns a scree 44 | plot that displays the variance explained by each principal component in percent. The scree plot is 45 | useful for checking if any other than the default first two components should be plotted.} 46 | } 47 | \value{ 48 | A principal component analysis plot showing PC1 and PC2. If \code{plot_style = "scree"}, a 49 | scree plot for all dimensions is returned. 50 | } 51 | \description{ 52 | Plots a principal component analysis based on peptide or precursor intensities.
53 | } 54 | \examples{ 55 | set.seed(123) # Makes example reproducible 56 | 57 | # Create example data 58 | data <- create_synthetic_data( 59 | n_proteins = 100, 60 | frac_change = 0.05, 61 | n_replicates = 3, 62 | n_conditions = 2, 63 | ) 64 | 65 | # Plot scree plot 66 | qc_pca( 67 | data = data, 68 | sample = sample, 69 | grouping = peptide, 70 | intensity = peptide_intensity_missing, 71 | condition = condition, 72 | plot_style = "scree" 73 | ) 74 | 75 | # Plot principal components 76 | qc_pca( 77 | data = data, 78 | sample = sample, 79 | grouping = peptide, 80 | intensity = peptide_intensity_missing, 81 | condition = condition 82 | ) 83 | } 84 | -------------------------------------------------------------------------------- /R/normalise.R: -------------------------------------------------------------------------------- 1 | #' Intensity normalisation 2 | #' 3 | #' `r lifecycle::badge('deprecated')` 4 | #' This function was deprecated due to its name changing to `normalise()`. 5 | #' The normalisation method in the new function needs to be provided as an argument. 6 | #' 7 | #' @return A data frame with a column called \code{normalised_intensity_log2} containing the 8 | #' normalised intensity values. 9 | #' @keywords internal 10 | #' @export 11 | median_normalisation <- function(...) { 12 | # This function has been renamed and is therefore deprecated. 13 | lifecycle::deprecate_warn("0.2.0", 14 | "median_normalisation()", 15 | "normalise()", 16 | details = "This function has been renamed." 17 | ) 18 | 19 | normalise(...) 20 | } 21 | #' Intensity normalisation 22 | #' 23 | #' Performs normalisation on intensities. For median normalisation the normalised intensity is the 24 | #' original intensity minus the run median plus the global median. This is also the way it is 25 | #' implemented in the Spectronaut search engine. 26 | #' 27 | #' @param data a data frame containing at least sample names and intensity values. 
Please note that if the 28 | #' data frame is grouped, the normalisation will be computed by group. 29 | #' @param sample a character column in the \code{data} data frame that contains the sample names. 30 | #' @param intensity_log2 a numeric column in the \code{data} data frame that contains the log2 transformed 31 | #' intensity values to be normalised. 32 | #' @param method a character value specifying the method to be used for normalisation. Default 33 | #' is "median". 34 | #' 35 | #' @return A data frame with a column called \code{normalised_intensity_log2} containing the 36 | #' normalised intensity values. 37 | #' @import dplyr 38 | #' @importFrom magrittr %>% 39 | #' @importFrom rlang .data 40 | #' @importFrom stats median 41 | #' @export 42 | #' 43 | #' @examples 44 | #' data <- data.frame( 45 | #' r_file_name = c("s1", "s2", "s3", "s1", "s2", "s3"), 46 | #' intensity_log2 = c(18, 19, 17, 20, 21, 19) 47 | #' ) 48 | #' 49 | #' normalise(data, 50 | #' sample = r_file_name, 51 | #' intensity_log2 = intensity_log2, 52 | #' method = "median" 53 | #' ) 54 | normalise <- 55 | function(data, 56 | sample, 57 | intensity_log2, 58 | method = "median") { 59 | # Ensure method is valid 60 | if (!(method %in% c("median"))) { 61 | stop("Invalid method. 
Available methods: median") 62 | } 63 | 64 | if (method == "median") { 65 | median_normalised <- data %>% 66 | dplyr::distinct() %>% 67 | dplyr::mutate(global_median = stats::median({{ intensity_log2 }}, na.rm = TRUE)) %>% 68 | dplyr::group_by({{ sample }}, .add = TRUE) %>% 69 | dplyr::mutate(run_median = stats::median({{ intensity_log2 }}, na.rm = TRUE)) %>% 70 | dplyr::ungroup({{ sample }}) %>% 71 | dplyr::mutate(normalised_intensity_log2 = {{ intensity_log2 }} - .data$run_median + .data$global_median) %>% 72 | dplyr::select(-.data$run_median, -.data$global_median) 73 | 74 | return(median_normalised) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /R/fetch_uniprot_proteome.R: -------------------------------------------------------------------------------- 1 | #' Fetch proteome data from UniProt 2 | #' 3 | #' Fetches proteome data from UniProt for the provided organism ID. 4 | #' 5 | #' @param organism_id a numeric value that specifies the NCBI taxonomy identifier (TaxId) for an 6 | #' organism. 7 | #' @param columns a character vector of metadata columns that should be imported from UniProt (all 8 | #' possible columns can be found \href{https://www.uniprot.org/help/return_fields}{here}. For 9 | #' cross-referenced database provide the database name with the prefix "xref_", e.g. \code{"xref_pdb"}). 10 | #' Note: Not more than one or two columns should be selected otherwise the function will not be 11 | #' able to efficiently retrieve the information. If more information is needed, \code{fetch_uniprot()} 12 | #' can be used with the IDs retrieved by this function. 13 | #' @param reviewed a logical value that determines if only reviewed protein entries will be retrieved. 14 | #' @param timeout a numeric value specifying the time in seconds until the download times out. 15 | #' The default is 120 seconds.
16 | #' @param max_tries a numeric value that specifies the number of times the function tries to download 17 | #' the data in case an error occurs. The default is 5. 18 | #' 19 | #' @return A data frame that contains all protein metadata specified in \code{columns} for the 20 | #' organism of choice. 21 | #' @importFrom janitor make_clean_names 22 | #' @export 23 | #' 24 | #' @examples 25 | #' \donttest{ 26 | #' head(fetch_uniprot_proteome(9606)) 27 | #' } 28 | fetch_uniprot_proteome <- 29 | function(organism_id, 30 | columns = c("accession"), 31 | reviewed = TRUE, 32 | timeout = 120, 33 | max_tries = 5) { 34 | if (!curl::has_internet()) { 35 | message("No internet connection.") 36 | return(invisible(NULL)) 37 | } 38 | 39 | if (length(organism_id) == 0) { 40 | stop("No valid organism ID found.") 41 | } 42 | if (length(columns) > 4) { 43 | warning(strwrap("We suggest to use the fetch_uniprot function to fetch more than four columns.", 44 | prefix = "\n", initial = "" 45 | )) 46 | } 47 | url <- "http://rest.uniprot.org/uniprotkb/stream?query=" 48 | column_names <- janitor::make_clean_names(columns) 49 | collapsed_columns <- paste(columns, collapse = ",") 50 | reviewed <- paste0("reviewed:", ifelse(reviewed == TRUE, "true", "false")) 51 | organism_id <- paste0("organism_id:", organism_id) 52 | query_url <- 53 | utils::URLencode(paste0( 54 | url, 55 | reviewed, 56 | "+AND+", 57 | organism_id, 58 | "&format=tsv&fields=", 59 | collapsed_columns 60 | )) 61 | result <- try_query(query_url, timeout = timeout, max_tries = max_tries, silent = FALSE, progress = FALSE, show_col_types = FALSE) 62 | # result can either be a data.frame or it is a character string with the error message 63 | if (!methods::is(result, "data.frame")) { 64 | if (stringr::str_detect(result, pattern = "Timeout")) { 65 | message('The data retrieval timed out. Consider increasing the "timeout" or "max_tries" argument.
\n') 66 | } 67 | return(invisible(result)) 68 | } 69 | colnames(result) <- column_names 70 | result 71 | } 72 | -------------------------------------------------------------------------------- /R/calculate_sequence_coverage.R: -------------------------------------------------------------------------------- 1 | #' Protein sequence coverage 2 | #' 3 | #' `r lifecycle::badge('deprecated')` 4 | #' This function was deprecated due to its name changing to `calculate_sequence_coverage()`. 5 | #' 6 | #' @return A new column in the \code{data} data frame containing the calculated sequence coverage 7 | #' for each identified protein 8 | #' @keywords internal 9 | #' @export 10 | sequence_coverage <- function(...) { 11 | # This function has been renamed and is therefore deprecated. 12 | lifecycle::deprecate_warn("0.2.0", 13 | "sequence_coverage()", 14 | "calculate_sequence_coverage()", 15 | details = "This function has been renamed." 16 | ) 17 | calculate_sequence_coverage(...) 18 | } 19 | #' Protein sequence coverage 20 | #' 21 | #' Calculate sequence coverage for each identified protein. 22 | #' 23 | #' @param data a data frame containing at least the protein sequence and the identified peptides 24 | #' as columns. 25 | #' @param protein_sequence a character column in the \code{data} data frame that contains protein 26 | #' sequences. Can be obtained by using the function \code{fetch_uniprot()} 27 | #' @param peptides a character column in the \code{data} data frame that contains the identified 28 | #' peptides. 
29 | #' 30 | #' @return A new column in the \code{data} data frame containing the calculated sequence coverage 31 | #' for each identified protein 32 | #' @import dplyr 33 | #' @importFrom magrittr %>% 34 | #' @importFrom stringr str_count 35 | #' @importFrom rlang .data as_name enquo 36 | #' @importFrom tidyr drop_na 37 | #' @export 38 | #' 39 | #' @examples 40 | #' data <- data.frame( 41 | #' protein_sequence = c("abcdefghijklmnop", "abcdefghijklmnop"), 42 | #' pep_stripped_sequence = c("abc", "jklmn") 43 | #' ) 44 | #' 45 | #' calculate_sequence_coverage( 46 | #' data, 47 | #' protein_sequence = protein_sequence, 48 | #' peptides = pep_stripped_sequence 49 | #' ) 50 | calculate_sequence_coverage <- 51 | function(data, protein_sequence, peptides) { 52 | groups <- dplyr::group_vars(data) 53 | 54 | result <- data %>% 55 | # drop_na prevents function from failing if a protein group contains only NA peptide sequences. 56 | tidyr::drop_na({{ peptides }}) %>% 57 | dplyr::distinct({{ protein_sequence }}, {{ peptides }}) %>% 58 | dplyr::group_by({{ protein_sequence }}, .add = TRUE) %>% 59 | find_peptide({{ protein_sequence }}, {{ peptides }}) %>% 60 | dplyr::mutate(sequence_length = nchar({{ protein_sequence }})) %>% 61 | dplyr::mutate(modified_sequence = replace_identified_by_x({{ protein_sequence }}, .data$start, .data$end)) %>% 62 | dplyr::mutate(covered = stringr::str_count(.data$modified_sequence, "x")) %>% 63 | dplyr::mutate(coverage = .data$covered / .data$sequence_length * 100) %>% 64 | dplyr::select(-c( 65 | .data$sequence_length, 66 | .data$modified_sequence, 67 | .data$covered, 68 | .data$start, 69 | .data$end, 70 | .data$aa_before, 71 | .data$last_aa, 72 | .data$aa_after, 73 | {{ peptides }} 74 | )) %>% 75 | dplyr::distinct() %>% 76 | dplyr::ungroup() 77 | 78 | data %>% 79 | dplyr::left_join(result, by = c(rlang::as_name(rlang::enquo(protein_sequence)), groups)) 80 | } 81 | -------------------------------------------------------------------------------- 
/R/pval_distribution_plot.R: -------------------------------------------------------------------------------- 1 | #' Plot histogram of p-value distribution 2 | #' 3 | #' `r lifecycle::badge('deprecated')` 4 | #' This function was deprecated due to its name changing to `pval_distribution_plot()`. 5 | #' 6 | #' @return A histogram plot that shows the p-value distribution. 7 | #' @keywords internal 8 | #' @export 9 | plot_pval_distribution <- function(...) { 10 | # This function has been renamed and is therefore deprecated. 11 | lifecycle::deprecate_warn("0.2.0", 12 | "plot_pval_distribution()", 13 | "pval_distribution_plot()", 14 | details = "This function has been renamed." 15 | ) 16 | 17 | pval_distribution_plot(...) 18 | } 19 | #' Plot histogram of p-value distribution 20 | #' 21 | #' Plots the distribution of p-values derived from any statistical test as a histogram. 22 | #' 23 | #' @param data a data frame that contains at least grouping identifiers (precursor, peptide or 24 | #' protein) and p-values derived from any statistical test. 25 | #' @param grouping a character column in the \code{data} data frame that contains either precursor, 26 | #' peptide or protein identifiers. For each entry in this column there should be one unique p-value. 27 | #' That means the statistical test that created the p-value should have been performed on the 28 | #' level of the content of this column. 29 | #' @param pval a numeric column in the \code{data} data frame that contains p-values. 30 | #' @param facet_by optional, a character column that contains information by which the data should 31 | #' be faceted into multiple plots. 32 | #' 33 | #' @return A histogram plot that shows the p-value distribution. 
34 | #' @import ggplot2 35 | #' @importFrom magrittr %>% 36 | #' @importFrom dplyr distinct 37 | #' @importFrom tidyr drop_na 38 | #' @export 39 | #' 40 | #' @examples 41 | #' set.seed(123) # Makes example reproducible 42 | #' 43 | #' # Create example data 44 | #' data <- data.frame( 45 | #' peptide = paste0("peptide", 1:1000), 46 | #' pval = runif(n = 1000) 47 | #' ) 48 | #' 49 | #' # Plot p-values 50 | #' pval_distribution_plot( 51 | #' data = data, 52 | #' grouping = peptide, 53 | #' pval = pval 54 | #' ) 55 | pval_distribution_plot <- function(data, grouping, pval, facet_by = NULL) { 56 | input <- data %>% 57 | dplyr::distinct({{ grouping }}, {{ pval }}, {{ facet_by }}) %>% 58 | tidyr::drop_na() 59 | 60 | plot <- input %>% 61 | ggplot2::ggplot(ggplot2::aes(x = {{ pval }})) + 62 | ggplot2::geom_histogram( 63 | binwidth = 0.05, 64 | boundary = 0, 65 | color = "black", 66 | fill = "#5680C1", 67 | size = 1 68 | ) + 69 | ggplot2::labs(title = "P-Value Distribution", x = "P-Value", y = "Frequency") + 70 | { 71 | if (!missing(facet_by)) { 72 | ggplot2::facet_wrap(rlang::new_formula(NULL, rlang::enquo(facet_by)), 73 | scales = "fixed" 74 | ) 75 | } 76 | } + 77 | ggplot2::theme_bw() + 78 | ggplot2::theme( 79 | plot.title = ggplot2::element_text(size = 20), 80 | axis.title.x = ggplot2::element_text(size = 15), 81 | axis.text.y = ggplot2::element_text(size = 15), 82 | axis.text.x = ggplot2::element_text(size = 15), 83 | axis.title.y = ggplot2::element_text(size = 15), 84 | strip.text = ggplot2::element_text(size = 15), 85 | strip.background = element_blank() 86 | ) 87 | plot 88 | } 89 | -------------------------------------------------------------------------------- /man/qc_peptide_type.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qc_peptide_type.R 3 | \name{qc_peptide_type} 4 | \alias{qc_peptide_type} 5 | \title{Check peptide type percentage 
share} 6 | \usage{ 7 | qc_peptide_type( 8 | data, 9 | sample, 10 | peptide, 11 | pep_type, 12 | intensity, 13 | remove_na_intensities = TRUE, 14 | method = "count", 15 | plot = FALSE, 16 | interactive = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{data}{a data frame that contains at least the input columns.} 21 | 22 | \item{sample}{a character or factor column in the \code{data} data frame that contains the sample names.} 23 | 24 | \item{peptide}{a character column in the \code{data} data frame that contains the peptide 25 | sequence.} 26 | 27 | \item{pep_type}{a character column in the \code{data} data frame that contains the peptide 28 | type. Can be obtained using the \code{find_peptide} and \code{assign_peptide_type} function 29 | together.} 30 | 31 | \item{intensity}{a numeric column in the \code{data} data frame that contains the corresponding 32 | raw or normalised intensity values (not log2) for each peptide or precursor. Required when 33 | "intensity" is chosen as the method.} 34 | 35 | \item{remove_na_intensities}{a logical value that specifies if sample/peptide combinations with 36 | intensities that are NA (not quantified IDs) should be dropped from the data frame for analysis 37 | of peptide type distributions. Default is TRUE since we are usually interested in the peptide 38 | type distribution of quantifiable IDs. This is only relevant for method = "count".} 39 | 40 | \item{method}{a character value that indicates the method used for evaluation. 41 | \code{method = "intensity"} calculates the peptide type percentage by intensity, whereas 42 | \code{method = "count"} calculates the percentage by peptide ID count. 
Default is 43 | \code{method = "count"}.} 44 | 45 | \item{plot}{a logical value that indicates whether the result should be plotted.} 46 | 47 | \item{interactive}{a logical value that indicates whether the plot should be interactive.} 48 | } 49 | \value{ 50 | A data frame that contains the calculated percentage shares of each peptide type per 51 | sample. The \code{count} column contains the number of peptides with a specific type. The 52 | \code{peptide_type_percent} column contains the percentage share of a specific peptide type. 53 | } 54 | \description{ 55 | Calculates the percentage share of each peptide type (fully-tryptic, semi-tryptic, 56 | non-tryptic) for each sample. 57 | } 58 | \examples{ 59 | # Load libraries 60 | library(dplyr) 61 | 62 | set.seed(123) # Makes example reproducible 63 | 64 | # Create example data 65 | data <- create_synthetic_data( 66 | n_proteins = 100, 67 | frac_change = 0.05, 68 | n_replicates = 3, 69 | n_conditions = 2, 70 | method = "effect_random" 71 | ) \%>\% 72 | mutate(intensity_non_log2 = 2^peptide_intensity_missing) 73 | 74 | # Determine peptide type percentages 75 | qc_peptide_type( 76 | data = data, 77 | sample = sample, 78 | peptide = peptide, 79 | pep_type = pep_type, 80 | intensity = intensity_non_log2, 81 | method = "intensity", 82 | plot = FALSE 83 | ) 84 | 85 | # Plot peptide type 86 | qc_peptide_type( 87 | data = data, 88 | sample = sample, 89 | peptide = peptide, 90 | pep_type = pep_type, 91 | intensity = intensity_non_log2, 92 | method = "intensity", 93 | plot = TRUE 94 | ) 95 | } 96 | -------------------------------------------------------------------------------- /man/qc_charge_states.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qc_charge_states.R 3 | \name{qc_charge_states} 4 | \alias{qc_charge_states} 5 | \title{Check charge state distribution} 6 | \usage{ 7 | qc_charge_states( 8 | data,
9 | sample, 10 | grouping, 11 | charge_states, 12 | intensity = NULL, 13 | remove_na_intensities = TRUE, 14 | method = "count", 15 | plot = FALSE, 16 | interactive = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{data}{a data frame that contains at least sample names, peptide or precursor identifiers 21 | and charge states for each peptide or precursor.} 22 | 23 | \item{sample}{a character or factor column in the \code{data} data frame that contains the sample name.} 24 | 25 | \item{grouping}{a character column in the \code{data} data frame that contains either precursor or 26 | peptide identifiers.} 27 | 28 | \item{charge_states}{a character or numeric column in the \code{data} data frame that contains 29 | the different charge states assigned to the precursor or peptide.} 30 | 31 | \item{intensity}{a numeric column in the \code{data} data frame that contains the corresponding 32 | raw or normalised intensity values (not log2) for each peptide or precursor. Required when 33 | "intensity" is chosen as the method.} 34 | 35 | \item{remove_na_intensities}{a logical value that specifies if sample/grouping combinations with 36 | intensities that are NA (not quantified IDs) should be dropped from the data frame for analysis 37 | of charge states. Default is TRUE since we are usually interested in quantifiable peptides. 38 | This is only relevant for method = "count".} 39 | 40 | \item{method}{a character value that indicates the method used for evaluation.
"count" 41 | calculates the charge state distribution based on counts of the corresponding peptides or 42 | precursors in the charge state group, "intensity" calculates the percentage of precursors or 43 | peptides in each charge state group based on the corresponding intensity values.} 44 | 45 | \item{plot}{a logical value that indicates whether the result should be plotted.} 46 | 47 | \item{interactive}{a logical value that specifies whether the plot should be interactive 48 | (default is FALSE).} 49 | } 50 | \value{ 51 | A data frame that contains the calculated percentage made up by the sum of either 52 | all counts or intensities of peptides or precursors of the corresponding charge state 53 | (depending on which method is chosen). 54 | } 55 | \description{ 56 | Calculates the charge state distribution for each sample (by count or intensity). 57 | } 58 | \examples{ 59 | # Load libraries 60 | library(dplyr) 61 | 62 | set.seed(123) # Makes example reproducible 63 | 64 | # Create example data 65 | data <- create_synthetic_data( 66 | n_proteins = 100, 67 | frac_change = 0.05, 68 | n_replicates = 3, 69 | n_conditions = 2, 70 | method = "effect_random" 71 | ) \%>\% 72 | mutate(intensity_non_log2 = 2^peptide_intensity_missing) 73 | 74 | # Calculate charge percentages 75 | qc_charge_states( 76 | data = data, 77 | sample = sample, 78 | grouping = peptide, 79 | charge_states = charge, 80 | intensity = intensity_non_log2, 81 | method = "intensity", 82 | plot = FALSE 83 | ) 84 | 85 | # Plot charge states 86 | qc_charge_states( 87 | data = data, 88 | sample = sample, 89 | grouping = peptide, 90 | charge_states = charge, 91 | intensity = intensity_non_log2, 92 | method = "intensity", 93 | plot = TRUE 94 | ) 95 | } 96 | -------------------------------------------------------------------------------- /man/predict_alphafold_domain.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please 
edit documentation in R/predict_alphafold_domain.R 3 | \name{predict_alphafold_domain} 4 | \alias{predict_alphafold_domain} 5 | \title{Predict protein domains of AlphaFold predictions} 6 | \usage{ 7 | predict_alphafold_domain( 8 | pae_list, 9 | pae_power = 1, 10 | pae_cutoff = 5, 11 | graph_resolution = 1, 12 | return_data_frame = FALSE, 13 | show_progress = TRUE 14 | ) 15 | } 16 | \arguments{ 17 | \item{pae_list}{a list of proteins that contains aligned errors for their AlphaFold predictions. 18 | This list can be retrieved with the \code{fetch_alphafold_aligned_error()} function. It should contain a 19 | column containing the scored residue (\code{scored_residue}), the aligned residue (\code{aligned_residue}) and 20 | the predicted aligned error (\code{error}).} 21 | 22 | \item{pae_power}{a numeric value, each edge in the graph will be weighted proportional to (\code{1 / pae^pae_power}). 23 | Default is \code{1}.} 24 | 25 | \item{pae_cutoff}{a numeric value, graph edges will only be created for residue pairs with \code{pae < pae_cutoff}. 26 | Default is \code{5}.} 27 | 28 | \item{graph_resolution}{a numeric value that regulates how aggressive the clustering algorithm is. Smaller values 29 | lead to larger clusters. Value should be larger than zero, and values larger than 5 are unlikely to be useful. 30 | Higher values lead to stricter (i.e. smaller) clusters. The value is provided to the Leiden clustering algorithm 31 | of the \code{igraph} package as \code{graph_resolution / 100}. Default is \code{1}.} 32 | 33 | \item{return_data_frame}{a logical value; if \code{TRUE} a data frame instead of a list 34 | is returned. It is recommended to only use this if information for few proteins is retrieved. 35 | Default is \code{FALSE}.} 36 | 37 | \item{show_progress}{a logical value that specifies if a progress bar will be shown. Default 38 | is \code{TRUE}.} 39 | } 40 | \value{ 41 | A list of the provided proteins that contains domain assignments for each residue. 
If \code{return_data_frame} is 42 | \code{TRUE}, a data frame with this information is returned instead. The data frame contains the 43 | following columns: 44 | \itemize{ 45 | \item residue: The protein residue number. 46 | \item domain: A numeric value representing a distinct predicted domain in the protein. 47 | \item accession: The UniProt protein identifier. 48 | } 49 | } 50 | \description{ 51 | Uses the predicted aligned error (PAE) of AlphaFold predictions to find possible protein domains. 52 | A graph-based community clustering algorithm (Leiden clustering) is used on the predicted error 53 | (distance) between residues of a protein in order to infer pseudo-rigid groups in the protein. This is 54 | for example useful in order to know which parts of protein predictions are likely in a fixed relative 55 | position towards each other and which might have varying distances. 56 | This function is based on python code written by Tristan Croll. The original code can be found on his 57 | \href{https://github.com/tristanic/pae_to_domains}{GitHub page}. 58 | } 59 | \examples{ 60 | \donttest{ 61 | # Fetch aligned errors 62 | aligned_error <- fetch_alphafold_aligned_error( 63 | uniprot_ids = c("F4HVG8", "O15552"), 64 | error_cutoff = 4 65 | ) 66 | 67 | # Predict protein domains 68 | af_domains <- predict_alphafold_domain( 69 | pae_list = aligned_error, 70 | return_data_frame = TRUE 71 | ) 72 | 73 | head(af_domains, n = 10) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /R/assign_peptide_type.R: -------------------------------------------------------------------------------- 1 | #' Assign peptide type 2 | #' 3 | #' `r lifecycle::badge('deprecated')` 4 | #' This function was deprecated due to its name changing to `assign_peptide_type()`. 5 | #' 6 | #' @return A data frame that contains the input data and an additional column with the peptide 7 | #' type information. 
8 | #' @keywords internal 9 | #' @export 10 | peptide_type <- function(...) { 11 | # This function has been renamed and is therefore deprecated. 12 | lifecycle::deprecate_warn("0.2.0", 13 | "peptide_type()", 14 | "assign_peptide_type()", 15 | details = "This function has been renamed." 16 | ) 17 | 18 | assign_peptide_type(...) 19 | } 20 | #' Assign peptide type 21 | #' 22 | #' Based on preceding and C-terminal amino acid, the peptide type of a given peptide is assigned. 23 | #' Peptides with preceding and C-terminal lysine or arginine are considered fully-tryptic. If a 24 | #' peptide is located at the N- or C-terminus of a protein and fulfills the criterion to be 25 | #' fully-tryptic otherwise, it is also considered as fully-tryptic. Peptides that only fulfill the 26 | #' criterion on one terminus are semi-tryptic peptides. Lastly, peptides that are not fulfilling 27 | #' the criteria for both termini are non-tryptic peptides. 28 | #' 29 | #' @param data a data frame containing at least information about the preceding and C-terminal 30 | #' amino acids of peptides. 31 | #' @param aa_before a character column in the \code{data} data frame that contains the preceding amino 32 | #' acid as one letter code. 33 | #' @param last_aa a character column in the \code{data} data frame that contains the C-terminal amino 34 | #' acid as one letter code. 35 | #' @param aa_after a character column in the \code{data} data frame that contains the following amino 36 | #' acid as one letter code. 37 | #' 38 | #' @return A data frame that contains the input data and an additional column with the peptide 39 | #' type information.
40 | #' @import dplyr 41 | #' @importFrom magrittr %>% 42 | #' @importFrom rlang .data 43 | #' @export 44 | #' 45 | #' @examples 46 | #' data <- data.frame( 47 | #' aa_before = c("K", "S", "T"), 48 | #' last_aa = c("R", "K", "Y"), 49 | #' aa_after = c("T", "R", "T") 50 | #' ) 51 | #' 52 | #' assign_peptide_type(data, aa_before, last_aa, aa_after) 53 | assign_peptide_type <- function(data, 54 | aa_before = aa_before, 55 | last_aa = last_aa, 56 | aa_after = aa_after) { 57 | data %>% 58 | dplyr::distinct({{ aa_before }}, {{ last_aa }}, {{ aa_after }}) %>% 59 | dplyr::mutate(N_term_tryp = dplyr::if_else({{ aa_before }} == "" | 60 | {{ aa_before }} == "K" | 61 | {{ aa_before }} == "R", 62 | TRUE, 63 | FALSE 64 | )) %>% 65 | dplyr::mutate(C_term_tryp = dplyr::if_else({{ last_aa }} == "K" | 66 | {{ last_aa }} == "R" | 67 | {{ aa_after }} == "", 68 | TRUE, 69 | FALSE 70 | )) %>% 71 | dplyr::mutate(pep_type = dplyr::case_when( 72 | .data$N_term_tryp + .data$C_term_tryp == 2 ~ "fully-tryptic", 73 | .data$N_term_tryp + .data$C_term_tryp == 1 ~ "semi-tryptic", 74 | .data$N_term_tryp + .data$C_term_tryp == 0 ~ "non-tryptic" 75 | )) %>% 76 | dplyr::select(-.data$N_term_tryp, -.data$C_term_tryp) %>% 77 | dplyr::right_join(data, by = c( 78 | rlang::as_name(rlang::enquo(aa_before)), 79 | rlang::as_name(rlang::enquo(last_aa)), 80 | rlang::as_name(rlang::enquo(aa_after)) 81 | )) 82 | } 83 | -------------------------------------------------------------------------------- /man/diff_abundance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculate_diff_abundance.R 3 | \name{diff_abundance} 4 | \alias{diff_abundance} 5 | \title{Calculate differential abundance between conditions} 6 | \usage{ 7 | diff_abundance(...) 
8 | } 9 | \value{ 10 | A data frame that contains differential abundances (\code{diff}), p-values (\code{pval}) 11 | and adjusted p-values (\code{adj_pval}) for each protein, peptide or precursor (depending on 12 | the \code{grouping} variable) and the associated treatment/reference pair. Depending on the 13 | method the data frame contains additional columns: 14 | \itemize{ 15 | \item "t-test": The \code{std_error} column contains the standard error of the differential 16 | abundances. \code{n_obs} contains the number of observations for the specific protein, peptide 17 | or precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair. 18 | \item "t-test_mean_sd": Columns labeled as control refer to the second condition of the 19 | comparison pairs. Treated refers to the first condition. \code{mean_control} and \code{mean_treated} 20 | columns contain the means for the reference and treatment condition, respectively. \code{sd_control} 21 | and \code{sd_treated} columns contain the standard deviations for the reference and treatment 22 | condition, respectively. \code{n_control} and \code{n_treated} columns contain the numbers of 23 | samples for the reference and treatment condition, respectively. The \code{std_error} column 24 | contains the standard error of the differential abundances. \code{t_statistic} contains the 25 | t_statistic for the t-test. 26 | \item "moderated_t-test": \code{CI_2.5} and \code{CI_97.5} contain the 2.5\% and 97.5\% 27 | confidence interval borders for differential abundances. \code{avg_abundance} contains average 28 | abundances for treatment/reference pairs (mean of the two group means). \code{t_statistic} 29 | contains the t_statistic for the t-test. \code{B} The B-statistic is the log-odds that the 30 | protein, peptide or precursor (depending on \code{grouping}) has a differential abundance 31 | between the two groups. Suppose B=1.5. 
The odds of differential abundance are exp(1.5)=4.48, i.e., 32 | about four and a half to one. The probability that there is a differential abundance is 33 | 4.48/(1+4.48)=0.82, i.e., the probability is about 82\% that this group is differentially 34 | abundant. A B-statistic of zero corresponds to a 50-50 chance that the group is differentially 35 | abundant. \code{n_obs} contains the number of observations for the specific protein, peptide or 36 | precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair. 37 | \item "proDA": The \code{std_error} column contains the standard error of the differential 38 | abundances. \code{avg_abundance} contains average abundances for treatment/reference pairs 39 | (mean of the two group means). \code{t_statistic} contains the t_statistic for the t-test. 40 | \code{n_obs} contains the number of observations for the specific protein, peptide or precursor 41 | (depending on the \code{grouping} variable) and the associated treatment/reference pair. 42 | } 43 | } 44 | \description{ 45 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} 46 | This function was deprecated due to its name changing to \code{calculate_diff_abundance()}.
47 | } 48 | \keyword{internal} 49 | -------------------------------------------------------------------------------- /man/fetch_quickgo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fetch_quickgo.R 3 | \name{fetch_quickgo} 4 | \alias{fetch_quickgo} 5 | \title{Fetch information from the QuickGO API} 6 | \usage{ 7 | fetch_quickgo( 8 | type = "annotations", 9 | id_annotations = NULL, 10 | taxon_id_annotations = NULL, 11 | ontology_annotations = "all", 12 | go_id_slims = NULL, 13 | relations_slims = c("is_a", "part_of", "regulates", "occurs_in"), 14 | timeout = 1200, 15 | max_tries = 2, 16 | show_progress = TRUE 17 | ) 18 | } 19 | \arguments{ 20 | \item{type}{a character value that indicates if gene ontology terms, annotations or slims 21 | should be retrieved. The possible values therefore include "annotations", "terms" and "slims". 22 | If annotations are retrieved, the maximum number of results is 2,000,000.} 23 | 24 | \item{id_annotations}{an optional character vector that specifies UniProt IDs for which GO annotations 25 | should be retrieved. This argument should only be provided if annotations are retrieved.} 26 | 27 | \item{taxon_id_annotations}{an optional character value that specifies the NCBI taxonomy identifier (TaxId) 28 | for an organism for which GO annotations should be retrieved. 29 | This argument should only be provided if annotations are retrieved.} 30 | 31 | \item{ontology_annotations}{an optional character value that specifies the ontology that should be retrieved. 32 | This can either have the values "all", "molecular_function", "biological_process" or 33 | "cellular_component". This argument should only be provided if annotations are retrieved.} 34 | 35 | \item{go_id_slims}{an optional character vector that specifies gene ontology IDs (e.g. GO:0046872) for which 36 | a slim go set should be generated. 
This argument should only be provided if slims are retrieved.} 37 | 38 | \item{relations_slims}{an optional character vector that specifies the relations of GO IDs that should be 39 | considered for the generation of the slim dataset. This argument should only be provided if slims are retrieved.} 40 | 41 | \item{timeout}{a numeric value specifying the time in seconds until the download times out. 42 | The default is 1200 seconds.} 43 | 44 | \item{max_tries}{a numeric value that specifies the number of times the function tries to download 45 | the data in case an error occurs. The default is 2.} 46 | 47 | \item{show_progress}{a logical value that indicates if a progress bar will be shown. 48 | Default is TRUE.} 49 | } 50 | \value{ 51 | A data frame that contains descriptive information about gene ontology annotations, terms or slims 52 | depending on what the input "type" was. 53 | } 54 | \description{ 55 | Fetches gene ontology (GO) annotations, terms or slims from the QuickGO EBI database. 56 | Annotations can be retrieved for specific UniProt IDs or NCBI taxonomy identifiers. When 57 | terms are retrieved, a complete list of all GO terms is returned. For the generation of 58 | a slim dataset you can provide GO IDs that should be considered. A slim dataset is a subset 59 | GO dataset that considers all child terms of the supplied IDs. 
60 | } 61 | \examples{ 62 | \donttest{ 63 | # Annotations 64 | annotations <- fetch_quickgo( 65 | type = "annotations", 66 | id = c("P63328", "Q4FFP4"), 67 | ontology = "molecular_function" 68 | ) 69 | 70 | head(annotations) 71 | 72 | # Terms 73 | terms <- fetch_quickgo(type = "terms") 74 | 75 | head(terms) 76 | 77 | # Slims 78 | slims <- fetch_quickgo( 79 | type = "slims", 80 | go_id_slims = c("GO:0046872", "GO:0051540") 81 | ) 82 | 83 | head(slims) 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /man/fetch_pdb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fetch_pdb.R 3 | \name{fetch_pdb} 4 | \alias{fetch_pdb} 5 | \title{Fetch structure information from RCSB} 6 | \usage{ 7 | fetch_pdb(pdb_ids, batchsize = 100, show_progress = TRUE) 8 | } 9 | \arguments{ 10 | \item{pdb_ids}{a character vector of PDB identifiers.} 11 | 12 | \item{batchsize}{a numeric value that specifies the number of structures to be processed in a 13 | single query. Default is 100.} 14 | 15 | \item{show_progress}{a logical value that indicates if a progress bar will be shown. Default is 16 | TRUE.} 17 | } 18 | \value{ 19 | A data frame that contains structure metadata for the PDB IDs provided. The data frame 20 | contains some columns that might not be self explanatory. 21 | \itemize{ 22 | \item auth_asym_id: Chain identifier provided by the author of the structure in order to 23 | match the identification used in the publication that describes the structure. 24 | \item label_asym_id: Chain identifier following the standardised convention for mmCIF files. 
25 | \item entity_beg_seq_id, ref_beg_seq_id, length, pdb_sequence: \code{entity_beg_seq_id} is a 26 | position in the structure sequence (\code{pdb_sequence}) that matches the position given in 27 | \code{ref_beg_seq_id}, which is a position within the protein sequence (not included in the 28 | data frame). \code{length} identifies the stretch of sequence for which positions match 29 | accordingly between structure and protein sequence. \code{entity_beg_seq_id} is a residue ID 30 | based on the standardised convention for mmCIF files. 31 | \item auth_seq_id: Residue identifier provided by the author of the structure in order to 32 | match the identification used in the publication that describes the structure. This character 33 | vector has the same length as the \code{pdb_sequence} and each position is the identifier for 34 | the matching amino acid position in \code{pdb_sequence}. The contained values are not 35 | necessarily numbers and the values do not have to be positive. 36 | \item modified_monomer: Is composed of first the composition ID of the modification, followed 37 | by the \code{label_seq_id} position. In parentheses are the parent monomer identifiers as 38 | they appear in the sequence. 39 | \item ligand_*: Any column starting with the \code{ligand_*} prefix contains information about 40 | the position, identity and donors for ligand binding sites. If there are multiple entities of 41 | ligands they are separated by "|". Specific donor level information is separated by ";". 42 | \item secondary_structure: Contains information about helix and sheet secondary structure elements. 43 | Individual regions are separated by ";". 44 | \item unmodeled_structure: Contains information about unmodeled or partially modeled regions in 45 | the model. Individual regions are separated by ";".
46 | \item auth_seq_id_original: In some cases the sequence positions do not match the number of residues 47 | in the sequence either because positions are missing or duplicated. This always coincides with modified 48 | residues, however does not always occur when there is a modified residue in the sequence. This column 49 | contains the original \code{auth_seq_id} information that does not have these positions corrected. 50 | } 51 | } 52 | \description{ 53 | Fetches structure metadata from RCSB. If you want to retrieve atom data such as positions, use 54 | the function \code{fetch_pdb_structure()}. 55 | } 56 | \examples{ 57 | \donttest{ 58 | pdb <- fetch_pdb(pdb_ids = c("6HG1", "1E9I", "6D3Q", "4JHW")) 59 | 60 | head(pdb) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /man/qc_missed_cleavages.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qc_missed_cleavages.R 3 | \name{qc_missed_cleavages} 4 | \alias{qc_missed_cleavages} 5 | \title{Check missed cleavages} 6 | \usage{ 7 | qc_missed_cleavages( 8 | data, 9 | sample, 10 | grouping, 11 | missed_cleavages, 12 | intensity, 13 | remove_na_intensities = TRUE, 14 | method = "count", 15 | plot = FALSE, 16 | interactive = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{data}{a data frame containing at least sample names, peptide or precursor identifiers 21 | and missed cleavage counts for each peptide or precursor.} 22 | 23 | \item{sample}{a character or factor column in the \code{data} data frame that contains the sample name.} 24 | 25 | \item{grouping}{a character column in the \code{data} data frame that contains either precursor or 26 | peptide identifiers.} 27 | 28 | \item{missed_cleavages}{a numeric column in the \code{data} data frame that contains the counts 29 | of missed cleavages per peptide or precursor.} 30 | 31 | \item{intensity}{a 
numeric column in the \code{data} data frame that contains the corresponding 32 | raw or normalised intensity values (not log2) for each peptide or precursor. Required when 33 | "intensity" is chosen as the method.} 34 | 35 | \item{remove_na_intensities}{a logical value that specifies if sample/grouping combinations with 36 | intensities that are NA (not quantified IDs) should be dropped from the data frame for analysis 37 | of missed cleavages. Default is TRUE since we are usually interested in quantifiable peptides. 38 | This is only relevant for method = "count".} 39 | 40 | \item{method}{a character value that indicates the method used for evaluation. "count" 41 | calculates the percentage of missed cleavages based on counts of the corresponding peptide or 42 | precursor, "intensity" calculates the percentage of missed cleavages by intensity of the 43 | corresponding peptide or precursor.} 44 | 45 | \item{plot}{a logical value that indicates whether the result should be plotted.} 46 | 47 | \item{interactive}{a logical value that specifies whether the plot should be interactive 48 | (default is FALSE).} 49 | } 50 | \value{ 51 | A data frame that contains the calculated percentage made up by the sum of all peptides 52 | or precursors containing the corresponding amount of missed cleavages. 53 | } 54 | \description{ 55 | Calculates the percentage of missed cleavages for each sample (by count or intensity). The 56 | default settings remove grouping variables without quantitative information (intensity is NA). 57 | These will not be used for the calculation of missed cleavage percentages. 
58 | } 59 | \examples{ 60 | library(dplyr) 61 | 62 | set.seed(123) # Makes example reproducible 63 | 64 | # Create example data 65 | data <- create_synthetic_data( 66 | n_proteins = 100, 67 | frac_change = 0.05, 68 | n_replicates = 3, 69 | n_conditions = 2, 70 | method = "effect_random" 71 | ) \%>\% 72 | mutate(intensity_non_log2 = 2^peptide_intensity_missing) 73 | 74 | # Calculate missed cleavage percentages 75 | qc_missed_cleavages( 76 | data = data, 77 | sample = sample, 78 | grouping = peptide, 79 | missed_cleavages = n_missed_cleavage, 80 | intensity = intensity_non_log2, 81 | method = "intensity", 82 | plot = FALSE 83 | ) 84 | 85 | # Plot missed cleavages 86 | qc_missed_cleavages( 87 | data = data, 88 | sample = sample, 89 | grouping = peptide, 90 | missed_cleavages = n_missed_cleavage, 91 | intensity = intensity_non_log2, 92 | method = "intensity", 93 | plot = TRUE 94 | ) 95 | } 96 | -------------------------------------------------------------------------------- /R/qc_median_intensities.R: -------------------------------------------------------------------------------- 1 | #' Median run intensities 2 | #' 3 | #' Median intensities per run are returned either as a plot or a table. 4 | #' 5 | #' @param data a data frame that contains at least the input variables. 6 | #' @param sample a character or factor column in the \code{data} data frame that contains the sample name. 7 | #' @param grouping a character column in the \code{data} data frame that contains either precursor or 8 | #' peptide identifiers. 9 | #' @param intensity a numeric column in the \code{data} data frame that contains intensity values. 10 | #' The intensity should be ideally log2 transformed, but also non-transformed values can be used. 11 | #' @param plot a logical value that indicates whether the result should be plotted. 12 | #' @param interactive a logical value that specifies whether the plot should be interactive 13 | #' (default is FALSE). 
#'
#' @return A plot that displays median intensity over all samples. If \code{plot = FALSE} a data
#' frame containing median intensities is returned.
#' @import dplyr
#' @import ggplot2
#' @importFrom plotly ggplotly
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @importFrom stringr str_sort
#' @export
#'
#' @examples
#' set.seed(123) # Makes example reproducible
#'
#' # Create example data
#' data <- create_synthetic_data(
#'   n_proteins = 100,
#'   frac_change = 0.05,
#'   n_replicates = 3,
#'   n_conditions = 2,
#'   method = "effect_random"
#' )
#'
#' # Calculate median intensities
#' qc_median_intensities(
#'   data = data,
#'   sample = sample,
#'   grouping = peptide,
#'   intensity = peptide_intensity_missing,
#'   plot = FALSE
#' )
#'
#' # Plot median intensities
#' qc_median_intensities(
#'   data = data,
#'   sample = sample,
#'   grouping = peptide,
#'   intensity = peptide_intensity_missing,
#'   plot = TRUE
#' )
qc_median_intensities <- function(data,
                                  sample,
                                  grouping,
                                  intensity,
                                  plot = TRUE,
                                  interactive = FALSE) {
  # Collapse duplicated sample/grouping/intensity rows first so that each
  # identifier contributes only once to the median of its sample.
  table <- data %>%
    dplyr::distinct({{ sample }}, {{ grouping }}, {{ intensity }}) %>%
    dplyr::group_by({{ sample }}) %>%
    dplyr::summarize(
      median_intensity = stats::median({{ intensity }}, na.rm = TRUE),
      .groups = "drop"
    )

  if (!plot) {
    return(table)
  }

  # Character sample names are converted to a factor whose levels follow a
  # numeric-aware sort. Columns that are not character (e.g. factors) are
  # left as supplied.
  if (is.character(dplyr::pull(table, {{ sample }}))) {
    table <- table %>%
      dplyr::mutate({{ sample }} := factor({{ sample }},
        levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE))
      ))
  }

  # `group = 1` connects all samples with a single line.
  # NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for line geoms in favour
  # of `linewidth`; switch once that version can be required.
  median_plot <- table %>%
    ggplot2::ggplot(ggplot2::aes({{ sample }}, .data$median_intensity, group = 1)) +
    ggplot2::geom_line(size = 1) +
    ggplot2::labs(title = "Medians of run intensities", x = "", y = "Intensity") +
    ggplot2::theme_bw() +
    ggplot2::theme(
      plot.title = ggplot2::element_text(size = 20),
      axis.title.x = ggplot2::element_text(size = 15),
      axis.text.y = ggplot2::element_text(size = 15),
      axis.text.x = ggplot2::element_text(size = 12, angle = 75, hjust = 1),
      axis.title.y = ggplot2::element_text(size = 15)
    )

  if (!interactive) {
    return(median_plot)
  }

  # Warnings raised during the plotly conversion are suppressed.
  suppressWarnings(plotly::ggplotly(median_plot))
}
-------------------------------------------------------------------------------- /man/barcode_plot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/barcode_plot.R 3 | \name{barcode_plot} 4 | \alias{barcode_plot} 5 | \title{Barcode plot} 6 | \usage{ 7 | barcode_plot( 8 | data, 9 | start_position, 10 | end_position, 11 | protein_length, 12 | coverage = NULL, 13 | colouring = NULL, 14 | fill_colour_gradient = protti::mako_colours, 15 | fill_colour_discrete = c("#999999", protti::protti_colours), 16 | protein_id = NULL, 17 | facet = NULL, 18 | facet_n_col = 4, 19 | cutoffs = NULL 20 | ) 21 | } 22 | \arguments{ 23 | \item{data}{a data frame containing differential abundance, start and end peptide or precursor positions and protein length.} 24 | 25 | \item{start_position}{a numeric column in the data frame containing the start positions for each peptide or precursor.} 26 | 27 | \item{end_position}{a numeric column in the data frame containing the end positions for each peptide or precursor.} 28 | 29 | \item{protein_length}{a numeric column in the data frame containing the length of the protein.} 30 | 31 | \item{coverage}{optional, numeric column in the data frame containing coverage in percent.
Will appear in the title of the barcode if provided.} 32 | 33 | \item{colouring}{optional, column in the data frame containing information by which peptide or precursors should 34 | be colored.} 35 | 36 | \item{fill_colour_gradient}{a vector that contains colours that should be used to create a colour gradient 37 | for the barcode plot bars if the \code{colouring} argument is continuous. Default is \code{mako_colours}.} 38 | 39 | \item{fill_colour_discrete}{a vector that contains colours that should be used to fill the barcode plot bars 40 | if the \code{colouring} argument is discrete. Default is \code{protti_colours}.} 41 | 42 | \item{protein_id}{optional, column in the data frame containing protein identifiers. Required if only one protein 43 | should be plotted and the data frame contains only information for this protein.} 44 | 45 | \item{facet}{optional, column in the data frame containing information by which data should be faceted. This can be 46 | protein identifiers. Only 20 proteins are plotted at a time, the rest is ignored. If more should be plotted, a mapper over a 47 | subsetted data frame should be created.} 48 | 49 | \item{facet_n_col}{a numeric value that specifies the number of columns the faceted plot should have 50 | if a column name is provided to group. The default is 4.} 51 | 52 | \item{cutoffs}{optional argument specifying the log2 fold change and significance cutoffs used for highlighting peptides. 53 | If this argument is provided colouring information will be overwritten with peptides that fulfill this condition. 54 | The cutoff should be provided in a vector of the form c(diff = 2, pval = 0.05). The name of the cutoff should reflect the 55 | column name that contains this information (log2 fold changes, p-values or adjusted p-values).} 56 | } 57 | \value{ 58 | A barcode plot is returned. 59 | } 60 | \description{ 61 | Plots a "barcode plot" - a vertical line for each identified peptide. 
Peptides can be colored based on an additional variable. Also differential 62 | abundance can be displayed. 63 | } 64 | \examples{ 65 | 66 | data <- data.frame( 67 | start = c(5, 40, 55, 130, 181, 195), 68 | end = c(11, 51, 60, 145, 187, 200), 69 | length = rep(200, 6), 70 | pg_protein_accessions = rep("Protein 1", 6), 71 | diff = c(1, 2, 5, 2, 1, 1), 72 | pval = c(0.1, 0.01, 0.01, 0.2, 0.2, 0.01) 73 | ) 74 | 75 | barcode_plot( 76 | data, 77 | start_position = start, 78 | end_position = end, 79 | protein_length = length, 80 | facet = pg_protein_accessions, 81 | cutoffs = c(diff = 2, pval = 0.05) 82 | ) 83 | } 84 | -------------------------------------------------------------------------------- /man/calculate_kegg_enrichment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculate_kegg_enrichment.R 3 | \name{calculate_kegg_enrichment} 4 | \alias{calculate_kegg_enrichment} 5 | \title{Perform KEGG pathway enrichment analysis} 6 | \usage{ 7 | calculate_kegg_enrichment( 8 | data, 9 | protein_id, 10 | is_significant, 11 | pathway_id = pathway_id, 12 | pathway_name = pathway_name, 13 | plot = TRUE, 14 | plot_cutoff = "adj_pval top10" 15 | ) 16 | } 17 | \arguments{ 18 | \item{data}{a data frame that contains at least the input variables.} 19 | 20 | \item{protein_id}{a character column in the \code{data} data frame that contains the protein 21 | accession numbers.} 22 | 23 | \item{is_significant}{a logical column in the \code{data} data frame that indicates if the 24 | corresponding protein has a significantly changing peptide. The input data frame may contain 25 | peptide level information with significance information. The function is able to extract 26 | protein level information from this.} 27 | 28 | \item{pathway_id}{a character column in the \code{data} data frame that contains KEGG pathway 29 | identifiers. 
These can be obtained from KEGG using \code{fetch_kegg}.} 30 | 31 | \item{pathway_name}{a character column in the \code{data} data frame that contains KEGG pathway 32 | names. These can be obtained from KEGG using \code{fetch_kegg}.} 33 | 34 | \item{plot}{a logical value indicating whether the result should be plotted or returned as a 35 | table.} 36 | 37 | \item{plot_cutoff}{a character value indicating if the plot should contain the top 10 most 38 | significant pathways (p-value or adjusted p-value), or if a significance cutoff should be used 39 | to determine the number of pathways in the plot. This information should be provided with the 40 | type first followed by the threshold separated by a space. Examples are 41 | \code{plot_cutoff = "adj_pval top10"}, \code{plot_cutoff = "pval 0.05"} or 42 | \code{plot_cutoff = "adj_pval 0.01"}. The threshold can be chosen freely.} 43 | } 44 | \value{ 45 | A bar plot displaying negative log10 adjusted p-values for the top 10 enriched pathways. 46 | Bars are coloured according to the direction of the enrichment. If \code{plot = FALSE}, a data 47 | frame is returned. 48 | } 49 | \description{ 50 | Analyses enrichment of KEGG pathways associated with proteins in the fraction of significant 51 | proteins compared to all detected proteins. A Fisher's exact test is performed to test 52 | significance of enrichment.
53 | } 54 | \examples{ 55 | \donttest{ 56 | # Load libraries 57 | library(dplyr) 58 | 59 | set.seed(123) # Makes example reproducible 60 | 61 | # Create example data 62 | kegg_data <- fetch_kegg(species = "eco") 63 | 64 | if (!is.null(kegg_data)) { # only proceed if information was retrieved 65 | data <- kegg_data \%>\% 66 | group_by(uniprot_id) \%>\% 67 | mutate(significant = rep( 68 | sample( 69 | x = c(TRUE, FALSE), 70 | size = 1, 71 | replace = TRUE, 72 | prob = c(0.2, 0.8) 73 | ), 74 | n = n() 75 | )) 76 | 77 | # Plot KEGG enrichment 78 | calculate_kegg_enrichment( 79 | data, 80 | protein_id = uniprot_id, 81 | is_significant = significant, 82 | pathway_id = pathway_id, 83 | pathway_name = pathway_name, 84 | plot = TRUE, 85 | plot_cutoff = "pval 0.05" 86 | ) 87 | 88 | # Calculate KEGG enrichment 89 | kegg <- calculate_kegg_enrichment( 90 | data, 91 | protein_id = uniprot_id, 92 | is_significant = significant, 93 | pathway_id = pathway_id, 94 | pathway_name = pathway_name, 95 | plot = FALSE 96 | ) 97 | 98 | head(kegg, n = 10) 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /R/calculate_imputation.R: -------------------------------------------------------------------------------- 1 | #' Sampling of values for imputation 2 | #' 3 | #' \code{calculate_imputation} is a helper function that is used in the \code{impute} function. 4 | #' Depending on the type of missingness and method, it samples values from a normal distribution 5 | #' that can be used for the imputation. Note: The input intensities should be log2 transformed. 6 | #' 7 | #' @param min a numeric value specifying the minimal intensity value of the precursor/peptide. 8 | #' Is only required if \code{method = "ludovic"} and \code{missingness = "MNAR"}. 9 | #' @param noise a numeric value specifying a noise value for the precursor/peptide. Is only 10 | #' required if \code{method = "noise"} and \code{missingness = "MNAR"}. 
#' @param mean a numeric value specifying the mean intensity value of the condition with missing
#' values for a given precursor/peptide. Is only required if \code{missingness = "MAR"}.
#' @param sd a numeric value specifying the mean of the standard deviation of all conditions for
#' a given precursor/peptide.
#' @param missingness a character value specifying the missingness type of the data, which
#' determines how values for imputation are sampled. This can be \code{"MAR"} or \code{"MNAR"}.
#' If the argument is left at its default, \code{"MNAR"} is used. For any other value
#' (including \code{NA}) nothing is sampled and \code{NA} is returned.
#' @param method a character value specifying the method to be used for imputation. For
#' \code{method = "ludovic"}, MNAR missingness is sampled around a value that is three lower
#' (log2) than the lowest intensity value recorded for the precursor/peptide. For
#' \code{method = "noise"}, MNAR missingness is sampled around the noise value for the
#' precursor/peptide. If the argument is left at its default, \code{"ludovic"} is used.
#' @param skip_log2_transform_error a logical value, if FALSE a check is performed to validate that
#' input values are log2 transformed. If input values are > 40 the test is failed and an error is
#' returned.
#'
#' @return A value sampled from a normal distribution with the input parameters. Method specifics
#' are applied to input parameters prior to sampling. \code{NA} is returned if \code{missingness}
#' is neither \code{"MNAR"} nor \code{"MAR"}.
calculate_imputation <-
  function(min = NULL,
           noise = NULL,
           mean = NULL,
           sd,
           missingness = c("MNAR", "MAR"),
           method = c("ludovic", "noise"),
           skip_log2_transform_error = FALSE) {
    # TRUE only for a supplied, non-NA value above 40. NULL (argument not
    # provided) and NA are treated as passing the check. Only the first
    # element is inspected because the inputs are documented as single values.
    exceeds_log2_range <- function(value) {
      isTRUE(value[1] > 40)
    }

    if ((exceeds_log2_range(min) ||
      exceeds_log2_range(mean) ||
      exceeds_log2_range(noise)) &&
      !skip_log2_transform_error) {
      stop(strwrap("Input intensities seem not to be log2 transformed. If they are and you want
                   to proceed set the skip_log2_transform_error argument to TRUE. Notice that
                   this function does not give correct results for non-log2 transformed data.",
        prefix = "\n", initial = ""
      ))
    }

    # An untouched default arrives as the full vector of choices. Resolve it
    # to the first choice (as match.arg() does) so the scalar conditions
    # below do not receive a length-2 value, which is an error in R >= 4.2.
    if (identical(missingness, c("MNAR", "MAR"))) {
      missingness <- "MNAR"
    }

    # Any other missingness label (including NA) is not imputed.
    if (!(missingness %in% c("MNAR", "MAR"))) {
      return(NA)
    }

    # Resolves the default to "ludovic" and rejects unknown methods with a
    # clear error instead of failing later on an undefined result.
    method <- match.arg(method)

    # MAR values are drawn around the condition mean for both methods;
    # MNAR values are drawn around a method specific centre.
    centre <- if (missingness == "MAR") {
      mean
    } else if (method == "ludovic") {
      min - 3
    } else {
      noise
    }

    suppressWarnings(stats::rnorm(1, mean = centre, sd = sd))
  }
-------------------------------------------------------------------------------- /R/calculate_aa_scores.R: -------------------------------------------------------------------------------- 1 | #' Calculate scores for each amino acid position in a protein sequence 2 | #' 3 | #' `r lifecycle::badge("experimental")` 4 | #' Calculate a score for each amino acid position in a protein sequence based on the product of the 5 | #' -log10(adjusted p-value)
#' and the absolute log2(fold change) per peptide covering this amino acid. In detail, all the
#' peptides are aligned along the sequence of the corresponding protein, and the average score per
#' amino acid position is computed. In a limited proteolysis coupled to mass spectrometry (LiP-MS)
#' experiment, the score allows to prioritize and narrow down structurally affected regions.
#'
#' @param data a data frame containing at least the input columns.
#' @param adj_pval a numeric column in the \code{data} data frame containing the adjusted p-value.
#' @param diff a numeric column in the \code{data} data frame containing the log2 fold change.
#' @param start_position a numeric column in the \code{data} data frame containing the start position
#' of a peptide or precursor.
#' @param end_position a numeric column in the data frame containing the end position of a peptide or
#' precursor.
#' @param protein a character column in the data frame containing the protein identifier or name.
#' @param retain_columns a vector indicating if certain columns should be retained from the input
#' data frame. Default is not retaining additional columns \code{retain_columns = NULL}. Specific
#' columns can be retained by providing their names (not in quotations marks, just like other
#' column names, but in a vector).
#'
#' @return A data frame that contains the aggregated scores per amino acid position, enabling to
#' draw fingerprints for each individual protein.
#'
#' @author Patrick Stalder
#' @import dplyr
#' @import tidyr
#' @export
#'
#' @examples
#'
#' data <- data.frame(
#'   pg_protein_accessions = c(rep("protein_1", 10)),
#'   diff = c(2, -3, 1, 2, 3, -3, 5, 1, -0.5, 2),
#'   adj_pval = c(0.001, 0.01, 0.2, 0.05, 0.002, 0.5, 0.4, 0.7, 0.001, 0.02),
#'   start = c(1, 3, 5, 10, 15, 25, 28, 30, 41, 51),
#'   end = c(6, 8, 10, 16, 23, 35, 35, 35, 48, 55)
#' )
#' calculate_aa_scores(
#'   data,
#'   protein = pg_protein_accessions,
#'   diff = diff,
#'   adj_pval = adj_pval,
#'   start_position = start,
#'   end_position = end
#' )
calculate_aa_scores <- function(data,
                                protein,
                                diff = diff,
                                adj_pval = adj_pval,
                                start_position,
                                end_position,
                                retain_columns = NULL) {
  # One score per unique peptide: -log10(adjusted p-value) * |log2 fold change|.
  # Peptides without a fold change or adjusted p-value are dropped.
  peptide_scores <- data %>%
    dplyr::ungroup() %>%
    dplyr::distinct(
      {{ protein }},
      {{ diff }},
      {{ adj_pval }},
      {{ start_position }},
      {{ end_position }}
    ) %>%
    tidyr::drop_na({{ diff }}, {{ adj_pval }}) %>%
    dplyr::mutate(score = -log10({{ adj_pval }}) * abs({{ diff }}))

  # Expand every peptide into one row per residue position it covers.
  residue_scores <- peptide_scores %>%
    dplyr::rowwise() %>%
    dplyr::mutate(residue = list(seq({{ start_position }}, {{ end_position }}))) %>%
    tidyr::unnest("residue")

  # Average the scores of all peptides overlapping a residue and keep a single
  # row per protein and residue.
  output <- residue_scores %>%
    dplyr::group_by({{ protein }}, .data$residue) %>%
    dplyr::mutate(amino_acid_score = mean(.data$score)) %>%
    dplyr::distinct({{ protein }}, .data$residue, .data$amino_acid_score)

  if (missing(retain_columns)) {
    return(output)
  }

  # Every column of the result except the two computed ones is used as join key.
  key_columns <- setdiff(colnames(output), c("residue", "amino_acid_score"))

  # Attach the requested extra columns from the input data to the scores.
  data %>%
    dplyr::select({{ retain_columns }}, dplyr::all_of(key_columns)) %>%
    dplyr::distinct() %>%
    dplyr::right_join(output, by = key_columns)
}
--------------------------------------------------------------------------------