├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── autoTranslate.R ├── diopt.R ├── geneHistory.R ├── homologene.R ├── homologeneData2.R ├── import.R └── updateHomologene.R ├── README.md ├── README.rmd ├── cran-comments.md ├── data-raw ├── homologene2.tsv ├── homologeneData.tsv ├── release └── taxData.tsv ├── data ├── homologeneData.rda ├── homologeneData2.rda ├── homologeneVersion.rda └── taxData.rda ├── docs ├── LICENSE-text.html ├── LICENSE.html ├── README.html ├── authors.html ├── docsearch.css ├── docsearch.js ├── index.html ├── jquery.sticky-kit.min.js ├── link.svg ├── news │ └── index.html ├── pkgdown.css ├── pkgdown.js ├── pkgdown.yml └── reference │ ├── autoTranslate.html │ ├── diopt.html │ ├── getGeneHistory.html │ ├── getGeneInfo.html │ ├── getHomologene.html │ ├── homologene.html │ ├── homologeneData.html │ ├── homologeneData2.html │ ├── homologeneVersion.html │ ├── human2mouse.html │ ├── index.html │ ├── mouse2human.html │ ├── reexports.html │ ├── taxData.html │ ├── updateHomologene.html │ └── updateIDs.html ├── homologene.Rproj ├── man ├── autoTranslate.Rd ├── diopt.Rd ├── getGeneHistory.Rd ├── getGeneInfo.Rd ├── getHomologene.Rd ├── homologene.Rd ├── homologeneData.Rd ├── homologeneData2.Rd ├── homologeneVersion.Rd ├── human2mouse.Rd ├── mouse2human.Rd ├── reexports.Rd ├── taxData.Rd ├── updateHomologene.Rd └── updateIDs.Rd ├── process ├── autoUpdate.sh ├── biomartTests.R ├── dioptMemory.R ├── prepHomologene.R └── prepHomologene2.R └── tests ├── testthat.R └── testthat ├── test_diopt.R ├── test_homologene.R ├── test_utilities.R └── testfiles └── gene_history_trimmed.tsv /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^LICENSE\.md$ 2 | ^cran-comments\.md$ 3 | ^.*\.Rproj$ 4 | ^\.Rproj\.user$ 5 | ^\.httr-oauth$ 6 | ^\.travis\.yml$ 7 | ^data-raw$ 8 | ^process 9 | README.rmd 10 | ^docs$ 11 | ^README_cache$ 12 | ^cache 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .httr-oauth 5 | auth -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | sudo: false 5 | cache: packages 6 | r_github_packages: 7 | - jimhester/covr 8 | after_success: 9 | - Rscript -e 'covr::codecov()' 10 | warnings_are_errors: false -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: homologene 2 | Type: Package 3 | Title: Quick Access to Homologene and Gene Annotation Updates 4 | Version: 1.7.68.23.10.31 5 | Depends: R (>= 3.1.2) 6 | Imports: 7 | dplyr (>= 0.7.4), 8 | magrittr (>= 1.5), 9 | purrr (>= 0.2.5), 10 | readr (>= 1.3.1), 11 | R.utils(>= 2.8.0), 12 | assertthat (>= 0.2.1), 13 | rvest (>= 1.0.0), 14 | xml2 (>= 1.3.2) 15 | Suggests: 16 | testthat (>= 1.0.2) 17 | Date: 2023-10-31 18 | Authors@R: c( 19 | person("Ogan", "Mancarci", email = "ogan.mancarci@gmail.com", role = c("aut", "cre")), 20 | person("Leon","French", role = c('ctb'))) 21 | BugReports: https://github.com/oganm/homologene/issues 22 | URL: https://github.com/oganm/homologene 23 | Description: A wrapper for the homologene database by the National Center for 24 | Biotechnology Information ('NCBI'). It allows searching for gene homologs across 25 | species. Data in this package can be found at . 
26 | The package also includes an updated version of the homologene database where 27 | gene identifiers and symbols are replaced with their latest (at the time of 28 | submission) version and functions to fetch latest annotation data to keep updated. 29 | License: MIT + file LICENSE 30 | LazyData: true 31 | RoxygenNote: 7.2.3 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2019 2 | COPYRIGHT HOLDER: Ogan Mancarci 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2019 Ogan Mancarci 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export("%$%") 4 | export("%<>%") 5 | export("%>%") 6 | export(autoTranslate) 7 | export(diopt) 8 | export(getGeneHistory) 9 | export(getGeneInfo) 10 | export(getHomologene) 11 | export(homologene) 12 | export(human2mouse) 13 | export(mouse2human) 14 | export(updateHomologene) 15 | export(updateIDs) 16 | importFrom(magrittr,"%$%") 17 | importFrom(magrittr,"%<>%") 18 | importFrom(magrittr,"%>%") 19 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # homologene 1.5.68.x 2 | 3 | * Added `diopt` function to make queries at diopt database. 4 | * Further automatic updates to homologeneData2 5 | 6 | # homologene 1.4.68.19.3.24 (since 1.1.68) 7 | 8 | * Added a `NEWS.md` file to track changes to the package. 9 | * Added `autoTranslate` function to allow automated translation of gene symbols or ids. 10 | * `homologeneData2` is added as an updated version of the original homologene database (original database is not updated since 2014). This database includes the latest gene symbols and identifiers for every gene included in the original database. Outside CRAN (github version), this database is updated weekly. 11 | * Version number is extended to include the last update date of homologeneData2. 12 | * `updateHomologene` function is added to allow users create their own updated 13 | versions of homologene. Using `homologeneData2` as a baseline with this function 14 | allows faster updates. 15 | * `getGeneHistory`, `updateIDs` and `getGeneInfo` functions are added to allow users to update arbitrary gene lists with latest symbols and identifiers. 
16 | * All species originally represented in the homologene database are added to the package. -------------------------------------------------------------------------------- /R/autoTranslate.R: -------------------------------------------------------------------------------- 1 | #' Attempt to automatically translate a gene list 2 | #' 3 | #' @description Given a query gene list and a target gene list, the function 4 | #' tries to find the homology pairing that matches the query list to the target list. The query list 5 | #' is a short list of genes while the target list is supposed to represent a large number of genes from the target 6 | #' species. The default output will be the largest possible list. If \code{returnAllPossible = TRUE} then 7 | #' all possible pairings with any matches are returned. It is possible to limit the 8 | #' search by setting \code{possibleOrigins} and \code{possibleTargets}. Note that gene symbols of some species 9 | #' are more similar to each other than others. Using this with small gene lists and without providing any 10 | #' \code{possibleOrigins} or \code{possibleTargets} might return multiple hits, or if \code{returnAllPossible = TRUE} 11 | #' a wrong match can be returned. 12 | #' 13 | #' @param genes A list of genes to match the target. Symbols or NCBI ids 14 | #' @param targetGenes The target list. This list is supposed to represent a large number of genes 15 | #' from the target species. 16 | #' @param possibleOrigins Taxonomic identifiers of possible origin species 17 | #' @param possibleTargets Taxonomic identifiers of possible target species 18 | #' @param returnAllPossible if TRUE returns all possible pairings with non zero gene matches. If FALSE (default) returns the best match 19 | #' @return A data frame if \code{returnAllPossible = FALSE} and a list of data frames if \code{TRUE} 20 | #' @param db Homologene database to use. 
#' Attempt to automatically translate a gene list
#'
#' @description Given a query gene list and a target gene list, the function
#' tries to find the homology pairing that matches the query list to the target
#' list. The query list is a short list of genes while the target list is
#' supposed to represent a large number of genes from the target species. The
#' default output is the pairing with the largest number of matches. If
#' \code{returnAllPossible = TRUE} then all pairings with any matches are
#' returned. It is possible to limit the search by setting
#' \code{possibleOrigins} and \code{possibleTargets}. Note that gene symbols of
#' some species are more similar to each other than others, so using this with
#' small gene lists and without \code{possibleOrigins} or
#' \code{possibleTargets} can produce several candidate pairings.
#'
#' @param genes A list of genes to match the target. Symbols or NCBI ids
#' @param targetGenes The target list. This list is supposed to represent a
#' large number of genes from the target species.
#' @param possibleOrigins Taxonomic identifiers of possible origin species.
#' \code{'human'} and \code{'mouse'} are accepted in place of 9606 and 10090.
#' @param possibleTargets Taxonomic identifiers of possible target species.
#' \code{'human'} and \code{'mouse'} are accepted in place of 9606 and 10090.
#' @param returnAllPossible if TRUE returns all possible pairings with non zero
#' gene matches. If FALSE (default) returns the best match
#' @param db Homologene database to use.
#' @return A data frame if \code{returnAllPossible = FALSE} and a list of data
#' frames if \code{TRUE}. With \code{returnAllPossible = FALSE} an error is
#' raised when no species pairing yields any homologue.
#' @export
autoTranslate <- function(genes,
                          targetGenes,
                          possibleOrigins = NULL,
                          possibleTargets = NULL,
                          returnAllPossible = FALSE,
                          db = homologene::homologeneData) {
  taxa <- unique(db$Taxonomy)

  # combn() interprets a single positive number as seq_len(n), so it must only
  # be called when there really are at least two taxa to pair up
  crossSpecies <- if (length(taxa) >= 2) {
    utils::combn(taxa, 2)
  } else {
    matrix(taxa[0], nrow = 2, ncol = 0)
  }

  # every ordered (origin, target) pair, including a species paired with itself
  pairwise <- cbind(crossSpecies,
                    crossSpecies[c(2, 1), , drop = FALSE],
                    rbind(taxa, taxa, deparse.level = 0))

  if (!is.null(possibleOrigins)) {
    possibleOrigins[possibleOrigins == 'human'] <- 9606
    possibleOrigins[possibleOrigins == 'mouse'] <- 10090
    pairwise <- pairwise[, pairwise[1, ] %in% possibleOrigins, drop = FALSE]
  } else {
    possibleOrigins <- taxa
  }
  if (!is.null(possibleTargets)) {
    possibleTargets[possibleTargets == 'human'] <- 9606
    possibleTargets[possibleTargets == 'mouse'] <- 10090
    pairwise <- pairwise[, pairwise[2, ] %in% possibleTargets, drop = FALSE]
  } else {
    possibleTargets <- taxa
  }

  # only keep species in which at least one of the supplied genes is found
  hasGene <- function(geneList) {
    db$Gene.Symbol %in% geneList | db$Gene.ID %in% geneList
  }
  originTaxa <- unique(db$Taxonomy[db$Taxonomy %in% possibleOrigins & hasGene(genes)])
  targetTaxa <- unique(db$Taxonomy[db$Taxonomy %in% possibleTargets & hasGene(targetGenes)])

  pairwise <- pairwise[, pairwise[1, ] %in% originTaxa, drop = FALSE]
  pairwise <- pairwise[, pairwise[2, ] %in% targetTaxa, drop = FALSE]

  # translate with the database the caller supplied (previously the default
  # database was always used here, ignoring `db`)
  possibleTranslations <- lapply(seq_len(ncol(pairwise)), function(i) {
    homologene(genes, inTax = pairwise[1, i], outTax = pairwise[2, i], db = db)
  })
  possibleTranslations <-
    possibleTranslations[vapply(possibleTranslations, nrow, integer(1)) > 0]

  # number of translated symbols/ids that appear in the target list
  translationCounts <- vapply(possibleTranslations, function(trans) {
    sum(c(trans[, 2], trans[, 4]) %in% targetGenes)
  }, integer(1))

  if (returnAllPossible) {
    return(possibleTranslations[translationCounts != 0])
  }

  if (length(possibleTranslations) == 0) {
    stop('No homology pairing could be found between the query genes and the target genes')
  }

  bestMatch <- which.max(translationCounts)
  if (sum(translationCounts > 0) > 1) {
    nextBest <- max(translationCounts[-bestMatch])
    warning('There are other pairings, best of which has ', nextBest, ' matching genes')
  }
  possibleTranslations[[bestMatch]]
}
#' Query DIOPT database
#'
#' Query DIOPT database (\url{https://www.flyrnai.org/cgi-bin/DRSC_orthologs.pl}) for orthologues.
#' DIOPT database uses multiple tools to find gene orthologues. Sadly they don't have an
#' API so this function queries by visiting the site and filling up the form. By default
#' each query will take a minimum of 10 seconds due to the \code{delay} parameter. This
#' is taken from their robots.txt at the time this function was written.
#' Note that DIOPT is not necessarily in sync with the homologene database as provided in this package.
#'
#' DIOPT does not support all species available in the homologene database. The supported
#' species are:
#'
#' \describe{
#' \item{4896}{Schizosaccharomyces pombe}
#' \item{4932}{Saccharomyces cerevisiae}
#' \item{6239}{Caenorhabditis elegans}
#' \item{7227}{Drosophila melanogaster}
#' \item{7955}{Danio rerio}
#' \item{8364}{Xenopus (Silurana) tropicalis}
#' \item{9606}{Homo sapiens}
#' \item{10090}{Mus musculus}
#' \item{10116}{Rattus norvegicus}
#' \item{3702}{Arabidopsis thaliana}
#' }
#'
#' @param genes A vector of gene identifiers. Anything that DIOPT accepts
#' @param inTax taxid of the species that the input genes are coming from
#' @param outTax taxid of the species that you are seeking homology. 0 to query
#' all species. It must be specified unless paralogue = TRUE
#' @param paralogue If TRUE, searches for paralogues instead of orthologues.
#' outTax cannot be specified when searching for paralogues
#' @param delay How many seconds of delay should be between queries. Default is 10
#' based on the robots.txt at the time this function was written.
#'
#' @return A data frame
#' @export
#'
diopt <- function(genes, inTax, outTax = NULL, paralogue = FALSE, delay = 10) {
  # the site has no API: load the search page and work on its first form
  form <- rvest::html_form(
    rvest::session('https://www.flyrnai.org/cgi-bin/DRSC_orthologs.pl')
  )[[1]]

  if (!paralogue) {
    assertthat::assert_that(!is.null(outTax), msg = 'outTax must be specified when querying orthologues')
    acceptableOutTax <- form$fields$output_species$options
    assertthat::assert_that(outTax %in% acceptableOutTax)
  } else {
    assertthat::assert_that(is.null(outTax), msg = 'outTax cannot be specified when querying paralogues')
    # flip the "active" class from the first field to the second one
    form$fields[[1]]$attr$class <- "btn btn-outline-primary"
    form$fields[[2]]$attr$class <- "btn btn-outline-primary active"
    outTax <- "9606"
  }

  acceptableInTax <- form$fields$input_species$options
  assertthat::assert_that(inTax %in% acceptableInTax)

  form <- rvest::html_form_set(form,
                               input_species = inTax,
                               output_species = outTax,
                               gene_list = paste(genes, collapse = '\n\r'))

  # Removes every field called `fieldName` whose value is a single element
  # different from `keptValue`; all other fields are left untouched.
  dropOtherChoices <- function(fields, fieldName, keptValue) {
    fieldValues <- purrr::map(fields, 'value')
    isNamedField <- names(fieldValues) == fieldName
    isOtherChoice <- purrr::map_lgl(fieldValues, function(x) {
      length(x) == 1 && x != keptValue
    })
    fields[!(isNamedField & isOtherChoice)]
  }

  form$fields <- dropOtherChoices(form$fields, 'additional_filter', 'None')
  form$fields <- dropOtherChoices(form$fields, 'search_datasets', 'All')

  # wait before submitting, as requested by the site's robots.txt
  Sys.sleep(delay)

  response <- rvest::html_form_submit(form, submit = 'submit')

  resultPage <- xml2::read_html(response)
  resultTable <- rvest::html_node(resultPage, '#results-table')
  rvest::html_table(resultTable)
}
#' Download gene symbol information
#'
#' This function downloads the gene_info file from NCBI website and returns the
#' gene symbols for current IDs.
#'
#' @param destfile Path of the output file. If NULL a temp file will be used
#' @param justRead If TRUE and destfile exists, it reads the file instead of
#' downloading the latest one from NCBI
#' @param chunk_size Chunk size to be used with \code{\link[readr]{read_tsv_chunked}}.
#' The gene_info file is big enough to make its intake difficult. If you don't
#' have large amounts of free memory you may have to reduce this number to read
#' the file in smaller chunks
#'
#' @return A data frame with the columns \code{tax_id}, \code{GeneID} and
#' \code{Symbol} for each current gene id
#' @export
#'
getGeneInfo <- function(destfile = NULL, justRead = FALSE, chunk_size = 1000000) {
  if (is.null(destfile)) {
    destfile <- tempfile()
  }

  # re-use the file on disk only when the caller asked for it and it is there
  useExisting <- file.exists(destfile) && justRead
  if (!useExisting) {
    archive <- paste0(destfile, '.gz')
    # the archive is binary, so request a binary-mode download explicitly
    utils::download.file('https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz',
                         archive,
                         mode = 'wb')
    R.utils::gunzip(archive, overwrite = TRUE)
  }

  # each chunk is reduced to its first three columns before being combined
  keepIdColumns <- function(x, pos) {
    x[, c(1, 2, 3)]
  }

  geneInfo <- readr::read_tsv_chunked(destfile,
                                      readr::DataFrameCallback$new(keepIdColumns),
                                      col_names = c('tax_id', 'GeneID', 'Symbol'),
                                      chunk_size = chunk_size, skip = 1,
                                      col_types = 'iic')

  # return visibly; the function used to end in an assignment, which made the
  # result invisible when called at the console
  geneInfo
}
#' Download gene history file
#'
#' Downloads and reads the gene history file from NCBI website. This file is
#' needed for other functions
#'
#' @param destfile Path of the output file. If NULL a temp file will be used
#' @param justRead If TRUE and destfile exists, it reads the file instead of
#' downloading the latest one from NCBI
#'
#' @return A data frame with latest gene history information
#' @export
#'
getGeneHistory <- function(destfile = NULL, justRead = FALSE) {
  if (is.null(destfile)) {
    destfile <- tempfile()
  }

  # skip the download only when an existing file should be re-used
  reuseFile <- !is.null(destfile) && file.exists(destfile) && justRead
  if (!reuseFile) {
    archive <- paste0(destfile, '.gz')
    utils::download.file(url = "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_history.gz",
                         destfile = archive)
    R.utils::gunzip(archive, overwrite = TRUE)
  }

  historyColumns <- c('tax_id',
                      'GeneID',
                      'Discontinued_GeneID',
                      'Discontinued_Symbol',
                      'Discontinue_Date')

  # the first line of the file is skipped and the names above are used instead;
  # GeneID is read as character, the other id/date columns as integer
  readr::read_tsv(destfile,
                  col_names = historyColumns,
                  skip = 1,
                  col_types = 'icici')
}
#' Update gene IDs
#'
#' Given a list of gene ids and gene history information, traces changes in the
#' gene's name to get the latest valid ID
#'
#' @param ids Gene ids
#' @param gene_history Gene history information, probably returned by \code{\link{getGeneHistory}}
#'
#' @return The input vector with discontinued ids replaced: the new id for
#' genes that changed ids, \code{"-"} for discontinued genes without a
#' successor and \code{"multiple events"} when an id is listed as discontinued
#' more than once. Ids that were never discontinued are returned as given.
#' @export
#'
#' @examples
#' \dontrun{
#' gene_history = getGeneHistory()
#' updateIDs(c("4340964", "4349034", "4332470", "4334151", "4323831"),gene_history)
#' }
#'
updateIDs <- function(ids, gene_history) {
  # we do not filter for taxonomy information as some genes use alternative
  # tax ids in non homologene sources
  wasQueried <- gene_history$Discontinued_GeneID %in% as.integer(ids)
  if (!any(wasQueried)) {
    # none of the ids was ever discontinued
    return(ids)
  }

  # events older than the earliest discontinuation of a queried id cannot be
  # part of any trace, dropping them makes the lookups faster
  earliest_date <- min(gene_history$Discontinue_Date[wasQueried])
  relevant_gene_history <-
    gene_history[which(gene_history$Discontinue_Date >= earliest_date), , drop = FALSE]

  idsToProcess <- ids %in% relevant_gene_history$Discontinued_GeneID
  if (any(idsToProcess)) {
    ids[idsToProcess] <- vapply(ids[idsToProcess], function(id) {
      as.character(traceID(id, relevant_gene_history))
    }, character(1), USE.NAMES = FALSE)
  }
  ids
}


# Follows a single id through the gene history until an id that is not
# discontinued is reached. Returns that id, '-' if the chain ends in a gene
# without successor, "multiple events" if an id on the chain is discontinued
# more than once, or the input itself when it is not discontinued at all.
# Raises an error if the history contains a cycle.
traceID <- function(id, gene_history) {
  discontinued <- gene_history$Discontinued_GeneID
  successor <- gene_history$GeneID

  event <- which(discontinued == as.integer(id))
  if (length(event) > 1) {
    # just in case. if the same ID is discontinued twice, there is a problem...
    return("multiple events")
  } else if (length(event) == 0) {
    return(id)
  }

  visited <- integer(0)
  repeat {
    if (event %in% visited) {
      # without this check a history such as A -> B, B -> A never terminates
      stop('Cyclic gene history encountered while tracing id ', id)
    }
    visited <- c(visited, event)

    current <- successor[event]
    if (current == '-') {
      # checking for '-' first allows the integer comparison below
      return('-')
    }

    # see if the new ID is discontinued as well
    next_event <- which(discontinued == as.integer(current))
    if (length(next_event) == 0) {
      # if not, the current ID is the right one
      return(current)
    } else if (length(next_event) > 1) {
      # just in case, if the same ID is discontinued twice, there is a problem...
      return("multiple events")
    }
    # the new ID is discontinued too, continue with its successor
    event <- next_event
  }
}
#' Get the latest homologene file
#'
#' This function downloads the latest homologene file from NCBI. Note that Homologene
#' has not been updated since 2014 so the output will be identical to \code{\link{homologeneData}}
#' included in this package. This function is here for futureproofing purposes.
#'
#' @param destfile Path of the output file. If NULL a temp file will be used
#' @param justRead If TRUE and destfile exists, it reads the file instead of
#' downloading the latest one from NCBI
#'
#' @return A data frame with homology groups, gene ids and gene symbols
#' @export
#'
getHomologene <- function(destfile = NULL, justRead = FALSE) {
  if (is.null(destfile)) {
    destfile <- tempfile()
  }

  # re-use the file on disk only when the caller asked for it and it is there
  useExisting <- file.exists(destfile) && justRead
  if (!useExisting) {
    utils::download.file('https://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data',
                         destfile)
  }

  rawData <- readr::read_tsv(destfile,
                             col_names = c('HID', 'Taxonomy', 'Gene.ID', 'Gene.Symbol',
                                           'Protein.GI', 'Protein.Accession'),
                             col_types = 'iiicic')

  # protein level columns are dropped, which can leave duplicated gene rows
  homologeneData <- dplyr::select(rawData, HID, Gene.ID, Gene.Symbol, Taxonomy)
  homologeneData <- dplyr::arrange(unique(homologeneData), HID)

  # return visibly; the function used to end in a `%<>%` assignment, which made
  # the result invisible when called at the console
  as.data.frame(homologeneData)
}
#' Get homologues of given genes
#' @description Given a list of genes and a taxid, returns a data frame
#' including the genes and their corresponding homologues
#' @param genes A vector of gene symbols or NCBI ids
#' @param inTax taxid of the species that the input genes are coming from
#' @param outTax taxid of the species that you are seeking homology
#' @param db Homologene database to use.
#' @return A data frame with four columns: gene symbol in \code{inTax}, gene
#' symbol in \code{outTax}, NCBI id in \code{inTax} and NCBI id in
#' \code{outTax}. Rows follow the order of \code{genes}; genes without a
#' homologue in \code{outTax} are omitted.
#' @export
#' @examples
#' homologene(c('Eno2','17441'), inTax = 10090, outTax = 9606)
homologene <- function(genes, inTax, outTax, db = homologene::homologeneData) {
  genes <- unique(genes) # remove duplicates
  keptColumns <- c('HID', 'Gene.Symbol', 'Gene.ID')
  db <- as.data.frame(db)

  # query genes, matched either by symbol or by NCBI id
  isQuery <- db$Taxonomy %in% inTax &
    (db$Gene.Symbol %in% genes | db$Gene.ID %in% genes)
  out <- db[isQuery, keptColumns, drop = FALSE]
  names(out)[2] <- inTax
  names(out)[3] <- paste0(inTax, '_ID')

  # members of the same homology groups in the target species
  isHomologue <- db$Taxonomy %in% outTax & db$HID %in% out$HID
  out2 <- db[isHomologue, keptColumns, drop = FALSE]
  names(out2)[2] <- outTax
  names(out2)[3] <- paste0(outTax, '_ID')

  # merge from HID to support translate from self
  output <- merge(out, out2, 'HID')[, c(2, 4, 3, 5), drop = FALSE]

  # preserve the order of `genes`. A row may have been requested through its
  # symbol or through its id, so both are looked up and the earliest position
  # wins (sorting on the symbol alone pushed id queries to the end)
  position <- pmin(match(output[[1]], genes),
                   match(output[[3]], genes),
                   na.rm = TRUE)
  output <- output[order(position), , drop = FALSE]
  rownames(output) <- NULL

  # merge() adds .x/.y suffixes when inTax and outTax are the same
  colnames(output) <- gsub('\\.(x|y)', '', colnames(output))

  output
}

#' Mouse/human wrapper for homologene
#' @param genes A vector of gene symbols or NCBI ids
#' @param db Homologene database to use.
#' @return A data frame with the columns \code{mouseGene}, \code{humanGene},
#' \code{mouseID} and \code{humanID}
#' @export
#' @examples
#' mouse2human(c('Eno2','17441'))
mouse2human <- function(genes, db = homologene::homologeneData) {
  out <- homologene(genes, 10090, 9606, db)
  names(out) <- c('mouseGene', 'humanGene', 'mouseID', 'humanID')
  out
}
#' Human/mouse wrapper for homologene
#' @param genes A vector of gene symbols or NCBI ids
#' @param db Homologene database to use.
#' @export
#' @examples
#' human2mouse(c('ENO2','4340'))
human2mouse <- function(genes, db = homologene::homologeneData) {
  # 9606 is the taxid used for human, 10090 the one used for mouse
  translated <- homologene(genes, 9606, 10090, db)
  names(translated) <- c('humanGene', 'mouseGene', 'humanID', 'mouseID')
  translated
}
This is done by downloading 4 | #' the latest gene annotation information and tracing changes in gene symbols and 5 | #' identifiers over history. \code{\link{homologeneData2}} was created using 6 | #' this function over the original \code{\link{homologeneData}}. This function 7 | #' requires downloading large amounts of data from the NCBI ftp servers. 8 | #' 9 | #' @param destfile Optional. Path of the output file. 10 | #' @param baseline The baseline homologene file to be used. By default uses the 11 | #' \code{\link{homologeneData2}} that is included in this package. The more ids 12 | #' to update, the more time is needed for the update which is why the default option 13 | #' uses an already updated version of the original database. 14 | #' @param gene_history A gene history data frame, possibly returned by \code{\link{getGeneHistory}} 15 | #' function. Use this if you want to have a static gene_history file to update up to a specific date. 16 | #' An up to date gene_history object can be set to update to a specific date by trimming 17 | #' rows that have recent dates. Note that the same is not possible for the gene_info. 18 | #' If not provided, the latest file will be downloaded. 19 | #' @param gene_info A gene info data frame that contains ID-symbol matches, 20 | #' possibly returned by \code{\link{getGeneInfo}}. Use this if you 21 | #' want a static version. Should be in sync with the gene_history file. Note that there is 22 | #' no easy way to track changes in gene symbols back in time so if you want to update it up 23 | #' to a specific date, make sure you don't lose that file. 
#' Update homologene database
#'
#' Creates an updated version of the homologene database. This is done by downloading
#' the latest gene annotation information and tracing changes in gene symbols and
#' identifiers over history. \code{\link{homologeneData2}} was created using
#' this function over the original \code{\link{homologeneData}}. This function
#' requires downloading large amounts of data from the NCBI ftp servers.
#'
#' @param destfile Optional. Path of the output file. When given, the result is
#' also written there as a tab separated file.
#' @param baseline The baseline homologene file to be used. By default uses the
#' \code{\link{homologeneData2}} that is included in this package. The more ids
#' to update, the more time is needed for the update which is why the default option
#' uses an already updated version of the original database.
#' @param gene_history A gene history data frame, possibly returned by \code{\link{getGeneHistory}}
#' function. Use this if you want to have a static gene_history file to update up to a specific date.
#' An up to date gene_history object can be set to update to a specific date by trimming
#' rows that have recent dates. Note that the same is not possible for the gene_info.
#' If not provided, the latest file will be downloaded.
#' @param gene_info A gene info data frame that contains ID-symbol matches,
#' possibly returned by \code{\link{getGeneInfo}}. Use this if you
#' want a static version. Should be in sync with the gene_history file. Note that there is
#' no easy way to track changes in gene symbols back in time so if you want to update it up
#' to a specific date, make sure you don't lose that file.
#'
#' @return Homologene database in a data frame with updated gene IDs and symbols.
#' Genes that were discontinued without a successor are removed, and so are
#' genes whose history could not be traced unambiguously (with a warning).
#' @export
#'
updateHomologene <- function(destfile = NULL,
                             baseline = homologene::homologeneData2,
                             gene_history = NULL,
                             gene_info = NULL) {

  if (is.null(gene_history)) {
    message('acquiring gene history data')
    gene_history <- getGeneHistory()
  }

  # split the baseline into rows whose id was discontinued and the rest
  discontinued_ids <- baseline %>%
    dplyr::filter(Gene.ID %in% gene_history$Discontinued_GeneID)
  unchanged_ids <- baseline %>%
    dplyr::filter(!Gene.ID %in% gene_history$Discontinued_GeneID)

  message('Tracing discontinued IDs. This might take a while.')
  new_ids <- updateIDs(discontinued_ids$Gene.ID, gene_history)

  # updateIDs marks ids it cannot resolve with "multiple events". These used to
  # be turned into NA gene ids by the integer conversion below; drop them
  # explicitly instead, together with the '-' ids that have no successor
  isAmbiguous <- new_ids %in% 'multiple events'
  if (any(isAmbiguous)) {
    warning(sum(isAmbiguous),
            ' gene ID(s) could not be traced unambiguously and were removed')
  }
  isResolved <- !isAmbiguous & new_ids != '-'

  # create a frame with new ids
  discontinued_fix <- data.frame(HID = discontinued_ids$HID[isResolved],
                                 Gene.Symbol = discontinued_ids$Gene.Symbol[isResolved],
                                 Taxonomy = discontinued_ids$Taxonomy[isResolved],
                                 Gene.ID = new_ids[isResolved],
                                 stringsAsFactors = FALSE)

  new_homo_frame <- rbind(discontinued_fix, unchanged_ids) %>%
    dplyr::arrange(HID) %>%
    dplyr::mutate(Gene.ID = as.integer(Gene.ID))

  if (is.null(gene_info)) {
    message('Downloading gene symbol information')
    gene_info <- getGeneInfo()
  }

  message('Updating gene symbols')
  # symbols are taken from gene_info; ids missing there get an NA symbol
  matchToHomologene <- match(new_homo_frame$Gene.ID, gene_info$GeneID)
  new_homo_frame$Gene.Symbol <- gene_info$Symbol[matchToHomologene]

  # remove convergent gene ids with same HIDs
  new_homo_frame <- unique(new_homo_frame)

  if (!is.null(destfile)) {
    utils::write.table(new_homo_frame, destfile,
                       sep = '\t', row.names = FALSE, quote = FALSE)
  }

  return(new_homo_frame)
}
40 | ```{r} 41 | homologene(c('Eno2','Mog'), inTax = 10090, outTax = 9606) 42 | 43 | homologene(c('Eno2','17441'), inTax = 10090, outTax = 9606) 44 | ``` 45 | 46 | For mouse and human, two convenience functions exist that remove the need to provide taxonomic identifiers. Note that the column names are not the same as the `homologene` output. 47 | ```{r} 48 | mouse2human(c('Eno2','Mog')) 49 | human2mouse(c('ENO2','MOG','GZMH')) 50 | ``` 51 | 52 | 53 | homologeneData2 54 | ================= 55 | The original homologene database has not been updated since 2014. 56 | This package also includes an updated version of the homologene database that 57 | replaces gene symbols and identifiers with their latest versions. For the procedure followed for updating, 58 | see [this blog post](https://oganm.com/homologene-update/) and/or see the [processing code](R/updateHomologene.R). 59 | 60 | Using the updated version can help you match genes that cannot be matched due to out-of-date annotations. 61 | 62 | 63 | ```{r} 64 | mouse2human(c('Mesd', 65 | 'Trp53rka', 66 | 'Cstdc4', 67 | 'Ifit3b')) 68 | 69 | 70 | mouse2human(c('Mesd', 71 | 'Trp53rka', 72 | 'Cstdc4', 73 | 'Ifit3b'), 74 | db = homologeneData2) 75 | ``` 76 | 77 | 78 | The `homologeneData2` object that comes with the GitHub version of this package 79 | is updated weekly but if you are using the CRAN version and want the latest 80 | annotations, or if you want to keep 81 | a frozen version of homologene, you can use the `updateHomologene` function. 82 | 83 | 84 | ```r 85 | homologeneDataVeryNew = updateHomologene() # update the homologene database with the latest identifiers 86 | 87 | mouse2human(c('Mesd', 88 | 'Trp53rka', 89 | 'Cstdc4', 90 | 'Ifit3b'), 91 | db = homologeneDataVeryNew) 92 | 93 | ``` 94 | 95 | 96 | Gene ID synchronization 97 | ========================= 98 | 99 | The package also includes functions that were used to create the `homologeneData2`, for updating outdated gene symbols and identifiers. 
100 | 101 | ```{r, cache = TRUE} 102 | library(dplyr) 103 | 104 | gene_history = getGeneHistory() 105 | oldIds = c(4340964, 4349034, 4332470, 4334151, 4323831) 106 | newIds = updateIDs(oldIds,gene_history) 107 | print(newIds) 108 | # get the latest gene symbols for the ids 109 | 110 | gene_info = getGeneInfo() 111 | 112 | gene_info %>% 113 | dplyr::filter(GeneID %in% as.integer(newIds)) # faster to match integers 114 | 115 | ``` 116 | 117 | Querying DIOPT 118 | ============== 119 | 120 | Instead of using just homologene, one can also make queries into the [DIOPT database](https://www.flyrnai.org/cgi-bin/DRSC_orthologs.pl). DIOPT uses multiple databases 121 | to find gene homologs/orthologues. Note that this function has a `delay` parameter 122 | that is set to 10 seconds by default. This was done to obey the `robots.txt` of their website. 123 | 124 | ```{r, cache = TRUE} 125 | 126 | diopt(c('GZMH'),inTax = 9606, outTax = 10090) %>% 127 | knitr::kable() 128 | 129 | diopt(c('Eno2','Mog'),inTax = 10090, outTax =9606) %>% 130 | knitr::kable() 131 | 132 | ``` 133 | 134 | 135 | Mishaps 136 | ================= 137 | As of version 1.1.68, the output now includes NCBI ids. Since it doesn't change any of the existing column names or their order, this shouldn't cause problems in most use cases. 138 | 139 | If you can't find a gene you are looking for, it may have synonyms. See the [geneSynonym](https://github.com/oganm/geneSynonym.git) package to find them. If you have other problems, open an issue or send an email. 140 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Test environments 2 | * local ubuntu 16.04, R 3.5.2 3 | * ubuntu 14.04 (on travis-ci), R 3.5.2 4 | 5 | ## R CMD check results 6 | 7 | 0 errors | 0 warnings | 0 notes 8 | 9 | * This is a resubmission. 
10 | * License file is fixed 11 | * Date is updated 12 | -------------------------------------------------------------------------------- /data-raw/release: -------------------------------------------------------------------------------- 1 | 68 2 | -------------------------------------------------------------------------------- /data-raw/taxData.tsv: -------------------------------------------------------------------------------- 1 | tax_id name_txt 2 | 10090 Mus musculus 3 | 10116 Rattus norvegicus 4 | 28985 Kluyveromyces lactis 5 | 318829 Magnaporthe oryzae 6 | 33169 Eremothecium gossypii 7 | 3702 Arabidopsis thaliana 8 | 4530 Oryza sativa 9 | 4896 Schizosaccharomyces pombe 10 | 4932 Saccharomyces cerevisiae 11 | 5141 Neurospora crassa 12 | 6239 Caenorhabditis elegans 13 | 7165 Anopheles gambiae 14 | 7227 Drosophila melanogaster 15 | 7955 Danio rerio 16 | 8364 Xenopus (Silurana) tropicalis 17 | 9031 Gallus gallus 18 | 9544 Macaca mulatta 19 | 9598 Pan troglodytes 20 | 9606 Homo sapiens 21 | 9615 Canis lupus familiaris 22 | 9913 Bos taurus 23 | -------------------------------------------------------------------------------- /data/homologeneData.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oganm/homologene/9a9f99c4b596ccdd05a1ea1d7f62323bffb3b721/data/homologeneData.rda -------------------------------------------------------------------------------- /data/homologeneData2.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oganm/homologene/9a9f99c4b596ccdd05a1ea1d7f62323bffb3b721/data/homologeneData2.rda -------------------------------------------------------------------------------- /data/homologeneVersion.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oganm/homologene/9a9f99c4b596ccdd05a1ea1d7f62323bffb3b721/data/homologeneVersion.rda 
-------------------------------------------------------------------------------- /data/taxData.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oganm/homologene/9a9f99c4b596ccdd05a1ea1d7f62323bffb3b721/data/taxData.rda -------------------------------------------------------------------------------- /docs/LICENSE-text.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | License • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
51 |
52 | 95 | 96 | 97 |
98 | 99 |
100 |
101 | 104 | 105 |
YEAR: 2019
106 | COPYRIGHT HOLDER: Ogan Mancarci
107 | 
108 | 109 |
110 | 111 |
112 | 113 | 114 |
115 | 118 | 119 |
120 |

Site built with pkgdown 1.3.0.

121 |
122 |
123 |
124 | 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /docs/LICENSE.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | MIT License • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
51 |
52 | 95 | 96 | 97 |
98 | 99 |
100 |
101 | 104 | 105 |
106 | 107 |

Copyright (c) 2019 Ogan Mancarci

108 |

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

109 |

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

110 |

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

111 |
112 | 113 |
114 | 115 |
116 | 117 | 118 |
119 | 122 | 123 |
124 |

Site built with pkgdown 1.3.0.

125 |
126 |
127 |
128 | 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /docs/README.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | /home/omancarci/git repos/homologene/README.md • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 38 | 39 | 40 | 41 | 42 | 43 |
44 |
45 | 79 | 80 | 81 |
82 | 83 |
84 |
85 | 88 | 89 | 90 |
91 |

92 | homologene

93 |

Build Status codecov CRAN_Status_Badge

94 |

An r package that works as a wrapper to homologene

95 |

Available species are

96 |
    97 |
  • Homo sapiens
  • 98 |
  • Mus musculus
  • 99 |
  • Rattus norvegicus
  • 100 |
  • Danio rerio
  • 101 |
  • Caenorhabditis elegans
  • 102 |
  • Drosophila melanogaster
  • 103 |
  • Rhesus macaque
  • 104 |
105 |

More species can be added on request

106 |
107 |
108 |

109 | Installation

110 |
install.packages('homologene')
111 |

or

112 |
devtools::install_github('oganm/homologene')
113 |
114 |
115 |

116 | Usage

117 |

Basic homologene function requires a list of gene symbols or NCBI ids, and an inTax and an outTax. In this example, inTax is the taxon id of mus musculus while outTax is for humans.

118 |
homologene(c('Eno2','Mog'), inTax = 10090, outTax = 9606)
119 |
##   10090 9606 10090_ID 9606_ID
120 | ## 1  Eno2 ENO2    13807    2026
121 | ## 2   Mog  MOG    17441    4340
122 |
homologene(c('Eno2','17441'), inTax = 10090, outTax = 9606)
123 |
##   10090 9606 10090_ID 9606_ID
124 | ## 1  Eno2 ENO2    13807    2026
125 | ## 2   Mog  MOG    17441    4340
126 |

For mouse and human, two convenience functions exist that remove the need to provide taxonomic identifiers. Note that the column names are not the same as the homologene output.

127 |
mouse2human(c('Eno2','Mog'))
128 |
##   mouseGene humanGene mouseID humanID
129 | ## 1      Eno2      ENO2   13807    2026
130 | ## 2       Mog       MOG   17441    4340
131 |
human2mouse(c('ENO2','MOG','GZMH'))
132 |
##   humanGene mouseGene humanID mouseID
133 | ## 1      ENO2      Eno2    2026   13807
134 | ## 2       MOG       Mog    4340   17441
135 | ## 3      GZMH      Gzmd    2999   14941
136 | ## 4      GZMH      Gzme    2999   14942
137 | ## 5      GZMH      Gzmg    2999   14944
138 | ## 6      GZMH      Gzmf    2999   14943
139 |
140 |
141 |

142 | Mishaps

143 |

As of version 1.1.68, the output now includes NCBI ids. Since it doesn’t change any of the existing column names or their order, this shouldn’t cause problems in most use cases. If this is an issue for you please notify me.

144 |

If you can’t find a gene you are looking for, it may have synonyms. See the geneSynonym package to find them. If you have other problems, open an issue or send an email.

145 |
146 | 147 | 148 |
149 | 150 |
151 | 152 | 153 |
154 | 157 | 158 |
159 |

Site built with pkgdown.

160 |
161 | 162 |
163 |
164 | 165 | 166 | 167 | 168 | -------------------------------------------------------------------------------- /docs/authors.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Authors • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
51 |
52 | 95 | 96 | 97 |
98 | 99 |
100 |
101 | 104 | 105 |
    106 |
  • 107 |

    Ogan Mancarci. Author, maintainer. 108 |

    109 |
  • 110 |
  • 111 |

    Leon French. Contributor. 112 |

    113 |
  • 114 |
115 | 116 |
117 | 118 |
119 | 120 | 121 |
122 | 125 | 126 |
127 |

Site built with pkgdown 1.3.0.

128 |
129 |
130 |
131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /docs/docsearch.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | // register a handler to move the focus to the search bar 4 | // upon pressing shift + "/" (i.e. "?") 5 | $(document).on('keydown', function(e) { 6 | if (e.shiftKey && e.keyCode == 191) { 7 | e.preventDefault(); 8 | $("#search-input").focus(); 9 | } 10 | }); 11 | 12 | $(document).ready(function() { 13 | // do keyword highlighting 14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ 15 | var mark = function() { 16 | 17 | var referrer = document.URL ; 18 | var paramKey = "q" ; 19 | 20 | if (referrer.indexOf("?") !== -1) { 21 | var qs = referrer.substr(referrer.indexOf('?') + 1); 22 | var qs_noanchor = qs.split('#')[0]; 23 | var qsa = qs_noanchor.split('&'); 24 | var keyword = ""; 25 | 26 | for (var i = 0; i < qsa.length; i++) { 27 | var currentParam = qsa[i].split('='); 28 | 29 | if (currentParam.length !== 2) { 30 | continue; 31 | } 32 | 33 | if (currentParam[0] == paramKey) { 34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); 35 | } 36 | } 37 | 38 | if (keyword !== "") { 39 | $(".contents").unmark({ 40 | done: function() { 41 | $(".contents").mark(keyword); 42 | } 43 | }); 44 | } 45 | } 46 | }; 47 | 48 | mark(); 49 | }); 50 | }); 51 | 52 | /* Search term highlighting ------------------------------*/ 53 | 54 | function matchedWords(hit) { 55 | var words = []; 56 | 57 | var hierarchy = hit._highlightResult.hierarchy; 58 | // loop to fetch from lvl0, lvl1, etc. 
59 | for (var idx in hierarchy) { 60 | words = words.concat(hierarchy[idx].matchedWords); 61 | } 62 | 63 | var content = hit._highlightResult.content; 64 | if (content) { 65 | words = words.concat(content.matchedWords); 66 | } 67 | 68 | // return unique words 69 | var words_uniq = [...new Set(words)]; 70 | return words_uniq; 71 | } 72 | 73 | function updateHitURL(hit) { 74 | 75 | var words = matchedWords(hit); 76 | var url = ""; 77 | 78 | if (hit.anchor) { 79 | url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; 80 | } else { 81 | url = hit.url + '?q=' + escape(words.join(" ")); 82 | } 83 | 84 | return url; 85 | } 86 | -------------------------------------------------------------------------------- /docs/jquery.sticky-kit.min.js: -------------------------------------------------------------------------------- 1 | /* 2 | Sticky-kit v1.1.2 | WTFPL | Leaf Corcoran 2015 | http://leafo.net 3 | */ 4 | (function(){var b,f;b=this.jQuery||window.jQuery;f=b(window);b.fn.stick_in_parent=function(d){var A,w,J,n,B,K,p,q,k,E,t;null==d&&(d={});t=d.sticky_class;B=d.inner_scrolling;E=d.recalc_every;k=d.parent;q=d.offset_top;p=d.spacer;w=d.bottoming;null==q&&(q=0);null==k&&(k=void 0);null==B&&(B=!0);null==t&&(t="is_stuck");A=b(document);null==w&&(w=!0);J=function(a,d,n,C,F,u,r,G){var v,H,m,D,I,c,g,x,y,z,h,l;if(!a.data("sticky_kit")){a.data("sticky_kit",!0);I=A.height();g=a.parent();null!=k&&(g=g.closest(k)); 5 | if(!g.length)throw"failed to find stick parent";v=m=!1;(h=null!=p?p&&a.closest(p):b("
"))&&h.css("position",a.css("position"));x=function(){var c,f,e;if(!G&&(I=A.height(),c=parseInt(g.css("border-top-width"),10),f=parseInt(g.css("padding-top"),10),d=parseInt(g.css("padding-bottom"),10),n=g.offset().top+c+f,C=g.height(),m&&(v=m=!1,null==p&&(a.insertAfter(h),h.detach()),a.css({position:"",top:"",width:"",bottom:""}).removeClass(t),e=!0),F=a.offset().top-(parseInt(a.css("margin-top"),10)||0)-q, 6 | u=a.outerHeight(!0),r=a.css("float"),h&&h.css({width:a.outerWidth(!0),height:u,display:a.css("display"),"vertical-align":a.css("vertical-align"),"float":r}),e))return l()};x();if(u!==C)return D=void 0,c=q,z=E,l=function(){var b,l,e,k;if(!G&&(e=!1,null!=z&&(--z,0>=z&&(z=E,x(),e=!0)),e||A.height()===I||x(),e=f.scrollTop(),null!=D&&(l=e-D),D=e,m?(w&&(k=e+u+c>C+n,v&&!k&&(v=!1,a.css({position:"fixed",bottom:"",top:c}).trigger("sticky_kit:unbottom"))),eb&&!v&&(c-=l,c=Math.max(b-u,c),c=Math.min(q,c),m&&a.css({top:c+"px"})))):e>F&&(m=!0,b={position:"fixed",top:c},b.width="border-box"===a.css("box-sizing")?a.outerWidth()+"px":a.width()+"px",a.css(b).addClass(t),null==p&&(a.after(h),"left"!==r&&"right"!==r||h.append(a)),a.trigger("sticky_kit:stick")),m&&w&&(null==k&&(k=e+u+c>C+n),!v&&k)))return v=!0,"static"===g.css("position")&&g.css({position:"relative"}), 8 | a.css({position:"absolute",bottom:d,top:"auto"}).trigger("sticky_kit:bottom")},y=function(){x();return l()},H=function(){G=!0;f.off("touchmove",l);f.off("scroll",l);f.off("resize",y);b(document.body).off("sticky_kit:recalc",y);a.off("sticky_kit:detach",H);a.removeData("sticky_kit");a.css({position:"",bottom:"",top:"",width:""});g.position("position","");if(m)return null==p&&("left"!==r&&"right"!==r||a.insertAfter(h),h.remove()),a.removeClass(t)},f.on("touchmove",l),f.on("scroll",l),f.on("resize", 9 | y),b(document.body).on("sticky_kit:recalc",y),a.on("sticky_kit:detach",H),setTimeout(l,0)}};n=0;for(K=this.length;n 2 | 3 | 5 | 8 | 12 | 13 | 
-------------------------------------------------------------------------------- /docs/news/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Changelog • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
51 |
52 | 95 | 96 | 97 |
98 | 99 |
100 |
101 | 105 | 106 |
107 |

108 | homologene 1.5.68.x Unreleased 109 |

110 |
    111 |
  • Added diopt function to make queries at diopt database.
  • 112 |
  • Further automatic updates to homologeneData2
  • 113 |
114 |
115 |
116 |

117 | homologene 1.4.68.19.3.24 (since 1.1.68) Unreleased 118 |

119 |
    120 |
  • Added a NEWS.md file to track changes to the package.
  • 121 |
  • Added autoTranslate function to allow automated translation of gene symbols or ids.
  • 122 |
  • 123 | homologeneData2 is added as an updated version of the original homologene database (original database is not updated since 2014). This database includes the latest gene symbols and identifiers for every gene included in the original database. Outside CRAN (github version), this database is updated weekly.
  • 124 |
  • Version number is extended to include the last update date of homologeneData2.
  • 125 |
126 | updateHomologene function is added to allow users to create their own updated versions of homologene. Using homologeneData2 as a baseline with this function allows faster updates.
  • 127 |
  • 128 | getGeneHistory, updateIDs and getGeneInfo functions are added to allow users to update arbitrary gene lists with latest symbols and identifiers.
  • 129 |
All species originally represented in the homologene database are added to the package.
  • 130 |
131 |
132 |
133 | 134 | 143 | 144 |
145 | 146 |
147 | 150 | 151 |
152 |

Site built with pkgdown 1.3.0.

153 |
154 |
155 |
156 | 157 | 158 | 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /docs/pkgdown.css: -------------------------------------------------------------------------------- 1 | /* Sticky footer */ 2 | 3 | /** 4 | * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ 5 | * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css 6 | * 7 | * .Site -> body > .container 8 | * .Site-content -> body > .container .row 9 | * .footer -> footer 10 | * 11 | * Key idea seems to be to ensure that .container and __all its parents__ 12 | * have height set to 100% 13 | * 14 | */ 15 | 16 | html, body { 17 | height: 100%; 18 | } 19 | 20 | body > .container { 21 | display: flex; 22 | height: 100%; 23 | flex-direction: column; 24 | 25 | padding-top: 60px; 26 | } 27 | 28 | body > .container .row { 29 | flex: 1 0 auto; 30 | } 31 | 32 | footer { 33 | margin-top: 45px; 34 | padding: 35px 0 36px; 35 | border-top: 1px solid #e5e5e5; 36 | color: #666; 37 | display: flex; 38 | flex-shrink: 0; 39 | } 40 | footer p { 41 | margin-bottom: 0; 42 | } 43 | footer div { 44 | flex: 1; 45 | } 46 | footer .pkgdown { 47 | text-align: right; 48 | } 49 | footer p { 50 | margin-bottom: 0; 51 | } 52 | 53 | img.icon { 54 | float: right; 55 | } 56 | 57 | img { 58 | max-width: 100%; 59 | } 60 | 61 | /* Fix bug in bootstrap (only seen in firefox) */ 62 | summary { 63 | display: list-item; 64 | } 65 | 66 | /* Typographic tweaking ---------------------------------*/ 67 | 68 | .contents .page-header { 69 | margin-top: calc(-60px + 1em); 70 | } 71 | 72 | /* Section anchors ---------------------------------*/ 73 | 74 | a.anchor { 75 | margin-left: -30px; 76 | display:inline-block; 77 | width: 30px; 78 | height: 30px; 79 | visibility: hidden; 80 | 81 | background-image: url(./link.svg); 82 | background-repeat: no-repeat; 83 | background-size: 20px 20px; 84 | background-position: center center; 
85 | } 86 | 87 | .hasAnchor:hover a.anchor { 88 | visibility: visible; 89 | } 90 | 91 | @media (max-width: 767px) { 92 | .hasAnchor:hover a.anchor { 93 | visibility: hidden; 94 | } 95 | } 96 | 97 | 98 | /* Fixes for fixed navbar --------------------------*/ 99 | 100 | .contents h1, .contents h2, .contents h3, .contents h4 { 101 | padding-top: 60px; 102 | margin-top: -40px; 103 | } 104 | 105 | /* Static header placement on mobile devices */ 106 | @media (max-width: 767px) { 107 | .navbar-fixed-top { 108 | position: absolute; 109 | } 110 | .navbar { 111 | padding: 0; 112 | } 113 | } 114 | 115 | 116 | /* Sidebar --------------------------*/ 117 | 118 | #sidebar { 119 | margin-top: 30px; 120 | } 121 | #sidebar h2 { 122 | font-size: 1.5em; 123 | margin-top: 1em; 124 | } 125 | 126 | #sidebar h2:first-child { 127 | margin-top: 0; 128 | } 129 | 130 | #sidebar .list-unstyled li { 131 | margin-bottom: 0.5em; 132 | } 133 | 134 | .orcid { 135 | height: 16px; 136 | vertical-align: middle; 137 | } 138 | 139 | /* Reference index & topics ----------------------------------------------- */ 140 | 141 | .ref-index th {font-weight: normal;} 142 | 143 | .ref-index td {vertical-align: top;} 144 | .ref-index .icon {width: 40px;} 145 | .ref-index .alias {width: 40%;} 146 | .ref-index-icons .alias {width: calc(40% - 40px);} 147 | .ref-index .title {width: 60%;} 148 | 149 | .ref-arguments th {text-align: right; padding-right: 10px;} 150 | .ref-arguments th, .ref-arguments td {vertical-align: top;} 151 | .ref-arguments .name {width: 20%;} 152 | .ref-arguments .desc {width: 80%;} 153 | 154 | /* Nice scrolling for wide elements --------------------------------------- */ 155 | 156 | table { 157 | display: block; 158 | overflow: auto; 159 | } 160 | 161 | /* Syntax highlighting ---------------------------------------------------- */ 162 | 163 | pre { 164 | word-wrap: normal; 165 | word-break: normal; 166 | border: 1px solid #eee; 167 | } 168 | 169 | pre, code { 170 | background-color: #f8f8f8; 
171 | color: #333; 172 | } 173 | 174 | pre code { 175 | overflow: auto; 176 | word-wrap: normal; 177 | white-space: pre; 178 | } 179 | 180 | pre .img { 181 | margin: 5px 0; 182 | } 183 | 184 | pre .img img { 185 | background-color: #fff; 186 | display: block; 187 | height: auto; 188 | } 189 | 190 | code a, pre a { 191 | color: #375f84; 192 | } 193 | 194 | a.sourceLine:hover { 195 | text-decoration: none; 196 | } 197 | 198 | .fl {color: #1514b5;} 199 | .fu {color: #000000;} /* function */ 200 | .ch,.st {color: #036a07;} /* string */ 201 | .kw {color: #264D66;} /* keyword */ 202 | .co {color: #888888;} /* comment */ 203 | 204 | .message { color: black; font-weight: bolder;} 205 | .error { color: orange; font-weight: bolder;} 206 | .warning { color: #6A0366; font-weight: bolder;} 207 | 208 | /* Clipboard --------------------------*/ 209 | 210 | .hasCopyButton { 211 | position: relative; 212 | } 213 | 214 | .btn-copy-ex { 215 | position: absolute; 216 | right: 0; 217 | top: 0; 218 | visibility: hidden; 219 | } 220 | 221 | .hasCopyButton:hover button.btn-copy-ex { 222 | visibility: visible; 223 | } 224 | 225 | /* mark.js ----------------------------*/ 226 | 227 | mark { 228 | background-color: rgba(255, 255, 51, 0.5); 229 | border-bottom: 2px solid rgba(255, 153, 51, 0.3); 230 | padding: 1px; 231 | } 232 | 233 | /* vertical spacing after htmlwidgets */ 234 | .html-widget { 235 | margin-bottom: 10px; 236 | } 237 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | /* http://gregfranko.com/blog/jquery-best-practices/ */ 2 | (function($) { 3 | $(function() { 4 | 5 | $("#sidebar") 6 | .stick_in_parent({offset_top: 40}) 7 | .on('sticky_kit:bottom', function(e) { 8 | $(this).parent().css('position', 'static'); 9 | }) 10 | .on('sticky_kit:unbottom', function(e) { 11 | $(this).parent().css('position', 'relative'); 12 | }); 13 | 14 | 
$('body').scrollspy({ 15 | target: '#sidebar', 16 | offset: 60 17 | }); 18 | 19 | $('[data-toggle="tooltip"]').tooltip(); 20 | 21 | var cur_path = paths(location.pathname); 22 | var links = $("#navbar ul li a"); 23 | var max_length = -1; 24 | var pos = -1; 25 | for (var i = 0; i < links.length; i++) { 26 | if (links[i].getAttribute("href") === "#") 27 | continue; 28 | // Ignore external links 29 | if (links[i].host !== location.host) 30 | continue; 31 | 32 | var nav_path = paths(links[i].pathname); 33 | 34 | var length = prefix_length(nav_path, cur_path); 35 | if (length > max_length) { 36 | max_length = length; 37 | pos = i; 38 | } 39 | } 40 | 41 | // Add class to parent
  • , and enclosing
  • if in dropdown 42 | if (pos >= 0) { 43 | var menu_anchor = $(links[pos]); 44 | menu_anchor.parent().addClass("active"); 45 | menu_anchor.closest("li.dropdown").addClass("active"); 46 | } 47 | }); 48 | 49 | function paths(pathname) { 50 | var pieces = pathname.split("/"); 51 | pieces.shift(); // always starts with / 52 | 53 | var end = pieces[pieces.length - 1]; 54 | if (end === "index.html" || end === "") 55 | pieces.pop(); 56 | return(pieces); 57 | } 58 | 59 | // Returns -1 if not found 60 | function prefix_length(needle, haystack) { 61 | if (needle.length > haystack.length) 62 | return(-1); 63 | 64 | // Special case for length-0 haystack, since for loop won't run 65 | if (haystack.length === 0) { 66 | return(needle.length === 0 ? 0 : -1); 67 | } 68 | 69 | for (var i = 0; i < haystack.length; i++) { 70 | if (needle[i] != haystack[i]) 71 | return(i); 72 | } 73 | 74 | return(haystack.length); 75 | } 76 | 77 | /* Clipboard --------------------------*/ 78 | 79 | function changeTooltipMessage(element, msg) { 80 | var tooltipOriginalTitle=element.getAttribute('data-original-title'); 81 | element.setAttribute('data-original-title', msg); 82 | $(element).tooltip('show'); 83 | element.setAttribute('data-original-title', tooltipOriginalTitle); 84 | } 85 | 86 | if(ClipboardJS.isSupported()) { 87 | $(document).ready(function() { 88 | var copyButton = ""; 89 | 90 | $(".examples, div.sourceCode").addClass("hasCopyButton"); 91 | 92 | // Insert copy buttons: 93 | $(copyButton).prependTo(".hasCopyButton"); 94 | 95 | // Initialize tooltips: 96 | $('.btn-copy-ex').tooltip({container: 'body'}); 97 | 98 | // Initialize clipboard: 99 | var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { 100 | text: function(trigger) { 101 | return trigger.parentNode.textContent; 102 | } 103 | }); 104 | 105 | clipboardBtnCopies.on('success', function(e) { 106 | changeTooltipMessage(e.trigger, 'Copied!'); 107 | e.clearSelection(); 108 | }); 109 | 110 | clipboardBtnCopies.on('error', 
function() { 111 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 112 | }); 113 | }); 114 | } 115 | })(window.jQuery || window.$) 116 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 2.3.1 2 | pkgdown: 1.3.0 3 | pkgdown_sha: ~ 4 | articles: [] 5 | 6 | -------------------------------------------------------------------------------- /docs/reference/diopt.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Query DIOPT database — diopt • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 53 | 54 | 55 | 56 | 57 | 58 |
    59 |
    60 | 103 | 104 | 105 |
    106 | 107 |
    108 |
    109 | 114 | 115 |
    116 | 117 |

    Query DIOPT database (https://www.flyrnai.org/cgi-bin/DRSC_orthologs.pl) for orthologues. 118 | DIOPT database uses multiple tools to find gene orthologues. Sadly they don't have an 119 | API so this function queries by visiting the site and filling out the form. By default 120 | each query will take a minimum of 10 seconds due to the delay parameter. This 121 | value was taken from their robots.txt at the time this function was written. 122 | Note that DIOPT is not necessarily in sync with the homologene database as provided in this package.

    123 | 124 |
    125 | 126 |
    diopt(genes, inTax, outTax, delay = 10)
    127 | 128 |

    Arguments

    129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 147 | 148 |
    genes

    A vector of gene identifiers. Anything that DIOPT accepts

    inTax

    taxid of the species that the input genes are coming from

    outTax

    taxid of the species that you are seeking homology

    delay

    How many seconds of delay should be between queries. Default is 10 146 | based on the robots.txt at the time this function is written.

    149 | 150 |

    Value

    151 | 152 |

    A data frame

    153 | 154 | 155 |
    156 | 165 |
    166 | 167 |
    168 | 171 | 172 |
    173 |

    Site built with pkgdown 1.3.0.

    174 |
    175 |
    176 |
    177 | 178 | 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /docs/reference/getGeneHistory.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Download gene history file — getGeneHistory • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 49 | 50 | 51 | 52 | 53 | 54 |
    55 |
    56 | 99 | 100 | 101 |
    102 | 103 |
    104 |
    105 | 110 | 111 |
    112 | 113 |

    Downloads and reads the gene history file from NCBI website. This file is needed for 114 | other functions

    115 | 116 |
    117 | 118 |
    getGeneHistory(destfile = NULL, justRead = FALSE)
    119 | 120 |

    Arguments

    121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 131 | 132 |
    destfile

    Path of the output file. If NULL a temp file will be used

    justRead

    If TRUE and destfile exists, it reads the file instead of 130 | downloading the latest one from NCBI

    133 | 134 |

    Value

    135 | 136 |

    A data frame with latest gene history information

    137 | 138 | 139 |
    140 | 149 |
    150 | 151 |
    152 | 155 | 156 |
    157 |

    Site built with pkgdown 1.3.0.

    158 |
    159 |
    160 |
    161 | 162 | 163 | 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /docs/reference/getGeneInfo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Download gene symbol information — getGeneInfo • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 49 | 50 | 51 | 52 | 53 | 54 |
    55 |
    56 | 99 | 100 | 101 |
    102 | 103 |
    104 |
    105 | 110 | 111 |
    112 | 113 |

    This function downloads the gene_info file from NCBI website and returns the 114 | gene symbols for current IDs.

    115 | 116 |
    117 | 118 |
    getGeneInfo(destfile = NULL, justRead = FALSE, chunk_size = 1e+06)
    119 | 120 |

    Arguments

    121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 131 | 132 | 133 | 134 | 138 | 139 |
    destfile

    Path of the output file. If NULL a temp file will be used

    justRead

    If TRUE and destfile exists, it reads the file instead of 130 | downloading the latest one from NCBI

    chunk_size

    Chunk size to be used with readr::read_tsv_chunked. 135 | The gene_info file is big enough to make its intake difficult. If you don't 136 | have large amounts of free memory you may have to reduce this number to read 137 | the file in smaller chunks.

    140 | 141 |

    Value

    142 | 143 |

    A data frame with gene symbols for each current gene id

    144 | 145 | 146 |
    147 | 156 |
    157 | 158 |
    159 | 162 | 163 |
    164 |

    Site built with pkgdown 1.3.0.

    165 |
    166 |
    167 |
    168 | 169 | 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /docs/reference/getHomologene.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Get the latest homologene file — getHomologene • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 50 | 51 | 52 | 53 | 54 | 55 |
    56 |
    57 | 100 | 101 | 102 |
    103 | 104 |
    105 |
    106 | 111 | 112 |
    113 | 114 |

    This function downloads the latest homologene file from NCBI. Note that Homologene 115 | has not been updated since 2014 so the output will be identical to homologeneData 116 | included in this package. This function is here for futureproofing purposes.

    117 | 118 |
    119 | 120 |
    getHomologene(destfile = NULL, justRead = FALSE)
    121 | 122 |

    Arguments

    123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 133 | 134 |
    destfile

    Path of the output file. If NULL a temp file will be used

    justRead

    If TRUE and destfile exists, it reads the file instead of 132 | downloading the latest one from NCBI

    135 | 136 |

    Value

    137 | 138 |

    A data frame with homology groups, gene ids and gene symbols

    139 | 140 | 141 |
    142 | 151 |
    152 | 153 |
    154 | 157 | 158 |
    159 |

    Site built with pkgdown 1.3.0.

    160 |
    161 |
    162 |
    163 | 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /docs/reference/homologene.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Get homologues of given genes — homologene • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 98 | 99 | 100 |
    101 | 102 |
    103 |
    104 | 109 | 110 |
    111 | 112 |

    Given a list of genes and a taxid, returns a data frame including the genes and their corresponding homologues

    113 | 114 |
    115 | 116 |
    homologene(genes, inTax, outTax, db = homologene::homologeneData)
    117 | 118 |

    Arguments

    119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 |
    genes

    A vector of gene symbols or NCBI ids

    inTax

    taxid of the species that the input genes are coming from

    outTax

    taxid of the species in which you are seeking homologues

    db

    Homologene database to use.

    138 | 139 | 140 |

    Examples

    141 |
    homologene(c('Eno2','17441'), inTax = 10090, outTax = 9606)
    #> 10090 9606 10090_ID 9606_ID 142 | #> 1 Eno2 ENO2 13807 2026 143 | #> 2 Mog MOG 17441 4340
    144 |
    145 | 154 |
    155 | 156 |
    157 | 160 | 161 |
    162 |

    Site built with pkgdown 1.3.0.

    163 |
    164 |
    165 |
    166 | 167 | 168 | 169 | 170 | 171 | 172 | -------------------------------------------------------------------------------- /docs/reference/homologeneData.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | homologeneData — homologeneData • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 98 | 99 | 100 |
    101 | 102 |
    103 |
    104 | 109 | 110 |
    111 | 112 |

    List of gene homologues used by homologene functions

    113 | 114 |
    115 | 116 |
    homologeneData
    117 | 118 |

    Format

    119 | 120 |

    An object of class data.frame with 275237 rows and 4 columns.

    121 | 122 | 123 |
    124 | 132 |
    133 | 134 |
    135 | 138 | 139 |
    140 |

    Site built with pkgdown 1.3.0.

    141 |
    142 |
    143 |
    144 | 145 | 146 | 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /docs/reference/homologeneData2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | homologeneData2 — homologeneData2 • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 50 | 51 | 52 | 53 | 54 | 55 |
    56 |
    57 | 100 | 101 | 102 |
    103 | 104 |
    105 |
    106 | 111 | 112 |
    113 | 114 |

    A modified copy of the homologene database. Homologene was last updated in 2014 and many of its gene IDs and 115 | symbols are out of date. Here the IDs and symbols are replaced with their most current version. 116 | Last update: Mon May 6 14:15:51 2019

    117 | 118 |
    119 | 120 |
    homologeneData2
    121 | 122 |

    Format

    123 | 124 |

    An object of class data.frame with 269545 rows and 4 columns.

    125 | 126 | 127 |
    128 | 136 |
    137 | 138 |
    139 | 142 | 143 |
    144 |

    Site built with pkgdown 1.3.0.

    145 |
    146 |
    147 |
    148 | 149 | 150 | 151 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /docs/reference/homologeneVersion.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Version of homologene used — homologeneVersion • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 98 | 99 | 100 |
    101 | 102 |
    103 |
    104 | 109 | 110 |
    111 | 112 |

    Version of homologene used

    113 | 114 |
    115 | 116 |
    homologeneVersion
    117 | 118 |

    Format

    119 | 120 |

    An object of class integer of length 1.

    121 | 122 | 123 |
    124 | 132 |
    133 | 134 |
    135 | 138 | 139 |
    140 |

    Site built with pkgdown 1.3.0.

    141 |
    142 |
    143 |
    144 | 145 | 146 | 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /docs/reference/human2mouse.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Human/mouse wraper for homologene — human2mouse • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 98 | 99 | 100 |
    101 | 102 |
    103 |
    104 | 109 | 110 |
    111 | 112 |

    Human/mouse wrapper for homologene

    113 | 114 |
    115 | 116 |
    human2mouse(genes, db = homologene::homologeneData)
    117 | 118 |

    Arguments

    119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 |
    genes

    A vector of gene symbols or NCBI ids

    db

    Homologene database to use.

    130 | 131 | 132 |

    Examples

    133 |
    human2mouse(c('ENO2','4340'))
    #> humanGene mouseGene humanID mouseID 134 | #> 1 ENO2 Eno2 2026 13807 135 | #> 2 MOG Mog 4340 17441
    136 |
    137 | 146 |
    147 | 148 |
    149 | 152 | 153 |
    154 |

    Site built with pkgdown 1.3.0.

    155 |
    156 |
    157 |
    158 | 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /docs/reference/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Function reference • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
    51 |
    52 | 95 | 96 | 97 |
    98 | 99 |
    100 |
    101 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 119 | 120 | 121 | 122 | 125 | 126 | 127 | 128 | 131 | 132 | 133 | 134 | 137 | 138 | 139 | 140 | 143 | 144 | 145 | 146 | 149 | 150 | 151 | 152 | 155 | 156 | 157 | 158 | 161 | 162 | 163 | 164 | 167 | 168 | 169 | 170 | 173 | 174 | 175 | 176 | 179 | 180 | 181 | 182 | 185 | 186 | 187 | 188 | 191 | 192 | 193 | 194 | 197 | 198 | 199 | 200 | 203 | 204 | 205 | 206 |
    116 |

    All functions

    117 |

    118 |
    123 |

    autoTranslate()

    124 |

    Attempt to automatically translate a gene list

    129 |

    diopt()

    130 |

    Query DIOPT database

    135 |

    getGeneHistory()

    136 |

    Download gene history file

    141 |

    getGeneInfo()

    142 |

    Download gene symbol information

    147 |

    getHomologene()

    148 |

    Get the latest homologene file

    153 |

    homologene()

    154 |

    Get homologues of given genes

    159 |

    homologeneData

    160 |

    homologeneData

    165 |

    homologeneData2

    166 |

    homologeneData2

    171 |

    homologeneVersion

    172 |

    Version of homologene used

    177 |

    human2mouse()

    178 |

    Human/mouse wrapper for homologene

    183 |

    mouse2human()

    184 |

    Mouse/human wrapper for homologene

    189 |

    taxData

    190 |

    Names and ids of included species

    195 |

    updateHomologene()

    196 |

    Update homologene database

    201 |

    updateIDs()

    202 |

    Update gene IDs

    207 |
    208 | 209 | 215 |
    216 | 217 |
    218 | 221 | 222 |
    223 |

    Site built with pkgdown 1.3.0.

    224 |
    225 |
    226 |
    227 | 228 | 229 | 230 | 231 | 232 | 233 | -------------------------------------------------------------------------------- /docs/reference/mouse2human.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Mouse/human wraper for homologene — mouse2human • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 98 | 99 | 100 |
    101 | 102 |
    103 |
    104 | 109 | 110 |
    111 | 112 |

    Mouse/human wrapper for homologene

    113 | 114 |
    115 | 116 |
    mouse2human(genes, db = homologene::homologeneData)
    117 | 118 |

    Arguments

    119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 |
    genes

    A vector of gene symbols or NCBI ids

    db

    Homologene database to use.

    130 | 131 | 132 |

    Examples

    133 |
    mouse2human(c('Eno2','17441'))
    #> mouseGene humanGene mouseID humanID 134 | #> 1 Eno2 ENO2 13807 2026 135 | #> 2 Mog MOG 17441 4340
    136 |
    137 | 146 |
    147 | 148 |
    149 | 152 | 153 |
    154 |

    Site built with pkgdown 1.3.0.

    155 |
    156 |
    157 |
    158 | 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /docs/reference/reexports.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Objects exported from other packages — reexports • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 52 | 53 | 54 | 55 | 56 | 57 |
    58 |
    59 | 102 | 103 | 104 |
    105 | 106 |
    107 |
    108 | 113 | 114 |
    115 | 116 |

    These objects are imported from other packages. Follow the links 117 | below to see their documentation.

    118 |
    119 |
    magrittr

    %>%, %<>%, %$%

    120 |
    121 | 122 |
    123 | 124 | 125 | 126 |
    127 | 133 |
    134 | 135 |
    136 | 139 | 140 |
    141 |

    Site built with pkgdown 1.3.0.

    142 |
    143 |
    144 |
    145 | 146 | 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /docs/reference/taxData.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Names and ids of included species — taxData • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 98 | 99 | 100 |
    101 | 102 |
    103 |
    104 | 109 | 110 |
    111 | 112 |

    Names and ids of included species

    113 | 114 |
    115 | 116 |
    taxData
    117 | 118 |

    Format

    119 | 120 |

    An object of class data.frame with 21 rows and 2 columns.

    121 | 122 | 123 |
    124 | 132 |
    133 | 134 |
    135 | 138 | 139 |
    140 |

    Site built with pkgdown 1.3.0.

    141 |
    142 |
    143 |
    144 | 145 | 146 | 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /docs/reference/updateHomologene.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Update homologene database — updateHomologene • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 52 | 53 | 54 | 55 | 56 | 57 |
    58 |
    59 | 102 | 103 | 104 |
    105 | 106 |
    107 |
    108 | 113 | 114 |
    115 | 116 |

    Creates an updated version of the homologene database. This is done by downloading 117 | the latest gene annotation information and tracing changes in gene symbols and 118 | identifiers over history. homologeneData2 was created using 119 | this function over the original homologeneData. This function 120 | requires downloading large amounts of data from the NCBI ftp servers.

    121 | 122 |
    123 | 124 |
    updateHomologene(destfile = NULL,
    125 |   baseline = homologene::homologeneData2, gene_history = NULL,
    126 |   gene_info = NULL)
    127 | 128 |

    Arguments

    129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 141 | 142 | 143 | 144 | 149 | 150 | 151 | 152 | 157 | 158 |
    destfile

    Optional. Path of the output file.

    baseline

    The baseline homologene file to be used. By default uses the 138 | homologeneData2 that is included in this package. The more ids 139 | to update, the more time is needed for the update which is why the default option 140 | uses an already updated version of the original database.

    gene_history

    A gene history data frame, possibly returned by getGeneHistory 145 | function. Use this if you want to have a static gene_history file to update up to a specific date. 146 | An up-to-date gene_history object can be set to update to a specific date by trimming 147 | rows that have recent dates. Note that the same is not possible for the gene_info. 148 | If not provided, the latest file will be downloaded.

    gene_info

    A gene info data frame that contains ID-symbol matches, 153 | possibly returned by getGeneInfo. Use this if you 154 | want a static version. Should be in sync with the gene_history file. Note that there is 155 | no easy way to track changes in gene symbols back in time so if you want to update it up 156 | to a specific date, make sure you don't lose that file.

    159 | 160 |

    Value

    161 | 162 |

    Homologene database in a data frame with updated gene IDs and symbols

    163 | 164 | 165 |
    166 | 175 |
    176 | 177 |
    178 | 181 | 182 |
    183 |

    Site built with pkgdown 1.3.0.

    184 |
    185 |
    186 |
    187 | 188 | 189 | 190 | 191 | 192 | 193 | -------------------------------------------------------------------------------- /docs/reference/updateIDs.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Update gene IDs — updateIDs • homologene 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 49 | 50 | 51 | 52 | 53 | 54 |
    55 |
    56 | 99 | 100 | 101 |
    102 | 103 |
    104 |
    105 | 110 | 111 |
    112 | 113 |

    Given a list of gene ids and gene history information, traces changes in the 114 | gene's name to get the latest valid ID

    115 | 116 |
    117 | 118 |
    updateIDs(ids, gene_history)
    119 | 120 |

    Arguments

    121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 |
    ids

    Gene ids

    gene_history

    Gene history information, probably returned by getGeneHistory

    132 | 133 |

    Value

    134 | 135 |

    A character vector. New ids for genes that changed ids, or "-" for discontinued genes. 136 | Ids that have not changed are returned as the input itself.

    137 | 138 | 139 |

    Examples

    140 |
    # NOT RUN {
    141 | gene_history = getGeneHistory()
    142 | updateIDs(c("4340964", "4349034", "4332470", "4334151", "4323831"),gene_history)
    143 | # }
    144 |
    145 |
    146 | 157 |
    158 | 159 | 168 |
    169 | 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /homologene.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /man/autoTranslate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoTranslate.R 3 | \name{autoTranslate} 4 | \alias{autoTranslate} 5 | \title{Attempt to automatically translate a gene list} 6 | \usage{ 7 | autoTranslate( 8 | genes, 9 | targetGenes, 10 | possibleOrigins = NULL, 11 | possibleTargets = NULL, 12 | returnAllPossible = FALSE, 13 | db = homologene::homologeneData 14 | ) 15 | } 16 | \arguments{ 17 | \item{genes}{A list of genes to match the target. Symbols or NCBI ids} 18 | 19 | \item{targetGenes}{The target list. This list is supposed to represent a large number of genes 20 | from the target species.} 21 | 22 | \item{possibleOrigins}{Taxonomic identifiers of possible origin species} 23 | 24 | \item{possibleTargets}{Taxonomic identifiers of possible target species} 25 | 26 | \item{returnAllPossible}{if TRUE returns all possible pairings with non zero gene matches. 
If FALSE (default) returns the best match} 27 | 28 | \item{db}{Homologene database to use.} 29 | } 30 | \value{ 31 | A data frame if \code{returnAllPossibe = FALSE} and a list of data frames if \code{TRUE} 32 | } 33 | \description{ 34 | Given a list of query gene list and a target gene list, the function 35 | tries find the homology pairing that matches the query list to the target list. The query list 36 | is a short list of genes while the target list is supposed to represent a large number of genes from the target 37 | species. The default output will be the largest possible list. If \code{returnAllPossible = TRUE} then 38 | all possible pairings with any matches are returned. It is possible to limit the 39 | search by setting \code{possibleOrigins} and \code{possibleTargets}. Note that gene symbols of some species 40 | are more similar to each other than others. Using this with small gene lists and without providing any 41 | \code{possibleOrigins} or \code{possibleTargets} might return multiple hits, or if \code{returnAllPossible = TRUE} 42 | a wrong match can be returned. 43 | } 44 | -------------------------------------------------------------------------------- /man/diopt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/diopt.R 3 | \name{diopt} 4 | \alias{diopt} 5 | \title{Query DIOPT database} 6 | \usage{ 7 | diopt(genes, inTax, outTax, delay = 10) 8 | } 9 | \arguments{ 10 | \item{genes}{A vector of gene identifiers. Anything that DIOPT accepts} 11 | 12 | \item{inTax}{taxid of the species that the input genes are coming from} 13 | 14 | \item{outTax}{taxid of the species that you are seeking homology. 0 to query all species.} 15 | 16 | \item{delay}{How many seconds of delay should be between queries. 
Default is 10 17 | based on the robots.txt at the time this function is written.} 18 | } 19 | \value{ 20 | A data frame 21 | } 22 | \description{ 23 | Query DIOPT database (\url{https://www.flyrnai.org/cgi-bin/DRSC_orthologs.pl}) for orthologues. 24 | DIOPT database uses multiple tools to find gene orthologues. Sadly they don't have an 25 | API so this function queries by visiting the site and filling up the form. By default 26 | each query will take a minimum of 10 seconds due to \code{delay} parameter. This 27 | is taken from their robots.txt at the time this function is written. 28 | Note that DIOPT is not necesariy in sync with homologene database as provided in this package. 29 | } 30 | \details{ 31 | DIOPT does not support all species available in the homologene database. The supported 32 | species are: 33 | 34 | \describe{ 35 | \item{4896}{Schizosaccharomyces pombe} 36 | \item{4932}{Saccharomyces cerevisiae} 37 | \item{6239}{Caenorhabditis elegans} 38 | \item{7227}{Drosophila melanogaster} 39 | \item{7955}{Danio rerio} 40 | \item{8364}{Xenopus (Silurana) tropicalis} 41 | \item{9606}{Homo sapiens} 42 | \item{10090}{Mus musculus} 43 | \item{10116}{Rattus norvegicus} 44 | \item{3702}{Arabidopsis thaliana} 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /man/getGeneHistory.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geneHistory.R 3 | \name{getGeneHistory} 4 | \alias{getGeneHistory} 5 | \title{Download gene history file} 6 | \usage{ 7 | getGeneHistory(destfile = NULL, justRead = FALSE) 8 | } 9 | \arguments{ 10 | \item{destfile}{Path of the output file. 
If NULL a temp file will be used} 11 | 12 | \item{justRead}{If TRUE and destfile exists, it reads the file instead of 13 | downloading the latest one from NCBI} 14 | } 15 | \value{ 16 | A data frame with latest gene history information 17 | } 18 | \description{ 19 | Downloads and reads the gene history file from NCBI website. This file is needed for 20 | other functions 21 | } 22 | -------------------------------------------------------------------------------- /man/getGeneInfo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geneHistory.R 3 | \name{getGeneInfo} 4 | \alias{getGeneInfo} 5 | \title{Download gene symbol information} 6 | \usage{ 7 | getGeneInfo(destfile = NULL, justRead = FALSE, chunk_size = 1e+06) 8 | } 9 | \arguments{ 10 | \item{destfile}{Path of the output file. If NULL a temp file will be used} 11 | 12 | \item{justRead}{If TRUE and destfile exists, it reads the file instead of 13 | downloading the latest one from NCBI} 14 | 15 | \item{chunk_size}{Chunk size to be used with \code{link[readr]{read_tsv_chunked}}. 16 | The gene_info file is big enough to make its intake difficult. If you don't 17 | have large amounts of free memory you may have to reduce this number to read 18 | the file in smaller chunks} 19 | } 20 | \value{ 21 | A data frame with gene symbols for each current gene id 22 | } 23 | \description{ 24 | This function downloads the gene_info file from NCBI website and returns the 25 | gene symbols for current IDs. 
26 | } 27 | -------------------------------------------------------------------------------- /man/getHomologene.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geneHistory.R 3 | \name{getHomologene} 4 | \alias{getHomologene} 5 | \title{Get the latest homologene file} 6 | \usage{ 7 | getHomologene(destfile = NULL, justRead = FALSE) 8 | } 9 | \arguments{ 10 | \item{destfile}{Path of the output file. If NULL a temp file will be used} 11 | 12 | \item{justRead}{If TRUE and destfile exists, it reads the file instead of 13 | downloading the latest one from NCBI} 14 | } 15 | \value{ 16 | A data frame with homology groups, gene ids and gene symbols 17 | } 18 | \description{ 19 | This function downloads the latest homologene file from NCBI. Note that Homologene 20 | has not been updated since 2014 so the output will be identical to \code{\link{homologeneData}} 21 | included in this package. This function is here for futureproofing purposes. 
22 | } 23 | -------------------------------------------------------------------------------- /man/homologene.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/homologene.R 3 | \name{homologene} 4 | \alias{homologene} 5 | \title{Get homologues of given genes} 6 | \usage{ 7 | homologene(genes, inTax, outTax, db = homologene::homologeneData) 8 | } 9 | \arguments{ 10 | \item{genes}{A vector of gene symbols or NCBI ids} 11 | 12 | \item{inTax}{taxid of the species that the input genes are coming from} 13 | 14 | \item{outTax}{taxid of the species that you are seeking homology} 15 | 16 | \item{db}{Homologene database to use.} 17 | } 18 | \description{ 19 | Given a list of genes and a taxid, returns a data frame inlcuding the genes and their corresponding homologues 20 | } 21 | \examples{ 22 | homologene(c('Eno2','17441'), inTax = 10090, outTax = 9606) 23 | } 24 | -------------------------------------------------------------------------------- /man/homologeneData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/homologene.R 3 | \docType{data} 4 | \name{homologeneData} 5 | \alias{homologeneData} 6 | \title{homologeneData} 7 | \format{ 8 | An object of class \code{data.frame} with 275237 rows and 4 columns. 
9 | } 10 | \usage{ 11 | homologeneData 12 | } 13 | \description{ 14 | List of gene homologues used by homologene functions 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/homologeneData2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/homologeneData2.R 3 | \docType{data} 4 | \name{homologeneData2} 5 | \alias{homologeneData2} 6 | \title{homologeneData2} 7 | \format{ 8 | An object of class \code{data.frame} with 266573 rows and 4 columns. 9 | } 10 | \usage{ 11 | homologeneData2 12 | } 13 | \description{ 14 | A modified copy of the homologene database. Homologene was updated at 2014 and many of its gene IDs and 15 | symbols are out of date. Here the IDs and symbols are replaced with their most current version 16 | Last update: Tue Oct 31 18:41:52 2023 17 | } 18 | \keyword{datasets} 19 | -------------------------------------------------------------------------------- /man/homologeneVersion.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/homologene.R 3 | \docType{data} 4 | \name{homologeneVersion} 5 | \alias{homologeneVersion} 6 | \title{Version of homologene used} 7 | \format{ 8 | An object of class \code{integer} of length 1. 
9 | } 10 | \usage{ 11 | homologeneVersion 12 | } 13 | \description{ 14 | Version of homologene used 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/human2mouse.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/homologene.R 3 | \name{human2mouse} 4 | \alias{human2mouse} 5 | \title{Human/mouse wraper for homologene} 6 | \usage{ 7 | human2mouse(genes, db = homologene::homologeneData) 8 | } 9 | \arguments{ 10 | \item{genes}{A vector of gene symbols or NCBI ids} 11 | 12 | \item{db}{Homologene database to use.} 13 | } 14 | \description{ 15 | Human/mouse wraper for homologene 16 | } 17 | \examples{ 18 | human2mouse(c('ENO2','4340')) 19 | } 20 | -------------------------------------------------------------------------------- /man/mouse2human.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/homologene.R 3 | \name{mouse2human} 4 | \alias{mouse2human} 5 | \title{Mouse/human wraper for homologene} 6 | \usage{ 7 | mouse2human(genes, db = homologene::homologeneData) 8 | } 9 | \arguments{ 10 | \item{genes}{A vector of gene symbols or NCBI ids} 11 | 12 | \item{db}{Homologene database to use.} 13 | } 14 | \description{ 15 | Mouse/human wraper for homologene 16 | } 17 | \examples{ 18 | mouse2human(c('Eno2','17441')) 19 | } 20 | -------------------------------------------------------------------------------- /man/reexports.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/import.R 3 | \docType{import} 4 | \name{reexports} 5 | \alias{reexports} 6 | \alias{\%>\%} 7 | \alias{\%<>\%} 8 | \alias{\%$\%} 9 | \title{Objects exported from other packages} 10 | 
\keyword{internal} 11 | \description{ 12 | These objects are imported from other packages. Follow the links 13 | below to see their documentation. 14 | 15 | \describe{ 16 | \item{magrittr}{\code{\link[magrittr:compound]{\%<>\%}}, \code{\link[magrittr:pipe]{\%>\%}}, \code{\link[magrittr:exposition]{\%$\%}}} 17 | }} 18 | 19 | -------------------------------------------------------------------------------- /man/taxData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/homologene.R 3 | \docType{data} 4 | \name{taxData} 5 | \alias{taxData} 6 | \title{Names and ids of included species} 7 | \format{ 8 | An object of class \code{data.frame} with 21 rows and 2 columns. 9 | } 10 | \usage{ 11 | taxData 12 | } 13 | \description{ 14 | Names and ids of included species 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/updateHomologene.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/updateHomologene.R 3 | \name{updateHomologene} 4 | \alias{updateHomologene} 5 | \title{Update homologene database} 6 | \usage{ 7 | updateHomologene( 8 | destfile = NULL, 9 | baseline = homologene::homologeneData2, 10 | gene_history = NULL, 11 | gene_info = NULL 12 | ) 13 | } 14 | \arguments{ 15 | \item{destfile}{Optional. Path of the output file.} 16 | 17 | \item{baseline}{The baseline homologene file to be used. By default uses the 18 | \code{\link{homologeneData2}} that is included in this package. The more ids 19 | to update, the more time is needed for the update which is why the default option 20 | uses an already updated version of the original database.} 21 | 22 | \item{gene_history}{A gene history data frame, possibly returned by \code{\link{getGeneHistory}} 23 | function. 
Use this if you want to have a static gene_history file to update up to a specific date. 24 | An up to date gene_history object can be set to update to a specific date by trimming 25 | rows that have recent dates. Note that the same is not possible for the gene_info. 26 | If not provided, the latest file will be downloaded.} 27 | 28 | \item{gene_info}{A gene info data frame that contains ID-symbol matches, 29 | possibly returned by \code{\link{getGeneInfo}}. Use this if you 30 | want a static version. Should be in sync with the gene_history file. Note that there is 31 | no easy way to track changes in gene symbols back in time so if you want to update it up 32 | to a specific date, make sure you don't lose that file.} 33 | } 34 | \value{ 35 | Homologene database in a data frame with updated gene IDs and symbols 36 | } 37 | \description{ 38 | Creates an updated version of the homologene database. This is done by downloading 39 | the latest gene annotation information and tracing changes in gene symbols and 40 | identifiers over history. \code{\link{homologeneData2}} was created using 41 | this function over the original \code{\link{homologeneData}}. This function 42 | requires downloading large amounts of data from the NCBI ftp servers. 43 | } 44 | -------------------------------------------------------------------------------- /man/updateIDs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geneHistory.R 3 | \name{updateIDs} 4 | \alias{updateIDs} 5 | \title{Update gene IDs} 6 | \usage{ 7 | updateIDs(ids, gene_history) 8 | } 9 | \arguments{ 10 | \item{ids}{Gene ids} 11 | 12 | \item{gene_history}{Gene history information, probably returned by \code{\link{getGeneHistory}}} 13 | } 14 | \value{ 15 | A character vector. New ids for genes that changed ids, or "-" for discontinued genes. 16 | the input itself. 
17 | } 18 | \description{ 19 | Given a list of gene ids and gene history information, traces changes in the 20 | gene's name to get the latest valid ID 21 | } 22 | \examples{ 23 | \dontrun{ 24 | gene_history = getGeneHistory() 25 | updateIDs(c("4340964", "4349034", "4332470", "4334151", "4323831"),gene_history) 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /process/autoUpdate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) 3 | 4 | cd "$parent_path" 5 | cd .. 6 | Rscript 'process/prepHomologene.R' > process/prepLog 2>process/prepLogErr 7 | Rscript 'process/prepHomologene2.R' > process/prepLog2 2>process/prepLog2Err 8 | -------------------------------------------------------------------------------- /process/biomartTests.R: -------------------------------------------------------------------------------- 1 | library(biomaRt) 2 | library(dplyr) 3 | library(magrittr) 4 | library(data.table) 5 | devtools::load_all() 6 | 7 | martdb = useMart('ENSEMBL_MART_ENSEMBL') 8 | biomartDBs = biomaRt::listDatasets(martdb) 9 | 10 | download.file(url ='ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz', destfile = "data-raw/taxdump.tar.gz") 11 | dir.create('data-raw/taxdump', showWarnings = FALSE) 12 | untar('data-raw/taxdump.tar.gz',exdir = 'data-raw/taxdump/') 13 | 14 | allTaxData = fread('data-raw/taxdump/names.dmp',data.table=FALSE, sep = '\t',quote = "") 15 | allTaxData = allTaxData[c(1,3,5,7)] 16 | names(allTaxData) = c('tax_id','name_txt','unique_name','name_class') 17 | 18 | # allTaxData = read.table('ftp://ftp.ncbi.nih.gov/pub/HomoloGene/build68/build_inputs/taxid_taxname', 19 | # sep = '\t', 20 | # stringsAsFactors = FALSE) 21 | # colnames(allTaxData) = c('tax_id','name_txt') 22 | 23 | # first we want to assign a tax id to all *_gene_ensembl databases 24 | allTaxData %<>% filter(name_class == 
'scientific name') 25 | 26 | # common formula is first letter of the genus followed by the species name. 27 | 28 | splitSpecies = allTaxData$name_txt %>% 29 | stringr::str_split(' ') 30 | 31 | allTaxData %<>% filter(!sapply(splitSpecies,length)<2) 32 | splitSpecies %<>% {.[!sapply(.,length)<2]} 33 | 34 | splitSpecies %>% 35 | sapply(function(specie){ 36 | if(length(specie)<2){ 37 | return(NA) 38 | } 39 | 40 | genus = specie[1] %>% 41 | tolower() %>% 42 | substr(1,1) 43 | specie = specie[length(specie)] 44 | 45 | paste0(genus,specie) 46 | }) -> allTaxData$biomart 47 | 48 | allTaxData %<>% filter(paste0(biomart,'_gene_ensembl') %in% biomartDBs$dataset) 49 | 50 | # assign species IDs to biomart databases 51 | biomartDBs %<>% 52 | mutate(tax_id = sapply(dataset,function(x){ 53 | allTaxData %>% filter(paste0(biomart,'_gene_ensembl') == x) %$% tax_id %>% paste(collapse = '|') 54 | })) 55 | 56 | # take a look at species without taxIDs 57 | biomartDBs %>% filter(tax_id == '') 58 | 59 | # unconventional names. manually match if they turn out to be useful later. 60 | 61 | 62 | 63 | inDatabase = taxData$name_txt %>% gsub(' ','_',.) 
%>% lapply(function(x){ 64 | allDBs %>% dplyr::filter(grepl(dataset,x,ignore.case = TRUE)) 65 | }) 66 | 67 | 68 | 69 | biomaRt::listDatasets(hede) %>% dplyr::filter(grepl('gene_ensembl',dataset)) 70 | 71 | 72 | biomartDBs = data.frame( 73 | tax_id = taxData$tax_id, 74 | name_txt = taxData$name_txt, 75 | inDBs = c('mmusculus_gene_ensembl', 76 | 'rnorvegicus_gene_ensembl', 77 | 'celegans_gene_ensembl', 78 | 'dmelanogaster_gene_ensembl', 79 | 'drerio_gene_ensembl', 80 | 'mmulatta_gene_ensembl', 81 | 'hsapiens_gene_ensembl'), 82 | outDBs = c('mmusculus_homolog_ensembl_gene', 83 | 'rnorvegicus_homolog_ensembl_gene', 84 | 'celegans_homolog_ensembl_gene', 85 | 'dmelanogaster_homolog_ensembl_gene', 86 | 'drerio_homolog_ensembl_gene', 87 | 'mmulatta_homolog_ensembl_gene', 88 | 'hsapiens_homolog_ensembl_gene') 89 | ) 90 | 91 | biomartName = data.frame( 92 | tax_id = taxData$tax_id, 93 | name_txt = taxData$name_txt, 94 | biomartName = c('mmusculus', 95 | 'rnorvegicus', 96 | 'celegans', 97 | 'dmelanogaster', 98 | 'drerio', 99 | 'mmulatta', 100 | 'hsapiens'),stringsAsFactors = FALSE 101 | ) 102 | 103 | 104 | 105 | # look for the gene symbol column for each species. 
106 | validSymbolFilters = lapply(seq_len(nrow(biomartName)), function(i){ 107 | name = biomartName[i,'biomartName'] 108 | targetGenes = homologeneData %>% filter(Taxonomy %in% biomartName[i,'tax_id']) %$% Gene.Symbol 109 | mart = biomaRt::useMart('ENSEMBL_MART_ENSEMBL',paste0(name,'_gene_ensembl')) 110 | # try every filter of this mart with a random sample of symbols and count the rows returned 111 | filters = listFilters(mart) 112 | sapply(seq_len(nrow(filters)), function(t){ 113 | print(paste(name, filters[t,'name'])) 114 | tryCatch({ # sample at most 20 genes: asking for more than are available errors and would be recorded as 0 115 | getBM(c('ensembl_gene_id'),filters = filters[t,'name'],values = targetGenes[sample(length(targetGenes),min(20,length(targetGenes)))], mart = mart) %>% nrow 116 | }, 117 | error = function(e){ # a filter that cannot be queried counts as 0 rows 118 | return(0) 119 | }) 120 | }) -> filterRowCounts 121 | names(filterRowCounts) = filters[,'name'] 122 | return(filterRowCounts) 123 | }) 124 | 125 | names(validSymbolFilters) = biomartName$biomartName 126 | validSymbolFilters %<>% purrr::map(function(x){x[x>0]}) 127 | 128 | # same probe as above, but sampling gene IDs (Gene.ID) instead of gene symbols 129 | validIDFilters = lapply(seq_len(nrow(biomartName)), function(i){ 130 | name = biomartName[i,'biomartName'] 131 | targetIDs = homologeneData %>% filter(Taxonomy %in% biomartName[i,'tax_id']) %$% Gene.ID 132 | mart = biomaRt::useMart('ENSEMBL_MART_ENSEMBL',paste0(name,'_gene_ensembl')) 133 | 134 | filters = listFilters(mart) 135 | sapply(seq_len(nrow(filters)), function(t){ 136 | print(paste(name, filters[t,'name'])) 137 | tryCatch({ # sample at most 20 ids: asking for more than are available errors and would be recorded as 0 138 | getBM(c('ensembl_gene_id'),filters = filters[t,'name'],values = targetIDs[sample(length(targetIDs),min(20,length(targetIDs)))], mart = mart) %>% nrow 139 | }, 140 | error = function(e){ # a filter that cannot be queried counts as 0 rows 141 | return(0) 142 | }) 143 | }) -> filterRowCounts 144 | names(filterRowCounts) = filters[,'name'] 145 | return(filterRowCounts) 146 | }) 147 | 148 | names(validIDFilters) = biomartName$biomartName 149 | validIDFilters %<>% purrr::map(function(x){x[x>0]}) 150 | 151 | 152 | genes = c('Eno2','Mog') 153 | inTax = 10090 154 | outTax = 9606 155 | 156 | 157 | ensemblHomologs = function(genes, inTax, outTax,confidenceTreshold = 0){ 158 | 159 | inName = 
biomartName$biomartName[biomartName$tax_id == inTax] 160 | outName = biomartName$biomartName[biomartName$tax_id == outTax] 161 | 162 | mart = biomaRt::useMart('ENSEMBL_MART_ENSEMBL',paste0(inName,'_gene_ensembl')) 163 | 164 | entrezIDs = getBM(c('ensembl_gene_id','hgnc_symbol','mgi_symbol'),filters = 'mgi_symbol',values = genes, mart = mart) 165 | 166 | # query the homolog attribute of the target species for the same genes 167 | getBM(c( 168 | paste0(outName,'_homolog_ensembl_gene')), 169 | filters = 'mgi_symbol', 170 | values =genes, 171 | mart = mart) ->out 172 | out # return the homolog table (a stray ')' on this line used to make the function unparseable) 173 | 174 | } 175 | 176 | mouseMart = useMart('ENSEMBL_MART_ENSEMBL','mmusculus_gene_ensembl') 177 | elegansMart = useMart('ENSEMBL_MART_ENSEMBL','celegans_gene_ensembl') 178 | 179 | humanMart = useMart('ENSEMBL_MART_ENSEMBL','hsapiens_gene_ensembl') 180 | ratMart = useMart('ENSEMBL_MART_ENSEMBL','rnorvegicus_gene_ensembl') 181 | 182 | getBM() 183 | 184 | 185 | outDBs = -------------------------------------------------------------------------------- /process/dioptMemory.R: -------------------------------------------------------------------------------- 1 | devtools::install_github('oganm/geneSynonym') 2 | library(geneSynonym) 3 | library(dplyr) 4 | library(magrittr) 5 | session = rvest::session('https://www.flyrnai.org/cgi-bin/DRSC_orthologs.pl') # html_session() is deprecated since rvest 1.0.0 6 | form = rvest::html_form(session)[[1]] 7 | dioptSpecies = taxData %>% filter(tax_id %in% form$fields$input_species$options) 8 | 9 | dioptSpecies$common_name = form$fields$input_species$options %>% 10 | {.[match(dioptSpecies$tax_id,.)]} %>% 11 | names %>% stringr::str_extract('(?<=\\().*?(?=\\))') 12 | 13 | aliquot = function(vector, alisize){ # split `vector` into consecutive chunks of at most `alisize` elements 14 | # ceiling(index/alisize) labels the chunks; unlike 1:floor(n/alisize) it stays valid when n < alisize 15 | split(vector,ceiling(seq_along(vector)/alisize)) 16 | } 17 | 18 | 19 | dir.create('data-raw/diopt') 20 | dioptSpecies$tax_id %>% lapply(function(taxID){ 21 | homologeneData2 %>% 22 | filter(Taxonomy == taxID) %$% 23 | Gene.ID -> 24 | speciesGenes 25 | 26 | gene_infoDB = ogbox::teval(glue::glue('syno{taxID}')) 27 | 28 | speciesGenes %<>% c(., 
names(gene_infoDB)) %>% unique 29 | 30 | speciesGenes %>% 31 | aliquot(15) %>% 32 | {.[]} %>% 33 | lapply(function(x){ 34 | out = NULL 35 | times = 0 36 | while(is.null(out) && times<3){ 37 | Sys.sleep(10) 38 | out = tryCatch(diopt(x,inTax = taxID,outTax ='0',delay = 10), 39 | error = function(e){ 40 | NULL 41 | }) 42 | times = times + 1 43 | } 44 | if(is.null(out)){ 45 | cat(paste0(paste0(x,collapse = '\n'),'\n'), 46 | file = paste0('data-raw/diopt/',taxID,'_failures'),append = TRUE) 47 | } 48 | return(out) 49 | 50 | }) -> aliquotDiopt 51 | 52 | aliquotDiopt = aliquotDiopt[!aliquotDiopt %>% sapply(is.null)] 53 | 54 | dioptOut = aliquotDiopt %>% do.call(rbind,.) 55 | readr::write_tsv(dioptOut,path = glue::glue('data-raw/diopt/{taxID}')) 56 | NULL 57 | }) 58 | 59 | files = list.files('data-raw/diopt',full.names = TRUE) 60 | failures = files[grepl('failures',files)] 61 | 62 | failures[2] %>% sapply(function(x){ 63 | tax = stringr::str_extract(x,'[0-9]*?(?=_)') 64 | failedIDs = readLines(x) 65 | failedIDs %>% sapply(function(y){ 66 | tryCatch(diopt(y,inTax = tax,outTax ='0',delay = 10), 67 | error = function(e){NULL}) 68 | }) %>% do.call(rbind,.) 
-> failureFix 69 | 70 | readr::write_tsv(failureFix,path = glue::glue('data-raw/diopt/{tax}'),append = TRUE, 71 | col_names = FALSE) 72 | }) 73 | 74 | dioptFiles = files[!files %in% failures] 75 | 76 | allDiopt = dioptFiles %>% lapply(function(x){ 77 | readr::read_tsv(x) 78 | }) 79 | 80 | 81 | -------------------------------------------------------------------------------- /process/prepHomologene.R: -------------------------------------------------------------------------------- 1 | library(magrittr) 2 | library(dplyr) 3 | library(data.table) 4 | library(git2r) 5 | library(ogbox) 6 | library(stringr) 7 | devtools::load_all() 8 | usethis::use_data_raw() 9 | 10 | homologeneVersion = readLines('ftp://ftp.ncbi.nih.gov/pub/HomoloGene/current/RELEASE_NUMBER') %>% as.integer 11 | 12 | # if the release is new, update 13 | if(homologeneVersion!=readLines('data-raw/release')){ 14 | 15 | homologeneData = getHomologene() 16 | 17 | taxData = read.table('ftp://ftp.ncbi.nih.gov/pub/HomoloGene/build68/build_inputs/taxid_taxname', 18 | sep = '\t', 19 | stringsAsFactors = FALSE) 20 | colnames(taxData) = c('tax_id','name_txt') 21 | 22 | speciesToAdd = homologeneData$Taxonomy %>% unique 23 | 24 | taxData %<>% filter(tax_id %in% speciesToAdd) 25 | 26 | stopifnot(all(speciesToAdd %in% taxData$tax_id)) 27 | 28 | write.table(taxData,'data-raw/taxData.tsv',sep='\t', row.names=FALSE,quote = FALSE) 29 | usethis::use_data(taxData,overwrite = TRUE) 30 | 31 | write.table(homologeneData,file = 'data-raw/homologeneData.tsv',sep='\t', row.names=FALSE,quote = FALSE) 32 | usethis::use_data(homologeneData, overwrite= TRUE) 33 | usethis::use_data(homologeneVersion, overwrite= TRUE) 34 | writeLines(as.character(homologeneVersion),con = 'data-raw/release') 35 | 36 | repo = repository('.') 37 | 38 | version = getVersion() 39 | version %<>% strsplit('\\.') %>% {.[[1]]} 40 | version[3] = homologeneVersion 41 | setVersion(paste(version,collapse = '.')) 42 | 43 | description = readLines('DESCRIPTION') 44 | 
description[grepl('build[0-9]',description)] = str_replace(description[grepl('build[0-9]',description)], 45 | 'build[0-9]*?(?=/)', 46 | paste0('build',homologeneVersion)) 47 | writeLines(text = description,con = 'DESCRIPTION') 48 | 49 | git2r::add(repo,path ='DESCRIPTION') 50 | 51 | git2r::add(repo,'data/homologeneData.rda') 52 | git2r::add(repo,'data/homologeneVersion.rda') 53 | git2r::add(repo,'data-raw/homologeneData.tsv') 54 | git2r::add(repo,'man/homologeneData.Rd') 55 | git2r::add(repo,'data-raw/release') 56 | git2r::commit(repo,message = paste('Automatic update to version',homologeneVersion)) 57 | 58 | token = readLines('data-raw/auth') 59 | Sys.setenv(GITHUB_PAT = token) 60 | cred = git2r::cred_token() 61 | git2r::push(repo,credentials = cred) 62 | } 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /process/prepHomologene2.R: -------------------------------------------------------------------------------- 1 | library(readr) 2 | library(magrittr) 3 | library(dplyr) 4 | library(purrr) 5 | library(glue) 6 | library(git2r) 7 | library(usethis) 8 | library(ogbox) 9 | 10 | 11 | 12 | devtools::load_all() 13 | 14 | # if(!exists("homologeneData2")){ 15 | # homologeneData2 = homologeneData 16 | # } 17 | 18 | # takes about 15 minutes. I might as well update it from scratch each time. 19 | tictoc::tic() 20 | homologeneData2 = 21 | updateHomologene(destfile = 'data-raw/homologene2.tsv', 22 | baseline = homologeneData) 23 | tictoc::toc() 24 | 25 | usethis::use_data(homologeneData2,overwrite = TRUE) 26 | 27 | 28 | 29 | glue(' 30 | #\' homologeneData2 31 | #\' 32 | #\' A modified copy of the homologene database. Homologene was updated at 2014 and many of its gene IDs and 33 | #\' symbols are out of date. 
Here the IDs and symbols are replaced with their most current version 34 | #\' Last update: {date()} 35 | "homologeneData2" 36 | ') %>% 37 | writeLines(con = 'R/homologeneData2.R') 38 | 39 | devtools::document() 40 | 41 | 42 | # github stuff -------------- 43 | repo = repository('.') 44 | add(repo,'R/homologeneData2.R') 45 | Sys.sleep(1) 46 | add(repo,'data/homologeneData2.rda') 47 | Sys.sleep(1) 48 | add(repo,'man/homologeneData2.Rd') 49 | Sys.sleep(1) 50 | add(repo,'data-raw/homologene2.tsv') 51 | add(repo,'man') 52 | Sys.sleep(1) 53 | 54 | version = getVersion() 55 | version %<>% strsplit('\\.') %>% {.[[1]]} 56 | dateTail = format(Sys.Date(),'%y.%m.%d') %>% 57 | gsub(pattern = '\\.0','.',x=.) %>% strsplit('\\.') %>% {.[[1]]} 58 | 59 | version[4:6] = dateTail 60 | 61 | setVersion(paste(version,collapse = '.')) 62 | ogbox::setDate(format(Sys.Date(),'%Y-%m-%d')) 63 | 64 | add(repo,'DESCRIPTION') 65 | Sys.sleep(1) 66 | 67 | rmarkdown::render('README.rmd') 68 | 69 | add(repo,'README.md') 70 | 71 | git2r::commit(repo,message = 'homologeneData2 automatic update') 72 | Sys.sleep(1) 73 | 74 | 75 | token = readLines('data-raw/auth') 76 | Sys.setenv(GITHUB_PAT = token) 77 | cred = git2r::cred_token() 78 | git2r::push(repo,credentials = cred) 79 | 80 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(homologene) 3 | 4 | test_check("homologene") 5 | -------------------------------------------------------------------------------- /tests/testthat/test_diopt.R: -------------------------------------------------------------------------------- 1 | context('diopt') 2 | 3 | 4 | test_that('DIOPT',{ 5 | httr::set_config(httr::config(ssl_verifypeer = 0L)) 6 | 7 | out = diopt(c('GZMH'),inTax = 9606, outTax = 10090) 8 | 9 | expect_true(all(c('Gzmd','Gzme','Gzmg','Gzmf') %in% out$`Mouse Symbol`)) 10 | }) 11 | 
-------------------------------------------------------------------------------- /tests/testthat/test_homologene.R: -------------------------------------------------------------------------------- 1 | context('homologene testing') 2 | 3 | test_that('Multiple orthologues',{ 4 | humanOrthos = human2mouse(c("GZMH")) 5 | expect_equal(humanOrthos$mouseGene,c('Gzmd','Gzme','Gzmg','Gzmf')) 6 | }) 7 | 8 | test_that('Regular functionality',{ 9 | expect_equal(mouse2human(c('Eno2','Mog'))$humanGene,c('ENO2','MOG')) 10 | expect_equal(dim(mouse2human(c('lolwut'))), c(0,4)) 11 | }) 12 | 13 | test_that('Other species',{ 14 | homoSubsets = homologene::taxData$tax_id %>% sapply(function(x){ 15 | homologene::homologeneData %>% subset(Taxonomy==x) %>% dim 16 | }) 17 | expect_true(all(homoSubsets[1,]>100)) 18 | 19 | }) 20 | 21 | 22 | 23 | test_that('homologene2',{ 24 | mouse2human(c('Mesd', 25 | 'Trp53rka', 26 | 'Cstdc4', 27 | 'Ifit3b'), 28 | db = homologeneData2) -> 29 | genes 30 | 31 | expect_true(all(genes$humanGene == c("MESD", "TP53RK", "CSTA", "IFIT3"))) 32 | 33 | mouse2human(c('Mesd', 34 | 'Trp53rka', 35 | 'Cstdc4', 36 | 'Ifit3b'), 37 | db = homologeneData) -> 38 | genes 39 | 40 | expect_true(nrow(genes)==0) 41 | 42 | }) 43 | -------------------------------------------------------------------------------- /tests/testthat/test_utilities.R: -------------------------------------------------------------------------------- 1 | context('utilities testing') 2 | 3 | 4 | test_that('Updating gene ',{ 5 | 6 | testIds = c(102978083,102976710,102975981) 7 | 8 | gene_history = getGeneHistory('testfiles/gene_history_trimmed.tsv',justRead = TRUE) 9 | 10 | # earlierst_date = gene_history %>% 11 | # dplyr::filter(Discontinued_GeneID %in% testIds) %$% 12 | # Discontinue_Date %>% 13 | # {suppressWarnings(min(.))} 14 | # 15 | # gene_history %<>% 16 | # dplyr::filter(Discontinue_Date >= earlierst_date 17 | # ) 18 | # 19 | # readr::write_tsv(gene_history,'testfiles/gene_history_trimmed.tsv') 20 | 21 
| updatedGenes = updateIDs(testIds,gene_history) 22 | 23 | testthat::expect_is(updatedGenes,'character') 24 | testthat::expect_length(updatedGenes,3) 25 | 26 | }) 27 | 28 | 29 | test_that('automatic matching',{ 30 | inGenes = c('Eno2','Mog','Gzme','Gzmg','Gzmf') 31 | targetGenes = c('ENO2','MOG','GZMH') 32 | autoTransList = autoTranslate(inGenes,targetGenes,returnAllPossible = TRUE) 33 | expect_true(is.list(autoTransList)) 34 | expect_warning(autoTranslate(inGenes,targetGenes,returnAllPossible = FALSE),regexp = 'There are other pairings') 35 | 36 | autoTrans = autoTranslate(inGenes,targetGenes, 37 | possibleOrigins = c('human','mouse'),possibleTargets = c('human','mouse'), 38 | returnAllPossible = FALSE) 39 | 40 | expect_true(is.data.frame(autoTrans)) 41 | expect_true(all(colnames(autoTrans)[1:2] == c('10090','9606'))) 42 | 43 | autoTrans2 = autoTranslate(inGenes,targetGenes, 44 | possibleOrigins = c('10090','9606'),possibleTargets = c('10090','9606'), 45 | returnAllPossible = TRUE) 46 | 47 | expect_true(length(autoTrans2) == 1) 48 | 49 | expect_identical(autoTrans,autoTrans2[[1]]) 50 | 51 | selfMatch = suppressWarnings(autoTranslate(inGenes,inGenes,returnAllPossible = FALSE)) 52 | 53 | expect_true(all(names(selfMatch) == c("10090", "10090", "10090_ID", "10090_ID"))) 54 | 55 | # check to see if it works for gene IDs too 56 | expect_true(is.data.frame(autoTranslate(genes = autoTrans$`10090_ID`,targetGenes = autoTrans$`9606_ID`))) 57 | expect_true(length(autoTranslate(genes = autoTrans$`10090_ID`,targetGenes = autoTrans$`9606_ID`,returnAllPossible = TRUE)) == 1) 58 | 59 | }) 60 | 61 | --------------------------------------------------------------------------------