├── README.md ├── bioperl └── code │ └── R │ └── bioperl-l.R ├── brauer2007 ├── brauer2007.Rmd ├── brauer2007.md └── brauer2007_files │ └── figure-gfm │ └── plot-top20-genes-1.png ├── citeulike └── code │ └── ruby │ └── cul2mongo.rb ├── maSigPro ├── gse59671.Rmd ├── gse59671.md └── gse59671_files │ └── figure-markdown_github │ ├── plot1-1.png │ ├── plot2-1.png │ ├── plot3-1.png │ └── plot4-1.png ├── ncbi ├── biosample │ └── code │ │ └── ruby │ │ └── cell_lines.rb ├── entrez_db_terms │ ├── README.md │ ├── code │ │ └── ruby │ │ │ └── entrez_db_terms.rb │ └── data │ │ ├── assembly.txt │ │ ├── bioproject.txt │ │ ├── biosample.txt │ │ ├── biosystems.txt │ │ ├── blastdbinfo.txt │ │ ├── books.txt │ │ ├── cdd.txt │ │ ├── clinvar.txt │ │ ├── clone.txt │ │ ├── dbvar.txt │ │ ├── epigenomics.txt │ │ ├── gap.txt │ │ ├── gapplus.txt │ │ ├── gds.txt │ │ ├── gencoll.txt │ │ ├── gene.txt │ │ ├── genome.txt │ │ ├── genomeprj.txt │ │ ├── geoprofiles.txt │ │ ├── gtr.txt │ │ ├── homologene.txt │ │ ├── journals.txt │ │ ├── medgen.txt │ │ ├── mesh.txt │ │ ├── ncbisearch.txt │ │ ├── nlmcatalog.txt │ │ ├── nuccore.txt │ │ ├── nucest.txt │ │ ├── nucgss.txt │ │ ├── nucleotide.txt │ │ ├── omim.txt │ │ ├── orgtrack.txt │ │ ├── pcassay.txt │ │ ├── pccompound.txt │ │ ├── pcsubstance.txt │ │ ├── pmc.txt │ │ ├── popset.txt │ │ ├── probe.txt │ │ ├── protein.txt │ │ ├── proteinclusters.txt │ │ ├── pubmed.txt │ │ ├── pubmedhealth.txt │ │ ├── seqannot.txt │ │ ├── snp.txt │ │ ├── sra.txt │ │ ├── structure.txt │ │ ├── taxonomy.txt │ │ ├── toolkit.txt │ │ ├── toolkitall.txt │ │ ├── toolkitbook.txt │ │ └── unigene.txt └── taxonomy │ ├── README.md │ └── virus_hosts │ ├── README.md │ ├── code │ └── ruby │ │ └── virus2host.rb │ └── data │ ├── host_count.txt │ └── virus_host.tsv └── uniprot_words ├── code └── R │ └── match_words_uniprot.R └── data ├── word_matches_de.csv ├── word_matches_dk.csv ├── word_matches_en.csv ├── word_matches_es.csv ├── word_matches_fi.csv ├── word_matches_fr.csv ├── 
word_matches_it.csv ├── word_matches_nl.csv ├── word_matches_no.csv └── word_matches_se.csv /README.md: -------------------------------------------------------------------------------- 1 | # utils4bioinformatics 2 | 3 | Little code snippets that do (hopefully) useful things. 4 | 5 | ## Current contents 6 | 7 | 1. ncbi/entrez_db_terms - lists searchable fields for all Entrez databases 8 | 1. ncbi/taxonomy - utilities for working with the NCBI Taxonomy database 9 | 1. citeulike - code for working with CiteULike collections 10 | 1. maSigPro - tutorial for the Bioconductor maSigPro package 11 | 1. brauer2007 - trying out random forest on yeast expression data 12 | -------------------------------------------------------------------------------- /bioperl/code/R/bioperl-l.R: -------------------------------------------------------------------------------- 1 | # bioperl-l R 2 | # plot the size of the monthly archives from bioperl-l mail list 3 | 4 | library(XML) 5 | library(stringr) 6 | library(ggplot2) 7 | 8 | # download and get 1st table in list 9 | bp <- readHTMLTable("http://lists.open-bio.org/pipermail/bioperl-l/", stringsAsFactors = FALSE) 10 | bp <- bp[[1]] 11 | 12 | # get gzip sizes KB or MB 13 | size <- str_match(bp$`Downloadable version`, "Text (\\d+) (\\w+) ")[, 2:3] 14 | bp$size <- as.numeric(size[, 1]) 15 | bp$size <- ifelse(size[, 2] == "KB", bp$size * 1024, bp$size) 16 | bp$size <- ifelse(size[, 2] == "MB", bp$size * 1024 * 1024, bp$size) 17 | 18 | # parse & convert date 19 | bp$date <- gsub(":", "", bp$Archive) 20 | bp$date <- gsub(" ", " 1 ", bp$date) 21 | bp$date <- as.Date(bp$date, "%B %e %Y") 22 | 23 | # plot 24 | ggplot(bp) + geom_bar(aes(date, size), fill = "cornflowerblue", stat = "identity") + theme_bw() + scale_x_date(date_breaks = "2 years") + labs(x = "Date", y = "archive Gzip size (bytes)", title = "Approximate size of monthly Bioperl-l downloadable version 1996-present") 25 | 
-------------------------------------------------------------------------------- /brauer2007/brauer2007.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Can gene expression predict limiting nutrients in a random forest model?" 3 | author: "Neil Saunders" 4 | date: "`r Sys.time()`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | ```{r setup, include=FALSE} 11 | knitr::opts_chunk$set(echo = TRUE, 12 | message = FALSE, 13 | warning = FALSE) 14 | 15 | library(tidyverse) 16 | library(randomForest) 17 | library(randomForestExplainer) 18 | library(pander) 19 | 20 | theme_set(theme_dark()) 21 | ``` 22 | 23 | # Introduction 24 | Can we use random forest to predict which of 6 nutrients is limiting the growth of yeast, based on gene expression? 25 | 26 | Inspired by a now-deleted question on Stack Overflow [r]. 27 | 28 | # Dataset 29 | We obtain a tidy version of the [Brauer 2008](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2174172/) dataset in CSV format from [this page](https://4va.github.io/biodatasci/r-dataviz-homework.html). 30 | 31 | ```{r read-data} 32 | brauer2007_tidy <- read_csv("https://4va.github.io/biodatasci/data/brauer2007_tidy.csv") 33 | ``` 34 | The dataset contains `r nrow(brauer2007_tidy)` rows. It can be thought of as 36 separate experiments: yeast are grown at 6 different growth rates, with 6 nutrients where one is supplied at limiting levels. Gene expression is measured in each case - the number of genes varies slightly but is around 5 500. 35 | 36 | 37 | # Random forest model 38 | We specify a classification model where the categorical outcome variable is the nutrient, and the predictors are gene expression values and rate. Rate is assumed not to be important in this model. 39 | 40 | We used `set.seed` here for reproducibility, but it would not normally be used for random forest. 
41 | 42 | ```{r build-model} 43 | set.seed(1001) 44 | 45 | brauer2007_tidy_rf1 <- brauer2007_tidy %>% 46 | mutate(systematic_name = gsub("-", "minus", systematic_name), 47 | nutrient = factor(nutrient)) %>% 48 | select(systematic_name, nutrient, rate, expression) %>% 49 | spread(systematic_name, expression, fill = 0) %>% 50 | randomForest(nutrient ~ ., data = ., localImp = TRUE, importance = TRUE) 51 | 52 | brauer2007_tidy_rf1 53 | ``` 54 | 55 | # Top 20 variables by importance 56 | We plot the expression of the top 20 most important variables (genes) by rate and nutrient. 57 | 58 | `important_variables` is a function from the `randomForestExplainer` package. 59 | 60 | ```{r plot-top20-genes} 61 | brauer2007_tidy %>% 62 | filter(systematic_name %in% important_variables(brauer2007_tidy_rf1, k = 20)) %>% 63 | ggplot(aes(rate, expression)) + 64 | geom_line(aes(color = nutrient)) + 65 | facet_wrap(~systematic_name, ncol = 5) + 66 | scale_color_brewer(palette = "Set2") 67 | ``` 68 | 69 | # Research into a selection of the top 20 genes 70 | We select for each of the 6 nutrients, one gene from the top 20 with a distinctive expression pattern when that nutrient is limited. 71 | 72 | Then we search the web using the term "gene name + nutrient" to see if there are any known associations, using resources such as the [Saccharomyces Genome Database](https://www.yeastgenome.org/). 73 | 74 | We can say that the expression pattern under nutrient limitation "makes sense" for 5 of the genes, given what is known about their function. The exception is YLR108C, which is moderately up-regulated under phosphate limitation. 
75 | 76 | ```{r gene-function, echo=FALSE} 77 | genes <- c("YOR348C", "YOR374W", "YHR208W", "YLR108C", "YLL055W", "YKL216W") 78 | 79 | brauer2007_tidy %>% 80 | filter(systematic_name %in% genes) %>% 81 | distinct(systematic_name, bp) %>% 82 | arrange(systematic_name) %>% 83 | bind_cols(nutrient = c("leucine", "uracil", "sulfate", "phosphate", "ammonia", "glucose"), 84 | search_results = c("[Pathways - leucine biosynthesis](https://www.yeastgenome.org/locus/S000001251)", "[URA1 - null mutant requires uracil](https://www.yeastgenome.org/locus/S000001699)", "[Cysteine transporter; null mutant absent utilization of sulfur source](https://www.yeastgenome.org/locus/S000003978)", "", "[Proline permease; repressed in ammonia-grown cells](https://www.yeastgenome.org/locus/S000005875)", "[Aldehyde dehydrogenase; expression is glucose-repressed](https://www.yeastgenome.org/locus/S000005901)")) %>% 85 | pander(split.table = Inf) 86 | ``` 87 | 88 | # Summary 89 | Random forest would be far from my first method of choice for this problem. It would be more usual to determine first which genes were differentially-expressed, then go back and examine the nutrient limitation data. However, random forest does seem to have identified genes that are differentially expressed under nutrient limitation, and which have known biological functions consistent with their expression in the Brauer data. 90 | -------------------------------------------------------------------------------- /brauer2007/brauer2007.md: -------------------------------------------------------------------------------- 1 | Can gene expression predict limiting nutrients in a random forest model? 2 | ================ 3 | Neil Saunders 4 | 2019-06-26 21:40:27 5 | 6 | # Introduction 7 | 8 | Can we use random forest to predict which of 6 nutrients is limiting the 9 | growth of yeast, based on gene expression? 10 | 11 | Inspired by a now-deleted question on Stack Overflow \[r\]. 
12 | 13 | # Dataset 14 | 15 | We obtain a tidy version of the 16 | [Brauer 2008](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2174172/) 17 | dataset in CSV format from [this 18 | page](https://4va.github.io/biodatasci/r-dataviz-homework.html). 19 | 20 | ``` r 21 | brauer2007_tidy <- read_csv("https://4va.github.io/biodatasci/data/brauer2007_tidy.csv") 22 | ``` 23 | 24 | The dataset contains 198430 rows. It can be thought of as 36 separate 25 | experiments: yeast are grown at 6 different growth rates, with 6 26 | nutrients where one is supplied at limiting levels. Gene expression is 27 | measured in each case - the number of genes varies slightly but is 28 | around 5 500. 29 | 30 | # Random forest model 31 | 32 | We specify a classification model where the categorical outcome variable 33 | is the nutrient, and the predictors are gene expression values and rate. 34 | Rate is assumed not to be important in this model. 35 | 36 | We used `set.seed` here for reproducibility, but it would not normally 37 | be used for random forest. 38 | 39 | ``` r 40 | set.seed(1001) 41 | 42 | brauer2007_tidy_rf1 <- brauer2007_tidy %>% 43 | mutate(systematic_name = gsub("-", "minus", systematic_name), 44 | nutrient = factor(nutrient)) %>% 45 | select(systematic_name, nutrient, rate, expression) %>% 46 | spread(systematic_name, expression, fill = 0) %>% 47 | randomForest(nutrient ~ ., data = ., localImp = TRUE, importance = TRUE) 48 | 49 | brauer2007_tidy_rf1 50 | ``` 51 | 52 | ## 53 | ## Call: 54 | ## randomForest(formula = nutrient ~ ., data = ., localImp = TRUE, importance = TRUE) 55 | ## Type of random forest: classification 56 | ## Number of trees: 500 57 | ## No. 
of variables tried at each split: 74 58 | ## 59 | ## OOB estimate of error rate: 5.56% 60 | ## Confusion matrix: 61 | ## Ammonia Glucose Leucine Phosphate Sulfate Uracil class.error 62 | ## Ammonia 6 0 0 0 0 0 0.0000000 63 | ## Glucose 0 6 0 0 0 0 0.0000000 64 | ## Leucine 0 1 5 0 0 0 0.1666667 65 | ## Phosphate 0 0 0 6 0 0 0.0000000 66 | ## Sulfate 0 0 0 0 6 0 0.0000000 67 | ## Uracil 0 1 0 0 0 5 0.1666667 68 | 69 | # Top 20 variables by importance 70 | 71 | We plot the expression of the top 20 most important variables (genes) by 72 | rate and nutrient. 73 | 74 | `important_variables` is a function from the `randomForestExplainer` 75 | package. 76 | 77 | ``` r 78 | brauer2007_tidy %>% 79 | filter(systematic_name %in% important_variables(brauer2007_tidy_rf1, k = 20)) %>% 80 | ggplot(aes(rate, expression)) + 81 | geom_line(aes(color = nutrient)) + 82 | facet_wrap(~systematic_name, ncol = 5) + 83 | scale_color_brewer(palette = "Set2") 84 | ``` 85 | 86 | ![](brauer2007_files/figure-gfm/plot-top20-genes-1.png) 87 | 88 | # Research into a selection of the top 20 genes 89 | 90 | We select for each of the 6 nutrients, one gene from the top 20 with a 91 | distinctive expression pattern when that nutrient is limited. 92 | 93 | Then we search the web using the term “gene name + nutrient” to see if 94 | there are any known associations, using resources such as the 95 | [Saccharomyces Genome Database](https://www.yeastgenome.org/). 96 | 97 | We can say that the expression pattern under nutrient limitation “makes 98 | sense” for 5 of the genes, given what is known about their function. The 99 | exception is YLR108C, which is moderately up-regulated under phosphate 100 | limitation. 
101 | 102 | | systematic\_name | bp | nutrient | search\_results | 103 | | :--------------: | :---------------------------------------------: | :-------: | :-------------------------------------------------------------------------------------------------------------------: | 104 | | YHR208W | branched chain family amino acid biosynthesis\* | leucine | [Pathways - leucine biosynthesis](https://www.yeastgenome.org/locus/S000001251) | 105 | | YKL216W | ‘de novo’ pyrimidine base biosynthesis | uracil | [URA1 - null mutant requires uracil](https://www.yeastgenome.org/locus/S000001699) | 106 | | YLL055W | biological process unknown | sulfate | [Cysteine transporter; null mutant absent utilization of sulfur source](https://www.yeastgenome.org/locus/S000003978) | 107 | | YLR108C | biological process unknown | phosphate | | 108 | | YOR348C | proline catabolism\* | ammonia | [Proline permease; repressed in ammonia-grown cells](https://www.yeastgenome.org/locus/S000005875) | 109 | | YOR374W | ethanol metabolism | glucose | [Aldehyde dehydrogenase; expression is glucose-repressed](https://www.yeastgenome.org/locus/S000005901) | 110 | 111 | # Summary 112 | 113 | Random forest would be far from my first method of choice for this 114 | problem. It would be more usual to determine first which genes were 115 | differentially-expressed, then go back and examine the nutrient 116 | limitation data. However, random forest does seem to have identified 117 | genes that are differentially expressed under nutrient limitation, and 118 | which have known biological functions consistent with their expression 119 | in the Brauer data. 
120 | -------------------------------------------------------------------------------- /brauer2007/brauer2007_files/figure-gfm/plot-top20-genes-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neilfws/utils4bioinformatics/d689dbac6a2e6959ab369ec76596964d108c599e/brauer2007/brauer2007_files/figure-gfm/plot-top20-genes-1.png -------------------------------------------------------------------------------- /citeulike/code/ruby/cul2mongo.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | # save CiteULike JSON in mongodb database 4 | def json2mongo(db = "citeulike", col = "articles", user = "neils") 5 | require "mongo" 6 | require "json/pure" 7 | require "open-uri" 8 | 9 | puts "Fetching JSON..." 10 | db = Mongo::Connection.new.db(db) 11 | col = db.collection(col) 12 | url = "http://www.citeulike.org/json/user/" + user 13 | j = JSON.parse(open(url).read) 14 | j.each do |article| 15 | article[:_id] = article['article_id'] 16 | col.save(article) 17 | end 18 | puts "Done. Collection contains: #{col.count} articles." 
19 | end 20 | 21 | # run with default options 22 | json2mongo 23 | -------------------------------------------------------------------------------- /maSigPro/gse59671.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Analysis of gene expression timecourse data using maSigPro" 3 | author: "Neil Saunders" 4 | date: "`r Sys.time()`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | ```{r setup, include=FALSE} 11 | knitr::opts_chunk$set(echo = FALSE, 12 | message = FALSE, 13 | warning = FALSE) 14 | library(tidyverse) 15 | library(GEOquery) 16 | library(maSigPro) 17 | library(biomaRt) 18 | library(pander) 19 | 20 | theme_set(theme_bw()) 21 | 22 | getGenes <- function(sig, bm) { 23 | genes <- getBM(attributes = c("affy_hg_u133a_2", "hgnc_symbol"), 24 | filters = "affy_hg_u133a_2", 25 | values = rownames(sig), 26 | mart = bm) 27 | m <- match(rownames(sig), genes$affy_hg_u133a_2) 28 | sig$gene <- genes[m, "hgnc_symbol"] 29 | return(sig) 30 | } 31 | 32 | plotGenes <- function(e, probe, g, md) { # plot row `probe` of e against md$Time; g labels the title 33 | d <- e[probe, ] %>% # index with the probe argument, not a global variable 34 | as.data.frame() %>% 35 | setNames("value") %>% 36 | mutate(Rep = md$Replicate, 37 | time = md$Time, 38 | agent = md$agent) 39 | gg <- d %>% 40 | ggplot(aes(time, value)) + 41 | geom_boxplot(aes(group = factor(time)), outlier.shape = NA) + # one box per time point 42 | scale_x_continuous(breaks = unique(d$time)) + 43 | geom_jitter(aes(color = factor(agent))) + 44 | geom_smooth() + 45 | labs(title = paste(g, probe, sep = "/"), 46 | x = "time (hours)", 47 | y = "RMA value") + 48 | scale_color_discrete(name = "treatment") 49 | return(gg) 50 | } 51 | ``` 52 | 53 | # Introduction 54 | This tutorial looks at how to use the Bioconductor package [maSigPro](http://www.bioconductor.org/packages/release/bioc/html/maSigPro.html) to analyse the expression of genes over time. 55 | 56 | # Retrieving data using GEOquery 57 | First, we search the [NCBI GEO database](http://www.ncbi.nlm.nih.gov/geo) for suitable public datasets. 
The experimental design criteria that we would like to satisfy are: 58 | 59 | - several timepoints 60 | - several samples per timepoint 61 | - clarity in how expression data were obtained (normalisation, log2 values) 62 | - of some biological interest (easy to link to other data sources) 63 | 64 | Datasets that satisfy these criteria are not easy to discover. Eventually we settled on the study titled [Celecoxib, rofecoxib treated human smooth muscle cells microarray timecourse (GSE59671)](http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE59671). In this study cells were pre-treated with one of two drugs then exposed to a protein, interleukin 1-beta, involved in inflammatory responses. Four biological replicates for each drug treatment and untreated controls were analysed at 0, 2, 8 and 24 hours post-IL1b exposure, generating 4 x 3 x 4 = 48 samples. A further 4 control samples were collected at "-2 hours", i.e. prior to IL1b-exposure. 65 | 66 | ## Expression data 67 | First we fetch the GEO series dataset using GEOquery. Getting the expression values is easy using _exprs()_. A quick check to see that they "look like" normalised log2 values (it's stated that they are at the GEO website). 68 | 69 | ```{r getGEO, cache=TRUE} 70 | gse <- getGEO("GSE59671") 71 | exp <- exprs(gse$GSE59671_series_matrix.txt.gz) 72 | 73 | exp %>% 74 | as.data.frame() %>% 75 | dplyr::select(1:4) %>% 76 | slice(1:4) %>% 77 | pander(justify = "right") 78 | ``` 79 | 80 | ## Phenotypic data 81 | "Phenotypic" data, the details of the experimental design, can be accessed using _pData()_. 82 | 83 | ```{r pdata} 84 | pd <- pData(gse$GSE59671_series_matrix.txt.gz) 85 | names(pd) 86 | ``` 87 | 88 | We can see that times and treatments are captured in the title attribute. 
89 | ```{r} 90 | pd$title[1] %>% 91 | as.character() 92 | ``` 93 | 94 | # Creating a design matrix 95 | We can parse the sample titles using _str\_match_ from the _stringr_ package, then do some data cleaning to create the data frame used for the design matrix. 96 | 97 | ```{r designdata} 98 | pd.des <- str_match(pd$title, "^(.*?)_(.*?)_(.*?)_(.*?)\\s+(.*?)$")[, 2:6] %>% 99 | as.data.frame() %>% 100 | setNames(c("cell", "agent", "Time", "bio", "Replicate")) %>% 101 | mutate(bio = NULL, 102 | hasmc = ifelse(cell == "hasmc", 1, 0), 103 | Control = ifelse(agent == "none", 1, 0), 104 | celecoxib = ifelse(agent == "celecoxib", 1, 0), 105 | rofecoxib = ifelse(agent == "rofecoxib", 1, 0), 106 | Time = gsub("tp", "", Time), 107 | Time = gsub("hr", "", Time), 108 | Time = as.numeric(Time)) 109 | 110 | pd.des %>% 111 | slice(1:5) %>% 112 | pander(justify = "right") 113 | ``` 114 | 115 | The last steps are to drop the -2 hour samples (to simplify things), number the replicates correctly and, importantly, add the GEO sample names as row names in the data frame pd.des, so that the expression data match the phenotypic data. 116 | 117 | Now we can make the design matrix from the data frame with degree = 3 (4 timepoints - 1). Note that times and replicates are given their numeric values; conditions (cell type, control or treated) are signified by values of 0 or 1. 
118 | 119 | ```{r design} 120 | exp.des <- exp[, c(1:16, 21:52)] 121 | pd.des <- pd.des[c(1:16, 21:52), ] 122 | pd.des$Replicate <- rep(1:12, 1, each = 4) 123 | rownames(pd.des) <- pd$geo_accession[c(1:16, 21:52)] 124 | 125 | # now we can make the design matrix from the appropriate columns 126 | design <- make.design.matrix(pd.des[, c(3, 4, 6:8)], degree = 3) 127 | 128 | design$edesign %>% 129 | as.data.frame() %>% 130 | slice(1:5) %>% 131 | pander(justify = "right") 132 | ``` 133 | 134 | # Fitting the regression model 135 | ## From regression model to significant genes 136 | Now we proceed exactly as described in the maSigPro users guide, fitting a regression model to discover probesets with significant differential expression over time. The functions _p.vector()_ and _T.fit()_ use _print()_ to report progress, so we're hiding that output here using _capture.output()_. 137 | 138 | ```{r regression} 139 | hide <- capture.output(fit <- p.vector(exp.des, design)) 140 | hide <- capture.output(tstep <- T.fit(fit, step.method = "backward", alfa = 0.05)) 141 | sigs <- get.siggenes(tstep, rsq = 0.6, vars = "groups") 142 | ``` 143 | 144 | The list _sigs_ is a surprisingly complex object. 145 | 146 | ```{r siggenes} 147 | sigs %>% 148 | glimpse() 149 | ``` 150 | 151 | Detailed information about significant genes is stored in the list _sig.genes_. Since we specified vars = "groups", times and treatments are returned together for each treatment. So for example, _sigs$sig.genes$Control_ returns data for the Control (untreated) time points; _sigs$sig.genes$celecoxibvsControl_ returns data for time points with the contrast celecoxib treatment versus Control. 152 | 153 | We can get the data frames with p-values for control, celecoxib- and rofecoxib-treated cells. 
154 | 155 | ```{r pvals} 156 | control <- sigs$sig.genes$Control$sig.pvalues 157 | celecoxib <- sigs$sig.genes$celecoxibvsControl$sig.pvalues 158 | rofecoxib <- sigs$sig.genes$rofecoxibvsControl$sig.pvalues 159 | ``` 160 | 161 | ## Matching probesets to genes using biomaRt 162 | Next, we write a function that uses biomaRt to fetch HGNC gene symbols for the probesets. 163 | 164 | ```{r getGenes} 165 | mart.hs <- useMart("ensembl", "hsapiens_gene_ensembl") 166 | control <- getGenes(control, mart.hs) 167 | celecoxib <- getGenes(celecoxib, mart.hs) 168 | rofecoxib <- getGenes(rofecoxib, mart.hs) 169 | ``` 170 | 171 | Now we're ready to look at "interesting genes". 172 | 173 | # Plotting timecourses for genes of interest 174 | 175 | ## Control samples 176 | Let's start with the control samples (no drug treatment), sorting on the p-value column. We then write a function that uses ggplot2 to plot the RMA expression values for the probeset corresponding to a given gene. We'll test it with the first probeset. 177 | 178 | ```{r plot1, fig.height=6, fig.width=9} 179 | # head(control[order(control$`p-value`, decreasing = FALSE), ]) 180 | 181 | p <- rownames(control[order(control$`p-value`, decreasing = FALSE), ])[1] 182 | gene <- ifelse(is.na(subset(control, rownames(control) == p)$gene), p, subset(control, rownames(control) == p)$gene) 183 | plotGenes(exp.des, p, gene, pd.des) 184 | ``` 185 | 186 | Expression of the gene CD83 rises dramatically in the first 2 hours after exposure to IL1b, then drops back to the base level by 8 hours. A quick Google search for the term "CD83 inflammation" indicates that the gene is involved with inflammatory responses. 187 | 188 | Let's try another - say, number 6 in the list. 
189 | 190 | ```{r plot2, fig.height=6, fig.width=9} 191 | p <- rownames(control[order(control$`p-value`, decreasing = FALSE), ])[6] 192 | gene <- ifelse(is.na(subset(control, rownames(control) == p)$gene), p, subset(control, rownames(control) == p)$gene) 193 | plotGenes(exp.des, p, gene, pd.des) 194 | ``` 195 | 196 | The ANXA11 gene is also involved with inflammation and shows the opposite behaviour to CD83; expression drops ~ 3-4 fold in the first 8 hours, then rises gradually from 8-24 hours. 197 | 198 | ## Treated samples 199 | Did the drug treatments make any difference to the IL1b response? Let's start with the best p-value for celecoxib versus Control. 200 | 201 | ```{r plot3, fig.height=6, fig.width=9} 202 | p <- rownames(celecoxib[order(celecoxib$p.valor_celecoxibvsControl, decreasing = FALSE), ])[1] 203 | gene <- ifelse(is.na(subset(celecoxib, rownames(celecoxib) == p)$gene), p, subset(celecoxib, rownames(celecoxib) == p)$gene) 204 | plotGenes(exp.des, p, gene, pd.des) 205 | ``` 206 | 207 | There is some indication that expression of JRK post-IL1b exposure was lowered less between 0-2 hours in celecoxib-treated cells, but it's not incredibly convincing as the fold-change overall is quite low under all conditions. What about rofecoxib? 208 | 209 | ```{r plot4, fig.height=6, fig.width=9} 210 | p <- rownames(rofecoxib[order(rofecoxib$p.valor_rofecoxibvsControl, decreasing = FALSE), ])[1] 211 | gene <- ifelse(is.na(subset(rofecoxib, rownames(rofecoxib) == p)$gene), p, subset(rofecoxib, rownames(rofecoxib) == p)$gene) 212 | plotGenes(exp.des, p, gene, pd.des) 213 | ``` 214 | 215 | It appears that this p-value is driven by higher PIK3C2B expression in 3/4 rofecoxib-treated samples at t = 0. Perhaps the less-than-dramatic effects of drug treatment explain why this GEO series is not associated with a publication. 
216 | 217 | # Summary 218 | * maSigPro is a useful, effective package for analysis of timecourse microarray data 219 | * It combines well with ggplot2 to generate attractive and informative plots of gene expression over time 220 | * The example dataset GSE59671 reveals some interesting effects on the expression of inflammation-associated genes when cells are exposed to IL1b 221 | -------------------------------------------------------------------------------- /maSigPro/gse59671.md: -------------------------------------------------------------------------------- 1 | Analysis of gene expression timecourse data using maSigPro 2 | ================ 3 | Neil Saunders 4 | 2018-04-03 21:50:36 5 | 6 | - [Introduction](#introduction) 7 | - [Retrieving data using GEOquery](#retrieving-data-using-geoquery) 8 | - [Expression data](#expression-data) 9 | - [Phenotypic data](#phenotypic-data) 10 | - [Creating a design matrix](#creating-a-design-matrix) 11 | - [Fitting the regression model](#fitting-the-regression-model) 12 | - [From regression model to significant genes](#from-regression-model-to-significant-genes) 13 | - [Matching probesets to genes using biomaRt](#matching-probesets-to-genes-using-biomart) 14 | - [Plotting timecourses for genes of interest](#plotting-timecourses-for-genes-of-interest) 15 | - [Control samples](#control-samples) 16 | - [Treated samples](#treated-samples) 17 | - [Summary](#summary) 18 | 19 | Introduction 20 | ============ 21 | 22 | This tutorial looks at how to use the Bioconductor package [maSigPro](http://www.bioconductor.org/packages/release/bioc/html/maSigPro.html) to analyse the expression of genes over time. 23 | 24 | Retrieving data using GEOquery 25 | ============================== 26 | 27 | First, we search the [NCBI GEO database](http://www.ncbi.nlm.nih.gov/geo) for suitable public datasets. 
The experimental design criteria that we would like to satisfy are: 28 | 29 | - several timepoints 30 | - several samples per timepoint 31 | - clarity in how expression data were obtained (normalisation, log2 values) 32 | - of some biological interest (easy to link to other data sources) 33 | 34 | Datasets that satisfy these criteria are not easy to discover. Eventually we settled on the study titled [Celecoxib, rofecoxib treated human smooth muscle cells microarray timecourse (GSE59671)](http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE59671). In this study cells were pre-treated with one of two drugs then exposed to a protein, interleukin 1-beta, involved in inflammatory responses. Four biological replicates for each drug treatment and untreated controls were analysed at 0, 2, 8 and 24 hours post-IL1b exposure, generating 4 x 3 x 4 = 48 samples. A further 4 control samples were collected at "-2 hours", i.e. prior to IL1b-exposure. 35 | 36 | Expression data 37 | --------------- 38 | 39 | First we fetch the GEO series dataset using GEOquery. Getting the expression values is easy using *exprs()*. A quick check to see that they "look like" normalised log2 values (it's stated that they are at the GEO website). 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 |
GSM1442176GSM1442177GSM1442178GSM1442179
9.2299.3719.3699.458
7.2977.1137.3157.265
2.372.4122.4052.424
5.6865.7195.9415.725
83 | 84 | Phenotypic data 85 | --------------- 86 | 87 | "Phenotypic" data, the details of the experimental design, can be accessed using *pData()*. 88 | 89 | ## [1] "title" "geo_accession" 90 | ## [3] "status" "submission_date" 91 | ## [5] "last_update_date" "type" 92 | ## [7] "channel_count" "source_name_ch1" 93 | ## [9] "organism_ch1" "characteristics_ch1" 94 | ## [11] "characteristics_ch1.1" "characteristics_ch1.2" 95 | ## [13] "characteristics_ch1.3" "characteristics_ch1.4" 96 | ## [15] "characteristics_ch1.5" "biomaterial_provider_ch1" 97 | ## [17] "treatment_protocol_ch1" "growth_protocol_ch1" 98 | ## [19] "molecule_ch1" "extract_protocol_ch1" 99 | ## [21] "label_ch1" "label_protocol_ch1" 100 | ## [23] "taxid_ch1" "hyb_protocol" 101 | ## [25] "scan_protocol" "description" 102 | ## [27] "data_processing" "platform_id" 103 | ## [29] "contact_name" "contact_email" 104 | ## [31] "contact_laboratory" "contact_department" 105 | ## [33] "contact_institute" "contact_address" 106 | ## [35] "contact_city" "contact_state" 107 | ## [37] "contact_zip/postal_code" "contact_country" 108 | ## [39] "supplementary_file" "data_row_count" 109 | ## [41] "relation" "cell type:ch1" 110 | ## [43] "gender:ch1" "material type:ch1" 111 | ## [45] "nsaid treatment:ch1" "race:ch1" 112 | ## [47] "time point:ch1" 113 | 114 | We can see that times and treatments are captured in the title attribute. 115 | 116 | ## [1] "hasmc_celecoxib_tp0hr_biological rep1" 117 | 118 | Creating a design matrix 119 | ======================== 120 | 121 | We can parse the sample titles using *str\_match* from the *stringr* package, then do some data cleaning to create the data frame used for the design matrix. 
122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 |
cellagentTimeReplicatehasmcControlcelecoxibrofecoxib
hasmccelecoxib0rep11010
hasmccelecoxib0rep21010
hasmccelecoxib0rep31010
hasmccelecoxib0rep41010
hasmccelecoxib2rep11010
199 | 200 | The last steps are to drop the -2 hour samples (to simplify things), number the replicates correctly and importantly, add the GEO sample names as row names in the data frame pd.res, so as the expression data matches the phenotypic data. 201 | 202 | Now we can make the design matrix from the data frame with degree = 3 (4 timepoints - 1). Note that times and replicates are given their numeric values; conditions (cell type, control or treated) are signified by values of 0 or 1. 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 |
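| Time | Replicate | Control | celecoxib | rofecoxib |
|---|---|---|---|---|
| 0 | 1 | 0 | 1 | 0 |
| 0 | 1 | 0 | 1 | 0 |
| 0 | 1 | 0 | 1 | 0 |
| 0 | 1 | 0 | 1 | 0 |
| 2 | 2 | 0 | 1 | 0 |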
259 | 260 | Fitting the regression model 261 | ============================ 262 | 263 | From regression model to significant genes 264 | ------------------------------------------ 265 | 266 | Now we proceed exactly as described in the maSigPro users guide, fitting a regression model to discover probesets with significant differential expression over time. The functions *p.vector()* and *T.fit()* use *print()* to report progress, so we're hiding that output here using *capture.output()*. 267 | 268 | The list *sigs* is a surprisingly complex object. 269 | 270 | ## List of 2 271 | ## $ sig.genes:List of 3 272 | ## ..$ Control :List of 7 273 | ## .. ..$ sig.profiles :'data.frame': 4757 obs. of 48 variables: 274 | ## .. ..$ coefficients :'data.frame': 4757 obs. of 12 variables: 275 | ## .. ..$ group.coeffs :'data.frame': 4757 obs. of 12 variables: 276 | ## .. ..$ sig.pvalues :'data.frame': 4757 obs. of 14 variables: 277 | ## .. ..$ g : int 4757 278 | ## .. ..$ edesign :'data.frame': 48 obs. of 5 variables: 279 | ## .. ..$ groups.vector: chr [1:12] "Control" "celecoxibvsControl" "rofecoxibvsControl" "Control" ... 280 | ## ..$ celecoxibvsControl:List of 7 281 | ## .. ..$ sig.profiles :'data.frame': 769 obs. of 48 variables: 282 | ## .. ..$ coefficients :'data.frame': 769 obs. of 12 variables: 283 | ## .. ..$ group.coeffs :'data.frame': 769 obs. of 12 variables: 284 | ## .. ..$ sig.pvalues :'data.frame': 769 obs. of 14 variables: 285 | ## .. ..$ g : int 769 286 | ## .. ..$ edesign :'data.frame': 48 obs. of 5 variables: 287 | ## .. ..$ groups.vector: chr [1:12] "Control" "celecoxibvsControl" "rofecoxibvsControl" "Control" ... 288 | ## ..$ rofecoxibvsControl:List of 7 289 | ## .. ..$ sig.profiles :'data.frame': 899 obs. of 48 variables: 290 | ## .. ..$ coefficients :'data.frame': 899 obs. of 12 variables: 291 | ## .. ..$ group.coeffs :'data.frame': 899 obs. of 12 variables: 292 | ## .. ..$ sig.pvalues :'data.frame': 899 obs. of 14 variables: 293 | ## .. 
..$ g : int 899 294 | ## .. ..$ edesign :'data.frame': 48 obs. of 5 variables: 295 | ## .. ..$ groups.vector: chr [1:12] "Control" "celecoxibvsControl" "rofecoxibvsControl" "Control" ... 296 | ## $ summary :'data.frame': 4757 obs. of 3 variables: 297 | ## ..$ Control : Factor w/ 4757 levels "1053_at","1294_at",..: 1 2 3 4 5 6 7 8 9 10 ... 298 | ## ..$ celecoxibvsControl: Factor w/ 770 levels " ","1053_at",..: 2 3 4 5 6 7 8 9 10 11 ... 299 | ## ..$ rofecoxibvsControl: Factor w/ 900 levels " ","1053_at",..: 2 3 4 5 6 7 8 9 10 11 ... 300 | 301 | Detailed information about significant genes is stored in the list *sig.genes*. Since we specified vars = "groups", times and treatments are returned together for each treatment. So for example, `sigs$sig.genes$Control` returns data for the Control (untreated) time points; `sigs$sig.genes$celecoxibvsControl` returns data for time points with the contrast celecoxib treatment versus Control. 302 | 303 | We can get the data frames with p-values for control, celecoxib- and rofecoxib-treated cells. 304 | 305 | Matching probesets to genes using biomaRt 306 | ----------------------------------------- 307 | 308 | Next, we write a function that uses biomaRt to fetch HGNC gene symbols for the probesets. 309 | 310 | Now we're ready to look at "interesting genes". 311 | 312 | Plotting timecourses for genes of interest 313 | ========================================== 314 | 315 | Control samples 316 | --------------- 317 | 318 | Let's start with the control samples (no drug treatment), sorting on the p-value column. We then write a function that uses ggplot2 to plot the RMA expression values for the probeset corresponding to a given gene. We'll test it with the first probeset. 319 | 320 | ![](gse59671_files/figure-markdown_github/plot1-1.png) 321 | 322 | Expression of the gene CD83 rises dramatically in the first 2 hours after exposure to IL1b, then drops back to the base level by 8 hours.
A quick Google search for the term "CD83 inflammation" indicates that the gene is involved with inflammatory responses. 323 | 324 | Let's try another - say, number 6 in the list. 325 | 326 | ![](gse59671_files/figure-markdown_github/plot2-1.png) 327 | 328 | The ANXA11 gene is also involved with inflammation and shows the opposite behaviour to CD83; expression drops ~ 3-4 fold in the first 8 hours, then rises gradually from 8-24 hours. 329 | 330 | Treated samples 331 | --------------- 332 | 333 | Did the drug treatments make any difference to the IL1b response? Let's start with the best p-value for celecoxib versus Control. 334 | 335 | ![](gse59671_files/figure-markdown_github/plot3-1.png) 336 | 337 | There is some indication that expression of JRK post-IL1b exposure was lowered less between 0-2 hours in celecoxib-treated cells, but it's not incredibly convincing as the fold-change overall is quite low under all conditions. What about rofecoxib? 338 | 339 | ![](gse59671_files/figure-markdown_github/plot4-1.png) 340 | 341 | It appears that this p-value is driven by higher PIK3C2B expression in 3/4 rofecoxib-treated samples at t = 0. Perhaps the less-than-dramatic effects of drug treatment explain why this GEO series is not associated with a publication. 
342 | 343 | Summary 344 | ======= 345 | 346 | - maSigPro is a useful, effective package for analysis of timecourse microarray data 347 | - It combines well with ggplot2 to generate attractive and informative plots of gene expression over time 348 | - The example dataset GSE59671 reveals some interesting effects on the expression of inflammation-associated genes when cells are exposed to IL1b 349 | -------------------------------------------------------------------------------- /maSigPro/gse59671_files/figure-markdown_github/plot1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neilfws/utils4bioinformatics/d689dbac6a2e6959ab369ec76596964d108c599e/maSigPro/gse59671_files/figure-markdown_github/plot1-1.png -------------------------------------------------------------------------------- /maSigPro/gse59671_files/figure-markdown_github/plot2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neilfws/utils4bioinformatics/d689dbac6a2e6959ab369ec76596964d108c599e/maSigPro/gse59671_files/figure-markdown_github/plot2-1.png -------------------------------------------------------------------------------- /maSigPro/gse59671_files/figure-markdown_github/plot3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neilfws/utils4bioinformatics/d689dbac6a2e6959ab369ec76596964d108c599e/maSigPro/gse59671_files/figure-markdown_github/plot3-1.png -------------------------------------------------------------------------------- /maSigPro/gse59671_files/figure-markdown_github/plot4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neilfws/utils4bioinformatics/d689dbac6a2e6959ab369ec76596964d108c599e/maSigPro/gse59671_files/figure-markdown_github/plot4-1.png
-------------------------------------------------------------------------------- /ncbi/biosample/code/ruby/cell_lines.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | # cell_lines.rb 4 | # search NCBI biosample database for misidentified cell lines 5 | # then search pubmed for those cell lines & return count 6 | 7 | require 'bio' 8 | 9 | Bio::NCBI.default_email = "me@me.com" 10 | ncbi = Bio::NCBI::REST.new 11 | 12 | search = ncbi.esearch("cell line status misidentified[Attribute]", {"db" => "biosample", "retmax" => 500}) 13 | 14 | search.each do |id| 15 | record = ncbi.efetch(id, {"report" => "full", "db" => "biosample", "mode" => "text"}) 16 | line = record.split("\n").find {|e| /\/cell line="(.*?)"/ =~ e } 17 | if line =~ /cell line="(.*?)"/ 18 | pubmed = ncbi.esearch_count("#{$1}[TIAB]", {"db" => "pubmed"}) 19 | puts "#{$1}\t#{pubmed}" 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/README.md: -------------------------------------------------------------------------------- 1 | # entrez_db_terms 2 | 3 | The script *entrez_db_terms.rb* generates a summary of searchable fields for each of the NCBI Entrez databases. 4 | 5 | Summary files (one per database) are written to the *data/* directory. 6 | 7 | The script requires Nokogiri and BioRuby. 8 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/code/ruby/entrez_db_terms.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | require 'bio' 3 | require 'nokogiri' 4 | require 'open-uri' 5 | 6 | Bio::NCBI.default_email = "me@me.com" 7 | outd = File.expand_path("../../../data", __FILE__) 8 | ncbi = Bio::NCBI::REST.new 9 | url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi?db=" 10 | ncbi.einfo.each do |db| 11 | puts "Processing #{db}..." 
12 | outf = outd + "/" + "#{db}.txt" 13 | File.open(outf, "w") do |f| 14 | doc = Nokogiri::XML(open("#{url + db}")) 15 | doc.xpath("//FieldList/Field").each do |field| 16 | name = field.xpath("Name").inner_html 17 | fullname = field.xpath("FullName").inner_html 18 | description = field.xpath("Description").inner_html 19 | f.write("#{name},#{fullname},#{description}\n") 20 | end 21 | end 22 | puts "Wrote file #{outf}" 23 | end 24 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/assembly.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | ACCN,Accession,Chromosome accessions 5 | ASAC,Assembly Accession,Space delimited assembly accessions w/ & w/o versions 6 | ASLV,Assembly Level,How assembled is this assembly. 'Contig' to 'Chromosome' 7 | TXID,Taxonomy ID,Taxonomy ID 8 | ORGN,Organism,Exploded organism names 9 | RUID,RefSeq Release ID,Release Id of RefSeq Assembly. 10 | GUID,GenBank Release ID,Release Id of GenBank synonym of this Assembly. 11 | UIDS,All Uids,Pair-id, GB-id, and RS-id of this Assembly. 12 | PROJ,BioProject IDs and Accessions,Uid and accessions of this assembly's projects 13 | SAMP,Biosample,Biosample Accession and Id 14 | NAME,Assembly Name,Assembly name 15 | ALLN,All Names,All names, space separated 16 | DESC,Description,Assembly description 17 | COV,Coverage,Sequencing coverage 18 | CLAS,Assembly Class,Type of the assembly 19 | RELS,Date - Assembly Release,Date the assembly was first released 20 | SRDT,Date - Sequences Release,Date the most recent sequence went live in ID 21 | UPDT,Date - Assembly Update,Date the assembly was last updated 22 | LEN,Total Sequence Length,Total length of chromosome/genome including bases and gaps divided by 1,000,000. 
23 | REPL,Chromosome Count,Number of chromosomes in assembly 24 | PLAC,Placed Scaffolds Count,Number of placed scaffolds 25 | UNLO,Unlocalized Scaffolds Count,Number of unordered(unlocalized) scaffolds belonging to chromosomes 26 | UNPL,Unplaced Scaffolds Count,Number of unplaced scaffolds which do not belong to any chromosome, ie ChrUn 27 | CN50,Contig N50,Contig length at which 50% of total bases in assembly are in contigs of that length or greater 28 | SN50,Scaffold N50,Scaffold length at which 50% of total bases in assembly are in contigs of that length or greater 29 | CL50,Contig L50,Number of contigs that are greater than or equal to the N50 length. 30 | SL50,Scaffold L50,Number of scaffolds that are greater than or equal to the N50 length. 31 | CNTG,Contig Count,Number of contigs 32 | UNGL,Ungapped Length,Total length excluding gaps in chromosome/genome divided by 1,000,000 33 | PROP,Properties,Properties 34 | SUBO,Submitter Organization,Organization that submitted this assembly 35 | INFR,Infraspecifc name,Infraspecific name: breed, cultivar, strain, ecotype 36 | ISOL,Isolate,Isolate name 37 | SEX,Sex,Sex 38 | ASMM,Assembly Method,Assembly Method 39 | GCOV,Genome Coverage,Genome Coverage 40 | TECH,Sequencing Technology,Sequencing Technology 41 | EXFV,Expected Final Version,Expected Final Version 42 | RGAS,Reference Guided Assembly,Reference Guided Assembly 43 | SCAM,Single Cell Amplification,Single Cell Amplification 44 | RCAT,RefSeq Category,RefSeq Category 45 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/bioproject.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | ORGN,Organism,Organism 5 | PRJA,Project Accession,Project Accession 6 | TYPE,Project Type,Project Type 7 | STPE,Project Subtype,Project Subtype 8 | 
DATE,Registration Date,Registration Date 9 | TITL,Title,Title 10 | CEN,Submitter Organization,Submitter Organization(s) 11 | ACCN,Replicon accession,Space delimited GenBank or RefSeq Replicon Accessions 12 | RTYP,Replicon type,Replicon Type 13 | RNME,Replicon name,Replicon Name 14 | LTP,Locus Tag Prefix,Locus Tag Prefix 15 | WORD,Description,Organism/Project Description 16 | KWRD,Keyword,Keywords 17 | PROP,Properties,Project/Organism Properties 18 | DTPE,Project Data Type,Project Data Type 19 | GRNT,Grant ID,Grant ID 20 | FUND,Funding Agency,Funding Agency 21 | PMID,PMID,Pubmed ID 22 | DOID,DOI,DOI ID 23 | PID,ProjectID,Project ID 24 | RELV,Relevance,Relevance 25 | ANME,Assembly name,Assembly Name 26 | BPRJ,BioProject ID,BioProject ID or accession 27 | TPRJ,Top Bioproject,Top Bioproject ID 28 | WGSA,WGS Accession,WGS Accessions 29 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/biosample.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | ACCN,Accession,Accession number of sequence 5 | TITL,Title,Words in definition line 6 | PROP,Properties,Classification by source qualifiers and molecule type 7 | WORD,Text Word,Free text associated with record 8 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy 9 | AUTH,Author,Author(s) of publication 10 | PDAT,Publication Date,Date sequence added to GenBank 11 | MDAT,Modification Date,Date of last update 12 | ATNM,Attribute Name,Attribute Name 13 | ATTR,Attribute,Attribute 14 | CEN,Submitter Organization,Submitter Organization(s) 15 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/biosystems.txt: -------------------------------------------------------------------------------- 1 | ALL,All 
Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | SRC,SourceName,Name of the organization that is the source of the record 5 | SRID,SourceID,A numerical id that is assigned to a particular source of biosystem records 6 | TYPE,BioSystemType,Type of the biosystem 7 | PDAT,CreateDate,The date the biosystem record first appeared in the NCBI biosystems database 8 | MDAT,ModifyDate,The date the biosystem record last changed in the NCBI biosystems database 9 | SACC,SourceAccession,The accession used by the source of the biosystem 10 | TITL,Title,The name of the biosystem 11 | DESC,Description,The text description of a biosystem 12 | COM,Comments,Comments on the biosystem 13 | ORGN,Organism,Organism that contain the biosystem 14 | PN,ProteinName,Names of proteins in a biosystem (definition line) 15 | CN,ChemicalName,Names of small molecules in a biosystem (taken from PubChem) 16 | SEID,SidExternalID,Id given to a small molecule by the source of a biosystem 17 | GN,GeneName,Gene name 18 | GEID,GeneExternalID,Id given to a gene by the source of a biosystem 19 | PID,ProteinID,Protein accessions and gis in a biosystem 20 | CID,CID,PubChem compound identifiers (cid) found in a biosystem 21 | GID,GeneID,NCBI gene ids found in a biosystem 22 | SID,SID,PubChem substance ids (sid) found in a biosystem 23 | ACCN,Accession,The biosystem accession (bsid plus version) 24 | SCT,SIDCount,Total Count of PubChem substance ids (sid) found in a biosystem 25 | CCT,CIDCount,Total Count of PubChem compound ids (cid) found in a biosystem 26 | GCT,GeneCount,Total Count of NCBI Gene records (geneid) found in a biosystem 27 | PCT,ProteinCount,Total Count of NCBI Protein records (gi) found in a biosystem 28 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/blastdbinfo.txt: -------------------------------------------------------------------------------- 1 | 
ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | DB,Database Name,Official name of the database 5 | TITL,Database Title,Words in the title of database (e.g., "NCBI Transcript Reference Sequences") 6 | DATE,Last Update,Date of last database update 7 | ORGN,Database Organism Taxid,Organism Taxid 8 | ASM,Genome Collection Assembly Name,Genome Collection Assembly Name 9 | SEQT,Blast Sequence Type,One of genomic, cdna, other-dna, or protein, of which genomic and cdna could be further specified 10 | SEQS,Blast Sequence Strategy,Appropriate sequence strategy for the sequence type specified 11 | SRC,Blast Database Source,States where the sequences came from, e.g., genbank, refseq, trace, etc. 12 | KEYW,Keyword,Search term identifying this database entry 13 | PRJ,NCBI Genome Project ID,NCBI Genome Project Identifier 14 | GPB,Gpipe Build Name,Gpipe Build 15 | WGPR,NCBI WGS Project ID,NCBI WGS Project Identifier 16 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/books.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | AUTH,Author,Section's author 5 | CA,Corporate Author,Corporate Author of publication 6 | FA,Full Author Name,Full Author Name(s) of publication 7 | FE,Full Editor Name,f 8 | TITL,Title,Section's title 9 | TYPE,Type,Section's type 10 | STXT,Full Text,Section's full text 11 | CONP,Concept Phrases,Generated keywords 12 | BOOK,Book,ID of the book that contains the document 13 | PMID,PMID,PubMed ID 14 | RMID,RefPMID,Citation search by PmId 15 | RID,Rid,Book internal ID 16 | PUBN,Publisher,Publisher's Name 17 | PDAT,Publication Year,Publication Year 18 | ISBN,ISBN,ISBN 19 | ATTR,Attribute,Attributes in key value ordered pairs 20 | 
EDIT,Editor,Section's Editor 21 | RD,Release Date,Release Date 22 | SUB,Subject,Subject 23 | RT,Resource Type,Resource Type 24 | AID,Accession ID,Accession ID 25 | BACI,Book Accession ID,Book Accession ID 26 | CHID,Chapter Accession ID,Chapter Accession ID 27 | DN,Disease,Disease 28 | GS,Gene Name,Gene Name 29 | PN,Protein Name,Protein Name 30 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/cdd.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | ACCN,Accession,Unique text identifier for a CD 5 | DB,Database,Which database CD is from (pfam, smart ...) 6 | TITL,Title,The short descriptive name of a CD, e.g. Rho 7 | STTL,Subtitle,A short description of the CD 8 | WORD,Text Word,The long description of the CD 9 | ORGN,Organism,The root taxonomy node of a CD 10 | PDAT,Publication Date,The date a CD was published 11 | MDAT,Modification Date,The date a CD was last modified 12 | PLEN,PssmLength,Length of the PSSM or domain search model 13 | AACN,Alternative Accession,Alternative unique text identifier for a CD, from source database 14 | STRP,Structure Representative,The number of structures in a CD 15 | SD,The description of sites,The desription of functional sites in a domain 16 | NS,Number of Sites,The number of functional sites in a domain 17 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/clinvar.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | TITL,Name of the ClinVar record,Constructed from variant and phenotype names 5 | WORD,Text Word,Free text associated with record 6 | 
ORGN,Organism,scientific and common names of organism 7 | MDAT,Modification Date,The last date on which the record was updated 8 | CHR,Chromosome,Chromosome number or numbers; also 'mitochondrial', 'unknown' properties 9 | GENE,Gene Name,Symbol or symbols of the gene 10 | MIM,MIM,MIM number from OMIM 11 | DIS,Disease/Phenotype,Diseases or traits associated with this record 12 | ACCN,ClinVar accession,Accession of the genotype/phenotype assertion 13 | VRID,Variant ID,Public ID of a variant 14 | TRID,Trait identifier,Public identifier for a trait (e.g. CUI, HPO) 15 | PROP,Properties,Properties of ClinVar record 16 | CDAT,Creation Date,The date on which this record first appeared 17 | PMID,PubMed ID,PubMed ids of accessions linked to the record 18 | GID,Gene ID,Gene ID 19 | TID,Taxonomy ID,taxonomy id 20 | DDAT,Date Discontinued,The date on which the record was discontinued 21 | CPOS,Base Position,Chromosome base position 22 | GFN,Gene Full Name,Gene full name 23 | PFN,Protein Full Name,Protein full name 24 | SUB,Submitter,Organization or submitter handle making the submission 25 | VRNM,Variant name,Names used for this allele 26 | VRTP,Type of variation,Type of sequence change/variant call 27 | MCNS,Molecular consequence,Consequence of the variation at the molecular level. 28 | RVST,Review status,Review status 29 | ALID,AlleleID,Unique identifier assigned to a specific sequence change at a location. 30 | ORIG,Origin,Origin 31 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/clone.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | ACC,Accession,Accession of any internal or external identifier. Versions removed. 5 | ACCV,Accession Version,Accession and version of GENBANK accessions associated with clone records. 
6 | ALAB,Alternate Library Abbreviation,Alternate Library Abbreviation 7 | ALN,Alternate Library Name,Alternate Library Name 8 | ASSA,Assembly Accession,Accession of assembly on which placed 9 | ASSN,Assembly Name,Assembly on which placed 10 | BREE,Breed,Breed 11 | CELL,Cell Line,Cell Line 12 | CELT,Cell Type,Cell Type 13 | CHRA,Chromosome Accession,Accession of chromosome on which placed 14 | CHRB,Chromosome Start,Chromosome start of placement 15 | CHRE,Chromosome Stop,Chromosome end of placement 16 | CLN,Clone Name,Clone Name 17 | CLA,Clone Name Alias,Clone Name Alias 18 | CULT,Cultivar,Cultivar 19 | CUAC,Cultivar Accession,Cultivar Accession provided in library submission XML 20 | DIST,Distributor,Library Distributor Name, provided in library submission XML 21 | DEST,Development Stage,=Development Stage provided in library submission XML 22 | GENE,Gene Name,Name or alias of Gene at same location as the placement of the clone 23 | GNID,Gene ID,GeneID of gene at same location as the placement of the clone 24 | GDSC,Gene Description,Full name (description) of gene at same location as the placement of the clone 25 | GI,GI,GIs associated with clone records 26 | ISOL,Isolate,Isolate provided in library submission XML 27 | LBR,Library Abbreviation,Library Abbreviation 28 | LID,Library ID,Library ID 29 | LIB,Library Name,Library name 30 | LIBT,Library Type,Library Type 31 | OT,Object type,Object type in Clone DB (library, clone) 32 | ORGA,Organ,Organ provided in library submission XML 33 | ORG,Organism,Organism name (exploded) 34 | PLCD,Placed,display Y/N for has_placement/no_placement 35 | PLMT,Placement Method,Placement Method 36 | PLCN,Placement Confidence,Placement Confidence 37 | POPU,Population,Population provided in library submission XML 38 | PROP,Properties,Properties of data set for example HasInsert HasEnd IsPlaced IsConcordant 39 | PID,Genome Project Id,Genome Project Id provided in library submission XML 40 | PMID,PMID,PubMed Id provided in library 
submission XML 41 | SCFA,Scaffold Accession,Accession of scaffold on which on which placed 42 | SCFB,Scaffold Start,Scaffold start of placement 43 | SCFE,Scaffold Stop,Scaffold stop of placement 44 | STRA,Strain,Strain 45 | STS,STS,STS's that have been mapped to any sequences associated with clone records 46 | TXID,Taxonomy ID,Taxonomy ID 47 | TI,TI,TIs associated with clone records 48 | TISS,Tissue,Tissue provided in library submission XML 49 | VN,Vector,Vector name provided in library submission XML 50 | VT,Vector Type,Vector type provided in library submission XML 51 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/dbvar.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | ACC,Accession,Accession of any internal or external identifier. Versions removed from GENBANK accessions. 5 | ACCV,Accession Version,Accession and version of GENBANK accessions used in variant sequence or support. 6 | LAB,Submitter Affiliation,Submitter's affiliation name 7 | ALOR,Allele Origin,Allele origin (controlled vocabulary), including Both=Germline+Somatic 8 | ALTP,Variant Call Type,Variant Call type (controlled vocabulary) 9 | ASSM,Assembly Name,Assembly of placement 10 | ASAC,Assembly Accession,Assembly accession of placement 11 | AORG,Assembly Organism,Assembly organism names (exploded) 12 | ATAX,Assembly Taxonomy ID,Assembly taxonomy ID 13 | AUTH,Author,All authors included in journal 14 | BLCK,Block Start,Start of a 100k block on chromosome containing the variant. 
15 | CH,Chromosome,Chromosome of placement 16 | CHRA,Chromosome Accession,Chromosome of placement, using accession.version 17 | CHRE,Chromosome End,End of placement on chromosome 18 | CHRS,Chromosome Start,Start of placement on chromosome 19 | INRE,Chromosome Inner End,Inner end of placement on chromosome 20 | INRS,Chromosome Inner Start,Inner start of placement on chromosome 21 | OTRE,Chromosome Outer End,Outer end of placement on chromosome 22 | OTRS,Chromosome Outer Start,Outer start of placement on chromosome 23 | CLIN,Variant Clinical Interpretation,Clinical interpretation of a variant (controlled vocabulary) 24 | CLVA,ClinVar Accession,ClinVar Accession (SCV) 25 | CTG,Unplaced Contig Accession,Contig of placement, when not on a chromosome, using accession.version 26 | DET,Detection Method,Detection method 27 | DESC,Variant Description,Variant description 28 | DDAT,Discontinued Date,dbVar discontinued date 29 | ESSV,Numeric Portion of EBI Variant Call ID,Numeric portion of EBI Variant Call ID (essv) 30 | ESTD,Numeric Portion of EBI Study ID,Numeric portion of EBI Study ID (estd) 31 | ESV,Numeric Portion of EBI Variant Region ID,Numeric portion of EBI Variant Region ID (esv) 32 | GENE,Gene Name,Name or alias of gene at same location as variant 33 | GNID,Entrez Gene ID,Gene ID of gene at same location as variant 34 | GDSC,Gene Full Name,Full name (description) of gene at same location as variant 35 | LIB,Library Abbreviation,Library name used in the Method 36 | MPLT,Method Platform,Method platform 37 | MSUB,Method Submission Name,Submission name of individual method, used when study contains multiple methods from different submitters, as does the curated dataset. 38 | METH,Method Type,Method type (controlled vocabulary) 39 | MCAT,Method Type Category,Used for sorting and display. Methods are categorized as: probe, mapping, sequencing. 40 | MWGT,Method Type Weight,used for sorting. 
BAC=all Method_type values of study or variant are BAC aCGH, Non-BAC=study or variant has at least 1 method_type that is other than BAC aCGH 41 | MESH,MeSH ID,Medical Subject Headings (MeSH) ID (exploded) 42 | MIM,MIM ID,Online Mendelian Inheritance in Man 43 | NCBI,Submitter MyNCBI ID,Submitter login ID in myNCBI system 44 | NSSV,Numeric Portion of NCBI Variant Call ID,Numeric portion of NCBI Variant Call ID (nssv) 45 | NST,Numeric Portion of NCBI Study ID,Numeric portion of NCBI Study ID (nstd) 46 | NSV,Numeric Portion of NCBI Variant Region ID,Numeric portion of NCBI Variant Region ID (nsv) 47 | OT,Object Type,Object type in dbVar (STUDY, VARIANT) 48 | ORG,Organism,Organism name (exploded) 49 | PDA,Submitter PDA Login,Submitter login ID in NCBI PDA system 50 | PHEN,Phenotype,Phenotype of sample/subject study or reference specimen 51 | PTYP,Placement Type,Placement type (controlled vocabulary) 52 | PMID,PMID,Unique identifier from PubMed 53 | GPRJ,Genome Projects ID,Unique identifier from Genome Projects 54 | PRNM,Genome Projects Name,Name from Genome Projects corresponding to Project_ID 55 | PDAT,Publication Date,Journal Publication date 56 | SSV,Variant Call Accession,dbVar ID (essv or nssv) of Variant Call 57 | ST,Study Accession,Study dbVar ID (estd or nstd) 58 | SV,Variant Region Accession,dbVar ID (esv or nsv) of Variant Region 59 | SMPL,Sample,Sample/subject ID of study or reference specimen 60 | SC,Sample Count,Number of samples in study 61 | STDE,Study Description,Study description 62 | STDN,Study Display Name,Study display name 63 | STDY,Study ID,Study, batch or submission ID 64 | STYP,Study Type,Study type assigned by NCBI 65 | SVAR,Submitter Variant ID,Originally submitted variant identifier 66 | SUPH,Subject Phenotype status,Boolean subject phenotype status: 0=not affected/null; 1 = affected 67 | SUB,Submitter Name,Submitter first and last name 68 | SSVC,Variant Call Count,Number of supporting variant calls in variant region 69 | TXID,Taxonomy 
ID,Taxonomy ID 70 | MDAT,Modification Date,dbVar Modification Date 71 | VAL,Validation Method,Validation method (controlled vocabulary) 72 | VSTA,Validation Result,Boolean validation status: null=not validated, 0=validated with result=0; 1 = validated with result=1 73 | VWGT,Validation Result Weight,0=not validated, 1=validated with result=0; 2 = validated with result=1 74 | VC,Variant Region Count,Number of variant regions in study 75 | VLEN,Variant Size,Size of variant 76 | VT,Variant Region Type,Variant region type (controlled vocabulary) 77 | ZYG,Variant Zygosity,Zygosity of a variant (controlled vocabulary) 78 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/epigenomics.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | WORD,Text Word,Text 5 | TITL,Title,Title 6 | TXID,Taxonomy ID,TaxId 7 | ACCN,Accession,Epigenomics accession number 8 | KYWD,Keyword,Keyword 9 | COID,Concept ID,UMLS concept ID (CID) 10 | AUTH,Author,Author 11 | PRID,Project ID,ProjectId 12 | DOCT,Document Type,DocType 13 | CDAT,Create Date,CreateDate 14 | MDAT,Update Date,UpdateDate 15 | ORGN,Organism,scientific and common names of organism 16 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/gap.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | DISC,Discriminator,Discriminator 5 | OBJ,Object Type,Object Type 6 | ANCE,Ancestor,Ancestor 7 | BELO,Belongs To,Belongs To 8 | ATTR,Attribution,Attribution 9 | RTST,Is Root Study,Is Root Study 10 | TLST,Is Top-Level Study,Is Top-Level Study 11 | STID,Study ID,Study ID 12 | 
STNM,Study Name,Study Name 13 | DIS,Disease,Disease 14 | PROJ,Project,Project 15 | GENO,Genotype Platform,Genotype Platform 16 | SRA,Study Has SRA Components,Study Has SRA components 17 | STUD,Study,Study 18 | HASV,Has Variable,Has Variable 19 | VRID,Variable ID,Variable ID 20 | VRNM,Variable Name,Variable Name 21 | VRDS,Variable Description,Variable Description 22 | VAR,Variable,Variable 23 | HASD,Has Document,Has Document 24 | DCID,Document ID,Document ID 25 | DCNM,Document Name,Document Name 26 | DOC,Document,Document 27 | DOCP,Document Part,Document Part 28 | HASA,Has Analysis,Has Analysis 29 | ANID,Analysis ID,Analysis ID 30 | ANNM,Analysis Name,Analysis Name 31 | ANLS,Analysis,Analysis 32 | HAST,Has Dataset,Has Dataset 33 | DSID,Dataset ID,Dataset ID 34 | DSNM,Dataset Name,Dataset Name 35 | DS,Dataset,Dataset 36 | PX,PhenX,PhenX 37 | HASP,Has PhenX Mapping,Has PhenX Mapping 38 | ARCH,Study Archive,Study Archive 39 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/gapplus.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | SRC,Source Database,Source Database 5 | CDAT,Create Date,Create Date 6 | MDAT,Modification Date,Date of last update 7 | PMID,PubMed ID,PubMed ID 8 | PDAT,Publication Date,Publication date 9 | JOUR,Journal,Journal abbreviation of publication 10 | TITL,Title,Document title 11 | RS,Reference SNP ID,Clustered SNP ID (rs) 12 | CHR,Chromosome,Chromosome 13 | CPOS,Chromosome Base Position,Position 14 | PLAT,Platform,Platform 15 | GENE,Gene Name,Gene Name 16 | FXN,Function Class,Function Class 17 | PVAL,Log of P-Value,Log of P-value 18 | TRT,Phenotype Trait,Phenotype Trait 19 | POPL,Population,Population 20 | -------------------------------------------------------------------------------- 
/ncbi/entrez_db_terms/data/gds.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | ORGN,Organism,exploded organism names 5 | ACCN,GEO Accession,accession for GDS (DataSet), GPL (Platform), GSM (Sample), GSE (Series) 6 | TITL,Title,Words in title of record 7 | DESC,Description,Text from description, summary and other similar fields 8 | SFIL,Supplementary Files,Supplementary Files 9 | ETYP,Entry Type,Entry type (DataSet or Series) 10 | STYP,Sample Type,Sample type 11 | VTYP,Sample Value Type,type of values, e.g. log ratio, count 12 | PTYP,Platform Technology Type,Platform technology type 13 | GTYP,DataSet Type,type of dataset 14 | NSAM,Number of Samples,Number of samples 15 | SRC,Sample Source,sample source 16 | AUTH,Author,author of the GEO Sample, Platform or Series 17 | INST,Submitter Institute,institute, or organization affiliatedd with contributers 18 | NPRO,Number of Platform Probes,number of platform probes 19 | SSTP,Subset Variable Type,subset variable type 20 | SSDE,Subset Description,subset description 21 | GEID,Reporter Identifier,name or identifier for the spot, e.g. GenBank, UniGene ID, Locus Link ID etc. 
22 | PDAT,Publication Date,publication date from the GEO related entities 23 | UDAT,Update Date,date 24 | TAGL,Tag Length,Tag/Signature length for SAGE/MPSS 25 | RGSE,Related Series,Related Series 26 | RGPL,Related Platform,Related Platform 27 | MESH,MeSH Terms,Medical Subject Headings 28 | PROJ,Project,Project 29 | ATNM,Attribute Name,Attribute Name 30 | ATTR,Attribute,Attribute 31 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/gencoll.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | ACCN,Accession,Chromosome accessions 5 | ASAC,Assembly Accession,Space delimited assembly accessions w/ & w/o versions 6 | CAT,Category,Assembly Set type or "assembly-unit" 7 | TXID,Taxonomy Id,Taxonomy Id 8 | ORGN,Organism,Exploded organism names 9 | PROJ,Project Id,Uid(s) of this Assembly's Projects 10 | NAME,Assembly Name,Assembly Name 11 | ALLN,All Names,All Names, space separated 12 | COV,Coverage,Sequencing Coverage 13 | CLAS,Assembly Class,Type of the Assembly 14 | REL,Release Type,Release Type 15 | PART,Partial Genome Representation,Partial Genome Representation 16 | RELS,NCBI Release Date,NCBI Release Date 17 | LEN,Total Sequence Length,Total length of chromosome/genome including bases and gaps. 
18 | REPL,Chromosome count,Number of chromosomes in assembly 19 | PLAC,Placed Scaffolds Count,Number of placed scaffolds 20 | UNLO,Unlocalized Scaffolds Count,Number of unordered(unlocalized) scaffolds belonging to chromosomes 21 | UNPL,Unplaced Scaffolds Count,Number of unplaced scaffolds which do not belong to any chromosome, ie ChrUn 22 | PROP,Properties,Properties 23 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/gene.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to a gene record 3 | FILT,Filter,Limits the records 4 | TITL,Gene/Protein Name,gene or protein name 5 | WORD,Text Word,Free text associated with record 6 | ORGN,Organism,scientific and common names of organism 7 | MDAT,Modification Date,The last date on which the record was updated 8 | CHR,Chromosome,Chromosome number or numbers; also 'mitochondrial', 'unknown' properties 9 | MV,Default Map Location,Chromosomal map location as displayed in MapViewer 10 | GENE,Gene Name,Symbol or symbols of the gene 11 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number 12 | MIM,MIM ID,MIM number from OMIM 13 | DIS,Disease/Phenotype,Name(s) of diseases associated with this gene. 
When available, OMIM name will be used 14 | ACCN,Nucleotide/Protein Accession,Nucleotide or protein accession(s) associated with this gene 15 | UGEN,UniGene Cluster Number,UniGene cluster number for this gene 16 | PROP,Properties,Properties of Gene record 17 | CDAT,Creation Date,The date on which this record first appeared 18 | NCAC,Nucleotide Accession,nucleotide accessions of seqeunces 19 | NUID,Nucleotide UID,nucleotide uids of sequences 20 | PACC,Protein Accession,protein accessions 21 | PUID,Protein UID,protein uids 22 | PMID,PubMed ID,PubMed ids of accessions linked to the record 23 | TID,Taxonomy ID,taxonomy id 24 | GO,Gene Ontology,Gene Ontology 25 | DOM,Domain Name,Domain Name 26 | DDAT,Date Discontinued,The date on which the record was discontinued 27 | CPOS,Base Position,Chromosome base position 28 | GFN,Gene Full Name,Gene full name 29 | PFN,Protein Full Name,Protein full name 30 | GL,Gene Length,Gene length 31 | XC,Exon Count,Exon count 32 | GRP,Group,Relationships for this gene 33 | PREF,Preferred Symbol,Preferred symbol of the gene 34 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/genome.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to genome 3 | FILT,Filter,Limits the records 4 | ORGN,Organism,Organism 5 | PID,ProjectID,Project ID 6 | PRJA,Project Accession,Project Accession 7 | PRJT,Project Type,Project Type 8 | DFLN,Title,Genome short description 9 | DSCR,Genome description,Genome full description 10 | STAT,Status,Bioproject status 11 | AID,AssemblyID,Release ID of full assembly 12 | AACC,Assembly Accession,Accession of full assembly 13 | ANAM,Assembly Name,Name of full assembly 14 | GI,Replicon GI,Replicon GI 15 | ACCN,Replicon accession,Replicon Accession 16 | RNAM,Replicon name,Replicon Name 17 | PACC,Protein Accession,Protein Accession 18 | PROT,Protein 
Name,Protein Name 19 | PGI,Protein GI,protein GI 20 | GNID,GeneID,GeneID 21 | GENE,Gene Name,Gene Name 22 | LTAG,Locus Tag,Locus Tag 23 | WGSP,WGS prefix,WGS Prefix 24 | PMID,PubMed ID,Unique identifier from PubMed 25 | BIOP,biological properties,Biological Properties 26 | PCID,ProtClust ID,Protein Clusters ID 27 | PROP,Properties,Project/Organism Properties 28 | CDT,Create Date,Create Date 29 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/genomeprj.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neilfws/utils4bioinformatics/d689dbac6a2e6959ab369ec76596964d108c599e/ncbi/entrez_db_terms/data/genomeprj.txt -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/geoprofiles.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | ORGN,Organism,Exploded organism names 5 | ACCN,GEO Accession,Accession for GDS (DataSet), GPL (Platform), GSM (Sample), GSE (Series) 6 | GDST,GDS Text,GDS text from title and description 7 | GEOT,GEO Description/Title Text,Sample titles 8 | RTYP,Platform Reporter Type,Platform reporter type, e.g. genbank, clone, orf 9 | GTYP,DataSet Type,Type of dataset 10 | VTYP,Sample Value Type,Sample value type, e.g. log ratio, count 11 | NSAM,Number of Samples,Number of samples 12 | SRC,Sample Source,Sample source 13 | ID,ID_REF,Spot ID from GEO Platform, SAGE tag, Affy ProbeSet ID 14 | NAME,Reporter Identifier,Name or identifier for the spot, e.g. GenBank accession, CLONE_ID, ORF etc. 15 | SYMB,Gene Symbol,Gene symbol (name) from Entrez-Gene or Entrez-UniGene. 
16 | GDSC,Gene Description,Gene Description 17 | RSTD,Ranked Standard Deviation,Ranked standard deviation 18 | RMAX,Max Value Rank,Maximal value of ranks 19 | RMIN,Min Value Rank,Minimal value of ranks 20 | FINF,Flag Information,Indicates an interesting or notable uid in the GDS context 21 | FTYP,Flag Type,Type of flag that indicates a uid of interest, or outliers etc. 22 | GI,GI,GenBank Identifier 23 | ATYP,Annotation Type,Type of annotation (gene, unigene, nucleotide) 24 | GO,Gene Ontology,Gene Ontology 25 | CHR,Chromosome,Chromosomes 26 | CPOS,Base Position,Chromosome base position 27 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/gtr.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | WORD,Text Word,Free text associated with record 5 | WRD1,Text Word 1,Free text associated with record with higher score then regular TEXT field 6 | PROP,Properties,Properties of this record 7 | ORG,Organization names,Lab or Clinic name including institution and department 8 | OID,Organization UID,Unique number for this lab/clinic 9 | CITY,Organization City,Lab or clinic city 10 | STATE,Organization State,Lab or clinic State or province 11 | COUNTRY,Organization Country,Lab or clinic Country 12 | POSTCODE,Organization postcode,Lab or clinic zip or postcode 13 | DIRECTOR,Organization Director(s),Lab or clinic director(s) 14 | STAFF,Organization Staff,Lab or clinic Staff name 15 | LS,Lab Service name,Lab Service name 16 | GTRACC,Accession for GTR test,Accession for GTR test 17 | MDAT,Modification Date,The last date on which the record was updated 18 | NAME,name for this test,name for this test 19 | ALT,Alternate name,alternate short and full names for test 20 | TESTDIS,disease name for this test,preferred name of disease by lab 21 | 
SPECIMEN,Specimen options for test,Specimen options for test 22 | TITL,title of this clinvar assertion,title of this clinvar assertion 23 | CVACC,Accession for clinvar assertion,Accession for clinvar assertion 24 | DCUI,Disease BioConcepts concept id,Concept identifier from BioConcepts for a disease 25 | DISNAME,Name of Disease,preferred full name 26 | GCUI,Gene BioConcepts concept id,Concept identifier from BioConcepts for a gene 27 | GENEID,UID for a record from Gene,Unique number for this record 28 | SYMB,Gene Symbol,Symbol or symbols of the gene 29 | GENENAME,Name of Gene,preferred full name 30 | GENEMIM,MIM number for the Gene,MIM number for the Gene 31 | PROTNAME,Name of Protein,preferred full name 32 | MTOD,Name of Method,method name 33 | MCAT,name for method category,name for method category 34 | TCAT,name for method top category,name for method top category 35 | LCRT,laboratory certification,laboratory certification 36 | CID,Country ISO code,Country ISO code 37 | SID,State ISO code,State ISO code 38 | TCID,Top Method Category ID,Top Method Category ID 39 | CTID,Method Category ID,Method Category ID 40 | MTID,Test Method ID,Test Method ID 41 | DID,Disease BioConcepts Entrez ID,Disease BioConcepts Entrez ID 42 | ORGN,Organism,scientific and common names of organism 43 | TARPOP,Target Population option for the test,Target Population option for the test 44 | CLNUTL,Clinical Utility option for the test,Clinical Utility option for the test 45 | CLNVAL,Clinical Validity option for the test,Clinical Validity option for the test 46 | GENENUM,Number of genes,Number of genes explicitly listed as being targeted in a test 47 | pharma,Pharmacogenetic response condition,Names of conditions that are identified as being pharmacogenetic responses 48 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/homologene.txt: -------------------------------------------------------------------------------- 1 | ALL,All 
Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | TITL,Title,Words in title of publication 5 | WORD,Text Word,Free text associated with record 6 | PROP,Properties,Properties (formerly Keyword) 7 | ORGN,Organism,scientific and common names of organism 8 | GNID,Gene ID,Gene ID 9 | GENE,Gene Name,Gene Name 10 | GDSC,Gene Description,Description of gene 11 | PUID,Protein UID,protein uids 12 | PRAC,Protein Accession,protein accessions 13 | NUID,Nucleotide UID,nucleotide uids of sequences 14 | NCAC,Nucleotide Accession,nucleotide accessions of seqeunces 15 | UGID,UniGene ID,UniGene ID 16 | ANCS,Ancestor,scientific and common names of ancestor organism 17 | DOM,Domain Name,Domain Name 18 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/journals.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | TITL,Title,Document title 5 | ESSN,eISSN,eISSN 6 | PSSN,pISSN,pISSN 7 | ISSN,ISSN,ISSN 8 | MABR,Title Abbreviation,Title Abbreviation 9 | ISOA,ISO Abbreviation,ISO Abbreviation 10 | MULT,Multi,Multi 11 | ID,NLM ID,NlmId 12 | ALIA,Alias,Alias 13 | ST,Subject Terms,Subject Terms 14 | WORD,Text Word,Text Word 15 | SYR,Start Year,First year of publication 16 | EYR,End Year,Last year of publication 17 | LANG,Language,Language the title is published in 18 | CNTY,Place of Publication,Place of publication 19 | XS,Indexing Subset,Indexing Subset 20 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/medgen.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to each record 3 | 
FILT,Filter,Limits the records 4 | ACCN,Accession,concept identifier for the record 5 | TITL,Title,Full name of the concept from the top-ranked vocabulary 6 | PROP,Properties,Classification by source qualifiers and molecule type 7 | MDAT,Modification Date,Date of last update 8 | WORD,Text Word,Free text associated with record 9 | DEFN,Definition,Text from the definition of the concept 10 | VOCB,Vocabulary,Classification by source qualifiers and molecule type 11 | XTIT,ExactTitle,Exact Title 12 | CODE,Source ID,Any identifier used by any vocabulary 13 | REFR,Reference,Authors and titles of citations 14 | CHR,Chromosome,Chromosome number; also 'mitochondrial', 'unknown' properties 15 | CLIN,Clinical Features,Clinical features of disorder integrated from OMIM and Human Phenotype Ontology (HPO) 16 | GENE,Gene Name,Name of gene associated with record 17 | CPOS,Base Position,Chromosome base position 18 | OID,MIM ID,Unique number assigned to OMIM record 19 | GFN,Gene Full Name,Gene full name 20 | KWD,Keyword,Keyword relevant to the concept 21 | MINH,Mode Of Inheritance,Mode of inheritance 22 | SNM,SNOMED CT CUI,SNOMED Concept ID 23 | GTIT,Guideline title,Guideline title 24 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/mesh.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | TN,Tree Number,Tree Number 5 | MESH,MeSH Terms,MeSH Terms 6 | SUBS,Substance Name,Substance Name 7 | WORD,Text Word,Free text 8 | ALSO,See Also,See Also 9 | PREV,Previous Indexing,Previous Indexing 10 | NOTE,Scope Note,Scope Note 11 | REG,Registry Number,Registry Number 12 | MULT,Multi,Multi 13 | TYPE,Record Type,Record type - main heading, subheading, pharmacological action, substance name, publication type 14 | 
-------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/ncbisearch.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | TITL,Title,Document title 5 | KYWD,Keyword,Keyword from meta tag 6 | DESC,Description,Description from meta tag 7 | WORD,Text Word,Free text from page 8 | CAT,Category,Category assigned to page 9 | MDAT,Modification Date,Date of last update 10 | HOME,Homepage Title,Home page title 11 | HURL,Homepage URL,Home page URL 12 | URL,URL,URL of page 13 | MNAM,META Name,META Name 14 | MCON,META Content,META Content 15 | AUTH,Author,Author(s) of web page 16 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/nlmcatalog.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | ITAG,Abstract/Index Tags,Abstract/Index Tags 5 | AUTH,Author,Author(s) of publication 6 | FULL,Author Full Name,Full Names of Authors 7 | CALL,Call Number,Call Number 8 | CNAM,Corporate/Conference Name,Corporate/Conference Name 9 | ITEM,Item Type,Item Type 10 | JOUR,Journal,Journal 11 | LANG,Language,Language of publication 12 | RTYP,Resource Type,Resource Type 13 | MESH,MeSH Terms,Medical Subject Headings assigned to publication 14 | MAJR,MeSH Major Topic,MeSH terms of major importance to publication 15 | SUBH,MeSH Subheading,Additional specificity for MeSH term 16 | OLIO,Olio,Olio 17 | OTHR,Other Number,Other Number 18 | OTRM,Other Term,Other terms associated with publication 19 | PERS,Personal Name as Subject,Personal Name as Subject 20 | FPER,Personal Full Name as Subject,Full Personal Name as Subject 21 | CNTY,Country of 
Publication,Country of publication 22 | PDAT,Publication Year,Year of publication 23 | PSTA,Publication Status,Status of publication 24 | PTYP,Publication Type,Type of publication (e.g., review) 25 | PUBL,Publisher,Publisher 26 | SERI,Series,Series 27 | TITL,Title,Words in title of publication 28 | URL,URL,URL 29 | NID,NLM Unique ID,NLM Unique ID 30 | EDAT,Entrez Date,Date publication first accessible through Entrez 31 | AI,Authority Information,Authority Information 32 | AIID,AIID,Authority ID 33 | WORD,Text Word,Text 34 | ST,Broad Subject Term,Broad Subject Term 35 | CFS,Current Format Status,Current Format Status 36 | XS,Indexing Subset,Indexing Subset 37 | ISO,ISO Abbreviation,ISO Abbreviation 38 | SYR,Publication Start Year,Publication Start Year 39 | EYR,Publication End Year,Publication End Year 40 | IS,ISSN,ISSN 41 | MABR,NLM Title Abbreviation,NLM Title Abbreviation 42 | MULT,Multi,Multi 43 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/nuccore.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to each sequence 3 | FILT,Filter,Limits the records 4 | WORD,Text Word,Free text associated with record 5 | TITL,Title,Words in definition line 6 | KYWD,Keyword,Nonstandardized terms provided by submitter 7 | AUTH,Author,Author(s) of publication 8 | JOUR,Journal,Journal abbreviation of publication 9 | VOL,Volume,Volume number of publication 10 | ISS,Issue,Issue number of publication 11 | PAGE,Page Number,Page number(s) of publication 12 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy 13 | ACCN,Accession,Accession number of sequence 14 | PACC,Primary Accession,Does not include retired secondary accessions 15 | GENE,Gene Name,Name of gene associated with sequence 16 | PROT,Protein Name,Name of protein associated with sequence 17 | ECNO,EC/RN 
Number,EC number for enzyme or CAS registry number 18 | PDAT,Publication Date,Date sequence added to GenBank 19 | MDAT,Modification Date,Date of last update 20 | SUBS,Substance Name,CAS chemical name or MEDLINE Substance Name 21 | PROP,Properties,Classification by source qualifiers and molecule type 22 | SQID,SeqID String,String identifier for sequence 23 | GPRJ,BioProject,BioProject 24 | SLEN,Sequence Length,Length of sequence 25 | FKEY,Feature key,Feature annotated on sequence 26 | PORG,Primary Organism,Scientific and common names of primary organism, and all higher levels of taxonomy 27 | COMP,Component Accession,Component accessions for an assembly 28 | ASSM,Assembly,Assembly 29 | DIV,Division,Division 30 | STRN,Strain,Strain 31 | ISOL,Isolate,Isolate 32 | CULT,Cultivar,Cultivar 33 | BRD,Breed,Breed 34 | BIOS,BioSample,BioSample 35 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/nucest.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to each sequence 3 | FILT,Filter,Limits the records 4 | WORD,Text Word,Free text associated with record 5 | TITL,Title,Words in definition line 6 | KYWD,Keyword,Nonstandardized terms provided by submitter 7 | AUTH,Author,Author(s) of publication 8 | JOUR,Journal,Journal abbreviation of publication 9 | VOL,Volume,Volume number of publication 10 | ISS,Issue,Issue number of publication 11 | PAGE,Page Number,Page number(s) of publication 12 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy 13 | ACCN,Accession,Accession number of sequence 14 | PACC,Primary Accession,Does not include retired secondary accessions 15 | GENE,Gene Name,Name of gene associated with sequence 16 | PROT,Protein Name,Name of protein associated with sequence 17 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number 18 | 
PDAT,Publication Date,Date sequence added to GenBank 19 | MDAT,Modification Date,Date of last update 20 | SUBS,Substance Name,CAS chemical name or MEDLINE Substance Name 21 | PROP,Properties,Classification by source qualifiers and molecule type 22 | SQID,SeqID String,String identifier for sequence 23 | GPRJ,BioProject,BioProject 24 | SLEN,Sequence Length,Length of sequence 25 | FKEY,Feature key,Feature annotated on sequence 26 | ID,EST id,EST id in est table 27 | NAME,EST Name,EST uid in est table 28 | CLON,Clone ID,clone id 29 | LIB,Library Name,Library Name 30 | SUBM,Submitter Name,Submitter Name 31 | CIT,Citation Title,Citation Title Publication 32 | STRN,Strain,Strain 33 | ISOL,Isolate,Isolate 34 | CULT,Cultivar,Cultivar 35 | BRD,Breed,Breed 36 | BIOS,BioSample,BioSample 37 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/nucgss.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to each sequence 3 | FILT,Filter,Limits the records 4 | WORD,Text Word,Free text associated with record 5 | TITL,Title,Words in definition line 6 | KYWD,Keyword,Nonstandardized terms provided by submitter 7 | AUTH,Author,Author(s) of publication 8 | JOUR,Journal,Journal abbreviation of publication 9 | VOL,Volume,Volume number of publication 10 | ISS,Issue,Issue number of publication 11 | PAGE,Page Number,Page number(s) of publication 12 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy 13 | ACCN,Accession,Accession number of sequence 14 | PACC,Primary Accession,Does not include retired secondary accessions 15 | GENE,Gene Name,Name of gene associated with sequence 16 | PROT,Protein Name,Name of protein associated with sequence 17 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number 18 | PDAT,Publication Date,Date sequence added to GenBank 19 | 
MDAT,Modification Date,Date of last update 20 | SUBS,Substance Name,CAS chemical name or MEDLINE Substance Name 21 | PROP,Properties,Classification by source qualifiers and molecule type 22 | SQID,SeqID String,String identifier for sequence 23 | GPRJ,BioProject,BioProject 24 | SLEN,Sequence Length,Length of sequence 25 | FKEY,Feature key,Feature annotated on sequence 26 | ID,GSS id,GSS id in gss table 27 | NAME,GSS Name,GSS uid in gss table 28 | CLON,Clone ID,clone id 29 | LIB,Library Name,Library Name 30 | SUBM,Submitter Name,Submitter Name 31 | CIT,Citation Title,Citation Title Publication 32 | LC,Library Class,Library Class 33 | STRN,Strain,Strain 34 | ISOL,Isolate,Isolate 35 | CULT,Cultivar,Cultivar 36 | BRD,Breed,Breed 37 | BIOS,BioSample,BioSample 38 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/nucleotide.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to each sequence 3 | FILT,Filter,Limits the records 4 | WORD,Text Word,Free text associated with record 5 | TITL,Title,Words in definition line 6 | KYWD,Keyword,Nonstandardized terms provided by submitter 7 | AUTH,Author,Author(s) of publication 8 | JOUR,Journal,Journal abbreviation of publication 9 | VOL,Volume,Volume number of publication 10 | ISS,Issue,Issue number of publication 11 | PAGE,Page Number,Page number(s) of publication 12 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy 13 | ACCN,Accession,Accession number of sequence 14 | PACC,Primary Accession,Does not include retired secondary accessions 15 | GENE,Gene Name,Name of gene associated with sequence 16 | PROT,Protein Name,Name of protein associated with sequence 17 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number 18 | PDAT,Publication Date,Date sequence added to GenBank 19 | MDAT,Modification Date,Date 
of last update 20 | SUBS,Substance Name,CAS chemical name or MEDLINE Substance Name 21 | PROP,Properties,Classification by source qualifiers and molecule type 22 | SQID,SeqID String,String identifier for sequence 23 | GPRJ,BioProject,BioProject 24 | SLEN,Sequence Length,Length of sequence 25 | FKEY,Feature key,Feature annotated on sequence 26 | PORG,Primary Organism,Scientific and common names of primary organism, and all higher levels of taxonomy 27 | COMP,Component Accession,Component accessions for an assembly 28 | ASSM,Assembly,Assembly 29 | DIV,Division,Division 30 | STRN,Strain,Strain 31 | ISOL,Isolate,Isolate 32 | CULT,Cultivar,Cultivar 33 | BRD,Breed,Breed 34 | BIOS,BioSample,BioSample 35 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/omim.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,MIM ID,Unique number assigned to OMIM record 3 | FILT,Filter,Limits the records 4 | TITL,Title,Words in title of record 5 | WORD,Text Word,Free text associated with record 6 | AUTH,Contributor,Contributor to OMIM record 7 | CLIN,Clinical Synopsis,Clinical features of disorder 8 | MDAT,Modification Date,The last date on which the record was updated 9 | ALVR,Allelic Variant,A subset of disease-producing mutations 10 | MDHS,Modification History,All dates on which the record was updated 11 | REFR,Reference,Authors and titles of citations 12 | GMAP,Gene Map,Chromosomal map location 13 | DSDR,Gene Map Disorder,Text word in disorder 14 | GENE,Gene Name,Name of gene associated with record 15 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number 16 | CHR,Chromosome,Chromosome number; also 'mitochondrial', 'unknown' properties 17 | EDTR,Editor,A username of an OMIM record Editor 18 | PROP,Properties,Properties of OMIM record 19 | PDAT,Publication Date,The date on which this record first appeared 20 | CSK,Clinical 
Synopsis Key,The keyword designating a part of the Clinical Synopsis 21 | CSED,Clinical Synopsis Editor,A username of an OMIM record Editor 22 | CSDT,Clinical Synopsis Date,The last date on which the Clinical Synopsis was updated 23 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/orgtrack.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | WORD,Text Word,Free text associated with record 5 | WRD1,Text Word 1,Free text associated with record with higher score then regular TEXT field 6 | PROP,Properties,Properties of this record 7 | TITL,Organization name,Lab or Clinic name including institution and department 8 | CITY,Organization City,City in which the organization is located 9 | ST,Organization State or province,State, province or other political subdivision in which the organization is located 10 | CTRY,Organization Country,Country in which the organization is located 11 | LOC,Organization Location,City, State, Country in which the organization is located 12 | PCOD,Organization postal code,Postal code for the organization 13 | DIR,Organization Director(s),Full names of director(s) with credentials 14 | STFF,Organization Staff,full names of non-director staff with credentials 15 | LS,Lab Service name,Lab Service name 16 | TYPE,Type of organization,category of an organization, e.g. 
laboratory, clinic, LSDB 17 | MDAT,Modification Date,The last date on which the record was updated 18 | TNO,Number of tests offered,Number of tests offered by this organization 19 | SNO,SERVICE NUMBER,Unique identifier for the state or province 20 | MTOD,Name of Method,method name 21 | MCAT,name for method category,name for method category 22 | TCAT,name for method top category,name for method top category 23 | LCRT,laboratory certification,laboratory certification 24 | CID,Country ISO code,Country ISO code 25 | SID,State ISO code,State ISO code 26 | TCID,Top Method Category ID,Top Method Category ID 27 | CTID,Method Category ID,Method Category ID 28 | DTCT,Disease and Top Method Category ID,Disease and Top Method Category ID 29 | DCAT,Disease and Method Category ID,Disease and Method Category ID 30 | MTID,Test Method ID,Test Method ID 31 | DID,Disease BioConcepts Entrez ID,Disease BioConcepts Entrez ID 32 | LSID,Lab Service ID,Lab Service ID 33 | DCUI,Disease BioConcepts concept id,Concept identifier from BioConcepts for a disease 34 | ORGN,Organism,scientific and common names of organism 35 | DASV,Disease and Additional Service ID,Disease and Additional Service ID 36 | ASID,Additional Service ID,Additional Service ID 37 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/pcassay.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,Assay ID,Assay ID 3 | FILT,Filter,Limits the records 4 | ANAM,Assay Name,AssayName 5 | ADES,Assay Description,AssayDescription 6 | APRL,Assay Protocol,AssayProtocol 7 | CCMT,Categorized Comment,CategorizedComment 8 | ACMT,Assay Comment,AssayComment 9 | TNAM,Tid Name,TidName 10 | TDES,Tid Description,TidDescription 11 | RC,Readout Count,ReadoutCount 12 | SRID,Substance Source ID,External substance source identifier 13 | ASRD,Assay Source ID,External assay source identifier 14 | 
ACMD,Activity Outcome Method,ActivityOutcomeMethod 15 | SNME,Source Name,SourceName 16 | CSNM,Current Source Name,CurrentSourceName 17 | DDAT,Deposit Date,DepositDate 18 | MDAT,Modify Date,ModifyDate 19 | JDAT,Journal Publication Date,JournalPublicationDate 20 | JNAM,Journal Name,JournalName 21 | HDAT,Hold Until Date,HoldUntilDate 22 | AC,Active Sid Count,ActiveSidCount 23 | IAC,Inactive Sid Count,InactiveSidCount 24 | IC,Inconclusive Sid Count,InconclusiveSidCount 25 | TSC,Total Sid Count,TotalSidCount 26 | TCNT,Target Count,TargetCount 27 | ACC,Active Cid Count,ActiveCidCount 28 | PCC,Probe Cid Count,ProbeCidCount 29 | PSC,Probe Sid Count,ProbeSidCount 30 | IACC,Inactive Cid Count,InactiveCidCount 31 | ICC,Inconclusive Cid Count,InconclusiveCidCount 32 | UCC,Unspecified Cid Count,UnspecifiedCidCount 33 | USC,Unspecified Sid Count,UnspecifiedSidCount 34 | TCC,Total Cid Count,TotalCidCount 35 | NARD,Nucleic Acid Reagent ID,NucleicAcidReagentID 36 | XRCT,XRef Comment,XRefComment 37 | XRPD,XRef Pmid,XRefPmid 38 | XRGI,XRef Gi,XRefGi 39 | XRMB,XRef Mmdb,XRefMmdb 40 | XRGN,XRef Gene,XRefGeneID 41 | XRDL,XRef Dburl,XRefDburl 42 | XRSL,XRef Sburl,XRefSburl 43 | XRAL,XRef Asurl,XRefAsurl 44 | XRPI,XRef Proteingi,XRefProteingi 45 | XRNI,XRef Nucleotidegi,XRefNucleotidegi 46 | XRTY,XRef Taxonomy,XRefTaxonomy 47 | XRAD,XRef Aid,XRefAid 48 | XRMM,XRef Omim,XRefOmim 49 | SIDA,Substance ID Active,SubstanceIDActive 50 | SIDT,Substance ID Tested,SubstanceIDTested 51 | CIDA,Compound ID Active,CompoundIDActive 52 | CIDT,Compound ID Tested,CompoundIDTested 53 | MHDA,MeSH Description Active,MeSHDescriptionActive 54 | MHDT,MeSH Description Tested,MeSHDescriptionTested 55 | MHTA,MeSH Term Active,MeSHTermActive 56 | MHTT,MeSH Term Tested,MeSHTermTested 57 | PTN,Protein Target Name,ProteinTargetName 58 | PSFM,Protein SubFamily,Protein SubFamily 59 | GSYM,Gene Symbol,GeneSymbol 60 | PTC,Protein Target Comment,ProteinTargetComment 61 | PTD,Protein Target 
Description,ProteinTargetDescription 62 | BST,Bio Systems Title,The name of the BioSystems, via Protein target 63 | CCT,Categorized Comment Title,The name of the categorized comment 64 | PTGI,Protein Target GI,ProteinTargetGI 65 | BSID,BioSystems ID,ID of the BioSystems, via Protein Target 66 | PIGI,Pig GI,PigGI 67 | RTGI,RNA Target GI,RNATargetGI 68 | SYNA,Synonym Active,SynonymActive 69 | SYNT,Synonym Tested,SynonymTested 70 | PHAA,Pharm Action Active,PharmActionActive 71 | PHAT,Pharm Action Tested,PharmActionTested 72 | SRCC,Source Category,SourceCategory 73 | TXNM,Taxonomy Name,TaxonomyName 74 | TC,Tested Concentration,TestedConcentration 75 | GRN,Grant Number,GrantNumber 76 | NSAM,Number of Sids With Activity Concentration micromolar,NumberofSidsWithActivityConcmicromolar 77 | NCAM,Number of Cids With Activity Concentration micromolar,NumberofCidsWithActivityConcmicromolar 78 | NSAN,Number of Sids With Activity Concentration nanomolar,NumberofSidsWithActivityConcnanomolar 79 | NCAN,Number of Cids With Activity Concentration nanomolar,NumberofCidsWithActivityConcnanomolar 80 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy 81 | APRJ,Assay Project,The name of the summary assay to which this assay is related by same project 82 | CELL,Cell Line,CellLine 83 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/pccompound.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,CompoundID,Compound ID 3 | FILT,Filter,Limits the records 4 | SRC,SourceName,Source Name 5 | SRID,SourceID,Source ID 6 | SRCC,SourceCategory,Source Category 7 | CDAT,CreateDate,Record Create Date 8 | BAID,BioAssayID,Assay ID 9 | AA,ActiveAid,Active AID 10 | IA,InactiveAid,Inactive AID 11 | INCH,InChI,InChI 12 | IKEY,InChIKey,InChI Key 13 | CSYN,CompleteSynonym,Complete Synonym 14 | SYNO,Synonym,Synonym 15 
| MSHT,MeSHTerm,MeSH Term 16 | CMST,CompleteMeSHTerm,Complete MeSH Term 17 | MSHN,MeSHTreeNode,MeSH Tree Node 18 | PHMA,PharmAction,Pharmacological Action 19 | MHD,MeSHDescription,MeSH Description 20 | ELMT,Element,Element 21 | MW,MolecularWeight,Molecular Weight 22 | TFC,TotalFormalCharge,Total Formal Charge 23 | UPAC,IUPACName,IUPAC Name 24 | XLGP,XLogP,XLogP 25 | CPLX,Complexity,Complexity 26 | RBC,RotatableBondCount,Rotatable Bond Count 27 | HBDC,HydrogenBondDonorCount,Hydrogen Bond Donor Count 28 | HBAC,HydrogenBondAcceptorCount,Hydrogen Bond Acceptor Count 29 | HAC,HeavyAtomCount,Heavy Atom Count 30 | ACC,AtomChiralCount,Atom Chiral Count 31 | ACDC,AtomChiralDefCount,Atom Chiral Defined Count 32 | ACUC,AtomChiralUndefCount,Atom Chiral Undefined Count 33 | BCC,BondChiralCount,Bond Chiral Count 34 | BCDC,BondChiralDefCount,Bond Chiral Defined Count 35 | BCUC,BondChiralUndefCount,Bond Chiral Undefined Count 36 | IAC,IsotopeAtomCount,Isotope Atom Count 37 | CUC,CovalentUnitCount,Covalent Unit Count 38 | TC,TautomerCount,Tautomer Count 39 | AC,ActiveAidCount,Active AID Count 40 | IC,InactiveAidCount,Inactive AID Count 41 | TAC,TotalAidCount,Total AID Count 42 | AAR,ActiveAidRatio,Active AID Ratio 43 | SID,SubstanceID,Substance ID 44 | TPSA,TPSA,TPSA 45 | ASRC,AssaySourceName,Assay Source Name 46 | EMAS,ExactMass,Exact Mass 47 | MMAS,MonoisotopicMass,Monoisotopic Mass 48 | ACON,ActiveConcentration,Active Concentration 49 | TCON,TestedConcentration,Tested Concentration 50 | VL3D,Volume3D,3D Volume 51 | X3D,XStericQuadrupole3D,3D X Steric Quadrupole 52 | Y3D,YStericQuadrupole3D,3D Y Steric Quadrupole 53 | Z3D,ZStericQuadrupole3D,3D Z StericQuadrupole 54 | PAID,PharmActionID,Pharmacological Action ID 55 | STID,StructureID,Depositor Associated Structure ID 56 | FC3D,FeatureCount3D,3D Feature Count 57 | FAC3,FeatureAcceptorCount3D,3D Feature Acceptor Count 58 | FDC3,FeatureDonorCount3D,3D Feature Donor Count 59 | FNC3,FeatureAnionCount3D,3D Feature Anion Count 60 | 
FTC3,FeatureCationCount3D,3D Feature Cation Count 61 | FRC3,FeatureRingCount3D,3D Feature Ring Count 62 | FHC3,FeatureHydrophobeCount3D,3D Feature Hydrophobe Count 63 | CMR3,ConformerModelRmsd3D,3D Conformer Model RMSD 64 | ERC3,EffectiveRotorCount3D,3D Effective Rotatable Bond Count 65 | DCSY,DepositorCompleteSynonym,Depositor Complete Synonym 66 | DSYN,DepositorSynonym,Depositor Synonym 67 | CCNT,ConformerCount3D,3D Conformer Count 68 | DCNT,DepositorCount,Depositor Count 69 | PTNT,Patent,Patent 70 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/pcsubstance.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,SubstanceID,Substance ID 3 | FILT,Filter,Limits the records 4 | SRC,SourceName,Source Name 5 | CSN,CurrentSourceName,Current Source Name 6 | SRID,SourceID,Source ID 7 | SRCC,SourceCategory,Source Category 8 | SRD,SourceReleaseDate,Source Release Date 9 | DDAT,DepositDate,Deposit Date 10 | MDAT,ModifyDate,Modification Date 11 | BAID,BioAssayID,Assay ID 12 | AA,ActiveAid,Active AID 13 | IA,InactiveAid,Inactive AID 14 | INCH,InChI,InChI 15 | IKEY,InChIKey,InChI Key 16 | CSYN,CompleteSynonym,Complete Synonym 17 | SYNO,Synonym,Synonym 18 | MSHT,MeSHTerm,MeSH Term 19 | CMST,CompleteMeSHTerm,Complete MeSH Term 20 | MSHN,MeSHTreeNode,MeSH Tree Node 21 | PHMA,PharmAction,Pharmacological Action 22 | CMT,Comment,Comment 23 | MHD,MeSHDescription,MeSH Description 24 | ELMT,Element,Element 25 | MW,MolecularWeight,Molecular Weight 26 | TFC,TotalFormalCharge,Total Formal Charge 27 | UPAC,IUPACName,IUPAC Name 28 | XLGP,XLogP,XLogP 29 | CPLX,Complexity,Complexity 30 | RBC,RotatableBondCount,Rotatable Bond Count 31 | HBDC,HydrogenBondDonorCount,Hydrogen Bond Donor Count 32 | HBAC,HydrogenBondAcceptorCount,Hydrogen Bond Acceptor Count 33 | HAC,HeavyAtomCount,Heavy Atom Count 34 | ACC,AtomChiralCount,Atom Chiral Count 35 | 
ACDC,AtomChiralDefCount,Atom Chiral Defined Count 36 | ACUC,AtomChiralUndefCount,Atom Chiral Undefined Count 37 | BCC,BondChiralCount,Bond Chiral Count 38 | BCDC,BondChiralDefCount,Bond Chiral Defined Count 39 | BCUC,BondChiralUndefCount,Bond Chiral Undefined Count 40 | IAC,IsotopeAtomCount,Isotope Atom Count 41 | CUC,CovalentUnitCount,Covalent Unit Count 42 | TC,TautomerCount,Tautomer Count 43 | AC,ActiveAidCount,Active AID Count 44 | IC,InactiveAidCount,Inactive AID Count 45 | TAC,TotalAidCount,Total AID Count 46 | AAR,ActiveAidRatio,Active AID Ratio 47 | SCID,StandardizedCID,Standardized CID 48 | CCID,ComponentCID,Component CID 49 | CID,CompoundID,Compound ID 50 | TPSA,TPSA,TPSA 51 | ASRC,AssaySourceName,Assay Source Name 52 | EMAS,ExactMass,Exact Mass 53 | MMAS,MonoisotopicMass,Monoisotopic Mass 54 | ACON,ActiveConcentration,Active Concentration 55 | TCON,TestedConcentration,Tested Concentration 56 | PAID,PharmActionID,Pharmacological Action ID 57 | STID,StructureID,Structure ID 58 | HUD,HoldUntilDate,Hold Until Date 59 | PTNT,Patent,Patent 60 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/pmc.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | PMID,PubMed ID,Unique identifier from PubMed 5 | AUTH,Author,Author(s) of publication 6 | TITL,Title,A short descriptive name 7 | PDAT,Publication Date,Date of publication 8 | ABST,Abstract,Abstract 9 | CAPT,Figure/Table Caption,Caption 10 | SECT,Section Title,Section Title 11 | REFR,Reference,Reference 12 | REFA,Reference Author,Name of Reference author(s) 13 | RPID,Reference PubMed ID,Reference Unique identifier from PubMed 14 | JOUR,Journal,Source journal of publication 15 | VOL,Volume,Volume number of publication 16 | ISS,Issue,Issue number of publication 17 | 
PAGE,Pagination,Page number(s) of publication 18 | EPDT,Electronic Publication Date,Date publication first accessible through Entrez 19 | WORD,Text Word,Free text associated with publication 20 | ARTI,Body - All Words,Article Body 21 | KWD,Body - Key Terms,Keyword 22 | METH,Methods - Key Terms,Keyword in the Methods Section 23 | MESH,MeSH Terms,Medical Subject Headings assigned to publication 24 | MAJR,MeSH Major Topic,Medical terms of major importance assigned to publication 25 | SUBH,MeSH Subheading,Additional specificity for MeSH Terms 26 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number 27 | SUBS,Supplementary Concept,CAS chemical name or MEDLINE Substance Name 28 | AFFL,Affiliation,Author's institutional affiliation and address 29 | LDAT,PMC Live Date,PMC live date 30 | ORGN,Organism,scientific and common names of organism 31 | ONSN,Organism unsynonymized,unsynonymized organism names 32 | ACCN,Accession,Accession number of sequence 33 | EDAT,Entrez Date,Entrez date 34 | DOI,DOI,Digital Object Identifier 35 | FULL,Full Author Name,Full Author Name(s) of publication 36 | GRNT,Grant Number,NIH Grant Numbers 37 | ACK,Acknowledgments,Acknowledgments 38 | PPDT,Print Publication Date,Print Publication Date 39 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/popset.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to each sequence 3 | FILT,Filter,Limits the records 4 | WORD,Text Word,Free text associated with record 5 | TITL,Title,Words in definition line 6 | KYWD,Keyword,Nonstandardized terms provided by submitter 7 | AUTH,Author,Author(s) of publication 8 | JOUR,Journal,Journal abbreviation of publication 9 | VOL,Volume,Volume number of publication 10 | ISS,Issue,Issue number of publication 11 | PAGE,Page Number,Page number(s) of publication 12 | 
ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy 13 | ACCN,Accession,Accession number of sequence 14 | PACC,Primary Accession,Does not include retired secondary accessions 15 | GENE,Gene Name,Name of gene associated with sequence 16 | PROT,Protein Name,Name of protein associated with sequence 17 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number 18 | PDAT,Publication Date,Date sequence added to GenBank 19 | MDAT,Modification Date,Date of last update 20 | SUBS,Substance Name,CAS chemical name or MEDLINE Substance Name 21 | PROP,Properties,Classification by source qualifiers and molecule type 22 | SQID,SeqID String,String identifier for sequence 23 | GPRJ,BioProject,BioProject 24 | FKEY,Feature key,Feature annotated on sequence 25 | PCNT,Protein Count,Number of proteins in the set 26 | NCNT,Nucleotide Count,Number of nucleotides in the set 27 | STRN,Strain,Strain 28 | ISOL,Isolate,Isolate 29 | CULT,Cultivar,Cultivar 30 | BRD,Breed,Breed 31 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/probe.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | AUTH,Author,Author 5 | DIST,Distributor,Distributor 6 | GENE,Gene Name,Gene Name 7 | GNID,Gene ID,Gene ID 8 | KYWD,Application,Application 9 | ORGN,Organism,Organism 10 | PRNM,Probe Name,Probe Name 11 | PRTY,Probe Type,Probe Type 12 | PROP,Properties,Properties 13 | WORD,Text Word,Text Word 14 | TITL,Title,Title 15 | CAPT,Caption,Caption 16 | SUBM,Submission,Submission 17 | COLL,Platform Name,Platform Name 18 | SEQ,Sequence,Sequence 19 | ACCN,Sequence accession,Semicolon delimited sequence accession(s) 20 | PMID,Pubmed ID,Pubmed ID 21 | MDAT,Modification Date,Date of the last update of the submission 22 | PDAT,Publication Date,Date 
sequence added to GenBank 23 | VAL,Validation,Validation 24 | PLID,Platform ID,Platform ID 25 | USTS,UniSTS ID,Legacy UniSTS ID 26 | PSET,Probeset UID,Probeset(s) this probe belongs to 27 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/protein.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to each sequence 3 | FILT,Filter,Limits the records 4 | WORD,Text Word,Free text associated with record 5 | TITL,Title,Words in definition line 6 | KYWD,Keyword,Nonstandardized terms provided by submitter 7 | AUTH,Author,Author(s) of publication 8 | JOUR,Journal,Journal abbreviation of publication 9 | VOL,Volume,Volume number of publication 10 | ISS,Issue,Issue number of publication 11 | PAGE,Page Number,Page number(s) of publication 12 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy 13 | ACCN,Accession,Accession number of sequence 14 | PACC,Primary Accession,Does not include retired secondary accessions 15 | GENE,Gene Name,Name of gene associated with sequence 16 | PROT,Protein Name,Name of protein associated with sequence 17 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number 18 | PDAT,Publication Date,Date sequence added to GenBank 19 | MDAT,Modification Date,Date of last update 20 | SUBS,Substance Name,CAS chemical name or MEDLINE Substance Name 21 | PROP,Properties,Classification by source qualifiers and molecule type 22 | SQID,SeqID String,String identifier for sequence 23 | GPRJ,BioProject,BioProject 24 | SLEN,Sequence Length,Length of sequence 25 | MLWT,Molecular Weight,Molecular Weight 26 | FKEY,Feature key,Feature annotated on sequence 27 | PORG,Primary Organism,Scientific and common names of primary organism, and all higher levels of taxonomy 28 | ASSM,Assembly,Assembly 29 | DIV,Division,Division 30 | STRN,Strain,Strain 31 | 
ISOL,Isolate,Isolate 32 | CULT,Cultivar,Cultivar 33 | BRD,Breed,Breed 34 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/proteinclusters.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | ACCN,Accession,Accession 5 | PMID,PubMed ID,PubMed ID 6 | TITL,Title,Title 7 | GENE,Gene Name,Gene Name 8 | GSYN,Gene Synonym,Gene Synonym 9 | COG,COG,Clusters of Orthologous Groups of proteins 10 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number 11 | HMAP,HAMAP,HAMAP 12 | KO,KO,KO 13 | PROT,Protein Name,Name of protein 14 | PACC,Protein Accession,Protein Accession 15 | LTAG,Locus Tag,Locus Tag 16 | SLEN,Sequence Length,Length of sequence 17 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy 18 | TXID,Taxonomy ID,Taxonomy ID 19 | CDAT,Creation Date,Creation Date 20 | MDAT,Modification Date,Modification Date 21 | SIZE,Size,Size 22 | DOM,Domain Name,Domain Name 23 | DOMS,Domains,Domains 24 | PUID,Protein GI,Protein GI 25 | PARA,Paralogs,Paralogs 26 | COGG,COG group,COG group 27 | AVGL,Average Length,Average Length 28 | PROP,Properties,Properties 29 | TPUB,Total Publications,Total Publications 30 | SPCN,SwissProt Accession,SwissProt Accession 31 | CONS,Conserved In,Conserved In 32 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/pubmed.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | TITL,Title,Words in title of publication 5 | WORD,Text Word,Free text associated with publication 6 | MESH,MeSH Terms,Medical Subject Headings assigned to publication 7 | 
MAJR,MeSH Major Topic,MeSH terms of major importance to publication 8 | AUTH,Author,Author(s) of publication 9 | JOUR,Journal,Journal abbreviation of publication 10 | AFFL,Affiliation,Author's institutional affiliation and address 11 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number 12 | SUBS,Supplementary Concept,CAS chemical name or MEDLINE Substance Name 13 | PDAT,Date - Publication,Date of publication 14 | EDAT,Date - Entrez,Date publication first accessible through Entrez 15 | VOL,Volume,Volume number of publication 16 | PAGE,Pagination,Page number(s) of publication 17 | PTYP,Publication Type,Type of publication (e.g., review) 18 | LANG,Language,Language of publication 19 | ISS,Issue,Issue number of publication 20 | SUBH,MeSH Subheading,Additional specificity for MeSH term 21 | SI,Secondary Source ID,Cross-reference from publication to other databases 22 | MHDA,Date - MeSH,Date publication was indexed with MeSH terms 23 | TIAB,Title/Abstract,Free text associated with Abstract/Title 24 | OTRM,Other Term,Other terms associated with publication 25 | INVR,Investigator,Investigator 26 | COLN,Author - Corporate,Corporate Author of publication 27 | CNTY,Place of Publication,Country of publication 28 | PAPX,Pharmacological Action,MeSH pharmacological action pre-explosions 29 | GRNT,Grant Number,NIH Grant Numbers 30 | MDAT,Date - Modification,Date of last modification 31 | CDAT,Date - Completion,Date of completion 32 | PID,Publisher ID,Publisher ID 33 | FAUT,Author - First,First Author of publication 34 | FULL,Author - Full,Full Author Name(s) of publication 35 | FINV,Investigator - Full,Full name of investigator 36 | TT,Transliterated Title,Words in transliterated title of publication 37 | LAUT,Author - Last,Last Author of publication 38 | PPDT,Print Publication Date,Date of print publication 39 | EPDT,Electronic Publication Date,Date of Electronic publication 40 | LID,Location ID,ELocation ID 41 | CRDT,Date - Create,Date publication first accessible 
through Entrez 42 | BOOK,Book,ID of the book that contains the document 43 | ED,Editor,Section's Editor 44 | ISBN,ISBN,ISBN 45 | PUBN,Publisher,Publisher's name 46 | AUCL,Author Cluster ID,Author Cluster ID 47 | EID,Extended PMID,Extended PMID 48 | DSO,DSO,Additional text from the summary 49 | AUID,Author - Identifier,Author Identifier 50 | PS,Subject - Personal Name,Personal Name as Subject 51 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/pubmedhealth.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | TITL,Title,Article title 5 | KYPH,Keyphrase,High-scored field for exact-matched phrases 6 | STXT,Secondary Text,Special text word 7 | CONP,Concept Phrases,Generated keywords 8 | BOOK,Book,ID of the book that contains the document 9 | PID,ParentId,ID of the book 10 | PMID,PmId,PubMed ID 11 | RD,ReleaseDate,ReleaseDate 12 | SUB,Subject,Subject 13 | AID,AccessionID,Accession ID 14 | UMLS,UMLSID,UMLS Concept ID 15 | ICD9,ICD9ID,ICD-9 ID 16 | BCID,BioconceptsID,BioConcepts ID 17 | PDAT,Date of publication,Date of publication 18 | UDAT,Update Date,Content update date 19 | DR,DrugName,Drug brand name 20 | TYPE,Type,Document type (Book/Article/Chapter) 21 | CAT,Category,Category 22 | PUBL,Publisher,PMH Content Provider 23 | CLID,CollectionId,Collection Identifier 24 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/seqannot.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | ACCN,Accession,Accession number of sequence 5 | TITL,Title,Words in definition line 6 | PROP,Properties,Classification by source 
qualifiers and molecule type 7 | WORD,Text Word,Free text associated with record 8 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy 9 | AUTH,Author,Author(s) of publication 10 | PDAT,Publication Date,Date sequence added to GenBank 11 | MDAT,Modification Date,Date of last update 12 | ASSM,Target Assembly,Target Assembly 13 | ANNT,Annotation Type,Attribute 14 | VCTX,Viewer_Context,Viewer Context 15 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/snp.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | RS,Reference SNP ID,Clustered SNP ID (rs) 5 | CHR,Chromosome,chromosomes 6 | GENE,Gene Name,locus link symbol 7 | HAN,Submitter Handle,Submitter Handle 8 | ACCN,Accession,nucleotide accessions 9 | LLID,LocusLink ID,locus link UID 10 | ORGN,Organism,Organism 11 | FXN,Function Class,Function class 12 | GTYP,Genotype,Genotype info 13 | NREF,non reference assembly,SNP not mapped to reference assembly 14 | HETZ,Heterozygosity,Heterozygosity 15 | MPWT,Map Weight,Map weight 16 | VALI,Validation Status,Validation status 17 | SRAT,Success Rate,Success rate 18 | CBID,Create Build ID,Original Build ID 19 | UBID,Update Build ID,Update Build ID 20 | PDAT,Publication Date,SNP Publication date 21 | MDAT,Modification Date,SNP modification date 22 | PCLS,Population Class,Population classification based on geographic location 23 | MCLS,Method Class,Assay Method 24 | SS,Submitter SNP ID,Submitter ID 25 | SID,Local SNP ID,Local SNP ID 26 | VARI,Allele,Allele 27 | SCLS,SNP Class,SNP class 28 | GDSC,Gene Description,description of gene 29 | CPOS,Base Position,Chromosome base position 30 | GPOS,Contig Position,Contig base position 31 | WORD,Text Word,Free text associated with record 32 | WTAA,Reference Amino 
Acid,Reference Amino Acid 33 | MTAA,Variant Amino Acid,Variant or Mutant Amino Acid 34 | RSNP,Reference SNP,Reference SNP 35 | SIDX,SNP Index,SNP Index 36 | ALOR,SNP Allele Origin,Allele originated from somatic or germline 37 | SUSP,Suspected false variation,Variation suspected to be false based on evidence 38 | CLIN,Clinical Significance,Variations with clinical effects or significances 39 | GMAF,Global Minor Allele Frequency,Minor Allele Frequency derived from global population (ie. 1000G) 40 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/sra.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | ACCN,Accession,Accession number of sequence 5 | TITL,Title,Words in definition line 6 | PROP,Properties,Classification by source qualifiers and molecule type 7 | WORD,Text Word,Free text associated with record 8 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy 9 | AUTH,Author,Author(s) of publication 10 | PDAT,Publication Date,Date sequence added to GenBank 11 | MDAT,Modification Date,Date of last update 12 | GPRJ,BioProject,BioProject 13 | BSPL,BioSample,BioSample 14 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/structure.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,MMDB ID,mmdbId 3 | FILT,Filter,Limits the records 4 | ACCN,PDB Accession,PDB Accession 5 | ECNO,EC/RN Number,EC/RN Number 6 | RESO,Resolution,Resolution 7 | EXPM,Experimental Method,Experimental Method 8 | TITL,Title,Citation Title 9 | ABS,Abstract,The abstracts of all PubMed references that are linked to the structure 10 | AUTH,Author,Citation Author 11 | PCLA,PDB 
Class,Pdb Class 12 | PSRC,PDB Source,Pdb Source 13 | PDSC,PDB Description,PdbDescr 14 | PCOM,PDB Comment,PdbComment 15 | PDD,PDB Deposit Date,PDB Deposit Date 16 | DDAT,MMDB Entry Date,MMDB Deposit Date 17 | MDAT,MMDB Modify Date,MMDB Modify Date 18 | LCOD,PDB Chemical Code,chemical ligand codes taken from PDB file 19 | LNAM,Chemical Name,chemical ligand names taken from PDB file 20 | CSYN,Chemical Synonyms,Chemical synonyms taken from PubChem 21 | LDES,Chemical Description,Chemical description taken from PDB 22 | ORGN,Organism,Organism Name 23 | TXID,Taxonomy ID,Numerical taxonomy identifier 24 | PMC,BioUnit Protein Molecule Count,Count of Protein Molecules in BioUnit 25 | DMC,BioUnit DNA Molecule Count,Count of DNA molecules in BioUnit 26 | RMC,BioUnit RNA Molecule Count,Count of RNA molecules in BioUnit 27 | BPC,BioUnit Biopolymer Count,Count of Biopolymers in BioUnit 28 | LCOU,BioUnit Chemical Count,Count of Chemical Molecules in BioUnit 29 | OCOU,BioUnit Other Molecule Count,Count of Other Molecules in BioUnit 30 | JOUR,Journal,Source journal of structure 31 | CDID,Conserved Domain PSSMID,identifier for a conserved domain cluster 32 | CDSN,Conserved Domain Short Name,Short name of the domain of a conserved domain cluster 33 | CDDT,Conserved Domain Title,Title of the domain of a conserved domain cluster 34 | CDDF,Conserved Domain Description,Defline of the domain of a conserved domain cluster 35 | SFID,Conserved Domain Superfamily PSSMID,identifier for a superfamily domain cluster 36 | SPFN,Conserved Domain Superfamily Short Name,Short name of a superfamily of conserved domain clusters 37 | SPTL,Conserved Domain Superfamily Title,Title of a superfamily of conserved domain clusters 38 | SPDF,Conserved Domain Superfamily Description,Definition line of a superfamily cluster of conserved domain 39 | OS,Oligomeric State,Oligomeric state of the biological unit 40 | PNAM,Protein Name,Names of Protein Molecules 41 | GN,Gene Name,Names of genes associated with protein 
molecules 42 | GDSC,Gene Description,Descriptions of genes associated with protein molecules 43 | DNAM,DNA Name,Names of DNA Molecules 44 | RNAM,RNA Name,Names of RNA Molecules 45 | ONAM,Other Molecule Name,Names of Other Molecules 46 | APMC,ASU Protein Molecule Count,Count of Protein Molecules in ASU 47 | ADMC,ASU DNA Molecule Count,Count of DNA molecules in ASU 48 | ARMC,ASU RNA Molecule Count,Count of RNA molecules in ASU 49 | ABPC,ASU Biopolymer Count,Count of Biopolymers in ASU 50 | ALCT,ASU Chemical Count,Count of Chemicals in ASU 51 | AOCT,ASU Other Molecule Count,Count of other molecules in ASU 52 | MLWT,BioUnit Molecular Weight,Molecular Weight of the default Biological Assembly 53 | FC,Number of PDB Records per Structure,Number of PDB records that have been combined to define a complete biological assembly. (For most structures, one record suffices; very large structures were split by the PDB into multiple records.) 54 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/taxonomy.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,Taxonomy ID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | SCIN,Scientific Name,Scientific name of organism 5 | COMN,Common Name,Common name of organism 6 | TXSY,Synonym,Synonym of organism name 7 | ALLN,All Names,All aliases for organism 8 | NXLV,Next Level,Immediate parent in taxonomic hierarchy 9 | SBTR,Subtree,Any parent node in taxonomic hierarchy 10 | LNGE,Lineage,Lineage in taxonomic hierarchy 11 | GC,GC,Nuclear genetic code 12 | MGC,MGC,Mitochondrial genetic code 13 | PGC,PGC,Mitochondrial genetic code 14 | TXDV,Division,GenBank division 15 | RANK,Rank,Hierarchical position (e.g., order, genus) 16 | EDAT,Entrez Date,Date record first accessible through Entrez 17 | MDAT,Modification Date,Date of last update 18 | PROP,Properties,Property 
defined on particular node (e.g., terminal node) 19 | WORD,Text Word,Free text associated with record 20 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/toolkit.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | DID,Dox ID,This is the doxygen id for this entity 5 | DEF,Definition,Definition 6 | DEFT,Definition Type,Type of Definition 7 | SD,Short Description,Short Description 8 | LD,Long Description,Long Description 9 | FILE,File Name,File Name 10 | MODS,Modifiers,Modifiers 11 | ATTR,Attributes,Attributes 12 | LINE,Lines,Lines 13 | LNK,Link,Link 14 | NAM,Name,Name of item (be it class, method, etc.) 15 | DEFB,Defined by,The entity that defines this entity 16 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/toolkitall.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neilfws/utils4bioinformatics/d689dbac6a2e6959ab369ec76596964d108c599e/ncbi/entrez_db_terms/data/toolkitall.txt -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/toolkitbook.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | AUTH,Author,Section's author 5 | CA,Corporate Author,Corporate Author of publication 6 | FA,Full Author Name,Full Author Name(s) of publication 7 | FE,Full Editor Name,f 8 | TITL,Title,Section's title 9 | TYPE,Type,Section's type 10 | STXT,Secondary Text,Special text word 11 | CONP,Concept Phrases,Generated keywords 12 | BOOK,Book,ID of the book that contains the document 13 | 
RMID,RefPMID,Citation search by PmId 14 | RID,Rid,Book internal ID 15 | PUBN,Publisher,Publisher's Name 16 | PDAT,Publication Year,Publication Year 17 | ISBN,ISBN,ISBN 18 | ATTR,Attribute,Attributes in key value ordered pairs 19 | EDIT,Editor,Section's Editor 20 | RD,Release Date,Release Date 21 | SUB,Subject,Subject 22 | RT,Resource Type,Resource Type 23 | AID,Accession ID,Accession ID 24 | BACI,Book Accession ID,Book Accession ID 25 | CHID,Chapter Accession ID,Chapter Accession ID 26 | -------------------------------------------------------------------------------- /ncbi/entrez_db_terms/data/unigene.txt: -------------------------------------------------------------------------------- 1 | ALL,All Fields,All terms from all searchable fields 2 | UID,UID,Unique number assigned to publication 3 | FILT,Filter,Limits the records 4 | ORGN,Organism,scientific and common names of organism 5 | TITL,Title,title of cluster 6 | LIBR,Library,dbEST library names 7 | TISS,Tissue,tissue sources of libraries 8 | CLON,Clone ID,clone ids, with and without IMAGE 9 | NCAC,Nucleotide Accession,nucleotide accessions of seqeunces 10 | NUID,Nucleotide UID,nucleotide uids of sequences 11 | PRAC,Protein Accession,protein accessions 12 | PUID,Protein UID,protein uids 13 | PROP,Properties,various flags 14 | WORD,Text Word,titles of sequences, vectors 15 | CHR,Chromosome,chromosomes 16 | GENE,Gene Name,locus link symbol 17 | GDSC,Gene Description,description of gene 18 | LLID,Gene ID,gene id 19 | TXID,Taxonomy ID,taxonomy id 20 | ESTC,Est Count,number of ests per cluster 21 | MRNA,mRNA Count,number of mrna per cluster 22 | SEQC,Sequence Count,total number of sequences per cluster 23 | CID,Cluster ID,Cluster ID 24 | EXPR,Expression,library description of all member sequences 25 | REXP,Restricted Expression,library description of the majority of member sequences 26 | PRNK,Page Rank,Page Rank 27 | RTYP,Record Type,record type 28 | 
-------------------------------------------------------------------------------- /ncbi/taxonomy/README.md: -------------------------------------------------------------------------------- 1 | # Taxonomy 2 | 3 | Utilities for working with the NCBI Taxonomy database. 4 | 5 | ## Current contents 6 | 7 | 1. virus_hosts - get hosts for viruses given taxonomy ID 8 | -------------------------------------------------------------------------------- /ncbi/taxonomy/virus_hosts/README.md: -------------------------------------------------------------------------------- 1 | # virus_hosts 2 | 3 | [See this blog post](https://nsaunders.wordpress.com/2015/06/02/virus-hosts-from-ncbi-taxonomy-web-pages/). 4 | 5 | The code in code/ruby/virus2host.rb takes a taxonomy UID as input and returns the UID, rank, name and host (where present) for the virus. 6 | 7 | The file in data/virus_host.tsv was generated by downloading all virus UIDs from the taxonomy database to a file, then submitting each line to the Ruby script. 8 | 9 | # all virus UIDs at http://www.ncbi.nlm.nih.gov/taxonomy/?term=txid10239[Subtree] > uids.txt 10 | 11 | for line in $(cat uids.txt) 12 | do ruby virus2host.rb $line >> virus_host.tsv 13 | sleep 1 14 | done 15 | 16 | NOTE: the code scrapes HTML and will break if NCBI change the HTML in the future. 17 |
--------------------------------------------------------------------------------
/ncbi/taxonomy/virus_hosts/code/ruby/virus2host.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/ruby
2 |
3 | require 'nokogiri'
4 | require 'open-uri'
5 |
6 | # Fetch the NCBI Taxonomy Browser page for +uid+ and print one tab-separated
7 | # line to stdout: uid, rank, organism name, host.
8 | # A field that is not matched on the page is printed as an empty string.
9 | def get_host(uid)
10 |   url = "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&lvl=3&lin=f&keep=1&srchmode=1&unlock&id=" + uid.to_s
11 |   # URI.open rather than bare open: open-uri no longer patches Kernel#open
12 |   # for URLs on Ruby >= 3.0.
13 |   doc = Nokogiri::HTML.parse(URI.open(url).read)
14 |   # NOTE(review): in this text dump the split separator and the opening tag of
15 |   # the name pattern had been turned into line breaks; "<br>" and "<h2>" are
16 |   # restored here to pair with the surviving "<\/h2>" - confirm against the repo.
17 |   data = doc.xpath("//td").collect { |x| x.inner_html.split("<br>") }.flatten
18 |   orgn = ""
19 |   rank = ""
20 |   host = ""
21 |   # The last matching fragment wins for each field.
22 |   data.each do |e|
23 |     orgn = $1 if e =~ /<h2>(.*?)<\/h2>/
24 |     rank = $1 if e =~ /Rank:\s+<\/em>(.*?)$/
25 |     host = $1 if e =~ /Host:\s+<\/em>(.*?)$/
26 |   end
27 |   puts [uid, rank, orgn, host].join("\t")
28 | end
29 |
30 | # Without an argument the original requested the page with an empty id.
31 | abort "usage: #{File.basename($0)} TAXONOMY_UID" if ARGV.empty?
32 |
33 | get_host(ARGV[0])
-------------------------------------------------------------------------------- /ncbi/taxonomy/virus_hosts/data/host_count.txt: -------------------------------------------------------------------------------- 1 | 1301 2 | 283 algae 3 | 114 archaea 4 | 4509 bacteria 5 | 8 diatom 6 | 51 enviroment 7 | 267 fungi 8 | 1 fungi| plants| invertebrates 9 | 4 human 10 | 761 invertebrates 11 | 181 invertebrates| plants 12 | 7 invertebrates| vertebrates 13 | 3979 plants 14 | 102 protozoa 15 | 6834 vertebrates 16 | 115052 vertebrates| human 17 | 43 vertebrates| human stool 18 | 225 vertebrates| invertebrates 19 | 656 vertebrates| invertebrates| human 20 |
--------------------------------------------------------------------------------
/uniprot_words/code/R/match_words_uniprot.R:
--------------------------------------------------------------------------------
1 | library(readr)
2 | library(dplyr)
3 | library(seqinr)
4 | library(AhoCorasickTrie)
5 |
6 | # Input files. Stop with a clear message if either is missing instead of
7 | # failing inside a reader part-way through the run.
8 | words_file <- "~/Downloads/words_alpha.txt"
9 | fasta_file <- "~/Downloads/uniprot_sprot.fasta.gz"
10 | for (f in c(words_file, fasta_file)) {
11 |   if (!file.exists(f)) {
12 |     stop("Input file not found: ", f, call. = FALSE)
13 |   }
14 | }
15 |
16 | # word list, upper-cased before matching against the sequences
17 | words <- read_lines(words_file) %>%
18 |   toupper()
19 |
20 | # protein sequences, each read as a single string
21 | sp <- read.fasta(fasta_file,
22 |                  as.string = TRUE,
23 |                  seqtype = "AA")
24 |
25 | # search with words longer than 7 characters & retain only sequences with hits
26 | long_words <- words[which(nchar(words) > 7)]
27 | results <- AhoCorasickSearchList(long_words, sp, alphabet = "aminoacid")
28 | # vapply() always returns a logical vector, even when results is empty
29 | has_hit <- vapply(results, function(x) length(x[[1]]) > 0, logical(1))
30 | results <- results[which(has_hit)]
31 |
32 | # One row per sequence, built once and reused below: the first hit is in
33 | # Keyword/Offset and the second hit in Keyword.1/Offset.1.
34 | # NOTE(review): a third hit on one sequence would presumably appear as
35 | # Keyword.2/Offset.2 and is not collected here - confirm whether that matters.
36 | hits <- results %>%
37 |   plyr::ldply(as.data.frame, stringsAsFactors = FALSE) %>%
38 |   as_tibble()
39 |
40 | # second hits (rows without one are dropped by na.omit)
41 | results01 <- hits %>%
42 |   select(.id, Keyword = Keyword.1, Offset = Offset.1) %>%
43 |   na.omit()
44 |
45 | # first hits
46 | results02 <- hits %>%
47 |   select(.id, Keyword, Offset) %>%
48 |   na.omit()
49 |
50 | # recombine, longest words first
51 | word_matches <- bind_rows(results01, results02) %>%
52 |   arrange(desc(nchar(Keyword)))
53 |
54 | # assumes running from code/R/
55 | word_matches %>% write_csv("../../data/word_matches.csv")
-------------------------------------------------------------------------------- /uniprot_words/data/word_matches_de.csv: -------------------------------------------------------------------------------- 1 | .id,Keyword,Offset 2 | sp|Q1LX78|CFTR_DANRE,ALTSEIMEN,261 3 | sp|G5ED05|CNNM5_CAEEL,EREKTILEN,312 4 | sp|Q3JCN1|G6PI_NITOC,GEPINNTEN,83 5 | sp|Q99KY4|GAK_MOUSE,KAPITELLE,325 6 | sp|P97874|GAK_RAT,KAPITELLE,325 7 | sp|Q56198|GLK_STAXY,TAGLILIEN,272 8 | sp|Q8N3R3|TCAIM_HUMAN,VEREINENS,53 9 | sp|Q8TKS3|UVRB_METAC,DIKTIERTE,498 10 | sp|Q8PRZ9|UVRB_METMA,DIKTIERTE,498 11 | sp|Q6FTM9|KEX1_CANGA,ANDERSWIE,331 12 | sp|A1TZU2|MNMC_MARN8,LERNTYPEN,358 13 | sp|P0A447|PSBA2_SYNEL,RETTETEST,225 14 | sp|P0A446|PSBA2_THEVB,RETTETEST,225 15 | sp|A0Q810|SECB2_FRATN,INNENFELD,50 16 | sp|Q6DJR2|WWC2_XENTR,EIERLEGER,369 17 | sp|O77384|LRR4_PLAF7,ENDENDEN,3335 18 | sp|A0P8X0|AAMY_NIACI,LERNFALL,488 19 | sp|Q65X71|ACA6_ORYSJ,EINCREME,924 20 | sp|A7MS74|ACCD1_VIBC1,AKTSAALE,180 21 | sp|Q87MP2|ACCD1_VIBPA,AKTSAALE,180 22 | sp|Q87I11|ACCD2_VIBPA,AKTSAALE,180 23 | sp|Q0VPJ1|ACCD_ALCBS,AKTSAALE,184 24 | sp|Q5WEF4|ACCD_ALKCK,AKTSAALE,180 25 | sp|Q1QY40|ACCD_CHRSD,AKTSAALE,182 26 | sp|Q9K841|ACCD_HALH5,AKTSAALE,180 27 | sp|A5F2T5|ACCD_VIBC3,AKTSAALE,180 28 | sp|Q9KTA3|ACCD_VIBCH,AKTSAALE,180 29 | sp|Q8DB33|ACCD_VIBVU,AKTSAALE,180 30 | sp|Q7MIU0|ACCD_VIBVY,AKTSAALE,180 31 | sp|Q2IWU7|ACPS_RHOP2,DRIFTETE,28 32 | sp|Q136W1|ACPS_RHOPS,DRIFTETE,28 33 | sp|P71073|ADER_BACSU,RINGELTE,208 34 | sp|B3NAM7|AFFL_DROER,GLASPART,1161 35 | sp|Q9VQI9|AFFL_DROME,GLASPART,1153 36 | sp|Q29KG4|AFFL_DROPS,GLASPART,1220 37 | sp|B4MUE1|AFFL_DROWI,GLASPART,1308 38 | sp|B4NXA8|AFFL_DROYA,GLASPART,1151 39 | sp|P26818|ARBK2_BOVIN,LEERHEIT,47 40 | 
sp|Q5L1V3|ARGB_GEOKA,TIERLIED,198 41 | sp|Q2NGN7|ARLY_METST,FIKTIVEM,133 42 | sp|P73997|AROB_SYNY3,ANLERNST,93 43 | sp|P25550|ASLB_ECOLI,VERLADET,202 44 | sp|A1R591|ASSY_PAEAT,EDITIERE,287 45 | sp|B8HGC9|ASSY_PSECP,EDITIERE,287 46 | sp|P0DJJ2|ASTL_CHICK,STATTEST,29 47 | sp|Q6CT08|ATG9_KLULA,ANATEVKA,414 48 | sp|W0TIW1|ATG9_KLUMD,ANATEVKA,415 49 | sp|Q6LKZ6|ATPB2_PHOPR,ANSTELLE,122 50 | sp|B6EHG4|ATPB_ALISL,ANSTELLE,121 51 | sp|B8F774|ATPB_GLAP5,ANSTELLE,118 52 | sp|Q112Z3|ATPF2_TRIEI,KAKERLAK,65 53 | sp|Q55EI6|ATX10_DICDI,TESTTEST,268 54 | sp|A0A385DVS7|AUXCP_BPCA1,LENKTEST,145 55 | sp|Q9FKV2|BBE23_ARATH,EINLADET,143 56 | sp|Q84WV2|BGL20_ARATH,EHEFEHDE,527 57 | sp|P33144|BIMB_EMENI,PENDELND,590 58 | sp|B5BT18|BTAF1_ARATH,HIESIGER,687 59 | sp|Q13137|CACO2_HUMAN,ERLEGEND,329 60 | sp|Q4R914|CACO2_MACFA,ERLEGEND,281 61 | sp|Q5R7H1|CACO2_PONAB,ERLEGEND,329 62 | sp|Q7V9U4|CAPP_PROMA,WATTIERT,408 63 | sp|B7IHG0|CBID_THEAB,EISMASSE,187 64 | sp|Q640L5|CCD18_MOUSE,EIERLAGE,1164 65 | sp|P32468|CDC12_YEAST,ENTGEGNE,316 66 | sp|Q52G60|CEF1_MAGO7,SKANDALS,710 67 | sp|Q02224|CENPE_HUMAN,NIESELNS,495 68 | sp|Q1LX78|CFTR_DANRE,ALTSEIME,261 69 | sp|Q0VF96|CGNL1_HUMAN,LEESEGEL,672 70 | sp|Q6AW69|CGNL1_MOUSE,LEESEGEL,668 71 | sp|P12024|CHAO_DROME,KINNLADE,635 72 | sp|Q22516|CHD3_CAEEL,CRICKETS,331 73 | sp|Q1L8T5|CING_DANRE,LEERERER,785 74 | sp|Q9LSX4|CKL8_ARATH,VERPISST,387 75 | sp|G5ED05|CNNM5_CAEEL,EREKTILE,312 76 | sp|P0C0L4|CO4A_HUMAN,ERDFALLS,79 77 | sp|P0C0L5|CO4B_HUMAN,ERDFALLS,79 78 | sp|Q9TU53|CUBN_CANLF,GEILTEST,929 79 | sp|O60494|CUBN_HUMAN,GEILTEST,933 80 | sp|P0C1J2|CWC27_RHIO9,LEIDENER,147 81 | sp|C1BL82|DDRGK_OSMMO,GEADELTE,188 82 | sp|B1XSN2|DEF_POLNS,KRAKELIG,162 83 | sp|P54925|DEGPL_BARHE,SAALETAL,16 84 | sp|Q1LU74|DER_BAUCH,KRISTALL,362 85 | sp|O60231|DHX16_HUMAN,WERTERER,164 86 | sp|Q7YR39|DHX16_PANTR,WERTERER,167 87 | sp|Q767K6|DHX16_PIG,WERTERER,168 88 | sp|Q08387|DNLI4_YEAST,ELEKTIVE,710 89 | sp|Q14185|DOCK1_HUMAN,IMMENSES,661 90 | 
sp|Q8BUR4|DOCK1_MOUSE,IMMENSES,661 91 | sp|Q9BY84|DUS16_HUMAN,GESIMSEN,633 92 | sp|B3ELV8|END4_CHLPB,ALTAKTIE,43 93 | sp|Q9SN20|FB200_ARATH,ANEKELNS,375 94 | sp|Q9LH52|FLOR1_ARATH,ENDKNALL,26 95 | sp|B8ZUV4|FOLD_MYCLB,DENTALER,105 96 | sp|O32879|FOLD_MYCLE,DENTALER,105 97 | sp|Q3JCN1|G6PI_NITOC,GEPINNTE,83 98 | sp|Q99KY4|GAK_MOUSE,KAPITELL,325 99 | sp|P97874|GAK_RAT,KAPITELL,325 100 | sp|A0A1D8PNP3|GAP6_CANAL,PLAGIATS,389 101 | sp|A6VBJ8|GATA_PSEA7,ERRANGEN,59 102 | sp|B7V023|GATA_PSEA8,ERRANGEN,59 103 | sp|Q02GV8|GATA_PSEAB,ERRANGEN,59 104 | sp|Q9HVT8|GATA_PSEAE,ERRANGEN,59 105 | sp|Q56198|GLK_STAXY,TAGLILIE,272 106 | sp|Q985F6|GLNE_RHILO,PARAGRAF,664 107 | sp|Q9T0P4|GLTB2_ARATH,PASSIVER,965 108 | sp|A8XLW0|GOSR1_CAEBR,KARSTENS,11 109 | sp|Q95ZW1|GOSR1_CAEEL,KARSTENS,11 110 | sp|P52033|GPXC_DIRIM,FIDELERE,213 111 | sp|A4J6H0|GSA_DESRM,TENDIERT,411 112 | sp|Q9ULI3|HEG1_HUMAN,FIEPSTEN,396 113 | sp|A6WCV0|HIS7_KINRD,TARIERST,4 114 | sp|O17894|HM35_CAEEL,KREISRAT,248 115 | sp|P13545|HMB1_STRPU,GENESEST,262 116 | sp|Q9FN19|HOS15_ARATH,EREIFERE,175 117 | sp|A0LH26|HRCA_SYNFM,LEEREREM,86 118 | sp|Q89A17|HSCB_BUCBP,ELFERLEI,96 119 | sp|Q5PB86|HTPG_ANAMM,GESELLEN,594 120 | sp|Q8SQG8|HYAL2_BOVIN,LISTIGES,313 121 | sp|Q12891|HYAL2_HUMAN,LISTIGES,310 122 | sp|O35632|HYAL2_MOUSE,LISTIGES,310 123 | sp|Q9Z2Q3|HYAL2_RAT,LISTIGES,310 124 | sp|Q8SQG7|HYAL2_SHEEP,LISTIGES,313 125 | sp|Q05A56|HYAL4_MOUSE,LISTIGES,321 126 | sp|Q2G5E7|IF2_NOVAD,KARELIER,739 127 | sp|Q3V3Q4|IFI8_MOUSE,TEILSTIL,188 128 | sp|O28294|ILVC_ARCFU,KALEVALA,161 129 | sp|Q17R60|IMPG1_HUMAN,NETTESTE,42 130 | sp|B7L043|KDSB_METC4,ADLERART,232 131 | sp|A9VZK8|KDSB_METEP,ADLERART,232 132 | sp|B1ZJ23|KDSB_METPB,ADLERART,232 133 | sp|O94806|KPCD3_HUMAN,SEILRISS,463 134 | sp|O77384|LRR4_PLAF7,ENDENDEN,3332 135 | sp|O60732|MAGC1_HUMAN,GEPRELLT,1038 136 | sp|Q9UBF1|MAGC2_HUMAN,GEPRELLT,268 137 | sp|C0NF00|MDM12_AJECG,TIPPELEI,52 138 | sp|A6QYC8|MDM12_AJECN,TIPPELEI,52 139 | sp|C5GK63|MDM12_AJEDR,TIPPELEI,52 140 | 
sp|A1CNY1|MDM12_ASPCL,TIPPELEI,52 141 | sp|B0XN24|MDM12_ASPFC,TIPPELEI,52 142 | sp|Q4WRX2|MDM12_ASPFU,TIPPELEI,52 143 | sp|A2QAU8|MDM12_ASPNC,TIPPELEI,52 144 | sp|C5K0S2|MDM12_BLAGS,TIPPELEI,52 145 | sp|A1D1T8|MDM12_NEOFI,TIPPELEI,52 146 | sp|C1H3V1|MDM12_PARBA,TIPPELEI,52 147 | sp|C1GHQ8|MDM12_PARBD,TIPPELEI,52 148 | sp|C0SE33|MDM12_PARBP,TIPPELEI,52 149 | sp|Q6CI13|MDM12_YARLI,TIPPELEI,51 150 | sp|Q4PFA7|MDM34_USTMA,TIPPELEI,48 151 | sp|Q71YZ2|MEND_LISMF,ALPENSEE,138 152 | sp|Q8Y6K9|MEND_LISMO,ALPENSEE,138 153 | sp|G5EBL2|MES1_CAEEL,VIGILIEN,732 154 | sp|Q5HYA8|MKS3_HUMAN,VERDINGT,136 155 | sp|P40850|MKT1_YEAST,FITTINGS,530 156 | sp|G0SA56|MLP1_CHATD,KRAKELEE,1610 157 | sp|P28810|MMSA_PSEAE,AIRLINES,399 158 | sp|A1KCP8|MNME_AZOSB,ERIGIERT,286 159 | sp|Q1LH94|MNME_CUPMC,ERIGIERT,287 160 | sp|Q0KFG6|MNME_CUPNH,ERIGIERT,287 161 | sp|Q46VM0|MNME_CUPPJ,ERIGIERT,287 162 | sp|A4GAN2|MNME_HERAR,ERIGIERT,293 163 | sp|A6T4D6|MNME_JANMA,ERIGIERT,292 164 | sp|C1D6H7|MNME_LARHH,ERIGIERT,284 165 | sp|Q8Y3H5|MNME_RALSO,ERIGIERT,297 166 | sp|O35024|MNTC_BACSU,MELANIES,381 167 | sp|P48563|MON2_YEAST,SPLITTEN,398 168 | sp|Q12317|MSB4_YEAST,VERKEILT,51 169 | sp|O74502|MSH6_SCHPO,NERVEREI,594 170 | sp|Q8WXI7|MUC16_HUMAN,SPEISTET,8762 171 | sp|Q89DE6|MUTL_BRADU,ALTKANAL,243 172 | sp|B3Q7Y9|MUTL_RHOPT,ALTKANAL,243 173 | sp|B8FJL5|MUTS_DESAL,DRINGEND,539 174 | sp|Q0AEI7|MUTS_NITEC,ERYSIPEL,517 175 | sp|Q56215|MUTS_THEAQ,LEERERER,425 176 | sp|Q63358|MYO9B_RAT,SCREENST,1161 177 | sp|Q99PD7|NCKX3_MOUSE,NENNENDE,417 178 | sp|P39864|NIA_PHYIN,PISSENDE,693 179 | sp|Q6IR61|NIT2A_XENLA,GESTELLS,63 180 | sp|Q6INI7|NIT2B_XENLA,GESTELLS,63 181 | sp|Q12080|NOP53_YEAST,SEETEILS,356 182 | sp|A0A455M2Y3|NTNH_NECSZ,LERNTIPP,369 183 | sp|Q9YDY8|NTPTH_AERPE,REALTEIL,139 184 | sp|Q89JL7|OADC_BRADU,KALKGLAS,256 185 | sp|B7J427|OBG_ACIF2,ERLENWEG,355 186 | sp|B5ELU2|OBG_ACIF5,ERLENWEG,355 187 | sp|Q01323|OTC_NEIFL,GRILLTEN,157 188 | sp|O86408|OTC_NEIPH,GRILLTEN,142 189 | sp|O86415|OTC_NEISU,GRILLTEN,142 
190 | sp|P06108|P49_STRLI,STEIGAAL,311 191 | sp|D4N4Z9|PCHTP_TRISP,GEREICHE,101 192 | sp|Q9NJ15|PCSK5_BRACL,SCHRECKT,1335 193 | sp|B0G101|PKS8_DICDI,EISHAIEN,2083 194 | sp|B0S1M8|PLSX_FINM2,ETIENNES,60 195 | sp|P0A4K4|PMRA_STRPN,NATALIAS,119 196 | sp|P0A4K5|PMRA_STRR6,NATALIAS,119 197 | sp|Q8PGR7|PUR4_XANAC,VERHALTE,1000 198 | sp|Q8PCQ7|PUR4_XANCP,VERHALTE,1000 199 | sp|O67775|PUR9_AQUAE,GETAKELT,311 200 | sp|Q72LY0|PURT_LEPIC,DELIKATE,129 201 | sp|Q8EYF0|PURT_LEPIN,DELIKATE,129 202 | sp|A6SX69|RDGC_JANMA,SPRENGEL,46 203 | sp|A9WR65|RISB_RENSM,SAALETAL,149 204 | sp|A2BT57|RLMN_PROMS,KERNLAND,70 205 | sp|Q9GYH7|RME6_CAEEL,ANPRALLS,1050 206 | sp|Q9D304|RN128_MOUSE,LASERGAS,134 207 | sp|Q3ACX1|RNY_CARHZ,VEREHRER,73 208 | sp|P70335|ROCK1_MOUSE,KNISTERT,990 209 | sp|Q63644|ROCK1_RAT,KNISTERT,990 210 | sp|Q5ZKQ3|RPAP3_CHICK,SELTENER,264 211 | sp|Q04EL5|RPOZ_OENOB,KRAKELEE,27 212 | sp|Q6B8N4|RR4_GRATL,NAIVERES,177 213 | sp|B9W9A9|RRT5_CANDC,ENTGEGEN,259 214 | sp|A8LM44|RS7_DINSH,MEERLAGE,116 215 | sp|Q28UW9|RS7_JANSC,MEERLAGE,116 216 | sp|A1B022|RS7_PARDP,MEERLAGE,116 217 | sp|P59061|RS7_RHOCA,MEERLAGE,116 218 | sp|Q160Y2|RS7_ROSDO,MEERLAGE,116 219 | sp|Q5LMR3|RS7_RUEPO,MEERLAGE,116 220 | sp|Q1GK43|RS7_RUEST,MEERLAGE,116 221 | sp|Q12XH7|RSMA_METBU,VAKANTEN,141 222 | sp|P17863|SKI_AVIES,ANEKELST,385 223 | sp|P49140|SKI_CHICK,ANEKELST,406 224 | sp|Q96Q15|SMG1_HUMAN,PRIESEST,1582 225 | sp|Q8BKX6|SMG1_MOUSE,PRIESEST,1580 226 | sp|Q4P9E5|SPB4_USTMA,DREIFELS,522 227 | sp|P17123|SPO12_YEAST,GEDANKEN,53 228 | sp|Q6G2Z4|SYA_BARHE,EKELNDER,753 229 | sp|A5GPF9|SYA_SYNPW,RIESLING,621 230 | sp|A7IAG1|SYE_METB6,AALSPEER,60 231 | sp|Q9V0V2|SYR_PYRAB,KELTERER,536 232 | sp|Q0VSA8|SYV_ALCBS,VERKLAGE,90 233 | sp|Q9R099|TBL2_MOUSE,KRASSEST,412 234 | sp|Q8N3R3|TCAIM_HUMAN,VEREINEN,53 235 | sp|A0A1I4KS07|TCPO_METOL,HEILERIN,330 236 | sp|Q3J7C1|THIC_NITOC,RADTEILS,95 237 | sp|Q54T85|TRA1_DICDI,SKELETTS,4057 238 | sp|Q0I3P3|TRUB_HISS1,AGITIERE,142 239 | sp|B0UU15|TRUB_HISS2,AGITIERE,142 240 | 
sp|Q8TKS3|UVRB_METAC,DIKTIERT,498 241 | sp|Q8PRZ9|UVRB_METMA,DIKTIERT,498 242 | sp|Q97CP8|VATD_THEVO,VALERIAN,73 243 | sp|J3S836|VCO3_CROAD,DENKREDE,727 244 | sp|A4UGR9|XIRP2_HUMAN,STARRTEN,2953 245 | sp|Q71LX6|XIRP2_RAT,ALTVATER,2393 246 | sp|Q6AWX0|XYLL2_ARATH,REPLIKEN,29 247 | sp|B4EUS1|Y339_PROMH,EREILENS,139 248 | sp|Q3K6A2|Y4965_PSEPF,INSELRAT,77 249 | sp|C3K2M0|Y5418_PSEFS,INSELRAT,77 250 | sp|P31489|YADA1_YEREN,GLASSAAL,385 251 | sp|P0C2W0|YADA2_YEREN,GLASSAAL,352 252 | sp|A1JUB7|YADA_YERE8,GLASSAAL,352 253 | sp|P10858|YADA_YERPS,GLASSAAL,362 254 | sp|Q9CGY1|YJIE_LACLA,FASSTEST,159 255 | sp|Q2UBI2|YME2_ASPOR,SANDLAND,233 256 | -------------------------------------------------------------------------------- /uniprot_words/data/word_matches_dk.csv: -------------------------------------------------------------------------------- 1 | .id,Keyword,Offset 2 | sp|P06105|SC160_YEAST,ANSVARLIG,643 3 | sp|C3PNE7|SYL_RICAE,SELEKTERE,287 4 | sp|A1JIX1|TRUB_YERE8,EGALISERE,92 5 | sp|O34273|TRUB_YEREN,EGALISERE,92 6 | sp|A7FMS0|TRUB_YERP3,EGALISERE,92 7 | sp|Q1CC09|TRUB_YERPA,EGALISERE,92 8 | sp|Q8ZBC4|TRUB_YERPE,EGALISERE,92 9 | sp|Q1CEL5|TRUB_YERPN,EGALISERE,92 10 | sp|A4TRI1|TRUB_YERPP,EGALISERE,92 11 | sp|Q66F58|TRUB_YERPS,EGALISERE,92 12 | sp|Q60312|Y002_METJA,LYSSKYHED,92 13 | sp|B8GD12|ARLY_CHLAD,ALGIERER,205 14 | sp|Q6BTX0|ATG2_DEBHA,KANTNING,583 15 | sp|Q112Z3|ATPF2_TRIEI,KAKERLAK,65 16 | sp|Q5E769|BAMB_ALIF1,LAKPLADE,144 17 | sp|Q12D73|BIOB2_POLSJ,VAGTPLAN,247 18 | sp|B9MJH4|BIOB_ACIET,VAGTPLAN,252 19 | sp|Q8VCR2|DHB13_MOUSE,VARSLING,242 20 | sp|Q9VNJ5|DISP_DROME,VELANSET,1160 21 | sp|Q5NPS6|DNAK_ZYMMO,ISRAELER,295 22 | sp|Q985F6|GLNE_RHILO,PARAGRAF,664 23 | sp|Q4URM1|GLO2_XANC8,VANDGRAV,18 24 | sp|B0RTE9|GLO2_XANCB,VANDGRAV,18 25 | sp|Q8PBY0|GLO2_XANCP,VANDGRAV,18 26 | sp|Q05584|GLO2_YEAST,GENKALDE,206 27 | sp|P14750|HCYA_APHSP,FIREDELE,411 28 | sp|Q58CP0|IDH3G_BOVIN,LIVRENTE,163 29 | sp|P51553|IDH3G_HUMAN,LIVRENTE,164 30 | sp|P41564|IDH3G_MACFA,LIVRENTE,126 31 
| sp|P70404|IDHG1_MOUSE,LIVRENTE,164 32 | sp|P41565|IDHG1_RAT,LIVRENTE,164 33 | sp|Q3SKX1|IF2_THIDA,KANTNING,569 34 | sp|Q27564|KITH_DICDI,AFSKRIVE,155 35 | sp|P50455|LEU3_SULTO,LIVRENTE,114 36 | sp|A2SZS3|L_RVFV,SEKSTANT,1175 37 | sp|P27316|L_RVFVZ,SEKSTANT,1175 38 | sp|C5NZL6|MEP8_COCP7,AFHANDLE,151 39 | sp|P40850|MKT1_YEAST,FITTINGS,530 40 | sp|Q80XB4|NRAP_MOUSE,KASSEVIS,1051 41 | sp|B1ZRS0|NUON1_OPITP,VALGSLAG,398 42 | sp|P13909|PAI1_BOVIN,SALTSILD,271 43 | sp|P79335|PAI1_PIG,SALTSILD,271 44 | sp|P24004|PEX1_YEAST,GENSIDIG,1030 45 | sp|Q13YI7|PHNW1_PARXL,HALVLANG,85 46 | sp|A0A8J9RIY3|PHP21_PHOLO,TALELYST,400 47 | sp|A0A142I735|PHP22_PHOLO,TALELYST,400 48 | sp|Q42556|PMA9_ARATH,STYRELSE,903 49 | sp|A9BDD9|PYRG_PROM4,HACIENDA,315 50 | sp|Q17XC9|RIBA_HELAH,MALERISK,74 51 | sp|B6JM32|RIBA_HELP2,MALERISK,74 52 | sp|Q1CT68|RIBA_HELPH,MALERISK,74 53 | sp|Q9ZL42|RIBA_HELPJ,MALERISK,74 54 | sp|O08315|RIBA_HELPY,MALERISK,74 55 | sp|P52822|RL5A_SCHPO,PEGEFELT,125 56 | sp|O74306|RL5B_SCHPO,PEGEFELT,125 57 | sp|P67284|RNY_STRP1,LIVSALIG,7 58 | sp|P0DF20|RNY_STRP3,LIVSALIG,7 59 | sp|Q5XAP0|RNY_STRP6,LIVSALIG,7 60 | sp|Q8P000|RNY_STRP8,LIVSALIG,7 61 | sp|Q1JAJ3|RNY_STRPB,LIVSALIG,7 62 | sp|Q1JKP5|RNY_STRPC,LIVSALIG,7 63 | sp|Q1JFN6|RNY_STRPD,LIVSALIG,7 64 | sp|Q1J5I5|RNY_STRPF,LIVSALIG,7 65 | sp|A2RD66|RNY_STRPG,LIVSALIG,7 66 | sp|Q48S17|RNY_STRPM,LIVSALIG,7 67 | sp|P0DF21|RNY_STRPQ,LIVSALIG,7 68 | sp|Q5UZR5|RPO1C_HALMA,RETLINET,289 69 | sp|Q21M92|RPOC_SACD2,RIDETIME,551 70 | sp|B1MGA0|RS4_MYCA9,VARETAGE,152 71 | sp|A5GPF9|SYA_SYNPW,RIESLING,621 72 | sp|Q5P7Y0|SYFB_AROAE,SLAGVARE,170 73 | sp|Q4R7U0|TMC7_MACFA,PLAYLIST,265 74 | sp|Q8C428|TMC7_MOUSE,PLAYLIST,263 75 | sp|Q9Y2B5|VP9D1_HUMAN,SAMKLANG,15 76 | -------------------------------------------------------------------------------- /uniprot_words/data/word_matches_en.csv: -------------------------------------------------------------------------------- 1 | .id,Keyword,Offset 2 | sp|Q2TAC2|CCD57_HUMAN,SLAVERERS,410 3 | 
sp|B7ZRM8|EVI1B_XENLA,NIDERINGS,861 4 | sp|A9BDD9|PYRG_PROM4,HACIENDAS,315 5 | sp|P17284|VIF_SIVCZ,ALKALISER,150 6 | sp|A8AY34|ADDB_STRGC,SLYNESSES,820 7 | sp|Q96LP6|CL042_HUMAN,TARGETEER,145 8 | sp|Q8L5Z1|GDL17_ARATH,TETANILLA,341 9 | sp|Q8TV85|METK_METKA,DELLENITE,376 10 | sp|P25202|RPC1_GIAIN,CHAPSTICK,163 11 | sp|P03700|VINT_LAMBD,HIDALGISM,247 12 | sp|Q5FTU6|2KGR_GLUOX,SHARPEST,172 13 | sp|Q54ET6|ABPF_DICDI,KNELLING,911 14 | sp|P31562|ACCD_CUSRE,SKINKING,98 15 | sp|A9L9A5|ACCD_LEMMI,FINELESS,110 16 | sp|Q1QPW6|ACKA_NITHX,RELEASED,284 17 | sp|Q5FJW9|ADDB_LACAC,FLINKITE,467 18 | sp|Q38X70|ADDB_LATSS,ALTERING,469 19 | sp|Q8TF27|AGA11_HUMAN,STREEKER,422 20 | sp|Q9UPQ3|AGAP1_HUMAN,STREEKER,703 21 | sp|Q8BXK8|AGAP1_MOUSE,STREEKER,703 22 | sp|Q96P64|AGAP4_HUMAN,STREEKER,535 23 | sp|A6NIR3|AGAP5_HUMAN,STREEKER,558 24 | sp|Q5VW22|AGAP6_HUMAN,STREEKER,535 25 | sp|Q5VUJ5|AGAP7_HUMAN,STREEKER,535 26 | sp|Q4P2W6|ALG10_USTMA,MALACTIC,225 27 | sp|P12726|ALT_BPT4,ASCIDIAN,667 28 | sp|Q84NP7|AMPD_ORYSJ,SAFETIED,118 29 | sp|P91885|AMPN_MANSE,AGERASIA,881 30 | sp|Q5ZXN6|ANKX_LEGPH,PAHLAVIS,395 31 | sp|Q6LTE9|APT_PHOPR,PREVISES,83 32 | sp|Q1AS30|ARGB_RUBXD,GALAGALA,231 33 | sp|A6V1N6|ARNF_PSEA7,LAVALAVA,48 34 | sp|B7VBM8|ARNF_PSEA8,LAVALAVA,48 35 | sp|Q9HY59|ARNF_PSEAE,LAVALAVA,48 36 | sp|B4SMU6|AROE_STRM5,FLAGLEAF,44 37 | sp|B2FMH7|AROE_STRMK,FLAGLEAF,44 38 | sp|Q493B0|AROQ_BLOPB,FRILLING,5 39 | sp|B6YR08|ATPF_AZOPC,LINELIKE,153 40 | sp|Q5A4W8|BDF1_CANAL,ASSESSEE,724 41 | sp|B1KPJ7|BIOB_SHEWM,KATAKANA,332 42 | sp|O52587|BIOD_MYCBO,SALARIAT,184 43 | sp|A1KJ01|BIOD_MYCBP,SALARIAT,184 44 | sp|C1ANK7|BIOD_MYCBT,SALARIAT,184 45 | sp|P9WPQ4|BIOD_MYCTO,SALARIAT,184 46 | sp|Q4I7N9|BRE1_GIBZE,ARDELLAE,374 47 | sp|Q7S304|BRE1_NEUCR,ARDELLAE,393 48 | sp|P32333|BTAF1_YEAST,KILTLIKE,1005 49 | sp|Q02294|CAC1B_RAT,PRETENSE,971 50 | sp|Q8LBH2|CAP8_ARATH,DISASTER,436 51 | sp|Q86UW7|CAPS2_HUMAN,ASPARKLE,831 52 | sp|O27158|CAS3_METTH,TRAILERY,645 53 | sp|A4FXY2|CCA_METM5,SINKLIKE,326 54 | 
sp|Q2TAC2|CCD57_HUMAN,SLAVERER,410 55 | sp|B0CEZ1|CH602_ACAM1,LIGATIVE,437 56 | sp|Q93G07|CH60_LACAC,DAIKERED,338 57 | sp|Q22516|CHD3_CAEEL,CRICKETS,331 58 | sp|Q94F88|CMT3_ARATH,KETIPATE,199 59 | sp|B2AG52|COAX_CUPTR,GALAGALA,254 60 | sp|C9K1X7|COTB4_STRMJ,HALLMARK,32 61 | sp|Q5TZA2|CROCC_HUMAN,REVERSAL,1862 62 | sp|G0HV85|CSG1_HALHT,ADENITIS,216 63 | sp|G0HV86|CSG2_HALHT,ADENITIS,217 64 | sp|B2RX88|CSPP1_MOUSE,ARRANGER,254 65 | sp|A6WJU3|CYSD_SHEB8,FELLAHIN,85 66 | sp|Q7ZV84|DAAF1_DANRE,SHILPITS,464 67 | sp|A0A0H3M776|DARG_MYCBP,GRAVILEA,167 68 | sp|O53605|DARG_MYCTU,GRAVILEA,167 69 | sp|P05385|DBH_CLOPA,ALKALIES,24 70 | sp|Q98KB6|DDLB_RHILO,CADALENE,27 71 | sp|B1AJ22|DER_UREP2,FAINEANT,76 72 | sp|Q9PQA7|DER_UREPA,FAINEANT,76 73 | sp|B5ZBM9|DER_UREU1,FAINEANT,76 74 | sp|Q9CCG2|DNAG_MYCLE,DRIGHTIN,92 75 | sp|Q04503|DP87_DICDI,ASSESSES,543 76 | sp|Q05FI2|EFG_CARRP,KITLINGS,554 77 | sp|A5D5I8|EFTU2_PELTS,KIDNAPEE,49 78 | sp|P14895|ELI5_HORVU,PAPERING,127 79 | sp|P14896|ELI6_HORVU,PAPERING,64 80 | sp|P14897|ELI9_HORVU,PAPERING,69 81 | sp|P93735|ELIP1_ARATH,PAPERING,95 82 | sp|Q94K66|ELIP2_ARATH,PAPERING,93 83 | sp|P11432|ELI_PEA,PAPERING,96 84 | sp|B7ZRM8|EVI1B_XENLA,NIDERING,861 85 | sp|Q19262|EXOC3_CAEEL,ARDELLAE,152 86 | sp|F1P065|FARP1_CHICK,ASTRAEID,840 87 | sp|Q9LPH0|FB57_ARATH,PISSANTS,351 88 | sp|Q9LXQ4|FBL50_ARATH,SKIPPETS,155 89 | sp|A0A7L8UWS6|FFSC_ASPFV,PARTLESS,3 90 | sp|Q91740|FINC_XENLA,PREVISES,1128 91 | sp|C4ZBG8|FTHS_AGARV,MAILCLAD,203 92 | sp|A3PM52|FTHS_CERS1,MAILCLAD,205 93 | sp|Q3J047|FTHS_CERS4,MAILCLAD,205 94 | sp|B9KLK4|FTHS_CERSK,MAILCLAD,205 95 | sp|P0ABH3|FTSA_SHIFL,DANGLING,421 96 | sp|Q9CD58|FTSH_MYCLE,GANGSHAG,680 97 | sp|Q59W62|GIN4_CANAL,SIDELANG,1098 98 | sp|A8ANL5|GLAH_CITK8,GALLINAE,93 99 | sp|Q5Z175|GLPK_NOCFA,GLISSADE,317 100 | sp|Q03877|GP85_TRYCR,ATLANTES,538 101 | sp|O14357|GPI1_SCHPO,VILLAINY,252 102 | sp|Q08726|GPN2_YEAST,REGALIAN,279 103 | sp|Q4P3F1|HCS1_USTMA,FAIRINGS,379 104 | sp|A0A0E3NEE1|HDRD_METTT,FLAGLIKE,273 105 | 
sp|B9M416|HEM3_GEODF,TRINKLET,153 106 | sp|Q9VR91|HERC2_DROME,HAIRLESS,1063 107 | sp|P49007|HEXB_PSEO7,FAITHING,91 108 | sp|O88850|HIPK3_RAT,PALSTAVE,965 109 | sp|Q4A048|HIS4_STAS1,KINGWEED,138 110 | sp|Q5A1W9|HST3_CANAL,PASSINGS,453 111 | sp|P0A4M4|HST_VIBMI,DANGLING,18 112 | sp|Q5WJE6|HTPG_ALKCK,GESNERIA,438 113 | sp|A9H863|HUTH_GLUDA,SHREDDER,271 114 | sp|A1JSW6|HUTH_YERE8,STRIATED,62 115 | sp|Q8ZA10|HUTH_YERPE,STRIATED,65 116 | sp|Q664B8|HUTH_YERPS,STRIATED,65 117 | sp|C8VJW0|HXNR_EMENI,GAPPIEST,339 118 | sp|Q53479|IDSA_METTM,REVEALED,194 119 | sp|Q8TVE5|IF2G_METKA,LEVELLER,330 120 | sp|A4JDX1|IF2_BURVG,PAGATPAT,313 121 | sp|Q5AIA4|IML1_CANAL,PALMIPES,1051 122 | sp|P40559|INP51_YEAST,ELEGISED,261 123 | sp|Q6P4Y6|IRS1_XENTR,FRAPPEED,568 124 | sp|P63394|IRTB_MYCBO,PALESTRA,295 125 | sp|P9WQJ6|IRTB_MYCTO,PALESTRA,295 126 | sp|P9WQJ7|IRTB_MYCTU,PALESTRA,295 127 | sp|Q8GYU3|IYO_ARATH,FLAGLESS,1181 128 | sp|A1L317|K1C24_MOUSE,CLEADING,242 129 | sp|C7GWZ2|KEX1_YEAS2,SWADDLES,674 130 | sp|C8Z852|KEX1_YEAS8,SWADDLES,674 131 | sp|E7NHF8|KEX1_YEASO,SWADDLES,675 132 | sp|P09620|KEX1_YEAST,SWADDLES,666 133 | sp|Q6PAR0|KLD10_MOUSE,AMALINGS,208 134 | sp|Q5U3Y0|KLD10_RAT,AMALINGS,179 135 | sp|P42215|KPSU1_ECOLX,VIVIPARY,5 136 | sp|P42216|KPSU5_ECOLX,VIVIPARY,5 137 | sp|Q8ZY35|KTHY_PYRAE,LIKEWALK,42 138 | sp|Q12729|LAC1_PLEOS,FELTINGS,367 139 | sp|Q5P089|LEPA_AROAE,DRILLMAN,221 140 | sp|Q2KIB6|LIN7B_BOVIN,RAVELLER,16 141 | sp|Q9HAP6|LIN7B_HUMAN,RAVELLER,16 142 | sp|O88951|LIN7B_MOUSE,RAVELLER,16 143 | sp|Q9Z252|LIN7B_RAT,RAVELLER,16 144 | sp|Q8IVB5|LIX1L_HUMAN,RELASTER,293 145 | sp|Q8BQ89|LIX1L_MOUSE,RELASTER,293 146 | sp|Q5PQQ7|LIX1L_RAT,RELASTER,294 147 | sp|B3P851|LST2_DROER,DEEDLESS,449 148 | sp|B4PRU6|LST2_DROYA,DEEDLESS,449 149 | sp|Q8X5R8|MDTO_ECO57,RALLIERS,570 150 | sp|Q8FAX2|MDTO_ECOL6,RALLIERS,570 151 | sp|P32715|MDTO_ECOLI,RALLIERS,570 152 | sp|Q83IQ8|MDTO_SHIFL,RALLIERS,567 153 | sp|Q7Q6D9|MED24_ANOGA,TASSELED,862 154 | sp|Q07V68|MIAB_RHOP5,PELLEKAR,117 155 | 
sp|Q22227|MIG5_CAEEL,PLASMASE,319 156 | sp|P40850|MKT1_YEAST,FITTINGS,530 157 | sp|P28810|MMSA_PSEAE,AIRLINES,399 158 | sp|Q6NHQ7|MNMA_CORDI,ALLERGIA,109 159 | sp|Q8FQ01|MNMA_COREF,ALLERGIA,109 160 | sp|A4QDK1|MNMA_CORGB,ALLERGIA,109 161 | sp|Q8NR24|MNMA_CORGL,ALLERGIA,109 162 | sp|A8M5E1|MNMA_SALAI,PADPIECE,295 163 | sp|Q88RX6|MNMG_LACPL,GLIDDERY,464 164 | sp|P48563|MON2_YEAST,SPLITTEN,398 165 | sp|G5E8K6|MOT6_MOUSE,AVAILING,310 166 | sp|Q96J65|MRP9_HUMAN,DEVILLED,669 167 | sp|Q80WJ6|MRP9_MOUSE,DEVILLED,670 168 | sp|Q6Y306|MRP9_RAT,DEVILLED,670 169 | sp|Q09816|MTAP_SCHPO,REDIPPED,98 170 | sp|Q1MAC8|MTGA_RHIL3,DIAPERED,2 171 | sp|Q6NTN5|MTMRD_XENLA,PASSLESS,1216 172 | sp|Q6Z7K5|MTP3_ORYSJ,FLAGGILY,153 173 | sp|A8GQA9|MUTL_RICAH,VERRIERE,404 174 | sp|P61666|MUTS_DESVH,REASPIRE,319 175 | sp|Q10YG4|MUTS_TRIEI,LETTERER,500 176 | sp|Q875Q8|MYO2_LACK1,PILEATED,67 177 | sp|Q876G9|MYO2_SACU7,PILEATED,67 178 | sp|P19524|MYO2_YEAST,PILEATED,67 179 | sp|C3VEQ3|NCED_ONCHC,RIPPLING,155 180 | sp|Q5QGS0|NEXMI_HUMAN,GENTLING,15 181 | sp|Q2RGI2|NNR_MOOTA,RADIALLY,119 182 | sp|Q5KZL2|NORM_GEOKA,LAVALAVA,97 183 | sp|Q5L6C0|NQRB_CHLAB,PALSGRAF,196 184 | sp|Q823P2|NQRB_CHLCV,PALSGRAF,196 185 | sp|Q253X4|NQRB_CHLFF,PALSGRAF,196 186 | sp|Q9Z8B6|NQRB_CHLPN,PALSGRAF,196 187 | sp|Q15YQ5|NQRB_PSEA6,PALSGRAF,199 188 | sp|P0C6Z2|NSP6_ROTBU,WISPLIKE,89 189 | sp|B3SRR8|NSP6_ROTH7,WISPLIKE,89 190 | sp|Q9E8F1|NSP6_ROTRF,WISPLIKE,89 191 | sp|P0C712|NSP6_ROTW3,WISPLIKE,89 192 | sp|A8FKG9|NUSB_CAMJ8,LAKELAND,107 193 | sp|A7H4Z5|NUSB_CAMJD,LAKELAND,107 194 | sp|Q9PIC0|NUSB_CAMJE,LAKELAND,107 195 | sp|A1VYA2|NUSB_CAMJJ,LAKELAND,107 196 | sp|Q5HW85|NUSB_CAMJR,LAKELAND,107 197 | sp|A2VDP6|NXPE3_BOVIN,GRISETTE,217 198 | sp|Q969Y0|NXPE3_HUMAN,GRISETTE,217 199 | sp|Q5RCA5|NXPE3_PONAB,GRISETTE,217 200 | sp|C1DEA4|OBG_AZOVD,PALTERER,271 201 | sp|A6VBV3|OBG_PSEA7,PALTERER,271 202 | sp|B7V0A9|OBG_PSEA8,PALTERER,271 203 | sp|Q02GB1|OBG_PSEAB,PALTERER,271 204 | sp|Q9HVL8|OBG_PSEAE,PALTERER,271 205 | 
sp|Q9SA38|OCT3_ARATH,SLEETIER,24 206 | sp|Q57483|OM26_HAEIN,TALALGIA,8 207 | sp|Q7VNN8|ORN_HAEDU,TANGLIER,68 208 | sp|G4N285|OXR1_MAGO7,GRASSILY,385 209 | sp|P54893|P5CR_THET2,IMAGISTS,89 210 | sp|Q9RZV8|PARB3_DEIRA,ANALGIAS,135 211 | sp|P32854|PEP12_YEAST,LEASEMAN,148 212 | sp|Q6QNF3|PGFRB_CANLF,RATLINES,788 213 | sp|P09619|PGFRB_HUMAN,RATLINES,788 214 | sp|P9WPG2|PGSA_MYCTO,AGRARIAN,21 215 | sp|P9WPG3|PGSA_MYCTU,AGRARIAN,21 216 | sp|Q5NL86|PLSX_ZYMMO,LAPACTIC,58 217 | sp|B8D9F8|PNP_BUCA5,SAVAGISM,459 218 | sp|P57454|PNP_BUCAI,SAVAGISM,459 219 | sp|Q8K9H5|PNP_BUCAP,SAVAGISM,459 220 | sp|B8D7R0|PNP_BUCAT,SAVAGISM,459 221 | sp|B3R3W3|PNP_CUPTR,SPINAGES,703 222 | sp|Q2FUB2|POK_METHJ,PRECARIA,124 223 | sp|Q65730|POLG_BSTV1,INVIRILE,2367 224 | sp|Q65729|POLG_BSTVG,INVIRILE,200 225 | sp|O92529|POLG_HCVT5,FLATTING,1074 226 | sp|Q03ZQ0|POTA_LEUMM,DREIDELS,133 227 | sp|P23287|PP2B1_YEAST,ASSAILED,444 228 | sp|P54882|PPX1_MYCLE,LEGISTER,258 229 | sp|Q9LJX4|PUM5_ARATH,THREEPED,167 230 | sp|B8DTV0|PUR7_BIFA0,GRILLADE,186 231 | sp|A9BDD9|PYRG_PROM4,HACIENDA,315 232 | sp|Q02099|RAD3_SCHPO,AVENTAIL,630 233 | sp|Q3AB99|RBFA_CARHZ,AGENESES,118 234 | sp|C1CWJ1|RF1_DEIDV,LARDERER,344 235 | sp|O28190|RFHPS_ARCFU,SNAGGIER,277 236 | sp|Q9U6Y8|RFP_DISSP,EASTERLY,144 237 | sp|A4WW77|RIMP_CERS5,DECADIST,56 238 | sp|P23408|RK22_PEA,SAGANASH,136 239 | sp|Q6MRX8|RL10_MYCMS,KAMAAINA,152 240 | sp|Q49ZE1|RL17_STAS1,SERVETTE,27 241 | sp|B5EHX2|RL25_CITBB,PIGTAILS,166 242 | sp|C6E500|RL25_GEOSM,PIGTAILS,166 243 | sp|Q9V1V6|RL30_PYRAB,KINDLIER,146 244 | sp|B1KHY7|RL9_SHEWM,LAVATERA,62 245 | sp|A8LLC1|RNH_DINSH,GALLIARD,22 246 | sp|A9AXK0|RNPA_HERA2,TAVERNRY,52 247 | sp|O27438|RPA_METTH,PREFERED,434 248 | sp|Q8KWX2|RPOB_EHRCR,MARKLAND,297 249 | sp|Q8EM52|RPOE_OCEIH,DELEADED,155 250 | sp|B4M416|RRF2M_DROVI,SINKLESS,218 251 | sp|O66928|RRF_AQUAE,ELEGISED,143 252 | sp|A2C451|RS16_PROM1,DATASETS,102 253 | sp|Q3A9S2|RS3_CARHZ,RIVIERAS,54 254 | sp|Q92QG4|RS3_RHIME,SERRATES,215 255 | 
sp|C3MAY6|RS3_SINFN,SERRATES,215 256 | sp|A6U865|RS3_SINMW,SERRATES,215 257 | sp|P54024|RS9_METJA,PILLAGEE,49 258 | sp|Q86VD7|S2542_HUMAN,GALAGALA,40 259 | sp|Q8R0Y8|S2542_MOUSE,GALAGALA,40 260 | sp|P0C546|S2542_RAT,GALAGALA,40 261 | sp|Q5F468|S38A2_CHICK,ADENITIS,266 262 | sp|Q96FL8|S47A1_HUMAN,REELRALL,32 263 | sp|Q5RFD2|S47A1_PONAB,REELRALL,32 264 | sp|A7KAU2|S47A1_RABIT,REELRALL,31 265 | sp|L0HB77|SBHS7_THYVU,TASSELER,499 266 | sp|Q9UQD0|SCN8A_HUMAN,SHREDDED,43 267 | sp|Q9WTU3|SCN8A_MOUSE,SHREDDED,43 268 | sp|O88420|SCN8A_RAT,SHREDDED,43 269 | sp|A1D3V8|SDS23_NEOFI,REVISING,276 270 | sp|A1DLN3|SEC16_NEOFI,DEPRAVED,112 271 | sp|Q6AJK1|SECA_DESPS,TRINDLES,26 272 | sp|C7NC37|SECD_LEPBD,DIALLING,261 273 | sp|Q9D7Y9|SLX4I_MOUSE,KEELHALE,353 274 | sp|Q8MNV7|SMAL1_CAEEL,GRILLADE,216 275 | sp|Q6IUP1|SOLH1_MOUSE,RESELLER,178 276 | sp|P17123|SPO12_YEAST,GEDANKEN,53 277 | sp|P0C586|SSY23_ORYSI,AVERAGED,79 278 | sp|Q0DDE3|SSY23_ORYSJ,AVERAGED,79 279 | sp|A5GPF9|SYA_SYNPW,RIESLING,621 280 | sp|B3CN54|SYC_WOLPP,HEMATEIN,362 281 | sp|A3PHK2|SYE1_CERS1,PELLEKAR,364 282 | sp|Q9ZFA3|SYE1_CERS4,PELLEKAR,364 283 | sp|A4WX62|SYE2_CERS5,PELLEKAR,364 284 | sp|Q8RHB5|SYFB_FUSNN,DIKESIDE,13 285 | sp|B3QS95|SYH_CHLT3,AGAPHITE,219 286 | sp|A7HM68|SYK_FERNB,GRIMSIRE,58 287 | sp|Q8WXH0|SYNE2_HUMAN,MISSPEAK,3319 288 | sp|A1TYU8|SYR_MARN8,VAALPENS,67 289 | sp|A1AVC2|SYR_RUTMC,SEALLIKE,483 290 | sp|Q0AND9|SYS_MARMM,ETERNALS,48 291 | sp|P17222|T1SP_ECOLX,ENSILIST,81 292 | sp|Q5VWN6|TASO2_HUMAN,TEEMLESS,635 293 | sp|Q46149|TCDA_CLONO,SIGFILES,1074 294 | sp|Q5UPT1|TF2B_MIMIV,EKISTICS,244 295 | sp|B5Z7K9|THIM_HELPG,LENSLIKE,251 296 | sp|Q1CT25|THIM_HELPH,LENSLIKE,251 297 | sp|Q9ZKZ9|THIM_HELPJ,LENSLIKE,250 298 | sp|B2USY3|THIM_HELPS,LENSLIKE,250 299 | sp|O25516|THIM_HELPY,LENSLIKE,250 300 | sp|Q0PDK7|TMP_BPSPP,FATAGAGA,111 301 | sp|P51743|TNFA_CEREL,CANALMAN,104 302 | sp|P78875|TPP1_SCHPO,TRINKETS,774 303 | sp|E2E2P2|TPS1D_ORIVU,TASSELER,503 304 | sp|Q5NPZ5|TRPF_ZYMMO,HETAERIA,85 305 | 
sp|O97399|TRYP_PHACE,DIALLELA,117 306 | sp|Q6PCN3|TTBK1_MOUSE,TEMESCAL,984 307 | sp|P59367|TX35C_PHONI,ARCADING,33 308 | sp|Q9VYV3|TXND5_DROME,LAKELIKE,207 309 | sp|Q9XZ16|UBCP1_DROME,STEADIED,79 310 | sp|B2RM62|UVRC_PORG3,LENSLIKE,80 311 | sp|Q7MTG8|UVRC_PORGI,LENSLIKE,79 312 | sp|Q8SQU9|VATA_ENCCU,DISKLIKE,509 313 | sp|Q97CP8|VATD_THEVO,VALERIAN,73 314 | sp|B5YFA5|VATE_DICT6,RIVERLET,5 315 | sp|P17284|VIF_SIVCZ,ALKALISE,150 316 | sp|B8I9N8|XERC_METNO,LALLYGAG,161 317 | sp|B0UNY7|XERC_METS4,LALLYGAG,161 318 | sp|Q6R7F2|Y077_OSHVF,ANTECELL,1119 319 | sp|Q9X0P5|Y1162_THEMA,TREMELLA,145 320 | sp|B7IVJ7|Y1177_BACC2,FLAKIEST,188 321 | sp|O67364|Y1349_AQUAE,FREAKIER,23 322 | sp|B1LCS9|Y1653_THESQ,TREMELLA,145 323 | sp|P47490|Y248_MYCGE,KRISTIAN,3 324 | sp|Q9K275|Y344_CHLPN,PLANILLA,110 325 | sp|A0RHX6|Y3586_BACAH,FLAKIEST,188 326 | sp|Q6HEK2|Y3705_BACHK,FLAKIEST,188 327 | sp|Q635W6|Y3720_BACCZ,FLAKIEST,188 328 | sp|B9IW40|Y3749_BACCQ,FLAKIEST,188 329 | sp|A9VUC1|Y3786_BACMK,FLAKIEST,188 330 | sp|Q81MS4|Y3872_BACAN,FLAKIEST,188 331 | sp|Q819L6|Y3960_BACCR,FLAKIEST,188 332 | sp|B7JKT4|Y3975_BACC0,FLAKIEST,188 333 | sp|Q732A7|Y4007_BACC1,FLAKIEST,188 334 | sp|B7H6U7|Y4063_BACC4,FLAKIEST,188 335 | sp|C1EPX4|Y4065_BACC3,FLAKIEST,188 336 | sp|B7HME5|Y4079_BACC7,FLAKIEST,188 337 | sp|C3P6W7|Y4195_BACAA,FLAKIEST,188 338 | sp|C3LI26|Y4213_BACAC,FLAKIEST,188 339 | sp|Q8LDV3|Y4320_ARATH,VINERIES,85 340 | sp|P75197|Y583_MYCPN,STETTING,170 341 | sp|Q5UPM1|YL149_MIMIV,INSTINCT,140 342 | sp|P42545|YO10_BPL2,GANTLINE,65 343 | sp|Q3ZC82|ZC3HE_BOVIN,RELAPSED,163 344 | -------------------------------------------------------------------------------- /uniprot_words/data/word_matches_fi.csv: -------------------------------------------------------------------------------- 1 | .id,Keyword,Offset 2 | sp|O29876|OGG1_ARCFU,STANSSATA,47 3 | sp|Q92R46|ERA_RHIME,AIKAISTAA,265 4 | sp|A6U7A9|ERA_SINMW,AIKAISTAA,262 5 | sp|A5GWN4|GCSP_SYNR3,TAVALLAAN,775 6 | sp|P36337|GH_MEHV1,NARRAILLA,488 7 | 
sp|A5CCZ2|HTPG_ORITB,TIIKKINEN,574 8 | sp|Q74MI1|SYC_NANEQ,TAKSIVENE,324 9 | sp|A5D7S3|TRM1L_BOVIN,SADETAKKI,480 10 | sp|Q7Z2T5|TRM1L_HUMAN,SADETAKKI,474 11 | sp|Q4R6C7|TRM1L_MACFA,SADETAKKI,434 12 | sp|A2RSY6|TRM1L_MOUSE,SADETAKKI,469 13 | sp|Q5R5T0|TRM1L_PONAB,SADETAKKI,474 14 | sp|Q496Z9|TRM1L_RAT,SADETAKKI,465 15 | sp|O29015|Y1253_ARCFU,KATKEILLA,25 16 | sp|Q9VW60|ADCY2_DROME,ASIAMIES,773 17 | sp|P02656|APOC3_HUMAN,VALLALLA,9 18 | sp|P33622|APOC3_MOUSE,VALLALLA,9 19 | sp|A9F3R4|ATPB_SORC5,VISKAALI,210 20 | sp|O05098|ATPF_CLOAB,VALTIKKA,41 21 | sp|O89001|CBPD_MOUSE,SADANNES,873 22 | sp|A7ZTS1|CBRB_ECO24,RIIPALLA,125 23 | sp|A8A6H8|CBRB_ECOHS,RIIPALLA,125 24 | sp|A1AHP9|CBRB_ECOK1,RIIPALLA,125 25 | sp|Q0TAZ2|CBRB_ECOL5,RIIPALLA,125 26 | sp|Q8FBU5|CBRB_ECOL6,RIIPALLA,125 27 | sp|P31468|CBRB_ECOLI,RIIPALLA,123 28 | sp|Q1R4L9|CBRB_ECOUT,RIIPALLA,125 29 | sp|Q0SYQ7|CBRB_SHIF8,RIIPALLA,125 30 | sp|Q83IZ9|CBRB_SHIFL,RIIPALLA,125 31 | sp|Q3YWJ8|CBRB_SHISS,RIIPALLA,125 32 | sp|Q8N326|CJ111_HUMAN,VALSSATA,44 33 | sp|A9TKY8|CSPL1_PHYPA,SAASTATA,133 34 | sp|Q98JM5|DPO42_RHILO,VERISIDE,95 35 | sp|B3NB67|EI3F2_DROER,ALHAALTA,232 36 | sp|Q4PI64|EIF3H_USTMA,PAPATTAA,17 37 | sp|P0CN57|EIF3L_CRYNB,PAPATTAA,586 38 | sp|P0CN56|EIF3L_CRYNJ,PAPATTAA,586 39 | sp|A5VAA8|FOLD3_RHIWR,ASETELLA,80 40 | sp|A7GKK3|GATA_BACCN,ALATYYLI,304 41 | sp|Q1DCA3|GATA_MYXXD,ALATYYLI,305 42 | sp|Q2FTL0|HEM1_METHJ,KAADELLA,335 43 | sp|B8GP02|IF2_THISH,AIKAPELI,289 44 | sp|Q6NFC2|ISPF_CORDI,GRAAVATA,149 45 | sp|Q8SQP0|KPYK_ENCCU,REKIKELI,19 46 | sp|B0D8R3|MKAR_LACBS,ALLASTAA,97 47 | sp|Q6NHQ7|MNMA_CORDI,ALLERGIA,109 48 | sp|Q8FQ01|MNMA_COREF,ALLERGIA,109 49 | sp|A4QDK1|MNMA_CORGB,ALLERGIA,109 50 | sp|Q8NR24|MNMA_CORGL,ALLERGIA,109 51 | sp|Q05049|MUC1_XENLA,TAPATTAA,72 52 | sp|Q0AYR3|MURE_SYNWW,ALALLAAN,87 53 | sp|Q3SYU9|MVP_BOVIN,KARRELLE,701 54 | sp|O29876|OGG1_ARCFU,TANSSATA,48 55 | sp|P52591|PO121_RAT,TAPATTAA,768 56 | sp|C9JH25|PRRT4_HUMAN,VALLALLA,377 57 | sp|B2RU40|PRRT4_MOUSE,VALLALLA,378 58 | 
sp|C5A7L1|PSB1_THEGJ,ALALLEEN,144 59 | sp|A0A494C071|PWWP4_HUMAN,STARTATA,1054 60 | sp|A5CCK6|RS3_ORITB,KIINTEYS,42 61 | sp|B3CT11|RS3_ORITI,KIINTEYS,43 62 | sp|Q8D3I1|RSMA_WIGBR,KIIKKIIN,17 63 | sp|Q21MH7|RSMH_SACD2,KAIVERRE,169 64 | sp|Q6FFZ7|RUTA_ACIAD,VARMASTI,97 65 | sp|B9JLT9|RUTA_AGRRK,VARMASTI,97 66 | sp|B0SW63|RUTA_CAUSK,VARMASTI,97 67 | sp|D5VGV4|RUTA_CAUST,VARMASTI,97 68 | sp|Q9A4N2|RUTA_CAUVC,VARMASTI,97 69 | sp|B8H1Q4|RUTA_CAUVN,VARMASTI,97 70 | sp|A7ME52|RUTA_CROS8,VARMASTI,74 71 | sp|C9Y0S7|RUTA_CROTZ,VARMASTI,113 72 | sp|A4W925|RUTA_ENT38,VARMASTI,97 73 | sp|D5CE32|RUTA_ENTCC,VARMASTI,97 74 | sp|B5XXN0|RUTA_KLEP3,VARMASTI,97 75 | sp|A6T7A2|RUTA_KLEP7,VARMASTI,97 76 | sp|D3RKL0|RUTA_KLEVT,VARMASTI,97 77 | sp|B7KWT7|RUTA_METC4,VARMASTI,109 78 | sp|C5B0U9|RUTA_METEA,VARMASTI,97 79 | sp|C7CM36|RUTA_METED,VARMASTI,97 80 | sp|A9W3I1|RUTA_METEP,VARMASTI,105 81 | sp|B1ZB15|RUTA_METPB,VARMASTI,109 82 | sp|A8GCT6|RUTA_SERP5,VARMASTI,97 83 | sp|A4VQH4|RUTA_STUS1,VARMASTI,97 84 | sp|C5CN79|RUTA_VARPS,VARMASTI,97 85 | sp|A1JMY1|RUTA_YERE8,VARMASTI,97 86 | sp|Q4FTT9|RUVB_PSYA2,NIRPALLA,24 87 | sp|Q1QCY5|RUVB_PSYCK,NIRPALLA,24 88 | sp|Q52428|SYD_THEKO,KYMMENEN,315 89 | sp|Q5E8Y6|SYGA_ALIF1,KESKELLE,229 90 | sp|B5FEV9|SYGA_ALIFM,KESKELLE,229 91 | sp|Q87TP7|SYGA_VIBPA,KESKELLE,229 92 | sp|P67600|SYV_MYCBO,KELASTAA,838 93 | sp|Q9CBY7|SYV_MYCLE,KELASTAA,838 94 | sp|P9WFS8|SYV_MYCTO,KELASTAA,838 95 | sp|P9WFS9|SYV_MYCTU,KELASTAA,838 96 | sp|A3LPG0|TRM82_PICST,LISENSSI,373 97 | sp|Q5NPZ7|TRPA_ZYMMO,KENRAALI,13 98 | sp|P42664|UVS2_XENLA,KISAILLA,4 99 | sp|B8DHJ9|Y1020_LISMH,KASKIMAA,78 100 | sp|A8AG56|Y1332_CITK8,ILMAILLA,152 101 | sp|Q8Y6Y2|Y1549_LISMO,KASKIMAA,78 102 | sp|C1KVJ6|Y1560_LISMC,KASKIMAA,78 103 | sp|A0AIZ8|Y1562_LISW6,KASKIMAA,78 104 | sp|Q71ZC0|Y1569_LISMF,KASKIMAA,78 105 | sp|Q92BG5|Y1584_LISIN,KASKIMAA,78 106 | sp|Q5UQE2|YR474_MIMIV,LINTSARI,44 107 | sp|P54992|YSNA_STRPR,PAPATTAA,177 108 | sp|Q551M4|ZFPL1_DICDI,KIINNIKE,275 109 | 
sp|Q6WRX3|ZY11A_HUMAN,KAKISTAA,110 110 | -------------------------------------------------------------------------------- /uniprot_words/data/word_matches_fr.csv: -------------------------------------------------------------------------------- 1 | .id,Keyword,Offset 2 | sp|P40069|IMB4_YEAST,FERRAILLAI,371 3 | sp|A1R703|AROB_PAEAT,RELIERAIS,220 4 | sp|P02537|K1C0_XENLA,HALETANTE,145 5 | sp|Q98SL1|LDHB_CAICA,LITHIASES,8 6 | sp|Q93YQ3|PURU1_ARATH,IRRITERAS,2 7 | sp|A9BDD9|PYRG_PROM4,HACIENDAS,315 8 | sp|A5EW94|SECB_DICNV,REVISSAIT,114 9 | sp|P40069|IMB4_YEAST,FERRAILLA,371 10 | sp|O70576|STAG3_MOUSE,RAMASSAGE,453 11 | sp|Q99M76|STAG3_RAT,RAMASSAGE,453 12 | sp|Q8T664|ABCH2_DICDI,ERRERAIS,156 13 | sp|Q1CY84|SAHH_MYXXD,PALMAIRE,35 14 | sp|Q8T664|ABCH2_DICDI,SERRERAI,155 15 | sp|Q8TGA2|AFLA_ASPPU,DAMASSAI,242 16 | sp|P22197|ALFC7_ARATH,INVENTES,43 17 | sp|Q1AS71|ALLB_RUBXD,GRAILLAS,222 18 | sp|A0A2H3CSB7|ARMOM_ARMGA,VIEILLES,378 19 | sp|A0JX82|AROB_ARTS2,RELIERAI,220 20 | sp|A1R703|AROB_PAEAT,RELIERAI,220 21 | sp|B8H8V5|AROB_PSECP,RELIERAI,220 22 | sp|Q47QY7|AROB_THEFY,RELIERAI,216 23 | sp|O94649|ATG2_SCHPO,RATISSAI,1174 24 | sp|Q24MN7|ATPF_DESHY,SALADIER,107 25 | sp|O52587|BIOD_MYCBO,SALARIAT,184 26 | sp|A1KJ01|BIOD_MYCBP,SALARIAT,184 27 | sp|C1ANK7|BIOD_MYCBT,SALARIAT,184 28 | sp|P9WPQ4|BIOD_MYCTO,SALARIAT,184 29 | sp|Q6J6I8|BRCA1_GORGO,PELTASTE,1637 30 | sp|P38398|BRCA1_HUMAN,PELTASTE,1637 31 | sp|Q9GKK8|BRCA1_PANTR,PELTASTE,1637 32 | sp|Q6J6J0|BRCA1_PONPY,PELTASTE,1637 33 | sp|Q9H0E9|BRD8_HUMAN,SELLETTE,69 34 | sp|Q8R3B7|BRD8_MOUSE,SELLETTE,69 35 | sp|Q04520|BUDC_RAOTE,AGGRAVAI,49 36 | sp|P93147|C81E1_GLYEC,GLAIRAIS,440 37 | sp|Q9LSE1|CDG1_ARATH,CAPEYANT,244 38 | sp|Q5U3Z0|CF298_RAT,REPLISSE,230 39 | sp|Q22516|CHD3_CAEEL,CRICKETS,331 40 | sp|Q9ZV43|CHR8_ARATH,RAFLASSE,674 41 | sp|Q9ZPR0|COQ4_ARATH,GRAILLER,61 42 | sp|Q7F2E4|CSB_ORYSJ,RAFLASSE,660 43 | sp|B2RX88|CSPP1_MOUSE,ARRANGER,254 44 | sp|C3PNF5|DAPA_RICAE,NICKELLE,214 45 | sp|A8EYZ4|DAPA_RICCK,NICKELLE,214 46 | 
sp|Q92I25|DAPA_RICCN,NICKELLE,214 47 | sp|A8F1K3|DAPA_RICM5,NICKELLE,220 48 | sp|Q9AKQ3|DAPA_RICMO,NICKELLE,214 49 | sp|C4K288|DAPA_RICPU,NICKELLE,214 50 | sp|Q9AKJ9|DAPA_RICRI,NICKELLE,214 51 | sp|B0BXJ1|DAPA_RICRO,NICKELLE,214 52 | sp|A8GS25|DAPA_RICRS,NICKELLE,214 53 | sp|D0PV95|DDX3_CAEEL,GARDERIE,191 54 | sp|C5DGU9|DEF1_LACTC,NARRERAS,171 55 | sp|O62215|DHSD_CAEEL,SAPRISTI,20 56 | sp|Q14185|DOCK1_HUMAN,IMMENSES,661 57 | sp|Q8BUR4|DOCK1_MOUSE,IMMENSES,661 58 | sp|Q0A5C9|DTD_ALKEH,ASPERGEA,97 59 | sp|A1JIQ4|EPMA_YERE8,SLAVISTE,221 60 | sp|Q5M9G9|FAKD4_RAT,INSTALLE,455 61 | sp|Q5YR85|FLUC2_NOCFA,PAILLAIS,12 62 | sp|P43708|FTN2_HAEIN,SLAVISAI,66 63 | sp|O83746|FTSH_TREPA,RAVAGEAS,192 64 | sp|A3PCW7|G6PI_PROM0,RADINAIS,89 65 | sp|A0A1D8PNP3|GAP6_CANAL,PLAGIATS,389 66 | sp|Q8Y3C6|GATB_RALSO,GAVERAIT,54 67 | sp|Q92538|GBF1_HUMAN,PISSASSE,273 68 | sp|Q2SFI6|GCSP_HAHCH,ALLAITAS,342 69 | sp|Q5Z175|GLPK_NOCFA,GLISSADE,317 70 | sp|Q03877|GP85_TRYCR,ATLANTES,538 71 | sp|P08492|HN_PI3H4,AGNELETS,13 72 | sp|P12562|HN_PI3HT,AGNELETS,13 73 | sp|P12563|HN_PI3HU,AGNELETS,13 74 | sp|P12564|HN_PI3HV,AGNELETS,13 75 | sp|P12565|HN_PI3HW,AGNELETS,13 76 | sp|P12566|HN_PI3HX,AGNELETS,13 77 | sp|C5DYQ1|INA17_ZYGRC,INERTIEL,64 78 | sp|P0CO17|INO80_CRYNB,REDEVRAI,241 79 | sp|P0CO16|INO80_CRYNJ,REDEVRAI,241 80 | sp|P0A1I4|INVA_SALTI,PALLIAIS,244 81 | sp|P0A1I3|INVA_SALTY,PALLIAIS,244 82 | sp|O94854|K0754_HUMAN,AGNELLES,2413 83 | sp|P02537|K1C0_XENLA,HALETANT,145 84 | sp|Q88Z42|KUP1_LACPL,PALPITER,651 85 | sp|A1JU76|LCRD_YERE8,PALLIAIT,247 86 | sp|P0C2V3|LCRD_YEREN,PALLIAIT,247 87 | sp|P69955|LCRD_YERPE,PALLIAIT,247 88 | sp|P69956|LCRD_YERPS,PALLIAIT,247 89 | sp|Q98SL1|LDHB_CAICA,LITHIASE,8 90 | sp|Q9SRX6|LEA2_ARATH,RAGEASSE,48 91 | sp|Q6MEF3|LEPA_PARUW,RETIRAIT,545 92 | sp|Q9UPN3|MACF1_HUMAN,AGNELLES,6286 93 | sp|D3ZHV2|MACF1_RAT,AGNELLES,4328 94 | sp|O14323|MCP4_SCHPO,PAVASSES,192 95 | sp|P0DQK9|MDS1_AGALE,PLAISAIS,6 96 | sp|L0P329|MDS_AGACL,PLAISAIS,55 97 | 
sp|L0P3K3|MDS_AGADC,PLAISAIS,55 98 | sp|A0A5Q0MU22|MDS_AGASP,PLAISAIS,55 99 | sp|Q5YRD1|METN_NOCFA,GRIVELAS,222 100 | sp|Q9ZE90|MNMG_RICPR,FILTRATS,397 101 | sp|Q68XT0|MNMG_RICTY,FILTRATS,397 102 | sp|B2VDB1|MRAZ_ERWT9,GRILLANT,90 103 | sp|A3CR17|MUTS_STRSV,GLISSAIS,392 104 | sp|B7VK59|MUTS_VIBA3,SELLERAI,403 105 | sp|Q9Y2K3|MYH15_HUMAN,GALERNES,1561 106 | sp|Q5VU43|MYOME_HUMAN,AVALERAI,485 107 | sp|Q5DTJ9|MYPN_MOUSE,TERRERAS,201 108 | sp|Q606N2|NAGZ_METCA,DALLASSE,311 109 | sp|Q9Y618|NCOR2_HUMAN,GRAISSAS,1319 110 | sp|Q5JPE7|NOMO2_HUMAN,FASEILLE,1252 111 | sp|Q9RL35|NPD1_STRCO,GAGISTES,39 112 | sp|Q8R984|NPD2_CALS4,GAGISTES,28 113 | sp|A8MBU4|NPD_CALMQ,GAGISTES,28 114 | sp|Q6N6U0|NPD_RHOPA,GAGISTES,28 115 | sp|B5YJW3|NPD_THEYD,GAGISTES,27 116 | sp|Q750J0|NPR3_ASHGO,REPASSAI,826 117 | sp|Q9V463|NU154_DROME,VESTALES,453 118 | sp|A2VDP6|NXPE3_BOVIN,GRISETTE,217 119 | sp|Q969Y0|NXPE3_HUMAN,GRISETTE,217 120 | sp|Q5RCA5|NXPE3_PONAB,GRISETTE,217 121 | sp|P24102|PER22_ARATH,AFFALANT,158 122 | sp|O80912|PER23_ARATH,AFFALANT,158 123 | sp|Q9LHB9|PER32_ARATH,AFFALANT,158 124 | sp|Q8CHS4|PLCX1_MOUSE,VAGINITE,271 125 | sp|Q9FZD1|PPR58_ARATH,ASPIRAIS,25 126 | sp|A4YKF1|PROA_BRASO,AMERRIRA,46 127 | sp|P50852|PTMCB_GEOSE,PLANIFIE,181 128 | sp|B8DTV0|PUR7_BIFA0,GRILLADE,186 129 | sp|Q0VRD0|PURT_ALCBS,REVALAIT,252 130 | sp|Q93YQ3|PURU1_ARATH,IRRITERA,2 131 | sp|A9BDD9|PYRG_PROM4,HACIENDA,315 132 | sp|P20742|PZP_HUMAN,PRISASSE,91 133 | sp|Q5NP84|QUEA_ZYMMO,VEILLERA,133 134 | sp|Q5BPM6|QWRF6_ARATH,ALLAITES,316 135 | sp|Q8U4J3|RFCS_PYRFU,INVERTIE,462 136 | sp|P74240|RIR1_SYNY3,REGISTRE,75 137 | sp|Q6N4R7|RL10_RHOPA,RELAVERA,11 138 | sp|Q15RL2|RNFD_PSEA6,TAILLAIS,81 139 | sp|A1SSX2|RNFD_PSYIN,TAILLAIS,81 140 | sp|Q3J9L3|RNH2_NITOC,LAMERAIS,94 141 | sp|C0QB17|RS2_DESAH,AVERTIES,254 142 | sp|Q92QG4|RS3_RHIME,SERRATES,215 143 | sp|C3MAY6|RS3_SINFN,SERRATES,215 144 | sp|A6U865|RS3_SINMW,SERRATES,215 145 | sp|Q0ABH9|RS7_ALKEH,GALERIES,45 146 | sp|Q1CY84|SAHH_MYXXD,EMPALMAI,33 147 | 
sp|O94855|SC24D_HUMAN,VIENDRAS,270 148 | sp|A5EW94|SECB_DICNV,REVISSAI,114 149 | sp|Q9W6G6|SEM3D_DANRE,PAIRESSE,697 150 | sp|Q8E3Y3|SERC_STRA3,PILLASSE,119 151 | sp|Q8DSV3|SERC_STRMU,PILLASSE,119 152 | sp|A3CPJ2|SERC_STRSV,PILLASSE,119 153 | sp|Q5LYP0|SERC_STRT1,PILLASSE,119 154 | sp|Q5M3A4|SERC_STRT2,PILLASSE,119 155 | sp|Q03JH6|SERC_STRTD,PILLASSE,119 156 | sp|B9DTW4|SERC_STRU0,PILLASSE,119 157 | sp|Q20480|SIR41_CAEEL,GAGISTES,35 158 | sp|Q20481|SIR42_CAEEL,GAGISTES,35 159 | sp|Q1JQC6|SIR4_BOVIN,GAGISTES,63 160 | sp|Q8IRR5|SIR4_DROME,GAGISTES,53 161 | sp|Q9Y6E7|SIR4_HUMAN,GAGISTES,62 162 | sp|Q8R216|SIR4_MOUSE,GAGISTES,59 163 | sp|Q9Z0I7|SLFN1_MOUSE,ALCALINS,43 164 | sp|Q8MNV7|SMAL1_CAEEL,GRILLADE,216 165 | sp|A5GPF9|SYA_SYNPW,RIESLING,621 166 | sp|Q8RB93|SYE1_CALS4,VARIERAI,465 167 | sp|Q4QL12|SYR_HAEI8,INSTALLA,476 168 | sp|A5UCH1|SYR_HAEIE,INSTALLA,476 169 | sp|A5UJ40|SYR_HAEIG,INSTALLA,476 170 | sp|P43832|SYR_HAEIN,INSTALLA,476 171 | sp|A9HLG2|SYS_GLUDA,ALARMERA,168 172 | sp|Q2RSR3|SYY_RHORT,SPLITTAS,226 173 | sp|Q8RI63|THIG_FUSNN,AIMANTAI,200 174 | sp|Q8DUR1|THII_STRMU,VAGINITE,379 175 | sp|Q5JTD0|TJAP1_HUMAN,PASSASSE,400 176 | sp|Q9DCD5|TJAP1_MOUSE,PASSASSE,395 177 | sp|P29463|TPT_SOLTU,PAILLETS,67 178 | sp|C0H537|TRM5_PLAF7,NIELLAGE,409 179 | sp|B3L2G0|TRM5_PLAKH,NIELLAGE,356 180 | sp|Q7UKG9|TRPB_RHOBA,FERLASSE,349 181 | sp|Q9Z4S7|TTRC_SALTY,RALLIAIT,56 182 | sp|Q62377|U2AFM_MOUSE,SERRERAS,365 183 | sp|A4XUW4|UVRC_PSEMY,SALAIRES,134 184 | sp|Q9LTT9|VCR_ARATH,RETISSAS,1149 185 | sp|Q8GYF5|WAKLR_ARATH,SASSERAS,277 186 | sp|Q5UPM1|YL149_MIMIV,INSTINCT,140 187 | -------------------------------------------------------------------------------- /uniprot_words/data/word_matches_it.csv: -------------------------------------------------------------------------------- 1 | .id,Keyword,Offset 2 | sp|P05098|PHEA_MICDP,ANNIDAVATE,45 3 | sp|Q8NTW4|AFTA_CORGL,TRAVASATI,61 4 | sp|Q12659|ARO1_PNECA,DIRIGISTI,1484 5 | sp|Q8W4K3|CAAT4_ARATH,GIALLICCI,264 6 | 
sp|Q4PB37|CLF1_USTMA,SALASSARE,623 7 | sp|Q65T53|CYSJ_MANSM,ALLEVIARE,563 8 | sp|B3PJ06|HTPX_CELJU,AFFITTIVA,194 9 | sp|B4LQY8|INT3_DROVI,VESSERETE,538 10 | sp|A2RNZ6|LYSP_LACLM,ALLATTAVA,371 11 | sp|Q5AZ53|MANC_EMENI,ALLATTATA,10 12 | sp|A4SV75|MURC_POLAQ,AVVISTAVA,67 13 | sp|B1XT10|MURC_POLNS,AVVISTAVA,67 14 | sp|B1I4X2|MURI_DESAP,GRIGLIATE,110 15 | sp|D6VTK4|STE2_YEAST,STILLASSI,207 16 | sp|P0CI39|STE2_YEASX,STILLASSI,207 17 | sp|Q0KL02|TRIO_MOUSE,SEGHERETE,2402 18 | sp|F1M0Z1|TRIO_RAT,SEGHERETE,2403 19 | sp|Q2HA54|PLPL_CHAGB,PARLARMI,688 20 | sp|Q9LJX0|AB19B_ARATH,SALDASSE,532 21 | sp|Q8T664|ABCH2_DICDI,SERRERAI,155 22 | sp|Q8NQ98|ACNA_CORGL,GIRAVITE,831 23 | sp|Q6ZDQ1|AGM1_ORYSJ,NAVIGAVA,158 24 | sp|D4B1B1|ALS1_ARTBC,STIPASTI,364 25 | sp|P47631|AMPA_MYCGE,DIASTASI,418 26 | sp|P75206|AMPA_MYCPN,DIASTASI,416 27 | sp|A1SRZ1|AMPA_PSYIN,INVIGILA,312 28 | sp|Q12QW7|AMPA_SHEDO,INVIGILA,313 29 | sp|Q086N8|AMPA_SHEFN,INVIGILA,313 30 | sp|Q9CIQ1|AMPN_LACLA,RELEGAVA,706 31 | sp|P0C2T8|AMPN_LACLC,RELEGAVA,706 32 | sp|A2RI32|AMPN_LACLM,RELEGAVA,706 33 | sp|P45461|AMPR_YEREN,GIALAPPA,233 34 | sp|Q75A82|ANT1_ASHGO,NAVIGATA,5 35 | sp|P40532|APQ12_YEAST,INALEREI,115 36 | sp|P77624|ARCM_ECOLI,STALLARE,219 37 | sp|B2VBI7|ARNT_ERWT9,ALLAGAVI,420 38 | sp|C1KWM3|AROA_LISMC,RIDAVATE,345 39 | sp|Q71Y92|AROA_LISMF,RIDAVATE,345 40 | sp|B8DC03|AROA_LISMH,RIDAVATE,345 41 | sp|Q8Y5Y0|AROA_LISMO,RIDAVATE,345 42 | sp|A0AK35|AROA_LISW6,RIDAVATE,345 43 | sp|Q9V1H6|AROK_PYRAB,SGRASSAI,4 44 | sp|P30329|ARSB_STAAU,DIGITALI,24 45 | sp|Q8CQF4|ARSB_STAES,DIGITALI,24 46 | sp|Q01255|ARSB_STAXY,DIGITALI,24 47 | sp|P73241|ATCS_SYNY3,CASSIERA,17 48 | sp|Q9LSW9|ATL16_ARATH,VIGILATA,41 49 | sp|P93823|ATL1_ARATH,VIGILATA,48 50 | sp|C8V3Y7|ATND_EMENI,SEGHETTA,172 51 | sp|Q13XV9|ATPA1_PARXL,APPAIATE,446 52 | sp|Q0C0X0|ATPF_HYPNA,RITRAETE,140 53 | sp|Q9KNG8|ATPZ_VIBCH,ALGINICA,113 54 | sp|F4KBM7|AVT6B_ARATH,TAGLIAVI,367 55 | sp|P25927|BIGA_SALTY,VEGGENTI,1297 56 | sp|P33144|BIMB_EMENI,SVARIATI,446 57 | 
sp|Q4I7N9|BRE1_GIBZE,SARDELLA,373 58 | sp|Q7S304|BRE1_NEUCR,SARDELLA,392 59 | sp|Q9Z1S0|BUB1B_MOUSE,SALVERAI,93 60 | sp|Q04520|BUDC_RAOTE,AGGRAVAI,49 61 | sp|O74794|CCHL_SCHPO,REAGENTE,268 62 | sp|Q6UY09|CEA20_HUMAN,VIGILAVI,452 63 | sp|Q8BI06|CEMIP_MOUSE,SGASSAVA,26 64 | sp|Q7SEY2|CFT1_NEUCR,VANGASTI,237 65 | sp|A3KFM7|CHD6_MOUSE,VEGETAVI,1976 66 | sp|Q7U5I4|CHLN_PARMW,ERARIALE,270 67 | sp|Q6CJI9|CHO2_KLULA,ECCITATI,58 68 | sp|D3Z7H8|CILP2_MOUSE,PRAGHESE,872 69 | sp|Q99LJ5|CKLF3_MOUSE,VISITAVA,115 70 | sp|Q9H9A5|CNO10_HUMAN,TESSESSE,495 71 | sp|Q4R350|CNO10_MACFA,TESSESSE,495 72 | sp|Q8BH15|CNO10_MOUSE,TESSESSE,495 73 | sp|Q62GU3|COAE_BURMA,AMPLIARE,45 74 | sp|Q3JNF3|COAE_BURP1,AMPLIARE,45 75 | sp|I1WFB9|COAE_BURP2,AMPLIARE,45 76 | sp|P0DMK3|COAE_BURPS,AMPLIARE,45 77 | sp|Q6NG92|COBS_CORDI,APPARARE,109 78 | sp|A9AYY2|COBS_HERA2,AVVIVATI,200 79 | sp|Q22498|COPG_CAEEL,RIALTESI,629 80 | sp|Q01331|CRTY_PSEVU,ALLAGAVI,126 81 | sp|A4SDU0|CYSD_CHLPM,VESSASTI,258 82 | sp|Q8EAZ9|CYSJ_SHEON,SASSELLA,341 83 | sp|P71128|CYSM_CAMJE,ISLAMICA,73 84 | sp|B2SD81|DAP_BRUA1,ADERISTI,302 85 | sp|Q2YLB4|DAP_BRUA2,ADERISTI,302 86 | sp|Q579G4|DAP_BRUAB,ADERISTI,302 87 | sp|A9MCM7|DAP_BRUC2,ADERISTI,302 88 | sp|C0RM93|DAP_BRUMB,ADERISTI,302 89 | sp|Q8YD27|DAP_BRUME,ADERISTI,302 90 | sp|A5VVL2|DAP_BRUO2,ADERISTI,302 91 | sp|A9WVV8|DAP_BRUSI,ADERISTI,302 92 | sp|Q8FV99|DAP_BRUSU,ADERISTI,302 93 | sp|Q6BLM5|DBP9_DEBHA,DESTEREI,87 94 | sp|Q3AS55|DDL_CHLCH,ADAGIAVA,150 95 | sp|B4SAI2|DDL_PELPB,ADAGIAVA,150 96 | sp|A4ZZ93|DHYSL_LEIDO,FISSAGGI,138 97 | sp|Q9ZIV1|DNAK_MEGEL,PENTISSI,62 98 | sp|A2WZI4|DRE1F_ORYSI,SPEDIRLA,118 99 | sp|Q8S9Z5|DRE1F_ORYSJ,SPEDIRLA,118 100 | sp|P0CN13|DXO_CRYNB,FERRIERE,59 101 | sp|P0CN12|DXO_CRYNJ,FERRIERE,59 102 | sp|Q83I20|DXS_TROW8,REDIVIVA,510 103 | sp|Q83G46|DXS_TROWT,REDIVIVA,510 104 | sp|Q9P225|DYH2_HUMAN,LINGERIA,2190 105 | sp|P0C6F1|DYH2_MOUSE,LINGERIA,2219 106 | sp|Q7SBU6|EAF1_NEUCR,PAPPASSI,137 107 | sp|Q5L764|EFTS_CHLAB,PEDALARE,202 108 | 
sp|Q6PFQ2|EIF3C_DANRE,IFIGENIA,418 109 | sp|Q74FS7|END4_GEOSL,VILLETTA,141 110 | sp|Q80X91|F110D_MOUSE,RAPPRESE,263 111 | sp|B1MKD7|FABH_MYCA9,TRINELLA,264 112 | sp|A1STW1|FABH_PSYIN,AVVISATE,163 113 | sp|Q6LTK3|FADJ_PHOPR,SIFFATTA,294 114 | sp|A1S7L6|FADJ_SHEAM,SIFFATTE,288 115 | sp|A3D684|FADJ_SHEB5,SIFFATTE,288 116 | sp|A6WQ25|FADJ_SHEB8,SIFFATTE,288 117 | sp|A3QFP3|FADJ_SHELP,SIFFATTE,288 118 | sp|Q8ECP7|FADJ_SHEON,SIFFATTE,288 119 | sp|A4Y897|FADJ_SHEPC,SIFFATTE,288 120 | sp|A0KV76|FADJ_SHESA,SIFFATTE,288 121 | sp|Q0HKD1|FADJ_SHESM,SIFFATTE,288 122 | sp|Q0HWN3|FADJ_SHESR,SIFFATTE,288 123 | sp|A1RI92|FADJ_SHESW,SIFFATTE,288 124 | sp|A7MS61|FADJ_VIBC1,SIFFATTE,289 125 | sp|A5F2P2|FADJ_VIBC3,SIFFATTE,292 126 | sp|Q9KT58|FADJ_VIBCH,SIFFATTE,292 127 | sp|Q87MM3|FADJ_VIBPA,SIFFATTE,289 128 | sp|Q8DB47|FADJ_VIBVU,SIFFATTE,289 129 | sp|Q7MIS5|FADJ_VIBVY,SIFFATTE,289 130 | sp|B1MIT2|FGD_MYCA9,VELAVATE,22 131 | sp|O83710|FLHB_TREPA,TRATTASI,84 132 | sp|P9WES5|FOGB_ASPRC,CALAFATE,20 133 | sp|Q5FIU5|FTHS2_LACAC,AVVIVATA,322 134 | sp|C3P858|FTHS_BACAA,AVVIVATI,330 135 | sp|C3LKJ6|FTHS_BACAC,AVVIVATI,330 136 | sp|A0RD97|FTHS_BACAH,AVVIVATI,330 137 | sp|Q81RE1|FTHS_BACAN,AVVIVATI,330 138 | sp|B7JLG8|FTHS_BACC0,AVVIVATI,330 139 | sp|Q739F4|FTHS_BACC1,AVVIVATI,330 140 | sp|B7IUA4|FTHS_BACC2,AVVIVATI,330 141 | sp|C1ES77|FTHS_BACC3,AVVIVATI,330 142 | sp|B7HP29|FTHS_BACC7,AVVIVATI,330 143 | sp|B9IYP4|FTHS_BACCQ,AVVIVATI,330 144 | sp|Q81E87|FTHS_BACCR,AVVIVATI,330 145 | sp|Q63C61|FTHS_BACCZ,AVVIVATI,330 146 | sp|Q6HJK9|FTHS_BACHK,AVVIVATI,330 147 | sp|Q891R3|FTHS_CLOTE,AVVIVATI,327 148 | sp|Q834D6|FTHS_ENTFA,AVVIVATI,324 149 | sp|Q88W76|FTHS_LACPL,AVVIVATI,320 150 | sp|Q03S45|FTHS_LEVBA,AVVIVATI,320 151 | sp|Q83WS0|FTHS_METEA,AVVIVATI,324 152 | sp|A9VZT0|FTHS_METEP,AVVIVATI,324 153 | sp|B8EKB9|FTHS_METSB,AVVIVATI,325 154 | sp|A9WMW3|FTHS_RENSM,AVVIVATI,334 155 | sp|Q59925|FTHS_STRMU,AVVIVATI,324 156 | sp|Q3APF2|GATB_CHLCH,PARLAGLI,367 157 | sp|Q1D651|GLGE_MYXXD,ALLEGAVA,126 158 | 
sp|B2S889|GLO2_BRUA1,ESALTATI,28 159 | sp|Q2YLU8|GLO2_BRUA2,ESALTATI,25 160 | sp|Q57AW2|GLO2_BRUAB,ESALTATI,25 161 | sp|A9M8S2|GLO2_BRUC2,ESALTATI,28 162 | sp|C0RFI1|GLO2_BRUMB,ESALTATI,28 163 | sp|Q8YJF4|GLO2_BRUME,ESALTATI,25 164 | sp|A5VSR1|GLO2_BRUO2,ESALTATI,25 165 | sp|B0CIU0|GLO2_BRUSI,ESALTATI,28 166 | sp|Q8FYE7|GLO2_BRUSU,ESALTATI,25 167 | sp|P64183|GLPD1_MYCBO,SARAVINA,231 168 | sp|P9WN80|GLPD1_MYCTO,SARAVINA,231 169 | sp|P9WN81|GLPD1_MYCTU,SARAVINA,231 170 | sp|Q2RFW7|GLYA_MOOTA,AVARIARE,156 171 | sp|Q0AIY2|GRPE_NITEC,STENTERA,30 172 | sp|A6VDX9|GSH1_PSEA7,SELLERIA,405 173 | sp|P0CS37|HAT2_CRYNB,SPARIRAI,124 174 | sp|P0CS36|HAT2_CRYNJ,SPARIRAI,124 175 | sp|P04662|HEMA_I75A5,SVELLETE,45 176 | sp|O53333|HIGA3_MYCTU,DIRADAVA,9 177 | sp|B8E2C7|HIS4_DICTD,RALLEGRI,223 178 | sp|Q3AD55|HISZ_CARHZ,LEGIFERA,192 179 | sp|O64966|HMDH1_GOSHI,ILLATIVA,540 180 | sp|Q5P502|HSLV_AROAE,RALLENTA,140 181 | sp|Q8N5X7|IF4E3_HUMAN,APPAGARE,8 182 | sp|Q9NPH9|IL26_HUMAN,SCASSARE,123 183 | sp|O60100|IMB4_SCHPO,TEATRALE,25 184 | sp|P0DX14|INLPC_STRC4,APPETIVA,127 185 | sp|P63394|IRTB_MYCBO,PALESTRA,295 186 | sp|P9WQJ6|IRTB_MYCTO,PALESTRA,295 187 | sp|P9WQJ7|IRTB_MYCTU,PALESTRA,295 188 | sp|A6TWK9|ISPF_ALKMQ,ANNIDATI,91 189 | sp|P9WKF8|ISPH1_MYCTO,VERSIATE,309 190 | sp|P9WKF9|ISPH1_MYCTU,VERSIATE,309 191 | sp|P0A5I3|ISPH2_MYCBO,VERSIATE,309 192 | sp|Q88M04|KCY_PSEPK,VAGLIARE,21 193 | sp|P65208|KDGT1_SALTI,STAGNAVA,255 194 | sp|P65207|KDGT1_SALTY,STAGNAVA,255 195 | sp|Q8PKS3|KDSB_XANAC,ALLAGARE,43 196 | sp|Q3BTC6|KDSB_XANC5,ALLAGARE,43 197 | sp|Q8I719|KGP_PLAF7,DELETERI,539 198 | sp|W7JX98|KGP_PLAFO,DELETERI,539 199 | sp|P0CU29|KTU_DROWI,DEFERIRE,36 200 | sp|A5CX27|LEUC_VESOH,IRENISTA,15 201 | sp|B8NWW3|LNBC_ASPFN,GETTASTI,308 202 | sp|B9JA09|LPXK_AGRRK,GRADELLA,300 203 | sp|Q9H089|LSG1_HUMAN,TASTASSE,623 204 | sp|A8XJZ8|LST2_CAEBR,VIETASSE,381 205 | sp|Q96LR2|LURA1_HUMAN,VIAGGERA,174 206 | sp|Q91YU6|LZTS2_MOUSE,PARETATA,13 207 | sp|Q3LUD4|LZTS2_RAT,PARETATA,13 208 | 
sp|Q77SJ8|L_HIRRV,TIRATEVI,1480 209 | sp|Q82707|L_IHNVO,TIRATEVI,1480 210 | sp|Q82685|L_IHNVW,TIRATEVI,1480 211 | sp|Q1LVZ2|MARH2_DANRE,PICRICHE,62 212 | sp|Q5PQ35|MARH2_XENLA,PICRICHE,62 213 | sp|Q28EX7|MARH2_XENTR,PICRICHE,62 214 | sp|B0C1Y1|MEND_ACAM1,RISVEGLI,346 215 | sp|A4FG19|MIBS_SACEN,LASCEREI,266 216 | sp|A0A319DV72|MLFA_ASPSB,PRESIEDI,2062 217 | sp|Q6NHQ7|MNMA_CORDI,ALLERGIA,109 218 | sp|Q8FQ01|MNMA_COREF,ALLERGIA,109 219 | sp|A4QDK1|MNMA_CORGB,ALLERGIA,109 220 | sp|Q8NR24|MNMA_CORGL,ALLERGIA,109 221 | sp|Q0RKY6|MSHB1_FRAAA,RIGELERA,60 222 | sp|D7BQJ3|MSHB_STRBB,RIGELAVA,71 223 | sp|O74472|MUG33_SCHPO,CICLISTI,96 224 | sp|Q8UDM9|MURC_AGRFC,PIEGASSE,400 225 | sp|Q0AJE2|MURC_NITEC,AVVISTAI,67 226 | sp|Q82VS2|MURC_NITEU,AVVISTAI,67 227 | sp|Q2JD52|MURD_FRACC,RALLARGA,42 228 | sp|Q5E7G7|MUTS_ALIF1,CELLERAI,405 229 | sp|B5FAC8|MUTS_ALIFM,CELLERAI,405 230 | sp|Q6LMU0|MUTS_PHOPR,CELLERAI,405 231 | sp|B7VK59|MUTS_VIBA3,SELLERAI,403 232 | sp|Q87LQ9|MUTS_VIBPA,CELLERAI,403 233 | sp|Q8DC53|MUTS_VIBVU,CELLERAI,403 234 | sp|Q7MHR2|MUTS_VIBVY,CELLERAI,403 235 | sp|O35942|NEK2_MOUSE,TARSENSE,393 236 | sp|P51956|NEK3_HUMAN,ALTALENA,371 237 | sp|Q60CT7|NFI_METCA,PALLEALE,95 238 | sp|Q5BDY8|NLSA_EMENI,SGRAVERA,3294 239 | sp|Q8T8C0|NOS_BOMMO,SVAGASSI,649 240 | sp|F4IGA5|NU133_ARATH,SLITTAVA,237 241 | sp|O78706|NU1M_PHACI,SPILLAVA,11 242 | sp|A0LJM5|NUBCD_SYNFM,RIESSERE,213 243 | sp|Q9XAQ7|NUOD2_STRCO,ASPRETTE,6 244 | sp|A2VDP6|NXPE3_BOVIN,GRISETTE,217 245 | sp|Q969Y0|NXPE3_HUMAN,GRISETTE,217 246 | sp|Q5RCA5|NXPE3_PONAB,GRISETTE,217 247 | sp|P23214|OAC_BPSFV,SAGGIAVI,39 248 | sp|Q57483|OM26_HAEIN,TALALGIA,8 249 | sp|K7NTD0|OSTC2_DIPSG,TESSESSE,126 250 | sp|O04226|P5CS1_ORYSJ,STREMAVA,294 251 | sp|Q77MR9|PAP_GAHVM,ASSALIVA,229 252 | sp|Q8AV28|PCM1_CHICK,ASSETATE,1704 253 | sp|Q6FJA3|PDC1_CANGA,ANISETTA,130 254 | sp|P06169|PDC1_YEAST,ANISETTA,130 255 | sp|P16467|PDC5_YEAST,ANISETTA,130 256 | sp|P34734|PDC_HANUV,ANISETTA,130 257 | sp|G1UBC2|PGA47_CANAL,TESTASTE,252 258 | 
sp|Q65W08|PGK_MANSM,SENAPATE,263 259 | sp|Q7TVK8|PHAS_MYCBO,PARASALE,1961 260 | sp|A1KQG0|PHAS_MYCBP,PARASALE,1961 261 | sp|A5U9F4|PHAS_MYCTA,PARASALE,1961 262 | sp|P9WQE8|PHAS_MYCTO,PARASALE,1961 263 | sp|P9WQE9|PHAS_MYCTU,PARASALE,1961 264 | sp|P05098|PHEA_MICDP,ANNIDAVA,45 265 | sp|P29296|PHEA_PSETP,ANNIDAVA,45 266 | sp|Q6WB63|PHNC_ALCFA,SPEDIRLA,119 267 | sp|Q8CHS4|PLCX1_MOUSE,VAGINITE,271 268 | sp|Q2HA54|PLPL_CHAGB,ALACRITA,444 269 | sp|Q4PSN0|PME29_ARATH,RIFIGLIA,5 270 | sp|Q2FUB2|POK_METHJ,PRECARIA,124 271 | sp|O92529|POLG_HCVT5,FLATTING,1074 272 | sp|P9WI34|PPE13_MYCTO,TRATTARE,411 273 | sp|P9WI35|PPE13_MYCTU,TRATTARE,411 274 | sp|P9WI04|PPE32_MYCTO,SLATTATA,215 275 | sp|P9WI05|PPE32_MYCTU,SLATTATA,216 276 | sp|O42900|PPK19_SCHPO,RISALITI,488 277 | sp|B0R7F5|PRIL_HALS3,AVVERARE,34 278 | sp|Q9HN47|PRIL_HALSA,AVVERARE,34 279 | sp|C0R0B8|PROA_BRAHW,PIGLIAVI,116 280 | sp|Q7WQL9|PROB_BORBR,RECARGLI,332 281 | sp|Q7W1P3|PROB_BORPA,RECARGLI,332 282 | sp|Q7VZX7|PROB_BORPE,RECARGLI,332 283 | sp|Q2UH00|PRP28_ASPOR,AGGIRARE,305 284 | sp|P20053|PRP4_YEAST,MENINGEE,93 285 | sp|P35820|PSC_DROME,INATTIVE,271 286 | sp|Q16825|PTN21_HUMAN,APPARARE,727 287 | sp|O27427|PURL_METTH,SAGGIAVA,613 288 | sp|Q93YQ3|PURU1_ARATH,IRRITERA,2 289 | sp|A7MSE0|PYRB_VIBC1,ALLEGARE,253 290 | sp|Q8DCF6|PYRB_VIBVU,ALLEGARE,253 291 | sp|Q7MHF1|PYRB_VIBVY,ALLEGARE,253 292 | sp|P74782|PYRD_SYNY3,ANNEGAVA,130 293 | sp|A9BDD9|PYRG_PROM4,HACIENDA,315 294 | sp|A2BJ23|PYRI_HYPBU,GREGARIA,37 295 | sp|B3MA91|QTRT2_DROAN,AFFERIRE,362 296 | sp|Q00799|RBP2_PLAVB,PENDETTE,2742 297 | sp|Q2NCT4|RECR_ERYLH,PALLEALE,40 298 | sp|C1F3B3|RIMM_ACIC5,INASTATA,178 299 | sp|Q3SI16|RIMO_THIDA,SPIEGAVA,339 300 | sp|P61726|RISB1_RHOPA,ALLEGAVA,34 301 | sp|Q1QMB5|RISB_NITHX,ALLEGAVA,34 302 | sp|Q3SRV7|RISB_NITWN,ALLEGAVA,34 303 | sp|C1DQ78|RL13_AZOVD,PERVIETA,97 304 | sp|Q2S9X2|RL13_HAHCH,PERVIETA,97 305 | sp|Q48EE0|RL13_PSE14,PERVIETA,97 306 | sp|A6VBA6|RL13_PSEA7,PERVIETA,97 307 | sp|B7UZL1|RL13_PSEA8,PERVIETA,97 308 | 
sp|Q02H07|RL13_PSEAB,PERVIETA,97 309 | sp|Q9HVY2|RL13_PSEAE,PERVIETA,97 310 | sp|Q1I596|RL13_PSEE4,PERVIETA,97 311 | sp|Q4K6H2|RL13_PSEF5,PERVIETA,97 312 | sp|A4XQQ3|RL13_PSEMY,PERVIETA,97 313 | sp|A5W8S1|RL13_PSEP1,PERVIETA,97 314 | sp|Q3K723|RL13_PSEPF,PERVIETA,97 315 | sp|B0KFU8|RL13_PSEPG,PERVIETA,97 316 | sp|Q88N97|RL13_PSEPK,PERVIETA,97 317 | sp|B1J1W8|RL13_PSEPW,PERVIETA,97 318 | sp|Q87WW7|RL13_PSESM,PERVIETA,97 319 | sp|Q4ZNX2|RL13_PSEU2,PERVIETA,97 320 | sp|B4SLE1|RL13_STRM5,PERVIETA,97 321 | sp|A4VIF7|RL13_STUS1,PERVIETA,97 322 | sp|Q49ZE1|RL17_STAS1,SERVETTE,27 323 | sp|Q8TX51|RL1_METKA,NADIRALI,138 324 | sp|A6GZ91|RL29_FLAPJ,SVARIATE,49 325 | sp|P50345|RLA0_LUPLU,LAVAVATE,252 326 | sp|Q13Z67|RLMD_PARXL,REGALAVA,398 327 | sp|Q8PMV0|RNC_XANAC,REGALAVI,76 328 | sp|Q3BVV6|RNC_XANC5,REGALAVI,76 329 | sp|Q4USF7|RNC_XANC8,REGALAVI,76 330 | sp|Q8PB52|RNC_XANCP,REGALAVI,76 331 | sp|Q5H1R2|RNC_XANOR,REGALAVI,76 332 | sp|O14277|RS5A_SCHPO,LITIGARE,152 333 | sp|Q9P3T6|RS5B_SCHPO,LITIGARE,152 334 | sp|Q8RIM0|RS7_FUSNN,ANNEGATI,128 335 | sp|Q0ALX8|RUVA_MARMM,LATRIATE,119 336 | sp|B3GYP5|SELA_ACTP7,ALIENARE,38 337 | sp|Q8BUH8|SENP7_MOUSE,LESSASSE,386 338 | sp|O08815|SLK_RAT,ESTRATTE,627 339 | sp|P19382|SNAI1_XENLA,SPASSATE,108 340 | sp|Q81LW0|SODM1_BACAN,GELATAIE,100 341 | sp|Q818I1|SODM1_BACCR,GELATAIE,115 342 | sp|Q3V0Q6|SPAG8_MOUSE,METTESTE,1 343 | sp|P32916|SRPR_YEAST,SVENTARE,353 344 | sp|A8AUS0|SSPA_STRGC,INTANATA,191 345 | sp|A8AUS1|SSPB_STRGC,INTANATA,190 346 | sp|P16952|SSPB_STRGN,INTANATA,191 347 | sp|A1JI37|STHA_YERE8,SFIDANTI,118 348 | sp|A5CC52|SUCC_ORITB,MASSAGGI,124 349 | sp|A5GPF9|SYA_SYNPW,RIESLING,621 350 | sp|A1S6P2|SYD_SHEAM,METTERMI,247 351 | sp|Q8RB93|SYE1_CALS4,VARIERAI,465 352 | sp|B1MZ60|SYGB_LEUCK,ENERVATA,422 353 | sp|O32039|SYH_BACSU,REAGISSE,353 354 | sp|A7Z751|SYH_BACVZ,REAGISSE,353 355 | sp|A1S425|SYI_SHEAM,PESAVATE,880 356 | sp|A0JUT1|SYP_ARTS2,LETARGIE,435 357 | sp|A1SLL4|SYP_NOCSJ,LETARGIE,422 358 | sp|A1R508|SYP_PAEAT,LETARGIE,435 
359 | sp|Q2JSB6|SYP_SYNJA,LETARGIE,436 360 | sp|Q2JMD8|SYP_SYNJB,LETARGIE,436 361 | sp|Q116D3|SYP_TRIEI,LETARGIE,431 362 | sp|Q4QL12|SYR_HAEI8,INSTALLA,476 363 | sp|A5UCH1|SYR_HAEIE,INSTALLA,476 364 | sp|A5UJ40|SYR_HAEIG,INSTALLA,476 365 | sp|P43832|SYR_HAEIN,INSTALLA,476 366 | sp|Q0AY05|SYT_SYNWW,FEDERARE,62 367 | sp|B6EHW3|SYY_ALISL,ASSERITA,319 368 | sp|O52512|T2S1_STRFI,SPARGEVA,210 369 | sp|P09758|TACD2_HUMAN,TAGLIAVI,274 370 | sp|Q04B89|THII_LACDB,SGRINFIE,234 371 | sp|Q8DUR1|THII_STRMU,VAGINITE,379 372 | sp|Q75GA5|TIP41_ORYSJ,ALLARGHI,91 373 | sp|Q5JTD0|TJAP1_HUMAN,PASSASSE,400 374 | sp|Q9DCD5|TJAP1_MOUSE,PASSASSE,395 375 | sp|P69744|TRPV5_MOUSE,RALLARGA,128 376 | sp|Q9XSM3|TRPV5_RABIT,RALLARGA,134 377 | sp|Q9JIP0|TRPV5_RAT,RALLARGA,128 378 | sp|Q91WD2|TRPV6_MOUSE,RALLARGA,174 379 | sp|Q9R186|TRPV6_RAT,RALLARGA,174 380 | sp|E7F211|TTC17_DANRE,SALIFICA,40 381 | sp|Q6E240|U496E_ARATH,PIPERITE,287 382 | sp|P10861|UCP1_BOVIN,ECLISSAI,50 383 | sp|P16801|UL95_HCMVA,MALVACEA,129 384 | sp|Q6SW48|UL95_HCMVM,MALVACEA,129 385 | sp|B9VXQ2|UL95_HCMVT,MALVACEA,129 386 | sp|A1RV13|UPP_PYRIL,REDIVIVA,125 387 | sp|P9WEV0|VALA_ASPTE,PIETRAIA,321 388 | sp|Q7YRP3|VN1R3_PANTR,NASALITA,267 389 | sp|P0DOJ3|VP2_POVK3,SLATTARE,243 390 | sp|P0DOJ2|VP2_POVK6,SLATTARE,243 391 | sp|Q6GPH4|XAF1_HUMAN,RISAPERE,143 392 | sp|Q8Y0D3|Y1111_RALSO,SPRETAVA,162 393 | sp|Q46XP0|Y2732_CUPPJ,PREVARRA,154 394 | sp|B9LS33|Y273_HALLT,ALIENITA,5 395 | sp|Q9RN18|Y6513_BACAN,VIGILAVI,18 396 | sp|Q02998|YH19_RHOCA,FLIPPATI,28 397 | sp|A0A023PZL2|YM119_YEAST,GALALITI,87 398 | sp|A5CRZ4|YQGF_CLAM3,RILAVATE,46 399 | sp|O75467|Z324A_HUMAN,SVAGASSE,522 400 | sp|Q9HCK1|ZDBF2_HUMAN,DISCINTE,1559 401 | -------------------------------------------------------------------------------- /uniprot_words/data/word_matches_nl.csv: -------------------------------------------------------------------------------- 1 | .id,Keyword,Offset 2 | sp|B3DT30|EFG_BIFLD,TREKKERIG,356 3 | sp|Q8G5B6|EFG_BIFLO,TREKKERIG,356 4 | 
sp|Q89J81|EFG_BRADU,TREKKERIG,344 5 | sp|A5ELN0|EFG_BRASB,TREKKERIG,344 6 | sp|Q1QN33|EFG_NITHX,TREKKERIG,344 7 | sp|Q3SSW9|EFG_NITWN,TREKKERIG,344 8 | sp|Q2IXR3|EFG_RHOP2,TREKKERIG,344 9 | sp|Q07KL5|EFG_RHOP5,TREKKERIG,344 10 | sp|Q6N4T4|EFG_RHOPA,TREKKERIG,344 11 | sp|Q134S6|EFG_RHOPS,TREKKERIG,344 12 | sp|B3QBY3|EFG_RHOPT,TREKKERIG,344 13 | sp|Q7M7P6|ASSY_WOLSU,APERITIEF,220 14 | sp|Q49135|FCHA_METEA,MAGNETIET,1 15 | sp|Q6P7I6|TP8L2_XENLA,SPEELVLAK,80 16 | sp|Q5ISE2|Z36L3_MOUSE,AMALGAAM,469 17 | sp|Q54ET6|ABPF_DICDI,KNELLING,911 18 | sp|B1J539|ACCD_PSEPW,AANHALEN,149 19 | sp|Q3AA34|ADDB_CARHZ,KASSEIEN,140 20 | sp|A2RUV9|AEBP1_RAT,PLETPERS,172 21 | sp|C9K7C1|AMT10_ALTAL,GASSLANG,6 22 | sp|A8DZJ1|BAZ1B_XENLA,AGENESIE,677 23 | sp|P13517|CAPZB_YEAST,KLEILAND,101 24 | sp|K9Y6N7|CCMK4_HALP7,VERFREST,98 25 | sp|Q6YW51|CKX6_ORYSJ,VERLEREN,451 26 | sp|P37974|CNRC_CUPMC,AANVRAAG,53 27 | sp|P63840|COBH_MYCBO,RATELAAR,102 28 | sp|P9WP86|COBH_MYCTO,RATELAAR,102 29 | sp|P9WP87|COBH_MYCTU,RATELAAR,102 30 | sp|Q99031|CR9AA_BACTG,TINSTEEN,738 31 | sp|Q45733|CR9CA_BACTO,TINSTEEN,740 32 | sp|O06014|CR9DA_BACTP,TINSTEEN,751 33 | sp|Q67P64|DAPB_SYMTH,LEEFLAAG,62 34 | sp|P10047|DCTB_RHILE,KLEILAAG,109 35 | sp|Q6DW73|DGDG2_LOTJA,KAAKKLEM,276 36 | sp|O62215|DHSD_CAEEL,SAPRISTI,20 37 | sp|E1V7W1|DOEA_HALED,AMALGAAM,308 38 | sp|B3DT30|EFG_BIFLD,REKKERIG,357 39 | sp|Q8G5B6|EFG_BIFLO,REKKERIG,357 40 | sp|Q89J81|EFG_BRADU,REKKERIG,345 41 | sp|A5ELN0|EFG_BRASB,REKKERIG,345 42 | sp|Q1QN33|EFG_NITHX,REKKERIG,345 43 | sp|Q3SSW9|EFG_NITWN,REKKERIG,345 44 | sp|Q2IXR3|EFG_RHOP2,REKKERIG,345 45 | sp|Q07KL5|EFG_RHOP5,REKKERIG,345 46 | sp|Q6N4T4|EFG_RHOPA,REKKERIG,345 47 | sp|Q134S6|EFG_RHOPS,REKKERIG,345 48 | sp|B3QBY3|EFG_RHOPT,REKKERIG,345 49 | sp|Q758X9|EIF3B_ASHGO,AVERSIEF,28 50 | sp|O49160|EIF3C_ARATH,AFTAPPEN,656 51 | sp|P32476|ERG1_YEAST,MILITAIR,477 52 | sp|O59945|FIMB_SCHPO,WANSMAAK,517 53 | sp|Q8ZZK1|FOLD_PYRAE,GEELHART,7 54 | sp|Q17QD8|G37L1_BOVIN,LAVALAAG,11 55 | 
sp|A3MKU6|GLND_BURM7,SELDERIE,94 56 | sp|A2SB69|GLND_BURM9,SELDERIE,94 57 | sp|Q62JC2|GLND_BURMA,SELDERIE,94 58 | sp|A1V572|GLND_BURMS,SELDERIE,94 59 | sp|A3NWN4|GLND_BURP0,SELDERIE,94 60 | sp|Q3JR26|GLND_BURP1,SELDERIE,94 61 | sp|A3NAV0|GLND_BURP6,SELDERIE,94 62 | sp|Q63T10|GLND_BURPS,SELDERIE,94 63 | sp|Q0P5E7|GTPB8_BOVIN,KALFSLAP,128 64 | sp|Q8N3Z3|GTPB8_HUMAN,KALFSLAP,128 65 | sp|Q9CY28|GTPB8_MOUSE,KALFSLAP,129 66 | sp|Q5SMM6|HCT4_ORYSJ,APPARAAT,217 67 | sp|Q2IZP7|HLDE_RHOP2,LAVALAAG,281 68 | sp|A9H863|HUTH_GLUDA,SHREDDER,271 69 | sp|Q3SWP9|IF2_NITWN,APPARAAT,78 70 | sp|P09407|ITI3_MOMCH,KALVEREN,35 71 | sp|A9A698|KCY_METM6,IRISEREN,109 72 | sp|A6VJT1|KCY_METM7,IRISEREN,109 73 | sp|Q6LZK1|KCY_METMP,IRISEREN,109 74 | sp|C5FZJ2|LIPA_ARTOC,KRAKERIG,389 75 | sp|Q3TYD6|LMTK2_MOUSE,PEDAALAS,928 76 | sp|F1QWK4|MCA3B_DANRE,GEELHART,978 77 | sp|Q2H9Y1|MDM34_CHAGB,LEESDEEL,406 78 | sp|Q6NIZ3|METXA_CORDI,GELEIDER,214 79 | sp|Q8FRT0|METXA_COREF,GELEIDER,222 80 | sp|O68640|METXA_CORGL,GELEIDER,222 81 | sp|O66962|MNMG_AQUAE,VAGINAAL,393 82 | sp|A4YJT4|MNMG_BRASO,VAGINAAL,385 83 | sp|Q3AG55|MNMG_CARHZ,VAGINAAL,386 84 | sp|Q0TLZ5|MNMG_CLOP1,VAGINAAL,386 85 | sp|Q8XH31|MNMG_CLOPE,VAGINAAL,386 86 | sp|Q0SPQ4|MNMG_CLOPS,VAGINAAL,386 87 | sp|Q9CEJ4|MNMG_LACLA,VAGINAAL,389 88 | sp|O32806|MNMG_LACLM,VAGINAAL,389 89 | sp|Q02X03|MNMG_LACLS,VAGINAAL,389 90 | sp|A1AV42|MNMG_PELPD,VAGINAAL,386 91 | sp|C0QPI1|MNMG_PERMH,VAGINAAL,395 92 | sp|A8GUR1|MNMG_RICB8,VAGINAAL,382 93 | sp|Q1RGT1|MNMG_RICBR,VAGINAAL,382 94 | sp|A8EXC3|MNMG_RICCK,VAGINAAL,382 95 | sp|Q3JYG3|MNMG_STRA1,VAGINAAL,389 96 | sp|P0A3F0|MNMG_STRA3,VAGINAAL,389 97 | sp|P0A3F1|MNMG_STRA5,VAGINAAL,389 98 | sp|Q8DRS6|MNMG_STRMU,VAGINAAL,389 99 | sp|A4W4N0|MNMG_STRS2,VAGINAAL,389 100 | sp|A4VYE0|MNMG_STRSY,VAGINAAL,389 101 | sp|Q5LXK0|MNMG_STRT1,VAGINAAL,389 102 | sp|Q5M250|MNMG_STRT2,VAGINAAL,389 103 | sp|Q03I89|MNMG_STRTD,VAGINAAL,389 104 | sp|B2V6C3|MNMG_SULSY,VAGINAAL,393 105 | sp|B9L851|MOAA_NAUPA,STRAFWET,309 106 | 
sp|P48563|MON2_YEAST,SPLITTEN,398 107 | sp|B8FT65|MRAZ_DESHD,GRILLPAN,85 108 | sp|Q24TD7|MRAZ_DESHY,GRILLPAN,85 109 | sp|Q67Q58|MRAZ_SYMTH,GRILLPAN,83 110 | sp|Q2S527|MURC_SALRD,AFVELLEN,459 111 | sp|Q0BV25|MURG_GRABC,DAARNAAR,343 112 | sp|B8FJL5|MUTS_DESAL,DRINGEND,539 113 | sp|Q44584|NCCC_ALCXX,AANVRAAG,72 114 | sp|C4Y3N8|NOP9_CLAL4,AFKERVEN,12 115 | sp|Q6BUT3|NST1_DEBHA,FIEDELEN,204 116 | sp|B0R8D2|NUSA_HALS3,TAKELAAR,122 117 | sp|P0CW99|NUSA_HALSA,TAKELAAR,122 118 | sp|Q5DTZ0|NYNRI_MOUSE,LEGPRENT,55 119 | sp|Q96XT4|OFOB2_SULTO,LAVALAAG,153 120 | sp|Q54ID7|OSB11_DICDI,EENKLANK,391 121 | sp|Q07744|PEPO_LACLA,TAALTAAK,544 122 | sp|P0C2B4|PEPO_LACLC,TAALTAAK,544 123 | sp|Q02VB0|PEPO_LACLS,TAALTAAK,544 124 | sp|P15004|PER2_SOLLC,AARDSLAK,173 125 | sp|C1CV29|PGK_DEIDV,GEVALLEN,108 126 | sp|Q1IZA3|PGK_DEIGD,GEVALLEN,108 127 | sp|Q9RUP2|PGK_DEIRA,GEVALLEN,130 128 | sp|Q0RH06|PGK_FRAAA,GEVALLEN,115 129 | sp|Q2JCH8|PGK_FRACC,GEVALLEN,115 130 | sp|C6BUI7|PGK_MARSD,AFKALKEN,388 131 | sp|A0QGK3|PHK_MYCA1,GLASRAAM,756 132 | sp|Q73ZM8|PHK_MYCPA,GLASRAAM,756 133 | sp|Q9JI55|PLEC_CRIGR,KETELPAK,254 134 | sp|Q9QXS1|PLEC_MOUSE,KETELPAK,473 135 | sp|P30427|PLEC_RAT,KETELPAK,468 136 | sp|B0LL23|PLR_SINHE,PARMAHAM,121 137 | sp|Q9CNJ7|PSTB_PASMU,GELIEFDE,230 138 | sp|B0S6S9|RBM44_DANRE,AANSTAAN,373 139 | sp|A1SJ39|RIMO_NOCSJ,TAALKLAS,184 140 | sp|A2BT57|RLMN_PROMS,KERNLAND,70 141 | sp|Q2SBR2|RNH2_HAHCH,VERDEELD,148 142 | sp|Q68S14|RPOB_PANGI,AIDSGALA,573 143 | sp|B1VDC0|RS15_CORU7,LASTDIER,70 144 | sp|A0R024|RSMH_MYCS2,RAAIPAAL,280 145 | sp|A3Q1M6|RSMH_MYCSJ,RAAIPAAL,270 146 | sp|Q7T2D0|SGSM3_DANRE,AASKEVER,599 147 | sp|Q6P7W2|SHKB1_MOUSE,PIERLALA,237 148 | sp|Q1DZ34|SIP5_COCIM,REKENAAR,82 149 | sp|P41508|SMC_MESHY,KNALSEIN,295 150 | sp|Q6IUP1|SOLH1_MOUSE,RESELLER,178 151 | sp|Q30YS5|SYA_OLEA2,AFPELLEN,341 152 | sp|A5GPF9|SYA_SYNPW,RIESLING,621 153 | sp|A7IAG1|SYE_METB6,AALSPEER,60 154 | sp|B8J4S2|SYS_DESDA,GEVLEESD,241 155 | sp|Q485S0|T3HPD_COLP3,INSLECHT,27 156 | 
sp|A2XSX6|TIF9_ORYSI,TRAPVELD,3 157 | sp|Q7XV97|TIF9_ORYSJ,TRAPVELD,3 158 | sp|Q6MDC8|UVRC_PARUW,KRAKEEND,449 159 | sp|A0A7H0DN27|VPK2_MONPV,DIEFSTAL,388 160 | sp|O57177|VPK2_VACCA,DIEFSTAL,388 161 | sp|P21095|VPK2_VACCC,DIEFSTAL,388 162 | sp|P29884|VPK2_VACCP,DIEFSTAL,354 163 | sp|Q9JFE5|VPK2_VACCT,DIEFSTAL,388 164 | sp|Q89121|VPK2_VACCW,DIEFSTAL,388 165 | sp|P33801|VPK2_VAR67,DIEFSTAL,388 166 | sp|Q3MUH7|XG74_PAESP,LAVALAAG,14 167 | sp|A4ZUC9|Y112_ABVP,PILIPILI,87 168 | sp|Q5UPM1|YL149_MIMIV,INSTINCT,140 169 | sp|Q5ISE2|Z36L3_MOUSE,AMALGAAM,397 170 | -------------------------------------------------------------------------------- /uniprot_words/data/word_matches_no.csv: -------------------------------------------------------------------------------- 1 | .id,Keyword,Offset 2 | sp|Q89I89|SYA_BRADU,DATASALGS,255 3 | sp|C3PNE7|SYL_RICAE,SELEKTERE,287 4 | sp|A1JIX1|TRUB_YERE8,EGALISERE,92 5 | sp|O34273|TRUB_YEREN,EGALISERE,92 6 | sp|A7FMS0|TRUB_YERP3,EGALISERE,92 7 | sp|Q1CC09|TRUB_YERPA,EGALISERE,92 8 | sp|Q8ZBC4|TRUB_YERPE,EGALISERE,92 9 | sp|Q1CEL5|TRUB_YERPN,EGALISERE,92 10 | sp|A4TRI1|TRUB_YERPP,EGALISERE,92 11 | sp|Q66F58|TRUB_YERPS,EGALISERE,92 12 | sp|Q93RD9|DTD_LISMO,SMEDEVISE,14 13 | sp|B8GPA2|MOAC_THISH,VEVELSTAD,84 14 | sp|Q5HLX6|MOEA_STAEQ,ELLEVEDEL,193 15 | sp|Q8CNE1|MOEA_STAES,ELLEVEDEL,193 16 | sp|Q06583|PYS1_PSEAI,AVREGNING,162 17 | sp|P06105|SC160_YEAST,ANSVARLIG,643 18 | sp|Q91VX2|UBAP2_MOUSE,GRESSENKE,110 19 | sp|Q96WW0|YNH9_SCHPO,KIRKESKIP,382 20 | sp|Q19753|YU0O_CAEEL,ALMEGREIN,668 21 | sp|B3DT30|EFG_BIFLD,TREKKERI,356 22 | sp|Q8G5B6|EFG_BIFLO,TREKKERI,356 23 | sp|Q89J81|EFG_BRADU,TREKKERI,344 24 | sp|A5ELN0|EFG_BRASB,TREKKERI,344 25 | sp|Q1QN33|EFG_NITHX,TREKKERI,344 26 | sp|Q3SSW9|EFG_NITWN,TREKKERI,344 27 | sp|Q2IXR3|EFG_RHOP2,TREKKERI,344 28 | sp|Q07KL5|EFG_RHOP5,TREKKERI,344 29 | sp|Q6N4T4|EFG_RHOPA,TREKKERI,344 30 | sp|Q134S6|EFG_RHOPS,TREKKERI,344 31 | sp|B3QBY3|EFG_RHOPT,TREKKERI,344 32 | sp|Q9LD43|ACCA_ARATH,VASSGASS,586 33 | 
sp|P74582|ACNB_SYNY3,FELTDATA,583 34 | sp|A1WQY3|ALR_VEREI,GAVLSIDE,336 35 | sp|P39265|ALSB_ECOLI,KALDVATN,202 36 | sp|Q6A332|ALY3_ARATH,HEKKSAKS,524 37 | sp|O31788|APRX_BACSU,VASSGASS,195 38 | sp|Q47VK9|ARGR_COLP3,KINETIKK,140 39 | sp|Q058D5|ARLY_BUCCC,LETTVINT,88 40 | sp|Q3AQX0|AROE_CHLCH,FALLGRAV,8 41 | sp|Q6LLZ0|AROQ_PHOPR,KRILLING,5 42 | sp|A2XNK3|ASA1_ORYSI,VASSPEIL,351 43 | sp|Q94GF1|ASA1_ORYSJ,VASSPEIL,351 44 | sp|Q9XJ29|ASA2_ORYSJ,VASSPEIL,379 45 | sp|O74431|ATC9_SCHPO,TREPLATA,199 46 | sp|Q6BTX0|ATG2_DEBHA,KANTNING,583 47 | sp|S0EFU6|BEA4_GIBF5,KANSLERE,435 48 | sp|Q55BU9|C5133_DICDI,SYKELEIE,229 49 | sp|Q9H251|CAD23_HUMAN,FRALANDS,2891 50 | sp|Q99PF4|CAD23_MOUSE,FRALANDS,2891 51 | sp|P58365|CAD23_RAT,FRALANDS,2889 52 | sp|O34659|CDAR_BACSU,IALLFALL,14 53 | sp|B5X564|CDC2C_ARATH,FRAREVET,120 54 | sp|O97383|CHH1_PENMO,VASSTAPA,20 55 | sp|O53080|CITXG_LEUMC,AGNATISK,402 56 | sp|P28020|CSK21_XENLA,SALGSLAG,363 57 | sp|A9NGC2|DAPA_ACHLI,SEKKEVIS,55 58 | sp|Q8VCR2|DHB13_MOUSE,VARSLING,242 59 | sp|Q486F9|DNAK1_COLP3,ANNETLAG,43 60 | sp|Q5NPS6|DNAK_ZYMMO,ISRAELER,295 61 | sp|C3PLJ2|DNLJ_RICAE,ISKLASSE,261 62 | sp|Q92GM7|DNLJ_RICCN,ISKLASSE,261 63 | sp|A8F2K5|DNLJ_RICM5,ISKLASSE,261 64 | sp|Q9ZCK9|DNLJ_RICPR,ISKLASSE,261 65 | sp|C4K0T1|DNLJ_RICPU,ISKLASSE,261 66 | sp|B0BUZ1|DNLJ_RICRO,ISKLASSE,261 67 | sp|A8GTF2|DNLJ_RICRS,ISKLASSE,261 68 | sp|Q68W27|DNLJ_RICTY,ISKLASSE,261 69 | sp|B3DT30|EFG_BIFLD,STREKKER,355 70 | sp|Q8G5B6|EFG_BIFLO,STREKKER,355 71 | sp|Q89J81|EFG_BRADU,STREKKER,343 72 | sp|A5ELN0|EFG_BRASB,STREKKER,343 73 | sp|Q1QN33|EFG_NITHX,STREKKER,343 74 | sp|Q3SSW9|EFG_NITWN,STREKKER,343 75 | sp|Q2IXR3|EFG_RHOP2,STREKKER,343 76 | sp|Q07KL5|EFG_RHOP5,STREKKER,343 77 | sp|Q6N4T4|EFG_RHOPA,STREKKER,343 78 | sp|Q134S6|EFG_RHOPS,STREKKER,343 79 | sp|B3QBY3|EFG_RHOPT,STREKKER,343 80 | sp|B9KIA2|EFP_ANAMF,AVLENGER,153 81 | sp|Q5PB21|EFP_ANAMM,AVLENGER,153 82 | sp|B3PBP7|EFTS_CELJU,MAGELEIA,114 83 | sp|Q2YBA6|EFTS_NITMU,SPELLERE,200 84 | 
sp|Q4P0P0|EIF3C_USTMA,KVIKENDE,207 85 | sp|Q9ZWB9|FAO1_ARATH,SVEIGENE,191 86 | sp|Q989A7|FOLD1_RHILO,ALTAELVA,20 87 | sp|B2V9R3|FOLD_SULSY,TELLELIG,75 88 | sp|I1S163|FSL4_GIBZE,AKKVIRER,295 89 | sp|A8N8S3|GATA_COPC7,SALPETRE,228 90 | sp|Q985F6|GLNE_RHILO,PARAGRAF,664 91 | sp|Q31DG8|GRPE_PROM9,TIENDELS,33 92 | sp|Q974T4|GUAAA_SULTO,FRILANSE,139 93 | sp|Q53T59|H1BP3_HUMAN,LAGSPELL,125 94 | sp|Q3TC93|H1BP3_MOUSE,LAGSPELL,125 95 | sp|A2WKS3|H2B10_ORYSI,KLAGESAK,101 96 | sp|Q9LGI2|H2B10_ORYSJ,KLAGESAK,101 97 | sp|P05621|H2B2_WHEAT,KLAGESAK,98 98 | sp|A2WKP3|H2B3_ORYSI,KLAGESAK,101 99 | sp|Q94JJ7|H2B3_ORYSJ,KLAGESAK,101 100 | sp|Q43217|H2B3_WHEAT,KLAGESAK,86 101 | sp|A2WKP5|H2B4_ORYSI,KLAGESAK,101 102 | sp|Q94JJ4|H2B4_ORYSJ,KLAGESAK,101 103 | sp|Q43215|H2B4_WHEAT,KLAGESAK,83 104 | sp|Q43216|H2B5_WHEAT,KLAGESAK,84 105 | sp|A2WKT1|H2B6_ORYSI,KLAGESAK,101 106 | sp|Q41575|H2B6_WHEAT,KLAGESAK,69 107 | sp|A2WKS8|H2B7_ORYSI,KLAGESAK,101 108 | sp|Q7GBK0|H2B7_ORYSJ,KLAGESAK,101 109 | sp|A2WKS5|H2B8_ORYSI,KLAGESAK,101 110 | sp|Q9LGH8|H2B8_ORYSJ,KLAGESAK,101 111 | sp|B3RHD9|HAP1_YEAS1,SALTSMAK,595 112 | sp|C7GQY3|HAP1_YEAS2,SALTSMAK,595 113 | sp|A7A1D7|HAP1_YEAS7,SALTSMAK,595 114 | sp|C8ZDL9|HAP1_YEAS8,SALTSMAK,595 115 | sp|G2WJ80|HAP1_YEASK,SALTSMAK,592 116 | sp|P0CE41|HAP1_YEAST,SALTSMAK,595 117 | sp|P0CS82|HAP1_YEASX,SALTSMAK,595 118 | sp|P14750|HCYA_APHSP,FIREDELE,411 119 | sp|O53637|HDDA_MYCTU,VARSLERE,316 120 | sp|Q39YP7|HISX_GEOMG,SEILVIND,237 121 | sp|P60859|HISX_GEOSL,SEILVIND,237 122 | sp|Q3A133|HISX_SYNC1,SEILVIND,237 123 | sp|Q7NIA2|HSLO_GLOVI,GRAVGANG,109 124 | sp|P31269|HXA9_HUMAN,SENNAENE,176 125 | sp|P09631|HXA9_MOUSE,SENNAENE,175 126 | sp|Q58CP0|IDH3G_BOVIN,LIVRENTE,163 127 | sp|P51553|IDH3G_HUMAN,LIVRENTE,164 128 | sp|P41564|IDH3G_MACFA,LIVRENTE,126 129 | sp|P70404|IDHG1_MOUSE,LIVRENTE,164 130 | sp|P41565|IDHG1_RAT,LIVRENTE,164 131 | sp|Q3SKX1|IF2_THIDA,KANTNING,569 132 | sp|Q5AAL9|IFF4_CANAL,TETPLASS,1070 133 | sp|O28294|ILVC_ARCFU,KALEVALA,161 134 | 
sp|A2C096|ISPH_PROM1,PERIGEET,340 135 | sp|Q46HB0|ISPH_PROMT,PERIGEET,340 136 | sp|A1JQY4|KDPA_YERE8,IALLFALL,74 137 | sp|Q8D2E8|KGUA_WIGBR,KINETIKK,196 138 | sp|Q8S2E5|KPRS3_ORYSJ,FEILSLAG,394 139 | sp|P50455|LEU3_SULTO,LIVRENTE,114 140 | sp|Q3A334|LON2_SYNC1,SLIPEREN,115 141 | sp|Q21MS7|LPTD_SACD2,GAMLEVEG,321 142 | sp|Q9H9A6|LRC40_HUMAN,TIPPELAG,234 143 | sp|Q4R3P6|LRC40_MACFA,TIPPELAG,234 144 | sp|Q5RFE9|LRC40_PONAB,TIPPELAG,234 145 | sp|Q0P5X1|LRIQ1_MOUSE,ENDEVEND,28 146 | sp|Q98919|LSAMP_CHICK,SANGLEIK,260 147 | sp|Q13449|LSAMP_HUMAN,SANGLEIK,260 148 | sp|Q8BLK3|LSAMP_MOUSE,SANGLEIK,260 149 | sp|Q62813|LSAMP_RAT,SANGLEIK,260 150 | sp|Q54U63|LVSC_DICDI,VILLMANN,52 151 | sp|A2SZS3|L_RVFV,SEKSTANT,1175 152 | sp|P27316|L_RVFVZ,SEKSTANT,1175 153 | sp|Q6NS57|MABP1_MOUSE,ELEVLAGA,1456 154 | sp|Q767L8|MDC1_PIG,ALTERERE,659 155 | sp|Q13LD8|METN2_PARXL,SALSVEGG,321 156 | sp|Q3ACA8|MIAA_CARHZ,KVAKKING,22 157 | sp|Q076A4|MYH8_CANLF,GAKKGAKK,637 158 | sp|P13542|MYH8_MOUSE,GAKKGAKK,635 159 | sp|B4L7U0|NAAT1_DROMO,FALLGRAV,277 160 | sp|B4MEG2|NAAT1_DROVI,FALLGRAV,267 161 | sp|Q640K1|NCDN_XENLA,SPILLETS,232 162 | sp|Q5SYE7|NHSL1_HUMAN,DAGSLYSE,599 163 | sp|Q8L746|NPR3_ARATH,GRESSKAR,381 164 | sp|Q80XB4|NRAP_MOUSE,KASSEVIS,1051 165 | sp|Q5XGN1|NUP42_XENLA,VASSTAPA,365 166 | sp|A2VDP6|NXPE3_BOVIN,GRISETTE,217 167 | sp|Q969Y0|NXPE3_HUMAN,GRISETTE,217 168 | sp|Q5RCA5|NXPE3_PONAB,GRISETTE,217 169 | sp|Q8NGE2|O2AP1_HUMAN,GATEFYLL,106 170 | sp|C1DEA4|OBG_AZOVD,SPALTERE,270 171 | sp|A6VBV3|OBG_PSEA7,SPALTERE,270 172 | sp|B7V0A9|OBG_PSEA8,SPALTERE,270 173 | sp|Q02GB1|OBG_PSEAB,SPALTERE,270 174 | sp|Q9HVL8|OBG_PSEAE,SPALTERE,270 175 | sp|C4R492|OCA5_KOMPG,ALLELEST,544 176 | sp|Q07017|OL56_STRAT,AVLSGRIS,1836 177 | sp|A0A0C1E5J8|OPAA_ASPUT,VALLAVIK,2904 178 | sp|Q8NGE1|OR6C4_HUMAN,GATEFYLL,106 179 | sp|P13909|PAI1_BOVIN,SALTSILD,271 180 | sp|P79335|PAI1_PIG,SALTSILD,271 181 | sp|P87295|PEP5L_SCHPO,TYSSEDAL,674 182 | sp|P46988|PFD1_YEAST,LETTVEKT,92 183 | 
sp|Q5GRV2|PGK_WOLTR,VASSKALL,306 184 | sp|Q13YI7|PHNW1_PARXL,HALVLANG,85 185 | sp|P41676|PK2_NPVAC,TALERETT,147 186 | sp|Q42556|PMA9_ARATH,STYRELSE,903 187 | sp|Q04350|POLB_CHPVE,SPEDKALV,1742 188 | sp|Q9YTU2|POLB_CHPVU,SPEDKALV,1741 189 | sp|Q54SJ8|POND_DICDI,IALLFALL,141 190 | sp|Q8K2H1|PPHLN_MOUSE,VASSKALD,226 191 | sp|Q5X5Y6|PYRG_LEGPA,VEKEAVIS,229 192 | sp|A5IB79|PYRG_LEGPC,VEKEAVIS,229 193 | sp|Q5ZWA4|PYRG_LEGPH,VEKEAVIS,229 194 | sp|Q5WXA8|PYRG_LEGPL,VEKEAVIS,229 195 | sp|A9BDD9|PYRG_PROM4,HACIENDA,315 196 | sp|Q9FG68|RAX1_ARATH,SLISSING,169 197 | sp|Q0BTG0|RECO_GRABC,ALTETERE,241 198 | sp|Q6RG78|RGS1_HORSE,TRESTAKK,142 199 | sp|Q08116|RGS1_HUMAN,TRESTAKK,155 200 | sp|Q9JL25|RGS1_MOUSE,TRESTAKK,155 201 | sp|P97844|RGS1_RAT,TRESTAKK,155 202 | sp|Q03314|RHIB_RHILV,PARTALET,198 203 | sp|Q17XC9|RIBA_HELAH,MALERISK,74 204 | sp|B6JM32|RIBA_HELP2,MALERISK,74 205 | sp|Q1CT68|RIBA_HELPH,MALERISK,74 206 | sp|Q9ZL42|RIBA_HELPJ,MALERISK,74 207 | sp|O08315|RIBA_HELPY,MALERISK,74 208 | sp|A0QPD3|RIR2H_MYCS2,ALTERERE,48 209 | sp|Q72GV5|RL9_THET2,VILLEPLE,3 210 | sp|Q5SLQ1|RL9_THET8,VILLEPLE,3 211 | sp|P27151|RL9_THETH,VILLEPLE,3 212 | sp|Q7M438|RNDI_DICDI,TAALESEN,146 213 | sp|P56185|RNJ_HELPY,SENSKADE,15 214 | sp|P67284|RNY_STRP1,LIVSALIG,7 215 | sp|P0DF20|RNY_STRP3,LIVSALIG,7 216 | sp|Q5XAP0|RNY_STRP6,LIVSALIG,7 217 | sp|Q8P000|RNY_STRP8,LIVSALIG,7 218 | sp|Q1JAJ3|RNY_STRPB,LIVSALIG,7 219 | sp|Q1JKP5|RNY_STRPC,LIVSALIG,7 220 | sp|Q1JFN6|RNY_STRPD,LIVSALIG,7 221 | sp|Q1J5I5|RNY_STRPF,LIVSALIG,7 222 | sp|A2RD66|RNY_STRPG,LIVSALIG,7 223 | sp|Q48S17|RNY_STRPM,LIVSALIG,7 224 | sp|P0DF21|RNY_STRPQ,LIVSALIG,7 225 | sp|B0TX10|RPOB_FRAP2,NEVRALGI,1340 226 | sp|Q14JT5|RPOB_FRAT1,NEVRALGI,1340 227 | sp|A7NEC0|RPOB_FRATF,NEVRALGI,1340 228 | sp|Q2A1M7|RPOB_FRATH,NEVRALGI,1340 229 | sp|B2SFD6|RPOB_FRATM,NEVRALGI,1340 230 | sp|A0Q867|RPOB_FRATN,NEVRALGI,1340 231 | sp|Q0BKC5|RPOB_FRATO,NEVRALGI,1340 232 | sp|Q5NID2|RPOB_FRATT,NEVRALGI,1340 233 | sp|A4IW99|RPOB_FRATW,NEVRALGI,1340 234 | 
sp|P56764|RPOC2_ARATH,SVEIPING,598 235 | sp|A4QKI2|RPOC2_CAPBU,SVEIPING,598 236 | sp|Q9THV5|RPOC2_SINAL,SVEIPING,603 237 | sp|Q21M92|RPOC_SACD2,RIDETIME,551 238 | sp|Q4FLJ0|RS4_PELUB,ALASKERE,156 239 | sp|Q12136|SAS10_YEAST,DESIDERE,70 240 | sp|Q9UPW6|SATB2_HUMAN,VERVEREN,234 241 | sp|Q8VI24|SATB2_MOUSE,VERVEREN,234 242 | sp|P0C883|SCL33_ARATH,VERPESYK,623 243 | sp|B3ECJ8|SECA_CHLL2,GRISEMAT,153 244 | sp|Q30RR0|SECA_SULDN,AVIATIKK,421 245 | sp|P0AG91|SECD_ECO57,TAKSVALE,63 246 | sp|P0AG90|SECD_ECOLI,TAKSVALE,63 247 | sp|P0AG92|SECD_SHIFL,TAKSVALE,63 248 | sp|A7TI28|SHO1_VANPO,VILLSVIN,127 249 | sp|Q1E1R7|SPB4_COCIM,KRAKKERE,586 250 | sp|P16546|SPTN1_MOUSE,SEERTALL,446 251 | sp|A9NF97|SSRP_ACHLI,KRETIKER,127 252 | sp|Q7X2N6|SSRP_SPHEL,KRETIKER,139 253 | sp|A7I6S6|SURE_METB6,ANRIKING,55 254 | sp|Q89I89|SYA_BRADU,DATASALG,255 255 | sp|A5GPF9|SYA_SYNPW,RIESLING,621 256 | sp|C3PAE9|SYFA_BACAA,ALVNEVRE,59 257 | sp|C3L8T8|SYFA_BACAC,ALVNEVRE,59 258 | sp|Q81L30|SYFA_BACAN,ALVNEVRE,59 259 | sp|B7JR67|SYFA_BACC0,ALVNEVRE,59 260 | sp|Q72ZI1|SYFA_BACC1,ALVNEVRE,59 261 | sp|B7IJW0|SYFA_BACC2,ALVNEVRE,59 262 | sp|C1EU00|SYFA_BACC3,ALVNEVRE,59 263 | sp|B7HF77|SYFA_BACC4,ALVNEVRE,59 264 | sp|B7HRK2|SYFA_BACC7,ALVNEVRE,59 265 | sp|A7GTL0|SYFA_BACCN,ALVNEVRE,59 266 | sp|B9J063|SYFA_BACCQ,ALVNEVRE,59 267 | sp|Q817I6|SYFA_BACCR,ALVNEVRE,59 268 | sp|Q633N4|SYFA_BACCZ,ALVNEVRE,59 269 | sp|Q6HCW7|SYFA_BACHK,ALVNEVRE,59 270 | sp|A9VJM4|SYFA_BACMK,ALVNEVRE,59 271 | sp|C3PNE7|SYL_RICAE,ELEKTERE,288 272 | sp|Q9V011|SYM_PYRAB,ELDELDRE,424 273 | sp|C5B832|SYR_EDWI9,TILSVARE,506 274 | sp|Q92CV8|TAGH_LISIN,GLISETST,311 275 | sp|P40412|TCPE1_AVESA,LAVADLER,190 276 | sp|P54411|TCPE2_AVESA,LAVADLER,190 277 | sp|O04450|TCPE_ARATH,LAVADLER,190 278 | sp|Q7YJS6|TI214_CALFG,SENKNING,1500 279 | sp|A6MMG9|TI214_CHLSC,SENKNING,1553 280 | sp|Q5FUR2|TIG_GLUOX,DELAKTIG,253 281 | sp|O76997|TRK1_LYMST,PRISSATT,398 282 | sp|Q6Z4N3|TRL11_ORYSJ,ALTERKAR,56 283 | sp|A0AJP7|TRMD_LISW6,NERVEVEV,28 284 | 
sp|Q8TYA2|TRPA_METKA,ELVEGARD,259 285 | sp|P32068|TRPE_ARATH,VASSPEIL,369 286 | sp|P32069|TRPX_ARATH,VASSPEIL,382 287 | sp|A8WTE8|TRR1_CAEBR,ISRANDEN,1515 288 | sp|Q8TZ08|TRUB_METKA,VALGVAKA,294 289 | sp|A1JIX1|TRUB_YERE8,EGALISER,92 290 | sp|O34273|TRUB_YEREN,EGALISER,92 291 | sp|A7FMS0|TRUB_YERP3,EGALISER,92 292 | sp|Q1CC09|TRUB_YERPA,EGALISER,92 293 | sp|Q8ZBC4|TRUB_YERPE,EGALISER,92 294 | sp|Q1CEL5|TRUB_YERPN,EGALISER,92 295 | sp|A4TRI1|TRUB_YERPP,EGALISER,92 296 | sp|Q66F58|TRUB_YERPS,EGALISER,92 297 | sp|G4SLH0|TTN1_CAEEL,PAKKSEKK,12795 298 | sp|O59941|VATD_NEUCR,ELDELDRE,193 299 | sp|Q97CP8|VATD_THEVO,VALERIAN,73 300 | sp|P32610|VATD_YEAST,ELDELDRE,191 301 | sp|Q3ZK57|VP3_ROT41,SAKEFALL,661 302 | sp|B3F2X7|VP3_ROTTU,SAKEFALL,661 303 | sp|Q9ENL0|VP6_CTFVL,STEINRIK,453 304 | sp|Q9Y2B5|VP9D1_HUMAN,SAMKLANG,15 305 | sp|Q45212|VSP2_BORHE,DELAKTIG,69 306 | sp|Q5PP32|WTR25_ARATH,HVITKVAL,27 307 | sp|Q8W4R9|WTR35_ARATH,HVITKVAL,36 308 | sp|P47490|Y248_MYCGE,KRISTIAN,3 309 | sp|P75197|Y583_MYCPN,STETTING,170 310 | sp|P34263|YKAD_CAEEL,EKSERSER,97 311 | sp|O13545|YL374_YEAST,IALLFALL,41 312 | sp|Q2UBI2|YME2_ASPOR,SANDLAND,233 313 | sp|Q6R3K9|YSL2_ARATH,ENERVERE,2 314 | sp|E9P860|ZNFX1_CAEEL,ELSKLING,369 315 | -------------------------------------------------------------------------------- /uniprot_words/data/word_matches_se.csv: -------------------------------------------------------------------------------- 1 | .id,Keyword,Offset 2 | sp|P77624|ARCM_ECOLI,STALLARE,219 3 | sp|Q24995|ARY_GALME,WINFIELD,549 4 | sp|Q6BTX0|ATG2_DEBHA,KANTNING,583 5 | sp|Q0PAS1|CBF2_CAMJE,HVILKENS,222 6 | sp|A1VYV6|CBF2_CAMJJ,HVILKENS,222 7 | sp|B9VR26|CML1_BOVIN,PLATAIAI,294 8 | sp|B1PHQ8|CML1_PIG,PLATAIAI,295 9 | sp|A5EF51|DABA2_BRASB,ALLAKATS,625 10 | sp|B8GFL8|DNAG_METPE,SVEDMARK,248 11 | sp|O80928|DOF24_ARATH,SPRITSAS,282 12 | sp|Q83I20|DXS_TROW8,REDIVIVA,510 13 | sp|Q83G46|DXS_TROWT,REDIVIVA,510 14 | sp|C5J6A7|FTSH_MESCH,SELLASIA,545 15 | sp|P19255|GLPF_STRCO,KASKARNA,31 16 | 
sp|Q3SKX1|IF2_THIDA,KANTNING,569 17 | sp|O28294|ILVC_ARCFU,KALEVALA,161 18 | sp|A0QBE6|KDC_MYCA1,RIDPARTI,304 19 | sp|Q7U140|KDC_MYCBO,RIDPARTI,306 20 | sp|A1KGY5|KDC_MYCBP,RIDPARTI,306 21 | sp|Q9CBD6|KDC_MYCLE,RIDPARTI,302 22 | sp|Q742Q2|KDC_MYCPA,RIDPARTI,304 23 | sp|A0R480|KDC_MYCS2,RIDPARTI,298 24 | sp|A5U0P1|KDC_MYCTA,RIDPARTI,306 25 | sp|P9WG36|KDC_MYCTO,RIDPARTI,306 26 | sp|P9WG37|KDC_MYCTU,RIDPARTI,306 27 | sp|A0PL16|KDC_MYCUA,RIDPARTI,306 28 | sp|Q14BB9|MA6D1_MOUSE,SVAGARES,53 29 | sp|Q8E9P5|MRAY_SHEON,AFFLYTTA,145 30 | sp|A0L1P5|MRAY_SHESA,AFFLYTTA,145 31 | sp|Q0HE80|MRAY_SHESM,AFFLYTTA,145 32 | sp|Q0HZR9|MRAY_SHESR,AFFLYTTA,145 33 | sp|Q133X2|MURC_RHOPS,AFRIFVEN,200 34 | sp|A2QW83|PAN2_ASPNC,VENTRALA,393 35 | sp|Q2ULU6|PAN2_ASPOR,VENTRALA,399 36 | sp|Q5BBL5|PAN2_EMENI,VENTRALA,399 37 | sp|P05066|PHR_YEAST,PELISSEN,530 38 | sp|A3DHY6|RSMG_ACET2,STEEVENS,181 39 | sp|Q8IX30|SCUB3_HUMAN,KLIKAFFE,947 40 | sp|Q66PY1|SCUB3_MOUSE,KLIKAFFE,947 41 | sp|Q9YD97|SYL_AERPE,AVLEDARE,825 42 | sp|Q5X5L8|SYL_LEGPA,INTYGADT,590 43 | sp|A5IBJ1|SYL_LEGPC,INTYGADT,590 44 | sp|Q5ZVU2|SYL_LEGPH,INTYGADT,590 45 | sp|Q5WWZ8|SYL_LEGPL,INTYGADT,590 46 | sp|Q4CNL4|TRM51_TRYCC,AFSLAGEN,458 47 | sp|Q4DPN8|TRM52_TRYCC,AFSLAGEN,456 48 | sp|A1RV13|UPP_PYRIL,REDIVIVA,125 49 | sp|Q9UVJ8|VATA_ASHGO,MAGALENA,1 50 | sp|Q5AJB1|VATA_CANAL,MAGALENA,1 51 | sp|P38078|VATA_CANTR,MAGALENA,1 52 | sp|P55650|Y4SG_SINFN,SYSSLING,313 53 | sp|Q2S9Y0|Y5895_HAHCH,LIKSIDIG,6 54 | --------------------------------------------------------------------------------