├── README.md
├── bioperl
    └── code
    │   └── R
    │       └── bioperl-l.R
├── brauer2007
    ├── brauer2007.Rmd
    ├── brauer2007.md
    └── brauer2007_files
    │   └── figure-gfm
    │       └── plot-top20-genes-1.png
├── citeulike
    └── code
    │   └── ruby
    │       └── cul2mongo.rb
├── maSigPro
    ├── gse59671.Rmd
    ├── gse59671.md
    └── gse59671_files
    │   └── figure-markdown_github
    │       ├── plot1-1.png
    │       ├── plot2-1.png
    │       ├── plot3-1.png
    │       └── plot4-1.png
├── ncbi
    ├── biosample
    │   └── code
    │   │   └── ruby
    │   │       └── cell_lines.rb
    ├── entrez_db_terms
    │   ├── README.md
    │   ├── code
    │   │   └── ruby
    │   │   │   └── entrez_db_terms.rb
    │   └── data
    │   │   ├── assembly.txt
    │   │   ├── bioproject.txt
    │   │   ├── biosample.txt
    │   │   ├── biosystems.txt
    │   │   ├── blastdbinfo.txt
    │   │   ├── books.txt
    │   │   ├── cdd.txt
    │   │   ├── clinvar.txt
    │   │   ├── clone.txt
    │   │   ├── dbvar.txt
    │   │   ├── epigenomics.txt
    │   │   ├── gap.txt
    │   │   ├── gapplus.txt
    │   │   ├── gds.txt
    │   │   ├── gencoll.txt
    │   │   ├── gene.txt
    │   │   ├── genome.txt
    │   │   ├── genomeprj.txt
    │   │   ├── geoprofiles.txt
    │   │   ├── gtr.txt
    │   │   ├── homologene.txt
    │   │   ├── journals.txt
    │   │   ├── medgen.txt
    │   │   ├── mesh.txt
    │   │   ├── ncbisearch.txt
    │   │   ├── nlmcatalog.txt
    │   │   ├── nuccore.txt
    │   │   ├── nucest.txt
    │   │   ├── nucgss.txt
    │   │   ├── nucleotide.txt
    │   │   ├── omim.txt
    │   │   ├── orgtrack.txt
    │   │   ├── pcassay.txt
    │   │   ├── pccompound.txt
    │   │   ├── pcsubstance.txt
    │   │   ├── pmc.txt
    │   │   ├── popset.txt
    │   │   ├── probe.txt
    │   │   ├── protein.txt
    │   │   ├── proteinclusters.txt
    │   │   ├── pubmed.txt
    │   │   ├── pubmedhealth.txt
    │   │   ├── seqannot.txt
    │   │   ├── snp.txt
    │   │   ├── sra.txt
    │   │   ├── structure.txt
    │   │   ├── taxonomy.txt
    │   │   ├── toolkit.txt
    │   │   ├── toolkitall.txt
    │   │   ├── toolkitbook.txt
    │   │   └── unigene.txt
    └── taxonomy
    │   ├── README.md
    │   └── virus_hosts
    │       ├── README.md
    │       ├── code
    │           └── ruby
    │           │   └── virus2host.rb
    │       └── data
    │           ├── host_count.txt
    │           └── virus_host.tsv
└── uniprot_words
    ├── code
        └── R
        │   └── match_words_uniprot.R
    └── data
        ├── word_matches_de.csv
        ├── word_matches_dk.csv
        ├── word_matches_en.csv
        ├── word_matches_es.csv
        ├── word_matches_fi.csv
        ├── word_matches_fr.csv
        ├── word_matches_it.csv
        ├── word_matches_nl.csv
        ├── word_matches_no.csv
        └── word_matches_se.csv


/README.md:
--------------------------------------------------------------------------------
 1 | # utils4bioinformatics
 2 | 
 3 | Little code snippets that do (hopefully) useful things.
 4 | 
 5 | ## Current contents
 6 | 
 7 | 1. ncbi/entrez_db_terms - lists searchable fields for all Entrez databases
 8 | 1. ncbi/taxonomy - utilities for working with the NCBI Taxonomy database
 9 | 1. citeulike - code for working with CiteULike collections
10 | 1. maSigPro - tutorial for the Bioconductor maSigPro package
11 | 1. brauer2007 - trying out random forest on yeast expression data
12 | 


--------------------------------------------------------------------------------
/bioperl/code/R/bioperl-l.R:
--------------------------------------------------------------------------------
 1 | # bioperl-l R
 2 | # plot the size of the monthly archives from bioperl-l mail list
 3 | 
 4 | library(XML)
 5 | library(stringr)
 6 | library(ggplot2)
 7 | 
 8 | # fetch the archive index page and keep the first HTML table found on it
 9 | archive_url <- "http://lists.open-bio.org/pipermail/bioperl-l/"
10 | bp <- readHTMLTable(archive_url, stringsAsFactors = FALSE)[[1]]
11 | 
12 | # pull the gzip size (number + unit) out of the text and convert KB / MB to bytes
13 | size <- str_match(bp$`Downloadable version`, "Text (\\d+) (\\w+) ")[, 2:3]
14 | multiplier <- ifelse(size[, 2] == "KB", 1024, ifelse(size[, 2] == "MB", 1024 * 1024, 1))
15 | bp$size <- as.numeric(size[, 1]) * multiplier
16 | 
17 | # drop the colon and insert day 1 (labels are presumably "Month Year:" - confirm)
18 | bp$date <- as.Date(gsub(" ", " 1 ", gsub(":", "", bp$Archive)), "%B %e %Y")
19 | 
20 | # bar chart of archive size by month
21 | ggplot(bp) +
22 |   geom_bar(aes(date, size), fill = "cornflowerblue", stat = "identity") +
23 |   theme_bw() + scale_x_date(date_breaks = "2 years") +
24 |   labs(x = "Date", y = "archive Gzip size (bytes)", title = "Approximate size of monthly Bioperl-l downloadable version 1996-present")
25 | 


--------------------------------------------------------------------------------
/brauer2007/brauer2007.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Can gene expression predict limiting nutrients in a random forest model?"
 3 | author: "Neil Saunders"
 4 | date: "`r Sys.time()`"
 5 | output:
 6 |   github_document:
 7 |     toc: true
 8 | ---
 9 | 
10 | ```{r setup, include=FALSE}
11 | knitr::opts_chunk$set(echo = TRUE,
12 |                       message = FALSE,
13 |                       warning = FALSE)
14 | 
15 | library(tidyverse)
16 | library(randomForest)
17 | library(randomForestExplainer)
18 | library(pander)
19 | 
20 | theme_set(theme_dark())
21 | ```
22 | 
23 | # Introduction
24 | Can we use random forest to predict which of 6 nutrients is limiting the growth of yeast, based on gene expression?
25 | 
26 | Inspired by a now-deleted question on Stack Overflow [r].
27 | 
28 | # Dataset
29 | We obtain a tidy version of the [Brauer 2008](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2174172/) dataset in CSV format from [this page](https://4va.github.io/biodatasci/r-dataviz-homework.html).
30 | 
31 | ```{r read-data}
32 | brauer2007_tidy <- read_csv("https://4va.github.io/biodatasci/data/brauer2007_tidy.csv")
33 | ```
34 | The dataset contains `r nrow(brauer2007_tidy)` rows. It can be thought of as 36 separate experiments: yeast are grown at 6 different growth rates, with 6 nutrients where one is supplied at limiting levels. Gene expression is measured in each case - the number of genes varies slightly but is around 5 500.
35 | 
36 | 
37 | # Random forest model
38 | We specify a classification model where the categorical outcome variable is the nutrient, and the predictors are gene expression values and rate. Rate is assumed not to be important in this model.
39 | 
40 | We used `set.seed` here for reproducibility, but it would not normally be used for random forest.
41 | 
42 | ```{r build-model}
43 | set.seed(1001)
44 | # reshape to one row per nutrient/rate with one column per gene (gaps filled
45 | # with 0), then fit a classification forest with nutrient as the outcome
46 | brauer2007_tidy_rf1 <- brauer2007_tidy %>%
47 |   select(systematic_name, nutrient, rate, expression) %>%
48 |   mutate(nutrient = factor(nutrient),
49 |          systematic_name = gsub("-", "minus", systematic_name)) %>%
50 |   spread(systematic_name, expression, fill = 0) %>%
51 |   randomForest(nutrient ~ ., data = ., localImp = TRUE, importance = TRUE)
52 | brauer2007_tidy_rf1
53 | ```
54 | 
55 | # Top 20 variables by importance
56 | We plot the expression of the top 20 most important variables (genes) by rate and nutrient.
57 | 
58 | `important_variables` is a function from the `randomForestExplainer` package.
59 | 
60 | ```{r plot-top20-genes}
61 | # expression against rate for the top 20 variables, one panel per gene
62 | brauer2007_tidy %>%
63 |   filter(systematic_name %in% important_variables(brauer2007_tidy_rf1, k = 20)) %>%
64 |   ggplot(aes(rate, expression, color = nutrient)) +
65 |   geom_line() +
66 |   facet_wrap(~systematic_name, ncol = 5) + scale_color_brewer(palette = "Set2")
67 | ```
68 | 
69 | # Research into a selection of the top 20 genes
70 | We select for each of the 6 nutrients, one gene from the top 20 with a distinctive expression pattern when that nutrient is limited.
71 | 
72 | Then we search the web using the term "gene name + nutrient" to see if there are any known associations, using resources such as the [Saccharomyces Genome Database](https://www.yeastgenome.org/).
73 | 
74 | We can say that the expression pattern under nutrient limitation "makes sense" for 5 of the genes, given what is known about their function. The exception is YLR108C, which is moderately up-regulated under phosphate limitation.
75 | 
76 | ```{r gene-function, echo=FALSE}
77 | genes <- c("YOR348C", "YOR374W", "YHR208W", "YLR108C", "YLL055W", "YKL216W")
78 | 
79 | brauer2007_tidy %>% 
80 |   filter(systematic_name %in% genes) %>% 
81 |   distinct(systematic_name, bp) %>% 
82 |   arrange(systematic_name) %>% 
83 |   bind_cols(nutrient = c("leucine", "uracil", "sulfate", "phosphate", "ammonia", "glucose"),
84 |             search_results = c("[Pathways - leucine biosynthesis](https://www.yeastgenome.org/locus/S000001251)", "[URA1 - null mutant requires uracil](https://www.yeastgenome.org/locus/S000001699)", "[Cysteine transporter; null mutant absent utilization of sulfur source](https://www.yeastgenome.org/locus/S000003978)", "", "[Proline permease; repressed in ammonia-grown cells](https://www.yeastgenome.org/locus/S000005875)", "[Aldehyde dehydrogenase; expression is glucose-repressed](https://www.yeastgenome.org/locus/S000005901)")) %>% 
85 |   pander(split.table = Inf)
86 | ```
87 | 
88 | # Summary
89 | Random forest would be far from my first method of choice for this problem. It would be more usual to determine first which genes were differentially-expressed, then go back and examine the nutrient limitation data. However, random forest does seem to have identified genes that are differentially expressed under nutrient limitation, and which have known biological functions consistent with their expression in the Brauer data.
90 | 


--------------------------------------------------------------------------------
/brauer2007/brauer2007.md:
--------------------------------------------------------------------------------
  1 | Can gene expression predict limiting nutrients in a random forest model?
  2 | ================
  3 | Neil Saunders
  4 | 2019-06-26 21:40:27
  5 | 
  6 | # Introduction
  7 | 
  8 | Can we use random forest to predict which of 6 nutrients is limiting the
  9 | growth of yeast, based on gene expression?
 10 | 
 11 | Inspired by a now-deleted question on Stack Overflow \[r\].
 12 | 
 13 | # Dataset
 14 | 
 15 | We obtain a tidy version of the
 16 | [Brauer 2008](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2174172/)
 17 | dataset in CSV format from [this
 18 | page](https://4va.github.io/biodatasci/r-dataviz-homework.html).
 19 | 
 20 | ``` r
 21 | brauer2007_tidy <- read_csv("https://4va.github.io/biodatasci/data/brauer2007_tidy.csv")
 22 | ```
 23 | 
 24 | The dataset contains 198430 rows. It can be thought of as 36 separate
 25 | experiments: yeast are grown at 6 different growth rates, with 6
 26 | nutrients where one is supplied at limiting levels. Gene expression is
 27 | measured in each case - the number of genes varies slightly but is
 28 | around 5 500.
 29 | 
 30 | # Random forest model
 31 | 
 32 | We specify a classification model where the categorical outcome variable
 33 | is the nutrient, and the predictors are gene expression values and rate.
 34 | Rate is assumed not to be important in this model.
 35 | 
 36 | We used `set.seed` here for reproducibility, but it would not normally
 37 | be used for random forest.
 38 | 
 39 | ``` r
 40 | set.seed(1001)
 41 | 
 42 | brauer2007_tidy_rf1 <- brauer2007_tidy %>% 
 43 |   mutate(systematic_name = gsub("-", "minus", systematic_name), 
 44 |          nutrient = factor(nutrient)) %>% 
 45 |   select(systematic_name, nutrient, rate, expression) %>% 
 46 |   spread(systematic_name, expression, fill = 0) %>% 
 47 |   randomForest(nutrient ~ ., data = ., localImp = TRUE, importance = TRUE)
 48 | 
 49 | brauer2007_tidy_rf1
 50 | ```
 51 | 
 52 |     ## 
 53 |     ## Call:
 54 |     ##  randomForest(formula = nutrient ~ ., data = ., localImp = TRUE,      importance = TRUE) 
 55 |     ##                Type of random forest: classification
 56 |     ##                      Number of trees: 500
 57 |     ## No. of variables tried at each split: 74
 58 |     ## 
 59 |     ##         OOB estimate of  error rate: 5.56%
 60 |     ## Confusion matrix:
 61 |     ##           Ammonia Glucose Leucine Phosphate Sulfate Uracil class.error
 62 |     ## Ammonia         6       0       0         0       0      0   0.0000000
 63 |     ## Glucose         0       6       0         0       0      0   0.0000000
 64 |     ## Leucine         0       1       5         0       0      0   0.1666667
 65 |     ## Phosphate       0       0       0         6       0      0   0.0000000
 66 |     ## Sulfate         0       0       0         0       6      0   0.0000000
 67 |     ## Uracil          0       1       0         0       0      5   0.1666667
 68 | 
 69 | # Top 20 variables by importance
 70 | 
 71 | We plot the expression of the top 20 most important variables (genes) by
 72 | rate and nutrient.
 73 | 
 74 | `important_variables` is a function from the `randomForestExplainer`
 75 | package.
 76 | 
 77 | ``` r
 78 | brauer2007_tidy %>% 
 79 |   filter(systematic_name %in% important_variables(brauer2007_tidy_rf1, k = 20)) %>% 
 80 |   ggplot(aes(rate, expression)) + 
 81 |   geom_line(aes(color = nutrient)) + 
 82 |   facet_wrap(~systematic_name, ncol = 5) + 
 83 |   scale_color_brewer(palette = "Set2")
 84 | ```
 85 | 
 86 | ![](brauer2007_files/figure-gfm/plot-top20-genes-1.png)<!-- -->
 87 | 
 88 | # Research into a selection of the top 20 genes
 89 | 
 90 | We select for each of the 6 nutrients, one gene from the top 20 with a
 91 | distinctive expression pattern when that nutrient is limited.
 92 | 
 93 | Then we search the web using the term “gene name + nutrient” to see if
 94 | there are any known associations, using resources such as the
 95 | [Saccharomyces Genome Database](https://www.yeastgenome.org/).
 96 | 
 97 | We can say that the expression pattern under nutrient limitation “makes
 98 | sense” for 5 of the genes, given what is known about their function. The
 99 | exception is YLR108C, which is moderately up-regulated under phosphate
100 | limitation.
101 | 
102 | | systematic\_name |                       bp                        | nutrient  |                                                    search\_results                                                    |
103 | | :--------------: | :---------------------------------------------: | :-------: | :-------------------------------------------------------------------------------------------------------------------: |
104 | |     YHR208W      | branched chain family amino acid biosynthesis\* |  leucine  |                    [Pathways - leucine biosynthesis](https://www.yeastgenome.org/locus/S000001251)                    |
105 | |     YKL216W      |     ‘de novo’ pyrimidine base biosynthesis      |  uracil   |                  [URA1 - null mutant requires uracil](https://www.yeastgenome.org/locus/S000001699)                   |
106 | |     YLL055W      |           biological process unknown            |  sulfate  | [Cysteine transporter; null mutant absent utilization of sulfur source](https://www.yeastgenome.org/locus/S000003978) |
107 | |     YLR108C      |           biological process unknown            | phosphate |                                                                                                                       |
108 | |     YOR348C      |              proline catabolism\*               |  ammonia  |          [Proline permease; repressed in ammonia-grown cells](https://www.yeastgenome.org/locus/S000005875)           |
109 | |     YOR374W      |               ethanol metabolism                |  glucose  |        [Aldehyde dehydrogenase; expression is glucose-repressed](https://www.yeastgenome.org/locus/S000005901)        |
110 | 
111 | # Summary
112 | 
113 | Random forest would be far from my first method of choice for this
114 | problem. It would be more usual to determine first which genes were
115 | differentially-expressed, then go back and examine the nutrient
116 | limitation data. However, random forest does seem to have identified
117 | genes that are differentially expressed under nutrient limitation, and
118 | which have known biological functions consistent with their expression
119 | in the Brauer data.
120 | 


--------------------------------------------------------------------------------
/brauer2007/brauer2007_files/figure-gfm/plot-top20-genes-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neilfws/utils4bioinformatics/d689dbac6a2e6959ab369ec76596964d108c599e/brauer2007/brauer2007_files/figure-gfm/plot-top20-genes-1.png


--------------------------------------------------------------------------------
/citeulike/code/ruby/cul2mongo.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/ruby
 2 | 
 3 | # save CiteULike JSON in mongodb database
 4 | def json2mongo(db = "citeulike", col = "articles", user = "neils")
 5 |     require "mongo"
 6 |     require "json/pure"
 7 |     require "open-uri"
 8 | 
 9 |     puts "Fetching JSON..."
10 |     db  = Mongo::Connection.new.db(db)
11 |     col = db.collection(col)
12 |     url = "http://www.citeulike.org/json/user/" + user
13 |     j   = JSON.parse(open(url).read)
14 |     j.each do |article|
15 |         article[:_id] = article['article_id']
16 |         col.save(article)
17 |     end
18 |     puts "Done. Collection contains: #{col.count} articles."
19 | end
20 | 
21 | # run with default options
22 | json2mongo
23 | 


--------------------------------------------------------------------------------
/maSigPro/gse59671.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Analysis of gene expression timecourse data using maSigPro"
  3 | author: "Neil Saunders"
  4 | date: "`r Sys.time()`"
  5 | output:
  6 |   github_document:
  7 |     toc: true
  8 | ---
  9 | 
 10 | ```{r setup, include=FALSE}
 11 | knitr::opts_chunk$set(echo = FALSE, 
 12 |                       message = FALSE, 
 13 |                       warning = FALSE)
 14 | library(tidyverse)
 15 | library(GEOquery)
 16 | library(maSigPro)
 17 | library(biomaRt)
 18 | library(pander)
 19 | 
 20 | theme_set(theme_bw())
 21 | 
 22 | # Add a "gene" column of HGNC symbols to sig, looked up from its rownames via mart bm
 23 | getGenes <- function(sig, bm) {
 24 |   probes <- rownames(sig)
 25 |   lookup <- getBM(attributes = c("affy_hg_u133a_2", "hgnc_symbol"),
 26 |                   filters = "affy_hg_u133a_2", values = probes, mart = bm)
 27 |   # match() keeps the first hit per probeset; probesets without a hit get NA
 28 |   sig$gene <- lookup$hgnc_symbol[match(probes, lookup$affy_hg_u133a_2)]
 29 |   sig
 30 | }
 31 | 
 32 | # Plot the expression of one probeset over time: a box per timepoint, jittered
 33 | # points coloured by treatment and a smoothed trend line.
 34 | #   e: expression matrix indexed by probeset row name; probe: row of e to plot;
 35 | #   g: gene label used in the title; md: sample data with Replicate, Time and
 36 | #   agent, rows assumed to be in the same order as the columns of e.
 37 | # Returns the ggplot object.
 38 | plotGenes <- function(e, probe, g, md) {
 39 |   # index by the probe argument (was the global p, which ignored the argument)
 40 |   d <- e[probe, ] %>% as.data.frame() %>% setNames("value") %>%
 41 |     mutate(Rep = md$Replicate, time = md$Time, agent = md$agent)
 42 |   ggplot(d, aes(time, value)) +
 43 |     # group (position is not an aesthetic) gives one box per timepoint
 44 |     geom_boxplot(aes(group = factor(time)), outlier.shape = NA) +
 45 |     scale_x_continuous(breaks = unique(d$time)) +
 46 |     geom_jitter(aes(color = factor(agent))) +
 47 |     geom_smooth() +
 48 |     labs(title = paste(g, probe, sep = "/"), x = "time (hours)", y = "RMA value") +
 49 |     scale_color_discrete(name = "treatment")
 50 | }
 51 | ```
 52 | 
 53 | # Introduction
 54 | This tutorial looks at how to use the Bioconductor package [maSigPro](http://www.bioconductor.org/packages/release/bioc/html/maSigPro.html) to analyse the expression of genes over time.
 55 | 
 56 | # Retrieving data using GEOquery
 57 | First, we search the [NCBI GEO database](http://www.ncbi.nlm.nih.gov/geo) for suitable public datasets. The experimental design criteria that we would like to satisfy are:
 58 | 
 59 | - several timepoints
 60 | - several samples per timepoint
 61 | - clarity in how expression data were obtained (normalisation, log2 values)
 62 | - of some biological interest (easy to link to other data sources)
 63 | 
 64 | Datasets that satisfy these criteria are not easy to discover. Eventually we settled on the study titled [Celecoxib, rofecoxib treated human smooth muscle cells microarray timecourse (GSE59671)](http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE59671). In this study cells were pre-treated with one of two drugs then exposed to a protein, interleukin 1-beta, involved in inflammatory responses. Four biological replicates for each drug treatment and untreated controls were analysed at 0, 2, 8 and 24 hours post-IL1b exposure, generating 4 x 3 x 4 = 48 samples. A further 4 control samples were collected at "-2 hours", i.e. prior to IL1b-exposure.
 65 | 
 66 | ## Expression data
 67 | First we fetch the GEO series dataset using GEOquery. Getting the expression values is easy using _exprs()_. A quick check to see that they "look like" normalised log2 values (it's stated that they are at the GEO website).
 68 | 
 69 | ```{r getGEO, cache=TRUE}
 70 | gse <- getGEO("GSE59671")
 71 | exp <- exprs(gse$GSE59671_series_matrix.txt.gz)
 72 | 
 73 | exp %>%
 74 |   as.data.frame() %>%
 75 |   dplyr::select(1:4) %>%
 76 |   slice(1:4) %>%
 77 |   pander(justify = "right")
 78 | ```
 79 | 
 80 | ## Phenotypic data
 81 | "Phenotypic" data, the details of the experimental design, can be accessed using _pData()_.
 82 | 
 83 | ```{r pdata}
 84 | pd <- pData(gse$GSE59671_series_matrix.txt.gz)
 85 | names(pd)
 86 | ```
 87 | 
 88 | We can see that times and treatments are captured in the title attribute.
 89 | ```{r}
 90 | pd$title[1] %>%
 91 |   as.character()
 92 | ```
 93 | 
 94 | # Creating a design matrix
 95 | We can parse the sample titles using _str\_match_ from the _stringr_ package, then do some data cleaning to create the data frame used for the design matrix.
 96 | 
 97 | ```{r designdata}
 98 | pd.des <- str_match(pd$title, "^(.*?)_(.*?)_(.*?)_(.*?)\\s+(.*?)$")[, 2:6] %>% 
 99 |   as.data.frame() %>% 
100 |   setNames(c("cell", "agent", "Time", "bio", "Replicate")) %>% 
101 |   mutate(bio = NULL,
102 |          hasmc = ifelse(cell == "hasmc", 1, 0),
103 |          Control = ifelse(agent == "none", 1, 0),
104 |          celecoxib = ifelse(agent == "celecoxib", 1, 0),
105 |          rofecoxib = ifelse(agent == "rofecoxib", 1, 0),
106 |          Time = gsub("tp", "", Time),
107 |          Time = gsub("hr", "", Time),
108 |          Time = as.numeric(Time))
109 | 
110 | pd.des %>%
111 |   slice(1:5) %>%
112 |   pander(justify = "right")
113 | ```
114 | 
115 | The last steps are to drop the -2 hour samples (to simplify things), number the replicates correctly and, importantly, add the GEO sample names as row names in the data frame <code>pd.des</code>, so that the expression data matches the phenotypic data.
116 | 
117 | Now we can make the design matrix from the data frame with degree = 3 (4 timepoints - 1). Note that times and replicates are given their numeric values; conditions (cell type, control or treated) are signified by values of 0 or 1.
118 | 
119 | ```{r design}
120 | exp.des <- exp[, c(1:16, 21:52)]
121 | pd.des <- pd.des[c(1:16, 21:52), ]
122 | pd.des$Replicate <- rep(1:12, 1, each = 4)
123 | rownames(pd.des) <- pd$geo_accession[c(1:16, 21:52)]
124 | 
125 | # now we can make the design matrix from the appropriate columns
126 | design <- make.design.matrix(pd.des[, c(3, 4, 6:8)], degree = 3)
127 | 
128 | design$edesign %>%
129 |   as.data.frame() %>% 
130 |   slice(1:5) %>% 
131 |   pander(justify = "right")
132 | ```
133 | 
134 | # Fitting the regression model
135 | ## From regression model to significant genes
136 | Now we proceed exactly as described in the maSigPro users guide, fitting a regression model to discover probesets with significant differential expression over time. The functions _p.vector()_ and _T.fit()_ use _print()_ to report progress, so we're hiding that output here using _capture.output()_.
137 | 
138 | ```{r regression}
139 | hide <- capture.output(fit <- p.vector(exp.des, design))
140 | hide <- capture.output(tstep <- T.fit(fit, step.method = "backward", alfa = 0.05))
141 | sigs <- get.siggenes(tstep, rsq = 0.6, vars = "groups")
142 | ```
143 | 
144 | The list _sigs_ is a surprisingly complex object.
145 | 
146 | ```{r siggenes}
147 | sigs %>%
148 |   glimpse()
149 | ```
150 | 
151 | Detailed information about significant genes is stored in the list _sig.genes_. Since we specified <code>vars = "groups"</code>, times and treatments are returned together for each treatment. So for example, _sigs$sig.genes$Control_ returns data for the Control (untreated) time points; _sigs$sig.genes$celecoxibvsControl_ returns data for time points with the contrast celecoxib treatment versus Control.
152 | 
153 | We can get the data frames with p-values for control, celecoxib- and rofecoxib-treated cells.
154 | 
155 | ```{r pvals}
156 | control   <- sigs$sig.genes$Control$sig.pvalues
157 | celecoxib <- sigs$sig.genes$celecoxibvsControl$sig.pvalues
158 | rofecoxib <- sigs$sig.genes$rofecoxibvsControl$sig.pvalues
159 | ```
160 | 
161 | ## Matching probesets to genes using biomaRt
162 | Next, we write a function that uses biomaRt to fetch HGNC gene symbols for the probesets.
163 | 
164 | ```{r getGenes}
165 | mart.hs <- useMart("ensembl", "hsapiens_gene_ensembl")
166 | control   <- getGenes(control, mart.hs)
167 | celecoxib <- getGenes(celecoxib, mart.hs)  # annotate celecoxib's own p-values, not rofecoxib's
168 | rofecoxib <- getGenes(rofecoxib, mart.hs)
169 | ```
170 | 
171 | Now we're ready to look at "interesting genes".
172 | 
173 | # Plotting timecourses for genes of interest
174 | 
175 | ## Control samples
176 | Let's start with the control samples (no drug treatment), sorting on the p-value column. We then write a function that uses ggplot2 to plot the RMA expression values for the probeset corresponding to a given gene. We'll test it with the first probeset.
177 | 
178 | ```{r plot1, fig.height=6, fig.width=9}
179 | # head(control[order(control$`p-value`, decreasing = FALSE), ])
180 | 
181 | p    <- rownames(control[order(control$`p-value`, decreasing = FALSE), ])[1]
182 | gene <- ifelse(is.na(subset(control, rownames(control) == p)$gene), p, subset(control, rownames(control) == p)$gene)
183 | plotGenes(exp.des, p, gene, pd.des)
184 | ```
185 | 
186 | Expression of the gene CD83 rises dramatically in the first 2 hours after exposure to IL1b, then drops back to the base level by 8 hours. A quick Google search for the term "CD83 inflammation" indicates that the gene is involved with inflammatory responses.
187 | 
188 | Let's try another - say, number 6 in the list.
189 | 
190 | ```{r plot2, fig.height=6, fig.width=9}
191 | p <- rownames(control[order(control$`p-value`, decreasing = FALSE), ])[6]
192 | gene <- ifelse(is.na(subset(control, rownames(control) == p)$gene), p, subset(control, rownames(control) == p)$gene)
193 | plotGenes(exp.des, p, gene, pd.des)
194 | ```
195 | 
196 | The ANXA11 gene is also involved with inflammation and shows the opposite behaviour to CD83; expression drops ~ 3-4 fold in the first 8 hours, then rises gradually from 8-24 hours.
197 | 
198 | ## Treated samples
199 | Did the drug treatments make any difference to the IL1b response? Let's start with the best p-value for celecoxib versus Control.
200 | 
201 | ```{r plot3, fig.height=6, fig.width=9}
202 | p    <- rownames(celecoxib[order(celecoxib$p.valor_celecoxibvsControl, decreasing = FALSE), ])[1]
203 | gene <- ifelse(is.na(subset(celecoxib, rownames(celecoxib) == p)$gene), p, subset(celecoxib, rownames(celecoxib) == p)$gene)
204 | plotGenes(exp.des, p, gene, pd.des)
205 | ```
206 | 
207 | There is some indication that expression of JRK post-IL1b exposure was lowered less between 0-2 hours in celecoxib-treated cells, but it's not incredibly convincing as the fold-change overall is quite low under all conditions.  What about rofecoxib?
208 | 
209 | ```{r plot4, fig.height=6, fig.width=9}
210 | p    <- rownames(rofecoxib[order(rofecoxib$p.valor_rofecoxibvsControl, decreasing = FALSE), ])[1]
211 | gene <- ifelse(is.na(subset(rofecoxib, rownames(rofecoxib) == p)$gene), p, subset(rofecoxib, rownames(rofecoxib) == p)$gene)
212 | plotGenes(exp.des, p, gene, pd.des)
213 | ```
214 | 
215 | It appears that this p-value is driven by higher PIK3C2B expression in 3/4 rofecoxib-treated samples at t = 0. Perhaps the less-than-dramatic effects of drug treatment explain why this GEO series is not associated with a publication.
216 | 
217 | # Summary
218 | * maSigPro is a useful, effective package for analysis of timecourse microarray data
219 | * It combines well with ggplot2 to generate attractive and informative plots of gene expression over time
220 | * The example dataset GSE59671 reveals some interesting effects on the expression of inflammation-associated genes when cells are exposed to IL1b
221 | 


--------------------------------------------------------------------------------
/maSigPro/gse59671.md:
--------------------------------------------------------------------------------
  1 | Analysis of gene expression timecourse data using maSigPro
  2 | ================
  3 | Neil Saunders
  4 | 2018-04-03 21:50:36
  5 | 
  6 | -   [Introduction](#introduction)
  7 | -   [Retrieving data using GEOquery](#retrieving-data-using-geoquery)
  8 |     -   [Expression data](#expression-data)
  9 |     -   [Phenotypic data](#phenotypic-data)
 10 | -   [Creating a design matrix](#creating-a-design-matrix)
 11 | -   [Fitting the regression model](#fitting-the-regression-model)
 12 |     -   [From regression model to significant genes](#from-regression-model-to-significant-genes)
 13 |     -   [Matching probesets to genes using biomaRt](#matching-probesets-to-genes-using-biomart)
 14 | -   [Plotting timecourses for genes of interest](#plotting-timecourses-for-genes-of-interest)
 15 |     -   [Control samples](#control-samples)
 16 |     -   [Treated samples](#treated-samples)
 17 | -   [Summary](#summary)
 18 | 
 19 | Introduction
 20 | ============
 21 | 
 22 | This tutorial looks at how to use the Bioconductor package [maSigPro](http://www.bioconductor.org/packages/release/bioc/html/maSigPro.html) to analyse the expression of genes over time.
 23 | 
 24 | Retrieving data using GEOquery
 25 | ==============================
 26 | 
 27 | First, we search the [NCBI GEO database](http://www.ncbi.nlm.nih.gov/geo) for suitable public datasets. The experimental design criteria that we would like to satisfy are:
 28 | 
 29 | -   several timepoints
 30 | -   several samples per timepoint
 31 | -   clarity in how expression data were obtained (normalisation, log2 values)
 32 | -   of some biological interest (easy to link to other data sources)
 33 | 
 34 | Datasets that satisfy these criteria are not easy to discover. Eventually we settled on the study titled [Celecoxib, rofecoxib treated human smooth muscle cells microarray timecourse (GSE59671)](http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE59671). In this study cells were pre-treated with one of two drugs then exposed to a protein, interleukin 1-beta, involved in inflammatory responses. Four biological replicates for each drug treatment and untreated controls were analysed at 0, 2, 8 and 24 hours post-IL1b exposure, generating 4 x 3 x 4 = 48 samples. A further 4 control samples were collected at "-2 hours", i.e. prior to IL1b-exposure.
 35 | 
 36 | Expression data
 37 | ---------------
 38 | 
 39 | First we fetch the GEO series dataset using GEOquery. Getting the expression values is easy using *exprs()*. A quick check to see that they "look like" normalised log2 values (it's stated that they are at the GEO website).
 40 | 
 41 | <table style="width:72%;">
 42 | <colgroup>
 43 | <col width="18%" />
 44 | <col width="18%" />
 45 | <col width="18%" />
 46 | <col width="18%" />
 47 | </colgroup>
 48 | <thead>
 49 | <tr class="header">
 50 | <th align="right">GSM1442176</th>
 51 | <th align="right">GSM1442177</th>
 52 | <th align="right">GSM1442178</th>
 53 | <th align="right">GSM1442179</th>
 54 | </tr>
 55 | </thead>
 56 | <tbody>
 57 | <tr class="odd">
 58 | <td align="right">9.229</td>
 59 | <td align="right">9.371</td>
 60 | <td align="right">9.369</td>
 61 | <td align="right">9.458</td>
 62 | </tr>
 63 | <tr class="even">
 64 | <td align="right">7.297</td>
 65 | <td align="right">7.113</td>
 66 | <td align="right">7.315</td>
 67 | <td align="right">7.265</td>
 68 | </tr>
 69 | <tr class="odd">
 70 | <td align="right">2.37</td>
 71 | <td align="right">2.412</td>
 72 | <td align="right">2.405</td>
 73 | <td align="right">2.424</td>
 74 | </tr>
 75 | <tr class="even">
 76 | <td align="right">5.686</td>
 77 | <td align="right">5.719</td>
 78 | <td align="right">5.941</td>
 79 | <td align="right">5.725</td>
 80 | </tr>
 81 | </tbody>
 82 | </table>
 83 | 
 84 | Phenotypic data
 85 | ---------------
 86 | 
 87 | "Phenotypic" data, the details of the experimental design, can be accessed using *pData()*.
 88 | 
 89 |     ##  [1] "title"                    "geo_accession"           
 90 |     ##  [3] "status"                   "submission_date"         
 91 |     ##  [5] "last_update_date"         "type"                    
 92 |     ##  [7] "channel_count"            "source_name_ch1"         
 93 |     ##  [9] "organism_ch1"             "characteristics_ch1"     
 94 |     ## [11] "characteristics_ch1.1"    "characteristics_ch1.2"   
 95 |     ## [13] "characteristics_ch1.3"    "characteristics_ch1.4"   
 96 |     ## [15] "characteristics_ch1.5"    "biomaterial_provider_ch1"
 97 |     ## [17] "treatment_protocol_ch1"   "growth_protocol_ch1"     
 98 |     ## [19] "molecule_ch1"             "extract_protocol_ch1"    
 99 |     ## [21] "label_ch1"                "label_protocol_ch1"      
100 |     ## [23] "taxid_ch1"                "hyb_protocol"            
101 |     ## [25] "scan_protocol"            "description"             
102 |     ## [27] "data_processing"          "platform_id"             
103 |     ## [29] "contact_name"             "contact_email"           
104 |     ## [31] "contact_laboratory"       "contact_department"      
105 |     ## [33] "contact_institute"        "contact_address"         
106 |     ## [35] "contact_city"             "contact_state"           
107 |     ## [37] "contact_zip/postal_code"  "contact_country"         
108 |     ## [39] "supplementary_file"       "data_row_count"          
109 |     ## [41] "relation"                 "cell type:ch1"           
110 |     ## [43] "gender:ch1"               "material type:ch1"       
111 |     ## [45] "nsaid treatment:ch1"      "race:ch1"                
112 |     ## [47] "time point:ch1"
113 | 
114 | We can see that times and treatments are captured in the title attribute.
115 | 
116 |     ## [1] "hasmc_celecoxib_tp0hr_biological rep1"
117 | 
118 | Creating a design matrix
119 | ========================
120 | 
121 | We can parse the sample titles using *str\_match* from the *stringr* package, then do some data cleaning to create the data frame used for the design matrix.
122 | 
123 | <table style="width:100%;">
124 | <colgroup>
125 | <col width="9%" />
126 | <col width="14%" />
127 | <col width="8%" />
128 | <col width="14%" />
129 | <col width="9%" />
130 | <col width="12%" />
131 | <col width="14%" />
132 | <col width="14%" />
133 | </colgroup>
134 | <thead>
135 | <tr class="header">
136 | <th align="right">cell</th>
137 | <th align="right">agent</th>
138 | <th align="right">Time</th>
139 | <th align="right">Replicate</th>
140 | <th align="right">hasmc</th>
141 | <th align="right">Control</th>
142 | <th align="right">celecoxib</th>
143 | <th align="right">rofecoxib</th>
144 | </tr>
145 | </thead>
146 | <tbody>
147 | <tr class="odd">
148 | <td align="right">hasmc</td>
149 | <td align="right">celecoxib</td>
150 | <td align="right">0</td>
151 | <td align="right">rep1</td>
152 | <td align="right">1</td>
153 | <td align="right">0</td>
154 | <td align="right">1</td>
155 | <td align="right">0</td>
156 | </tr>
157 | <tr class="even">
158 | <td align="right">hasmc</td>
159 | <td align="right">celecoxib</td>
160 | <td align="right">0</td>
161 | <td align="right">rep2</td>
162 | <td align="right">1</td>
163 | <td align="right">0</td>
164 | <td align="right">1</td>
165 | <td align="right">0</td>
166 | </tr>
167 | <tr class="odd">
168 | <td align="right">hasmc</td>
169 | <td align="right">celecoxib</td>
170 | <td align="right">0</td>
171 | <td align="right">rep3</td>
172 | <td align="right">1</td>
173 | <td align="right">0</td>
174 | <td align="right">1</td>
175 | <td align="right">0</td>
176 | </tr>
177 | <tr class="even">
178 | <td align="right">hasmc</td>
179 | <td align="right">celecoxib</td>
180 | <td align="right">0</td>
181 | <td align="right">rep4</td>
182 | <td align="right">1</td>
183 | <td align="right">0</td>
184 | <td align="right">1</td>
185 | <td align="right">0</td>
186 | </tr>
187 | <tr class="odd">
188 | <td align="right">hasmc</td>
189 | <td align="right">celecoxib</td>
190 | <td align="right">2</td>
191 | <td align="right">rep1</td>
192 | <td align="right">1</td>
193 | <td align="right">0</td>
194 | <td align="right">1</td>
195 | <td align="right">0</td>
196 | </tr>
197 | </tbody>
198 | </table>
199 | 
200 | The last steps are to drop the -2 hour samples (to simplify things), number the replicates correctly and, importantly, add the GEO sample names as row names in the data frame <code>pd.res</code>, so that the expression data matches the phenotypic data.
201 | 
202 | Now we can make the design matrix from the data frame with degree = 3 (4 timepoints - 1). Note that times and replicates are given their numeric values; conditions (cell type, control or treated) are signified by values of 0 or 1.
203 | 
204 | <table style="width:74%;">
205 | <colgroup>
206 | <col width="9%" />
207 | <col width="16%" />
208 | <col width="13%" />
209 | <col width="16%" />
210 | <col width="16%" />
211 | </colgroup>
212 | <thead>
213 | <tr class="header">
214 | <th align="right">Time</th>
215 | <th align="right">Replicate</th>
216 | <th align="right">Control</th>
217 | <th align="right">celecoxib</th>
218 | <th align="right">rofecoxib</th>
219 | </tr>
220 | </thead>
221 | <tbody>
222 | <tr class="odd">
223 | <td align="right">0</td>
224 | <td align="right">1</td>
225 | <td align="right">0</td>
226 | <td align="right">1</td>
227 | <td align="right">0</td>
228 | </tr>
229 | <tr class="even">
230 | <td align="right">0</td>
231 | <td align="right">1</td>
232 | <td align="right">0</td>
233 | <td align="right">1</td>
234 | <td align="right">0</td>
235 | </tr>
236 | <tr class="odd">
237 | <td align="right">0</td>
238 | <td align="right">1</td>
239 | <td align="right">0</td>
240 | <td align="right">1</td>
241 | <td align="right">0</td>
242 | </tr>
243 | <tr class="even">
244 | <td align="right">0</td>
245 | <td align="right">1</td>
246 | <td align="right">0</td>
247 | <td align="right">1</td>
248 | <td align="right">0</td>
249 | </tr>
250 | <tr class="odd">
251 | <td align="right">2</td>
252 | <td align="right">2</td>
253 | <td align="right">0</td>
254 | <td align="right">1</td>
255 | <td align="right">0</td>
256 | </tr>
257 | </tbody>
258 | </table>
259 | 
260 | Fitting the regression model
261 | ============================
262 | 
263 | From regression model to significant genes
264 | ------------------------------------------
265 | 
266 | Now we proceed exactly as described in the maSigPro users guide, fitting a regression model to discover probesets with significant differential expression over time. The functions *p.vector()* and *T.fit()* use *print()* to report progress, so we're hiding that output here using *capture.output()*.
267 | 
268 | The list *sigs* is a surprisingly complex object.
269 | 
270 |     ## List of 2
271 |     ##  $ sig.genes:List of 3
272 |     ##   ..$ Control           :List of 7
273 |     ##   .. ..$ sig.profiles :'data.frame': 4757 obs. of  48 variables:
274 |     ##   .. ..$ coefficients :'data.frame': 4757 obs. of  12 variables:
275 |     ##   .. ..$ group.coeffs :'data.frame': 4757 obs. of  12 variables:
276 |     ##   .. ..$ sig.pvalues  :'data.frame': 4757 obs. of  14 variables:
277 |     ##   .. ..$ g            : int 4757
278 |     ##   .. ..$ edesign      :'data.frame': 48 obs. of  5 variables:
279 |     ##   .. ..$ groups.vector: chr [1:12] "Control" "celecoxibvsControl" "rofecoxibvsControl" "Control" ...
280 |     ##   ..$ celecoxibvsControl:List of 7
281 |     ##   .. ..$ sig.profiles :'data.frame': 769 obs. of  48 variables:
282 |     ##   .. ..$ coefficients :'data.frame': 769 obs. of  12 variables:
283 |     ##   .. ..$ group.coeffs :'data.frame': 769 obs. of  12 variables:
284 |     ##   .. ..$ sig.pvalues  :'data.frame': 769 obs. of  14 variables:
285 |     ##   .. ..$ g            : int 769
286 |     ##   .. ..$ edesign      :'data.frame': 48 obs. of  5 variables:
287 |     ##   .. ..$ groups.vector: chr [1:12] "Control" "celecoxibvsControl" "rofecoxibvsControl" "Control" ...
288 |     ##   ..$ rofecoxibvsControl:List of 7
289 |     ##   .. ..$ sig.profiles :'data.frame': 899 obs. of  48 variables:
290 |     ##   .. ..$ coefficients :'data.frame': 899 obs. of  12 variables:
291 |     ##   .. ..$ group.coeffs :'data.frame': 899 obs. of  12 variables:
292 |     ##   .. ..$ sig.pvalues  :'data.frame': 899 obs. of  14 variables:
293 |     ##   .. ..$ g            : int 899
294 |     ##   .. ..$ edesign      :'data.frame': 48 obs. of  5 variables:
295 |     ##   .. ..$ groups.vector: chr [1:12] "Control" "celecoxibvsControl" "rofecoxibvsControl" "Control" ...
296 |     ##  $ summary  :'data.frame':   4757 obs. of  3 variables:
297 |     ##   ..$ Control           : Factor w/ 4757 levels "1053_at","1294_at",..: 1 2 3 4 5 6 7 8 9 10 ...
298 |     ##   ..$ celecoxibvsControl: Factor w/ 770 levels " ","1053_at",..: 2 3 4 5 6 7 8 9 10 11 ...
299 |     ##   ..$ rofecoxibvsControl: Factor w/ 900 levels " ","1053_at",..: 2 3 4 5 6 7 8 9 10 11 ...
300 | 
301 | Detailed information about significant genes is stored in the list *sig.genes*. Since we specified <code>vars = "groups"</code>, times and treatments are returned together for each treatment. So for example, <code>sigs$sig.genes$Control</code> returns data for the Control (untreated) time points; <code>sigs$sig.genes$celecoxibvsControl</code> returns data for time points with the contrast celecoxib treatment versus Control.
302 | 
303 | We can get the data frames with p-values for control, celecoxib- and rofecoxib-treated cells.
304 | 
305 | Matching probesets to genes using biomaRt
306 | -----------------------------------------
307 | 
308 | Next, we write a function that uses biomaRt to fetch HGNC gene symbols for the probesets.
309 | 
310 | Now we're ready to look at "interesting genes".
311 | 
312 | Plotting timecourses for genes of interest
313 | ==========================================
314 | 
315 | Control samples
316 | ---------------
317 | 
318 | Let's start with the control samples (no drug treatment), sorting on the p-value column. We then write a function that uses ggplot2 to plot the RMA expression values for the probeset corresponding to a given gene. We'll test it with the first probeset.
319 | 
320 | ![](gse59671_files/figure-markdown_github/plot1-1.png)
321 | 
322 | Expression of the gene CD83 rises dramatically in the first 2 hours after exposure to IL1b, then drops back to the base level by 8 hours. A quick Google search for the term "CD83 inflammation" indicates that the gene is involved with inflammatory responses.
323 | 
324 | Let's try another - say, number 6 in the list.
325 | 
326 | ![](gse59671_files/figure-markdown_github/plot2-1.png)
327 | 
328 | The ANXA11 gene is also involved with inflammation and shows the opposite behaviour to CD83; expression drops ~ 3-4 fold in the first 8 hours, then rises gradually from 8-24 hours.
329 | 
330 | Treated samples
331 | ---------------
332 | 
333 | Did the drug treatments make any difference to the IL1b response? Let's start with the best p-value for celecoxib versus Control.
334 | 
335 | ![](gse59671_files/figure-markdown_github/plot3-1.png)
336 | 
337 | There is some indication that expression of JRK post-IL1b exposure was lowered less between 0-2 hours in celecoxib-treated cells, but it's not incredibly convincing as the fold-change overall is quite low under all conditions. What about rofecoxib?
338 | 
339 | ![](gse59671_files/figure-markdown_github/plot4-1.png)
340 | 
341 | It appears that this p-value is driven by higher PIK3C2B expression in 3/4 rofecoxib-treated samples at t = 0. Perhaps the less-than-dramatic effects of drug treatment explain why this GEO series is not associated with a publication.
342 | 
343 | Summary
344 | =======
345 | 
346 | -   maSigPro is a useful, effective package for analysis of timecourse microarray data
347 | -   It combines well with ggplot2 to generate attractive and informative plots of gene expression over time
348 | -   The example dataset GSE59671 reveals some interesting effects on the expression of inflammation-associated genes when cells are exposed to IL1b
349 | 


--------------------------------------------------------------------------------
/maSigPro/gse59671_files/figure-markdown_github/plot1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neilfws/utils4bioinformatics/d689dbac6a2e6959ab369ec76596964d108c599e/maSigPro/gse59671_files/figure-markdown_github/plot1-1.png


--------------------------------------------------------------------------------
/maSigPro/gse59671_files/figure-markdown_github/plot2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neilfws/utils4bioinformatics/d689dbac6a2e6959ab369ec76596964d108c599e/maSigPro/gse59671_files/figure-markdown_github/plot2-1.png


--------------------------------------------------------------------------------
/maSigPro/gse59671_files/figure-markdown_github/plot3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neilfws/utils4bioinformatics/d689dbac6a2e6959ab369ec76596964d108c599e/maSigPro/gse59671_files/figure-markdown_github/plot3-1.png


--------------------------------------------------------------------------------
/maSigPro/gse59671_files/figure-markdown_github/plot4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neilfws/utils4bioinformatics/d689dbac6a2e6959ab369ec76596964d108c599e/maSigPro/gse59671_files/figure-markdown_github/plot4-1.png


--------------------------------------------------------------------------------
/ncbi/biosample/code/ruby/cell_lines.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/ruby
 2 | 
 3 | # cell_lines.rb
 4 | # search NCBI biosample database for misidentified cell lines
 5 | # then search pubmed for those cell lines & return count
 6 | 
 7 | require 'bio'
 8 | 
 9 | Bio::NCBI.default_email = "me@me.com"
10 | ncbi   = Bio::NCBI::REST.new
11 | 
12 | search = ncbi.esearch("cell line status misidentified[Attribute]", {"db" => "biosample", "retmax" => 500})
13 | 
14 | search.each do |id|
15 | 	record = ncbi.efetch(id, {"report" => "full", "db" => "biosample", "mode" => "text"})
16 | 	line = record.split("\n").find {|e| /\/cell line="(.*?)"/ =~ e }
17 | 	if line =~ /cell line="(.*?)"/
18 | 		pubmed = ncbi.esearch_count("#{$1}[TIAB]", {"db" => "pubmed"})
19 | 		puts "#{$1}\t#{pubmed}"
20 | 	end
21 | end
22 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/README.md:
--------------------------------------------------------------------------------
1 | # entrez_db_terms
2 | 
3 | The script *entrez_db_terms.rb* generates a summary of searchable fields for each of the NCBI Entrez databases.
4 | 
5 | Summary files (one per database) are written to the *data/* directory.
6 | 
7 | The script requires Nokogiri and BioRuby.
8 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/code/ruby/entrez_db_terms.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/ruby
 2 | require 'bio'
 3 | require 'nokogiri'
 4 | require 'open-uri'
 5 | 
 6 | Bio::NCBI.default_email = "me@me.com"
 7 | outd = File.expand_path("../../../data", __FILE__)
 8 | ncbi = Bio::NCBI::REST.new
 9 | url  = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi?db="
10 | ncbi.einfo.each do |db|
11 |   puts "Processing #{db}..."
12 |   outf = outd + "/" + "#{db}.txt"
13 |   File.open(outf, "w") do |f|
14 |     doc = Nokogiri::XML(open("#{url + db}"))
15 |     doc.xpath("//FieldList/Field").each do |field|
16 |       name = field.xpath("Name").inner_html
17 |       fullname = field.xpath("FullName").inner_html
18 |       description = field.xpath("Description").inner_html
19 |       f.write("#{name},#{fullname},#{description}\n")
20 |     end
21 |   end
22 |   puts "Wrote file #{outf}"
23 | end
24 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/assembly.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | ACCN,Accession,Chromosome accessions
 5 | ASAC,Assembly Accession,Space delimited assembly accessions w/ &amp; w/o versions
 6 | ASLV,Assembly Level,How assembled is this assembly. 'Contig' to 'Chromosome'
 7 | TXID,Taxonomy ID,Taxonomy ID
 8 | ORGN,Organism,Exploded organism names
 9 | RUID,RefSeq Release ID,Release Id of RefSeq Assembly.
10 | GUID,GenBank Release ID,Release Id of GenBank synonym of this Assembly.
11 | UIDS,All Uids,Pair-id, GB-id, and RS-id of this Assembly.
12 | PROJ,BioProject IDs and Accessions,Uid and accessions of this assembly's projects
13 | SAMP,Biosample,Biosample Accession and Id
14 | NAME,Assembly Name,Assembly name
15 | ALLN,All Names,All names, space separated
16 | DESC,Description,Assembly description
17 | COV,Coverage,Sequencing coverage
18 | CLAS,Assembly Class,Type of the assembly
19 | RELS,Date - Assembly Release,Date the assembly was first released
20 | SRDT,Date - Sequences Release,Date the most recent sequence went live in ID
21 | UPDT,Date - Assembly Update,Date the assembly was last updated
22 | LEN,Total Sequence Length,Total length of chromosome/genome including bases and gaps divided by 1,000,000.
23 | REPL,Chromosome Count,Number of chromosomes in assembly
24 | PLAC,Placed Scaffolds Count,Number of placed scaffolds
25 | UNLO,Unlocalized Scaffolds Count,Number of unordered(unlocalized) scaffolds belonging to chromosomes
26 | UNPL,Unplaced Scaffolds Count,Number of unplaced scaffolds which do not belong to any chromosome, ie ChrUn
27 | CN50,Contig N50,Contig length at which 50% of total bases in assembly are in contigs of that length or greater
28 | SN50,Scaffold N50,Scaffold length at which 50% of total bases in assembly are in contigs of that length or greater
29 | CL50,Contig L50,Number of contigs that are greater than or equal to the N50 length.
30 | SL50,Scaffold L50,Number of scaffolds that are greater than or equal to the N50 length.
31 | CNTG,Contig Count,Number of contigs
32 | UNGL,Ungapped Length,Total length excluding gaps in chromosome/genome divided by 1,000,000
33 | PROP,Properties,Properties
34 | SUBO,Submitter Organization,Organization that submitted this assembly
35 | INFR,Infraspecifc name,Infraspecific name: breed, cultivar, strain, ecotype
36 | ISOL,Isolate,Isolate name
37 | SEX,Sex,Sex
38 | ASMM,Assembly Method,Assembly Method
39 | GCOV,Genome Coverage,Genome Coverage
40 | TECH,Sequencing Technology,Sequencing Technology
41 | EXFV,Expected Final Version,Expected Final Version
42 | RGAS,Reference Guided Assembly,Reference Guided Assembly
43 | SCAM,Single Cell Amplification,Single Cell Amplification
44 | RCAT,RefSeq Category,RefSeq Category
45 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/bioproject.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | ORGN,Organism,Organism
 5 | PRJA,Project Accession,Project Accession
 6 | TYPE,Project Type,Project Type
 7 | STPE,Project Subtype,Project Subtype
 8 | DATE,Registration Date,Registration Date
 9 | TITL,Title,Title
10 | CEN,Submitter Organization,Submitter Organization(s)
11 | ACCN,Replicon accession,Space delimited GenBank or RefSeq Replicon Accessions
12 | RTYP,Replicon type,Replicon Type
13 | RNME,Replicon name,Replicon Name
14 | LTP,Locus Tag Prefix,Locus Tag Prefix
15 | WORD,Description,Organism/Project Description
16 | KWRD,Keyword,Keywords
17 | PROP,Properties,Project/Organism Properties
18 | DTPE,Project Data Type,Project Data Type
19 | GRNT,Grant ID,Grant ID
20 | FUND,Funding Agency,Funding Agency
21 | PMID,PMID,Pubmed ID
22 | DOID,DOI,DOI ID
23 | PID,ProjectID,Project ID
24 | RELV,Relevance,Relevance
25 | ANME,Assembly name,Assembly Name
26 | BPRJ,BioProject ID,BioProject ID or accession
27 | TPRJ,Top Bioproject,Top Bioproject ID
28 | WGSA,WGS Accession,WGS Accessions
29 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/biosample.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | ACCN,Accession,Accession number of sequence
 5 | TITL,Title,Words in definition line
 6 | PROP,Properties,Classification by source qualifiers and molecule type
 7 | WORD,Text Word,Free text associated with record
 8 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy
 9 | AUTH,Author,Author(s) of publication
10 | PDAT,Publication Date,Date sequence added to GenBank
11 | MDAT,Modification Date,Date of last update
12 | ATNM,Attribute Name,Attribute Name
13 | ATTR,Attribute,Attribute
14 | CEN,Submitter Organization,Submitter Organization(s)
15 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/biosystems.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | SRC,SourceName,Name of the organization that is the source of the record
 5 | SRID,SourceID,A numerical id that is assigned to a particular source of biosystem records
 6 | TYPE,BioSystemType,Type of the biosystem
 7 | PDAT,CreateDate,The date the biosystem record first appeared in the NCBI biosystems database
 8 | MDAT,ModifyDate,The date the biosystem record last changed in the NCBI biosystems database
 9 | SACC,SourceAccession,The accession used by the source of the biosystem
10 | TITL,Title,The name of the biosystem
11 | DESC,Description,The text description of a biosystem
12 | COM,Comments,Comments on the biosystem
13 | ORGN,Organism,Organism that contain the biosystem
14 | PN,ProteinName,Names of proteins in a biosystem (definition line)
15 | CN,ChemicalName,Names of small molecules in a biosystem (taken from PubChem)
16 | SEID,SidExternalID,Id given to a small molecule by the source of a biosystem
17 | GN,GeneName,Gene name
18 | GEID,GeneExternalID,Id given to a gene by the source of a biosystem
19 | PID,ProteinID,Protein accessions and gis in a biosystem
20 | CID,CID,PubChem compound identifiers (cid) found in a biosystem
21 | GID,GeneID,NCBI gene ids found in a biosystem
22 | SID,SID,PubChem substance ids (sid) found in a biosystem
23 | ACCN,Accession,The biosystem accession (bsid plus version)
24 | SCT,SIDCount,Total Count of PubChem substance ids (sid) found in a biosystem
25 | CCT,CIDCount,Total Count of PubChem compound ids (cid) found in a biosystem
26 | GCT,GeneCount,Total Count of NCBI Gene records (geneid) found in a biosystem
27 | PCT,ProteinCount,Total Count of NCBI Protein records (gi) found in a biosystem
28 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/blastdbinfo.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | DB,Database Name,Official name of the database
 5 | TITL,Database Title,Words in the title of database (e.g., "NCBI Transcript Reference Sequences")
 6 | DATE,Last Update,Date of last database update
 7 | ORGN,Database Organism Taxid,Organism Taxid
 8 | ASM,Genome Collection Assembly Name,Genome Collection Assembly Name
 9 | SEQT,Blast Sequence Type,One of genomic, cdna, other-dna, or protein, of which genomic and cdna could be further specified
10 | SEQS,Blast Sequence Strategy,Appropriate sequence strategy for the sequence type specified
11 | SRC,Blast Database Source,States where the sequences came from, e.g., genbank, refseq, trace, etc.
12 | KEYW,Keyword,Search term identifying this database entry
13 | PRJ,NCBI Genome Project ID,NCBI Genome Project Identifier
14 | GPB,Gpipe Build Name,Gpipe Build
15 | WGPR,NCBI WGS Project ID,NCBI WGS Project Identifier
16 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/books.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | AUTH,Author,Section's author
 5 | CA,Corporate Author,Corporate Author of publication
 6 | FA,Full Author Name,Full Author Name(s) of publication
 7 | FE,Full Editor Name,f
 8 | TITL,Title,Section's title
 9 | TYPE,Type,Section's type
10 | STXT,Full Text,Section's full text
11 | CONP,Concept Phrases,Generated keywords
12 | BOOK,Book,ID of the book that contains the document
13 | PMID,PMID,PubMed ID
14 | RMID,RefPMID,Citation search by PmId
15 | RID,Rid,Book internal ID
16 | PUBN,Publisher,Publisher's Name
17 | PDAT,Publication Year,Publication Year
18 | ISBN,ISBN,ISBN
19 | ATTR,Attribute,Attributes in key value ordered pairs
20 | EDIT,Editor,Section's Editor
21 | RD,Release Date,Release Date
22 | SUB,Subject,Subject
23 | RT,Resource Type,Resource Type
24 | AID,Accession ID,Accession ID
25 | BACI,Book Accession ID,Book Accession ID
26 | CHID,Chapter Accession ID,Chapter Accession ID
27 | DN,Disease,Disease
28 | GS,Gene Name,Gene Name
29 | PN,Protein Name,Protein Name
30 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/cdd.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | ACCN,Accession,Unique text identifier for a CD
 5 | DB,Database,Which database CD is from (pfam, smart ...)
 6 | TITL,Title,The short descriptive name of a CD, e.g. Rho
 7 | STTL,Subtitle,A short description of the CD
 8 | WORD,Text Word,The long description of the CD
 9 | ORGN,Organism,The root taxonomy node of a CD
10 | PDAT,Publication Date,The date a CD was published
11 | MDAT,Modification Date,The date a CD was last modified
12 | PLEN,PssmLength,Length of the PSSM or domain search model
13 | AACN,Alternative Accession,Alternative unique text identifier for a CD, from source database
14 | STRP,Structure Representative,The number of structures in a CD
15 | SD,The description of sites,The desription of functional sites in a domain
16 | NS,Number of Sites,The number of functional sites in a domain
17 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/clinvar.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | TITL,Name of the ClinVar record,Constructed from variant and phenotype names
 5 | WORD,Text Word,Free text associated with record
 6 | ORGN,Organism,scientific and common names of organism
 7 | MDAT,Modification Date,The last date on which the record was updated
 8 | CHR,Chromosome,Chromosome number or numbers; also 'mitochondrial', 'unknown' properties
 9 | GENE,Gene Name,Symbol or symbols of the gene
10 | MIM,MIM,MIM number from OMIM
11 | DIS,Disease/Phenotype,Diseases or traits associated with this record
12 | ACCN,ClinVar accession,Accession of the genotype/phenotype assertion
13 | VRID,Variant ID,Public ID of a variant
14 | TRID,Trait identifier,Public identifier for a trait (e.g. CUI, HPO)
15 | PROP,Properties,Properties of ClinVar record
16 | CDAT,Creation Date,The date on which this record first appeared
17 | PMID,PubMed ID,PubMed ids of accessions linked to the record
18 | GID,Gene ID,Gene ID
19 | TID,Taxonomy ID,taxonomy id
20 | DDAT,Date Discontinued,The date on which the record was discontinued
21 | CPOS,Base Position,Chromosome base position
22 | GFN,Gene Full Name,Gene full name
23 | PFN,Protein Full Name,Protein full name
24 | SUB,Submitter,Organization or submitter handle making the submission
25 | VRNM,Variant name,Names used for this allele
26 | VRTP,Type of variation,Type of sequence change/variant call
27 | MCNS,Molecular consequence,Consequence of the variation at the molecular level.
28 | RVST,Review status,Review status
29 | ALID,AlleleID,Unique identifier assigned to a specific sequence change at a location.
30 | ORIG,Origin,Origin
31 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/clone.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | ACC,Accession,Accession of any internal or external identifier. Versions removed.
 5 | ACCV,Accession Version,Accession and version of GENBANK accessions associated with clone records.
 6 | ALAB,Alternate Library Abbreviation,Alternate Library Abbreviation
 7 | ALN,Alternate Library Name,Alternate Library Name
 8 | ASSA,Assembly Accession,Accession of assembly on which placed
 9 | ASSN,Assembly Name,Assembly on which placed
10 | BREE,Breed,Breed
11 | CELL,Cell Line,Cell Line
12 | CELT,Cell Type,Cell Type
13 | CHRA,Chromosome Accession,Accession of chromosome on which placed
14 | CHRB,Chromosome Start,Chromosome start of placement
15 | CHRE,Chromosome Stop,Chromosome end of placement
16 | CLN,Clone Name,Clone Name
17 | CLA,Clone Name Alias,Clone Name Alias
18 | CULT,Cultivar,Cultivar
19 | CUAC,Cultivar Accession,Cultivar Accession provided in library submission XML
20 | DIST,Distributor,Library Distributor Name, provided in library submission XML
21 | DEST,Development Stage,=Development Stage provided in library submission XML
22 | GENE,Gene Name,Name or alias of Gene at same location as the placement of the clone
23 | GNID,Gene ID,GeneID of gene at same location as the placement of the clone
24 | GDSC,Gene Description,Full name (description) of gene at same location as the placement of the clone
25 | GI,GI,GIs associated with clone records
26 | ISOL,Isolate,Isolate provided in library submission XML
27 | LBR,Library Abbreviation,Library Abbreviation
28 | LID,Library ID,Library ID
29 | LIB,Library Name,Library name
30 | LIBT,Library Type,Library Type
31 | OT,Object type,Object type in Clone DB (library, clone)
32 | ORGA,Organ,Organ provided in library submission XML
33 | ORG,Organism,Organism name (exploded)
34 | PLCD,Placed,display Y/N for has_placement/no_placement
35 | PLMT,Placement Method,Placement Method
36 | PLCN,Placement Confidence,Placement Confidence
37 | POPU,Population,Population provided in library submission XML
38 | PROP,Properties,Properties of data set for example HasInsert HasEnd IsPlaced IsConcordant
39 | PID,Genome Project Id,Genome Project Id provided in library submission XML
40 | PMID,PMID,PubMed Id provided in library submission XML
41 | SCFA,Scaffold Accession,Accession of scaffold on which placed
42 | SCFB,Scaffold Start,Scaffold start of placement
43 | SCFE,Scaffold Stop,Scaffold stop of placement
44 | STRA,Strain,Strain
45 | STS,STS,STS's that have been mapped to any sequences associated with clone records
46 | TXID,Taxonomy ID,Taxonomy ID
47 | TI,TI,TIs associated with clone records
48 | TISS,Tissue,Tissue provided in library submission XML
49 | VN,Vector,Vector name provided in library submission XML
50 | VT,Vector Type,Vector type provided in library submission XML
51 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/dbvar.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | ACC,Accession,Accession of any internal or external identifier. Versions removed from GENBANK accessions.
 5 | ACCV,Accession Version,Accession and version of GENBANK accessions used in variant sequence or support.
 6 | LAB,Submitter Affiliation,Submitter's affiliation name
 7 | ALOR,Allele Origin,Allele origin (controlled vocabulary), including Both=Germline+Somatic
 8 | ALTP,Variant Call Type,Variant Call type (controlled vocabulary)
 9 | ASSM,Assembly Name,Assembly of placement
10 | ASAC,Assembly Accession,Assembly accession of placement
11 | AORG,Assembly Organism,Assembly organism names (exploded)
12 | ATAX,Assembly Taxonomy ID,Assembly taxonomy ID
13 | AUTH,Author,All authors included in journal
14 | BLCK,Block Start,Start of a 100k block on chromosome containing the variant.
15 | CH,Chromosome,Chromosome of placement
16 | CHRA,Chromosome Accession,Chromosome of placement, using accession.version
17 | CHRE,Chromosome End,End of placement on chromosome
18 | CHRS,Chromosome Start,Start of placement on chromosome
19 | INRE,Chromosome Inner End,Inner end of placement on chromosome
20 | INRS,Chromosome Inner Start,Inner start of placement on chromosome
21 | OTRE,Chromosome Outer End,Outer end of placement on chromosome
22 | OTRS,Chromosome Outer Start,Outer start of placement on chromosome
23 | CLIN,Variant Clinical Interpretation,Clinical interpretation of a variant (controlled vocabulary)
24 | CLVA,ClinVar Accession,ClinVar Accession (SCV)
25 | CTG,Unplaced Contig Accession,Contig of placement, when not on a chromosome, using accession.version
26 | DET,Detection Method,Detection method
27 | DESC,Variant Description,Variant description
28 | DDAT,Discontinued Date,dbVar discontinued date
29 | ESSV,Numeric Portion of EBI Variant Call ID,Numeric portion of EBI Variant Call ID (essv)
30 | ESTD,Numeric Portion of EBI Study ID,Numeric portion of EBI Study ID (estd)
31 | ESV,Numeric Portion of EBI Variant Region ID,Numeric portion of EBI Variant Region ID (esv)
32 | GENE,Gene Name,Name or alias of gene at same location as variant
33 | GNID,Entrez Gene ID,Gene ID of gene at same location as variant
34 | GDSC,Gene Full Name,Full name (description) of gene at same location as variant
35 | LIB,Library Abbreviation,Library name used in the Method
36 | MPLT,Method Platform,Method platform
37 | MSUB,Method Submission Name,Submission name of individual method, used when study contains multiple methods from different submitters, as does the curated dataset.
38 | METH,Method Type,Method type (controlled vocabulary)
39 | MCAT,Method Type Category,Used for sorting and display. Methods are categorized as: probe, mapping, sequencing.
40 | MWGT,Method Type Weight,used for sorting. BAC=all Method_type values of study or variant are BAC aCGH, Non-BAC=study or variant has at least 1 method_type that is other than BAC aCGH
41 | MESH,MeSH ID,Medical Subject Headings (MeSH) ID (exploded)
42 | MIM,MIM ID,Online Mendelian Inheritance in Man
43 | NCBI,Submitter MyNCBI ID,Submitter login ID in myNCBI system
44 | NSSV,Numeric Portion of NCBI Variant Call ID,Numeric portion of NCBI Variant Call ID (nssv)
45 | NST,Numeric Portion of NCBI Study ID,Numeric portion of NCBI Study ID (nstd)
46 | NSV,Numeric Portion of NCBI Variant Region ID,Numeric portion of NCBI Variant Region ID (nsv)
47 | OT,Object Type,Object type in dbVar (STUDY, VARIANT)
48 | ORG,Organism,Organism name (exploded)
49 | PDA,Submitter PDA Login,Submitter login ID in NCBI PDA system
50 | PHEN,Phenotype,Phenotype of sample/subject study or reference specimen
51 | PTYP,Placement Type,Placement type (controlled vocabulary)
52 | PMID,PMID,Unique identifier from PubMed
53 | GPRJ,Genome Projects ID,Unique identifier from Genome Projects
54 | PRNM,Genome Projects Name,Name from Genome Projects corresponding to Project_ID
55 | PDAT,Publication Date,Journal Publication date
56 | SSV,Variant Call Accession,dbVar ID (essv or nssv) of Variant Call
57 | ST,Study Accession,Study dbVar ID (estd or nstd)
58 | SV,Variant Region Accession,dbVar ID (esv or nsv) of Variant Region
59 | SMPL,Sample,Sample/subject ID of study or reference specimen
60 | SC,Sample Count,Number of samples in study
61 | STDE,Study Description,Study description
62 | STDN,Study Display Name,Study display name
63 | STDY,Study ID,Study, batch or submission ID
64 | STYP,Study Type,Study type assigned by NCBI
65 | SVAR,Submitter Variant ID,Originally submitted variant identifier
66 | SUPH,Subject Phenotype status,Boolean subject phenotype status: 0=not affected/null; 1 = affected
67 | SUB,Submitter Name,Submitter first and last name
68 | SSVC,Variant Call Count,Number of supporting variant calls in variant region
69 | TXID,Taxonomy ID,Taxonomy ID
70 | MDAT,Modification Date,dbVar Modification Date
71 | VAL,Validation Method,Validation method (controlled vocabulary)
72 | VSTA,Validation Result,Boolean validation status: null=not validated, 0=validated with result=0; 1 = validated with result=1
73 | VWGT,Validation Result Weight,0=not validated, 1=validated with result=0; 2 = validated with result=1
74 | VC,Variant Region Count,Number of variant regions in study
75 | VLEN,Variant Size,Size of variant
76 | VT,Variant Region Type,Variant region type (controlled vocabulary)
77 | ZYG,Variant Zygosity,Zygosity of a variant (controlled vocabulary)
78 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/epigenomics.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | WORD,Text Word,Text
 5 | TITL,Title,Title
 6 | TXID,Taxonomy ID,TaxId
 7 | ACCN,Accession,Epigenomics accession number
 8 | KYWD,Keyword,Keyword
 9 | COID,Concept ID,UMLS concept ID (CID)
10 | AUTH,Author,Author
11 | PRID,Project ID,ProjectId
12 | DOCT,Document Type,DocType
13 | CDAT,Create Date,CreateDate
14 | MDAT,Update Date,UpdateDate
15 | ORGN,Organism,scientific and common names of organism
16 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/gap.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | DISC,Discriminator,Discriminator
 5 | OBJ,Object Type,Object Type
 6 | ANCE,Ancestor,Ancestor
 7 | BELO,Belongs To,Belongs To
 8 | ATTR,Attribution,Attribution
 9 | RTST,Is Root Study,Is Root Study
10 | TLST,Is Top-Level Study,Is Top-Level Study
11 | STID,Study ID,Study ID
12 | STNM,Study Name,Study Name
13 | DIS,Disease,Disease
14 | PROJ,Project,Project
15 | GENO,Genotype Platform,Genotype Platform
16 | SRA,Study Has SRA Components,Study Has SRA components
17 | STUD,Study,Study
18 | HASV,Has Variable,Has Variable
19 | VRID,Variable ID,Variable ID
20 | VRNM,Variable Name,Variable Name
21 | VRDS,Variable Description,Variable Description
22 | VAR,Variable,Variable
23 | HASD,Has Document,Has Document
24 | DCID,Document ID,Document ID
25 | DCNM,Document Name,Document Name
26 | DOC,Document,Document
27 | DOCP,Document Part,Document Part
28 | HASA,Has Analysis,Has Analysis
29 | ANID,Analysis ID,Analysis ID
30 | ANNM,Analysis Name,Analysis Name
31 | ANLS,Analysis,Analysis
32 | HAST,Has Dataset,Has Dataset
33 | DSID,Dataset ID,Dataset ID
34 | DSNM,Dataset Name,Dataset Name
35 | DS,Dataset,Dataset
36 | PX,PhenX,PhenX
37 | HASP,Has PhenX Mapping,Has PhenX Mapping
38 | ARCH,Study Archive,Study Archive
39 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/gapplus.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | SRC,Source Database,Source Database
 5 | CDAT,Create Date,Create Date
 6 | MDAT,Modification Date,Date of last update
 7 | PMID,PubMed ID,PubMed ID
 8 | PDAT,Publication Date,Publication date
 9 | JOUR,Journal,Journal abbreviation of publication
10 | TITL,Title,Document title
11 | RS,Reference SNP ID,Clustered SNP ID (rs)
12 | CHR,Chromosome,Chromosome
13 | CPOS,Chromosome Base Position,Position
14 | PLAT,Platform,Platform
15 | GENE,Gene Name,Gene Name
16 | FXN,Function Class,Function Class
17 | PVAL,Log of P-Value,Log of P-value
18 | TRT,Phenotype Trait,Phenotype Trait
19 | POPL,Population,Population
20 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/gds.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | ORGN,Organism,exploded organism names
 5 | ACCN,GEO Accession,accession for GDS (DataSet), GPL (Platform), GSM (Sample), GSE (Series)
 6 | TITL,Title,Words in title of record
 7 | DESC,Description,Text from description, summary and other similar fields
 8 | SFIL,Supplementary Files,Supplementary Files
 9 | ETYP,Entry Type,Entry type (DataSet or Series)
10 | STYP,Sample Type,Sample type
11 | VTYP,Sample Value Type,type of values, e.g. log ratio, count
12 | PTYP,Platform Technology Type,Platform technology type
13 | GTYP,DataSet Type,type of dataset
14 | NSAM,Number of Samples,Number of samples
15 | SRC,Sample Source,sample source
16 | AUTH,Author,author of the GEO Sample, Platform or Series
17 | INST,Submitter Institute,institute, or organization affiliated with contributors
18 | NPRO,Number of Platform Probes,number of platform probes
19 | SSTP,Subset Variable Type,subset variable type
20 | SSDE,Subset Description,subset description
21 | GEID,Reporter Identifier,name or identifier for the spot, e.g. GenBank, UniGene ID, Locus Link ID etc.
22 | PDAT,Publication Date,publication date from the GEO related entities
23 | UDAT,Update Date,date
24 | TAGL,Tag Length,Tag/Signature length for SAGE/MPSS
25 | RGSE,Related Series,Related Series
26 | RGPL,Related Platform,Related Platform
27 | MESH,MeSH Terms,Medical Subject Headings
28 | PROJ,Project,Project
29 | ATNM,Attribute Name,Attribute Name
30 | ATTR,Attribute,Attribute
31 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/gencoll.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | ACCN,Accession,Chromosome accessions
 5 | ASAC,Assembly Accession,Space delimited assembly accessions w/ & w/o versions
 6 | CAT,Category,Assembly Set type or "assembly-unit"
 7 | TXID,Taxonomy Id,Taxonomy Id
 8 | ORGN,Organism,Exploded organism names
 9 | PROJ,Project Id,Uid(s) of this Assembly's Projects
10 | NAME,Assembly Name,Assembly Name
11 | ALLN,All Names,All Names, space separated
12 | COV,Coverage,Sequencing Coverage
13 | CLAS,Assembly Class,Type of the Assembly
14 | REL,Release Type,Release Type
15 | PART,Partial Genome Representation,Partial Genome Representation
16 | RELS,NCBI Release Date,NCBI Release Date
17 | LEN,Total Sequence Length,Total length of chromosome/genome including bases and gaps.
18 | REPL,Chromosome count,Number of chromosomes in assembly
19 | PLAC,Placed Scaffolds Count,Number of placed scaffolds
20 | UNLO,Unlocalized Scaffolds Count,Number of unordered(unlocalized) scaffolds belonging to chromosomes
21 | UNPL,Unplaced Scaffolds Count,Number of unplaced scaffolds which do not belong to any chromosome, ie ChrUn
22 | PROP,Properties,Properties
23 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/gene.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to a gene record
 3 | FILT,Filter,Limits the records
 4 | TITL,Gene/Protein Name,gene or protein name
 5 | WORD,Text Word,Free text associated with record
 6 | ORGN,Organism,scientific and common names of organism
 7 | MDAT,Modification Date,The last date on which the record was updated
 8 | CHR,Chromosome,Chromosome number or numbers; also 'mitochondrial', 'unknown' properties
 9 | MV,Default Map Location,Chromosomal map location as displayed in MapViewer
10 | GENE,Gene Name,Symbol or symbols of the gene
11 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number
12 | MIM,MIM ID,MIM number from OMIM
13 | DIS,Disease/Phenotype,Name(s) of diseases associated with this gene. When available, OMIM name will be used
14 | ACCN,Nucleotide/Protein Accession,Nucleotide or protein accession(s) associated with this gene
15 | UGEN,UniGene Cluster Number,UniGene cluster number for this gene
16 | PROP,Properties,Properties of Gene record
17 | CDAT,Creation Date,The date on which this record first appeared
18 | NCAC,Nucleotide Accession,nucleotide accessions of sequences
19 | NUID,Nucleotide UID,nucleotide uids of sequences
20 | PACC,Protein Accession,protein accessions
21 | PUID,Protein UID,protein uids
22 | PMID,PubMed ID,PubMed ids of accessions linked to the record
23 | TID,Taxonomy ID,taxonomy id
24 | GO,Gene Ontology,Gene Ontology
25 | DOM,Domain Name,Domain Name
26 | DDAT,Date Discontinued,The date on which the record was discontinued
27 | CPOS,Base Position,Chromosome base position
28 | GFN,Gene Full Name,Gene full name
29 | PFN,Protein Full Name,Protein full name
30 | GL,Gene Length,Gene length
31 | XC,Exon Count,Exon count
32 | GRP,Group,Relationships for this gene
33 | PREF,Preferred Symbol,Preferred symbol of the gene
34 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/genome.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to genome
 3 | FILT,Filter,Limits the records
 4 | ORGN,Organism,Organism
 5 | PID,ProjectID,Project ID
 6 | PRJA,Project Accession,Project Accession
 7 | PRJT,Project Type,Project Type
 8 | DFLN,Title,Genome short description
 9 | DSCR,Genome description,Genome full description
10 | STAT,Status,Bioproject status
11 | AID,AssemblyID,Release ID of full assembly
12 | AACC,Assembly Accession,Accession of full assembly
13 | ANAM,Assembly Name,Name of full assembly
14 | GI,Replicon GI,Replicon GI
15 | ACCN,Replicon accession,Replicon Accession
16 | RNAM,Replicon name,Replicon Name
17 | PACC,Protein Accession,Protein Accession
18 | PROT,Protein Name,Protein Name
19 | PGI,Protein GI,protein GI
20 | GNID,GeneID,GeneID
21 | GENE,Gene Name,Gene Name
22 | LTAG,Locus Tag,Locus Tag
23 | WGSP,WGS prefix,WGS Prefix
24 | PMID,PubMed ID,Unique identifier from PubMed
25 | BIOP,biological properties,Biological Properties
26 | PCID,ProtClust ID,Protein Clusters ID
27 | PROP,Properties,Project/Organism Properties
28 | CDT,Create Date,Create Date
29 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/genomeprj.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neilfws/utils4bioinformatics/d689dbac6a2e6959ab369ec76596964d108c599e/ncbi/entrez_db_terms/data/genomeprj.txt


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/geoprofiles.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | ORGN,Organism,Exploded organism names
 5 | ACCN,GEO Accession,Accession for GDS (DataSet), GPL (Platform), GSM (Sample), GSE (Series)
 6 | GDST,GDS Text,GDS text from title and description
 7 | GEOT,GEO Description/Title Text,Sample titles
 8 | RTYP,Platform Reporter Type,Platform reporter type, e.g. genbank, clone, orf
 9 | GTYP,DataSet Type,Type of dataset
10 | VTYP,Sample Value Type,Sample value type, e.g. log ratio, count
11 | NSAM,Number of Samples,Number of samples
12 | SRC,Sample Source,Sample source
13 | ID,ID_REF,Spot ID from GEO Platform, SAGE tag, Affy ProbeSet ID
14 | NAME,Reporter Identifier,Name or identifier for the spot, e.g. GenBank accession, CLONE_ID, ORF etc.
15 | SYMB,Gene Symbol,Gene symbol (name) from Entrez-Gene or Entrez-UniGene.
16 | GDSC,Gene Description,Gene Description
17 | RSTD,Ranked Standard Deviation,Ranked standard deviation
18 | RMAX,Max Value Rank,Maximal value of ranks
19 | RMIN,Min Value Rank,Minimal value of ranks
20 | FINF,Flag Information,Indicates an interesting or notable uid in the GDS context
21 | FTYP,Flag Type,Type of flag that indicates a uid of interest, or outliers etc.
22 | GI,GI,GenBank Identifier
23 | ATYP,Annotation Type,Type of annotation (gene, unigene, nucleotide)
24 | GO,Gene Ontology,Gene Ontology
25 | CHR,Chromosome,Chromosomes
26 | CPOS,Base Position,Chromosome base position
27 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/gtr.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | WORD,Text Word,Free text associated with record
 5 | WRD1,Text Word 1,Free text associated with record with higher score than regular TEXT field
 6 | PROP,Properties,Properties of this record
 7 | ORG,Organization names,Lab or Clinic name including institution and department
 8 | OID,Organization UID,Unique number for this lab/clinic
 9 | CITY,Organization City,Lab or clinic city
10 | STATE,Organization State,Lab or clinic State or province
11 | COUNTRY,Organization Country,Lab or clinic Country
12 | POSTCODE,Organization postcode,Lab or clinic zip or postcode
13 | DIRECTOR,Organization Director(s),Lab or clinic director(s)
14 | STAFF,Organization Staff,Lab or clinic Staff name
15 | LS,Lab Service name,Lab Service name
16 | GTRACC,Accession for GTR test,Accession for GTR test
17 | MDAT,Modification Date,The last date on which the record was updated
18 | NAME,name for this test,name for this test
19 | ALT,Alternate name,alternate short and full names for test
20 | TESTDIS,disease name for this test,preferred name of disease by lab
21 | SPECIMEN,Specimen options for test,Specimen options for test
22 | TITL,title of this clinvar assertion,title of this clinvar assertion
23 | CVACC,Accession for clinvar assertion,Accession for clinvar assertion
24 | DCUI,Disease BioConcepts concept id,Concept identifier from BioConcepts for a disease
25 | DISNAME,Name of Disease,preferred full name
26 | GCUI,Gene BioConcepts concept id,Concept identifier from BioConcepts for a gene
27 | GENEID,UID for a record from Gene,Unique number for this record
28 | SYMB,Gene Symbol,Symbol or symbols of the gene
29 | GENENAME,Name of Gene,preferred full name
30 | GENEMIM,MIM number for the Gene,MIM number for the Gene
31 | PROTNAME,Name of Protein,preferred full name
32 | MTOD,Name of Method,method name
33 | MCAT,name for method category,name for method category
34 | TCAT,name for method top category,name for method top category
35 | LCRT,laboratory certification,laboratory certification
36 | CID,Country ISO code,Country ISO code
37 | SID,State ISO code,State ISO code
38 | TCID,Top Method Category ID,Top Method Category ID
39 | CTID,Method Category ID,Method Category ID
40 | MTID,Test Method ID,Test Method ID
41 | DID,Disease BioConcepts Entrez ID,Disease BioConcepts Entrez ID
42 | ORGN,Organism,scientific and common names of organism
43 | TARPOP,Target Population option for the test,Target Population option for the test
44 | CLNUTL,Clinical Utility option for the test,Clinical Utility option for the test
45 | CLNVAL,Clinical Validity option for the test,Clinical Validity option for the test
46 | GENENUM,Number of genes,Number of genes explicitly listed as being targeted in a test
47 | pharma,Pharmacogenetic response condition,Names of conditions that are identified as being pharmacogenetic responses
48 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/homologene.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | TITL,Title,Words in title of publication
 5 | WORD,Text Word,Free text associated with record
 6 | PROP,Properties,Properties (formerly Keyword)
 7 | ORGN,Organism,scientific and common names of organism
 8 | GNID,Gene ID,Gene ID
 9 | GENE,Gene Name,Gene Name
10 | GDSC,Gene Description,Description of gene
11 | PUID,Protein UID,protein uids
12 | PRAC,Protein Accession,protein accessions
13 | NUID,Nucleotide UID,nucleotide uids of sequences
14 | NCAC,Nucleotide Accession,nucleotide accessions of sequences
15 | UGID,UniGene ID,UniGene ID
16 | ANCS,Ancestor,scientific and common names of ancestor organism
17 | DOM,Domain Name,Domain Name
18 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/journals.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | TITL,Title,Document title
 5 | ESSN,eISSN,eISSN
 6 | PSSN,pISSN,pISSN
 7 | ISSN,ISSN,ISSN
 8 | MABR,Title Abbreviation,Title Abbreviation
 9 | ISOA,ISO Abbreviation,ISO Abbreviation
10 | MULT,Multi,Multi
11 | ID,NLM ID,NlmId
12 | ALIA,Alias,Alias
13 | ST,Subject Terms,Subject Terms
14 | WORD,Text Word,Text Word
15 | SYR,Start Year,First year of publication
16 | EYR,End Year,Last year of publication
17 | LANG,Language,Language the title is published in
18 | CNTY,Place of Publication,Place of publication
19 | XS,Indexing Subset,Indexing Subset
20 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/medgen.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to each record
 3 | FILT,Filter,Limits the records
 4 | ACCN,Accession,concept identifier for the record
 5 | TITL,Title,Full name of the concept from the top-ranked vocabulary
 6 | PROP,Properties,Classification by source qualifiers and molecule type
 7 | MDAT,Modification Date,Date of last update
 8 | WORD,Text Word,Free text associated with record
 9 | DEFN,Definition,Text from the definition of the concept
10 | VOCB,Vocabulary,Classification by source qualifiers and molecule type
11 | XTIT,ExactTitle,Exact Title
12 | CODE,Source ID,Any identifier used by any vocabulary
13 | REFR,Reference,Authors and titles of citations
14 | CHR,Chromosome,Chromosome number; also 'mitochondrial', 'unknown' properties
15 | CLIN,Clinical Features,Clinical features of disorder integrated from OMIM and Human Phenotype Ontology (HPO)
16 | GENE,Gene Name,Name of gene associated with record
17 | CPOS,Base Position,Chromosome base position
18 | OID,MIM ID,Unique number assigned to OMIM record
19 | GFN,Gene Full Name,Gene full name
20 | KWD,Keyword,Keyword relevant to the concept
21 | MINH,Mode Of Inheritance,Mode of inheritance
22 | SNM,SNOMED CT CUI,SNOMED Concept ID
23 | GTIT,Guideline title,Guideline title
24 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/mesh.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | TN,Tree Number,Tree Number
 5 | MESH,MeSH Terms,MeSH Terms
 6 | SUBS,Substance Name,Substance Name
 7 | WORD,Text Word,Free text
 8 | ALSO,See Also,See Also
 9 | PREV,Previous Indexing,Previous Indexing
10 | NOTE,Scope Note,Scope Note
11 | REG,Registry Number,Registry Number
12 | MULT,Multi,Multi
13 | TYPE,Record Type,Record type - main heading, subheading, pharmacological action, substance name, publication type
14 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/ncbisearch.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | TITL,Title,Document title
 5 | KYWD,Keyword,Keyword from meta tag
 6 | DESC,Description,Description from meta tag
 7 | WORD,Text Word,Free text from page
 8 | CAT,Category,Category assigned to page
 9 | MDAT,Modification Date,Date of last update
10 | HOME,Homepage Title,Home page title
11 | HURL,Homepage URL,Home page URL
12 | URL,URL,URL of page
13 | MNAM,META Name,META Name
14 | MCON,META Content,META Content
15 | AUTH,Author,Author(s) of web page
16 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/nlmcatalog.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | ITAG,Abstract/Index Tags,Abstract/Index Tags
 5 | AUTH,Author,Author(s) of publication
 6 | FULL,Author Full Name,Full Names of Authors
 7 | CALL,Call Number,Call Number
 8 | CNAM,Corporate/Conference Name,Corporate/Conference Name
 9 | ITEM,Item Type,Item Type
10 | JOUR,Journal,Journal
11 | LANG,Language,Language of publication
12 | RTYP,Resource Type,Resource Type
13 | MESH,MeSH Terms,Medical Subject Headings assigned to publication
14 | MAJR,MeSH Major Topic,MeSH terms of major importance to publication
15 | SUBH,MeSH Subheading,Additional specificity for MeSH term
16 | OLIO,Olio,Olio
17 | OTHR,Other Number,Other Number
18 | OTRM,Other Term,Other terms associated with publication
19 | PERS,Personal Name as Subject,Personal Name as Subject
20 | FPER,Personal Full Name as Subject,Full Personal Name as Subject
21 | CNTY,Country of Publication,Country of publication
22 | PDAT,Publication Year,Year of publication
23 | PSTA,Publication Status,Status of publication
24 | PTYP,Publication Type,Type of publication (e.g., review)
25 | PUBL,Publisher,Publisher
26 | SERI,Series,Series
27 | TITL,Title,Words in title of publication
28 | URL,URL,URL
29 | NID,NLM Unique ID,NLM Unique ID
30 | EDAT,Entrez Date,Date publication first accessible through Entrez
31 | AI,Authority Information,Authority Information
32 | AIID,AIID,Authority ID
33 | WORD,Text Word,Text
34 | ST,Broad Subject Term,Broad Subject Term
35 | CFS,Current Format Status,Current Format Status
36 | XS,Indexing Subset,Indexing Subset
37 | ISO,ISO Abbreviation,ISO Abbreviation
38 | SYR,Publication Start Year,Publication Start Year
39 | EYR,Publication End Year,Publication End Year
40 | IS,ISSN,ISSN
41 | MABR,NLM Title Abbreviation,NLM Title Abbreviation
42 | MULT,Multi,Multi
43 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/nuccore.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to each sequence
 3 | FILT,Filter,Limits the records
 4 | WORD,Text Word,Free text associated with record
 5 | TITL,Title,Words in definition line
 6 | KYWD,Keyword,Nonstandardized terms provided by submitter
 7 | AUTH,Author,Author(s) of publication
 8 | JOUR,Journal,Journal abbreviation of publication
 9 | VOL,Volume,Volume number of publication
10 | ISS,Issue,Issue number of publication
11 | PAGE,Page Number,Page number(s) of publication
12 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy
13 | ACCN,Accession,Accession number of sequence
14 | PACC,Primary Accession,Does not include retired secondary accessions
15 | GENE,Gene Name,Name of gene associated with sequence
16 | PROT,Protein Name,Name of protein associated with sequence
17 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number
18 | PDAT,Publication Date,Date sequence added to GenBank
19 | MDAT,Modification Date,Date of last update
20 | SUBS,Substance Name,CAS chemical name or MEDLINE Substance Name
21 | PROP,Properties,Classification by source qualifiers and molecule type
22 | SQID,SeqID String,String identifier for sequence
23 | GPRJ,BioProject,BioProject
24 | SLEN,Sequence Length,Length of sequence
25 | FKEY,Feature key,Feature annotated on sequence
26 | PORG,Primary Organism,Scientific and common names of primary organism, and all higher levels of taxonomy
27 | COMP,Component Accession,Component accessions for an assembly
28 | ASSM,Assembly,Assembly
29 | DIV,Division,Division
30 | STRN,Strain,Strain
31 | ISOL,Isolate,Isolate
32 | CULT,Cultivar,Cultivar
33 | BRD,Breed,Breed
34 | BIOS,BioSample,BioSample
35 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/nucest.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to each sequence
 3 | FILT,Filter,Limits the records
 4 | WORD,Text Word,Free text associated with record
 5 | TITL,Title,Words in definition line
 6 | KYWD,Keyword,Nonstandardized terms provided by submitter
 7 | AUTH,Author,Author(s) of publication
 8 | JOUR,Journal,Journal abbreviation of publication
 9 | VOL,Volume,Volume number of publication
10 | ISS,Issue,Issue number of publication
11 | PAGE,Page Number,Page number(s) of publication
12 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy
13 | ACCN,Accession,Accession number of sequence
14 | PACC,Primary Accession,Does not include retired secondary accessions
15 | GENE,Gene Name,Name of gene associated with sequence
16 | PROT,Protein Name,Name of protein associated with sequence
17 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number
18 | PDAT,Publication Date,Date sequence added to GenBank
19 | MDAT,Modification Date,Date of last update
20 | SUBS,Substance Name,CAS chemical name or MEDLINE Substance Name
21 | PROP,Properties,Classification by source qualifiers and molecule type
22 | SQID,SeqID String,String identifier for sequence
23 | GPRJ,BioProject,BioProject
24 | SLEN,Sequence Length,Length of sequence
25 | FKEY,Feature key,Feature annotated on sequence
26 | ID,EST id,EST id in est table
27 | NAME,EST Name,EST uid in est table
28 | CLON,Clone ID,clone id
29 | LIB,Library Name,Library Name
30 | SUBM,Submitter Name,Submitter Name
31 | CIT,Citation Title,Citation Title Publication
32 | STRN,Strain,Strain
33 | ISOL,Isolate,Isolate
34 | CULT,Cultivar,Cultivar
35 | BRD,Breed,Breed
36 | BIOS,BioSample,BioSample
37 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/nucgss.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to each sequence
 3 | FILT,Filter,Limits the records
 4 | WORD,Text Word,Free text associated with record
 5 | TITL,Title,Words in definition line
 6 | KYWD,Keyword,Nonstandardized terms provided by submitter
 7 | AUTH,Author,Author(s) of publication
 8 | JOUR,Journal,Journal abbreviation of publication
 9 | VOL,Volume,Volume number of publication
10 | ISS,Issue,Issue number of publication
11 | PAGE,Page Number,Page number(s) of publication
12 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy
13 | ACCN,Accession,Accession number of sequence
14 | PACC,Primary Accession,Does not include retired secondary accessions
15 | GENE,Gene Name,Name of gene associated with sequence
16 | PROT,Protein Name,Name of protein associated with sequence
17 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number
18 | PDAT,Publication Date,Date sequence added to GenBank
19 | MDAT,Modification Date,Date of last update
20 | SUBS,Substance Name,CAS chemical name or MEDLINE Substance Name
21 | PROP,Properties,Classification by source qualifiers and molecule type
22 | SQID,SeqID String,String identifier for sequence
23 | GPRJ,BioProject,BioProject
24 | SLEN,Sequence Length,Length of sequence
25 | FKEY,Feature key,Feature annotated on sequence
26 | ID,GSS id,GSS id in gss table
27 | NAME,GSS Name,GSS uid in gss table
28 | CLON,Clone ID,clone id
29 | LIB,Library Name,Library Name
30 | SUBM,Submitter Name,Submitter Name
31 | CIT,Citation Title,Citation Title Publication
32 | LC,Library Class,Library Class
33 | STRN,Strain,Strain
34 | ISOL,Isolate,Isolate
35 | CULT,Cultivar,Cultivar
36 | BRD,Breed,Breed
37 | BIOS,BioSample,BioSample
38 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/nucleotide.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to each sequence
 3 | FILT,Filter,Limits the records
 4 | WORD,Text Word,Free text associated with record
 5 | TITL,Title,Words in definition line
 6 | KYWD,Keyword,Nonstandardized terms provided by submitter
 7 | AUTH,Author,Author(s) of publication
 8 | JOUR,Journal,Journal abbreviation of publication
 9 | VOL,Volume,Volume number of publication
10 | ISS,Issue,Issue number of publication
11 | PAGE,Page Number,Page number(s) of publication
12 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy
13 | ACCN,Accession,Accession number of sequence
14 | PACC,Primary Accession,Does not include retired secondary accessions
15 | GENE,Gene Name,Name of gene associated with sequence
16 | PROT,Protein Name,Name of protein associated with sequence
17 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number
18 | PDAT,Publication Date,Date sequence added to GenBank
19 | MDAT,Modification Date,Date of last update
20 | SUBS,Substance Name,CAS chemical name or MEDLINE Substance Name
21 | PROP,Properties,Classification by source qualifiers and molecule type
22 | SQID,SeqID String,String identifier for sequence
23 | GPRJ,BioProject,BioProject
24 | SLEN,Sequence Length,Length of sequence
25 | FKEY,Feature key,Feature annotated on sequence
26 | PORG,Primary Organism,Scientific and common names of primary organism, and all higher levels of taxonomy
27 | COMP,Component Accession,Component accessions for an assembly
28 | ASSM,Assembly,Assembly
29 | DIV,Division,Division
30 | STRN,Strain,Strain
31 | ISOL,Isolate,Isolate
32 | CULT,Cultivar,Cultivar
33 | BRD,Breed,Breed
34 | BIOS,BioSample,BioSample
35 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/omim.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,MIM ID,Unique number assigned to OMIM record
 3 | FILT,Filter,Limits the records
 4 | TITL,Title,Words in title of record
 5 | WORD,Text Word,Free text associated with record
 6 | AUTH,Contributor,Contributor to OMIM record
 7 | CLIN,Clinical Synopsis,Clinical features of disorder
 8 | MDAT,Modification Date,The last date on which the record was updated
 9 | ALVR,Allelic Variant,A subset of disease-producing mutations
10 | MDHS,Modification History,All dates on which the record was updated
11 | REFR,Reference,Authors and titles of citations
12 | GMAP,Gene Map,Chromosomal map location
13 | DSDR,Gene Map Disorder,Text word in disorder
14 | GENE,Gene Name,Name of gene associated with record
15 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number
16 | CHR,Chromosome,Chromosome number; also 'mitochondrial', 'unknown' properties
17 | EDTR,Editor,A username of an OMIM record Editor
18 | PROP,Properties,Properties of OMIM record
19 | PDAT,Publication Date,The date on which this record first appeared
20 | CSK,Clinical Synopsis Key,The keyword designating a part of the Clinical Synopsis
21 | CSED,Clinical Synopsis Editor,A username of an OMIM record Editor
22 | CSDT,Clinical Synopsis Date,The last date on which the Clinical Synopsis was updated
23 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/orgtrack.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | WORD,Text Word,Free text associated with record
 5 | WRD1,Text Word 1,Free text associated with record with higher score then regular TEXT field
 6 | PROP,Properties,Properties of this record
 7 | TITL,Organization name,Lab or Clinic name including institution and department
 8 | CITY,Organization City,City in which the organization is located
 9 | ST,Organization State or province,State, province or other political subdivision  in which the organization is located
10 | CTRY,Organization Country,Country in which the organization is located
11 | LOC,Organization Location,City, State, Country in which the organization is located
12 | PCOD,Organization  postal code,Postal code for the organization
13 | DIR,Organization Director(s),Full names of  director(s) with credentials
14 | STFF,Organization Staff,full names of non-director staff with credentials
15 | LS,Lab Service name,Lab Service name
16 | TYPE,Type of organization,category of an organization, e.g. laboratory, clinic, LSDB
17 | MDAT,Modification Date,The last date on which the record was updated
18 | TNO,Number of tests offered,Number of tests offered by this organization
19 | SNO,SERVICE NUMBER,Unique identifier for the state or province
20 | MTOD,Name of Method,method name
21 | MCAT,name for method category,name for method category
22 | TCAT,name for method top category,name for method top category
23 | LCRT,laboratory certification,laboratory certification
24 | CID,Country ISO code,Country ISO code
25 | SID,State ISO code,State ISO code
26 | TCID,Top Method Category ID,Top Method Category ID
27 | CTID,Method Category ID,Method Category ID
28 | DTCT,Disease and Top Method Category ID,Disease and Top Method Category ID
29 | DCAT,Disease and Method Category ID,Disease and Method Category ID
30 | MTID,Test Method ID,Test Method ID
31 | DID,Disease BioConcepts Entrez ID,Disease BioConcepts Entrez ID
32 | LSID,Lab Service ID,Lab Service ID
33 | DCUI,Disease BioConcepts concept id,Concept identifier from BioConcepts for a disease
34 | ORGN,Organism,scientific and common names of organism
35 | DASV,Disease and Additional Service ID,Disease and Additional Service ID
36 | ASID,Additional Service ID,Additional Service ID
37 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/pcassay.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,Assay ID,Assay ID
 3 | FILT,Filter,Limits the records
 4 | ANAM,Assay Name,AssayName
 5 | ADES,Assay Description,AssayDescription
 6 | APRL,Assay Protocol,AssayProtocol
 7 | CCMT,Categorized Comment,CategorizedComment
 8 | ACMT,Assay Comment,AssayComment
 9 | TNAM,Tid Name,TidName
10 | TDES,Tid Description,TidDescription
11 | RC,Readout Count,ReadoutCount
12 | SRID,Substance Source ID,External substance source identifier
13 | ASRD,Assay Source ID,External assay source identifier
14 | ACMD,Activity Outcome Method,ActivityOutcomeMethod
15 | SNME,Source Name,SourceName
16 | CSNM,Current Source Name,CurrentSourceName
17 | DDAT,Deposit Date,DepositDate
18 | MDAT,Modify Date,ModifyDate
19 | JDAT,Journal Publication Date,JournalPublicationDate
20 | JNAM,Journal Name,JournalName
21 | HDAT,Hold Until Date,HoldUntilDate
22 | AC,Active Sid Count,ActiveSidCount
23 | IAC,Inactive Sid Count,InactiveSidCount
24 | IC,Inconclusive Sid Count,InconclusiveSidCount
25 | TSC,Total Sid Count,TotalSidCount
26 | TCNT,Target Count,TargetCount
27 | ACC,Active Cid Count,ActiveCidCount
28 | PCC,Probe Cid Count,ProbeCidCount
29 | PSC,Probe Sid Count,ProbeSidCount
30 | IACC,Inactive Cid Count,InactiveCidCount
31 | ICC,Inconclusive Cid Count,InconclusiveCidCount
32 | UCC,Unspecified Cid Count,UnspecifiedCidCount
33 | USC,Unspecified Sid Count,UnspecifiedSidCount
34 | TCC,Total Cid Count,TotalCidCount
35 | NARD,Nucleic Acid Reagent ID,NucleicAcidReagentID
36 | XRCT,XRef Comment,XRefComment
37 | XRPD,XRef Pmid,XRefPmid
38 | XRGI,XRef Gi,XRefGi
39 | XRMB,XRef Mmdb,XRefMmdb
40 | XRGN,XRef Gene,XRefGeneID
41 | XRDL,XRef Dburl,XRefDburl
42 | XRSL,XRef Sburl,XRefSburl
43 | XRAL,XRef Asurl,XRefAsurl
44 | XRPI,XRef Proteingi,XRefProteingi
45 | XRNI,XRef Nucleotidegi,XRefNucleotidegi
46 | XRTY,XRef Taxonomy,XRefTaxonomy
47 | XRAD,XRef Aid,XRefAid
48 | XRMM,XRef Omim,XRefOmim
49 | SIDA,Substance ID Active,SubstanceIDActive
50 | SIDT,Substance ID Tested,SubstanceIDTested
51 | CIDA,Compound ID Active,CompoundIDActive
52 | CIDT,Compound ID Tested,CompoundIDTested
53 | MHDA,MeSH Description Active,MeSHDescriptionActive
54 | MHDT,MeSH Description Tested,MeSHDescriptionTested
55 | MHTA,MeSH Term Active,MeSHTermActive
56 | MHTT,MeSH Term Tested,MeSHTermTested
57 | PTN,Protein Target Name,ProteinTargetName
58 | PSFM,Protein SubFamily,Protein SubFamily
59 | GSYM,Gene Symbol,GeneSymbol
60 | PTC,Protein Target Comment,ProteinTargetComment
61 | PTD,Protein Target Description,ProteinTargetDescription
62 | BST,Bio Systems Title,The name of the BioSystems, via Protein target
63 | CCT,Categorized Comment Title,The name of the categorized comment
64 | PTGI,Protein Target GI,ProteinTargetGI
65 | BSID,BioSystems ID,ID of the BioSystems, via Protein Target
66 | PIGI,Pig GI,PigGI
67 | RTGI,RNA Target GI,RNATargetGI
68 | SYNA,Synonym Active,SynonymActive
69 | SYNT,Synonym Tested,SynonymTested
70 | PHAA,Pharm Action Active,PharmActionActive
71 | PHAT,Pharm Action Tested,PharmActionTested
72 | SRCC,Source Category,SourceCategory
73 | TXNM,Taxonomy Name,TaxonomyName
74 | TC,Tested Concentration,TestedConcentration
75 | GRN,Grant Number,GrantNumber
76 | NSAM,Number of Sids With Activity Concentration micromolar,NumberofSidsWithActivityConcmicromolar
77 | NCAM,Number of Cids With Activity Concentration micromolar,NumberofCidsWithActivityConcmicromolar
78 | NSAN,Number of Sids With Activity Concentration nanomolar,NumberofSidsWithActivityConcnanomolar
79 | NCAN,Number of Cids With Activity Concentration nanomolar,NumberofCidsWithActivityConcnanomolar
80 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy
81 | APRJ,Assay Project,The name of the summary assay to which this assay is related by same project
82 | CELL,Cell Line,CellLine
83 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/pccompound.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,CompoundID,Compound ID
 3 | FILT,Filter,Limits the records
 4 | SRC,SourceName,Source Name
 5 | SRID,SourceID,Source ID
 6 | SRCC,SourceCategory,Source Category
 7 | CDAT,CreateDate,Record Create Date
 8 | BAID,BioAssayID,Assay ID
 9 | AA,ActiveAid,Active AID
10 | IA,InactiveAid,Inactive AID
11 | INCH,InChI,InChI
12 | IKEY,InChIKey,InChI Key
13 | CSYN,CompleteSynonym,Complete Synonym
14 | SYNO,Synonym,Synonym
15 | MSHT,MeSHTerm,MeSH Term
16 | CMST,CompleteMeSHTerm,Complete MeSH Term
17 | MSHN,MeSHTreeNode,MeSH Tree Node
18 | PHMA,PharmAction,Pharmacological Action
19 | MHD,MeSHDescription,MeSH Description
20 | ELMT,Element,Element
21 | MW,MolecularWeight,Molecular Weight
22 | TFC,TotalFormalCharge,Total Formal Charge
23 | UPAC,IUPACName,IUPAC Name
24 | XLGP,XLogP,XLogP
25 | CPLX,Complexity,Complexity
26 | RBC,RotatableBondCount,Rotatable Bond Count
27 | HBDC,HydrogenBondDonorCount,Hydrogen Bond Donor Count
28 | HBAC,HydrogenBondAcceptorCount,Hydrogen Bond Acceptor Count
29 | HAC,HeavyAtomCount,Heavy Atom Count
30 | ACC,AtomChiralCount,Atom Chiral Count
31 | ACDC,AtomChiralDefCount,Atom Chiral Defined Count
32 | ACUC,AtomChiralUndefCount,Atom Chiral Undefined Count
33 | BCC,BondChiralCount,Bond Chiral Count
34 | BCDC,BondChiralDefCount,Bond Chiral Defined Count
35 | BCUC,BondChiralUndefCount,Bond Chiral Undefined Count
36 | IAC,IsotopeAtomCount,Isotope Atom Count
37 | CUC,CovalentUnitCount,Covalent Unit Count
38 | TC,TautomerCount,Tautomer Count
39 | AC,ActiveAidCount,Active AID Count
40 | IC,InactiveAidCount,Inactive AID Count
41 | TAC,TotalAidCount,Total AID Count
42 | AAR,ActiveAidRatio,Active AID Ratio
43 | SID,SubstanceID,Substance ID
44 | TPSA,TPSA,TPSA
45 | ASRC,AssaySourceName,Assay Source Name
46 | EMAS,ExactMass,Exact Mass
47 | MMAS,MonoisotopicMass,Monoisotopic Mass
48 | ACON,ActiveConcentration,Active Concentration
49 | TCON,TestedConcentration,Tested Concentration
50 | VL3D,Volume3D,3D Volume
51 | X3D,XStericQuadrupole3D,3D X Steric Quadrupole
52 | Y3D,YStericQuadrupole3D,3D Y Steric Quadrupole
53 | Z3D,ZStericQuadrupole3D,3D Z StericQuadrupole
54 | PAID,PharmActionID,Pharmacological Action ID
55 | STID,StructureID,Depositor Associated Structure ID
56 | FC3D,FeatureCount3D,3D Feature Count
57 | FAC3,FeatureAcceptorCount3D,3D Feature Acceptor Count
58 | FDC3,FeatureDonorCount3D,3D Feature Donor Count
59 | FNC3,FeatureAnionCount3D,3D Feature Anion Count
60 | FTC3,FeatureCationCount3D,3D Feature Cation Count
61 | FRC3,FeatureRingCount3D,3D Feature Ring Count
62 | FHC3,FeatureHydrophobeCount3D,3D Feature Hydrophobe Count
63 | CMR3,ConformerModelRmsd3D,3D Conformer Model RMSD
64 | ERC3,EffectiveRotorCount3D,3D Effective Rotatable Bond Count
65 | DCSY,DepositorCompleteSynonym,Depositor Complete Synonym
66 | DSYN,DepositorSynonym,Depositor Synonym
67 | CCNT,ConformerCount3D,3D Conformer Count
68 | DCNT,DepositorCount,Depositor Count
69 | PTNT,Patent,Patent
70 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/pcsubstance.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,SubstanceID,Substance ID
 3 | FILT,Filter,Limits the records
 4 | SRC,SourceName,Source Name
 5 | CSN,CurrentSourceName,Current Source Name
 6 | SRID,SourceID,Source ID
 7 | SRCC,SourceCategory,Source Category
 8 | SRD,SourceReleaseDate,Source Release Date
 9 | DDAT,DepositDate,Deposit Date
10 | MDAT,ModifyDate,Modification Date
11 | BAID,BioAssayID,Assay ID
12 | AA,ActiveAid,Active AID
13 | IA,InactiveAid,Inactive AID
14 | INCH,InChI,InChI
15 | IKEY,InChIKey,InChI Key
16 | CSYN,CompleteSynonym,Complete Synonym
17 | SYNO,Synonym,Synonym
18 | MSHT,MeSHTerm,MeSH Term
19 | CMST,CompleteMeSHTerm,Complete MeSH Term
20 | MSHN,MeSHTreeNode,MeSH Tree Node
21 | PHMA,PharmAction,Pharmacological Action
22 | CMT,Comment,Comment
23 | MHD,MeSHDescription,MeSH Description
24 | ELMT,Element,Element
25 | MW,MolecularWeight,Molecular Weight
26 | TFC,TotalFormalCharge,Total Formal Charge
27 | UPAC,IUPACName,IUPAC Name
28 | XLGP,XLogP,XLogP
29 | CPLX,Complexity,Complexity
30 | RBC,RotatableBondCount,Rotatable Bond Count
31 | HBDC,HydrogenBondDonorCount,Hydrogen Bond Donor Count
32 | HBAC,HydrogenBondAcceptorCount,Hydrogen Bond Acceptor Count
33 | HAC,HeavyAtomCount,Heavy Atom Count
34 | ACC,AtomChiralCount,Atom Chiral Count
35 | ACDC,AtomChiralDefCount,Atom Chiral Defined Count
36 | ACUC,AtomChiralUndefCount,Atom Chiral Undefined Count
37 | BCC,BondChiralCount,Bond Chiral Count
38 | BCDC,BondChiralDefCount,Bond Chiral Defined Count
39 | BCUC,BondChiralUndefCount,Bond Chiral Undefined Count
40 | IAC,IsotopeAtomCount,Isotope Atom Count
41 | CUC,CovalentUnitCount,Covalent Unit Count
42 | TC,TautomerCount,Tautomer Count
43 | AC,ActiveAidCount,Active AID Count
44 | IC,InactiveAidCount,Inactive AID Count
45 | TAC,TotalAidCount,Total AID Count
46 | AAR,ActiveAidRatio,Active AID Ratio
47 | SCID,StandardizedCID,Standardized CID
48 | CCID,ComponentCID,Component CID
49 | CID,CompoundID,Compound ID
50 | TPSA,TPSA,TPSA
51 | ASRC,AssaySourceName,Assay Source Name
52 | EMAS,ExactMass,Exact Mass
53 | MMAS,MonoisotopicMass,Monoisotopic Mass
54 | ACON,ActiveConcentration,Active Concentration
55 | TCON,TestedConcentration,Tested Concentration
56 | PAID,PharmActionID,Pharmacological Action ID
57 | STID,StructureID,Structure ID
58 | HUD,HoldUntilDate,Hold Until Date
59 | PTNT,Patent,Patent
60 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/pmc.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | PMID,PubMed ID,Unique identifier from PubMed
 5 | AUTH,Author,Author(s) of publication
 6 | TITL,Title,A short descriptive name
 7 | PDAT,Publication Date,Date of publication
 8 | ABST,Abstract,Abstract
 9 | CAPT,Figure/Table Caption,Caption
10 | SECT,Section Title,Section Title
11 | REFR,Reference,Reference
12 | REFA,Reference Author,Name of Reference author(s)
13 | RPID,Reference PubMed ID,Reference Unique identifier from PubMed
14 | JOUR,Journal,Source journal of publication
15 | VOL,Volume,Volume number of publication
16 | ISS,Issue,Issue number of publication
17 | PAGE,Pagination,Page number(s) of publication
18 | EPDT,Electronic Publication Date,Date publication first accessible through Entrez
19 | WORD,Text Word,Free text associated with publication
20 | ARTI,Body - All Words,Article Body
21 | KWD,Body - Key Terms,Keyword
22 | METH,Methods - Key Terms,Keyword in the Methods Section
23 | MESH,MeSH Terms,Medical Subject Headings assigned to publication
24 | MAJR,MeSH Major Topic,Medical terms of major importance assigned to publication
25 | SUBH,MeSH Subheading,Additional specificity for MeSH Terms
26 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number
27 | SUBS,Supplementary Concept,CAS chemical name or MEDLINE Substance Name
28 | AFFL,Affiliation,Author's institutional affiliation and address
29 | LDAT,PMC Live Date,PMC live date
30 | ORGN,Organism,scientific and common names of organism
31 | ONSN,Organism unsynonymized,unsynonymized organism names
32 | ACCN,Accession,Accession number of sequence
33 | EDAT,Entrez Date,Entrez date
34 | DOI,DOI,Digital Object Identifier
35 | FULL,Full Author Name,Full Author Name(s) of publication
36 | GRNT,Grant Number,NIH Grant Numbers
37 | ACK,Acknowledgments,Acknowledgments
38 | PPDT,Print Publication Date,Print Publication Date
39 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/popset.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to each sequence
 3 | FILT,Filter,Limits the records
 4 | WORD,Text Word,Free text associated with record
 5 | TITL,Title,Words in definition line
 6 | KYWD,Keyword,Nonstandardized terms provided by submitter
 7 | AUTH,Author,Author(s) of publication
 8 | JOUR,Journal,Journal abbreviation of publication
 9 | VOL,Volume,Volume number of publication
10 | ISS,Issue,Issue number of publication
11 | PAGE,Page Number,Page number(s) of publication
12 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy
13 | ACCN,Accession,Accession number of sequence
14 | PACC,Primary Accession,Does not include retired secondary accessions
15 | GENE,Gene Name,Name of gene associated with sequence
16 | PROT,Protein Name,Name of protein associated with sequence
17 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number
18 | PDAT,Publication Date,Date sequence added to GenBank
19 | MDAT,Modification Date,Date of last update
20 | SUBS,Substance Name,CAS chemical name or MEDLINE Substance Name
21 | PROP,Properties,Classification by source qualifiers and molecule type
22 | SQID,SeqID String,String identifier for sequence
23 | GPRJ,BioProject,BioProject
24 | FKEY,Feature key,Feature annotated on sequence
25 | PCNT,Protein Count,Number of proteins in the set
26 | NCNT,Nucleotide Count,Number of nucleotides in the set
27 | STRN,Strain,Strain
28 | ISOL,Isolate,Isolate
29 | CULT,Cultivar,Cultivar
30 | BRD,Breed,Breed
31 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/probe.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | AUTH,Author,Author
 5 | DIST,Distributor,Distributor
 6 | GENE,Gene Name,Gene Name
 7 | GNID,Gene ID,Gene ID
 8 | KYWD,Application,Application
 9 | ORGN,Organism,Organism
10 | PRNM,Probe Name,Probe Name
11 | PRTY,Probe Type,Probe Type
12 | PROP,Properties,Properties
13 | WORD,Text Word,Text Word
14 | TITL,Title,Title
15 | CAPT,Caption,Caption
16 | SUBM,Submission,Submission
17 | COLL,Platform Name,Platform Name
18 | SEQ,Sequence,Sequence
19 | ACCN,Sequence accession,Semicolon delimited sequence accession(s)
20 | PMID,Pubmed ID,Pubmed ID
21 | MDAT,Modification Date,Date of the last update of the submission
22 | PDAT,Publication Date,Date sequence added to GenBank
23 | VAL,Validation,Validation
24 | PLID,Platform ID,Platform ID
25 | USTS,UniSTS ID,Legacy UniSTS ID
26 | PSET,Probeset UID,Probeset(s) this probe belongs to
27 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/protein.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to each sequence
 3 | FILT,Filter,Limits the records
 4 | WORD,Text Word,Free text associated with record
 5 | TITL,Title,Words in definition line
 6 | KYWD,Keyword,Nonstandardized terms provided by submitter
 7 | AUTH,Author,Author(s) of publication
 8 | JOUR,Journal,Journal abbreviation of publication
 9 | VOL,Volume,Volume number of publication
10 | ISS,Issue,Issue number of publication
11 | PAGE,Page Number,Page number(s) of publication
12 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy
13 | ACCN,Accession,Accession number of sequence
14 | PACC,Primary Accession,Does not include retired secondary accessions
15 | GENE,Gene Name,Name of gene associated with sequence
16 | PROT,Protein Name,Name of protein associated with sequence
17 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number
18 | PDAT,Publication Date,Date sequence added to GenBank
19 | MDAT,Modification Date,Date of last update
20 | SUBS,Substance Name,CAS chemical name or MEDLINE Substance Name
21 | PROP,Properties,Classification by source qualifiers and molecule type
22 | SQID,SeqID String,String identifier for sequence
23 | GPRJ,BioProject,BioProject
24 | SLEN,Sequence Length,Length of sequence
25 | MLWT,Molecular Weight,Molecular Weight
26 | FKEY,Feature key,Feature annotated on sequence
27 | PORG,Primary Organism,Scientific and common names of primary organism, and all higher levels of taxonomy
28 | ASSM,Assembly,Assembly
29 | DIV,Division,Division
30 | STRN,Strain,Strain
31 | ISOL,Isolate,Isolate
32 | CULT,Cultivar,Cultivar
33 | BRD,Breed,Breed
34 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/proteinclusters.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | ACCN,Accession,Accession
 5 | PMID,PubMed ID,PubMed ID
 6 | TITL,Title,Title
 7 | GENE,Gene Name,Gene Name
 8 | GSYN,Gene Synonym,Gene Synonym
 9 | COG,COG,Clusters of Orthologous Groups of proteins
10 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number
11 | HMAP,HAMAP,HAMAP
12 | KO,KO,KO
13 | PROT,Protein Name,Name of protein
14 | PACC,Protein Accession,Protein Accession
15 | LTAG,Locus Tag,Locus Tag
16 | SLEN,Sequence Length,Length of sequence
17 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy
18 | TXID,Taxonomy ID,Taxonomy ID
19 | CDAT,Creation Date,Creation Date
20 | MDAT,Modification Date,Modification Date
21 | SIZE,Size,Size
22 | DOM,Domain Name,Domain Name
23 | DOMS,Domains,Domains
24 | PUID,Protein GI,Protein GI
25 | PARA,Paralogs,Paralogs
26 | COGG,COG group,COG group
27 | AVGL,Average Length,Average Length
28 | PROP,Properties,Properties
29 | TPUB,Total Publications,Total Publications
30 | SPCN,SwissProt Accession,SwissProt Accession
31 | CONS,Conserved In,Conserved In
32 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/pubmed.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | TITL,Title,Words in title of publication
 5 | WORD,Text Word,Free text associated with publication
 6 | MESH,MeSH Terms,Medical Subject Headings assigned to publication
 7 | MAJR,MeSH Major Topic,MeSH terms of major importance to publication
 8 | AUTH,Author,Author(s) of publication
 9 | JOUR,Journal,Journal abbreviation of publication
10 | AFFL,Affiliation,Author's institutional affiliation and address
11 | ECNO,EC/RN Number,EC number for enzyme or CAS registry number
12 | SUBS,Supplementary Concept,CAS chemical name or MEDLINE Substance Name
13 | PDAT,Date - Publication,Date of publication
14 | EDAT,Date - Entrez,Date publication first accessible through Entrez
15 | VOL,Volume,Volume number of publication
16 | PAGE,Pagination,Page number(s) of publication
17 | PTYP,Publication Type,Type of publication (e.g., review)
18 | LANG,Language,Language of publication
19 | ISS,Issue,Issue number of publication
20 | SUBH,MeSH Subheading,Additional specificity for MeSH term
21 | SI,Secondary Source ID,Cross-reference from publication to other databases
22 | MHDA,Date - MeSH,Date publication was indexed with MeSH terms
23 | TIAB,Title/Abstract,Free text associated with Abstract/Title
24 | OTRM,Other Term,Other terms associated with publication
25 | INVR,Investigator,Investigator
26 | COLN,Author - Corporate,Corporate Author of publication
27 | CNTY,Place of Publication,Country of publication
28 | PAPX,Pharmacological Action,MeSH pharmacological action pre-explosions
29 | GRNT,Grant Number,NIH Grant Numbers
30 | MDAT,Date - Modification,Date of last modification
31 | CDAT,Date - Completion,Date of completion
32 | PID,Publisher ID,Publisher ID
33 | FAUT,Author - First,First Author of publication
34 | FULL,Author - Full,Full Author Name(s) of publication
35 | FINV,Investigator - Full,Full name of investigator
36 | TT,Transliterated Title,Words in transliterated title of publication
37 | LAUT,Author - Last,Last Author of publication
38 | PPDT,Print Publication Date,Date of print publication
39 | EPDT,Electronic Publication Date,Date of Electronic publication
40 | LID,Location ID,ELocation ID
41 | CRDT,Date - Create,Date publication first accessible through Entrez
42 | BOOK,Book,ID of the book that contains the document
43 | ED,Editor,Section's Editor
44 | ISBN,ISBN,ISBN
45 | PUBN,Publisher,Publisher's name
46 | AUCL,Author Cluster ID,Author Cluster ID
47 | EID,Extended PMID,Extended PMID
48 | DSO,DSO,Additional text from the summary
49 | AUID,Author - Identifier,Author Identifier
50 | PS,Subject - Personal Name,Personal Name as Subject
51 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/pubmedhealth.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | TITL,Title,Article title
 5 | KYPH,Keyphrase,High-scored field for exact-matched phrases
 6 | STXT,Secondary Text,Special text word
 7 | CONP,Concept Phrases,Generated keywords
 8 | BOOK,Book,ID of the book that contains the document
 9 | PID,ParentId,ID of the book
10 | PMID,PmId,PubMed ID
11 | RD,ReleaseDate,ReleaseDate
12 | SUB,Subject,Subject
13 | AID,AccessionID,Accession ID
14 | UMLS,UMLSID,UMLS Concept ID
15 | ICD9,ICD9ID,ICD-9 ID
16 | BCID,BioconceptsID,BioConcepts ID
17 | PDAT,Date of publication,Date of publication
18 | UDAT,Update Date,Content update date
19 | DR,DrugName,Drug brand name
20 | TYPE,Type,Document type (Book/Article/Chapter)
21 | CAT,Category,Category
22 | PUBL,Publisher,PMH Content Provider
23 | CLID,CollectionId,Collection Identifier
24 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/seqannot.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | ACCN,Accession,Accession number of sequence
 5 | TITL,Title,Words in definition line
 6 | PROP,Properties,Classification by source qualifiers and molecule type
 7 | WORD,Text Word,Free text associated with record
 8 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy
 9 | AUTH,Author,Author(s) of publication
10 | PDAT,Publication Date,Date sequence added to GenBank
11 | MDAT,Modification Date,Date of last update
12 | ASSM,Target Assembly,Target Assembly
13 | ANNT,Annotation Type,Attribute
14 | VCTX,Viewer_Context,Viewer Context
15 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/snp.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | RS,Reference SNP ID,Clustered SNP ID (rs)
 5 | CHR,Chromosome,chromosomes
 6 | GENE,Gene Name,locus link symbol
 7 | HAN,Submitter Handle,Submitter Handle
 8 | ACCN,Accession,nucleotide accessions
 9 | LLID,LocusLink ID,locus link UID
10 | ORGN,Organism,Organism
11 | FXN,Function Class,Function class
12 | GTYP,Genotype,Genotype info
13 | NREF,non reference assembly,SNP not mapped to reference assembly
14 | HETZ,Heterozygosity,Heterozygosity
15 | MPWT,Map Weight,Map weight
16 | VALI,Validation Status,Validation status
17 | SRAT,Success Rate,Success rate
18 | CBID,Create Build ID,Original Build ID
19 | UBID,Update Build ID,Update Build ID
20 | PDAT,Publication Date,SNP Publication date
21 | MDAT,Modification Date,SNP modification date
22 | PCLS,Population Class,Population classification based on geographic location
23 | MCLS,Method Class,Assay Method
24 | SS,Submitter SNP ID,Submitter ID
25 | SID,Local SNP ID,Local SNP ID
26 | VARI,Allele,Allele
27 | SCLS,SNP Class,SNP class
28 | GDSC,Gene Description,description of gene
29 | CPOS,Base Position,Chromosome base position
30 | GPOS,Contig Position,Contig base position
31 | WORD,Text Word,Free text associated with record
32 | WTAA,Reference Amino Acid,Reference Amino Acid
33 | MTAA,Variant Amino Acid,Variant or Mutant Amino Acid
34 | RSNP,Reference SNP,Reference SNP
35 | SIDX,SNP Index,SNP Index
36 | ALOR,SNP Allele Origin,Allele originated from somatic or germline
37 | SUSP,Suspected false variation,Variation suspected to be false based on evidence
38 | CLIN,Clinical Significance,Variations with clinical effects or significances
39 | GMAF,Global Minor Allele Frequency,Minor Allele Frequency derived from global population (ie. 1000G)
40 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/sra.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | ACCN,Accession,Accession number of sequence
 5 | TITL,Title,Words in definition line
 6 | PROP,Properties,Classification by source qualifiers and molecule type
 7 | WORD,Text Word,Free text associated with record
 8 | ORGN,Organism,Scientific and common names of organism, and all higher levels of taxonomy
 9 | AUTH,Author,Author(s) of publication
10 | PDAT,Publication Date,Date sequence added to GenBank
11 | MDAT,Modification Date,Date of last update
12 | GPRJ,BioProject,BioProject
13 | BSPL,BioSample,BioSample
14 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/structure.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,MMDB ID,mmdbId
 3 | FILT,Filter,Limits the records
 4 | ACCN,PDB Accession,PDB Accession
 5 | ECNO,EC/RN Number,EC/RN Number
 6 | RESO,Resolution,Resolution
 7 | EXPM,Experimental Method,Experimental Method
 8 | TITL,Title,Citation Title
 9 | ABS,Abstract,The abstracts of all PubMed references that are linked to the structure
10 | AUTH,Author,Citation Author
11 | PCLA,PDB Class,Pdb Class
12 | PSRC,PDB Source,Pdb Source
13 | PDSC,PDB Description,PdbDescr
14 | PCOM,PDB Comment,PdbComment
15 | PDD,PDB Deposit Date,PDB Deposit Date
16 | DDAT,MMDB Entry Date,MMDB Deposit Date
17 | MDAT,MMDB Modify Date,MMDB Modify Date
18 | LCOD,PDB Chemical Code,chemical ligand codes taken from PDB file
19 | LNAM,Chemical Name,chemical ligand names taken from PDB file
20 | CSYN,Chemical Synonyms,Chemical synonyms taken from PubChem
21 | LDES,Chemical Description,Chemical description taken from PDB
22 | ORGN,Organism,Organism Name
23 | TXID,Taxonomy ID,Numerical taxonomy identifier
24 | PMC,BioUnit Protein Molecule Count,Count of Protein Molecules in BioUnit
25 | DMC,BioUnit DNA Molecule Count,Count of DNA molecules in BioUnit
26 | RMC,BioUnit RNA Molecule Count,Count of RNA molecules in BioUnit
27 | BPC,BioUnit Biopolymer Count,Count of Biopolymers in BioUnit
28 | LCOU,BioUnit Chemical Count,Count of Chemical Molecules in BioUnit
29 | OCOU,BioUnit Other Molecule Count,Count of Other Molecules in BioUnit
30 | JOUR,Journal,Source journal of structure
31 | CDID,Conserved Domain PSSMID,identifier for a conserved domain cluster
32 | CDSN,Conserved Domain Short Name,Short name of the domain of a conserved domain cluster
33 | CDDT,Conserved Domain Title,Title of the domain of a conserved domain cluster
34 | CDDF,Conserved Domain Description,Defline of the domain of a conserved domain cluster
35 | SFID,Conserved Domain Superfamily PSSMID,identifier for a superfamily domain cluster
36 | SPFN,Conserved Domain Superfamily Short Name,Short name of a superfamily of conserved domain clusters
37 | SPTL,Conserved Domain Superfamily Title,Title of a superfamily of conserved domain clusters
38 | SPDF,Conserved Domain Superfamily Description,Definition line of a superfamily cluster of conserved domain
39 | OS,Oligomeric State,Oligomeric state of the biological unit
40 | PNAM,Protein Name,Names of Protein Molecules
41 | GN,Gene Name,Names of genes associated with protein molecules
42 | GDSC,Gene Description,Descriptions of genes associated with protein molecules
43 | DNAM,DNA Name,Names of DNA Molecules
44 | RNAM,RNA Name,Names of RNA Molecules
45 | ONAM,Other Molecule Name,Names of Other Molecules
46 | APMC,ASU Protein Molecule Count,Count of Protein Molecules in ASU
47 | ADMC,ASU DNA Molecule Count,Count of DNA molecules in ASU
48 | ARMC,ASU RNA Molecule Count,Count of RNA molecules in ASU
49 | ABPC,ASU Biopolymer Count,Count of Biopolymers in ASU
50 | ALCT,ASU Chemical Count,Count of Chemicals in ASU
51 | AOCT,ASU Other Molecule Count,Count of other molecules in ASU
52 | MLWT,BioUnit Molecular Weight,Molecular Weight of the default Biological Assembly
53 | FC,Number of PDB Records per Structure,Number of PDB records that have been combined to define a complete biological assembly. (For most structures, one record suffices; very large structures were split by the PDB into multiple records.)
54 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/taxonomy.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,Taxonomy ID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | SCIN,Scientific Name,Scientific name of organism
 5 | COMN,Common Name,Common name of organism
 6 | TXSY,Synonym,Synonym of organism name
 7 | ALLN,All Names,All aliases for organism
 8 | NXLV,Next Level,Immediate parent in taxonomic hierarchy
 9 | SBTR,Subtree,Any parent node in taxonomic hierarchy
10 | LNGE,Lineage,Lineage in taxonomic hierarchy
11 | GC,GC,Nuclear genetic code
12 | MGC,MGC,Mitochondrial genetic code
13 | PGC,PGC,Mitochondrial genetic code
14 | TXDV,Division,GenBank division
15 | RANK,Rank,Hierarchical position (e.g., order, genus)
16 | EDAT,Entrez Date,Date record first accessible through Entrez
17 | MDAT,Modification Date,Date of last update
18 | PROP,Properties,Property defined on particular node (e.g., terminal node)
19 | WORD,Text Word,Free text associated with record
20 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/toolkit.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | DID,Dox ID,This is the doxygen id for this entity
 5 | DEF,Definition,Definition
 6 | DEFT,Definition Type,Type of Definition
 7 | SD,Short Description,Short Description
 8 | LD,Long Description,Long Description
 9 | FILE,File Name,File Name
10 | MODS,Modifiers,Modifiers
11 | ATTR,Attributes,Attributes
12 | LINE,Lines,Lines
13 | LNK,Link,Link
14 | NAM,Name,Name of item (be it class, method, etc.)
15 | DEFB,Defined by,The entity that defines this entity
16 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/toolkitall.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neilfws/utils4bioinformatics/d689dbac6a2e6959ab369ec76596964d108c599e/ncbi/entrez_db_terms/data/toolkitall.txt


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/toolkitbook.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | AUTH,Author,Section's author
 5 | CA,Corporate Author,Corporate Author of publication
 6 | FA,Full Author Name,Full Author Name(s) of publication
 7 | FE,Full Editor Name,f
 8 | TITL,Title,Section's title
 9 | TYPE,Type,Section's type
10 | STXT,Secondary Text,Special text word
11 | CONP,Concept Phrases,Generated keywords
12 | BOOK,Book,ID of the book that contains the document
13 | RMID,RefPMID,Citation search by PmId
14 | RID,Rid,Book internal ID
15 | PUBN,Publisher,Publisher's Name
16 | PDAT,Publication Year,Publication Year
17 | ISBN,ISBN,ISBN
18 | ATTR,Attribute,Attributes in key value ordered pairs
19 | EDIT,Editor,Section's Editor
20 | RD,Release Date,Release Date
21 | SUB,Subject,Subject
22 | RT,Resource Type,Resource Type
23 | AID,Accession ID,Accession ID
24 | BACI,Book Accession ID,Book Accession ID
25 | CHID,Chapter Accession ID,Chapter Accession ID
26 | 


--------------------------------------------------------------------------------
/ncbi/entrez_db_terms/data/unigene.txt:
--------------------------------------------------------------------------------
 1 | ALL,All Fields,All terms from all searchable fields
 2 | UID,UID,Unique number assigned to publication
 3 | FILT,Filter,Limits the records
 4 | ORGN,Organism,scientific and common names of organism
 5 | TITL,Title,title of cluster
 6 | LIBR,Library,dbEST library names
 7 | TISS,Tissue,tissue sources of libraries
 8 | CLON,Clone ID,clone ids, with and without IMAGE
 9 | NCAC,Nucleotide Accession,nucleotide accessions of seqeunces
10 | NUID,Nucleotide UID,nucleotide uids of sequences
11 | PRAC,Protein Accession,protein accessions
12 | PUID,Protein UID,protein uids
13 | PROP,Properties,various flags
14 | WORD,Text Word,titles of sequences, vectors
15 | CHR,Chromosome,chromosomes
16 | GENE,Gene Name,locus link symbol
17 | GDSC,Gene Description,description of gene
18 | LLID,Gene ID,gene id
19 | TXID,Taxonomy ID,taxonomy id
20 | ESTC,Est Count,number of ests per cluster
21 | MRNA,mRNA Count,number of mrna per cluster
22 | SEQC,Sequence Count,total number of sequences per cluster
23 | CID,Cluster ID,Cluster ID
24 | EXPR,Expression,library description of all member sequences
25 | REXP,Restricted Expression,library description of the majority of member sequences
26 | PRNK,Page Rank,Page Rank
27 | RTYP,Record Type,record type
28 | 


--------------------------------------------------------------------------------
/ncbi/taxonomy/README.md:
--------------------------------------------------------------------------------
1 | # Taxonomy
2 | 
3 | Utilities for working with the NCBI Taxonomy database.
4 | 
5 | ## Current contents
6 | 
7 | 1. virus_hosts - get hosts for viruses given taxonomy ID 
8 | 


--------------------------------------------------------------------------------
/ncbi/taxonomy/virus_hosts/README.md:
--------------------------------------------------------------------------------
 1 | # virus_hosts
 2 | 
 3 | [See this blog post](https://nsaunders.wordpress.com/2015/06/02/virus-hosts-from-ncbi-taxonomy-web-pages/).
 4 | 
 5 | The code in code/ruby/virus2host.rb takes a taxonomy UID as input and returns the UID, rank, name and host (where present) for the virus.
 6 | 
 7 | The file in data/virus_host.tsv was generated by downloading all virus UIDs from the taxonomy database to a file, then submitting each line to the Ruby script.
 8 | 
 9 |     # first save all virus UIDs listed at http://www.ncbi.nlm.nih.gov/taxonomy/?term=txid10239[Subtree] to uids.txt, one per line
10 | 
11 |     for line in $(cat uids.txt)
12 |       do ruby virus2host.rb $line >> virus_host.tsv
13 |       sleep 1
14 |     done
15 | 
16 | NOTE: the code scrapes HTML and will break if NCBI changes the page markup in the future.
17 | 


--------------------------------------------------------------------------------
/ncbi/taxonomy/virus_hosts/code/ruby/virus2host.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/ruby
 2 | 
 3 | require 'nokogiri'
 4 | require 'open-uri'
 5 | 
 6 | def get_host(uid)
 7 | 	url   = "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&lvl=3&lin=f&keep=1&srchmode=1&unlock&id=" + uid.to_s
 8 | 	doc   = Nokogiri::HTML.parse(open(url).read)
 9 | 	data  = doc.xpath("//td").collect { |x| x.inner_html.split("<br>") }.flatten
10 | 	orgn = ""
11 | 	rank = ""
12 | 	host = ""
13 | 	data.each do |e|
14 | 		orgn = $1 if e =~ /<h2>(.*?)<\/h2>/
15 | 		rank = $1 if e =~ /Rank:\s+<\/em>(.*?)$/
16 | 		host = $1 if e =~ /Host:\s+<\/em>(.*?)$/
17 | 	end
18 | 	puts [uid, rank, orgn, host].join("\t")
19 | end
20 | 
21 | get_host(ARGV[0])


--------------------------------------------------------------------------------
/ncbi/taxonomy/virus_hosts/data/host_count.txt:
--------------------------------------------------------------------------------
 1 |    1301 
 2 |     283 algae
 3 |     114 archaea
 4 |    4509 bacteria
 5 |       8 diatom
 6 |      51 enviroment
 7 |     267 fungi
 8 |       1 fungi| plants| invertebrates
 9 |       4 human
10 |     761 invertebrates
11 |     181 invertebrates| plants
12 |       7 invertebrates| vertebrates
13 |    3979 plants
14 |     102 protozoa
15 |    6834 vertebrates
16 |  115052 vertebrates| human
17 |      43 vertebrates| human  stool
18 |     225 vertebrates| invertebrates
19 |     656 vertebrates| invertebrates| human
20 | 


--------------------------------------------------------------------------------
/uniprot_words/code/R/match_words_uniprot.R:
--------------------------------------------------------------------------------
 1 | library(readr)
 2 | library(dplyr)
 3 | library(seqinr)
 4 | library(AhoCorasickTrie)
 5 | 
 6 | # Search protein sequences for dictionary words longer than 7 characters
 7 | # and write one row per hit (.id, Keyword, Offset) to a CSV file.
 8 | 
 9 | # input files: stop with an error up front if either one is missing
10 | words_file <- "~/Downloads/words_alpha.txt"
11 | fasta_file <- "~/Downloads/uniprot_sprot.fasta.gz"
12 | stopifnot(file.exists(words_file), file.exists(fasta_file))
13 | 
14 | # words are upper-cased before searching
15 | words <- read_lines(words_file) %>%
16 |   toupper()
17 | 
18 | sp <- read.fasta(fasta_file,
19 |                  as.string = TRUE,
20 |                  seqtype = "AA")
21 | 
22 | # search & retain only sequences with at least one hit
23 | long_words <- words[which(nchar(words) > 7)]
24 | results <- AhoCorasickSearchList(long_words, sp, alphabet = "aminoacid")
25 | has_hit <- vapply(results, function(x) length(x[[1]]) > 0, logical(1))
26 | results <- results[has_hit]
27 | 
28 | # One row per hit. x[[1]] holds the hits for one sequence; binding them
29 | # row-wise keeps every hit however many a sequence has, and does not rely
30 | # on a Keyword.1/Offset.1 column existing (a wide one-row-per-sequence
31 | # layout keeps only the first two hits and fails when no sequence has two).
32 | # NOTE(review): assumes each hit is a list with Keyword and Offset
33 | # elements - confirm against the AhoCorasickTrie documentation.
34 | word_matches <- results %>%
35 |   lapply(function(x) bind_rows(lapply(x[[1]], as_tibble))) %>%
36 |   bind_rows(.id = ".id") %>%
37 |   arrange(desc(nchar(Keyword)))
38 | 
39 | # assumes running from code/R/
40 | word_matches %>% write_csv("../../data/word_matches.csv")
41 | 


--------------------------------------------------------------------------------
/uniprot_words/data/word_matches_de.csv:
--------------------------------------------------------------------------------
  1 | .id,Keyword,Offset
  2 | sp|Q1LX78|CFTR_DANRE,ALTSEIMEN,261
  3 | sp|G5ED05|CNNM5_CAEEL,EREKTILEN,312
  4 | sp|Q3JCN1|G6PI_NITOC,GEPINNTEN,83
  5 | sp|Q99KY4|GAK_MOUSE,KAPITELLE,325
  6 | sp|P97874|GAK_RAT,KAPITELLE,325
  7 | sp|Q56198|GLK_STAXY,TAGLILIEN,272
  8 | sp|Q8N3R3|TCAIM_HUMAN,VEREINENS,53
  9 | sp|Q8TKS3|UVRB_METAC,DIKTIERTE,498
 10 | sp|Q8PRZ9|UVRB_METMA,DIKTIERTE,498
 11 | sp|Q6FTM9|KEX1_CANGA,ANDERSWIE,331
 12 | sp|A1TZU2|MNMC_MARN8,LERNTYPEN,358
 13 | sp|P0A447|PSBA2_SYNEL,RETTETEST,225
 14 | sp|P0A446|PSBA2_THEVB,RETTETEST,225
 15 | sp|A0Q810|SECB2_FRATN,INNENFELD,50
 16 | sp|Q6DJR2|WWC2_XENTR,EIERLEGER,369
 17 | sp|O77384|LRR4_PLAF7,ENDENDEN,3335
 18 | sp|A0P8X0|AAMY_NIACI,LERNFALL,488
 19 | sp|Q65X71|ACA6_ORYSJ,EINCREME,924
 20 | sp|A7MS74|ACCD1_VIBC1,AKTSAALE,180
 21 | sp|Q87MP2|ACCD1_VIBPA,AKTSAALE,180
 22 | sp|Q87I11|ACCD2_VIBPA,AKTSAALE,180
 23 | sp|Q0VPJ1|ACCD_ALCBS,AKTSAALE,184
 24 | sp|Q5WEF4|ACCD_ALKCK,AKTSAALE,180
 25 | sp|Q1QY40|ACCD_CHRSD,AKTSAALE,182
 26 | sp|Q9K841|ACCD_HALH5,AKTSAALE,180
 27 | sp|A5F2T5|ACCD_VIBC3,AKTSAALE,180
 28 | sp|Q9KTA3|ACCD_VIBCH,AKTSAALE,180
 29 | sp|Q8DB33|ACCD_VIBVU,AKTSAALE,180
 30 | sp|Q7MIU0|ACCD_VIBVY,AKTSAALE,180
 31 | sp|Q2IWU7|ACPS_RHOP2,DRIFTETE,28
 32 | sp|Q136W1|ACPS_RHOPS,DRIFTETE,28
 33 | sp|P71073|ADER_BACSU,RINGELTE,208
 34 | sp|B3NAM7|AFFL_DROER,GLASPART,1161
 35 | sp|Q9VQI9|AFFL_DROME,GLASPART,1153
 36 | sp|Q29KG4|AFFL_DROPS,GLASPART,1220
 37 | sp|B4MUE1|AFFL_DROWI,GLASPART,1308
 38 | sp|B4NXA8|AFFL_DROYA,GLASPART,1151
 39 | sp|P26818|ARBK2_BOVIN,LEERHEIT,47
 40 | sp|Q5L1V3|ARGB_GEOKA,TIERLIED,198
 41 | sp|Q2NGN7|ARLY_METST,FIKTIVEM,133
 42 | sp|P73997|AROB_SYNY3,ANLERNST,93
 43 | sp|P25550|ASLB_ECOLI,VERLADET,202
 44 | sp|A1R591|ASSY_PAEAT,EDITIERE,287
 45 | sp|B8HGC9|ASSY_PSECP,EDITIERE,287
 46 | sp|P0DJJ2|ASTL_CHICK,STATTEST,29
 47 | sp|Q6CT08|ATG9_KLULA,ANATEVKA,414
 48 | sp|W0TIW1|ATG9_KLUMD,ANATEVKA,415
 49 | sp|Q6LKZ6|ATPB2_PHOPR,ANSTELLE,122
 50 | sp|B6EHG4|ATPB_ALISL,ANSTELLE,121
 51 | sp|B8F774|ATPB_GLAP5,ANSTELLE,118
 52 | sp|Q112Z3|ATPF2_TRIEI,KAKERLAK,65
 53 | sp|Q55EI6|ATX10_DICDI,TESTTEST,268
 54 | sp|A0A385DVS7|AUXCP_BPCA1,LENKTEST,145
 55 | sp|Q9FKV2|BBE23_ARATH,EINLADET,143
 56 | sp|Q84WV2|BGL20_ARATH,EHEFEHDE,527
 57 | sp|P33144|BIMB_EMENI,PENDELND,590
 58 | sp|B5BT18|BTAF1_ARATH,HIESIGER,687
 59 | sp|Q13137|CACO2_HUMAN,ERLEGEND,329
 60 | sp|Q4R914|CACO2_MACFA,ERLEGEND,281
 61 | sp|Q5R7H1|CACO2_PONAB,ERLEGEND,329
 62 | sp|Q7V9U4|CAPP_PROMA,WATTIERT,408
 63 | sp|B7IHG0|CBID_THEAB,EISMASSE,187
 64 | sp|Q640L5|CCD18_MOUSE,EIERLAGE,1164
 65 | sp|P32468|CDC12_YEAST,ENTGEGNE,316
 66 | sp|Q52G60|CEF1_MAGO7,SKANDALS,710
 67 | sp|Q02224|CENPE_HUMAN,NIESELNS,495
 68 | sp|Q1LX78|CFTR_DANRE,ALTSEIME,261
 69 | sp|Q0VF96|CGNL1_HUMAN,LEESEGEL,672
 70 | sp|Q6AW69|CGNL1_MOUSE,LEESEGEL,668
 71 | sp|P12024|CHAO_DROME,KINNLADE,635
 72 | sp|Q22516|CHD3_CAEEL,CRICKETS,331
 73 | sp|Q1L8T5|CING_DANRE,LEERERER,785
 74 | sp|Q9LSX4|CKL8_ARATH,VERPISST,387
 75 | sp|G5ED05|CNNM5_CAEEL,EREKTILE,312
 76 | sp|P0C0L4|CO4A_HUMAN,ERDFALLS,79
 77 | sp|P0C0L5|CO4B_HUMAN,ERDFALLS,79
 78 | sp|Q9TU53|CUBN_CANLF,GEILTEST,929
 79 | sp|O60494|CUBN_HUMAN,GEILTEST,933
 80 | sp|P0C1J2|CWC27_RHIO9,LEIDENER,147
 81 | sp|C1BL82|DDRGK_OSMMO,GEADELTE,188
 82 | sp|B1XSN2|DEF_POLNS,KRAKELIG,162
 83 | sp|P54925|DEGPL_BARHE,SAALETAL,16
 84 | sp|Q1LU74|DER_BAUCH,KRISTALL,362
 85 | sp|O60231|DHX16_HUMAN,WERTERER,164
 86 | sp|Q7YR39|DHX16_PANTR,WERTERER,167
 87 | sp|Q767K6|DHX16_PIG,WERTERER,168
 88 | sp|Q08387|DNLI4_YEAST,ELEKTIVE,710
 89 | sp|Q14185|DOCK1_HUMAN,IMMENSES,661
 90 | sp|Q8BUR4|DOCK1_MOUSE,IMMENSES,661
 91 | sp|Q9BY84|DUS16_HUMAN,GESIMSEN,633
 92 | sp|B3ELV8|END4_CHLPB,ALTAKTIE,43
 93 | sp|Q9SN20|FB200_ARATH,ANEKELNS,375
 94 | sp|Q9LH52|FLOR1_ARATH,ENDKNALL,26
 95 | sp|B8ZUV4|FOLD_MYCLB,DENTALER,105
 96 | sp|O32879|FOLD_MYCLE,DENTALER,105
 97 | sp|Q3JCN1|G6PI_NITOC,GEPINNTE,83
 98 | sp|Q99KY4|GAK_MOUSE,KAPITELL,325
 99 | sp|P97874|GAK_RAT,KAPITELL,325
100 | sp|A0A1D8PNP3|GAP6_CANAL,PLAGIATS,389
101 | sp|A6VBJ8|GATA_PSEA7,ERRANGEN,59
102 | sp|B7V023|GATA_PSEA8,ERRANGEN,59
103 | sp|Q02GV8|GATA_PSEAB,ERRANGEN,59
104 | sp|Q9HVT8|GATA_PSEAE,ERRANGEN,59
105 | sp|Q56198|GLK_STAXY,TAGLILIE,272
106 | sp|Q985F6|GLNE_RHILO,PARAGRAF,664
107 | sp|Q9T0P4|GLTB2_ARATH,PASSIVER,965
108 | sp|A8XLW0|GOSR1_CAEBR,KARSTENS,11
109 | sp|Q95ZW1|GOSR1_CAEEL,KARSTENS,11
110 | sp|P52033|GPXC_DIRIM,FIDELERE,213
111 | sp|A4J6H0|GSA_DESRM,TENDIERT,411
112 | sp|Q9ULI3|HEG1_HUMAN,FIEPSTEN,396
113 | sp|A6WCV0|HIS7_KINRD,TARIERST,4
114 | sp|O17894|HM35_CAEEL,KREISRAT,248
115 | sp|P13545|HMB1_STRPU,GENESEST,262
116 | sp|Q9FN19|HOS15_ARATH,EREIFERE,175
117 | sp|A0LH26|HRCA_SYNFM,LEEREREM,86
118 | sp|Q89A17|HSCB_BUCBP,ELFERLEI,96
119 | sp|Q5PB86|HTPG_ANAMM,GESELLEN,594
120 | sp|Q8SQG8|HYAL2_BOVIN,LISTIGES,313
121 | sp|Q12891|HYAL2_HUMAN,LISTIGES,310
122 | sp|O35632|HYAL2_MOUSE,LISTIGES,310
123 | sp|Q9Z2Q3|HYAL2_RAT,LISTIGES,310
124 | sp|Q8SQG7|HYAL2_SHEEP,LISTIGES,313
125 | sp|Q05A56|HYAL4_MOUSE,LISTIGES,321
126 | sp|Q2G5E7|IF2_NOVAD,KARELIER,739
127 | sp|Q3V3Q4|IFI8_MOUSE,TEILSTIL,188
128 | sp|O28294|ILVC_ARCFU,KALEVALA,161
129 | sp|Q17R60|IMPG1_HUMAN,NETTESTE,42
130 | sp|B7L043|KDSB_METC4,ADLERART,232
131 | sp|A9VZK8|KDSB_METEP,ADLERART,232
132 | sp|B1ZJ23|KDSB_METPB,ADLERART,232
133 | sp|O94806|KPCD3_HUMAN,SEILRISS,463
134 | sp|O77384|LRR4_PLAF7,ENDENDEN,3332
135 | sp|O60732|MAGC1_HUMAN,GEPRELLT,1038
136 | sp|Q9UBF1|MAGC2_HUMAN,GEPRELLT,268
137 | sp|C0NF00|MDM12_AJECG,TIPPELEI,52
138 | sp|A6QYC8|MDM12_AJECN,TIPPELEI,52
139 | sp|C5GK63|MDM12_AJEDR,TIPPELEI,52
140 | sp|A1CNY1|MDM12_ASPCL,TIPPELEI,52
141 | sp|B0XN24|MDM12_ASPFC,TIPPELEI,52
142 | sp|Q4WRX2|MDM12_ASPFU,TIPPELEI,52
143 | sp|A2QAU8|MDM12_ASPNC,TIPPELEI,52
144 | sp|C5K0S2|MDM12_BLAGS,TIPPELEI,52
145 | sp|A1D1T8|MDM12_NEOFI,TIPPELEI,52
146 | sp|C1H3V1|MDM12_PARBA,TIPPELEI,52
147 | sp|C1GHQ8|MDM12_PARBD,TIPPELEI,52
148 | sp|C0SE33|MDM12_PARBP,TIPPELEI,52
149 | sp|Q6CI13|MDM12_YARLI,TIPPELEI,51
150 | sp|Q4PFA7|MDM34_USTMA,TIPPELEI,48
151 | sp|Q71YZ2|MEND_LISMF,ALPENSEE,138
152 | sp|Q8Y6K9|MEND_LISMO,ALPENSEE,138
153 | sp|G5EBL2|MES1_CAEEL,VIGILIEN,732
154 | sp|Q5HYA8|MKS3_HUMAN,VERDINGT,136
155 | sp|P40850|MKT1_YEAST,FITTINGS,530
156 | sp|G0SA56|MLP1_CHATD,KRAKELEE,1610
157 | sp|P28810|MMSA_PSEAE,AIRLINES,399
158 | sp|A1KCP8|MNME_AZOSB,ERIGIERT,286
159 | sp|Q1LH94|MNME_CUPMC,ERIGIERT,287
160 | sp|Q0KFG6|MNME_CUPNH,ERIGIERT,287
161 | sp|Q46VM0|MNME_CUPPJ,ERIGIERT,287
162 | sp|A4GAN2|MNME_HERAR,ERIGIERT,293
163 | sp|A6T4D6|MNME_JANMA,ERIGIERT,292
164 | sp|C1D6H7|MNME_LARHH,ERIGIERT,284
165 | sp|Q8Y3H5|MNME_RALSO,ERIGIERT,297
166 | sp|O35024|MNTC_BACSU,MELANIES,381
167 | sp|P48563|MON2_YEAST,SPLITTEN,398
168 | sp|Q12317|MSB4_YEAST,VERKEILT,51
169 | sp|O74502|MSH6_SCHPO,NERVEREI,594
170 | sp|Q8WXI7|MUC16_HUMAN,SPEISTET,8762
171 | sp|Q89DE6|MUTL_BRADU,ALTKANAL,243
172 | sp|B3Q7Y9|MUTL_RHOPT,ALTKANAL,243
173 | sp|B8FJL5|MUTS_DESAL,DRINGEND,539
174 | sp|Q0AEI7|MUTS_NITEC,ERYSIPEL,517
175 | sp|Q56215|MUTS_THEAQ,LEERERER,425
176 | sp|Q63358|MYO9B_RAT,SCREENST,1161
177 | sp|Q99PD7|NCKX3_MOUSE,NENNENDE,417
178 | sp|P39864|NIA_PHYIN,PISSENDE,693
179 | sp|Q6IR61|NIT2A_XENLA,GESTELLS,63
180 | sp|Q6INI7|NIT2B_XENLA,GESTELLS,63
181 | sp|Q12080|NOP53_YEAST,SEETEILS,356
182 | sp|A0A455M2Y3|NTNH_NECSZ,LERNTIPP,369
183 | sp|Q9YDY8|NTPTH_AERPE,REALTEIL,139
184 | sp|Q89JL7|OADC_BRADU,KALKGLAS,256
185 | sp|B7J427|OBG_ACIF2,ERLENWEG,355
186 | sp|B5ELU2|OBG_ACIF5,ERLENWEG,355
187 | sp|Q01323|OTC_NEIFL,GRILLTEN,157
188 | sp|O86408|OTC_NEIPH,GRILLTEN,142
189 | sp|O86415|OTC_NEISU,GRILLTEN,142
190 | sp|P06108|P49_STRLI,STEIGAAL,311
191 | sp|D4N4Z9|PCHTP_TRISP,GEREICHE,101
192 | sp|Q9NJ15|PCSK5_BRACL,SCHRECKT,1335
193 | sp|B0G101|PKS8_DICDI,EISHAIEN,2083
194 | sp|B0S1M8|PLSX_FINM2,ETIENNES,60
195 | sp|P0A4K4|PMRA_STRPN,NATALIAS,119
196 | sp|P0A4K5|PMRA_STRR6,NATALIAS,119
197 | sp|Q8PGR7|PUR4_XANAC,VERHALTE,1000
198 | sp|Q8PCQ7|PUR4_XANCP,VERHALTE,1000
199 | sp|O67775|PUR9_AQUAE,GETAKELT,311
200 | sp|Q72LY0|PURT_LEPIC,DELIKATE,129
201 | sp|Q8EYF0|PURT_LEPIN,DELIKATE,129
202 | sp|A6SX69|RDGC_JANMA,SPRENGEL,46
203 | sp|A9WR65|RISB_RENSM,SAALETAL,149
204 | sp|A2BT57|RLMN_PROMS,KERNLAND,70
205 | sp|Q9GYH7|RME6_CAEEL,ANPRALLS,1050
206 | sp|Q9D304|RN128_MOUSE,LASERGAS,134
207 | sp|Q3ACX1|RNY_CARHZ,VEREHRER,73
208 | sp|P70335|ROCK1_MOUSE,KNISTERT,990
209 | sp|Q63644|ROCK1_RAT,KNISTERT,990
210 | sp|Q5ZKQ3|RPAP3_CHICK,SELTENER,264
211 | sp|Q04EL5|RPOZ_OENOB,KRAKELEE,27
212 | sp|Q6B8N4|RR4_GRATL,NAIVERES,177
213 | sp|B9W9A9|RRT5_CANDC,ENTGEGEN,259
214 | sp|A8LM44|RS7_DINSH,MEERLAGE,116
215 | sp|Q28UW9|RS7_JANSC,MEERLAGE,116
216 | sp|A1B022|RS7_PARDP,MEERLAGE,116
217 | sp|P59061|RS7_RHOCA,MEERLAGE,116
218 | sp|Q160Y2|RS7_ROSDO,MEERLAGE,116
219 | sp|Q5LMR3|RS7_RUEPO,MEERLAGE,116
220 | sp|Q1GK43|RS7_RUEST,MEERLAGE,116
221 | sp|Q12XH7|RSMA_METBU,VAKANTEN,141
222 | sp|P17863|SKI_AVIES,ANEKELST,385
223 | sp|P49140|SKI_CHICK,ANEKELST,406
224 | sp|Q96Q15|SMG1_HUMAN,PRIESEST,1582
225 | sp|Q8BKX6|SMG1_MOUSE,PRIESEST,1580
226 | sp|Q4P9E5|SPB4_USTMA,DREIFELS,522
227 | sp|P17123|SPO12_YEAST,GEDANKEN,53
228 | sp|Q6G2Z4|SYA_BARHE,EKELNDER,753
229 | sp|A5GPF9|SYA_SYNPW,RIESLING,621
230 | sp|A7IAG1|SYE_METB6,AALSPEER,60
231 | sp|Q9V0V2|SYR_PYRAB,KELTERER,536
232 | sp|Q0VSA8|SYV_ALCBS,VERKLAGE,90
233 | sp|Q9R099|TBL2_MOUSE,KRASSEST,412
234 | sp|Q8N3R3|TCAIM_HUMAN,VEREINEN,53
235 | sp|A0A1I4KS07|TCPO_METOL,HEILERIN,330
236 | sp|Q3J7C1|THIC_NITOC,RADTEILS,95
237 | sp|Q54T85|TRA1_DICDI,SKELETTS,4057
238 | sp|Q0I3P3|TRUB_HISS1,AGITIERE,142
239 | sp|B0UU15|TRUB_HISS2,AGITIERE,142
240 | sp|Q8TKS3|UVRB_METAC,DIKTIERT,498
241 | sp|Q8PRZ9|UVRB_METMA,DIKTIERT,498
242 | sp|Q97CP8|VATD_THEVO,VALERIAN,73
243 | sp|J3S836|VCO3_CROAD,DENKREDE,727
244 | sp|A4UGR9|XIRP2_HUMAN,STARRTEN,2953
245 | sp|Q71LX6|XIRP2_RAT,ALTVATER,2393
246 | sp|Q6AWX0|XYLL2_ARATH,REPLIKEN,29
247 | sp|B4EUS1|Y339_PROMH,EREILENS,139
248 | sp|Q3K6A2|Y4965_PSEPF,INSELRAT,77
249 | sp|C3K2M0|Y5418_PSEFS,INSELRAT,77
250 | sp|P31489|YADA1_YEREN,GLASSAAL,385
251 | sp|P0C2W0|YADA2_YEREN,GLASSAAL,352
252 | sp|A1JUB7|YADA_YERE8,GLASSAAL,352
253 | sp|P10858|YADA_YERPS,GLASSAAL,362
254 | sp|Q9CGY1|YJIE_LACLA,FASSTEST,159
255 | sp|Q2UBI2|YME2_ASPOR,SANDLAND,233
256 | 


--------------------------------------------------------------------------------
/uniprot_words/data/word_matches_dk.csv:
--------------------------------------------------------------------------------
 1 | .id,Keyword,Offset
 2 | sp|P06105|SC160_YEAST,ANSVARLIG,643
 3 | sp|C3PNE7|SYL_RICAE,SELEKTERE,287
 4 | sp|A1JIX1|TRUB_YERE8,EGALISERE,92
 5 | sp|O34273|TRUB_YEREN,EGALISERE,92
 6 | sp|A7FMS0|TRUB_YERP3,EGALISERE,92
 7 | sp|Q1CC09|TRUB_YERPA,EGALISERE,92
 8 | sp|Q8ZBC4|TRUB_YERPE,EGALISERE,92
 9 | sp|Q1CEL5|TRUB_YERPN,EGALISERE,92
10 | sp|A4TRI1|TRUB_YERPP,EGALISERE,92
11 | sp|Q66F58|TRUB_YERPS,EGALISERE,92
12 | sp|Q60312|Y002_METJA,LYSSKYHED,92
13 | sp|B8GD12|ARLY_CHLAD,ALGIERER,205
14 | sp|Q6BTX0|ATG2_DEBHA,KANTNING,583
15 | sp|Q112Z3|ATPF2_TRIEI,KAKERLAK,65
16 | sp|Q5E769|BAMB_ALIF1,LAKPLADE,144
17 | sp|Q12D73|BIOB2_POLSJ,VAGTPLAN,247
18 | sp|B9MJH4|BIOB_ACIET,VAGTPLAN,252
19 | sp|Q8VCR2|DHB13_MOUSE,VARSLING,242
20 | sp|Q9VNJ5|DISP_DROME,VELANSET,1160
21 | sp|Q5NPS6|DNAK_ZYMMO,ISRAELER,295
22 | sp|Q985F6|GLNE_RHILO,PARAGRAF,664
23 | sp|Q4URM1|GLO2_XANC8,VANDGRAV,18
24 | sp|B0RTE9|GLO2_XANCB,VANDGRAV,18
25 | sp|Q8PBY0|GLO2_XANCP,VANDGRAV,18
26 | sp|Q05584|GLO2_YEAST,GENKALDE,206
27 | sp|P14750|HCYA_APHSP,FIREDELE,411
28 | sp|Q58CP0|IDH3G_BOVIN,LIVRENTE,163
29 | sp|P51553|IDH3G_HUMAN,LIVRENTE,164
30 | sp|P41564|IDH3G_MACFA,LIVRENTE,126
31 | sp|P70404|IDHG1_MOUSE,LIVRENTE,164
32 | sp|P41565|IDHG1_RAT,LIVRENTE,164
33 | sp|Q3SKX1|IF2_THIDA,KANTNING,569
34 | sp|Q27564|KITH_DICDI,AFSKRIVE,155
35 | sp|P50455|LEU3_SULTO,LIVRENTE,114
36 | sp|A2SZS3|L_RVFV,SEKSTANT,1175
37 | sp|P27316|L_RVFVZ,SEKSTANT,1175
38 | sp|C5NZL6|MEP8_COCP7,AFHANDLE,151
39 | sp|P40850|MKT1_YEAST,FITTINGS,530
40 | sp|Q80XB4|NRAP_MOUSE,KASSEVIS,1051
41 | sp|B1ZRS0|NUON1_OPITP,VALGSLAG,398
42 | sp|P13909|PAI1_BOVIN,SALTSILD,271
43 | sp|P79335|PAI1_PIG,SALTSILD,271
44 | sp|P24004|PEX1_YEAST,GENSIDIG,1030
45 | sp|Q13YI7|PHNW1_PARXL,HALVLANG,85
46 | sp|A0A8J9RIY3|PHP21_PHOLO,TALELYST,400
47 | sp|A0A142I735|PHP22_PHOLO,TALELYST,400
48 | sp|Q42556|PMA9_ARATH,STYRELSE,903
49 | sp|A9BDD9|PYRG_PROM4,HACIENDA,315
50 | sp|Q17XC9|RIBA_HELAH,MALERISK,74
51 | sp|B6JM32|RIBA_HELP2,MALERISK,74
52 | sp|Q1CT68|RIBA_HELPH,MALERISK,74
53 | sp|Q9ZL42|RIBA_HELPJ,MALERISK,74
54 | sp|O08315|RIBA_HELPY,MALERISK,74
55 | sp|P52822|RL5A_SCHPO,PEGEFELT,125
56 | sp|O74306|RL5B_SCHPO,PEGEFELT,125
57 | sp|P67284|RNY_STRP1,LIVSALIG,7
58 | sp|P0DF20|RNY_STRP3,LIVSALIG,7
59 | sp|Q5XAP0|RNY_STRP6,LIVSALIG,7
60 | sp|Q8P000|RNY_STRP8,LIVSALIG,7
61 | sp|Q1JAJ3|RNY_STRPB,LIVSALIG,7
62 | sp|Q1JKP5|RNY_STRPC,LIVSALIG,7
63 | sp|Q1JFN6|RNY_STRPD,LIVSALIG,7
64 | sp|Q1J5I5|RNY_STRPF,LIVSALIG,7
65 | sp|A2RD66|RNY_STRPG,LIVSALIG,7
66 | sp|Q48S17|RNY_STRPM,LIVSALIG,7
67 | sp|P0DF21|RNY_STRPQ,LIVSALIG,7
68 | sp|Q5UZR5|RPO1C_HALMA,RETLINET,289
69 | sp|Q21M92|RPOC_SACD2,RIDETIME,551
70 | sp|B1MGA0|RS4_MYCA9,VARETAGE,152
71 | sp|A5GPF9|SYA_SYNPW,RIESLING,621
72 | sp|Q5P7Y0|SYFB_AROAE,SLAGVARE,170
73 | sp|Q4R7U0|TMC7_MACFA,PLAYLIST,265
74 | sp|Q8C428|TMC7_MOUSE,PLAYLIST,263
75 | sp|Q9Y2B5|VP9D1_HUMAN,SAMKLANG,15
76 | 


--------------------------------------------------------------------------------
/uniprot_words/data/word_matches_en.csv:
--------------------------------------------------------------------------------
  1 | .id,Keyword,Offset
  2 | sp|Q2TAC2|CCD57_HUMAN,SLAVERERS,410
  3 | sp|B7ZRM8|EVI1B_XENLA,NIDERINGS,861
  4 | sp|A9BDD9|PYRG_PROM4,HACIENDAS,315
  5 | sp|P17284|VIF_SIVCZ,ALKALISER,150
  6 | sp|A8AY34|ADDB_STRGC,SLYNESSES,820
  7 | sp|Q96LP6|CL042_HUMAN,TARGETEER,145
  8 | sp|Q8L5Z1|GDL17_ARATH,TETANILLA,341
  9 | sp|Q8TV85|METK_METKA,DELLENITE,376
 10 | sp|P25202|RPC1_GIAIN,CHAPSTICK,163
 11 | sp|P03700|VINT_LAMBD,HIDALGISM,247
 12 | sp|Q5FTU6|2KGR_GLUOX,SHARPEST,172
 13 | sp|Q54ET6|ABPF_DICDI,KNELLING,911
 14 | sp|P31562|ACCD_CUSRE,SKINKING,98
 15 | sp|A9L9A5|ACCD_LEMMI,FINELESS,110
 16 | sp|Q1QPW6|ACKA_NITHX,RELEASED,284
 17 | sp|Q5FJW9|ADDB_LACAC,FLINKITE,467
 18 | sp|Q38X70|ADDB_LATSS,ALTERING,469
 19 | sp|Q8TF27|AGA11_HUMAN,STREEKER,422
 20 | sp|Q9UPQ3|AGAP1_HUMAN,STREEKER,703
 21 | sp|Q8BXK8|AGAP1_MOUSE,STREEKER,703
 22 | sp|Q96P64|AGAP4_HUMAN,STREEKER,535
 23 | sp|A6NIR3|AGAP5_HUMAN,STREEKER,558
 24 | sp|Q5VW22|AGAP6_HUMAN,STREEKER,535
 25 | sp|Q5VUJ5|AGAP7_HUMAN,STREEKER,535
 26 | sp|Q4P2W6|ALG10_USTMA,MALACTIC,225
 27 | sp|P12726|ALT_BPT4,ASCIDIAN,667
 28 | sp|Q84NP7|AMPD_ORYSJ,SAFETIED,118
 29 | sp|P91885|AMPN_MANSE,AGERASIA,881
 30 | sp|Q5ZXN6|ANKX_LEGPH,PAHLAVIS,395
 31 | sp|Q6LTE9|APT_PHOPR,PREVISES,83
 32 | sp|Q1AS30|ARGB_RUBXD,GALAGALA,231
 33 | sp|A6V1N6|ARNF_PSEA7,LAVALAVA,48
 34 | sp|B7VBM8|ARNF_PSEA8,LAVALAVA,48
 35 | sp|Q9HY59|ARNF_PSEAE,LAVALAVA,48
 36 | sp|B4SMU6|AROE_STRM5,FLAGLEAF,44
 37 | sp|B2FMH7|AROE_STRMK,FLAGLEAF,44
 38 | sp|Q493B0|AROQ_BLOPB,FRILLING,5
 39 | sp|B6YR08|ATPF_AZOPC,LINELIKE,153
 40 | sp|Q5A4W8|BDF1_CANAL,ASSESSEE,724
 41 | sp|B1KPJ7|BIOB_SHEWM,KATAKANA,332
 42 | sp|O52587|BIOD_MYCBO,SALARIAT,184
 43 | sp|A1KJ01|BIOD_MYCBP,SALARIAT,184
 44 | sp|C1ANK7|BIOD_MYCBT,SALARIAT,184
 45 | sp|P9WPQ4|BIOD_MYCTO,SALARIAT,184
 46 | sp|Q4I7N9|BRE1_GIBZE,ARDELLAE,374
 47 | sp|Q7S304|BRE1_NEUCR,ARDELLAE,393
 48 | sp|P32333|BTAF1_YEAST,KILTLIKE,1005
 49 | sp|Q02294|CAC1B_RAT,PRETENSE,971
 50 | sp|Q8LBH2|CAP8_ARATH,DISASTER,436
 51 | sp|Q86UW7|CAPS2_HUMAN,ASPARKLE,831
 52 | sp|O27158|CAS3_METTH,TRAILERY,645
 53 | sp|A4FXY2|CCA_METM5,SINKLIKE,326
 54 | sp|Q2TAC2|CCD57_HUMAN,SLAVERER,410
 55 | sp|B0CEZ1|CH602_ACAM1,LIGATIVE,437
 56 | sp|Q93G07|CH60_LACAC,DAIKERED,338
 57 | sp|Q22516|CHD3_CAEEL,CRICKETS,331
 58 | sp|Q94F88|CMT3_ARATH,KETIPATE,199
 59 | sp|B2AG52|COAX_CUPTR,GALAGALA,254
 60 | sp|C9K1X7|COTB4_STRMJ,HALLMARK,32
 61 | sp|Q5TZA2|CROCC_HUMAN,REVERSAL,1862
 62 | sp|G0HV85|CSG1_HALHT,ADENITIS,216
 63 | sp|G0HV86|CSG2_HALHT,ADENITIS,217
 64 | sp|B2RX88|CSPP1_MOUSE,ARRANGER,254
 65 | sp|A6WJU3|CYSD_SHEB8,FELLAHIN,85
 66 | sp|Q7ZV84|DAAF1_DANRE,SHILPITS,464
 67 | sp|A0A0H3M776|DARG_MYCBP,GRAVILEA,167
 68 | sp|O53605|DARG_MYCTU,GRAVILEA,167
 69 | sp|P05385|DBH_CLOPA,ALKALIES,24
 70 | sp|Q98KB6|DDLB_RHILO,CADALENE,27
 71 | sp|B1AJ22|DER_UREP2,FAINEANT,76
 72 | sp|Q9PQA7|DER_UREPA,FAINEANT,76
 73 | sp|B5ZBM9|DER_UREU1,FAINEANT,76
 74 | sp|Q9CCG2|DNAG_MYCLE,DRIGHTIN,92
 75 | sp|Q04503|DP87_DICDI,ASSESSES,543
 76 | sp|Q05FI2|EFG_CARRP,KITLINGS,554
 77 | sp|A5D5I8|EFTU2_PELTS,KIDNAPEE,49
 78 | sp|P14895|ELI5_HORVU,PAPERING,127
 79 | sp|P14896|ELI6_HORVU,PAPERING,64
 80 | sp|P14897|ELI9_HORVU,PAPERING,69
 81 | sp|P93735|ELIP1_ARATH,PAPERING,95
 82 | sp|Q94K66|ELIP2_ARATH,PAPERING,93
 83 | sp|P11432|ELI_PEA,PAPERING,96
 84 | sp|B7ZRM8|EVI1B_XENLA,NIDERING,861
 85 | sp|Q19262|EXOC3_CAEEL,ARDELLAE,152
 86 | sp|F1P065|FARP1_CHICK,ASTRAEID,840
 87 | sp|Q9LPH0|FB57_ARATH,PISSANTS,351
 88 | sp|Q9LXQ4|FBL50_ARATH,SKIPPETS,155
 89 | sp|A0A7L8UWS6|FFSC_ASPFV,PARTLESS,3
 90 | sp|Q91740|FINC_XENLA,PREVISES,1128
 91 | sp|C4ZBG8|FTHS_AGARV,MAILCLAD,203
 92 | sp|A3PM52|FTHS_CERS1,MAILCLAD,205
 93 | sp|Q3J047|FTHS_CERS4,MAILCLAD,205
 94 | sp|B9KLK4|FTHS_CERSK,MAILCLAD,205
 95 | sp|P0ABH3|FTSA_SHIFL,DANGLING,421
 96 | sp|Q9CD58|FTSH_MYCLE,GANGSHAG,680
 97 | sp|Q59W62|GIN4_CANAL,SIDELANG,1098
 98 | sp|A8ANL5|GLAH_CITK8,GALLINAE,93
 99 | sp|Q5Z175|GLPK_NOCFA,GLISSADE,317
100 | sp|Q03877|GP85_TRYCR,ATLANTES,538
101 | sp|O14357|GPI1_SCHPO,VILLAINY,252
102 | sp|Q08726|GPN2_YEAST,REGALIAN,279
103 | sp|Q4P3F1|HCS1_USTMA,FAIRINGS,379
104 | sp|A0A0E3NEE1|HDRD_METTT,FLAGLIKE,273
105 | sp|B9M416|HEM3_GEODF,TRINKLET,153
106 | sp|Q9VR91|HERC2_DROME,HAIRLESS,1063
107 | sp|P49007|HEXB_PSEO7,FAITHING,91
108 | sp|O88850|HIPK3_RAT,PALSTAVE,965
109 | sp|Q4A048|HIS4_STAS1,KINGWEED,138
110 | sp|Q5A1W9|HST3_CANAL,PASSINGS,453
111 | sp|P0A4M4|HST_VIBMI,DANGLING,18
112 | sp|Q5WJE6|HTPG_ALKCK,GESNERIA,438
113 | sp|A9H863|HUTH_GLUDA,SHREDDER,271
114 | sp|A1JSW6|HUTH_YERE8,STRIATED,62
115 | sp|Q8ZA10|HUTH_YERPE,STRIATED,65
116 | sp|Q664B8|HUTH_YERPS,STRIATED,65
117 | sp|C8VJW0|HXNR_EMENI,GAPPIEST,339
118 | sp|Q53479|IDSA_METTM,REVEALED,194
119 | sp|Q8TVE5|IF2G_METKA,LEVELLER,330
120 | sp|A4JDX1|IF2_BURVG,PAGATPAT,313
121 | sp|Q5AIA4|IML1_CANAL,PALMIPES,1051
122 | sp|P40559|INP51_YEAST,ELEGISED,261
123 | sp|Q6P4Y6|IRS1_XENTR,FRAPPEED,568
124 | sp|P63394|IRTB_MYCBO,PALESTRA,295
125 | sp|P9WQJ6|IRTB_MYCTO,PALESTRA,295
126 | sp|P9WQJ7|IRTB_MYCTU,PALESTRA,295
127 | sp|Q8GYU3|IYO_ARATH,FLAGLESS,1181
128 | sp|A1L317|K1C24_MOUSE,CLEADING,242
129 | sp|C7GWZ2|KEX1_YEAS2,SWADDLES,674
130 | sp|C8Z852|KEX1_YEAS8,SWADDLES,674
131 | sp|E7NHF8|KEX1_YEASO,SWADDLES,675
132 | sp|P09620|KEX1_YEAST,SWADDLES,666
133 | sp|Q6PAR0|KLD10_MOUSE,AMALINGS,208
134 | sp|Q5U3Y0|KLD10_RAT,AMALINGS,179
135 | sp|P42215|KPSU1_ECOLX,VIVIPARY,5
136 | sp|P42216|KPSU5_ECOLX,VIVIPARY,5
137 | sp|Q8ZY35|KTHY_PYRAE,LIKEWALK,42
138 | sp|Q12729|LAC1_PLEOS,FELTINGS,367
139 | sp|Q5P089|LEPA_AROAE,DRILLMAN,221
140 | sp|Q2KIB6|LIN7B_BOVIN,RAVELLER,16
141 | sp|Q9HAP6|LIN7B_HUMAN,RAVELLER,16
142 | sp|O88951|LIN7B_MOUSE,RAVELLER,16
143 | sp|Q9Z252|LIN7B_RAT,RAVELLER,16
144 | sp|Q8IVB5|LIX1L_HUMAN,RELASTER,293
145 | sp|Q8BQ89|LIX1L_MOUSE,RELASTER,293
146 | sp|Q5PQQ7|LIX1L_RAT,RELASTER,294
147 | sp|B3P851|LST2_DROER,DEEDLESS,449
148 | sp|B4PRU6|LST2_DROYA,DEEDLESS,449
149 | sp|Q8X5R8|MDTO_ECO57,RALLIERS,570
150 | sp|Q8FAX2|MDTO_ECOL6,RALLIERS,570
151 | sp|P32715|MDTO_ECOLI,RALLIERS,570
152 | sp|Q83IQ8|MDTO_SHIFL,RALLIERS,567
153 | sp|Q7Q6D9|MED24_ANOGA,TASSELED,862
154 | sp|Q07V68|MIAB_RHOP5,PELLEKAR,117
155 | sp|Q22227|MIG5_CAEEL,PLASMASE,319
156 | sp|P40850|MKT1_YEAST,FITTINGS,530
157 | sp|P28810|MMSA_PSEAE,AIRLINES,399
158 | sp|Q6NHQ7|MNMA_CORDI,ALLERGIA,109
159 | sp|Q8FQ01|MNMA_COREF,ALLERGIA,109
160 | sp|A4QDK1|MNMA_CORGB,ALLERGIA,109
161 | sp|Q8NR24|MNMA_CORGL,ALLERGIA,109
162 | sp|A8M5E1|MNMA_SALAI,PADPIECE,295
163 | sp|Q88RX6|MNMG_LACPL,GLIDDERY,464
164 | sp|P48563|MON2_YEAST,SPLITTEN,398
165 | sp|G5E8K6|MOT6_MOUSE,AVAILING,310
166 | sp|Q96J65|MRP9_HUMAN,DEVILLED,669
167 | sp|Q80WJ6|MRP9_MOUSE,DEVILLED,670
168 | sp|Q6Y306|MRP9_RAT,DEVILLED,670
169 | sp|Q09816|MTAP_SCHPO,REDIPPED,98
170 | sp|Q1MAC8|MTGA_RHIL3,DIAPERED,2
171 | sp|Q6NTN5|MTMRD_XENLA,PASSLESS,1216
172 | sp|Q6Z7K5|MTP3_ORYSJ,FLAGGILY,153
173 | sp|A8GQA9|MUTL_RICAH,VERRIERE,404
174 | sp|P61666|MUTS_DESVH,REASPIRE,319
175 | sp|Q10YG4|MUTS_TRIEI,LETTERER,500
176 | sp|Q875Q8|MYO2_LACK1,PILEATED,67
177 | sp|Q876G9|MYO2_SACU7,PILEATED,67
178 | sp|P19524|MYO2_YEAST,PILEATED,67
179 | sp|C3VEQ3|NCED_ONCHC,RIPPLING,155
180 | sp|Q5QGS0|NEXMI_HUMAN,GENTLING,15
181 | sp|Q2RGI2|NNR_MOOTA,RADIALLY,119
182 | sp|Q5KZL2|NORM_GEOKA,LAVALAVA,97
183 | sp|Q5L6C0|NQRB_CHLAB,PALSGRAF,196
184 | sp|Q823P2|NQRB_CHLCV,PALSGRAF,196
185 | sp|Q253X4|NQRB_CHLFF,PALSGRAF,196
186 | sp|Q9Z8B6|NQRB_CHLPN,PALSGRAF,196
187 | sp|Q15YQ5|NQRB_PSEA6,PALSGRAF,199
188 | sp|P0C6Z2|NSP6_ROTBU,WISPLIKE,89
189 | sp|B3SRR8|NSP6_ROTH7,WISPLIKE,89
190 | sp|Q9E8F1|NSP6_ROTRF,WISPLIKE,89
191 | sp|P0C712|NSP6_ROTW3,WISPLIKE,89
192 | sp|A8FKG9|NUSB_CAMJ8,LAKELAND,107
193 | sp|A7H4Z5|NUSB_CAMJD,LAKELAND,107
194 | sp|Q9PIC0|NUSB_CAMJE,LAKELAND,107
195 | sp|A1VYA2|NUSB_CAMJJ,LAKELAND,107
196 | sp|Q5HW85|NUSB_CAMJR,LAKELAND,107
197 | sp|A2VDP6|NXPE3_BOVIN,GRISETTE,217
198 | sp|Q969Y0|NXPE3_HUMAN,GRISETTE,217
199 | sp|Q5RCA5|NXPE3_PONAB,GRISETTE,217
200 | sp|C1DEA4|OBG_AZOVD,PALTERER,271
201 | sp|A6VBV3|OBG_PSEA7,PALTERER,271
202 | sp|B7V0A9|OBG_PSEA8,PALTERER,271
203 | sp|Q02GB1|OBG_PSEAB,PALTERER,271
204 | sp|Q9HVL8|OBG_PSEAE,PALTERER,271
205 | sp|Q9SA38|OCT3_ARATH,SLEETIER,24
206 | sp|Q57483|OM26_HAEIN,TALALGIA,8
207 | sp|Q7VNN8|ORN_HAEDU,TANGLIER,68
208 | sp|G4N285|OXR1_MAGO7,GRASSILY,385
209 | sp|P54893|P5CR_THET2,IMAGISTS,89
210 | sp|Q9RZV8|PARB3_DEIRA,ANALGIAS,135
211 | sp|P32854|PEP12_YEAST,LEASEMAN,148
212 | sp|Q6QNF3|PGFRB_CANLF,RATLINES,788
213 | sp|P09619|PGFRB_HUMAN,RATLINES,788
214 | sp|P9WPG2|PGSA_MYCTO,AGRARIAN,21
215 | sp|P9WPG3|PGSA_MYCTU,AGRARIAN,21
216 | sp|Q5NL86|PLSX_ZYMMO,LAPACTIC,58
217 | sp|B8D9F8|PNP_BUCA5,SAVAGISM,459
218 | sp|P57454|PNP_BUCAI,SAVAGISM,459
219 | sp|Q8K9H5|PNP_BUCAP,SAVAGISM,459
220 | sp|B8D7R0|PNP_BUCAT,SAVAGISM,459
221 | sp|B3R3W3|PNP_CUPTR,SPINAGES,703
222 | sp|Q2FUB2|POK_METHJ,PRECARIA,124
223 | sp|Q65730|POLG_BSTV1,INVIRILE,2367
224 | sp|Q65729|POLG_BSTVG,INVIRILE,200
225 | sp|O92529|POLG_HCVT5,FLATTING,1074
226 | sp|Q03ZQ0|POTA_LEUMM,DREIDELS,133
227 | sp|P23287|PP2B1_YEAST,ASSAILED,444
228 | sp|P54882|PPX1_MYCLE,LEGISTER,258
229 | sp|Q9LJX4|PUM5_ARATH,THREEPED,167
230 | sp|B8DTV0|PUR7_BIFA0,GRILLADE,186
231 | sp|A9BDD9|PYRG_PROM4,HACIENDA,315
232 | sp|Q02099|RAD3_SCHPO,AVENTAIL,630
233 | sp|Q3AB99|RBFA_CARHZ,AGENESES,118
234 | sp|C1CWJ1|RF1_DEIDV,LARDERER,344
235 | sp|O28190|RFHPS_ARCFU,SNAGGIER,277
236 | sp|Q9U6Y8|RFP_DISSP,EASTERLY,144
237 | sp|A4WW77|RIMP_CERS5,DECADIST,56
238 | sp|P23408|RK22_PEA,SAGANASH,136
239 | sp|Q6MRX8|RL10_MYCMS,KAMAAINA,152
240 | sp|Q49ZE1|RL17_STAS1,SERVETTE,27
241 | sp|B5EHX2|RL25_CITBB,PIGTAILS,166
242 | sp|C6E500|RL25_GEOSM,PIGTAILS,166
243 | sp|Q9V1V6|RL30_PYRAB,KINDLIER,146
244 | sp|B1KHY7|RL9_SHEWM,LAVATERA,62
245 | sp|A8LLC1|RNH_DINSH,GALLIARD,22
246 | sp|A9AXK0|RNPA_HERA2,TAVERNRY,52
247 | sp|O27438|RPA_METTH,PREFERED,434
248 | sp|Q8KWX2|RPOB_EHRCR,MARKLAND,297
249 | sp|Q8EM52|RPOE_OCEIH,DELEADED,155
250 | sp|B4M416|RRF2M_DROVI,SINKLESS,218
251 | sp|O66928|RRF_AQUAE,ELEGISED,143
252 | sp|A2C451|RS16_PROM1,DATASETS,102
253 | sp|Q3A9S2|RS3_CARHZ,RIVIERAS,54
254 | sp|Q92QG4|RS3_RHIME,SERRATES,215
255 | sp|C3MAY6|RS3_SINFN,SERRATES,215
256 | sp|A6U865|RS3_SINMW,SERRATES,215
257 | sp|P54024|RS9_METJA,PILLAGEE,49
258 | sp|Q86VD7|S2542_HUMAN,GALAGALA,40
259 | sp|Q8R0Y8|S2542_MOUSE,GALAGALA,40
260 | sp|P0C546|S2542_RAT,GALAGALA,40
261 | sp|Q5F468|S38A2_CHICK,ADENITIS,266
262 | sp|Q96FL8|S47A1_HUMAN,REELRALL,32
263 | sp|Q5RFD2|S47A1_PONAB,REELRALL,32
264 | sp|A7KAU2|S47A1_RABIT,REELRALL,31
265 | sp|L0HB77|SBHS7_THYVU,TASSELER,499
266 | sp|Q9UQD0|SCN8A_HUMAN,SHREDDED,43
267 | sp|Q9WTU3|SCN8A_MOUSE,SHREDDED,43
268 | sp|O88420|SCN8A_RAT,SHREDDED,43
269 | sp|A1D3V8|SDS23_NEOFI,REVISING,276
270 | sp|A1DLN3|SEC16_NEOFI,DEPRAVED,112
271 | sp|Q6AJK1|SECA_DESPS,TRINDLES,26
272 | sp|C7NC37|SECD_LEPBD,DIALLING,261
273 | sp|Q9D7Y9|SLX4I_MOUSE,KEELHALE,353
274 | sp|Q8MNV7|SMAL1_CAEEL,GRILLADE,216
275 | sp|Q6IUP1|SOLH1_MOUSE,RESELLER,178
276 | sp|P17123|SPO12_YEAST,GEDANKEN,53
277 | sp|P0C586|SSY23_ORYSI,AVERAGED,79
278 | sp|Q0DDE3|SSY23_ORYSJ,AVERAGED,79
279 | sp|A5GPF9|SYA_SYNPW,RIESLING,621
280 | sp|B3CN54|SYC_WOLPP,HEMATEIN,362
281 | sp|A3PHK2|SYE1_CERS1,PELLEKAR,364
282 | sp|Q9ZFA3|SYE1_CERS4,PELLEKAR,364
283 | sp|A4WX62|SYE2_CERS5,PELLEKAR,364
284 | sp|Q8RHB5|SYFB_FUSNN,DIKESIDE,13
285 | sp|B3QS95|SYH_CHLT3,AGAPHITE,219
286 | sp|A7HM68|SYK_FERNB,GRIMSIRE,58
287 | sp|Q8WXH0|SYNE2_HUMAN,MISSPEAK,3319
288 | sp|A1TYU8|SYR_MARN8,VAALPENS,67
289 | sp|A1AVC2|SYR_RUTMC,SEALLIKE,483
290 | sp|Q0AND9|SYS_MARMM,ETERNALS,48
291 | sp|P17222|T1SP_ECOLX,ENSILIST,81
292 | sp|Q5VWN6|TASO2_HUMAN,TEEMLESS,635
293 | sp|Q46149|TCDA_CLONO,SIGFILES,1074
294 | sp|Q5UPT1|TF2B_MIMIV,EKISTICS,244
295 | sp|B5Z7K9|THIM_HELPG,LENSLIKE,251
296 | sp|Q1CT25|THIM_HELPH,LENSLIKE,251
297 | sp|Q9ZKZ9|THIM_HELPJ,LENSLIKE,250
298 | sp|B2USY3|THIM_HELPS,LENSLIKE,250
299 | sp|O25516|THIM_HELPY,LENSLIKE,250
300 | sp|Q0PDK7|TMP_BPSPP,FATAGAGA,111
301 | sp|P51743|TNFA_CEREL,CANALMAN,104
302 | sp|P78875|TPP1_SCHPO,TRINKETS,774
303 | sp|E2E2P2|TPS1D_ORIVU,TASSELER,503
304 | sp|Q5NPZ5|TRPF_ZYMMO,HETAERIA,85
305 | sp|O97399|TRYP_PHACE,DIALLELA,117
306 | sp|Q6PCN3|TTBK1_MOUSE,TEMESCAL,984
307 | sp|P59367|TX35C_PHONI,ARCADING,33
308 | sp|Q9VYV3|TXND5_DROME,LAKELIKE,207
309 | sp|Q9XZ16|UBCP1_DROME,STEADIED,79
310 | sp|B2RM62|UVRC_PORG3,LENSLIKE,80
311 | sp|Q7MTG8|UVRC_PORGI,LENSLIKE,79
312 | sp|Q8SQU9|VATA_ENCCU,DISKLIKE,509
313 | sp|Q97CP8|VATD_THEVO,VALERIAN,73
314 | sp|B5YFA5|VATE_DICT6,RIVERLET,5
315 | sp|P17284|VIF_SIVCZ,ALKALISE,150
316 | sp|B8I9N8|XERC_METNO,LALLYGAG,161
317 | sp|B0UNY7|XERC_METS4,LALLYGAG,161
318 | sp|Q6R7F2|Y077_OSHVF,ANTECELL,1119
319 | sp|Q9X0P5|Y1162_THEMA,TREMELLA,145
320 | sp|B7IVJ7|Y1177_BACC2,FLAKIEST,188
321 | sp|O67364|Y1349_AQUAE,FREAKIER,23
322 | sp|B1LCS9|Y1653_THESQ,TREMELLA,145
323 | sp|P47490|Y248_MYCGE,KRISTIAN,3
324 | sp|Q9K275|Y344_CHLPN,PLANILLA,110
325 | sp|A0RHX6|Y3586_BACAH,FLAKIEST,188
326 | sp|Q6HEK2|Y3705_BACHK,FLAKIEST,188
327 | sp|Q635W6|Y3720_BACCZ,FLAKIEST,188
328 | sp|B9IW40|Y3749_BACCQ,FLAKIEST,188
329 | sp|A9VUC1|Y3786_BACMK,FLAKIEST,188
330 | sp|Q81MS4|Y3872_BACAN,FLAKIEST,188
331 | sp|Q819L6|Y3960_BACCR,FLAKIEST,188
332 | sp|B7JKT4|Y3975_BACC0,FLAKIEST,188
333 | sp|Q732A7|Y4007_BACC1,FLAKIEST,188
334 | sp|B7H6U7|Y4063_BACC4,FLAKIEST,188
335 | sp|C1EPX4|Y4065_BACC3,FLAKIEST,188
336 | sp|B7HME5|Y4079_BACC7,FLAKIEST,188
337 | sp|C3P6W7|Y4195_BACAA,FLAKIEST,188
338 | sp|C3LI26|Y4213_BACAC,FLAKIEST,188
339 | sp|Q8LDV3|Y4320_ARATH,VINERIES,85
340 | sp|P75197|Y583_MYCPN,STETTING,170
341 | sp|Q5UPM1|YL149_MIMIV,INSTINCT,140
342 | sp|P42545|YO10_BPL2,GANTLINE,65
343 | sp|Q3ZC82|ZC3HE_BOVIN,RELAPSED,163
344 | 


--------------------------------------------------------------------------------
/uniprot_words/data/word_matches_fi.csv:
--------------------------------------------------------------------------------
  1 | .id,Keyword,Offset
  2 | sp|O29876|OGG1_ARCFU,STANSSATA,47
  3 | sp|Q92R46|ERA_RHIME,AIKAISTAA,265
  4 | sp|A6U7A9|ERA_SINMW,AIKAISTAA,262
  5 | sp|A5GWN4|GCSP_SYNR3,TAVALLAAN,775
  6 | sp|P36337|GH_MEHV1,NARRAILLA,488
  7 | sp|A5CCZ2|HTPG_ORITB,TIIKKINEN,574
  8 | sp|Q74MI1|SYC_NANEQ,TAKSIVENE,324
  9 | sp|A5D7S3|TRM1L_BOVIN,SADETAKKI,480
 10 | sp|Q7Z2T5|TRM1L_HUMAN,SADETAKKI,474
 11 | sp|Q4R6C7|TRM1L_MACFA,SADETAKKI,434
 12 | sp|A2RSY6|TRM1L_MOUSE,SADETAKKI,469
 13 | sp|Q5R5T0|TRM1L_PONAB,SADETAKKI,474
 14 | sp|Q496Z9|TRM1L_RAT,SADETAKKI,465
 15 | sp|O29015|Y1253_ARCFU,KATKEILLA,25
 16 | sp|Q9VW60|ADCY2_DROME,ASIAMIES,773
 17 | sp|P02656|APOC3_HUMAN,VALLALLA,9
 18 | sp|P33622|APOC3_MOUSE,VALLALLA,9
 19 | sp|A9F3R4|ATPB_SORC5,VISKAALI,210
 20 | sp|O05098|ATPF_CLOAB,VALTIKKA,41
 21 | sp|O89001|CBPD_MOUSE,SADANNES,873
 22 | sp|A7ZTS1|CBRB_ECO24,RIIPALLA,125
 23 | sp|A8A6H8|CBRB_ECOHS,RIIPALLA,125
 24 | sp|A1AHP9|CBRB_ECOK1,RIIPALLA,125
 25 | sp|Q0TAZ2|CBRB_ECOL5,RIIPALLA,125
 26 | sp|Q8FBU5|CBRB_ECOL6,RIIPALLA,125
 27 | sp|P31468|CBRB_ECOLI,RIIPALLA,123
 28 | sp|Q1R4L9|CBRB_ECOUT,RIIPALLA,125
 29 | sp|Q0SYQ7|CBRB_SHIF8,RIIPALLA,125
 30 | sp|Q83IZ9|CBRB_SHIFL,RIIPALLA,125
 31 | sp|Q3YWJ8|CBRB_SHISS,RIIPALLA,125
 32 | sp|Q8N326|CJ111_HUMAN,VALSSATA,44
 33 | sp|A9TKY8|CSPL1_PHYPA,SAASTATA,133
 34 | sp|Q98JM5|DPO42_RHILO,VERISIDE,95
 35 | sp|B3NB67|EI3F2_DROER,ALHAALTA,232
 36 | sp|Q4PI64|EIF3H_USTMA,PAPATTAA,17
 37 | sp|P0CN57|EIF3L_CRYNB,PAPATTAA,586
 38 | sp|P0CN56|EIF3L_CRYNJ,PAPATTAA,586
 39 | sp|A5VAA8|FOLD3_RHIWR,ASETELLA,80
 40 | sp|A7GKK3|GATA_BACCN,ALATYYLI,304
 41 | sp|Q1DCA3|GATA_MYXXD,ALATYYLI,305
 42 | sp|Q2FTL0|HEM1_METHJ,KAADELLA,335
 43 | sp|B8GP02|IF2_THISH,AIKAPELI,289
 44 | sp|Q6NFC2|ISPF_CORDI,GRAAVATA,149
 45 | sp|Q8SQP0|KPYK_ENCCU,REKIKELI,19
 46 | sp|B0D8R3|MKAR_LACBS,ALLASTAA,97
 47 | sp|Q6NHQ7|MNMA_CORDI,ALLERGIA,109
 48 | sp|Q8FQ01|MNMA_COREF,ALLERGIA,109
 49 | sp|A4QDK1|MNMA_CORGB,ALLERGIA,109
 50 | sp|Q8NR24|MNMA_CORGL,ALLERGIA,109
 51 | sp|Q05049|MUC1_XENLA,TAPATTAA,72
 52 | sp|Q0AYR3|MURE_SYNWW,ALALLAAN,87
 53 | sp|Q3SYU9|MVP_BOVIN,KARRELLE,701
 54 | sp|O29876|OGG1_ARCFU,TANSSATA,48
 55 | sp|P52591|PO121_RAT,TAPATTAA,768
 56 | sp|C9JH25|PRRT4_HUMAN,VALLALLA,377
 57 | sp|B2RU40|PRRT4_MOUSE,VALLALLA,378
 58 | sp|C5A7L1|PSB1_THEGJ,ALALLEEN,144
 59 | sp|A0A494C071|PWWP4_HUMAN,STARTATA,1054
 60 | sp|A5CCK6|RS3_ORITB,KIINTEYS,42
 61 | sp|B3CT11|RS3_ORITI,KIINTEYS,43
 62 | sp|Q8D3I1|RSMA_WIGBR,KIIKKIIN,17
 63 | sp|Q21MH7|RSMH_SACD2,KAIVERRE,169
 64 | sp|Q6FFZ7|RUTA_ACIAD,VARMASTI,97
 65 | sp|B9JLT9|RUTA_AGRRK,VARMASTI,97
 66 | sp|B0SW63|RUTA_CAUSK,VARMASTI,97
 67 | sp|D5VGV4|RUTA_CAUST,VARMASTI,97
 68 | sp|Q9A4N2|RUTA_CAUVC,VARMASTI,97
 69 | sp|B8H1Q4|RUTA_CAUVN,VARMASTI,97
 70 | sp|A7ME52|RUTA_CROS8,VARMASTI,74
 71 | sp|C9Y0S7|RUTA_CROTZ,VARMASTI,113
 72 | sp|A4W925|RUTA_ENT38,VARMASTI,97
 73 | sp|D5CE32|RUTA_ENTCC,VARMASTI,97
 74 | sp|B5XXN0|RUTA_KLEP3,VARMASTI,97
 75 | sp|A6T7A2|RUTA_KLEP7,VARMASTI,97
 76 | sp|D3RKL0|RUTA_KLEVT,VARMASTI,97
 77 | sp|B7KWT7|RUTA_METC4,VARMASTI,109
 78 | sp|C5B0U9|RUTA_METEA,VARMASTI,97
 79 | sp|C7CM36|RUTA_METED,VARMASTI,97
 80 | sp|A9W3I1|RUTA_METEP,VARMASTI,105
 81 | sp|B1ZB15|RUTA_METPB,VARMASTI,109
 82 | sp|A8GCT6|RUTA_SERP5,VARMASTI,97
 83 | sp|A4VQH4|RUTA_STUS1,VARMASTI,97
 84 | sp|C5CN79|RUTA_VARPS,VARMASTI,97
 85 | sp|A1JMY1|RUTA_YERE8,VARMASTI,97
 86 | sp|Q4FTT9|RUVB_PSYA2,NIRPALLA,24
 87 | sp|Q1QCY5|RUVB_PSYCK,NIRPALLA,24
 88 | sp|Q52428|SYD_THEKO,KYMMENEN,315
 89 | sp|Q5E8Y6|SYGA_ALIF1,KESKELLE,229
 90 | sp|B5FEV9|SYGA_ALIFM,KESKELLE,229
 91 | sp|Q87TP7|SYGA_VIBPA,KESKELLE,229
 92 | sp|P67600|SYV_MYCBO,KELASTAA,838
 93 | sp|Q9CBY7|SYV_MYCLE,KELASTAA,838
 94 | sp|P9WFS8|SYV_MYCTO,KELASTAA,838
 95 | sp|P9WFS9|SYV_MYCTU,KELASTAA,838
 96 | sp|A3LPG0|TRM82_PICST,LISENSSI,373
 97 | sp|Q5NPZ7|TRPA_ZYMMO,KENRAALI,13
 98 | sp|P42664|UVS2_XENLA,KISAILLA,4
 99 | sp|B8DHJ9|Y1020_LISMH,KASKIMAA,78
100 | sp|A8AG56|Y1332_CITK8,ILMAILLA,152
101 | sp|Q8Y6Y2|Y1549_LISMO,KASKIMAA,78
102 | sp|C1KVJ6|Y1560_LISMC,KASKIMAA,78
103 | sp|A0AIZ8|Y1562_LISW6,KASKIMAA,78
104 | sp|Q71ZC0|Y1569_LISMF,KASKIMAA,78
105 | sp|Q92BG5|Y1584_LISIN,KASKIMAA,78
106 | sp|Q5UQE2|YR474_MIMIV,LINTSARI,44
107 | sp|P54992|YSNA_STRPR,PAPATTAA,177
108 | sp|Q551M4|ZFPL1_DICDI,KIINNIKE,275
109 | sp|Q6WRX3|ZY11A_HUMAN,KAKISTAA,110
110 | 


--------------------------------------------------------------------------------
/uniprot_words/data/word_matches_fr.csv:
--------------------------------------------------------------------------------
  1 | .id,Keyword,Offset
  2 | sp|P40069|IMB4_YEAST,FERRAILLAI,371
  3 | sp|A1R703|AROB_PAEAT,RELIERAIS,220
  4 | sp|P02537|K1C0_XENLA,HALETANTE,145
  5 | sp|Q98SL1|LDHB_CAICA,LITHIASES,8
  6 | sp|Q93YQ3|PURU1_ARATH,IRRITERAS,2
  7 | sp|A9BDD9|PYRG_PROM4,HACIENDAS,315
  8 | sp|A5EW94|SECB_DICNV,REVISSAIT,114
  9 | sp|P40069|IMB4_YEAST,FERRAILLA,371
 10 | sp|O70576|STAG3_MOUSE,RAMASSAGE,453
 11 | sp|Q99M76|STAG3_RAT,RAMASSAGE,453
 12 | sp|Q8T664|ABCH2_DICDI,ERRERAIS,156
 13 | sp|Q1CY84|SAHH_MYXXD,PALMAIRE,35
 14 | sp|Q8T664|ABCH2_DICDI,SERRERAI,155
 15 | sp|Q8TGA2|AFLA_ASPPU,DAMASSAI,242
 16 | sp|P22197|ALFC7_ARATH,INVENTES,43
 17 | sp|Q1AS71|ALLB_RUBXD,GRAILLAS,222
 18 | sp|A0A2H3CSB7|ARMOM_ARMGA,VIEILLES,378
 19 | sp|A0JX82|AROB_ARTS2,RELIERAI,220
 20 | sp|A1R703|AROB_PAEAT,RELIERAI,220
 21 | sp|B8H8V5|AROB_PSECP,RELIERAI,220
 22 | sp|Q47QY7|AROB_THEFY,RELIERAI,216
 23 | sp|O94649|ATG2_SCHPO,RATISSAI,1174
 24 | sp|Q24MN7|ATPF_DESHY,SALADIER,107
 25 | sp|O52587|BIOD_MYCBO,SALARIAT,184
 26 | sp|A1KJ01|BIOD_MYCBP,SALARIAT,184
 27 | sp|C1ANK7|BIOD_MYCBT,SALARIAT,184
 28 | sp|P9WPQ4|BIOD_MYCTO,SALARIAT,184
 29 | sp|Q6J6I8|BRCA1_GORGO,PELTASTE,1637
 30 | sp|P38398|BRCA1_HUMAN,PELTASTE,1637
 31 | sp|Q9GKK8|BRCA1_PANTR,PELTASTE,1637
 32 | sp|Q6J6J0|BRCA1_PONPY,PELTASTE,1637
 33 | sp|Q9H0E9|BRD8_HUMAN,SELLETTE,69
 34 | sp|Q8R3B7|BRD8_MOUSE,SELLETTE,69
 35 | sp|Q04520|BUDC_RAOTE,AGGRAVAI,49
 36 | sp|P93147|C81E1_GLYEC,GLAIRAIS,440
 37 | sp|Q9LSE1|CDG1_ARATH,CAPEYANT,244
 38 | sp|Q5U3Z0|CF298_RAT,REPLISSE,230
 39 | sp|Q22516|CHD3_CAEEL,CRICKETS,331
 40 | sp|Q9ZV43|CHR8_ARATH,RAFLASSE,674
 41 | sp|Q9ZPR0|COQ4_ARATH,GRAILLER,61
 42 | sp|Q7F2E4|CSB_ORYSJ,RAFLASSE,660
 43 | sp|B2RX88|CSPP1_MOUSE,ARRANGER,254
 44 | sp|C3PNF5|DAPA_RICAE,NICKELLE,214
 45 | sp|A8EYZ4|DAPA_RICCK,NICKELLE,214
 46 | sp|Q92I25|DAPA_RICCN,NICKELLE,214
 47 | sp|A8F1K3|DAPA_RICM5,NICKELLE,220
 48 | sp|Q9AKQ3|DAPA_RICMO,NICKELLE,214
 49 | sp|C4K288|DAPA_RICPU,NICKELLE,214
 50 | sp|Q9AKJ9|DAPA_RICRI,NICKELLE,214
 51 | sp|B0BXJ1|DAPA_RICRO,NICKELLE,214
 52 | sp|A8GS25|DAPA_RICRS,NICKELLE,214
 53 | sp|D0PV95|DDX3_CAEEL,GARDERIE,191
 54 | sp|C5DGU9|DEF1_LACTC,NARRERAS,171
 55 | sp|O62215|DHSD_CAEEL,SAPRISTI,20
 56 | sp|Q14185|DOCK1_HUMAN,IMMENSES,661
 57 | sp|Q8BUR4|DOCK1_MOUSE,IMMENSES,661
 58 | sp|Q0A5C9|DTD_ALKEH,ASPERGEA,97
 59 | sp|A1JIQ4|EPMA_YERE8,SLAVISTE,221
 60 | sp|Q5M9G9|FAKD4_RAT,INSTALLE,455
 61 | sp|Q5YR85|FLUC2_NOCFA,PAILLAIS,12
 62 | sp|P43708|FTN2_HAEIN,SLAVISAI,66
 63 | sp|O83746|FTSH_TREPA,RAVAGEAS,192
 64 | sp|A3PCW7|G6PI_PROM0,RADINAIS,89
 65 | sp|A0A1D8PNP3|GAP6_CANAL,PLAGIATS,389
 66 | sp|Q8Y3C6|GATB_RALSO,GAVERAIT,54
 67 | sp|Q92538|GBF1_HUMAN,PISSASSE,273
 68 | sp|Q2SFI6|GCSP_HAHCH,ALLAITAS,342
 69 | sp|Q5Z175|GLPK_NOCFA,GLISSADE,317
 70 | sp|Q03877|GP85_TRYCR,ATLANTES,538
 71 | sp|P08492|HN_PI3H4,AGNELETS,13
 72 | sp|P12562|HN_PI3HT,AGNELETS,13
 73 | sp|P12563|HN_PI3HU,AGNELETS,13
 74 | sp|P12564|HN_PI3HV,AGNELETS,13
 75 | sp|P12565|HN_PI3HW,AGNELETS,13
 76 | sp|P12566|HN_PI3HX,AGNELETS,13
 77 | sp|C5DYQ1|INA17_ZYGRC,INERTIEL,64
 78 | sp|P0CO17|INO80_CRYNB,REDEVRAI,241
 79 | sp|P0CO16|INO80_CRYNJ,REDEVRAI,241
 80 | sp|P0A1I4|INVA_SALTI,PALLIAIS,244
 81 | sp|P0A1I3|INVA_SALTY,PALLIAIS,244
 82 | sp|O94854|K0754_HUMAN,AGNELLES,2413
 83 | sp|P02537|K1C0_XENLA,HALETANT,145
 84 | sp|Q88Z42|KUP1_LACPL,PALPITER,651
 85 | sp|A1JU76|LCRD_YERE8,PALLIAIT,247
 86 | sp|P0C2V3|LCRD_YEREN,PALLIAIT,247
 87 | sp|P69955|LCRD_YERPE,PALLIAIT,247
 88 | sp|P69956|LCRD_YERPS,PALLIAIT,247
 89 | sp|Q98SL1|LDHB_CAICA,LITHIASE,8
 90 | sp|Q9SRX6|LEA2_ARATH,RAGEASSE,48
 91 | sp|Q6MEF3|LEPA_PARUW,RETIRAIT,545
 92 | sp|Q9UPN3|MACF1_HUMAN,AGNELLES,6286
 93 | sp|D3ZHV2|MACF1_RAT,AGNELLES,4328
 94 | sp|O14323|MCP4_SCHPO,PAVASSES,192
 95 | sp|P0DQK9|MDS1_AGALE,PLAISAIS,6
 96 | sp|L0P329|MDS_AGACL,PLAISAIS,55
 97 | sp|L0P3K3|MDS_AGADC,PLAISAIS,55
 98 | sp|A0A5Q0MU22|MDS_AGASP,PLAISAIS,55
 99 | sp|Q5YRD1|METN_NOCFA,GRIVELAS,222
100 | sp|Q9ZE90|MNMG_RICPR,FILTRATS,397
101 | sp|Q68XT0|MNMG_RICTY,FILTRATS,397
102 | sp|B2VDB1|MRAZ_ERWT9,GRILLANT,90
103 | sp|A3CR17|MUTS_STRSV,GLISSAIS,392
104 | sp|B7VK59|MUTS_VIBA3,SELLERAI,403
105 | sp|Q9Y2K3|MYH15_HUMAN,GALERNES,1561
106 | sp|Q5VU43|MYOME_HUMAN,AVALERAI,485
107 | sp|Q5DTJ9|MYPN_MOUSE,TERRERAS,201
108 | sp|Q606N2|NAGZ_METCA,DALLASSE,311
109 | sp|Q9Y618|NCOR2_HUMAN,GRAISSAS,1319
110 | sp|Q5JPE7|NOMO2_HUMAN,FASEILLE,1252
111 | sp|Q9RL35|NPD1_STRCO,GAGISTES,39
112 | sp|Q8R984|NPD2_CALS4,GAGISTES,28
113 | sp|A8MBU4|NPD_CALMQ,GAGISTES,28
114 | sp|Q6N6U0|NPD_RHOPA,GAGISTES,28
115 | sp|B5YJW3|NPD_THEYD,GAGISTES,27
116 | sp|Q750J0|NPR3_ASHGO,REPASSAI,826
117 | sp|Q9V463|NU154_DROME,VESTALES,453
118 | sp|A2VDP6|NXPE3_BOVIN,GRISETTE,217
119 | sp|Q969Y0|NXPE3_HUMAN,GRISETTE,217
120 | sp|Q5RCA5|NXPE3_PONAB,GRISETTE,217
121 | sp|P24102|PER22_ARATH,AFFALANT,158
122 | sp|O80912|PER23_ARATH,AFFALANT,158
123 | sp|Q9LHB9|PER32_ARATH,AFFALANT,158
124 | sp|Q8CHS4|PLCX1_MOUSE,VAGINITE,271
125 | sp|Q9FZD1|PPR58_ARATH,ASPIRAIS,25
126 | sp|A4YKF1|PROA_BRASO,AMERRIRA,46
127 | sp|P50852|PTMCB_GEOSE,PLANIFIE,181
128 | sp|B8DTV0|PUR7_BIFA0,GRILLADE,186
129 | sp|Q0VRD0|PURT_ALCBS,REVALAIT,252
130 | sp|Q93YQ3|PURU1_ARATH,IRRITERA,2
131 | sp|A9BDD9|PYRG_PROM4,HACIENDA,315
132 | sp|P20742|PZP_HUMAN,PRISASSE,91
133 | sp|Q5NP84|QUEA_ZYMMO,VEILLERA,133
134 | sp|Q5BPM6|QWRF6_ARATH,ALLAITES,316
135 | sp|Q8U4J3|RFCS_PYRFU,INVERTIE,462
136 | sp|P74240|RIR1_SYNY3,REGISTRE,75
137 | sp|Q6N4R7|RL10_RHOPA,RELAVERA,11
138 | sp|Q15RL2|RNFD_PSEA6,TAILLAIS,81
139 | sp|A1SSX2|RNFD_PSYIN,TAILLAIS,81
140 | sp|Q3J9L3|RNH2_NITOC,LAMERAIS,94
141 | sp|C0QB17|RS2_DESAH,AVERTIES,254
142 | sp|Q92QG4|RS3_RHIME,SERRATES,215
143 | sp|C3MAY6|RS3_SINFN,SERRATES,215
144 | sp|A6U865|RS3_SINMW,SERRATES,215
145 | sp|Q0ABH9|RS7_ALKEH,GALERIES,45
146 | sp|Q1CY84|SAHH_MYXXD,EMPALMAI,33
147 | sp|O94855|SC24D_HUMAN,VIENDRAS,270
148 | sp|A5EW94|SECB_DICNV,REVISSAI,114
149 | sp|Q9W6G6|SEM3D_DANRE,PAIRESSE,697
150 | sp|Q8E3Y3|SERC_STRA3,PILLASSE,119
151 | sp|Q8DSV3|SERC_STRMU,PILLASSE,119
152 | sp|A3CPJ2|SERC_STRSV,PILLASSE,119
153 | sp|Q5LYP0|SERC_STRT1,PILLASSE,119
154 | sp|Q5M3A4|SERC_STRT2,PILLASSE,119
155 | sp|Q03JH6|SERC_STRTD,PILLASSE,119
156 | sp|B9DTW4|SERC_STRU0,PILLASSE,119
157 | sp|Q20480|SIR41_CAEEL,GAGISTES,35
158 | sp|Q20481|SIR42_CAEEL,GAGISTES,35
159 | sp|Q1JQC6|SIR4_BOVIN,GAGISTES,63
160 | sp|Q8IRR5|SIR4_DROME,GAGISTES,53
161 | sp|Q9Y6E7|SIR4_HUMAN,GAGISTES,62
162 | sp|Q8R216|SIR4_MOUSE,GAGISTES,59
163 | sp|Q9Z0I7|SLFN1_MOUSE,ALCALINS,43
164 | sp|Q8MNV7|SMAL1_CAEEL,GRILLADE,216
165 | sp|A5GPF9|SYA_SYNPW,RIESLING,621
166 | sp|Q8RB93|SYE1_CALS4,VARIERAI,465
167 | sp|Q4QL12|SYR_HAEI8,INSTALLA,476
168 | sp|A5UCH1|SYR_HAEIE,INSTALLA,476
169 | sp|A5UJ40|SYR_HAEIG,INSTALLA,476
170 | sp|P43832|SYR_HAEIN,INSTALLA,476
171 | sp|A9HLG2|SYS_GLUDA,ALARMERA,168
172 | sp|Q2RSR3|SYY_RHORT,SPLITTAS,226
173 | sp|Q8RI63|THIG_FUSNN,AIMANTAI,200
174 | sp|Q8DUR1|THII_STRMU,VAGINITE,379
175 | sp|Q5JTD0|TJAP1_HUMAN,PASSASSE,400
176 | sp|Q9DCD5|TJAP1_MOUSE,PASSASSE,395
177 | sp|P29463|TPT_SOLTU,PAILLETS,67
178 | sp|C0H537|TRM5_PLAF7,NIELLAGE,409
179 | sp|B3L2G0|TRM5_PLAKH,NIELLAGE,356
180 | sp|Q7UKG9|TRPB_RHOBA,FERLASSE,349
181 | sp|Q9Z4S7|TTRC_SALTY,RALLIAIT,56
182 | sp|Q62377|U2AFM_MOUSE,SERRERAS,365
183 | sp|A4XUW4|UVRC_PSEMY,SALAIRES,134
184 | sp|Q9LTT9|VCR_ARATH,RETISSAS,1149
185 | sp|Q8GYF5|WAKLR_ARATH,SASSERAS,277
186 | sp|Q5UPM1|YL149_MIMIV,INSTINCT,140
187 | 


--------------------------------------------------------------------------------
/uniprot_words/data/word_matches_it.csv:
--------------------------------------------------------------------------------
  1 | .id,Keyword,Offset
  2 | sp|P05098|PHEA_MICDP,ANNIDAVATE,45
  3 | sp|Q8NTW4|AFTA_CORGL,TRAVASATI,61
  4 | sp|Q12659|ARO1_PNECA,DIRIGISTI,1484
  5 | sp|Q8W4K3|CAAT4_ARATH,GIALLICCI,264
  6 | sp|Q4PB37|CLF1_USTMA,SALASSARE,623
  7 | sp|Q65T53|CYSJ_MANSM,ALLEVIARE,563
  8 | sp|B3PJ06|HTPX_CELJU,AFFITTIVA,194
  9 | sp|B4LQY8|INT3_DROVI,VESSERETE,538
 10 | sp|A2RNZ6|LYSP_LACLM,ALLATTAVA,371
 11 | sp|Q5AZ53|MANC_EMENI,ALLATTATA,10
 12 | sp|A4SV75|MURC_POLAQ,AVVISTAVA,67
 13 | sp|B1XT10|MURC_POLNS,AVVISTAVA,67
 14 | sp|B1I4X2|MURI_DESAP,GRIGLIATE,110
 15 | sp|D6VTK4|STE2_YEAST,STILLASSI,207
 16 | sp|P0CI39|STE2_YEASX,STILLASSI,207
 17 | sp|Q0KL02|TRIO_MOUSE,SEGHERETE,2402
 18 | sp|F1M0Z1|TRIO_RAT,SEGHERETE,2403
 19 | sp|Q2HA54|PLPL_CHAGB,PARLARMI,688
 20 | sp|Q9LJX0|AB19B_ARATH,SALDASSE,532
 21 | sp|Q8T664|ABCH2_DICDI,SERRERAI,155
 22 | sp|Q8NQ98|ACNA_CORGL,GIRAVITE,831
 23 | sp|Q6ZDQ1|AGM1_ORYSJ,NAVIGAVA,158
 24 | sp|D4B1B1|ALS1_ARTBC,STIPASTI,364
 25 | sp|P47631|AMPA_MYCGE,DIASTASI,418
 26 | sp|P75206|AMPA_MYCPN,DIASTASI,416
 27 | sp|A1SRZ1|AMPA_PSYIN,INVIGILA,312
 28 | sp|Q12QW7|AMPA_SHEDO,INVIGILA,313
 29 | sp|Q086N8|AMPA_SHEFN,INVIGILA,313
 30 | sp|Q9CIQ1|AMPN_LACLA,RELEGAVA,706
 31 | sp|P0C2T8|AMPN_LACLC,RELEGAVA,706
 32 | sp|A2RI32|AMPN_LACLM,RELEGAVA,706
 33 | sp|P45461|AMPR_YEREN,GIALAPPA,233
 34 | sp|Q75A82|ANT1_ASHGO,NAVIGATA,5
 35 | sp|P40532|APQ12_YEAST,INALEREI,115
 36 | sp|P77624|ARCM_ECOLI,STALLARE,219
 37 | sp|B2VBI7|ARNT_ERWT9,ALLAGAVI,420
 38 | sp|C1KWM3|AROA_LISMC,RIDAVATE,345
 39 | sp|Q71Y92|AROA_LISMF,RIDAVATE,345
 40 | sp|B8DC03|AROA_LISMH,RIDAVATE,345
 41 | sp|Q8Y5Y0|AROA_LISMO,RIDAVATE,345
 42 | sp|A0AK35|AROA_LISW6,RIDAVATE,345
 43 | sp|Q9V1H6|AROK_PYRAB,SGRASSAI,4
 44 | sp|P30329|ARSB_STAAU,DIGITALI,24
 45 | sp|Q8CQF4|ARSB_STAES,DIGITALI,24
 46 | sp|Q01255|ARSB_STAXY,DIGITALI,24
 47 | sp|P73241|ATCS_SYNY3,CASSIERA,17
 48 | sp|Q9LSW9|ATL16_ARATH,VIGILATA,41
 49 | sp|P93823|ATL1_ARATH,VIGILATA,48
 50 | sp|C8V3Y7|ATND_EMENI,SEGHETTA,172
 51 | sp|Q13XV9|ATPA1_PARXL,APPAIATE,446
 52 | sp|Q0C0X0|ATPF_HYPNA,RITRAETE,140
 53 | sp|Q9KNG8|ATPZ_VIBCH,ALGINICA,113
 54 | sp|F4KBM7|AVT6B_ARATH,TAGLIAVI,367
 55 | sp|P25927|BIGA_SALTY,VEGGENTI,1297
 56 | sp|P33144|BIMB_EMENI,SVARIATI,446
 57 | sp|Q4I7N9|BRE1_GIBZE,SARDELLA,373
 58 | sp|Q7S304|BRE1_NEUCR,SARDELLA,392
 59 | sp|Q9Z1S0|BUB1B_MOUSE,SALVERAI,93
 60 | sp|Q04520|BUDC_RAOTE,AGGRAVAI,49
 61 | sp|O74794|CCHL_SCHPO,REAGENTE,268
 62 | sp|Q6UY09|CEA20_HUMAN,VIGILAVI,452
 63 | sp|Q8BI06|CEMIP_MOUSE,SGASSAVA,26
 64 | sp|Q7SEY2|CFT1_NEUCR,VANGASTI,237
 65 | sp|A3KFM7|CHD6_MOUSE,VEGETAVI,1976
 66 | sp|Q7U5I4|CHLN_PARMW,ERARIALE,270
 67 | sp|Q6CJI9|CHO2_KLULA,ECCITATI,58
 68 | sp|D3Z7H8|CILP2_MOUSE,PRAGHESE,872
 69 | sp|Q99LJ5|CKLF3_MOUSE,VISITAVA,115
 70 | sp|Q9H9A5|CNO10_HUMAN,TESSESSE,495
 71 | sp|Q4R350|CNO10_MACFA,TESSESSE,495
 72 | sp|Q8BH15|CNO10_MOUSE,TESSESSE,495
 73 | sp|Q62GU3|COAE_BURMA,AMPLIARE,45
 74 | sp|Q3JNF3|COAE_BURP1,AMPLIARE,45
 75 | sp|I1WFB9|COAE_BURP2,AMPLIARE,45
 76 | sp|P0DMK3|COAE_BURPS,AMPLIARE,45
 77 | sp|Q6NG92|COBS_CORDI,APPARARE,109
 78 | sp|A9AYY2|COBS_HERA2,AVVIVATI,200
 79 | sp|Q22498|COPG_CAEEL,RIALTESI,629
 80 | sp|Q01331|CRTY_PSEVU,ALLAGAVI,126
 81 | sp|A4SDU0|CYSD_CHLPM,VESSASTI,258
 82 | sp|Q8EAZ9|CYSJ_SHEON,SASSELLA,341
 83 | sp|P71128|CYSM_CAMJE,ISLAMICA,73
 84 | sp|B2SD81|DAP_BRUA1,ADERISTI,302
 85 | sp|Q2YLB4|DAP_BRUA2,ADERISTI,302
 86 | sp|Q579G4|DAP_BRUAB,ADERISTI,302
 87 | sp|A9MCM7|DAP_BRUC2,ADERISTI,302
 88 | sp|C0RM93|DAP_BRUMB,ADERISTI,302
 89 | sp|Q8YD27|DAP_BRUME,ADERISTI,302
 90 | sp|A5VVL2|DAP_BRUO2,ADERISTI,302
 91 | sp|A9WVV8|DAP_BRUSI,ADERISTI,302
 92 | sp|Q8FV99|DAP_BRUSU,ADERISTI,302
 93 | sp|Q6BLM5|DBP9_DEBHA,DESTEREI,87
 94 | sp|Q3AS55|DDL_CHLCH,ADAGIAVA,150
 95 | sp|B4SAI2|DDL_PELPB,ADAGIAVA,150
 96 | sp|A4ZZ93|DHYSL_LEIDO,FISSAGGI,138
 97 | sp|Q9ZIV1|DNAK_MEGEL,PENTISSI,62
 98 | sp|A2WZI4|DRE1F_ORYSI,SPEDIRLA,118
 99 | sp|Q8S9Z5|DRE1F_ORYSJ,SPEDIRLA,118
100 | sp|P0CN13|DXO_CRYNB,FERRIERE,59
101 | sp|P0CN12|DXO_CRYNJ,FERRIERE,59
102 | sp|Q83I20|DXS_TROW8,REDIVIVA,510
103 | sp|Q83G46|DXS_TROWT,REDIVIVA,510
104 | sp|Q9P225|DYH2_HUMAN,LINGERIA,2190
105 | sp|P0C6F1|DYH2_MOUSE,LINGERIA,2219
106 | sp|Q7SBU6|EAF1_NEUCR,PAPPASSI,137
107 | sp|Q5L764|EFTS_CHLAB,PEDALARE,202
108 | sp|Q6PFQ2|EIF3C_DANRE,IFIGENIA,418
109 | sp|Q74FS7|END4_GEOSL,VILLETTA,141
110 | sp|Q80X91|F110D_MOUSE,RAPPRESE,263
111 | sp|B1MKD7|FABH_MYCA9,TRINELLA,264
112 | sp|A1STW1|FABH_PSYIN,AVVISATE,163
113 | sp|Q6LTK3|FADJ_PHOPR,SIFFATTA,294
114 | sp|A1S7L6|FADJ_SHEAM,SIFFATTE,288
115 | sp|A3D684|FADJ_SHEB5,SIFFATTE,288
116 | sp|A6WQ25|FADJ_SHEB8,SIFFATTE,288
117 | sp|A3QFP3|FADJ_SHELP,SIFFATTE,288
118 | sp|Q8ECP7|FADJ_SHEON,SIFFATTE,288
119 | sp|A4Y897|FADJ_SHEPC,SIFFATTE,288
120 | sp|A0KV76|FADJ_SHESA,SIFFATTE,288
121 | sp|Q0HKD1|FADJ_SHESM,SIFFATTE,288
122 | sp|Q0HWN3|FADJ_SHESR,SIFFATTE,288
123 | sp|A1RI92|FADJ_SHESW,SIFFATTE,288
124 | sp|A7MS61|FADJ_VIBC1,SIFFATTE,289
125 | sp|A5F2P2|FADJ_VIBC3,SIFFATTE,292
126 | sp|Q9KT58|FADJ_VIBCH,SIFFATTE,292
127 | sp|Q87MM3|FADJ_VIBPA,SIFFATTE,289
128 | sp|Q8DB47|FADJ_VIBVU,SIFFATTE,289
129 | sp|Q7MIS5|FADJ_VIBVY,SIFFATTE,289
130 | sp|B1MIT2|FGD_MYCA9,VELAVATE,22
131 | sp|O83710|FLHB_TREPA,TRATTASI,84
132 | sp|P9WES5|FOGB_ASPRC,CALAFATE,20
133 | sp|Q5FIU5|FTHS2_LACAC,AVVIVATA,322
134 | sp|C3P858|FTHS_BACAA,AVVIVATI,330
135 | sp|C3LKJ6|FTHS_BACAC,AVVIVATI,330
136 | sp|A0RD97|FTHS_BACAH,AVVIVATI,330
137 | sp|Q81RE1|FTHS_BACAN,AVVIVATI,330
138 | sp|B7JLG8|FTHS_BACC0,AVVIVATI,330
139 | sp|Q739F4|FTHS_BACC1,AVVIVATI,330
140 | sp|B7IUA4|FTHS_BACC2,AVVIVATI,330
141 | sp|C1ES77|FTHS_BACC3,AVVIVATI,330
142 | sp|B7HP29|FTHS_BACC7,AVVIVATI,330
143 | sp|B9IYP4|FTHS_BACCQ,AVVIVATI,330
144 | sp|Q81E87|FTHS_BACCR,AVVIVATI,330
145 | sp|Q63C61|FTHS_BACCZ,AVVIVATI,330
146 | sp|Q6HJK9|FTHS_BACHK,AVVIVATI,330
147 | sp|Q891R3|FTHS_CLOTE,AVVIVATI,327
148 | sp|Q834D6|FTHS_ENTFA,AVVIVATI,324
149 | sp|Q88W76|FTHS_LACPL,AVVIVATI,320
150 | sp|Q03S45|FTHS_LEVBA,AVVIVATI,320
151 | sp|Q83WS0|FTHS_METEA,AVVIVATI,324
152 | sp|A9VZT0|FTHS_METEP,AVVIVATI,324
153 | sp|B8EKB9|FTHS_METSB,AVVIVATI,325
154 | sp|A9WMW3|FTHS_RENSM,AVVIVATI,334
155 | sp|Q59925|FTHS_STRMU,AVVIVATI,324
156 | sp|Q3APF2|GATB_CHLCH,PARLAGLI,367
157 | sp|Q1D651|GLGE_MYXXD,ALLEGAVA,126
158 | sp|B2S889|GLO2_BRUA1,ESALTATI,28
159 | sp|Q2YLU8|GLO2_BRUA2,ESALTATI,25
160 | sp|Q57AW2|GLO2_BRUAB,ESALTATI,25
161 | sp|A9M8S2|GLO2_BRUC2,ESALTATI,28
162 | sp|C0RFI1|GLO2_BRUMB,ESALTATI,28
163 | sp|Q8YJF4|GLO2_BRUME,ESALTATI,25
164 | sp|A5VSR1|GLO2_BRUO2,ESALTATI,25
165 | sp|B0CIU0|GLO2_BRUSI,ESALTATI,28
166 | sp|Q8FYE7|GLO2_BRUSU,ESALTATI,25
167 | sp|P64183|GLPD1_MYCBO,SARAVINA,231
168 | sp|P9WN80|GLPD1_MYCTO,SARAVINA,231
169 | sp|P9WN81|GLPD1_MYCTU,SARAVINA,231
170 | sp|Q2RFW7|GLYA_MOOTA,AVARIARE,156
171 | sp|Q0AIY2|GRPE_NITEC,STENTERA,30
172 | sp|A6VDX9|GSH1_PSEA7,SELLERIA,405
173 | sp|P0CS37|HAT2_CRYNB,SPARIRAI,124
174 | sp|P0CS36|HAT2_CRYNJ,SPARIRAI,124
175 | sp|P04662|HEMA_I75A5,SVELLETE,45
176 | sp|O53333|HIGA3_MYCTU,DIRADAVA,9
177 | sp|B8E2C7|HIS4_DICTD,RALLEGRI,223
178 | sp|Q3AD55|HISZ_CARHZ,LEGIFERA,192
179 | sp|O64966|HMDH1_GOSHI,ILLATIVA,540
180 | sp|Q5P502|HSLV_AROAE,RALLENTA,140
181 | sp|Q8N5X7|IF4E3_HUMAN,APPAGARE,8
182 | sp|Q9NPH9|IL26_HUMAN,SCASSARE,123
183 | sp|O60100|IMB4_SCHPO,TEATRALE,25
184 | sp|P0DX14|INLPC_STRC4,APPETIVA,127
185 | sp|P63394|IRTB_MYCBO,PALESTRA,295
186 | sp|P9WQJ6|IRTB_MYCTO,PALESTRA,295
187 | sp|P9WQJ7|IRTB_MYCTU,PALESTRA,295
188 | sp|A6TWK9|ISPF_ALKMQ,ANNIDATI,91
189 | sp|P9WKF8|ISPH1_MYCTO,VERSIATE,309
190 | sp|P9WKF9|ISPH1_MYCTU,VERSIATE,309
191 | sp|P0A5I3|ISPH2_MYCBO,VERSIATE,309
192 | sp|Q88M04|KCY_PSEPK,VAGLIARE,21
193 | sp|P65208|KDGT1_SALTI,STAGNAVA,255
194 | sp|P65207|KDGT1_SALTY,STAGNAVA,255
195 | sp|Q8PKS3|KDSB_XANAC,ALLAGARE,43
196 | sp|Q3BTC6|KDSB_XANC5,ALLAGARE,43
197 | sp|Q8I719|KGP_PLAF7,DELETERI,539
198 | sp|W7JX98|KGP_PLAFO,DELETERI,539
199 | sp|P0CU29|KTU_DROWI,DEFERIRE,36
200 | sp|A5CX27|LEUC_VESOH,IRENISTA,15
201 | sp|B8NWW3|LNBC_ASPFN,GETTASTI,308
202 | sp|B9JA09|LPXK_AGRRK,GRADELLA,300
203 | sp|Q9H089|LSG1_HUMAN,TASTASSE,623
204 | sp|A8XJZ8|LST2_CAEBR,VIETASSE,381
205 | sp|Q96LR2|LURA1_HUMAN,VIAGGERA,174
206 | sp|Q91YU6|LZTS2_MOUSE,PARETATA,13
207 | sp|Q3LUD4|LZTS2_RAT,PARETATA,13
208 | sp|Q77SJ8|L_HIRRV,TIRATEVI,1480
209 | sp|Q82707|L_IHNVO,TIRATEVI,1480
210 | sp|Q82685|L_IHNVW,TIRATEVI,1480
211 | sp|Q1LVZ2|MARH2_DANRE,PICRICHE,62
212 | sp|Q5PQ35|MARH2_XENLA,PICRICHE,62
213 | sp|Q28EX7|MARH2_XENTR,PICRICHE,62
214 | sp|B0C1Y1|MEND_ACAM1,RISVEGLI,346
215 | sp|A4FG19|MIBS_SACEN,LASCEREI,266
216 | sp|A0A319DV72|MLFA_ASPSB,PRESIEDI,2062
217 | sp|Q6NHQ7|MNMA_CORDI,ALLERGIA,109
218 | sp|Q8FQ01|MNMA_COREF,ALLERGIA,109
219 | sp|A4QDK1|MNMA_CORGB,ALLERGIA,109
220 | sp|Q8NR24|MNMA_CORGL,ALLERGIA,109
221 | sp|Q0RKY6|MSHB1_FRAAA,RIGELERA,60
222 | sp|D7BQJ3|MSHB_STRBB,RIGELAVA,71
223 | sp|O74472|MUG33_SCHPO,CICLISTI,96
224 | sp|Q8UDM9|MURC_AGRFC,PIEGASSE,400
225 | sp|Q0AJE2|MURC_NITEC,AVVISTAI,67
226 | sp|Q82VS2|MURC_NITEU,AVVISTAI,67
227 | sp|Q2JD52|MURD_FRACC,RALLARGA,42
228 | sp|Q5E7G7|MUTS_ALIF1,CELLERAI,405
229 | sp|B5FAC8|MUTS_ALIFM,CELLERAI,405
230 | sp|Q6LMU0|MUTS_PHOPR,CELLERAI,405
231 | sp|B7VK59|MUTS_VIBA3,SELLERAI,403
232 | sp|Q87LQ9|MUTS_VIBPA,CELLERAI,403
233 | sp|Q8DC53|MUTS_VIBVU,CELLERAI,403
234 | sp|Q7MHR2|MUTS_VIBVY,CELLERAI,403
235 | sp|O35942|NEK2_MOUSE,TARSENSE,393
236 | sp|P51956|NEK3_HUMAN,ALTALENA,371
237 | sp|Q60CT7|NFI_METCA,PALLEALE,95
238 | sp|Q5BDY8|NLSA_EMENI,SGRAVERA,3294
239 | sp|Q8T8C0|NOS_BOMMO,SVAGASSI,649
240 | sp|F4IGA5|NU133_ARATH,SLITTAVA,237
241 | sp|O78706|NU1M_PHACI,SPILLAVA,11
242 | sp|A0LJM5|NUBCD_SYNFM,RIESSERE,213
243 | sp|Q9XAQ7|NUOD2_STRCO,ASPRETTE,6
244 | sp|A2VDP6|NXPE3_BOVIN,GRISETTE,217
245 | sp|Q969Y0|NXPE3_HUMAN,GRISETTE,217
246 | sp|Q5RCA5|NXPE3_PONAB,GRISETTE,217
247 | sp|P23214|OAC_BPSFV,SAGGIAVI,39
248 | sp|Q57483|OM26_HAEIN,TALALGIA,8
249 | sp|K7NTD0|OSTC2_DIPSG,TESSESSE,126
250 | sp|O04226|P5CS1_ORYSJ,STREMAVA,294
251 | sp|Q77MR9|PAP_GAHVM,ASSALIVA,229
252 | sp|Q8AV28|PCM1_CHICK,ASSETATE,1704
253 | sp|Q6FJA3|PDC1_CANGA,ANISETTA,130
254 | sp|P06169|PDC1_YEAST,ANISETTA,130
255 | sp|P16467|PDC5_YEAST,ANISETTA,130
256 | sp|P34734|PDC_HANUV,ANISETTA,130
257 | sp|G1UBC2|PGA47_CANAL,TESTASTE,252
258 | sp|Q65W08|PGK_MANSM,SENAPATE,263
259 | sp|Q7TVK8|PHAS_MYCBO,PARASALE,1961
260 | sp|A1KQG0|PHAS_MYCBP,PARASALE,1961
261 | sp|A5U9F4|PHAS_MYCTA,PARASALE,1961
262 | sp|P9WQE8|PHAS_MYCTO,PARASALE,1961
263 | sp|P9WQE9|PHAS_MYCTU,PARASALE,1961
264 | sp|P05098|PHEA_MICDP,ANNIDAVA,45
265 | sp|P29296|PHEA_PSETP,ANNIDAVA,45
266 | sp|Q6WB63|PHNC_ALCFA,SPEDIRLA,119
267 | sp|Q8CHS4|PLCX1_MOUSE,VAGINITE,271
268 | sp|Q2HA54|PLPL_CHAGB,ALACRITA,444
269 | sp|Q4PSN0|PME29_ARATH,RIFIGLIA,5
270 | sp|Q2FUB2|POK_METHJ,PRECARIA,124
271 | sp|O92529|POLG_HCVT5,FLATTING,1074
272 | sp|P9WI34|PPE13_MYCTO,TRATTARE,411
273 | sp|P9WI35|PPE13_MYCTU,TRATTARE,411
274 | sp|P9WI04|PPE32_MYCTO,SLATTATA,215
275 | sp|P9WI05|PPE32_MYCTU,SLATTATA,216
276 | sp|O42900|PPK19_SCHPO,RISALITI,488
277 | sp|B0R7F5|PRIL_HALS3,AVVERARE,34
278 | sp|Q9HN47|PRIL_HALSA,AVVERARE,34
279 | sp|C0R0B8|PROA_BRAHW,PIGLIAVI,116
280 | sp|Q7WQL9|PROB_BORBR,RECARGLI,332
281 | sp|Q7W1P3|PROB_BORPA,RECARGLI,332
282 | sp|Q7VZX7|PROB_BORPE,RECARGLI,332
283 | sp|Q2UH00|PRP28_ASPOR,AGGIRARE,305
284 | sp|P20053|PRP4_YEAST,MENINGEE,93
285 | sp|P35820|PSC_DROME,INATTIVE,271
286 | sp|Q16825|PTN21_HUMAN,APPARARE,727
287 | sp|O27427|PURL_METTH,SAGGIAVA,613
288 | sp|Q93YQ3|PURU1_ARATH,IRRITERA,2
289 | sp|A7MSE0|PYRB_VIBC1,ALLEGARE,253
290 | sp|Q8DCF6|PYRB_VIBVU,ALLEGARE,253
291 | sp|Q7MHF1|PYRB_VIBVY,ALLEGARE,253
292 | sp|P74782|PYRD_SYNY3,ANNEGAVA,130
293 | sp|A9BDD9|PYRG_PROM4,HACIENDA,315
294 | sp|A2BJ23|PYRI_HYPBU,GREGARIA,37
295 | sp|B3MA91|QTRT2_DROAN,AFFERIRE,362
296 | sp|Q00799|RBP2_PLAVB,PENDETTE,2742
297 | sp|Q2NCT4|RECR_ERYLH,PALLEALE,40
298 | sp|C1F3B3|RIMM_ACIC5,INASTATA,178
299 | sp|Q3SI16|RIMO_THIDA,SPIEGAVA,339
300 | sp|P61726|RISB1_RHOPA,ALLEGAVA,34
301 | sp|Q1QMB5|RISB_NITHX,ALLEGAVA,34
302 | sp|Q3SRV7|RISB_NITWN,ALLEGAVA,34
303 | sp|C1DQ78|RL13_AZOVD,PERVIETA,97
304 | sp|Q2S9X2|RL13_HAHCH,PERVIETA,97
305 | sp|Q48EE0|RL13_PSE14,PERVIETA,97
306 | sp|A6VBA6|RL13_PSEA7,PERVIETA,97
307 | sp|B7UZL1|RL13_PSEA8,PERVIETA,97
308 | sp|Q02H07|RL13_PSEAB,PERVIETA,97
309 | sp|Q9HVY2|RL13_PSEAE,PERVIETA,97
310 | sp|Q1I596|RL13_PSEE4,PERVIETA,97
311 | sp|Q4K6H2|RL13_PSEF5,PERVIETA,97
312 | sp|A4XQQ3|RL13_PSEMY,PERVIETA,97
313 | sp|A5W8S1|RL13_PSEP1,PERVIETA,97
314 | sp|Q3K723|RL13_PSEPF,PERVIETA,97
315 | sp|B0KFU8|RL13_PSEPG,PERVIETA,97
316 | sp|Q88N97|RL13_PSEPK,PERVIETA,97
317 | sp|B1J1W8|RL13_PSEPW,PERVIETA,97
318 | sp|Q87WW7|RL13_PSESM,PERVIETA,97
319 | sp|Q4ZNX2|RL13_PSEU2,PERVIETA,97
320 | sp|B4SLE1|RL13_STRM5,PERVIETA,97
321 | sp|A4VIF7|RL13_STUS1,PERVIETA,97
322 | sp|Q49ZE1|RL17_STAS1,SERVETTE,27
323 | sp|Q8TX51|RL1_METKA,NADIRALI,138
324 | sp|A6GZ91|RL29_FLAPJ,SVARIATE,49
325 | sp|P50345|RLA0_LUPLU,LAVAVATE,252
326 | sp|Q13Z67|RLMD_PARXL,REGALAVA,398
327 | sp|Q8PMV0|RNC_XANAC,REGALAVI,76
328 | sp|Q3BVV6|RNC_XANC5,REGALAVI,76
329 | sp|Q4USF7|RNC_XANC8,REGALAVI,76
330 | sp|Q8PB52|RNC_XANCP,REGALAVI,76
331 | sp|Q5H1R2|RNC_XANOR,REGALAVI,76
332 | sp|O14277|RS5A_SCHPO,LITIGARE,152
333 | sp|Q9P3T6|RS5B_SCHPO,LITIGARE,152
334 | sp|Q8RIM0|RS7_FUSNN,ANNEGATI,128
335 | sp|Q0ALX8|RUVA_MARMM,LATRIATE,119
336 | sp|B3GYP5|SELA_ACTP7,ALIENARE,38
337 | sp|Q8BUH8|SENP7_MOUSE,LESSASSE,386
338 | sp|O08815|SLK_RAT,ESTRATTE,627
339 | sp|P19382|SNAI1_XENLA,SPASSATE,108
340 | sp|Q81LW0|SODM1_BACAN,GELATAIE,100
341 | sp|Q818I1|SODM1_BACCR,GELATAIE,115
342 | sp|Q3V0Q6|SPAG8_MOUSE,METTESTE,1
343 | sp|P32916|SRPR_YEAST,SVENTARE,353
344 | sp|A8AUS0|SSPA_STRGC,INTANATA,191
345 | sp|A8AUS1|SSPB_STRGC,INTANATA,190
346 | sp|P16952|SSPB_STRGN,INTANATA,191
347 | sp|A1JI37|STHA_YERE8,SFIDANTI,118
348 | sp|A5CC52|SUCC_ORITB,MASSAGGI,124
349 | sp|A5GPF9|SYA_SYNPW,RIESLING,621
350 | sp|A1S6P2|SYD_SHEAM,METTERMI,247
351 | sp|Q8RB93|SYE1_CALS4,VARIERAI,465
352 | sp|B1MZ60|SYGB_LEUCK,ENERVATA,422
353 | sp|O32039|SYH_BACSU,REAGISSE,353
354 | sp|A7Z751|SYH_BACVZ,REAGISSE,353
355 | sp|A1S425|SYI_SHEAM,PESAVATE,880
356 | sp|A0JUT1|SYP_ARTS2,LETARGIE,435
357 | sp|A1SLL4|SYP_NOCSJ,LETARGIE,422
358 | sp|A1R508|SYP_PAEAT,LETARGIE,435
359 | sp|Q2JSB6|SYP_SYNJA,LETARGIE,436
360 | sp|Q2JMD8|SYP_SYNJB,LETARGIE,436
361 | sp|Q116D3|SYP_TRIEI,LETARGIE,431
362 | sp|Q4QL12|SYR_HAEI8,INSTALLA,476
363 | sp|A5UCH1|SYR_HAEIE,INSTALLA,476
364 | sp|A5UJ40|SYR_HAEIG,INSTALLA,476
365 | sp|P43832|SYR_HAEIN,INSTALLA,476
366 | sp|Q0AY05|SYT_SYNWW,FEDERARE,62
367 | sp|B6EHW3|SYY_ALISL,ASSERITA,319
368 | sp|O52512|T2S1_STRFI,SPARGEVA,210
369 | sp|P09758|TACD2_HUMAN,TAGLIAVI,274
370 | sp|Q04B89|THII_LACDB,SGRINFIE,234
371 | sp|Q8DUR1|THII_STRMU,VAGINITE,379
372 | sp|Q75GA5|TIP41_ORYSJ,ALLARGHI,91
373 | sp|Q5JTD0|TJAP1_HUMAN,PASSASSE,400
374 | sp|Q9DCD5|TJAP1_MOUSE,PASSASSE,395
375 | sp|P69744|TRPV5_MOUSE,RALLARGA,128
376 | sp|Q9XSM3|TRPV5_RABIT,RALLARGA,134
377 | sp|Q9JIP0|TRPV5_RAT,RALLARGA,128
378 | sp|Q91WD2|TRPV6_MOUSE,RALLARGA,174
379 | sp|Q9R186|TRPV6_RAT,RALLARGA,174
380 | sp|E7F211|TTC17_DANRE,SALIFICA,40
381 | sp|Q6E240|U496E_ARATH,PIPERITE,287
382 | sp|P10861|UCP1_BOVIN,ECLISSAI,50
383 | sp|P16801|UL95_HCMVA,MALVACEA,129
384 | sp|Q6SW48|UL95_HCMVM,MALVACEA,129
385 | sp|B9VXQ2|UL95_HCMVT,MALVACEA,129
386 | sp|A1RV13|UPP_PYRIL,REDIVIVA,125
387 | sp|P9WEV0|VALA_ASPTE,PIETRAIA,321
388 | sp|Q7YRP3|VN1R3_PANTR,NASALITA,267
389 | sp|P0DOJ3|VP2_POVK3,SLATTARE,243
390 | sp|P0DOJ2|VP2_POVK6,SLATTARE,243
391 | sp|Q6GPH4|XAF1_HUMAN,RISAPERE,143
392 | sp|Q8Y0D3|Y1111_RALSO,SPRETAVA,162
393 | sp|Q46XP0|Y2732_CUPPJ,PREVARRA,154
394 | sp|B9LS33|Y273_HALLT,ALIENITA,5
395 | sp|Q9RN18|Y6513_BACAN,VIGILAVI,18
396 | sp|Q02998|YH19_RHOCA,FLIPPATI,28
397 | sp|A0A023PZL2|YM119_YEAST,GALALITI,87
398 | sp|A5CRZ4|YQGF_CLAM3,RILAVATE,46
399 | sp|O75467|Z324A_HUMAN,SVAGASSE,522
400 | sp|Q9HCK1|ZDBF2_HUMAN,DISCINTE,1559
401 | 


--------------------------------------------------------------------------------
/uniprot_words/data/word_matches_nl.csv:
--------------------------------------------------------------------------------
  1 | .id,Keyword,Offset
  2 | sp|B3DT30|EFG_BIFLD,TREKKERIG,356
  3 | sp|Q8G5B6|EFG_BIFLO,TREKKERIG,356
  4 | sp|Q89J81|EFG_BRADU,TREKKERIG,344
  5 | sp|A5ELN0|EFG_BRASB,TREKKERIG,344
  6 | sp|Q1QN33|EFG_NITHX,TREKKERIG,344
  7 | sp|Q3SSW9|EFG_NITWN,TREKKERIG,344
  8 | sp|Q2IXR3|EFG_RHOP2,TREKKERIG,344
  9 | sp|Q07KL5|EFG_RHOP5,TREKKERIG,344
 10 | sp|Q6N4T4|EFG_RHOPA,TREKKERIG,344
 11 | sp|Q134S6|EFG_RHOPS,TREKKERIG,344
 12 | sp|B3QBY3|EFG_RHOPT,TREKKERIG,344
 13 | sp|Q7M7P6|ASSY_WOLSU,APERITIEF,220
 14 | sp|Q49135|FCHA_METEA,MAGNETIET,1
 15 | sp|Q6P7I6|TP8L2_XENLA,SPEELVLAK,80
 16 | sp|Q5ISE2|Z36L3_MOUSE,AMALGAAM,469
 17 | sp|Q54ET6|ABPF_DICDI,KNELLING,911
 18 | sp|B1J539|ACCD_PSEPW,AANHALEN,149
 19 | sp|Q3AA34|ADDB_CARHZ,KASSEIEN,140
 20 | sp|A2RUV9|AEBP1_RAT,PLETPERS,172
 21 | sp|C9K7C1|AMT10_ALTAL,GASSLANG,6
 22 | sp|A8DZJ1|BAZ1B_XENLA,AGENESIE,677
 23 | sp|P13517|CAPZB_YEAST,KLEILAND,101
 24 | sp|K9Y6N7|CCMK4_HALP7,VERFREST,98
 25 | sp|Q6YW51|CKX6_ORYSJ,VERLEREN,451
 26 | sp|P37974|CNRC_CUPMC,AANVRAAG,53
 27 | sp|P63840|COBH_MYCBO,RATELAAR,102
 28 | sp|P9WP86|COBH_MYCTO,RATELAAR,102
 29 | sp|P9WP87|COBH_MYCTU,RATELAAR,102
 30 | sp|Q99031|CR9AA_BACTG,TINSTEEN,738
 31 | sp|Q45733|CR9CA_BACTO,TINSTEEN,740
 32 | sp|O06014|CR9DA_BACTP,TINSTEEN,751
 33 | sp|Q67P64|DAPB_SYMTH,LEEFLAAG,62
 34 | sp|P10047|DCTB_RHILE,KLEILAAG,109
 35 | sp|Q6DW73|DGDG2_LOTJA,KAAKKLEM,276
 36 | sp|O62215|DHSD_CAEEL,SAPRISTI,20
 37 | sp|E1V7W1|DOEA_HALED,AMALGAAM,308
 38 | sp|B3DT30|EFG_BIFLD,REKKERIG,357
 39 | sp|Q8G5B6|EFG_BIFLO,REKKERIG,357
 40 | sp|Q89J81|EFG_BRADU,REKKERIG,345
 41 | sp|A5ELN0|EFG_BRASB,REKKERIG,345
 42 | sp|Q1QN33|EFG_NITHX,REKKERIG,345
 43 | sp|Q3SSW9|EFG_NITWN,REKKERIG,345
 44 | sp|Q2IXR3|EFG_RHOP2,REKKERIG,345
 45 | sp|Q07KL5|EFG_RHOP5,REKKERIG,345
 46 | sp|Q6N4T4|EFG_RHOPA,REKKERIG,345
 47 | sp|Q134S6|EFG_RHOPS,REKKERIG,345
 48 | sp|B3QBY3|EFG_RHOPT,REKKERIG,345
 49 | sp|Q758X9|EIF3B_ASHGO,AVERSIEF,28
 50 | sp|O49160|EIF3C_ARATH,AFTAPPEN,656
 51 | sp|P32476|ERG1_YEAST,MILITAIR,477
 52 | sp|O59945|FIMB_SCHPO,WANSMAAK,517
 53 | sp|Q8ZZK1|FOLD_PYRAE,GEELHART,7
 54 | sp|Q17QD8|G37L1_BOVIN,LAVALAAG,11
 55 | sp|A3MKU6|GLND_BURM7,SELDERIE,94
 56 | sp|A2SB69|GLND_BURM9,SELDERIE,94
 57 | sp|Q62JC2|GLND_BURMA,SELDERIE,94
 58 | sp|A1V572|GLND_BURMS,SELDERIE,94
 59 | sp|A3NWN4|GLND_BURP0,SELDERIE,94
 60 | sp|Q3JR26|GLND_BURP1,SELDERIE,94
 61 | sp|A3NAV0|GLND_BURP6,SELDERIE,94
 62 | sp|Q63T10|GLND_BURPS,SELDERIE,94
 63 | sp|Q0P5E7|GTPB8_BOVIN,KALFSLAP,128
 64 | sp|Q8N3Z3|GTPB8_HUMAN,KALFSLAP,128
 65 | sp|Q9CY28|GTPB8_MOUSE,KALFSLAP,129
 66 | sp|Q5SMM6|HCT4_ORYSJ,APPARAAT,217
 67 | sp|Q2IZP7|HLDE_RHOP2,LAVALAAG,281
 68 | sp|A9H863|HUTH_GLUDA,SHREDDER,271
 69 | sp|Q3SWP9|IF2_NITWN,APPARAAT,78
 70 | sp|P09407|ITI3_MOMCH,KALVEREN,35
 71 | sp|A9A698|KCY_METM6,IRISEREN,109
 72 | sp|A6VJT1|KCY_METM7,IRISEREN,109
 73 | sp|Q6LZK1|KCY_METMP,IRISEREN,109
 74 | sp|C5FZJ2|LIPA_ARTOC,KRAKERIG,389
 75 | sp|Q3TYD6|LMTK2_MOUSE,PEDAALAS,928
 76 | sp|F1QWK4|MCA3B_DANRE,GEELHART,978
 77 | sp|Q2H9Y1|MDM34_CHAGB,LEESDEEL,406
 78 | sp|Q6NIZ3|METXA_CORDI,GELEIDER,214
 79 | sp|Q8FRT0|METXA_COREF,GELEIDER,222
 80 | sp|O68640|METXA_CORGL,GELEIDER,222
 81 | sp|O66962|MNMG_AQUAE,VAGINAAL,393
 82 | sp|A4YJT4|MNMG_BRASO,VAGINAAL,385
 83 | sp|Q3AG55|MNMG_CARHZ,VAGINAAL,386
 84 | sp|Q0TLZ5|MNMG_CLOP1,VAGINAAL,386
 85 | sp|Q8XH31|MNMG_CLOPE,VAGINAAL,386
 86 | sp|Q0SPQ4|MNMG_CLOPS,VAGINAAL,386
 87 | sp|Q9CEJ4|MNMG_LACLA,VAGINAAL,389
 88 | sp|O32806|MNMG_LACLM,VAGINAAL,389
 89 | sp|Q02X03|MNMG_LACLS,VAGINAAL,389
 90 | sp|A1AV42|MNMG_PELPD,VAGINAAL,386
 91 | sp|C0QPI1|MNMG_PERMH,VAGINAAL,395
 92 | sp|A8GUR1|MNMG_RICB8,VAGINAAL,382
 93 | sp|Q1RGT1|MNMG_RICBR,VAGINAAL,382
 94 | sp|A8EXC3|MNMG_RICCK,VAGINAAL,382
 95 | sp|Q3JYG3|MNMG_STRA1,VAGINAAL,389
 96 | sp|P0A3F0|MNMG_STRA3,VAGINAAL,389
 97 | sp|P0A3F1|MNMG_STRA5,VAGINAAL,389
 98 | sp|Q8DRS6|MNMG_STRMU,VAGINAAL,389
 99 | sp|A4W4N0|MNMG_STRS2,VAGINAAL,389
100 | sp|A4VYE0|MNMG_STRSY,VAGINAAL,389
101 | sp|Q5LXK0|MNMG_STRT1,VAGINAAL,389
102 | sp|Q5M250|MNMG_STRT2,VAGINAAL,389
103 | sp|Q03I89|MNMG_STRTD,VAGINAAL,389
104 | sp|B2V6C3|MNMG_SULSY,VAGINAAL,393
105 | sp|B9L851|MOAA_NAUPA,STRAFWET,309
106 | sp|P48563|MON2_YEAST,SPLITTEN,398
107 | sp|B8FT65|MRAZ_DESHD,GRILLPAN,85
108 | sp|Q24TD7|MRAZ_DESHY,GRILLPAN,85
109 | sp|Q67Q58|MRAZ_SYMTH,GRILLPAN,83
110 | sp|Q2S527|MURC_SALRD,AFVELLEN,459
111 | sp|Q0BV25|MURG_GRABC,DAARNAAR,343
112 | sp|B8FJL5|MUTS_DESAL,DRINGEND,539
113 | sp|Q44584|NCCC_ALCXX,AANVRAAG,72
114 | sp|C4Y3N8|NOP9_CLAL4,AFKERVEN,12
115 | sp|Q6BUT3|NST1_DEBHA,FIEDELEN,204
116 | sp|B0R8D2|NUSA_HALS3,TAKELAAR,122
117 | sp|P0CW99|NUSA_HALSA,TAKELAAR,122
118 | sp|Q5DTZ0|NYNRI_MOUSE,LEGPRENT,55
119 | sp|Q96XT4|OFOB2_SULTO,LAVALAAG,153
120 | sp|Q54ID7|OSB11_DICDI,EENKLANK,391
121 | sp|Q07744|PEPO_LACLA,TAALTAAK,544
122 | sp|P0C2B4|PEPO_LACLC,TAALTAAK,544
123 | sp|Q02VB0|PEPO_LACLS,TAALTAAK,544
124 | sp|P15004|PER2_SOLLC,AARDSLAK,173
125 | sp|C1CV29|PGK_DEIDV,GEVALLEN,108
126 | sp|Q1IZA3|PGK_DEIGD,GEVALLEN,108
127 | sp|Q9RUP2|PGK_DEIRA,GEVALLEN,130
128 | sp|Q0RH06|PGK_FRAAA,GEVALLEN,115
129 | sp|Q2JCH8|PGK_FRACC,GEVALLEN,115
130 | sp|C6BUI7|PGK_MARSD,AFKALKEN,388
131 | sp|A0QGK3|PHK_MYCA1,GLASRAAM,756
132 | sp|Q73ZM8|PHK_MYCPA,GLASRAAM,756
133 | sp|Q9JI55|PLEC_CRIGR,KETELPAK,254
134 | sp|Q9QXS1|PLEC_MOUSE,KETELPAK,473
135 | sp|P30427|PLEC_RAT,KETELPAK,468
136 | sp|B0LL23|PLR_SINHE,PARMAHAM,121
137 | sp|Q9CNJ7|PSTB_PASMU,GELIEFDE,230
138 | sp|B0S6S9|RBM44_DANRE,AANSTAAN,373
139 | sp|A1SJ39|RIMO_NOCSJ,TAALKLAS,184
140 | sp|A2BT57|RLMN_PROMS,KERNLAND,70
141 | sp|Q2SBR2|RNH2_HAHCH,VERDEELD,148
142 | sp|Q68S14|RPOB_PANGI,AIDSGALA,573
143 | sp|B1VDC0|RS15_CORU7,LASTDIER,70
144 | sp|A0R024|RSMH_MYCS2,RAAIPAAL,280
145 | sp|A3Q1M6|RSMH_MYCSJ,RAAIPAAL,270
146 | sp|Q7T2D0|SGSM3_DANRE,AASKEVER,599
147 | sp|Q6P7W2|SHKB1_MOUSE,PIERLALA,237
148 | sp|Q1DZ34|SIP5_COCIM,REKENAAR,82
149 | sp|P41508|SMC_MESHY,KNALSEIN,295
150 | sp|Q6IUP1|SOLH1_MOUSE,RESELLER,178
151 | sp|Q30YS5|SYA_OLEA2,AFPELLEN,341
152 | sp|A5GPF9|SYA_SYNPW,RIESLING,621
153 | sp|A7IAG1|SYE_METB6,AALSPEER,60
154 | sp|B8J4S2|SYS_DESDA,GEVLEESD,241
155 | sp|Q485S0|T3HPD_COLP3,INSLECHT,27
156 | sp|A2XSX6|TIF9_ORYSI,TRAPVELD,3
157 | sp|Q7XV97|TIF9_ORYSJ,TRAPVELD,3
158 | sp|Q6MDC8|UVRC_PARUW,KRAKEEND,449
159 | sp|A0A7H0DN27|VPK2_MONPV,DIEFSTAL,388
160 | sp|O57177|VPK2_VACCA,DIEFSTAL,388
161 | sp|P21095|VPK2_VACCC,DIEFSTAL,388
162 | sp|P29884|VPK2_VACCP,DIEFSTAL,354
163 | sp|Q9JFE5|VPK2_VACCT,DIEFSTAL,388
164 | sp|Q89121|VPK2_VACCW,DIEFSTAL,388
165 | sp|P33801|VPK2_VAR67,DIEFSTAL,388
166 | sp|Q3MUH7|XG74_PAESP,LAVALAAG,14
167 | sp|A4ZUC9|Y112_ABVP,PILIPILI,87
168 | sp|Q5UPM1|YL149_MIMIV,INSTINCT,140
169 | sp|Q5ISE2|Z36L3_MOUSE,AMALGAAM,397
170 | 


--------------------------------------------------------------------------------
/uniprot_words/data/word_matches_no.csv:
--------------------------------------------------------------------------------
  1 | .id,Keyword,Offset
  2 | sp|Q89I89|SYA_BRADU,DATASALGS,255
  3 | sp|C3PNE7|SYL_RICAE,SELEKTERE,287
  4 | sp|A1JIX1|TRUB_YERE8,EGALISERE,92
  5 | sp|O34273|TRUB_YEREN,EGALISERE,92
  6 | sp|A7FMS0|TRUB_YERP3,EGALISERE,92
  7 | sp|Q1CC09|TRUB_YERPA,EGALISERE,92
  8 | sp|Q8ZBC4|TRUB_YERPE,EGALISERE,92
  9 | sp|Q1CEL5|TRUB_YERPN,EGALISERE,92
 10 | sp|A4TRI1|TRUB_YERPP,EGALISERE,92
 11 | sp|Q66F58|TRUB_YERPS,EGALISERE,92
 12 | sp|Q93RD9|DTD_LISMO,SMEDEVISE,14
 13 | sp|B8GPA2|MOAC_THISH,VEVELSTAD,84
 14 | sp|Q5HLX6|MOEA_STAEQ,ELLEVEDEL,193
 15 | sp|Q8CNE1|MOEA_STAES,ELLEVEDEL,193
 16 | sp|Q06583|PYS1_PSEAI,AVREGNING,162
 17 | sp|P06105|SC160_YEAST,ANSVARLIG,643
 18 | sp|Q91VX2|UBAP2_MOUSE,GRESSENKE,110
 19 | sp|Q96WW0|YNH9_SCHPO,KIRKESKIP,382
 20 | sp|Q19753|YU0O_CAEEL,ALMEGREIN,668
 21 | sp|B3DT30|EFG_BIFLD,TREKKERI,356
 22 | sp|Q8G5B6|EFG_BIFLO,TREKKERI,356
 23 | sp|Q89J81|EFG_BRADU,TREKKERI,344
 24 | sp|A5ELN0|EFG_BRASB,TREKKERI,344
 25 | sp|Q1QN33|EFG_NITHX,TREKKERI,344
 26 | sp|Q3SSW9|EFG_NITWN,TREKKERI,344
 27 | sp|Q2IXR3|EFG_RHOP2,TREKKERI,344
 28 | sp|Q07KL5|EFG_RHOP5,TREKKERI,344
 29 | sp|Q6N4T4|EFG_RHOPA,TREKKERI,344
 30 | sp|Q134S6|EFG_RHOPS,TREKKERI,344
 31 | sp|B3QBY3|EFG_RHOPT,TREKKERI,344
 32 | sp|Q9LD43|ACCA_ARATH,VASSGASS,586
 33 | sp|P74582|ACNB_SYNY3,FELTDATA,583
 34 | sp|A1WQY3|ALR_VEREI,GAVLSIDE,336
 35 | sp|P39265|ALSB_ECOLI,KALDVATN,202
 36 | sp|Q6A332|ALY3_ARATH,HEKKSAKS,524
 37 | sp|O31788|APRX_BACSU,VASSGASS,195
 38 | sp|Q47VK9|ARGR_COLP3,KINETIKK,140
 39 | sp|Q058D5|ARLY_BUCCC,LETTVINT,88
 40 | sp|Q3AQX0|AROE_CHLCH,FALLGRAV,8
 41 | sp|Q6LLZ0|AROQ_PHOPR,KRILLING,5
 42 | sp|A2XNK3|ASA1_ORYSI,VASSPEIL,351
 43 | sp|Q94GF1|ASA1_ORYSJ,VASSPEIL,351
 44 | sp|Q9XJ29|ASA2_ORYSJ,VASSPEIL,379
 45 | sp|O74431|ATC9_SCHPO,TREPLATA,199
 46 | sp|Q6BTX0|ATG2_DEBHA,KANTNING,583
 47 | sp|S0EFU6|BEA4_GIBF5,KANSLERE,435
 48 | sp|Q55BU9|C5133_DICDI,SYKELEIE,229
 49 | sp|Q9H251|CAD23_HUMAN,FRALANDS,2891
 50 | sp|Q99PF4|CAD23_MOUSE,FRALANDS,2891
 51 | sp|P58365|CAD23_RAT,FRALANDS,2889
 52 | sp|O34659|CDAR_BACSU,IALLFALL,14
 53 | sp|B5X564|CDC2C_ARATH,FRAREVET,120
 54 | sp|O97383|CHH1_PENMO,VASSTAPA,20
 55 | sp|O53080|CITXG_LEUMC,AGNATISK,402
 56 | sp|P28020|CSK21_XENLA,SALGSLAG,363
 57 | sp|A9NGC2|DAPA_ACHLI,SEKKEVIS,55
 58 | sp|Q8VCR2|DHB13_MOUSE,VARSLING,242
 59 | sp|Q486F9|DNAK1_COLP3,ANNETLAG,43
 60 | sp|Q5NPS6|DNAK_ZYMMO,ISRAELER,295
 61 | sp|C3PLJ2|DNLJ_RICAE,ISKLASSE,261
 62 | sp|Q92GM7|DNLJ_RICCN,ISKLASSE,261
 63 | sp|A8F2K5|DNLJ_RICM5,ISKLASSE,261
 64 | sp|Q9ZCK9|DNLJ_RICPR,ISKLASSE,261
 65 | sp|C4K0T1|DNLJ_RICPU,ISKLASSE,261
 66 | sp|B0BUZ1|DNLJ_RICRO,ISKLASSE,261
 67 | sp|A8GTF2|DNLJ_RICRS,ISKLASSE,261
 68 | sp|Q68W27|DNLJ_RICTY,ISKLASSE,261
 69 | sp|B3DT30|EFG_BIFLD,STREKKER,355
 70 | sp|Q8G5B6|EFG_BIFLO,STREKKER,355
 71 | sp|Q89J81|EFG_BRADU,STREKKER,343
 72 | sp|A5ELN0|EFG_BRASB,STREKKER,343
 73 | sp|Q1QN33|EFG_NITHX,STREKKER,343
 74 | sp|Q3SSW9|EFG_NITWN,STREKKER,343
 75 | sp|Q2IXR3|EFG_RHOP2,STREKKER,343
 76 | sp|Q07KL5|EFG_RHOP5,STREKKER,343
 77 | sp|Q6N4T4|EFG_RHOPA,STREKKER,343
 78 | sp|Q134S6|EFG_RHOPS,STREKKER,343
 79 | sp|B3QBY3|EFG_RHOPT,STREKKER,343
 80 | sp|B9KIA2|EFP_ANAMF,AVLENGER,153
 81 | sp|Q5PB21|EFP_ANAMM,AVLENGER,153
 82 | sp|B3PBP7|EFTS_CELJU,MAGELEIA,114
 83 | sp|Q2YBA6|EFTS_NITMU,SPELLERE,200
 84 | sp|Q4P0P0|EIF3C_USTMA,KVIKENDE,207
 85 | sp|Q9ZWB9|FAO1_ARATH,SVEIGENE,191
 86 | sp|Q989A7|FOLD1_RHILO,ALTAELVA,20
 87 | sp|B2V9R3|FOLD_SULSY,TELLELIG,75
 88 | sp|I1S163|FSL4_GIBZE,AKKVIRER,295
 89 | sp|A8N8S3|GATA_COPC7,SALPETRE,228
 90 | sp|Q985F6|GLNE_RHILO,PARAGRAF,664
 91 | sp|Q31DG8|GRPE_PROM9,TIENDELS,33
 92 | sp|Q974T4|GUAAA_SULTO,FRILANSE,139
 93 | sp|Q53T59|H1BP3_HUMAN,LAGSPELL,125
 94 | sp|Q3TC93|H1BP3_MOUSE,LAGSPELL,125
 95 | sp|A2WKS3|H2B10_ORYSI,KLAGESAK,101
 96 | sp|Q9LGI2|H2B10_ORYSJ,KLAGESAK,101
 97 | sp|P05621|H2B2_WHEAT,KLAGESAK,98
 98 | sp|A2WKP3|H2B3_ORYSI,KLAGESAK,101
 99 | sp|Q94JJ7|H2B3_ORYSJ,KLAGESAK,101
100 | sp|Q43217|H2B3_WHEAT,KLAGESAK,86
101 | sp|A2WKP5|H2B4_ORYSI,KLAGESAK,101
102 | sp|Q94JJ4|H2B4_ORYSJ,KLAGESAK,101
103 | sp|Q43215|H2B4_WHEAT,KLAGESAK,83
104 | sp|Q43216|H2B5_WHEAT,KLAGESAK,84
105 | sp|A2WKT1|H2B6_ORYSI,KLAGESAK,101
106 | sp|Q41575|H2B6_WHEAT,KLAGESAK,69
107 | sp|A2WKS8|H2B7_ORYSI,KLAGESAK,101
108 | sp|Q7GBK0|H2B7_ORYSJ,KLAGESAK,101
109 | sp|A2WKS5|H2B8_ORYSI,KLAGESAK,101
110 | sp|Q9LGH8|H2B8_ORYSJ,KLAGESAK,101
111 | sp|B3RHD9|HAP1_YEAS1,SALTSMAK,595
112 | sp|C7GQY3|HAP1_YEAS2,SALTSMAK,595
113 | sp|A7A1D7|HAP1_YEAS7,SALTSMAK,595
114 | sp|C8ZDL9|HAP1_YEAS8,SALTSMAK,595
115 | sp|G2WJ80|HAP1_YEASK,SALTSMAK,592
116 | sp|P0CE41|HAP1_YEAST,SALTSMAK,595
117 | sp|P0CS82|HAP1_YEASX,SALTSMAK,595
118 | sp|P14750|HCYA_APHSP,FIREDELE,411
119 | sp|O53637|HDDA_MYCTU,VARSLERE,316
120 | sp|Q39YP7|HISX_GEOMG,SEILVIND,237
121 | sp|P60859|HISX_GEOSL,SEILVIND,237
122 | sp|Q3A133|HISX_SYNC1,SEILVIND,237
123 | sp|Q7NIA2|HSLO_GLOVI,GRAVGANG,109
124 | sp|P31269|HXA9_HUMAN,SENNAENE,176
125 | sp|P09631|HXA9_MOUSE,SENNAENE,175
126 | sp|Q58CP0|IDH3G_BOVIN,LIVRENTE,163
127 | sp|P51553|IDH3G_HUMAN,LIVRENTE,164
128 | sp|P41564|IDH3G_MACFA,LIVRENTE,126
129 | sp|P70404|IDHG1_MOUSE,LIVRENTE,164
130 | sp|P41565|IDHG1_RAT,LIVRENTE,164
131 | sp|Q3SKX1|IF2_THIDA,KANTNING,569
132 | sp|Q5AAL9|IFF4_CANAL,TETPLASS,1070
133 | sp|O28294|ILVC_ARCFU,KALEVALA,161
134 | sp|A2C096|ISPH_PROM1,PERIGEET,340
135 | sp|Q46HB0|ISPH_PROMT,PERIGEET,340
136 | sp|A1JQY4|KDPA_YERE8,IALLFALL,74
137 | sp|Q8D2E8|KGUA_WIGBR,KINETIKK,196
138 | sp|Q8S2E5|KPRS3_ORYSJ,FEILSLAG,394
139 | sp|P50455|LEU3_SULTO,LIVRENTE,114
140 | sp|Q3A334|LON2_SYNC1,SLIPEREN,115
141 | sp|Q21MS7|LPTD_SACD2,GAMLEVEG,321
142 | sp|Q9H9A6|LRC40_HUMAN,TIPPELAG,234
143 | sp|Q4R3P6|LRC40_MACFA,TIPPELAG,234
144 | sp|Q5RFE9|LRC40_PONAB,TIPPELAG,234
145 | sp|Q0P5X1|LRIQ1_MOUSE,ENDEVEND,28
146 | sp|Q98919|LSAMP_CHICK,SANGLEIK,260
147 | sp|Q13449|LSAMP_HUMAN,SANGLEIK,260
148 | sp|Q8BLK3|LSAMP_MOUSE,SANGLEIK,260
149 | sp|Q62813|LSAMP_RAT,SANGLEIK,260
150 | sp|Q54U63|LVSC_DICDI,VILLMANN,52
151 | sp|A2SZS3|L_RVFV,SEKSTANT,1175
152 | sp|P27316|L_RVFVZ,SEKSTANT,1175
153 | sp|Q6NS57|MABP1_MOUSE,ELEVLAGA,1456
154 | sp|Q767L8|MDC1_PIG,ALTERERE,659
155 | sp|Q13LD8|METN2_PARXL,SALSVEGG,321
156 | sp|Q3ACA8|MIAA_CARHZ,KVAKKING,22
157 | sp|Q076A4|MYH8_CANLF,GAKKGAKK,637
158 | sp|P13542|MYH8_MOUSE,GAKKGAKK,635
159 | sp|B4L7U0|NAAT1_DROMO,FALLGRAV,277
160 | sp|B4MEG2|NAAT1_DROVI,FALLGRAV,267
161 | sp|Q640K1|NCDN_XENLA,SPILLETS,232
162 | sp|Q5SYE7|NHSL1_HUMAN,DAGSLYSE,599
163 | sp|Q8L746|NPR3_ARATH,GRESSKAR,381
164 | sp|Q80XB4|NRAP_MOUSE,KASSEVIS,1051
165 | sp|Q5XGN1|NUP42_XENLA,VASSTAPA,365
166 | sp|A2VDP6|NXPE3_BOVIN,GRISETTE,217
167 | sp|Q969Y0|NXPE3_HUMAN,GRISETTE,217
168 | sp|Q5RCA5|NXPE3_PONAB,GRISETTE,217
169 | sp|Q8NGE2|O2AP1_HUMAN,GATEFYLL,106
170 | sp|C1DEA4|OBG_AZOVD,SPALTERE,270
171 | sp|A6VBV3|OBG_PSEA7,SPALTERE,270
172 | sp|B7V0A9|OBG_PSEA8,SPALTERE,270
173 | sp|Q02GB1|OBG_PSEAB,SPALTERE,270
174 | sp|Q9HVL8|OBG_PSEAE,SPALTERE,270
175 | sp|C4R492|OCA5_KOMPG,ALLELEST,544
176 | sp|Q07017|OL56_STRAT,AVLSGRIS,1836
177 | sp|A0A0C1E5J8|OPAA_ASPUT,VALLAVIK,2904
178 | sp|Q8NGE1|OR6C4_HUMAN,GATEFYLL,106
179 | sp|P13909|PAI1_BOVIN,SALTSILD,271
180 | sp|P79335|PAI1_PIG,SALTSILD,271
181 | sp|P87295|PEP5L_SCHPO,TYSSEDAL,674
182 | sp|P46988|PFD1_YEAST,LETTVEKT,92
183 | sp|Q5GRV2|PGK_WOLTR,VASSKALL,306
184 | sp|Q13YI7|PHNW1_PARXL,HALVLANG,85
185 | sp|P41676|PK2_NPVAC,TALERETT,147
186 | sp|Q42556|PMA9_ARATH,STYRELSE,903
187 | sp|Q04350|POLB_CHPVE,SPEDKALV,1742
188 | sp|Q9YTU2|POLB_CHPVU,SPEDKALV,1741
189 | sp|Q54SJ8|POND_DICDI,IALLFALL,141
190 | sp|Q8K2H1|PPHLN_MOUSE,VASSKALD,226
191 | sp|Q5X5Y6|PYRG_LEGPA,VEKEAVIS,229
192 | sp|A5IB79|PYRG_LEGPC,VEKEAVIS,229
193 | sp|Q5ZWA4|PYRG_LEGPH,VEKEAVIS,229
194 | sp|Q5WXA8|PYRG_LEGPL,VEKEAVIS,229
195 | sp|A9BDD9|PYRG_PROM4,HACIENDA,315
196 | sp|Q9FG68|RAX1_ARATH,SLISSING,169
197 | sp|Q0BTG0|RECO_GRABC,ALTETERE,241
198 | sp|Q6RG78|RGS1_HORSE,TRESTAKK,142
199 | sp|Q08116|RGS1_HUMAN,TRESTAKK,155
200 | sp|Q9JL25|RGS1_MOUSE,TRESTAKK,155
201 | sp|P97844|RGS1_RAT,TRESTAKK,155
202 | sp|Q03314|RHIB_RHILV,PARTALET,198
203 | sp|Q17XC9|RIBA_HELAH,MALERISK,74
204 | sp|B6JM32|RIBA_HELP2,MALERISK,74
205 | sp|Q1CT68|RIBA_HELPH,MALERISK,74
206 | sp|Q9ZL42|RIBA_HELPJ,MALERISK,74
207 | sp|O08315|RIBA_HELPY,MALERISK,74
208 | sp|A0QPD3|RIR2H_MYCS2,ALTERERE,48
209 | sp|Q72GV5|RL9_THET2,VILLEPLE,3
210 | sp|Q5SLQ1|RL9_THET8,VILLEPLE,3
211 | sp|P27151|RL9_THETH,VILLEPLE,3
212 | sp|Q7M438|RNDI_DICDI,TAALESEN,146
213 | sp|P56185|RNJ_HELPY,SENSKADE,15
214 | sp|P67284|RNY_STRP1,LIVSALIG,7
215 | sp|P0DF20|RNY_STRP3,LIVSALIG,7
216 | sp|Q5XAP0|RNY_STRP6,LIVSALIG,7
217 | sp|Q8P000|RNY_STRP8,LIVSALIG,7
218 | sp|Q1JAJ3|RNY_STRPB,LIVSALIG,7
219 | sp|Q1JKP5|RNY_STRPC,LIVSALIG,7
220 | sp|Q1JFN6|RNY_STRPD,LIVSALIG,7
221 | sp|Q1J5I5|RNY_STRPF,LIVSALIG,7
222 | sp|A2RD66|RNY_STRPG,LIVSALIG,7
223 | sp|Q48S17|RNY_STRPM,LIVSALIG,7
224 | sp|P0DF21|RNY_STRPQ,LIVSALIG,7
225 | sp|B0TX10|RPOB_FRAP2,NEVRALGI,1340
226 | sp|Q14JT5|RPOB_FRAT1,NEVRALGI,1340
227 | sp|A7NEC0|RPOB_FRATF,NEVRALGI,1340
228 | sp|Q2A1M7|RPOB_FRATH,NEVRALGI,1340
229 | sp|B2SFD6|RPOB_FRATM,NEVRALGI,1340
230 | sp|A0Q867|RPOB_FRATN,NEVRALGI,1340
231 | sp|Q0BKC5|RPOB_FRATO,NEVRALGI,1340
232 | sp|Q5NID2|RPOB_FRATT,NEVRALGI,1340
233 | sp|A4IW99|RPOB_FRATW,NEVRALGI,1340
234 | sp|P56764|RPOC2_ARATH,SVEIPING,598
235 | sp|A4QKI2|RPOC2_CAPBU,SVEIPING,598
236 | sp|Q9THV5|RPOC2_SINAL,SVEIPING,603
237 | sp|Q21M92|RPOC_SACD2,RIDETIME,551
238 | sp|Q4FLJ0|RS4_PELUB,ALASKERE,156
239 | sp|Q12136|SAS10_YEAST,DESIDERE,70
240 | sp|Q9UPW6|SATB2_HUMAN,VERVEREN,234
241 | sp|Q8VI24|SATB2_MOUSE,VERVEREN,234
242 | sp|P0C883|SCL33_ARATH,VERPESYK,623
243 | sp|B3ECJ8|SECA_CHLL2,GRISEMAT,153
244 | sp|Q30RR0|SECA_SULDN,AVIATIKK,421
245 | sp|P0AG91|SECD_ECO57,TAKSVALE,63
246 | sp|P0AG90|SECD_ECOLI,TAKSVALE,63
247 | sp|P0AG92|SECD_SHIFL,TAKSVALE,63
248 | sp|A7TI28|SHO1_VANPO,VILLSVIN,127
249 | sp|Q1E1R7|SPB4_COCIM,KRAKKERE,586
250 | sp|P16546|SPTN1_MOUSE,SEERTALL,446
251 | sp|A9NF97|SSRP_ACHLI,KRETIKER,127
252 | sp|Q7X2N6|SSRP_SPHEL,KRETIKER,139
253 | sp|A7I6S6|SURE_METB6,ANRIKING,55
254 | sp|Q89I89|SYA_BRADU,DATASALG,255
255 | sp|A5GPF9|SYA_SYNPW,RIESLING,621
256 | sp|C3PAE9|SYFA_BACAA,ALVNEVRE,59
257 | sp|C3L8T8|SYFA_BACAC,ALVNEVRE,59
258 | sp|Q81L30|SYFA_BACAN,ALVNEVRE,59
259 | sp|B7JR67|SYFA_BACC0,ALVNEVRE,59
260 | sp|Q72ZI1|SYFA_BACC1,ALVNEVRE,59
261 | sp|B7IJW0|SYFA_BACC2,ALVNEVRE,59
262 | sp|C1EU00|SYFA_BACC3,ALVNEVRE,59
263 | sp|B7HF77|SYFA_BACC4,ALVNEVRE,59
264 | sp|B7HRK2|SYFA_BACC7,ALVNEVRE,59
265 | sp|A7GTL0|SYFA_BACCN,ALVNEVRE,59
266 | sp|B9J063|SYFA_BACCQ,ALVNEVRE,59
267 | sp|Q817I6|SYFA_BACCR,ALVNEVRE,59
268 | sp|Q633N4|SYFA_BACCZ,ALVNEVRE,59
269 | sp|Q6HCW7|SYFA_BACHK,ALVNEVRE,59
270 | sp|A9VJM4|SYFA_BACMK,ALVNEVRE,59
271 | sp|C3PNE7|SYL_RICAE,ELEKTERE,288
272 | sp|Q9V011|SYM_PYRAB,ELDELDRE,424
273 | sp|C5B832|SYR_EDWI9,TILSVARE,506
274 | sp|Q92CV8|TAGH_LISIN,GLISETST,311
275 | sp|P40412|TCPE1_AVESA,LAVADLER,190
276 | sp|P54411|TCPE2_AVESA,LAVADLER,190
277 | sp|O04450|TCPE_ARATH,LAVADLER,190
278 | sp|Q7YJS6|TI214_CALFG,SENKNING,1500
279 | sp|A6MMG9|TI214_CHLSC,SENKNING,1553
280 | sp|Q5FUR2|TIG_GLUOX,DELAKTIG,253
281 | sp|O76997|TRK1_LYMST,PRISSATT,398
282 | sp|Q6Z4N3|TRL11_ORYSJ,ALTERKAR,56
283 | sp|A0AJP7|TRMD_LISW6,NERVEVEV,28
284 | sp|Q8TYA2|TRPA_METKA,ELVEGARD,259
285 | sp|P32068|TRPE_ARATH,VASSPEIL,369
286 | sp|P32069|TRPX_ARATH,VASSPEIL,382
287 | sp|A8WTE8|TRR1_CAEBR,ISRANDEN,1515
288 | sp|Q8TZ08|TRUB_METKA,VALGVAKA,294
289 | sp|A1JIX1|TRUB_YERE8,EGALISER,92
290 | sp|O34273|TRUB_YEREN,EGALISER,92
291 | sp|A7FMS0|TRUB_YERP3,EGALISER,92
292 | sp|Q1CC09|TRUB_YERPA,EGALISER,92
293 | sp|Q8ZBC4|TRUB_YERPE,EGALISER,92
294 | sp|Q1CEL5|TRUB_YERPN,EGALISER,92
295 | sp|A4TRI1|TRUB_YERPP,EGALISER,92
296 | sp|Q66F58|TRUB_YERPS,EGALISER,92
297 | sp|G4SLH0|TTN1_CAEEL,PAKKSEKK,12795
298 | sp|O59941|VATD_NEUCR,ELDELDRE,193
299 | sp|Q97CP8|VATD_THEVO,VALERIAN,73
300 | sp|P32610|VATD_YEAST,ELDELDRE,191
301 | sp|Q3ZK57|VP3_ROT41,SAKEFALL,661
302 | sp|B3F2X7|VP3_ROTTU,SAKEFALL,661
303 | sp|Q9ENL0|VP6_CTFVL,STEINRIK,453
304 | sp|Q9Y2B5|VP9D1_HUMAN,SAMKLANG,15
305 | sp|Q45212|VSP2_BORHE,DELAKTIG,69
306 | sp|Q5PP32|WTR25_ARATH,HVITKVAL,27
307 | sp|Q8W4R9|WTR35_ARATH,HVITKVAL,36
308 | sp|P47490|Y248_MYCGE,KRISTIAN,3
309 | sp|P75197|Y583_MYCPN,STETTING,170
310 | sp|P34263|YKAD_CAEEL,EKSERSER,97
311 | sp|O13545|YL374_YEAST,IALLFALL,41
312 | sp|Q2UBI2|YME2_ASPOR,SANDLAND,233
313 | sp|Q6R3K9|YSL2_ARATH,ENERVERE,2
314 | sp|E9P860|ZNFX1_CAEEL,ELSKLING,369
315 | 


--------------------------------------------------------------------------------
/uniprot_words/data/word_matches_se.csv:
--------------------------------------------------------------------------------
 1 | .id,Keyword,Offset
 2 | sp|P77624|ARCM_ECOLI,STALLARE,219
 3 | sp|Q24995|ARY_GALME,WINFIELD,549
 4 | sp|Q6BTX0|ATG2_DEBHA,KANTNING,583
 5 | sp|Q0PAS1|CBF2_CAMJE,HVILKENS,222
 6 | sp|A1VYV6|CBF2_CAMJJ,HVILKENS,222
 7 | sp|B9VR26|CML1_BOVIN,PLATAIAI,294
 8 | sp|B1PHQ8|CML1_PIG,PLATAIAI,295
 9 | sp|A5EF51|DABA2_BRASB,ALLAKATS,625
10 | sp|B8GFL8|DNAG_METPE,SVEDMARK,248
11 | sp|O80928|DOF24_ARATH,SPRITSAS,282
12 | sp|Q83I20|DXS_TROW8,REDIVIVA,510
13 | sp|Q83G46|DXS_TROWT,REDIVIVA,510
14 | sp|C5J6A7|FTSH_MESCH,SELLASIA,545
15 | sp|P19255|GLPF_STRCO,KASKARNA,31
16 | sp|Q3SKX1|IF2_THIDA,KANTNING,569
17 | sp|O28294|ILVC_ARCFU,KALEVALA,161
18 | sp|A0QBE6|KDC_MYCA1,RIDPARTI,304
19 | sp|Q7U140|KDC_MYCBO,RIDPARTI,306
20 | sp|A1KGY5|KDC_MYCBP,RIDPARTI,306
21 | sp|Q9CBD6|KDC_MYCLE,RIDPARTI,302
22 | sp|Q742Q2|KDC_MYCPA,RIDPARTI,304
23 | sp|A0R480|KDC_MYCS2,RIDPARTI,298
24 | sp|A5U0P1|KDC_MYCTA,RIDPARTI,306
25 | sp|P9WG36|KDC_MYCTO,RIDPARTI,306
26 | sp|P9WG37|KDC_MYCTU,RIDPARTI,306
27 | sp|A0PL16|KDC_MYCUA,RIDPARTI,306
28 | sp|Q14BB9|MA6D1_MOUSE,SVAGARES,53
29 | sp|Q8E9P5|MRAY_SHEON,AFFLYTTA,145
30 | sp|A0L1P5|MRAY_SHESA,AFFLYTTA,145
31 | sp|Q0HE80|MRAY_SHESM,AFFLYTTA,145
32 | sp|Q0HZR9|MRAY_SHESR,AFFLYTTA,145
33 | sp|Q133X2|MURC_RHOPS,AFRIFVEN,200
34 | sp|A2QW83|PAN2_ASPNC,VENTRALA,393
35 | sp|Q2ULU6|PAN2_ASPOR,VENTRALA,399
36 | sp|Q5BBL5|PAN2_EMENI,VENTRALA,399
37 | sp|P05066|PHR_YEAST,PELISSEN,530
38 | sp|A3DHY6|RSMG_ACET2,STEEVENS,181
39 | sp|Q8IX30|SCUB3_HUMAN,KLIKAFFE,947
40 | sp|Q66PY1|SCUB3_MOUSE,KLIKAFFE,947
41 | sp|Q9YD97|SYL_AERPE,AVLEDARE,825
42 | sp|Q5X5L8|SYL_LEGPA,INTYGADT,590
43 | sp|A5IBJ1|SYL_LEGPC,INTYGADT,590
44 | sp|Q5ZVU2|SYL_LEGPH,INTYGADT,590
45 | sp|Q5WWZ8|SYL_LEGPL,INTYGADT,590
46 | sp|Q4CNL4|TRM51_TRYCC,AFSLAGEN,458
47 | sp|Q4DPN8|TRM52_TRYCC,AFSLAGEN,456
48 | sp|A1RV13|UPP_PYRIL,REDIVIVA,125
49 | sp|Q9UVJ8|VATA_ASHGO,MAGALENA,1
50 | sp|Q5AJB1|VATA_CANAL,MAGALENA,1
51 | sp|P38078|VATA_CANTR,MAGALENA,1
52 | sp|P55650|Y4SG_SINFN,SYSSLING,313
53 | sp|Q2S9Y0|Y5895_HAHCH,LIKSIDIG,6
54 | 


--------------------------------------------------------------------------------