├── .github
│   ├── ISSUE_TEMPLATE
│   │   └── bug_report.md
│   └── workflows
│       └── main.yml
├── .gitignore
├── CITATION.cff
├── Dockerfile
├── LICENSE
├── README.md
├── cicd
│   ├── ci_build.sh
│   ├── environment.yml
│   ├── full_ids.txt
│   ├── ids.blast
│   ├── ids.csv
│   ├── meta.yaml
│   └── proteomes.fasta
└── upimapi.py
/.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | 12 | 13 | **To Reproduce** 14 | - Upload an input file [drop a file here] 15 | - Write the command used: 16 | 17 | **Screenshots** 18 | If applicable, add screenshots to help explain your problem. Paste them here. 19 | 20 | **Please complete the following information:** 21 | - OS: 22 | - Version of UPIMAPI: 23 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v4 15 | - name: Set up Docker Buildx 16 | uses: docker/setup-buildx-action@v1 17 | - name: Build and push 18 | uses: docker/build-push-action@v4 19 | with: 20 | context: . 21 | file: ./Dockerfile 22 | tags: upimapi:latest 23 | outputs: type=docker,dest=/tmp/upimapi.tar 24 | - name: Upload artifact 25 | uses: actions/upload-artifact@v4 26 | with: 27 | name: upimapi 28 | path: /tmp/upimapi.tar 29 | 30 | txt-file-comma-separated: 31 | runs-on: ubuntu-latest 32 | needs: build 33 | steps: 34 | - name: Download artifact 35 | uses: actions/download-artifact@v4 36 | with: 37 | name: upimapi 38 | path: /tmp 39 | - name: Load Docker image 40 | run: docker load --input /tmp/upimapi.tar 41 | - name: IDs inputted through TXT file (comma-separated) 42 | run: docker run upimapi /bin/bash -c "upimapi -i UPIMAPI/cicd/ids.csv -cols 'Entry&KEGG&Interacts with&Taxonomic lineage (SUPERKINGDOM)&Taxonomic lineage (SPECIES)&Taxonomic lineage IDs (SUPERKINGDOM)&Taxonomic lineage IDs (SPECIES)'" 43 | 44 | txt-file-newline-separated: 45 | runs-on: ubuntu-latest 46 | needs: build 47 | steps: 48 | - name: Download artifact 49 | uses: actions/download-artifact@v4 50 | with: 51 | name: upimapi 52 | path: /tmp 53 | - name: Load Docker image 54 | run: docker load --input /tmp/upimapi.tar 55 | - name: Full IDs inputted through TXT file (newline-separated) 56 | run: docker run upimapi /bin/bash -c "upimapi -i UPIMAPI/cicd/full_ids.txt" 57 | 58 | blast-file: 59 | runs-on: ubuntu-latest 60 | needs: build 61 | steps: 62 | - name: Download artifact 63 | uses: actions/download-artifact@v4 64 | with: 65 | name: upimapi 66 | path: /tmp 67 | - name: Load Docker image 68 | run: docker load --input /tmp/upimapi.tar 69 | - name: IDs inputted through BLAST file 70 | run: docker run upimapi /bin/bash -c "upimapi -i UPIMAPI/cicd/ids.blast -rd resources_directory --blast" 71 | 72 | get-fasta-sequences: 73 | runs-on: ubuntu-latest 74 | needs: build 75 | steps: 76 | - name: Download artifact 77 | uses: actions/download-artifact@v4 78 | with: 79 | name: upimapi 80 | path: /tmp 81 | - name: Load Docker image 82 | run: docker load --input /tmp/upimapi.tar 83 | - name: Obtain FASTA sequences 84 | run: docker run upimapi /bin/bash -c "upimapi -i UPIMAPI/cicd/ids.csv 
-rd resources_directory --fasta" 85 | 86 | basic-id-mapping: 87 | runs-on: ubuntu-latest 88 | needs: build 89 | steps: 90 | - name: Download artifact 91 | uses: actions/download-artifact@v4 92 | with: 93 | name: upimapi 94 | path: /tmp 95 | - name: Load Docker image 96 | run: docker load --input /tmp/upimapi.tar 97 | - name: Perform basic ID mapping 98 | run: docker run upimapi /bin/bash -c "upimapi -i UPIMAPI/cicd/ids.csv -rd resources_directory --from-db 'UniProtKB AC/ID' --to-db 'EMBL/GenBank/DDBJ CDS'" 99 | 100 | full-workflow: 101 | runs-on: ubuntu-latest 102 | needs: build 103 | steps: 104 | - name: Download artifact 105 | uses: actions/download-artifact@v4 106 | with: 107 | name: upimapi 108 | path: /tmp 109 | - name: Load Docker image 110 | run: docker load --input /tmp/upimapi.tar 111 | - name: Full workflow, TaxIDs DB at Species level 112 | run: docker run upimapi /bin/bash -c "upimapi -i UPIMAPI/cicd/proteomes.fasta -rd resources_directory -db taxids --taxids 2203,2223,2209,2162,119484,35554,29543,863" 113 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | 3 | # Pycharm 4 | .idea 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | .pybuilder/ 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | # For a library or package, you might want to ignore these files since the code is 92 | # intended to run in multiple environments; otherwise, check them in: 93 | # .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 103 | __pypackages__/ 104 | 105 | # Celery stuff 106 | celerybeat-schedule 107 | celerybeat.pid 108 | 109 | # SageMath parsed files 110 | *.sage.py 111 | 112 | # Environments 113 | .env 114 | .venv 115 | env/ 116 | venv/ 117 | ENV/ 118 | env.bak/ 119 | venv.bak/ 120 | 121 | # Spyder project settings 122 | .spyderproject 123 | .spyproject 124 | 125 | # Rope project settings 126 | .ropeproject 127 | 128 | # mkdocs documentation 129 | /site 130 | 131 | # mypy 132 | .mypy_cache/ 133 | .dmypy.json 134 | dmypy.json 135 | 136 | # Pyre type checker 137 | .pyre/ 138 | 139 | # pytype static type analyzer 140 | .pytype/ 141 | 142 | # Cython debug symbols 143 | cython_debug/ -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: "Sequeira" 5 | given-names: "João C." 6 | orcid: "https://orcid.org/0000-0002-2691-9950" 7 | - family-names: "Rocha" 8 | given-names: "Miguel" 9 | orcid: "https://orcid.org/0000-0001-8439-8172" 10 | - family-names: "Alves" 11 | given-names: "M. Madalena" 12 | orcid: "https://orcid.org/0000-0002-9078-3613" 13 | - family-names: "Salvador" 14 | given-names: "Andreia F." 15 | orcid: "https://orcid.org/0000-0001-6037-4248" 16 | title: "UPIMAPI: UniProt Id Mapping through API" 17 | version: 1.6.4 18 | doi: "10.1016/J.CSBJ.2022.03.042" 19 | date-released: 2022-01-26 20 | url: "https://github.com/iquasere/UPIMAPI" 21 | preferred-citation: 22 | type: article 23 | authors: 24 | - family-names: "Sequeira" 25 | given-names: "João C." 26 | orcid: "https://orcid.org/0000-0002-2691-9950" 27 | - family-names: "Rocha" 28 | given-names: "Miguel" 29 | orcid: "https://orcid.org/0000-0001-8439-8172" 30 | - family-names: "Alves" 31 | given-names: "M. Madalena" 32 | orcid: "https://orcid.org/0000-0002-9078-3613" 33 | - family-names: "Salvador" 34 | given-names: "Andreia F." 35 | orcid: "https://orcid.org/0000-0001-6037-4248" 36 | doi: "10.1016/J.CSBJ.2022.03.042" 37 | journal: "Computational and Structural Biotechnology Journal" 38 | start: 1798 39 | end: 1810 40 | title: "UPIMAPI, reCOGnizer and KEGGCharter: Bioinformatics tools for functional annotation and visualization of (meta)-omics datasets" 41 | volume: 20 42 | year: 2022 -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3 2 | 3 | RUN git clone https://github.com/iquasere/UPIMAPI.git \ 4 | && conda env update --file UPIMAPI/cicd/environment.yml --name base \ 5 | && bash UPIMAPI/cicd/ci_build.sh \ 6 | && conda clean --all -y 7 | 8 | CMD [ "python", "bin/upimapi.py" ] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, João C. Sequeira 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. 
Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UniProt Id Mapping through API 2 | 3 | A tool for retrieving huge amounts of information from UniProt! 4 | 5 | UPIMAPI is a command line interface for using UniProt's API, which allows accessing [UniProt's ID mapping](https://www.uniprot.org/uploadlists/) programmatically! 6 | 7 | UPIMAPI can handle large numbers of UniProt IDs (like, millions), for which information can be obtained in a single command. 8 | 9 | UPIMAPI also allows first performing annotation with DIAMOND, combining DIAMOND's powerful annotation with the convenience of directly obtaining information from UniProt. 10 | 11 | ### Index 12 | 13 | 1. [Installing UPIMAPI](https://github.com/iquasere/UPIMAPI#installing-upimapi) 14 | 2. [Annotation with UPIMAPI](https://github.com/iquasere/UPIMAPI#annotation-with-upimapi) 15 | 3. [Information retrieval from UniProt](https://github.com/iquasere/UPIMAPI#information-retrieval-from-uniprot) 16 | 4. [Output](https://github.com/iquasere/UPIMAPI#output) 17 | 5. [Additional parameters](https://github.com/iquasere/UPIMAPI#additional-parameters) 18 | 6. [Referencing UPIMAPI](https://github.com/iquasere/UPIMAPI#referencing-upimapi) 19 | 20 | ## Installing UPIMAPI 21 | 22 | To install UPIMAPI through Bioconda, run 23 | ``` 24 | conda install -c bioconda upimapi 25 | ``` 26 | To check if UPIMAPI was installed correctly, run 27 | ``` 28 | upimapi --version 29 | ``` 30 | 31 | ## Annotation with UPIMAPI 32 | 33 | UPIMAPI can be used to perform homology-based annotation with DIAMOND. The main advantages of using UPIMAPI are that it determines optimal values for the most important search parameters, and that it directly links annotation to UniProt ID mapping.
34 | To annotate protein sequences and get information from UniProt, UPIMAPI can be run as 35 | ``` 36 | upimapi -i path/to/sequences.fasta -o path/to/output_directory -db database -t threads 37 | ``` 38 | where: 39 | * ```sequences.fasta``` is a FASTA file with amino acid sequences of query proteins 40 | * ```output_directory``` can be any folder, existing or not 41 | * ```database``` can be either "uniprot" (default), "swissprot", "taxids" or the filename of a FASTA file with the reference sequences (see below). 42 | 43 | ### Reference database 44 | 45 | A few points to note about the reference database: 46 | * It must be either UniProt or a subsection of it (e.g. SwissProt, or all proteins of a specific taxon). UPIMAPI performs ID mapping with UniProt IDs, so the database must have those; 47 | * It can be supplied in either FASTA (.fasta) or DIAMOND (.dmnd) format. If in FASTA, UPIMAPI will create a new database in DIAMOND format for annotation; 48 | * There are four different ways to input reference databases to UPIMAPI: 49 | 50 | #### Use the entire UniProt (or just SwissProt) 51 | 52 | Using the UniProt database is a valid choice if the case study is a metagenome with a mostly unknown community composition. 53 | 54 | To use the entire UniProt database as reference for UPIMAPI, specify the database as ```--database uniprot```. 55 | 56 | If, alternatively, you only want to use SwissProt (the manually curated part of UniProt), specify the database as ```--database swissprot```. 57 | 58 | #### Input tax IDs to build a more specific database 59 | 60 | If, for both pure and mixed cultures, the taxonomic composition is known, UPIMAPI can build a database with the reference proteomes of the known taxa. 61 | 62 | To build a reference for specific taxa, specify the database as ```--database taxids```, and the tax IDs as ```--taxids taxid1,taxid2,taxid3 ...```. 63 | 64 | #### Input a custom database 65 | 66 | A custom database can be inputted if, for example, there is only interest in annotating proteins of a specific family (e.g. hydrogenases). Such a database must be manually built from UniProt. 67 | 68 | To input a custom database into UPIMAPI, specify it as ```--database path/to/database.fasta```. 69 | 70 | ## Information retrieval from UniProt 71 | 72 | ### Columns of information from UniProt 73 | 74 | UniProt provides many different fields of information and cross-references. For the user's convenience, a default selection is provided: ```Entry```, ```Entry Name```, ```Gene Names```, ```Protein names```, ```EC number```, ```Function [CC]```, ```Pathway```, ```Keywords```, ```Protein existence```, ```Gene Ontology (GO)```, ```Protein families```, ```Taxonomic lineage```, ```Organism```, ```Organism (ID)```, ```BioCyc```, ```BRENDA```, ```CDD```, ```eggNOG```, ```Ensembl```, ```InterPro```, ```KEGG```, ```Pfam```, ```Reactome```, ```RefSeq``` and ```UniPathway```. 75 | 76 | If another selection of columns/databases is desired, it can be specified, for example, as 77 | ``` 78 | --columns "Coiled coil&Compositional bias" 79 | ``` 80 | where ```--columns``` takes as input the names of the fields of information required. Valid values for the columns can be consulted at [UniProtKB return fields](https://www.uniprot.org/help/return_fields).
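The label-to-field-name correspondence can also be inspected programmatically. This is a minimal Python sketch, mirroring the request UPIMAPI itself makes at startup (the endpoint is the same one used in ```upimapi.py```; error handling is omitted for brevity):
```
import json
import requests

# Same endpoint UPIMAPI queries at startup to map column labels to API field names
res = requests.get('https://rest.uniprot.org/configure/uniprotkb/result-fields')
res.raise_for_status()
for group in json.loads(res.text):
    for field in group['fields']:
        print(f"{field['label']} -> {field['name']}")
```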
81 | 82 | #### Sometimes the return fields are not properly updated 83 | 84 | If the columns were correctly inputted according to the [return fields page](https://www.uniprot.org/help/return_fields) and UPIMAPI still complains that "\[COL] is not a valid column name for ID mapping", it may be that the values at the return fields page are not properly updated. If that happens, running `upimapi --show-available-fields` will present the user with the currently valid fields. 85 | 86 | #### UPIMAPI offers a few additional columns for taxonomic information 87 | 88 | Prior to the Summer 2022 UniProt release, the API provided fields for taxonomic information, but these have since been condensed into the ```Taxonomic lineage``` and ```Taxonomic lineage (Ids)``` columns. Since ```1.8.6```, UPIMAPI provides this information again, properly organized. The additional available columns for taxonomy are as follows: 89 | 90 | * ```Taxonomic lineage (LEVEL OF TAXONOMY)```: the taxonomic lineage of the organism, at the specified level of taxonomy. For example, ```--columns "Taxonomic lineage (SPECIES)"``` will return the species of the organism. Other possible values are ```SUPERKINGDOM```, ```PHYLUM```, ```CLASS```, ```ORDER```, ```FAMILY```, ```GENUS```, ```SPECIES```, [among others](https://en.wikipedia.org/wiki/Taxonomic_rank). 91 | 92 | * ```Taxonomic lineage IDs (LEVEL OF TAXONOMY)```: the TaxIDs of the organism, at the specified level of taxonomy. For example, ```--columns "Taxonomic lineage IDs (SPECIES)"``` will return the TaxID of the species of the organism. Other possible values are as above. 93 | 94 | ## ID mapping without annotation 95 | 96 | If only retrieval of information from UniProt is required (no annotation step), IDs can be inputted to UPIMAPI directly, through several different inputs. 97 | 98 | ### Annotation BLAST file 99 | 100 | The result of an annotation with some database with UniProt IDs can be directly inputted for ID mapping with the command 101 | ``` 102 | upimapi -i aligned.blast -o output_directory --blast 103 | ``` 104 | 105 | ### CSV file 106 | 107 | A CSV file with UniProt IDs (separated by commas) can be inputted to UPIMAPI with the command 108 | ``` 109 | upimapi -i ids.txt -o output_directory 110 | ``` 111 | This repo provides an [example](https://github.com/iquasere/UPIMAPI/blob/master/cicd/ids.csv) of this file. 112 | 113 | ### Directly from the command line 114 | 115 | IDs can also be inputted directly through the command line, by not specifying an input. They must be inputted as comma-separated values: 116 | ``` 117 | >>> upimapi -o output_directory 118 | 119 | IDs to perform mapping on (comma separated values): 120 | ``` 121 | 122 | ## Output 123 | 124 | Information obtained with UPIMAPI can come in two forms: 125 | 1. The **Base** (default) workflow obtains information for the list of columns and databases inputted. It produces the following outputs, in the output folder: 126 | * ```uniprotinfo.tsv```, which contains the information for the columns and databases specified 127 | * if annotation was performed, ```aligned.blast``` and ```unaligned.fasta```, which contain the annotated and unannotated proteins, respectively. 128 | 129 | 2. The **Fasta** workflow, specified with the ```--fasta``` argument, results in a FASTA file with the protein sequences corresponding to the input IDs
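As an illustration, the ```uniprotinfo.tsv``` table from the **Base** workflow can be loaded directly with pandas. A minimal sketch, assuming a run that used the default columns (the output path here is hypothetical):
```
import pandas as pd

# uniprotinfo.tsv is tab-separated, with one row per mapped ID
info = pd.read_csv('output_directory/uniprotinfo.tsv', sep='\t')
print(f"{info['Entry'].nunique()} unique UniProt entries retrieved")
```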
130 | 131 | ## From/To ID mapping 132 | 133 | The ID mapping available at https://www.uniprot.org/id-mapping, triggered when "From database" and "To database" are different from the default values - "UniProtKB AC/ID" and "UniProtKB" - is also implemented, since UPIMAPI `1.12`. 134 | 135 | As an example, this command would convert IDs from UniProtKB to EMBL/GenBank/DDBJ CDS: 136 | ``` 137 | upimapi -i ids.txt -o output_directory --from-db 'UniProtKB AC/ID' --to-db 'EMBL/GenBank/DDBJ CDS' 138 | ``` 139 | 140 | Possible values for the parameters `--from-db` and `--to-db` can be consulted through the browser (https://www.uniprot.org/id-mapping), at https://rest.uniprot.org/configure/idmapping/fields, or by inputting an invalid value to one of those parameters, which will make the possible options show up. 141 | 142 | This From/To ID mapping can't be combined with the ID mapping that obtains columns of information from UniProt; UPIMAPI will exit after performing it. 143 |
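Under the hood, this follows UniProt's asynchronous ID mapping flow (submit a job, poll its status, fetch the results), as implemented by ```submit_id_mapping``` and ```get_id_mapping_results``` in ```upimapi.py```. A condensed sketch of that flow is below; the internal database names ('UniProtKB_AC-ID', 'EMBL-GenBank-DDBJ_CDS') are assumptions for illustration, since UPIMAPI resolves the real names at runtime from https://rest.uniprot.org/configure/idmapping/fields:
```
import requests
from time import sleep

rest = 'https://rest.uniprot.org'
# Submit the job; the database names below are illustrative assumptions
job_id = requests.post(f'{rest}/idmapping/run', data={
    'from': 'UniProtKB_AC-ID', 'to': 'EMBL-GenBank-DDBJ_CDS', 'ids': 'P31946,P62258'}).json()['jobId']
# Poll until the job leaves the RUNNING state
status = requests.get(f'{rest}/idmapping/status/{job_id}').json()
while status.get('jobStatus') == 'RUNNING':
    sleep(3)
    status = requests.get(f'{rest}/idmapping/status/{job_id}').json()
# Once finished, the status request redirects to the results, which carry the mappings
print(status.get('results', status))
```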
144 | ## Additional parameters 145 | 146 | ``` 147 | -h, --help show this help message and exit 148 | -i INPUT, --input INPUT 149 | Input filename - can be: 1. a file containing a list of IDs (comma-separated values, no spaces) 2. a BLAST TSV result file (requires to be specified with the 150 | --blast parameter) 3. a protein FASTA file to be annotated (requires the -db parameter) 4. nothing! If so, will read input from command line, and parse as CSV 151 | (id1,id2,...) 152 | -o OUTPUT, --output OUTPUT 153 | Folder to store outputs 154 | -ot OUTPUT_TABLE, --output-table OUTPUT_TABLE 155 | Filename of table output, where UniProt info is stored. If set, will override 'output' parameter just for that specific file 156 | -rd RESOURCES_DIRECTORY, --resources-directory RESOURCES_DIRECTORY 157 | Directory to store resources of UPIMAPI [~/upimapi_resources] 158 | -cols COLUMNS, --columns COLUMNS 159 | List of UniProt columns to obtain information from (separated by &) 160 | --blast If input file is in BLAST TSV format (will consider one ID per line if not set) [false] 161 | --full-id FULL_ID If IDs in database are in 'full' format: tr|XXX|XXX [auto] 162 | --fasta Output will be generated in FASTA format [false] 163 | --step STEP How many IDs to submit per request to the API [1000] 164 | --max-tries MAX_TRIES 165 | How many times to try obtaining information from UniProt before giving up [3] 166 | --sleep SLEEP Time between requests (in seconds) [3] 167 | --no-annotation Do not perform annotation - input must be in one of BLAST result or TXT IDs file or STDIN [false] 168 | --local-id-mapping Perform local ID mapping of SwissProt IDs. Advisable if many IDs of SwissProt are present [false] 169 | --skip-id-mapping If true, UPIMAPI will not perform ID mapping [false] 170 | --skip-id-checking If true, UPIMAPI will not check if IDs are valid before mapping [false] 171 | --skip-db-check So UPIMAPI doesn't check for (FASTA) database existence [false] 172 | --mirror {expasy,uniprot,ebi} 173 | From where to download UniProt database [expasy] 174 | -v, --version show program's version number and exit 175 | 176 | DIAMOND arguments: 177 | -db DATABASE, --database DATABASE 178 | How the reference database is inputted to UPIMAPI. 1. uniprot - UPIMAPI will download the entire UniProt and use it as reference 2. swissprot - UPIMAPI will 179 | download SwissProt and use it as reference 3. taxids - Reference proteomes will be downloaded for the taxa specified with the --taxids, and those will be used as 180 | reference 4. a custom database - Input will be considered as the database, and will be used as reference 181 | -t THREADS, --threads THREADS 182 | Number of threads to use in annotation steps [all available] 183 | --evalue EVALUE Maximum e-value to report annotations for [1e-3] 184 | --pident PIDENT Minimum pident to report annotations for. 185 | --bitscore BITSCORE Minimum bit score to report annotations for (overrides e-value). 186 | -mts MAX_TARGET_SEQS, --max-target-seqs MAX_TARGET_SEQS 187 | Number of annotations to output per sequence inputted [1] 188 | -b BLOCK_SIZE, --block-size BLOCK_SIZE 189 | Billions of sequence letters to be processed at a time [memory / 20] 190 | -c INDEX_CHUNKS, --index-chunks INDEX_CHUNKS 191 | Number of chunks for processing the seed index [dependent on block size] 192 | --max-memory MAX_MEMORY 193 | Maximum memory to use (in Gb) [all available] 194 | --taxids TAXIDS Tax IDs to obtain protein sequences of, for building a reference database. 195 | --diamond-mode {fast,mid_sensitive,sensitive,more_sensitive,very_sensitive,ultra_sensitive} 196 | Mode to run DIAMOND with [fast] 197 | ``` 198 | 199 | ## Referencing UPIMAPI 200 | 201 | If you use UPIMAPI, please cite its [publication](https://www.sciencedirect.com/science/article/pii/S2001037022001179). -------------------------------------------------------------------------------- /cicd/ci_build.sh: -------------------------------------------------------------------------------- 1 | PREFIX="/opt/conda" 2 | mkdir -p "${PREFIX}/bin" 3 | cp UPIMAPI/upimapi.py "${PREFIX}/bin" 4 | chmod +x /opt/conda/bin/upimapi.py 5 | ln -s "${PREFIX}/bin/upimapi.py" "${PREFIX}/bin/upimapi" -------------------------------------------------------------------------------- /cicd/environment.yml: -------------------------------------------------------------------------------- 1 | name: upimapi 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - pandas 8 | - diamond 9 | - psutil 10 | - tqdm 11 | - requests 12 | - biopython 13 | - pyyaml -------------------------------------------------------------------------------- /cicd/full_ids.txt: -------------------------------------------------------------------------------- 1 | tr|A0A090I5T7|A0A090I5T7_METFO 2 | tr|A0A089ZJ62|A0A089ZJ62_METFO 3 | tr|A0A090I166|A0A090I166_METFO 4 | tr|A0A090I2M9|A0A090I2M9_METFO 5 | tr|A0A090I395|A0A090I395_METFO 6 | tr|A0A090I3B2|A0A090I3B2_METFO 7 | tr|A0A090I3H2|A0A090I3H2_METFO 8 | tr|A0A090I4Q7|A0A090I4Q7_METFO 9 | tr|A0A090I4T1|A0A090I4T1_METFO 10 | tr|A0A090I521|A0A090I521_METFO 11 | tr|A0A090I6C9|A0A090I6C9_METFO 12 | tr|A0A090I6I6|A0A090I6I6_METFO 13 | tr|A0A090I8P6|A0A090I8P6_METFO 14 | tr|A0A090I8T6|A0A090I8T6_METFO 15 | tr|A0A090I915|A0A090I915_METFO 16 | tr|A0A090IAB0|A0A090IAB0_METFO 17 | tr|A0A090JTG7|A0A090JTG7_METFO 18 | tr|A0A090JXV9|A0A090JXV9_METFO 19 | tr|A0A089Z9J6|A0A089Z9J6_METFO 20 | tr|A0A089ZCR8|A0A089ZCR8_METFO 21 | tr|A0A089ZDP0|A0A089ZDP0_METFO 22 | tr|A0A089ZDP3|A0A089ZDP3_METFO 23 | tr|A0A089ZGW2|A0A089ZGW2_METFO 24 | tr|A0A089ZH11|A0A089ZH11_METFO 25 | tr|A0A089ZHB0|A0A089ZHB0_METFO 26 | tr|A0A089ZHC6|A0A089ZHC6_METFO 27 | tr|A0A089ZHH1|A0A089ZHH1_METFO 28 | tr|A0A089ZVL0|A0A089ZVL0_METFO 29 | tr|A0A089ZVU4|A0A089ZVU4_METFO 30 | -------------------------------------------------------------------------------- /cicd/ids.blast: 
-------------------------------------------------------------------------------- 1 | sp|Q74FU6|SFRA_GEOSL sp|D7AF63|SFRA_GEOSK 100 844 0 0 1 844 1 844 0.0 1667 2 | sp|Q74FU5|SFRB_GEOSL sp|D7AF64|SFRB_GEOSK 100 672 0 0 1 672 1 672 0.0 1367 3 | sp|Q74DI8|PPNP_GEOSL tr|A0A0D5NBK3|A0A0D5NBK3_GEOSN 100 104 0 0 1 104 1 104 3.01e-70 214 4 | sp|P61422|THIED_GEOSL tr|A0A0D5N9M5|A0A0D5N9M5_GEOSN 100 490 0 0 1 490 1 490 0.0 932 5 | tr|Q74CH0|Q74CH0_GEOSL tr|A0A0D5NCT2|A0A0D5NCT2_GEOSN 100 363 0 0 1 363 1 363 3.75e-260 715 6 | sp|Q74GH5|GLMU_GEOSL tr|A0A0D5N8K0|A0A0D5N8K0_GEOSN 100 476 0 0 1 476 1 476 0.0 884 7 | tr|Q74BH3|Q74BH3_GEOSL tr|A0A0D5N307|A0A0D5N307_GEOSN 100 319 0 0 1 319 1 319 8.65e-221 612 8 | tr|Q74C72|Q74C72_GEOSL tr|A0A0D5NB94|A0A0D5NB94_GEOSN 100 519 0 0 1 519 1 519 0.0 992 9 | sp|Q74FW6|TSAL_GEOSL tr|A0A0D5N940|A0A0D5N940_GEOSN 100 402 0 0 1 402 1 402 2.43e-278 764 10 | sp|Q74BY3|PYRG_GEOSL tr|A0A0D5N2K6|A0A0D5N2K6_GEOSN 100 536 0 0 1 536 1 536 0.0 1080 11 | -------------------------------------------------------------------------------- /cicd/ids.csv: -------------------------------------------------------------------------------- 1 | P31946,P62258,ALBU_HUMAN,EFTU_ECOLI -------------------------------------------------------------------------------- /cicd/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "upimapi" %} 2 | {% set version = "1.12.1" %} 3 | {% set sha256 = "c806ba0804abf2eb482b75be3bd7312d8b117048ba800ebff788b06f430b188f" %} 4 | 5 | package: 6 | name: {{ name|lower }} 7 | version: {{ version }} 8 | 9 | source: 10 | url: https://github.com/iquasere/UPIMAPI/archive/{{ version }}.tar.gz 11 | sha256: {{ sha256 }} 12 | 13 | build: 14 | noarch: generic 15 | number: 0 16 | run_exports: 17 | - {{ pin_subpackage(name, max_pin="x.x") }} 18 | script: > 19 | mkdir -p $PREFIX/bin && 20 | cp upimapi.py $PREFIX/bin && 21 | chmod +x $PREFIX/bin/upimapi.py && 22 | ln -s $PREFIX/bin/upimapi.py $PREFIX/bin/upimapi 23 | 24 | requirements: 25 | run: 26 | - pandas 27 | - diamond 28 | - psutil 29 | - tqdm 30 | - requests 31 | - biopython 32 | - pyyaml 33 | 34 | test: 35 | commands: 36 | - upimapi -v 37 | 38 | about: 39 | home: https://github.com/iquasere/UPIMAPI 40 | license: BSD-3-Clause 41 | license_family: BSD 42 | license_file: LICENSE 43 | summary: 'UniProt Id Mapping through API' 44 | description: | 45 | UPIMAPI takes as input either a list of UniProt IDs or a BLAST file from 46 | annotation using the UniProt database as reference, and uses UniProt's API to 47 | retrieve information relative to those IDs. It is essentially a command 48 | line implementation of UniProt's ID mapping web service available at 49 | https://www.uniprot.org/uploadlists/, allowing for retrieval of information 50 | from thousands of IDs in one go, while still relying on the web service.
51 | doc_url: https://github.com/iquasere/UPIMAPI/blob/master/README.md 52 | dev_url: https://github.com/iquasere/UPIMAPI 53 | 54 | extra: 55 | recipe-maintainers: 56 | - iquasere 57 | -------------------------------------------------------------------------------- /cicd/proteomes.fasta: -------------------------------------------------------------------------------- 1 | >Q74FU6 2 | MVSLTIDGKDITVAKETTILDAAALLGITIPTLCWLKKVSPTGACRVCAVEIEGVDRPMTACNTPVKDGIKVTTQSEKLSRIRQKIMELMLVNHPLDCPVCDAGGECDLQNACYGLGAAKQEYGAVLERRKIRYDWPLIESDPNRCILCEKCVKVDHEIVGCNAIRVVNRGEATIIDTVDGNPLNCEFCGNCVAACPTGTLISKPFKFRGRPWAFTTTPSVCPFCATGCQIEYHSRNGRVERVTSDDSTYNSGNLCINGRFGYSYINSPDRLAEPMVKGQKADWNTAMGTAATALKQIVASHGADAVAGFGSPRVTNEDNYLFQKLMRSAIGTGNIDSEARLGFAATQKVLREMLGIAGASTTIDAIDRATAVLVVGCDLNAEATGMEYRVIKAATKNNAKLVLAAMRDIKLKKFANSHLKYRPGNETLLINALTKAVLEEGLENKEFCSANISNLSDLTAALAGVSIADAAAATGVTEADLRAAARLVGGKKGVAVIFGAELMRGGNTDAVKALINLALILGATAGDTGGLFPVYEKTNIRGLLDMGVAPDHFPGHQTDGTTFEKAWGKKLPAAAGKDLWQIIEGIEQGSVKALYLLGCDPVASFPEGERIRKALEKLELLIVQDPFPGEAAKMAHVVFPSSVAAEKNGTFTTIDGRVQPLAKAVAPSGDAREDWDILTELYNRLTGESRIHSPAAVLDEVAALVPAYASVGRTGGTITAQPRSGGLALAPVSARAVAGSPTTLLVGTILYHSGTTTTWSKNNLEIIPKGYIEIHPNDAAKLGIAEGGKVRLSAGSVKVEGTAKITPRVQPGLLFAPSHFRGMNVNALLSRDGGVVPVTVEKA 3 | >Q74FU5 4 | MAQVVFSSWGRTIVDNRKGGEAQDVSFRLPTTLDGERQIAAFMGWDGIILYDLKVDVPAMAAEYMKRVQTQYCCGKCTPGKKGTKVLADVLAAIIEGRATEADLDTIDDLADLLTNCKCTLCQSSTIPVLDAVKHFREDFLAYITGIRKPANVHRFIDKYTAPCMDRCPAHIDIPAYIEAIKEYRFDESLDIIRDNMPLPSVCGRVCPHPCETHCRRKNVDDSVNIMVLKRSASDYEWMHNAAPPMQPKPQKNKKVAIVGAGPAGLACAYYLALEGYPCTIYEALPEGYGGGMIAVGIPPYRQPRHLLQRDIDIISSMGVDIIYDTRIGKDISLEELKQKFDAVFLAPGAHRSKPMGVEGEDKGYKGFLKGGIDFLREAYMGRPTGMGKKVVVVGGGNTAIDCVRVALREGAEESTLLYRRSRKEMPADVWEVDGADEEGVRFEFQVLPTRVLVDENEQVTGVECVRMALGEPDASGRRRPEPVPGSEFVVECDTVIPAIGQDPDLSFIPDNLGIDITKWNTVVTKYVPLKDAAGKDLKDGMGNPLARVLITDLEGVFAGGDAEIGPLTVVACIGNAHRAARVIQRWLEEGKAYLTEDELMEDILTNMPVYDKNEKVPWLDSRERAHQAEVHGQERASKGNYQEVELGFVDTQAVEEAERCLRCYRVAMAAI 5 | >Q74DI8 6 | MSEFTNVTIIREANVYFDGGVVSRTVVFPDGTKKTLGIMQPGEYTFTTGAPEIMEILSGELDLKLPGSDAWNRVGGGESFDVPANSSFTMKVLSLTDYCCSFLG 7 | >P61422 8 | MASNGHTLRLVINRDKHDSVIRGLYLVTDHDDNLIPRVEAAIDGGARVVQYRNKNQDRESRLALGLELRELCRRRSIPFIVNDDLEMAVSLKADGLHLGQGDGDPREARRVLGPGKIIGVSTHTLSEALEAQAAGVDYIGLGAMFPSRSKEVEHVAGSELLAAIRSSISIPIVAIGGITRDNGASVIDAGADAVAVISAVLSHPDPALAATEIALLFNRRAPFPRGSVLTVAGSDSGGGAGIQADLKTVTLLGSYGSSVLTALTAQNTRGVSGIHGVPPAFVADQLDAVFSDIPVDVVKTGMLFSAETIVAIAAKLTEYRRRMVVVDPVMVAKGGANLIDRGAVSVLKERLFPLAYLVTPNIPEAERLTGANISDEESMREAARRLHRLGARNVLLKGGHLLAGDSVDILFDGAAFHRFVSPRILSKNTHGTGCTFASAIATYLAQGDPLREAIARAKRYITAAIRLAQPLGRGHGPVNHILAAEDVRDR 9 | >Q74CH0 10 | MARKVGILTGGGDCPGLNAVIRGVVKSSIIRRGWEVVGIRDGFDGLLYNNRIVPLGLNEVRGILPRGGTILGTSNRGNPFSYPVEADGKTVLTDVSDEVVANIKKQGIDALVAVGGDGSLKIALELMNKGIPVVGVPKTIDNDLMETDVTFGYNTALETATDALDKLHSTAESHHRVMIMEVMGRYAGWIALESGISGGADVILIPEIPYDISAVCRAVDERRRRGSSFSIIVVAEGAFPRGGNRVVQKRADETNTIERLGGIGQYVARQLGDCLDMDVRVMVLGHLQRGGSPSTFDRCLGSRFGVAAIDLIEQEQYGRMVCLRGRDIKSVSIERAVRKLKLVNPGGQMVTAAEELGIVVGRR 11 | >Q74GH5 12 | MDNLAAIILAAGKGTRMKSGIVKVMHPLAGAPMVAWPVAVARQAGAGRIVAVVGHQAERLREHFSNDADITLAVQEEQLGTGHAVACAAGDLSGFSGKVLILCGDVPLIRTETLRAMVTAHEATGAVLTVLTARQENPHGYGRIIRGFDGRVIRIVEEKDATPDERSRTEVNAGIYCAEASFLFDAVKRIGNDNAQGEYYLTDIITMANDRGLRCTAHPVADPVEVMGINDRVQLAEAARHARRRIAEEHMLNGVTLVDPAATYIDQGVVIGADTTIQPGVQIAGGCRVGEGCTIEAGAIIKGSELGDRCVVESRAVIRGCRLGSDVVIKAGTVMEDSTVMDHAAIGPMAHLRPGSELGAHVKIGNFVETKKIVMGEGSKASHLTYLGDATIGRNVNVGCGTITCNYDGVNKHRTVIGDDVFVGSDVQFVAPVTIGSNTLIAAGTTVTRDVPADSLAIARTPQINKEGWKLRKRDQ 13 | >Q74BH3 14 | 
>Q74BH3 14 | MKKIGILTSGGDCSGMNAAIRAAVRTAIRMNIEVVGFRKGYLGLMKGDAIPLDTKAVSGILHRGGTFLQSARSPEFKTPEGQRTALNNLKALGVEGMVVMGGDGSLTGALALNRLGLPVVGIPASIDNDIPFTDMALGVDTALNNIIYAVDCIKDTASSHARAFVIEVMGRHSGYLASISAIATGAEYALVPEREYDLAEICQQLRARYEEGRDNAIIILAEGAGHGHEIANSIKDAIGFETRVTVLGHYQRGGAPTVFDRLLASRLGKKSVELLVTGTWGVMVGLSCNAILATPLEDVIKGEKRPQDEVLRLAEVLGV -------------------------------------------------------------------------------- /upimapi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | UPIMAPI - UniProt Id Mapping through API 4 | 5 | By João Sequeira 6 | 7 | Mar 2020 8 | """ 9 | 10 | import json 11 | from argparse import ArgumentParser, ArgumentTypeError 12 | import os 13 | import sys 14 | from time import strftime, gmtime, time, sleep 15 | from subprocess import run, Popen, PIPE, check_output 16 | import requests 17 | import yaml 18 | from psutil import virtual_memory 19 | from pathlib import Path 20 | from multiprocessing import cpu_count, Pool, Manager 21 | from io import StringIO 22 | import pandas as pd 23 | import xml.etree.ElementTree as ET 24 | from tqdm import tqdm 25 | from datetime import datetime 26 | from Bio import SwissProt as SP 27 | import numpy as np 28 | from functools import partial 29 | import re 30 | 31 | __version__ = '1.13.2' 32 | 33 | 34 | def load_api_info(): 35 | return yaml.safe_load(requests.get('https://rest.uniprot.org/docs/uniprot-openapi3.yaml').text) 36 | 37 | 38 | def get_url(url, max_tries=3, **kwargs): 39 | tries = 0 40 | response = None 41 | while tries < max_tries: 42 | try: 43 | response = requests.get(url, **kwargs) 44 | if response.ok: 45 | return response 46 | except requests.RequestException: 47 | pass 48 | tries += 1 49 | sleep(5) 50 | sys.exit(f'Failed to GET {url} after {max_tries} tries' + (f' [{response.status_code}: {response.text}]' if response is not None else '')) 51 | 52 | 53 | def get_uniprot_columns(): 54 | res = get_url('https://rest.uniprot.org/configure/uniprotkb/result-fields') 55 | obj = json.loads(res.text) 56 | result = {} 57 | for i in range(len(obj)): 58 | for col in obj[i]['fields']: 59 | result[col['label']] = col['name'] 60 | return result 61 | 62 | 63 | def get_id_mapping_fields(): 64 | res = get_url('https://rest.uniprot.org/configure/idmapping/fields') 65 | obj = json.loads(res.text) 66 | froms, tos = {}, {} 67 | for group in range(len(obj['groups'])): 68 | for item in obj['groups'][group]['items']: 69 | if item['from']: 70 | froms[item['displayName']] = item['name'] 71 | if item['to']: 72 | tos[item['displayName']] = item['name'] 73 | return froms, tos 74 | 75 | 76 | api_info = load_api_info() 77 | columns_dict = get_uniprot_columns() 78 | from_fields, to_fields = get_id_mapping_fields() 79 | 80 | 81 | def get_arguments(): 82 | parser = ArgumentParser(description="UniProt Id Mapping through API", 83 | epilog="A tool for retrieving information from UniProt.") 84 | parser.add_argument( 85 | "-i", "--input", help="""Input filename - can be:\n 86 | \t1. a file containing a list of IDs (comma-separated values, no spaces)\n 87 | \t2. a BLAST TSV result file (requires to be specified with the --blast parameter)\n 88 | \t3. a protein FASTA file to be annotated (requires the -db parameter)\n 89 | \t4. nothing! If so, will read input from command line, and parse as CSV (id1,id2,...)""") 90 | parser.add_argument("-o", "--output", help="Folder to store outputs", default="UPIMAPI_output") 91 | parser.add_argument( 92 | "-ot", "--output-table", 93 | help="Filename of table output, where UniProt info is stored. 
If set, will override 'output' parameter " 94 | "just for that specific file") 95 | parser.add_argument( 96 | "-rd", "--resources-directory", default=os.path.expanduser("~/upimapi_resources"), 97 | help="Directory to store resources of UPIMAPI [~/upimapi_resources]") 98 | parser.add_argument( 99 | "-cols", "--columns", default=None, help="List of UniProt columns to obtain information from (separated by &)") 100 | parser.add_argument( 101 | "--from-db", default="UniProtKB AC/ID", choices=from_fields.keys(), 102 | help="Which database are the IDs from. If from UniProt, default is fine [UniProtKB AC/ID]") 103 | parser.add_argument( 104 | "--to-db", default="UniProtKB", choices=to_fields.keys(), 105 | help="To which database the IDs should be mapped. If only interested in columns information " 106 | "(which include cross-references), default is fine [UniProtKB]") 107 | parser.add_argument( 108 | "--blast", action="store_true", default=False, 109 | help="If input file is in BLAST TSV format (will consider one ID per line if not set) [false]") 110 | parser.add_argument( 111 | "--full-id", type=str2bool, default="auto", help="If IDs in database are in 'full' format: tr|XXX|XXX [auto]") 112 | parser.add_argument( 113 | "--fasta", help="Output will be generated in FASTA format [false]", action="store_true", default=False) 114 | parser.add_argument( 115 | "--step", type=int, default=1000, help="How many IDs to submit per request to the API [1000]") 116 | parser.add_argument( 117 | "--max-tries", default=3, type=int, 118 | help="How many times to try obtaining information from UniProt before giving up [3]") 119 | parser.add_argument("--sleep", default=3, type=int, help="Time between requests (in seconds) [3]") 120 | parser.add_argument( 121 | "--no-annotation", action="store_true", default=False, 122 | help="Do not perform annotation - input must be in one of BLAST result or TXT IDs file or STDIN [false]") 123 | parser.add_argument( 124 | "--local-id-mapping", action="store_true", default=False, 125 | help="Perform local ID mapping of SwissProt IDs. Advisable if many IDs of SwissProt are present [false]") 126 | parser.add_argument( 127 | "--skip-id-mapping", action="store_true", default=False, 128 | help="If true, UPIMAPI will not perform ID mapping [false]") 129 | parser.add_argument( 130 | "--skip-id-checking", action="store_true", default=False, 131 | help="If true, UPIMAPI will not check if IDs are valid before mapping [false]") 132 | parser.add_argument( 133 | "--skip-db-check", action="store_true", default=False, 134 | help="So UPIMAPI doesn't check for (FASTA) database existence [false]") 135 | parser.add_argument( 136 | "--mirror", choices=['expasy', 'uniprot', 'ebi'], default='expasy', 137 | help="From where to download UniProt database [expasy]") 138 | parser.add_argument('-v', '--version', action='version', version=f'UPIMAPI {__version__}') 139 | 140 | diamond_args = parser.add_argument_group('DIAMOND arguments') 141 | diamond_args.add_argument( 142 | "-db", "--database", default='uniprot', 143 | help="How the reference database is inputted to UPIMAPI.\n" 144 | "\t1. uniprot - UPIMAPI will download the entire UniProt and use it as reference\n" 145 | "\t2. swissprot - UPIMAPI will download SwissProt and use it as reference\n" 146 | "\t3. taxids - Reference proteomes will be downloaded for the taxa specified with the --taxids, and those " 147 | "will be used as reference\n" 148 | "\t4. 
a custom database - Input will be considered as the database, and will be used as reference") 149 | diamond_args.add_argument( 150 | "-t", "--threads", type=int, default=cpu_count(), 151 | help="Number of threads to use in annotation steps [all available]") 152 | diamond_args.add_argument( 153 | "--evalue", type=float, default=1e-3, help="Maximum e-value to report annotations for [1e-3]") 154 | diamond_args.add_argument( 155 | "--pident", type=float, default=None, help="Minimum pident to report annotations for.") 156 | diamond_args.add_argument( 157 | "--bitscore", type=float, default=None, help="Minimum bit score to report annotations for (overrides e-value).") 158 | diamond_args.add_argument( 159 | "-mts", "--max-target-seqs", type=int, default=1, 160 | help="Number of annotations to output per sequence inputted [1]") 161 | diamond_args.add_argument( 162 | "-b", "--block-size", type=float, 163 | help="Billions of sequence letters to be processed at a time [memory / 20]") 164 | diamond_args.add_argument( 165 | "-c", "--index-chunks", type=int, 166 | help="Number of chunks for processing the seed index [dependent on block size]") 167 | diamond_args.add_argument( 168 | "--max-memory", type=float, default=virtual_memory().available / (1024.0 ** 3), 169 | help="Maximum memory to use (in Gb) [all available]") 170 | diamond_args.add_argument( 171 | "--taxids", default=None, help="Tax IDs to obtain protein sequences of, for building a reference database.") 172 | diamond_args.add_argument( 173 | '--diamond-mode', help="Mode to run DIAMOND with [fast]", default='fast', 174 | choices=['fast', 'mid_sensitive', 'sensitive', 'more_sensitive', 'very_sensitive', 'ultra_sensitive']) 175 | 176 | special_functions = parser.add_argument_group('Special functions') 177 | special_functions.add_argument( 178 | "--show-available-fields", action="store_true", default=False, 179 | help="Outputs the fields available from the API.") 180 | 181 | args = parser.parse_args() 182 | if args.show_available_fields: 183 | sys.exit('\n'.join(columns_dict.keys())) 184 | 185 | args.output = args.output.rstrip('/') 186 | args.resources_directory = args.resources_directory.rstrip('/') 187 | args.columns = args.columns.split('&') if args.columns else None 188 | 189 | columns_fine = True 190 | if args.columns: 191 | for col in args.columns: 192 | if col not in columns_dict.keys() and not col.startswith('Taxonomic lineage'): 193 | print( 194 | f'ERR: [{col}] is not a valid column name for ID mapping. 
For more information, check ' 195 | f'https://github.com/iquasere/UPIMAPI/tree/master#sometimes-the-return-fields-are-not-properly-updated') 196 | columns_fine = False 197 | if not columns_fine: 198 | sys.exit(1) 199 | if args.taxids is not None: 200 | args.taxids = args.taxids.split(',') 201 | return args 202 | 203 | 204 | def timed_message(message): 205 | print(f'[{strftime("%Y-%m-%d %H:%M:%S", gmtime())}] {message}') 206 | 207 | 208 | def human_time(seconds): 209 | days = round(seconds // 86400) 210 | if days > 0: 211 | return strftime(f"{days}d%Hh%Mm%Ss", gmtime(seconds)) 212 | return strftime("%Hh%Mm%Ss", gmtime(seconds)) 213 | 214 | 215 | def str2bool(v): 216 | if v.lower() == 'auto': 217 | return 'auto' 218 | elif v.lower() in ('yes', 'true', 't', 'y', '1'): 219 | return True 220 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 221 | return False 222 | else: 223 | raise ArgumentTypeError('Boolean value expected.') 224 | 225 | 226 | def get_fasta_ids(filename): 227 | return [line[1:-1] for line in open(filename) if line.startswith('>')] 228 | 229 | 230 | def parse_blast(blast): 231 | result = pd.read_csv(blast, sep='\t', header=None) 232 | result.columns = [ 233 | 'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 234 | 'bitscore'] 235 | return result 236 | 237 | 238 | def string4mapping(columns=None): 239 | if columns is None or columns == []: # if no columns are inputted, UPIMAPI uses defaults 240 | return None 241 | valid_columns = [column for column in columns if column in columns_dict.keys()] 242 | invalid_columns = [column for column in columns if column not in columns_dict.keys()] 243 | for col in invalid_columns: 244 | print(f'WARNING: "{col}" is not a valid column name. ' 245 | f'Check https://www.uniprot.org/help/return_fields (Label* column) for valid column names ' 246 | f'or raise an issue at https://github.com/iquasere/UPIMAPI/issues') 247 | return ','.join([columns_dict[column] for column in valid_columns]) 248 | 249 | 250 | def parallelize(data, func, num_of_processes=8): 251 | data_split = np.array_split(data, num_of_processes) 252 | pool = Pool(num_of_processes) 253 | data = pd.concat(pool.map(func, data_split)) 254 | pool.close() 255 | pool.join() 256 | return data 257 | 258 | 259 | def run_on_subset(func, data_subset, **kwargs): 260 | return data_subset.apply(func, **kwargs) 261 | 262 | 263 | def parallelize_on_rows(data, func, num_of_processes=8, **kwargs): 264 | return parallelize(data, partial(run_on_subset, func, **kwargs), num_of_processes) 265 | 266 | 267 | def uniprot_request(ids, columns=None, output_format='tsv'): 268 | """ 269 | Input: 270 | ids: list of UniProt IDs to query 271 | columns: names of UniProt columns to get info on; if None, 272 | UPIMAPI's default selection of columns is requested 273 | output_format: format of response to get ('tsv' or 'fasta') 274 | Output: 275 | Returns the text content of the response from UniProt: 276 | a TSV table of the requested fields, or FASTA sequences, 277 | depending on output_format 278 | """ 279 | fields = f'&fields={string4mapping(columns=columns)}' if output_format == 'tsv' else '' 280 | WEBSITE_API = api_info['servers'][0]['url'] 281 | resp = get_url(f"{WEBSITE_API}/uniprotkb/accessions?accessions={','.join(ids)}{fields}&format={output_format}") 282 | return resp.text 283 | 284 | 285 | def split_list(a, n): 286 | k, m = divmod(len(a), n) 287 | return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)) 288 | 289 | 290 | 
def submit_id_mapping(from_db, to_db, ids): 291 | """ 292 | Get info from one database to the other 293 | :param from_db: Available options at https://rest.uniprot.org/configure/idmapping/fields 294 | :param to_db: Available options at https://rest.uniprot.org/configure/idmapping/fields 295 | :param ids: 296 | :return: 297 | """ 298 | data = {"from": from_fields[from_db], "to": to_fields[to_db], "ids": ids} 299 | r = requests.post(f"{api_info['servers'][0]['url']}/idmapping/run", data=data) 300 | r.raise_for_status() 301 | return r.json()["jobId"] 302 | 303 | 304 | def get_id_mapping_results(job_id): 305 | while True: 306 | r = get_url(f"{api_info['servers'][0]['url']}/idmapping/status/{job_id}") 307 | job = r.json() 308 | if "jobStatus" in job: 309 | if job["jobStatus"] == "RUNNING": 310 | sleep(3) 311 | else: 312 | return r 313 | 314 | 315 | def basic_idmapping(ids, from_db, to_db): 316 | """ 317 | Get info from one database to the other 318 | :param ids: 319 | :param from_db: Available options at https://rest.uniprot.org/configure/idmapping/fields 320 | :param to_db: Available options at https://rest.uniprot.org/configure/idmapping/fields 321 | :return: 322 | """ 323 | job_id = submit_id_mapping(from_db, to_db, ids) 324 | r = get_id_mapping_results(job_id) 325 | result = pd.DataFrame().from_dict(r.json()["results"]) 326 | while r.links.get("next", {}).get("url"): 327 | r = get_url(r.links["next"]["url"]) 328 | result = pd.concat([result, pd.DataFrame().from_dict(r.json()["results"])]) 329 | return result 330 | 331 | 332 | def basic_idmapping_batch(ids, from_db, to_db, step=1000): 333 | """ 334 | Allows to retrieve millions of IDs at once, there seems to be some limit causing UniProt's API to fail with 335 | "Request Entity Too Large for url". 336 | :param to_db: 337 | :param from_db: 338 | :param step: 339 | :param ids: 340 | :return: 341 | """ 342 | result = pd.DataFrame() 343 | for i in tqdm(range(0, len(ids), step), desc='Getting valid UniProt IDs', ascii=' >='): 344 | done = False 345 | while not done: 346 | j = min(i + step, len(ids)) 347 | try: 348 | result = pd.concat([result, basic_idmapping(ids[i:j], from_db, to_db)]) 349 | done = True 350 | except: 351 | sleep(3) 352 | return result 353 | 354 | 355 | def basic_idmapping_multiprocess(ids, output, from_db, to_db, step=1000, threads=15): 356 | result = pd.DataFrame() 357 | ids_groups = split_list(ids, threads) 358 | with Manager() as m: 359 | with m.Pool() as p: 360 | mapping_results = p.starmap(basic_idmapping_batch, [( 361 | ids_group, from_db, to_db, step) for ids_group in ids_groups]) 362 | for res in mapping_results: 363 | result = pd.concat([result, res]) 364 | timed_message(f'{result["from"].unique().size} IDs were successfully mapped.') 365 | result.to_csv(output, sep='\t', index=False) 366 | timed_message(f'Results saved at {output}.') 367 | 368 | 369 | def get_valid_entries(ids): 370 | job_id = submit_id_mapping("UniProtKB AC/ID", "UniProtKB", ids) 371 | r = get_id_mapping_results(job_id) 372 | valid_entries = [res["from"] for res in r.json()["results"] if '_' not in res["from"]] 373 | while r.links.get("next", {}).get("url"): 374 | r = get_url(r.links["next"]["url"]) 375 | valid_entries += [res["from"] for res in r.json()["results"] if '_' not in res["from"]] 376 | return valid_entries 377 | 378 | 379 | def get_valid_entries_batch(ids, step=1000): 380 | """ 381 | Allows to retrieve millions of IDs at once, there seems to be some limit causing UniProt's API to fail with 382 | "Request Entity Too Large for url". 
383 | :param step: 384 | :param ids: 385 | :return: 386 | """ 387 | valid_entries = [] 388 | for i in tqdm(range(0, len(ids), step), desc='Getting valid UniProt IDs', ascii=' >='): 389 | done = False 390 | while not done: 391 | j = min(i + step, len(ids)) 392 | try: 393 | valid_entries += get_valid_entries(ids[i:j]) 394 | done = True 395 | except: 396 | sleep(3) 397 | return valid_entries 398 | 399 | 400 | def get_valid_entries_multiprocess(ids, step=1000, threads=15): 401 | valid_entries = [] 402 | ids_groups = split_list(ids, threads) 403 | with Manager() as m: 404 | with m.Pool() as p: 405 | result = p.starmap(get_valid_entries_batch, [(ids_group, step) for ids_group in ids_groups]) 406 | for res in result: 407 | valid_entries += res 408 | not_valid = [ide for ide in ids if ide not in valid_entries] 409 | # take, from the valid IDs, the part after the dot, as this invalidates them 410 | valid_entries = [entry.split('.')[0] for entry in valid_entries] 411 | timed_message(f'{len(valid_entries)} UniProt IDs identified as valid.') 412 | return valid_entries, not_valid 413 | 414 | 415 | def get_uniprot_information(ids, step=1000, sleep_time=30, columns=None, max_tries=3): 416 | """ 417 | Input: 418 | ids: list of UniProt IDs to query 419 | step: INT, number of IDs to send per request 420 | sleep_time: INT, number of seconds to wait between requests 421 | columns: list - names of UniProt columns to get info on 422 | Output: 423 | pd.DataFrame will be returned with the information about the IDs queried. 424 | """ 425 | result = pd.DataFrame() 426 | for i in tqdm(range(0, len(ids), step), desc=f'Retrieving UniProt information from {len(ids)} IDs'): 427 | tries = 0 428 | done = False 429 | j = min(i + step, len(ids)) 430 | while not done and tries < max_tries: 431 | try: 432 | data = uniprot_request(ids[i:j], columns=columns) 433 | if len(data) > 0: 434 | uniprotinfo = pd.read_csv(StringIO(data), sep='\t') 435 | result = pd.concat([result, uniprotinfo[uniprotinfo.columns.tolist()]]) 436 | sleep(sleep_time) 437 | done = True 438 | except ConnectionError: 439 | print(f'ID mapping failed. Remaining tries: {max_tries - tries}') 440 | tries += 1 441 | sleep(10) 442 | return result 443 | 444 | 445 | def get_uniprot_fasta(ids, step=1000, sleep_time=30): 446 | """ 447 | Input: 448 | ids: list of UniProt IDs to query 449 | step: INT, number of IDs to send per request 450 | sleep_time: INT, number of seconds to wait between requests 451 | Output: 452 | str object containing the fasta sequences and headers 453 | of the proteis belonging to the IDs queried will be returned 454 | """ 455 | result = '' 456 | for i in tqdm(range(0, len(ids), step), desc=f"Building FASTA from {len(ids)} IDs."): 457 | j = min(i + step, len(ids)) 458 | data = uniprot_request(ids[i:j], output_format='fasta') 459 | if len(data) > 0: 460 | result += data 461 | sleep(sleep_time) 462 | return result 463 | 464 | 465 | def uniprot_fasta_workflow(all_ids, output, max_iter=5, step=1000, sleep_time=10): 466 | if os.path.isfile(output): 467 | print(f'{output} was found. Will perform mapping for the remaining IDs.') 468 | ids_done = get_fasta_ids(output) 469 | else: 470 | print(f'{output} not found. 
Will perform mapping for all IDs.') 471 | ids_done = [] 472 | ids_missing = list(set(all_ids) - set(ids_done)) 473 | 474 | tries = 0 475 | ids_done = ([ide.split('|')[1] for ide in get_fasta_ids(output)] if os.path.isfile(output) else []) 476 | while len(ids_done) < len(all_ids) and tries < max_iter: 477 | ids_missing = list(set([ide for ide in tqdm(all_ids, desc='Checking which IDs are missing information.') 478 | if ide not in ids_done])) 479 | print(f'Information already gathered for {int(len(ids_done) / 2)} ids. Still missing for {len(ids_missing)}.') 480 | uniprotinfo = get_uniprot_fasta(ids_missing, step=step, sleep_time=sleep_time) 481 | with open(output, 'a') as file: 482 | file.write(uniprotinfo) 483 | ids_done = [ide.split('|')[1] for ide in get_fasta_ids(output)] 484 | tries += 1 485 | if len(ids_done) == len(all_ids): 486 | print(f'Results for all IDs are available at {output}') 487 | else: 488 | ids_unmapped_output = f"{'/'.join(output.split('/')[:-1])}/ids_unmapped.txt" 489 | handler = open(ids_unmapped_output, 'w') 490 | handler.write('\n'.join(ids_missing)) 491 | print(f'Maximum iterations were made. Results related to {str(len(ids_missing))} IDs were not obtained. ' 492 | f'IDs with missing information are available at {ids_unmapped_output} and information obtained is ' 493 | f'available at {output}') 494 | 495 | 496 | def check_ids_already_done(output, ids): 497 | if os.path.isfile(output) and os.stat(output).st_size > 1: 498 | try: 499 | result = pd.read_csv(output, sep='\t', low_memory=False).drop_duplicates() 500 | print(f'{output} was found. Will perform mapping for the remaining IDs.') 501 | if 'Entry Name' not in result.columns: 502 | result['Entry Name'] = [np.nan] * len(result) 503 | ids_done = result['Entry'].unique().tolist() + result['Entry Name'].unique().tolist() 504 | except OSError: # file doesn't exist or is empty 505 | print(f'{output} was found. However, it could not be parsed. Will restart mapping.') 506 | result = pd.DataFrame() 507 | ids_done = [] 508 | else: 509 | print(f'{output} not found or empty. 
Will perform mapping for all IDs.') 510 | result = pd.DataFrame() 511 | ids_done = [] 512 | ids_missing = list(set(ids) - set(ids_done)) 513 | print(f'IDs present in uniprotinfo file: {int(len(ids_done) / 2)}') # entry and entry name count by 2 514 | print(f'IDs missing: {len(ids_missing)}') 515 | return ids_done, ids_missing, result 516 | 517 | 518 | def select_columns(columns): 519 | """ 520 | :param columns: list - of columns to retrieve information from, including taxonomic columns 521 | :return: new_cols: list - of columns to retrieve information from, without taxonomic columns added by UPIMAPI 522 | :return: tax_cols: list - of taxonomic columns to retrieve information from 523 | :return: taxids_cols: list - of taxid columns to retrieve information from 524 | """ 525 | if columns is None: 526 | columns = [ # default columns of UPIMAPI 527 | 'Entry', 'Entry Name', 'Organism', 'Organism (ID)', 'Taxonomic lineage', 'Taxonomic lineage (Ids)', 528 | 'Gene Names', 'Protein names', 'EC number', 'Function [CC]', 'Pathway', 'Keywords', 529 | 'Protein existence', 'Gene Ontology (GO)', 'Protein families', 'BRENDA', 'BioCyc', 'CDD', 'eggNOG', 530 | 'Ensembl', 'InterPro', 'KEGG', 'Pfam', 'Reactome', 'RefSeq', 'UniPathway', 531 | 'Taxonomic lineage (SUPERKINGDOM)', 'Taxonomic lineage (PHYLUM)', 'Taxonomic lineage (CLASS)', 532 | 'Taxonomic lineage (ORDER)', 'Taxonomic lineage (FAMILY)', 'Taxonomic lineage (GENUS)', 533 | 'Taxonomic lineage (SPECIES)', 'Taxonomic lineage IDs (SPECIES)'] 534 | tax_cols = [col for col in columns if ('Taxonomic lineage (' in col and col not in [ 535 | 'Taxonomic lineage (SPECIES)', 'Taxonomic lineage (Ids)'])] 536 | taxids_cols = [col for col in columns if ( 537 | 'Taxonomic lineage IDs (' in col and col not in 'Taxonomic lineage IDs (SPECIES)')] 538 | for col in ['Entry Name', 'Entry']: 539 | if col not in columns: 540 | columns.insert(0, col) 541 | new_cols = [col for col in columns if col not in tax_cols + taxids_cols + [ 542 | 'Taxonomic lineage (SPECIES)', 'Taxonomic lineage IDs (SPECIES)']] 543 | col_conversion = {'Organism': 'Taxonomic lineage (SPECIES)', 'Organism (ID)': 'Taxonomic lineage IDs (SPECIES)'} 544 | for k, v in col_conversion.items(): 545 | if v in columns and k not in new_cols: 546 | new_cols.append(k) 547 | conditions = { 548 | 'len(tax_cols) > 0 and "Taxonomic lineage" not in new_cols': 'Taxonomic lineage', 549 | 'len(taxids_cols) > 0 and "Taxonomic lineage (Ids)" not in new_cols': 'Taxonomic lineage (Ids)', 550 | '"Taxonomic lineage (SPECIES)" in columns and "Organism" not in new_cols': 'Organism', 551 | '"Taxonomic lineage IDs (SPECIES)" in columns and "Organism (ID)" not in new_cols': 'Organism (ID)'} 552 | for cond, col in conditions.items(): # check if cond (key) is True, then append or not the col (value) 553 | if eval(cond): 554 | new_cols.append(col) 555 | for col in ['Entry Name', 'Entry']: # UPIMAPI requires these two columns to be present 556 | if col not in new_cols: 557 | new_cols.insert(0, col) 558 | return columns, new_cols, tax_cols, taxids_cols 559 | 560 | 561 | def make_taxonomic_lineage_df(tax_lineage_col, prefix='Taxonomic lineage IDs'): 562 | """ 563 | Parses the taxonomic lineage column of the uniprotinfo dataframe and returns a dataframe with the taxonomic lineage 564 | separated in columns. 
559 | 
560 | 
561 | def make_taxonomic_lineage_df(tax_lineage_col, prefix='Taxonomic lineage IDs'):
562 |     """
563 |     Parses the taxonomic lineage column of the uniprotinfo dataframe and returns a dataframe with the taxonomic lineage
564 |     separated in columns.
565 |     :param tax_lineage_col: pd.Series with the taxonomic lineage column of the uniprotinfo dataframe
566 |     :param prefix: str, prefix to use for the columns of the new dataframe
567 |     :return: pd.DataFrame with the taxonomic lineage separated in columns
568 |     """
569 |     # First, split each record on every comma that follows a closing parenthesis
570 |     split_regex = r"(?<=\)),"
571 |     result = pd.DataFrame.from_records(tax_lineage_col.apply(lambda x: re.split(split_regex, x)).apply(
572 |         # Then, split each record by ' ('
573 |         lambda x: [part[:-1].split(' (') for part in x]).apply(
574 |         # Finally, build dictionary with the taxonomic level as key and the taxonomy as value. ' ('.join avoids cases
575 |         # where the taxonomy has a '(' in it (e.g. 'Clostridium scindens (strain JCM 10418 / VPI 12708) (species)')
576 |         lambda x: {part[-1]: ' ('.join(part[:-1]) for part in x if part[-1] != 'no rank'}))
577 |     # Rename columns in old UniProt fashion
578 |     result.rename(columns={col: f'{prefix} ({col.upper()})' for col in result.columns}, inplace=True)
579 |     for col in result.columns:
580 |         result[col] = result[col].str.lstrip()
581 |     return result
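# Hedged worked example of the lineage parsing above, on a string in the format of
# UniProt's 'Taxonomic lineage' field (value invented for illustration):
#     'Bacteria (superkingdom), Pseudomonadota (phylum)'
# re.split(r"(?<=\)),", ...)  ->  ['Bacteria (superkingdom)', ' Pseudomonadota (phylum)']
# part[:-1].split(' (')       ->  [['Bacteria', 'superkingdom'], [' Pseudomonadota', 'phylum']]
# dict comprehension          ->  {'superkingdom': 'Bacteria', 'phylum': ' Pseudomonadota'}
# After renaming and lstrip, this yields e.g. a 'Taxonomic lineage (SUPERKINGDOM)' column.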
582 | 
583 | 
584 | def uniprot_information_workflow(ids, output, max_iter=5, columns=None, step=1000, sleep_time=10):
585 |     ids_done, ids_missing, result = check_ids_already_done(output, ids)
586 |     tries, last_ids_missing, ids_unmapped_output = 0, None, f"{'/'.join(output.split('/')[:-1])}/ids_unmapped.txt"
587 |     columns, new_cols, tax_cols, taxids_cols = select_columns(columns)
588 |     # "columns" now holds the user's selection (or UPIMAPI's defaults); "new_cols" is what is queried from the API
589 |     uniprotinfo = pd.DataFrame()
590 |     while len(ids_missing) > 0 and tries < max_iter and ids_missing != last_ids_missing:
591 |         print(f'Information already gathered for {int(len(ids_done) / 2)} ids. Still missing for {len(ids_missing)}.')
592 |         last_ids_missing = ids_missing
593 |         info = get_uniprot_information(
594 |             ids_missing, step=step, columns=new_cols, max_tries=max_iter, sleep_time=sleep_time)
595 |         info.reset_index(inplace=True, drop=True)
596 |         if len(info) > 0:
597 |             ids_done += info['Entry'].unique().tolist() + info['Entry Name'].unique().tolist()
598 |             uniprotinfo = pd.concat([uniprotinfo, info], ignore_index=True)
599 |         ids_missing = list(set(last_ids_missing) - set(ids_done))
600 |         if len(ids_missing) > 0:
601 |             if last_ids_missing == ids_missing:
602 |                 print("Could not map additional IDs for this mapping. There were probably some outdated IDs. "
603 |                       "For more questions, please contact through https://github.com/iquasere/UPIMAPI/issues")
604 |             else:
605 |                 print('Failed to retrieve information for some IDs. Retrying request.')
606 |         tries += 1
607 |     if len(uniprotinfo) == 0:
608 |         return result
609 |     tax_df = pd.DataFrame()
610 |     if len(tax_cols) > 0:
611 |         tax_df = make_taxonomic_lineage_df(uniprotinfo['Taxonomic lineage'], prefix='Taxonomic lineage')
612 |     if len(taxids_cols) > 0:
613 |         tax_df = pd.concat([tax_df, make_taxonomic_lineage_df(
614 |             uniprotinfo['Taxonomic lineage (Ids)'], prefix='Taxonomic lineage IDs')], axis=1)
615 |     # rename columns to old UniProt fashion if those columns are to be outputted
616 |     # then remove the original columns if they are not to be outputted
617 |     col_conversion = {'Organism': 'Taxonomic lineage (SPECIES)', 'Organism (ID)': 'Taxonomic lineage IDs (SPECIES)'}
618 |     for k, v in col_conversion.items():
619 |         if v in columns:
620 |             uniprotinfo[v] = uniprotinfo[k]
621 |             if k not in columns:
622 |                 del uniprotinfo[k]
623 |     tax_df_gut_cols = [col for col in tax_df.columns if col not in col_conversion.values()]  # don't repeat columns that were added in the previous loop
624 |     uniprotinfo = pd.concat([uniprotinfo, tax_df[tax_df_gut_cols]], axis=1)
625 |     result = pd.concat([result, uniprotinfo[columns]], ignore_index=True)
626 |     if len(ids_missing) == 0:
627 |         print(f'Results for all IDs are available at {output}')
628 |     else:
629 |         Path(ids_unmapped_output).write_text('\n'.join(ids_missing))
630 |         print(f"Maximum iterations were made. Results related to {len(ids_missing)} IDs were not obtained. "
631 |               f"IDs with missing information are available at {ids_unmapped_output} and information obtained is "
632 |               f"available at {output}")
633 |     return result
634 | 
635 | 
636 | def determine_full_id(ids):
637 |     for ide in ids:
638 |         if '|' in ide:
639 |             return True
640 |     return False
641 | 
642 | 
643 | def parse_fasta(file):
644 |     with open(file) as f:
645 |         lines = [line.rstrip('\n') for line in f]
646 |     sequences = {}
647 |     name = None
648 |     for line in lines:
649 |         if line.startswith('>'):
650 |             name = line[1:]
651 |             sequences[name] = ''
652 |         elif name is not None:
653 |             # sequence lines are appended to the current header; anything
654 |             # before the first header is skipped, so the parse cannot stall
655 |             sequences[name] += line
656 |     return sequences
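# Hedged sketch of parse_fasta on a two-record FASTA (contents invented):
#     >sp|P0A796|PFKA_ECOLI
#     MIKKIGVLTSGGDAPG
#     MVNAVRA
#     >sp|P0A6F3|GLPK_ECOLI
#     MSEKKYIVALDQGTTSSR
# parse_fasta(...) -> {'sp|P0A796|PFKA_ECOLI': 'MIKKIGVLTSGGDAPGMVNAVRA',
#                      'sp|P0A6F3|GLPK_ECOLI': 'MSEKKYIVALDQGTTSSR'}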
657 | 
658 | 
659 | def get_ids(args_input, input_type, full_id='auto'):
660 |     if args_input.endswith(('.zip', '.tar', '.gz', '.bz2')):
661 |         exit('File seems to be compressed! If not, please change its extension.')
662 |     if input_type == 'blast':
663 |         ids = parse_blast(args_input)['sseqid'].tolist()
664 |     elif input_type == 'txt':
665 |         ids = []
666 |         with open(args_input) as f:
667 |             preids = f.read().split('\n')
668 |         for preid in preids:
669 |             ids += preid.split(',')
670 |     elif input_type == 'fasta':
671 |         ids = list(parse_fasta(args_input).keys())
672 |     else:  # if PIPE
673 |         ids = args_input.split(',')
674 |     if full_id == 'auto':
675 |         full_id = determine_full_id(ids)
676 |         print(f'Auto determined "full id" as: {full_id}')
677 |     if full_id:
678 |         return_ids = [ide.split('|')[1] for ide in ids if ide not in ['*', '']]
679 |         sp_ids = [ide.split('|')[1] for ide in ids if ide.startswith('sp')]
680 |         return return_ids, full_id, sp_ids
681 |     return_ids = [ide for ide in ids if ide not in ['*', '']]
682 |     return return_ids, full_id, return_ids
683 |     # the second return_ids is just a mock, so the output has the same shape as in the sp_ids case
684 | 
685 | 
686 | def run_command(bash_command, print_message=True):
687 |     if print_message:
688 |         print(bash_command)
689 |     run(bash_command.split(), check=True)
690 | 
691 | 
692 | def run_pipe_command(bash_command, output='', mode='w', print_message=True):
693 |     if print_message:
694 |         print(bash_command)
695 |     if output == '':
696 |         Popen(bash_command, stdin=PIPE, shell=True).communicate()
697 |     elif output == 'PIPE':
698 |         return Popen(bash_command, stdin=PIPE, shell=True,
699 |                      stdout=PIPE).communicate()[0].decode('utf8')
700 |     else:
701 |         with open(output, mode) as output_file:
702 |             Popen(bash_command, stdin=PIPE, shell=True, stdout=output_file).communicate()
703 | 
704 | 
705 | def make_diamond_database(fasta, dmnd):
706 |     run_command(f'diamond makedb --in {fasta} -d {dmnd}')
707 | 
708 | 
709 | def block_size_and_index_chunks(argsb, argsc, memory):
710 |     if argsb:
711 |         b = argsb
712 |     else:
713 |         b = memory / 20  # DIAMOND's block size (-b): roughly 1/20th of the available memory, in GB
714 |     if argsc:
715 |         return b, argsc
716 |     if b > 3:
717 |         return b, 1
718 |     if b > 2:
719 |         return b, 2
720 |     if b > 1:
721 |         return b, 3
722 |     return b, 4
723 | 
724 | 
725 | def run_diamond(query, aligned, unaligned, database, threads=12, max_target_seqs=50, b=1, c=4, e_value=0.01,
726 |                 bit_score=None, pident=None, mode='fast'):
727 |     command = (
728 |         f"diamond blastp --query {query} --out {aligned} --un {unaligned} --db {database} --outfmt 6 --unal 1 "
729 |         f"--threads {threads} --max-target-seqs {max_target_seqs} -b {b} -c {c} --evalue {e_value} "
730 |         f"--{mode.replace('_', '-')}")
731 |     if bit_score:
732 |         command += f' --min-score {bit_score}'
733 |     if pident:
734 |         command += f' --id {pident}'
735 |     run_command(command)
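# Hedged worked example for the sizing heuristic above (values illustrative): with
# 64 GB of memory and no explicit -b/-c, b = 64 / 20 = 3.2; since b > 3, one index
# chunk is used, so DIAMOND would be called with '-b 3.2 -c 1'.
#     >>> block_size_and_index_chunks(None, None, 64)
#     (3.2, 1)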
736 | 
737 | 
738 | def get_proteome_for_taxid_slow(taxid, max_tries=3):
739 |     """
740 |     Get proteome for taxid the "proper" way, following pagination. It is very slow, though, so it is not used.
741 |     :param taxid: int or str - NCBI taxonomy ID to fetch the proteome of
742 |     :param max_tries: int - number of times to retry a failed page request
743 |     :return: str - the proteome in FASTA format
744 |     """
745 |     tries = 0
746 |     res = requests.get(f'https://rest.uniprot.org/uniprotkb/search?format=fasta&query=%28taxonomy_id%3A{taxid}%29')
747 |     result = res.content.decode('utf8')
748 |     pages = 0
749 |     while tries < max_tries and res.links != {}:
750 |         try:
751 |             res = requests.get(res.links['next']['url'])
752 |             result += res.content.decode('utf8')
753 |             pages += 1
754 |         except requests.RequestException:
755 |             tries += 1
756 |             sleep(10)
757 |     return result
758 | 
759 | 
760 | def get_proteome_for_taxid(taxid, max_tries=3):
761 |     tries = 0
762 |     url = f'https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=taxonomy_id:{taxid}'
763 |     while tries < max_tries:
764 |         try:
765 |             return requests.get(url).content.decode('utf8')
766 |         except requests.RequestException:
767 |             print(f'Failed! {max_tries - tries - 1} tries remaining.')
768 |             tries += 1
769 |             sleep(10)
770 |     return ''  # all tries failed; return an empty string so callers can still write their output
771 | 
772 | def local_uniprot_is_outdated(local_reldate_file):
773 |     local = open(local_reldate_file).readlines()
774 |     [sp_date, tr_date] = [datetime.strptime(local[i][:-1].split()[-1], '%d-%b-%Y') for i in [1, 2]]
775 |     current = requests.get("https://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/complete/reldate.txt"
776 |                            ).content.decode('utf8').split('\n')
777 |     [c_sp_date, c_tr_date] = [datetime.strptime(current[i][:-1].split()[-1], '%d-%b-%Y') for i in [1, 2]]
778 |     return c_sp_date > sp_date or c_tr_date > tr_date  # the local copy is outdated when the current release is newer
779 | 
780 | 
781 | def download_with_progress_bar(url, output_folder):
782 |     # Streaming, so we can iterate over the response.
783 |     response = requests.get(url, stream=True)
784 |     total_size_in_bytes = int(response.headers.get('content-length', 0))
785 |     block_size = 102400  # 100 Kibibytes
786 |     progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True, desc=f'Downloading {url.split("/")[-1]}')
787 |     with open(f'{output_folder}/{url.split("/")[-1]}', 'wb') as file:
788 |         for data in response.iter_content(block_size):
789 |             progress_bar.update(len(data))
790 |             file.write(data)
791 |     progress_bar.close()
792 | 
793 | 
794 | def download_uniprot(output_folder, mirror='expasy'):
795 |     base_urls = {
796 |         'expasy': 'https://ftp.expasy.org',
797 |         'uniprot': 'https://ftp.uniprot.org/pub',
798 |         'ebi': 'https://ftp.ebi.ac.uk/pub'
799 |     }
800 |     for file in ["uniprot_sprot.fasta.gz", "uniprot_trembl.fasta.gz", "reldate.txt"]:
801 |         print(f'Downloading and writing: {file}')
802 |         download_with_progress_bar(
803 |             f'{base_urls[mirror]}/databases/uniprot/current_release/knowledgebase/complete/{file}', output_folder)
804 |     run_pipe_command(f'zcat {output_folder}/uniprot_trembl.fasta.gz {output_folder}/uniprot_sprot.fasta.gz > '
805 |                      f'{output_folder}/uniprot.fasta')
806 |     for file in [f'{output_folder}/uniprot_trembl.fasta.gz', f'{output_folder}/uniprot_sprot.fasta.gz']:
807 |         os.remove(file)
808 | 
809 | 
810 | def build_reference_database(database, output_folder, taxids=None, max_tries=3, mirror='expasy'):
811 |     if database == 'uniprot':
812 |         download_uniprot(output_folder, mirror=mirror)
813 |     elif database == 'swissprot':
814 |         download_with_progress_bar(
815 |             "https://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/complete/uniprot_sprot.fasta.gz",
816 |             output_folder)
817 |         run_command(f'gunzip {output_folder}/uniprot_sprot.fasta.gz')
818 |     elif database == 'taxids':
819 |         for taxid in tqdm(taxids, desc=f'Retrieving reference proteomes for {len(taxids)} taxa from UniProt.'):
820 |             with open(f'{output_folder}/taxids_database.fasta', 'a') as f:
821 |                 f.write(get_proteome_for_taxid(taxid, max_tries=max_tries))
822 | 
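# Hedged usage sketch for the 'taxids' database build above (83333 is the taxid of
# Escherichia coli K-12, used purely for illustration):
#     >>> fasta = get_proteome_for_taxid(83333)   # doctest: +SKIP
#     >>> fasta.startswith('>')   # a successful download starts with a FASTA header
#     True
# UniProt's 'stream' endpoint returns the whole result set in a single response,
# which is why it is preferred here over the paginated 'search' endpoint.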
823 | 
824 | def must_build_database(database, resources_folder):
825 |     db2suffix = {'uniprot': 'uniprot.fasta', 'swissprot': 'uniprot_sprot.fasta', 'taxids': 'taxids_database.fasta'}
826 |     if database in db2suffix.keys():
827 |         if os.path.isfile(f'{resources_folder}/{db2suffix[database]}'):
828 |             return str2bool(input(f'{resources_folder}/{db2suffix[database]} exists. Overwrite? [Y/N] '))
829 |     return True
830 | 
831 | 
832 | def get_tabular_taxonomy(output):
833 |     res = requests.get('https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/taxonomy.rdf.xz')
834 |     with open('taxonomy.rdf.xz', 'wb') as f:
835 |         f.write(res.content)
836 |     run_command('unxz taxonomy.rdf.xz')
837 |     print('Reading RDF taxonomy')
838 |     root = ET.parse('taxonomy.rdf').getroot()
839 |     elems = root.findall('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description')
840 |     with open(output, 'w') as f:
841 |         f.write('\t'.join(['taxid', 'name', 'rank', 'parent_taxid']) + '\n')
842 |         for elem in tqdm(elems, desc='Converting XML taxonomy.rdf to TSV format'):
843 |             info = [elem.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about').split('/')[-1]]
844 |             scientific_name = elem.find('{http://purl.uniprot.org/core/}scientificName')
845 |             info.append(scientific_name.text if scientific_name is not None else '')
846 |             rank_elem = elem.find('{http://purl.uniprot.org/core/}rank')
847 |             info.append(rank_elem.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource').split('/')[-1]
848 |                         if rank_elem is not None else '')
849 |             upper_taxon = elem.find('{http://www.w3.org/2000/01/rdf-schema#}subClassOf')
850 |             info.append(upper_taxon.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource').split('/')[-1]
851 |                         if upper_taxon is not None else '')
852 |             f.write('\t'.join(info) + '\n')
853 | 
854 | 
855 | def get_match_id(record, ids):
856 |     if record.entry_name in ids:
857 |         return record.entry_name
858 |     if record.accessions[0] in ids:
859 |         return record.accessions[0]
860 |     return None
861 | 
862 | 
863 | def count_on_file(expression, file, compressed=False):
864 |     return int(check_output(f"{'zgrep' if compressed else 'grep'} -c '{expression}' {file}", shell=True))
865 | 
866 | 
867 | def get_local_swissprot_data(sp_dat_filename, ids):
868 |     sp_dat = SP.parse(open(sp_dat_filename))
869 |     result, ids_found = [], []
870 |     i = 1
871 |     record = next(sp_dat)
872 |     number_of_entries = count_on_file('Reviewed;', sp_dat_filename)
873 |     while record is not None and len(ids_found) < len(ids):  # stop early once every queried ID has been matched
874 |         match_id = get_match_id(record, ids)
875 |         if match_id is not None:
876 |             result.append(record.__dict__)
877 |             ids_found.append(match_id)
878 |         if i % 100000 == 0:
879 |             print(f'[{i}/{number_of_entries}] SwissProt entries queried')
880 |         record = next(sp_dat, None)
881 |         i += 1
882 |     print(f'[{i}/{number_of_entries}] SwissProt entries queried')
883 |     return pd.DataFrame(result), ids_found
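# Hedged sketch: get_local_swissprot_data streams uniprot_sprot.dat with Biopython's
# SwissProt parser and keeps records whose entry name or primary accession is in
# 'ids'; count_on_file supplies the progress total by shelling out to grep -c.
#     >>> sp_data, found = get_local_swissprot_data('uniprot_sprot.dat', ['P12345'])   # doctest: +SKIP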
"Craniata") 896 | for i in range(len(match)): 897 | rank, taxid = match.iloc[i][["rank", "taxid"]] 898 | if type(rank) == str: 899 | l2c_result[f'Taxonomic lineage ({rank.upper()})'] = taxon 900 | l2c_taxids[f'Taxonomic identifier ({rank.upper()})'] = taxid 901 | l2c_result['Taxonomic lineage (ALL)'] = ', '.join(set(l2c_result.values())) 902 | l2c_taxids['Taxonomic identifier (ALL)'] = ', '.join(set(l2c_taxids.values())) 903 | l2c_result = {**l2c_result, **l2c_taxids, 'index': lineage} 904 | return l2c_result 905 | 906 | 907 | def lineages_to_columns(lineages, tax_tsv): 908 | """ 909 | Does the same as lineage_to_columns, but to all lineages, instead of a single one 910 | :param lineages: 911 | :param tax_tsv: 912 | :return: 913 | """ 914 | return [lineage_to_columns(lineage, tax_tsv) for lineage in lineages] 915 | 916 | 917 | def get_upper_taxids(taxid, tax_df): 918 | """ 919 | :param taxid: str - taxID to get upper taxIDs from 920 | :param tax_df: pd.DataFrame - of read taxonomy.tsv (from taxonomy.rdf) 921 | :returns list - of upper taxIDs 922 | """ 923 | if taxid == '0': 924 | return [] 925 | taxids = [] 926 | while taxid != '1' and taxid != 'Taxon': 927 | taxids.append(taxid) 928 | taxid = tax_df.loc[taxid]['parent_taxid'] 929 | return taxids 930 | 931 | 932 | def parse_taxonomy(data, tax_tsv_df, threads=15): 933 | tax_tsv_df.set_index('name', inplace=True) 934 | tax_tsv_df['taxid'] = tax_tsv_df['taxid'].astype(str) 935 | all_classifications = split(data['organism_classification'].drop_duplicates().tolist(), threads) 936 | with Manager() as m: 937 | with m.Pool() as p: 938 | result = p.starmap(lineages_to_columns, [(classifications, tax_tsv_df) 939 | for classifications in all_classifications]) 940 | decompacted = [] 941 | for res in result: 942 | decompacted += res 943 | return pd.DataFrame(decompacted).set_index('index') 944 | 945 | 946 | def parse_comments(sp_data): 947 | result = [] 948 | bpc_list = [] 949 | for comments in sp_data['comments']: 950 | partial = {key: '' for key in [ 951 | 'FUNCTION', 'SUBUNIT', 'INTERACTION', 'SUBCELLULAR LOCATION', 'ALTERNATIVE PRODUCTS', 'TISSUE SPECIFICITY', 952 | 'PTM', 'POLYMORPHISM', 'DISEASE', 'MISCELLANEOUS', 'SIMILARITY', 'CAUTION', 'SEQUENCE CAUTION', 953 | 'WEB RESOURCE', 'MASS SPECTROMETRY', 'RNA EDITING', 'CATALYTIC ACTIVITY', 'COFACTOR', 'ACTIVITY REGULATION', 954 | 'PATHWAY', 'DEVELOPMENTAL STAGE', 'INDUCTION', 'ALLERGEN', 'BIOTECHNOLOGY', 'DISRUPTION PHENOTYPE', 955 | 'PHARMACEUTICAL', 'TOXIC DOSE', 'DOMAIN']} 956 | bpc_dict = {'Kinetic parameters': []} 957 | for comment in comments: 958 | comment = comment.split(': ') 959 | if comment[0] in partial.keys(): 960 | partial[comment[0]] += f'{": ".join(comment)} ' 961 | else: 962 | if comment[0] in ['BIOPHYSICOCHEMICAL PROPERTIES']: 963 | if comment[1] not in bpc_dict.keys(): 964 | bpc_dict[comment[1]] = [f'{comment[0]}: {comment[1]}: {comment[2]}'] 965 | else: 966 | bpc_dict[comment[1]].append(f'{comment[0]}: {comment[1]}: {comment[2]}') 967 | else: 968 | print(f'Comment still not implemented: [{comment[0]}]') 969 | result.append(partial) 970 | bpc_dict['Kinetics'] = bpc_dict.pop('Kinetic parameters') 971 | bpc_list.append(bpc_dict) 972 | result = pd.DataFrame(result, columns=[ 973 | 'Function [CC]', 'Subunit structure [CC]', 'Interacts with', 'Subcellular location [CC]', 974 | 'Alternative products (isoforms)', 'Tissue specificity', 'Post-translational modification', 'Polymorphism', 975 | 'Involvement in disease', 'Miscellaneous [CC]', 'Sequence similarities', 'Caution', 'Sequence 
944 | 
945 | 
946 | def parse_comments(sp_data):
947 |     result = []
948 |     bpc_list = []
949 |     for comments in sp_data['comments']:
950 |         partial = {key: '' for key in [
951 |             'FUNCTION', 'SUBUNIT', 'INTERACTION', 'SUBCELLULAR LOCATION', 'ALTERNATIVE PRODUCTS', 'TISSUE SPECIFICITY',
952 |             'PTM', 'POLYMORPHISM', 'DISEASE', 'MISCELLANEOUS', 'SIMILARITY', 'CAUTION', 'SEQUENCE CAUTION',
953 |             'WEB RESOURCE', 'MASS SPECTROMETRY', 'RNA EDITING', 'CATALYTIC ACTIVITY', 'COFACTOR', 'ACTIVITY REGULATION',
954 |             'PATHWAY', 'DEVELOPMENTAL STAGE', 'INDUCTION', 'ALLERGEN', 'BIOTECHNOLOGY', 'DISRUPTION PHENOTYPE',
955 |             'PHARMACEUTICAL', 'TOXIC DOSE', 'DOMAIN']}
956 |         bpc_dict = {'Kinetic parameters': []}
957 |         for comment in comments:
958 |             comment = comment.split(': ')
959 |             if comment[0] in partial.keys():
960 |                 partial[comment[0]] += f'{": ".join(comment)} '
961 |             else:
962 |                 if comment[0] in ['BIOPHYSICOCHEMICAL PROPERTIES']:
963 |                     if comment[1] not in bpc_dict.keys():
964 |                         bpc_dict[comment[1]] = [f'{comment[0]}: {comment[1]}: {comment[2]}']
965 |                     else:
966 |                         bpc_dict[comment[1]].append(f'{comment[0]}: {comment[1]}: {comment[2]}')
967 |                 else:
968 |                     print(f'Comment still not implemented: [{comment[0]}]')
969 |         result.append(partial)
970 |         bpc_dict['Kinetics'] = bpc_dict.pop('Kinetic parameters')
971 |         bpc_list.append(bpc_dict)
972 |     result = pd.DataFrame(result)
973 |     result.columns = [  # rename from comment keys to old UniProt column names; order matches "partial" above
974 |         'Function [CC]', 'Subunit structure [CC]', 'Interacts with', 'Subcellular location [CC]',
975 |         'Alternative products (isoforms)', 'Tissue specificity', 'Post-translational modification', 'Polymorphism',
976 |         'Involvement in disease', 'Miscellaneous [CC]', 'Sequence similarities', 'Caution', 'Sequence caution',
977 |         'Web resources', 'Mass spectrometry', 'RNA editing', 'Catalytic activity', 'Cofactor', 'Activity regulation',
978 |         'Pathway', 'Developmental stage', 'Induction', 'Allergenic properties', 'Biotechnological use',
979 |         'Disruption phenotype', 'Pharmaceutical use', 'Toxic dose', 'Domain [CC]']
980 |     result['Erroneous gene model prediction'] = result['Sequence caution']
981 |     bpc_df = pd.DataFrame(bpc_list)
982 |     for col in bpc_df:
983 |         bpc_df[col] = bpc_df[col].apply(lambda x: '; '.join(x) if isinstance(x, list) else x)
984 |     return pd.concat([result, bpc_df], axis=1)
985 | 
986 | def add_to_dict(dictionary, key, value):
987 |     if key in dictionary.keys():
988 |         dictionary[key] += value
989 |     else:
990 |         dictionary[key] = value
991 | 
992 | 
993 | def cross_references_to_columns(cross_refs):
994 |     result = {}
995 |     go_dict = {}
996 |     go_rel = {'C': 'cellular component', 'F': 'molecular function', 'P': 'biological process'}
997 |     for ref in cross_refs:
998 |         if ref[0] == 'GO':
999 |             refie = ref[2].split(':')
1000 |             add_to_dict(go_dict, f'Gene ontology ({go_rel[refie[0]]})', f'{refie[1]} [{ref[1]}]; ')
1001 |             add_to_dict(go_dict, 'Gene ontology (GO)', f'{refie[1]} [{ref[1]}]; ')
1002 |             add_to_dict(go_dict, 'Gene ontology IDs', f'{ref[1]}; ')
1003 |         else:
1004 |             if ref[0] == 'Proteomes':
1005 |                 value = f'{ref[1]}: {ref[2]}'
1006 |             else:
1007 |                 value = f'{ref[1]};'
1008 |             add_to_dict(result, ref[0], value)
1009 |     return result, go_dict
1010 | 
1011 | 
1012 | def parse_cross_references(sp_data):
1013 |     ref_result = [cross_references_to_columns(cross_refs) for cross_refs in sp_data['cross_references']]
1014 |     ref_dict, go_dict = zip(*ref_result)
1015 |     ref_df = pd.DataFrame(ref_dict)
1016 |     ref_df.columns = map(lambda x: f'Cross-reference ({x})' if x != 'Proteomes' else x, ref_df.columns)
1017 |     go_df = pd.DataFrame(go_dict)
1018 |     ref_df = pd.concat([ref_df, go_df], axis=1)
1019 |     return ref_df
1020 | 
1021 | 
1022 | def gene_name_to_columns(genes):
1023 |     if genes == '':
1024 |         info = {}
1025 |     else:
1026 |         info = [pair.split('=') for pair in genes.rstrip(';').split('; ')]
1027 |         info = {pair[0]: pair[1] for pair in info}
1028 |     return {'Gene names': ' '.join(info.values()) if info != {} else '',
1029 |             'Gene names (ordered locus )': info['OrderedLocusNames'] if 'OrderedLocusNames' in info else '',
1030 |             'Gene names (ORF )': info['ORFNames'] if 'ORFNames' in info else '',
1031 |             'Gene names (primary )': info['Name'] if 'Name' in info else '',
1032 |             'Gene names (synonym )': info['Synonyms'] if 'Synonyms' in info else ''}
1033 | 
1034 | 
1035 | def parse_gene_names(sp_data):
1036 |     return pd.DataFrame([gene_name_to_columns(genes) for genes in sp_data['gene_name']])
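# Hedged example of gene_name_to_columns on a typical SwissProt GN line value
# (string invented for illustration):
#     >>> gene_name_to_columns('Name=thrB; Synonyms=thrB2; OrderedLocusNames=b0003;')['Gene names (primary )']
#     'thrB'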
1037 | 
1038 | 
1039 | def parse_description_text(description):
1040 |     result = {}
1041 |     parts = description[:-1].split('; ')
1042 |     i = 0
1043 |     while i < len(parts):
1044 |         parted = parts[i].split('=')
1045 |         if parted[0].startswith('RecName: Full'):
1046 |             result['RecName'] = {}
1047 |             result['RecName']['Full'] = parted[1]
1048 |             i += 1
1049 |             while i < len(parts) and ':' not in parts[i].split()[0]:
1050 |                 parted = parts[i].split('=')
1051 |                 result['RecName'][parted[0]] = parted[1]
1052 |                 i += 1
1053 |         elif parted[0].startswith('AltName: Full'):
1054 |             if 'AltName' not in result.keys():
1055 |                 result['AltName'] = []
1056 |             altname = {'Full': parted[1]}
1057 |             i += 1
1058 |             while i < len(parts) and ':' not in parts[i].split()[0]:
1059 |                 parted = parts[i].split('=')
1060 |                 altname[parted[0]] = parted[1]
1061 |                 i += 1
1062 |             result['AltName'].append(altname)
1063 |         elif parted[0].startswith('Contains: RecName'):
1064 |             if 'Contains' not in result.keys():
1065 |                 result['Contains'] = []
1066 |             contains = {'RecName': {'Full': parted[1]}}
1067 |             i += 1
1068 |             while i < len(parts) and ':' not in parts[i].split()[0]:
1069 |                 parted = parts[i].split('=')
1070 |                 contains['RecName'][parted[0]] = parted[1]
1071 |                 i += 1
1072 |             result['Contains'].append(contains)
1073 |         elif parts[i].startswith('Flags'):
1074 |             parted = parts[i].split(': ')
1075 |             if 'Flags' in result.keys():
1076 |                 result['Flags'].append(parted[1])
1077 |             else:
1078 |                 result['Flags'] = [parted[1]]
1079 |             i += 1
1080 |         else:
1081 |             # a description part UPIMAPI cannot yet handle; skipped silently
1082 |             # (add a print here when debugging new description formats)
1083 |             i += 1
1084 |     return result
1085 | 
1086 | 
1087 | def fix_term(term):
1088 |     return term if '{ECO:' not in term else ' '.join(term.split()[:-1])
1089 | 
1090 | 
1091 | def parse_descriptions(sp_data):
1092 |     desc_data_df = sp_data['description'].apply(parse_description_text)
1093 |     description_df = pd.DataFrame()
1094 |     description_df['Protein names'] = desc_data_df.apply(
1095 |         lambda x: '{}{}{}{}{}{}'.format(
1096 |             fix_term(x['RecName']['Full']), f" ({fix_term(x['RecName']['Short'])})" if 'Short' in x['RecName'].keys()
1097 |             else "", f" (EC {fix_term(x['RecName']['EC'])})" if 'EC' in x['RecName'].keys() else "",
1098 |             ' ' + ' '.join(' '.join([f"({fix_term(value)})" for value in altname.values()]) for altname in x['AltName'])
1099 |             if 'AltName' in x.keys() else "",
1100 |             f" [Cleaved into: {'; '.join([fix_term(v['RecName']['Full']) for v in x['Contains']])}]"
1101 |             if 'Contains' in x.keys() else "",
1102 |             ' '.join([f" ({flag})" for flag in x['Flags']]) if 'Flags' in x.keys() else ""))
1103 |     description_df['EC number'] = desc_data_df.apply(
1104 |         lambda x: x['RecName']['EC'].split()[0] if not isinstance(x, float) and 'RecName' in x.keys()
1105 |         and 'EC' in x['RecName'].keys() else np.nan)
1106 |     return description_df
1107 | 
1108 | 
1109 | def parse_feature(feature, position, qualifiers=True, ide=True):
1110 |     """
1111 |     :param feature: the feature object itself (exposes .type, .qualifiers and .id)
1112 |     :param position: str - position information
1113 |     :param qualifiers: bool - add qualifiers information?
1114 |     :param ide: bool - add id information?
1115 |     :return: str - the term to add
1116 |     """
1117 |     result = f'{feature.type} {position}'
1118 |     if qualifiers:
1119 |         result += ' ' + " ".join([f'/{key}="{value}";' for key, value in feature.qualifiers.items()])
1120 |     if ide:
1121 |         result += f' /id="{feature.id}";'
1122 |     return result
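# Hedged example of parse_description_text on a DE block flattened the way
# Biopython presents it (string invented for illustration):
#     >>> parse_description_text('RecName: Full=Aspartokinase; EC=2.7.2.4; Flags: Precursor;')
#     {'RecName': {'Full': 'Aspartokinase', 'EC': '2.7.2.4'}, 'Flags': ['Precursor']}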
1123 | 
1124 | 
1125 | def parse_features(sp_data):
1126 |     feats_list = []
1127 |     pos_funcs = {
1128 |         'all': lambda x: f'{"?" if x.location.start.position is None else x.location.start.position + 1}..'
1129 |                          f'{x.location.end.position}; ',
1130 |         'end': lambda x: f'{x.location.end.position}; '}
1131 |     prefix2info = {
1132 |         'VAR_SEQ': ('Alternative sequence', 'all', True, True),
1133 |         'VARIANT': ('Natural variant', 'end', True, True),
1134 |         'NON_CONS': ('Non-adjacent residues', 'all', True, False),
1135 |         'NON_STD': ('Non-standard residue', 'end', True, False),
1136 |         'NON_TER': ('Non-terminal residue', 'end', False, False),
1137 |         'CONFLICT': ('Sequence conflict', 'all', True, False),
1138 |         'UNSURE': ('Sequence uncertainty', 'end', True, False),
1139 |         'ACT_SITE': ('Active site', 'end', True, False),
1140 |         'BINDING': ('Binding site', 'end', True, False),
1141 |         'DNA_BIND': ('DNA binding', 'all', True, False),
1142 |         'METAL': ('Metal binding', 'end', True, False),
1143 |         'NP_BIND': ('Nucleotide binding', 'all', True, False),
1144 |         'SITE': ('Site', 'end', True, False),
1145 |         'INTRAMEM': ('Intramembrane', 'all', True, False),
1146 |         'TOPO_DOM': ('Topological domain', 'all', True, False),
1147 |         'TRANSMEM': ('Transmembrane', 'all', True, False),
1148 |         'CHAIN': ('Chain', 'all', True, True),
1149 |         'CROSSLNK': ('Cross-link', 'all', True, False),
1150 |         'DISULFID': ('Disulfide bond', 'all', True, False),
1151 |         'CARBOHYD': ('Glycosylation', 'end', True, False),
1152 |         'INIT_MET': ('Initiator methionine', 'end', True, False),
1153 |         'LIPID': ('Lipidation', 'end', True, False),
1154 |         'MOD_RES': ('Modified residue', 'end', True, False),
1155 |         'PEPTIDE': ('Peptide', 'all', True, True),
1156 |         'PROPEP': ('Propeptide', 'all', False, True),
1157 |         'SIGNAL': ('Signal peptide', 'all', True, False),
1158 |         'TRANSIT': ('Transit peptide', 'all', True, False),
1159 |         'STRAND': ('Beta strand', 'all', True, False),
1160 |         'HELIX': ('Helix', 'all', True, False),
1161 |         'TURN': ('Turn', 'all', True, False),
1162 |         'COILED': ('Coiled coil', 'all', True, False),
1163 |         'COMPBIAS': ('Compositional bias', 'all', True, False),
1164 |         'DOMAIN': ('Domain [FT]', 'all', True, False),
1165 |         'MOTIF': ('Motif', 'all', True, False),
1166 |         'REGION': ('Region', 'all', True, False),
1167 |         'REPEAT': ('Repeat', 'all', True, False),
1168 |         'ZN_FING': ('Zinc finger', 'all', True, False),
1169 |         'MUTAGEN': ('Mutagenesis', 'end', True, False),
1170 |         'CA_BIND': ('Calcium binding', 'all', True, False)}
1171 |     for features in sp_data['features']:
1172 |         count_features = {}  # reset per record, so the 'Features' counts are per entry instead of cumulative
1173 |         feats_dict = {}
1174 |         for feature in features:
1175 |             if feature.type in prefix2info.keys():
1176 |                 parameters = prefix2info[feature.type]
1177 |                 if parameters[0] not in feats_dict.keys():
1178 |                     feats_dict[parameters[0]] = parse_feature(
1179 |                         feature, pos_funcs[parameters[1]](feature), qualifiers=parameters[2], ide=parameters[3])
1180 |                     count_features[parameters[0]] = 1
1181 |                 else:
1182 |                     feats_dict[parameters[0]] += ' ' + parse_feature(
1183 |                         feature, pos_funcs[parameters[1]](feature), qualifiers=parameters[2], ide=parameters[3])
1184 |                     count_features[parameters[0]] += 1
1185 |             else:
1186 |                 print(f'A feature UPIMAPI cannot yet handle! [{feature.type}]')
1187 |         feats_dict['Features'] = '; '.join([f'{feat_type} ({count})' for feat_type, count in count_features.items()])
1188 |         feats_list.append(feats_dict)
1189 |     return pd.DataFrame(feats_list)
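# Hedged illustration of the per-entry 'Features' summary built above: an entry with
# one signal peptide and two transmembrane regions would yield something like
#     'Signal peptide (1); Transmembrane (2)'
# alongside per-type columns holding the formatted feature strings themselves.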
1190 | 
1191 | 
1192 | def parse_host_taxonomy_id(sp_data, tax_tsv):
1193 |     tax_tsv = tax_tsv.reset_index().set_index('taxid')
1194 |     return sp_data['host_taxonomy_id'].apply(
1195 |         lambda x: '; '.join([f'{tax_tsv.loc[tid, "name"]} [TaxID: {tid}]' for tid in x]) if len(x) > 0 else np.nan)
1196 | 
1197 | 
1198 | def parse_sp_data(sp_data, tax_tsv, threads=15):
1199 |     """
1200 |     Parses data from local ID mapping through DAT file
1201 |     :param sp_data: pandas.DataFrame - records read from the SwissProt DAT file
1202 |     :param tax_tsv: str - filename of taxonomy in TSV format
1203 |     :param threads: int - number of processes to use when parsing taxonomies
1204 |     :return: pandas.DataFrame - organized in same columns as data from UniProt's API
1205 |     """
1206 |     if len(sp_data) == 0:
1207 |         return pd.DataFrame()
1208 |     tax_tsv_df = pd.read_csv(tax_tsv, sep='\t', dtype={'taxid': str, 'name': str, 'rank': str, 'parent_taxid': str})
1209 |     tax_tsv_df = tax_tsv_df[tax_tsv_df.name.notnull()]
1210 |     result = pd.DataFrame()
1211 |     result['Entry'] = sp_data['accessions'].apply(lambda x: x[0])
1212 |     local2api = {
1213 |         'entry_name': 'Entry Name',
1214 |         'data_class': 'Status',
1215 |         'sequence_length': 'Length',
1216 |         'sequence': 'Sequence'
1217 |     }
1218 |     for k, v in local2api.items():
1219 |         if v not in [None, False]:  # defensive guard; all current mappings are real column names
1220 |             result[v] = sp_data[k]
1221 |     result['Organism ID'] = result['Taxonomic identifier (SPECIES)'] = \
1222 |         sp_data['taxonomy_id'].apply(lambda x: x[0] if len(x) > 0 else x)
1223 |     result['Virus hosts'] = sp_data['host_organism'].apply(lambda x: x[0] if len(x) > 0 else x)
1224 |     result['Keywords'] = sp_data['keywords'].apply(';'.join)
1225 |     result['Organism'] = sp_data['organism'].str.rstrip('.')
1226 |     result['Taxonomic lineage (SPECIES)'] = result['Organism'].apply(lambda x: ' '.join(x.split()[:2]))
1227 |     timed_message('Parsing taxonomy (this may take a while)')
1228 |     tax_df = parse_taxonomy(sp_data, tax_tsv_df, threads=threads).reset_index()
1229 |     rel_df = sp_data['organism_classification'].apply(','.join)
1230 |     tax_df['index'] = tax_df['index'].apply(','.join)
1231 |     rel_df = pd.merge(rel_df, tax_df, left_on='organism_classification', right_on='index', how='left')
1232 |     del rel_df['organism_classification']
1233 |     del rel_df['index']
1234 |     result['Virus hosts'] = parse_host_taxonomy_id(sp_data, tax_tsv_df)  # overrides the raw host names set above
1235 |     result = pd.concat([result, rel_df], axis=1)
1236 |     timed_message('Parsing genes')
1237 |     result = pd.concat([result, parse_gene_names(sp_data)], axis=1)
1238 |     timed_message('Parsing cross-references')
1239 |     result = pd.concat([result, parse_cross_references(sp_data)], axis=1)
1240 |     timed_message('Parsing comments')
1241 |     result = pd.concat([result, parse_comments(sp_data)], axis=1)
1242 |     timed_message('Parsing features')
1243 |     result = pd.concat([result, parse_features(sp_data)], axis=1)
1244 |     result = pd.concat([result, parse_descriptions(sp_data)], axis=1)
1245 |     result['Gene encoded by'] = sp_data['organelle'].str.rstrip('.')
1246 |     result['Mass'] = sp_data['seqinfo'].apply(lambda x: x[1])
1247 |     result['Date of creation'] = sp_data['created'].apply(
1248 |         lambda x: datetime.strptime(x[0], '%d-%b-%Y').strftime('%Y-%m-%d'))
1249 |     result['Date of last modification'] = sp_data['annotation_update'].apply(
1250 |         lambda x: datetime.strptime(x[0], '%d-%b-%Y').strftime('%Y-%m-%d'))
1251 |     result['Version (entry)'] = sp_data['annotation_update'].apply(lambda x: x[1])
1252 |     result['Date of last sequence modification'] = sp_data['sequence_update'].apply(
1253 |         lambda x: datetime.strptime(x[0], '%d-%b-%Y').strftime('%Y-%m-%d'))
1254 |     result['Version (sequence)'] = sp_data['sequence_update'].apply(lambda x: x[1])
1255 |     result['PubMed ID'] = sp_data['references'].apply(
1256 |         lambda x: '; '.join([ref.references[0][1] for ref in x if len(ref.references) > 0]))
1257 |     return result
1258 | 
1259 | 
1260 | def get_sprot_dat(sp_dat):
1261 |     run_command(
1262 |         f'wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/'
1263 |         f'uniprot_sprot.dat.gz -O {sp_dat}.gz')
1264 |     run_command(f'gunzip {sp_dat}.gz')
1265 | 
1266 | 
1267 | def local_id_mapping(ids, sp_dat, tax_tsv, output, columns=None, databases=None, threads=15):
1268 |     ids_done, ids_missing, result = check_ids_already_done(output, ids)
1269 |     if len(ids_missing) == 0:
1270 |         return set()
1271 |     if not os.path.isfile(sp_dat):
1272 |         timed_message(f'Creating {sp_dat}')
1273 |         get_sprot_dat(sp_dat)
1274 |     if not os.path.isfile(tax_tsv):
1275 |         timed_message(f'Creating {tax_tsv}')
1276 |         get_tabular_taxonomy(tax_tsv)
1277 |     timed_message('Searching for IDs in SwissProt DAT')
1278 |     sp_data, ids_found = get_local_swissprot_data(sp_dat, ids_missing)
1279 |     timed_message('Parsing SwissProt results')
1280 |     sp_parsed = parse_sp_data(sp_data, tax_tsv, threads=threads)
1281 |     result = pd.concat([result, sp_parsed])
1282 |     columns = [col for col in (columns or []) if col in result.columns.tolist()]
1283 |     databases = [db for db in (databases or []) if db in result.columns.tolist()]
1284 |     result[columns + databases].to_csv(output, sep='\t', index=False)
1285 |     return ids_found
1286 | 
1287 | 
1288 | def get_input_type(args_input, blast=True):
1289 |     if args_input is None:
1290 |         return input('IDs to perform mapping on (comma separated values): '), 'stdin'
1291 |     if blast:
1292 |         return args_input, 'blast'
1293 |     if check_output(f"head -c 1 {args_input}", shell=True).decode('utf8') == '>':
1294 |         return args_input, 'fasta'
1295 |     return args_input, 'txt'
1296 | 
1297 | 
1298 | def check_no_annotation(args_input, no_annotation):
1299 |     if args_input is None:
1300 |         is_fasta = False
1301 |     else:
1302 |         is_fasta = check_output(f"head -c 1 {args_input}", shell=True).decode('utf8') == '>'
1303 |     if not is_fasta:
1304 |         no_annotation = True
1305 |     return no_annotation
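# Hedged sketch of the input-type detection above (paths illustrative):
#     get_input_type(None)                          -> (<IDs typed by the user>, 'stdin')
#     get_input_type('aligned.blast', blast=True)   -> ('aligned.blast', 'blast')
#     get_input_type('proteome.fasta', blast=False) -> ('proteome.fasta', 'fasta')   # first byte is '>'
#     get_input_type('ids.txt', blast=False)        -> ('ids.txt', 'txt')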
1306 | 
1307 | 
1308 | def blast_consensus(alignment_file):
1309 |     blast = parse_blast(alignment_file)
1310 |     query_to_ref, ref_to_query, res = {}, {}, {}
1311 |     with open(alignment_file) as file:
1312 |         line = file.readline()
1313 |         while line:
1314 |             line = line.strip('\n').split('\t')
1315 |             query_seq, ref_seq, evalue = line[0], line[1], float(line[-2])
1316 |             if query_seq not in query_to_ref:
1317 |                 if ref_seq not in ref_to_query:
1318 |                     query_to_ref[query_seq] = {'ref_seq': ref_seq, 'evalue': evalue}
1319 |                     ref_to_query[ref_seq] = {'query_seq': query_seq, 'evalue': evalue}
1320 |                 else:
1321 |                     if ref_to_query[ref_seq]['evalue'] > evalue:
1322 |                         ref_to_query[ref_seq] = {'query_seq': query_seq, 'evalue': evalue}
1323 |                         query_to_ref[query_seq] = {'ref_seq': ref_seq, 'evalue': evalue}
1324 |             else:
1325 |                 if ref_seq not in ref_to_query:
1326 |                     if query_to_ref[query_seq]['evalue'] > evalue:
1327 |                         ref_to_query[ref_seq] = {'query_seq': query_seq, 'evalue': evalue}
1328 |                         query_to_ref[query_seq] = {'ref_seq': ref_seq, 'evalue': evalue}
1329 |                 else:
1330 |                     if ref_to_query[ref_seq]['evalue'] > evalue:
1331 |                         if query_to_ref[query_seq]['evalue'] > evalue:
1332 |                             ref_to_query[ref_seq] = {'query_seq': query_seq, 'evalue': evalue}
1333 |                             query_to_ref[query_seq] = {'ref_seq': ref_seq, 'evalue': evalue}
1334 |             line = file.readline()
1335 |     for query_seq in query_to_ref:
1336 |         ref_seq = query_to_ref[query_seq]['ref_seq']
1337 |         if query_seq == ref_to_query[ref_seq]['query_seq']:
1338 |             res[query_seq] = query_to_ref[query_seq]['ref_seq']
1339 |     res = pd.DataFrame.from_dict(res, orient='index').reset_index()
1340 |     res.columns = ['qseqid', 'sseqid']
1341 |     return blast.set_index(['qseqid', 'sseqid']).loc[res.set_index(['qseqid', 'sseqid']).index].reset_index()
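# Hedged summary of blast_consensus: a query-reference pair is kept only when it is
# a reciprocal best hit by e-value (pairs invented for illustration):
#     query1 -> refA (1e-50)  and  refA -> query1 (1e-50)   kept
#     query2 -> refA (1e-10)                                dropped; refA prefers query1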
1342 | 
1343 | 
1344 | def upimapi():
1345 |     args = get_arguments()
1346 |     Path(args.output).mkdir(parents=True, exist_ok=True)
1347 |     Path(args.resources_directory).mkdir(parents=True, exist_ok=True)
1348 |     args.no_annotation = check_no_annotation(args.input, args.no_annotation)
1349 | 
1350 |     # Annotation with DIAMOND
1351 |     if not args.no_annotation:
1352 |         db2file = {'uniprot': f'{args.resources_directory}/uniprot.fasta',
1353 |                    'swissprot': f'{args.resources_directory}/uniprot_sprot.fasta',
1354 |                    'taxids': f'{args.resources_directory}/taxids_database.fasta'}
1355 |         if args.database in db2file.keys():
1356 |             database = db2file[args.database]
1357 |         else:
1358 |             database = args.database
1359 | 
1360 |         if not args.skip_db_check:
1361 |             if must_build_database(args.database, args.resources_directory):
1362 |                 build_reference_database(
1363 |                     args.database, args.resources_directory, taxids=args.taxids, max_tries=args.max_tries,
1364 |                     mirror=args.mirror)
1365 |         if not database.endswith(".dmnd"):
1366 |             diamond_formatted = f"{'.'.join(database.split('.')[:-1])}.dmnd"
1367 |             if not os.path.isfile(diamond_formatted):
1368 |                 make_diamond_database(database, diamond_formatted)
1369 |             database = diamond_formatted
1370 |         (b, c) = block_size_and_index_chunks(
1371 |             argsb=args.block_size, argsc=args.index_chunks, memory=args.max_memory)
1372 |         run_diamond(
1373 |             args.input, f'{args.output}/aligned.blast', f'{args.output}/unaligned.blast', database,
1374 |             threads=args.threads, max_target_seqs=args.max_target_seqs, b=b, c=c, e_value=args.evalue,
1375 |             bit_score=args.bitscore, pident=args.pident, mode=args.diamond_mode)
1376 |         if args.max_target_seqs > 1:
1377 |             blast_consensus(f'{args.output}/aligned.blast').to_csv(
1378 |                 f'{args.output}/consensus.blast', sep='\t', index=False)
1379 |         args.input = f'{args.output}/aligned.blast'
1380 |         args.blast = True
1381 | 
1382 |     if args.skip_id_mapping:
1383 |         exit('Not performing ID mapping as specified.')
1384 | 
1385 |     timed_message('ID mapping has begun.')
1386 |     args_input, input_type = get_input_type(args.input, blast=args.blast)
1387 | 
1388 |     # Get the IDs
1389 |     ids, full_id, sp_ids = get_ids(args_input, input_type=input_type, full_id=args.full_id)
1390 | 
1391 |     if args.output_table:
1392 |         table_output = args.output_table
1393 |         print(f'Table output overridden to {table_output}')
1394 |         Path('/'.join(args.output_table.split('/')[:-1])).mkdir(parents=True, exist_ok=True)
1395 |     else:
1396 |         table_output = f'{args.output}/uniprotinfo.tsv'
1397 | 
1398 |     if args.from_db != 'UniProtKB AC/ID' or args.to_db != 'UniProtKB':
1399 |         basic_idmapping_multiprocess(ids, table_output, args.from_db, args.to_db, threads=args.threads)
1400 |         return
1401 | 
1402 |     if not args.skip_id_checking:
1403 |         # UniProt's API now fails if outdated IDs or entry names are submitted. This function removes those IDs.
1404 |         ids, not_valid = get_valid_entries_multiprocess(ids, threads=args.threads)
1405 |         with open(f'{args.output}/valid_ids.txt', 'w') as f:
1406 |             f.write('\n'.join(ids))
1407 |         with open(f'{args.output}/not_valid_ids.txt', 'w') as f:
1408 |             f.write('\n'.join(not_valid))
1409 | 
1410 |     # Get UniProt information
1411 |     if not args.fasta:
1412 |         # ID mapping through local SwissProt information
1413 |         if args.local_id_mapping:
1414 |             ids = set(ids) - set(local_id_mapping(
1415 |                 sp_ids, f'{args.resources_directory}/uniprot_sprot.dat', f'{args.resources_directory}/taxonomy.tsv',
1416 |                 table_output, columns=args.columns, databases=args.databases, threads=args.threads))
1417 | 
1418 |         # ID mapping through API
1419 |         result = uniprot_information_workflow(
1420 |             ids, table_output, columns=args.columns, step=args.step, max_iter=args.max_tries,
1421 |             sleep_time=args.sleep)
1422 |         result.to_csv(table_output, sep='\t', index=False)
1423 | 
1424 |         if not args.no_annotation:
1425 |             blast = parse_blast(f'{args.output}/aligned.blast')
1426 |             if full_id:
1427 |                 blast.sseqid = [ide.split('|')[1] if ide not in ['*', ''] else ide for ide in blast.sseqid]
1428 |             result = pd.merge(blast, result, left_on='sseqid', right_on='Entry')
1429 |         sort_columns = ['Entry'] if args.no_annotation else ['qseqid', 'evalue']
1430 |         result.sort_values(by=sort_columns, ascending=False).to_csv(
1431 |             f'{args.output}/UPIMAPI_results.tsv', index=False, sep='\t')
1432 |     else:
1433 |         uniprot_fasta_workflow(
1434 |             ids, f'{args.output}/uniprotinfo.fasta', step=args.step, sleep_time=args.sleep)
1435 | 
1436 | 
1437 | if __name__ == '__main__':
1438 |     start_time = time()
1439 |     upimapi()
1440 |     timed_message(f'UPIMAPI analysis finished in {human_time(time() - start_time)}')
1441 | 
--------------------------------------------------------------------------------