├── .github
│   ├── ISSUE_TEMPLATE
│   │   └── bug_report.md
│   └── workflows
│       └── main.yml
├── .gitignore
├── CITATION.cff
├── Dockerfile
├── LICENSE
├── README.md
├── cicd
│   ├── ci_build.sh
│   ├── environment.yml
│   ├── full_ids.txt
│   ├── ids.blast
│   ├── ids.csv
│   ├── meta.yaml
│   └── proteomes.fasta
└── upimapi.py
/.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | 12 | 13 | **To Reproduce** 14 | - Upload an input file [drop a file here] 15 | - Write the command used: 16 | 17 | **Screenshots** 18 | If applicable, add screenshots to help explain your problem. Paste them here. 19 | 20 | **Please complete the following information:** 21 | - OS: 22 | - Version of UPIMAPI: 23 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v4 15 | - name: Set up Docker Buildx 16 | uses: docker/setup-buildx-action@v1 17 | - name: Build and push 18 | uses: docker/build-push-action@v4 19 | with: 20 | context: . 21 | file: ./Dockerfile 22 | tags: upimapi:latest 23 | outputs: type=docker,dest=/tmp/upimapi.tar 24 | - name: Upload artifact 25 | uses: actions/upload-artifact@v4 26 | with: 27 | name: upimapi 28 | path: /tmp/upimapi.tar 29 | 30 | txt-file-comma-separated: 31 | runs-on: ubuntu-latest 32 | needs: build 33 | steps: 34 | - name: Download artifact 35 | uses: actions/download-artifact@v4 36 | with: 37 | name: upimapi 38 | path: /tmp 39 | - name: Load Docker image 40 | run: docker load --input /tmp/upimapi.tar 41 | - name: IDs inputted through TXT file (comma-separated) 42 | run: docker run upimapi /bin/bash -c "upimapi -i UPIMAPI/cicd/ids.csv -cols 'Entry&KEGG&Interacts with&Taxonomic lineage (SUPERKINGDOM)&Taxonomic lineage (SPECIES)&Taxonomic lineage IDs (SUPERKINGDOM)&Taxonomic lineage IDs (SPECIES)'" 43 | 44 | txt-file-newline-separated: 45 | runs-on: ubuntu-latest 46 | needs: build 47 | steps: 48 | - name: Download artifact 49 | uses: actions/download-artifact@v4 50 | with: 51 | name: upimapi 52 | path: /tmp 53 | - name: Load Docker image 54 | run: docker load --input /tmp/upimapi.tar 55 | - name: Full IDs inputted through TXT file (newline-separated) 56 | run: docker run upimapi /bin/bash -c "upimapi -i UPIMAPI/cicd/full_ids.txt" 57 | 58 | blast-file: 59 | runs-on: ubuntu-latest 60 | needs: build 61 | steps: 62 | - name: Download artifact 63 | uses: actions/download-artifact@v4 64 | with: 65 | name: upimapi 66 | path: /tmp 67 | - name: Load Docker image 68 | run: docker load --input /tmp/upimapi.tar 69 | - name: IDs inputted through BLAST file 70 | run: docker run upimapi /bin/bash -c "upimapi -i UPIMAPI/cicd/ids.blast -rd resources_directory --blast" 71 | 72 | get-fasta-sequences: 73 | runs-on: ubuntu-latest 74 | needs: build 75 | steps: 76 | - name: Download artifact 77 | uses: actions/download-artifact@v4 78 | with: 79 | name: upimapi 80 | path: /tmp 81 | - name: Load Docker image 82 | run: docker load --input /tmp/upimapi.tar 83 | - name: Obtain FASTA sequences 84 | run: docker run upimapi /bin/bash -c "upimapi -i UPIMAPI/cicd/ids.csv 
-rd resources_directory --fasta" 85 | 86 | basic-id-mapping: 87 | runs-on: ubuntu-latest 88 | needs: build 89 | steps: 90 | - name: Download artifact 91 | uses: actions/download-artifact@v4 92 | with: 93 | name: upimapi 94 | path: /tmp 95 | - name: Load Docker image 96 | run: docker load --input /tmp/upimapi.tar 97 | - name: Perform basic ID mapping 98 | run: docker run upimapi /bin/bash -c "upimapi -i UPIMAPI/cicd/ids.csv -rd resources_directory --from-db 'UniProtKB AC/ID' --to-db 'EMBL/GenBank/DDBJ CDS'" 99 | 100 | full-workflow: 101 | runs-on: ubuntu-latest 102 | needs: build 103 | steps: 104 | - name: Download artifact 105 | uses: actions/download-artifact@v4 106 | with: 107 | name: upimapi 108 | path: /tmp 109 | - name: Load Docker image 110 | run: docker load --input /tmp/upimapi.tar 111 | - name: Full workflow, TaxIDs DB at Species level 112 | run: docker run upimapi /bin/bash -c "upimapi -i UPIMAPI/cicd/proteomes.fasta -rd resources_directory -db taxids --taxids 2203,2223,2209,2162,119484,35554,29543,863" 113 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | 3 | # Pycharm 4 | .idea 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | .pybuilder/ 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | # For a library or package, you might want to ignore these files since the code is 92 | # intended to run in multiple environments; otherwise, check them in: 93 | # .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 103 | __pypackages__/ 104 | 105 | # Celery stuff 106 | celerybeat-schedule 107 | celerybeat.pid 108 | 109 | # SageMath parsed files 110 | *.sage.py 111 | 112 | # Environments 113 | .env 114 | .venv 115 | env/ 116 | venv/ 117 | ENV/ 118 | env.bak/ 119 | venv.bak/ 120 | 121 | # Spyder project settings 122 | .spyderproject 123 | .spyproject 124 | 125 | # Rope project settings 126 | .ropeproject 127 | 128 | # mkdocs documentation 129 | /site 130 | 131 | # mypy 132 | .mypy_cache/ 133 | .dmypy.json 134 | dmypy.json 135 | 136 | # Pyre type checker 137 | .pyre/ 138 | 139 | # pytype static type analyzer 140 | .pytype/ 141 | 142 | # Cython debug symbols 143 | cython_debug/ -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: "Sequeira" 5 | given-names: "João C." 6 | orcid: "https://orcid.org/0000-0002-2691-9950" 7 | - family-names: "Rocha" 8 | given-names: "Miguel" 9 | orcid: "https://orcid.org/0000-0001-8439-8172" 10 | - family-names: "Alves" 11 | given-names: "M. Madalena" 12 | orcid: "https://orcid.org/0000-0002-9078-3613" 13 | - family-names: "Salvador" 14 | given-names: "Andreia F." 15 | orcid: "https://orcid.org/0000-0001-6037-4248" 16 | title: "UPIMAPI: UniProt Id Mapping through API" 17 | version: 1.6.4 18 | doi: "10.1016/J.CSBJ.2022.03.042" 19 | date-released: 2022-01-26 20 | url: "https://github.com/iquasere/UPIMAPI" 21 | preferred-citation: 22 | type: article 23 | authors: 24 | - family-names: "Sequeira" 25 | given-names: "João C." 26 | orcid: "https://orcid.org/0000-0002-2691-9950" 27 | - family-names: "Rocha" 28 | given-names: "Miguel" 29 | orcid: "https://orcid.org/0000-0001-8439-8172" 30 | - family-names: "Alves" 31 | given-names: "M. Madalena" 32 | orcid: "https://orcid.org/0000-0002-9078-3613" 33 | - family-names: "Salvador" 34 | given-names: "Andreia F." 35 | orcid: "https://orcid.org/0000-0001-6037-4248" 36 | doi: "10.1016/J.CSBJ.2022.03.042" 37 | journal: "Computational and Structural Biotechnology Journal" 38 | start: 1798 39 | end: 1810 40 | title: "UPIMAPI, reCOGnizer and KEGGCharter: Bioinformatics tools for functional annotation and visualization of (meta)-omics datasets" 41 | volume: 20 42 | year: 2022 -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3 2 | 3 | RUN git clone https://github.com/iquasere/UPIMAPI.git \ 4 | && conda env update --file UPIMAPI/cicd/environment.yml --name base \ 5 | && bash UPIMAPI/cicd/ci_build.sh \ 6 | && conda clean --all -y 7 | 8 | CMD [ "python", "bin/upimapi.py" ] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, João C. Sequeira 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. 
Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UniProt Id Mapping through API 2 | 3 | A tool for retrieving huge amounts of information from UniProt! 4 | 5 | UPIMAPI is a command line interface for using UniProt's API, which allows accessing [UniProt's ID mapping](https://www.uniprot.org/uploadlists/) programmatically! 6 | 7 | UPIMAPI can handle large numbers of UniProt IDs (like, millions), for which information can be obtained in a single command. 8 | 9 | UPIMAPI also allows first performing annotation with DIAMOND, combining DIAMOND's powerful annotation with the convenience of directly obtaining information from UniProt. 10 | 11 | ### Index 12 | 13 | 1. [Installing UPIMAPI](https://github.com/iquasere/UPIMAPI#installing-upimapi) 14 | 2. [Annotation with UPIMAPI](https://github.com/iquasere/UPIMAPI#annotation-with-upimapi) 15 | 3. [Information retrieval from UniProt](https://github.com/iquasere/UPIMAPI#information-retrieval-from-uniprot) 16 | 4. [Output](https://github.com/iquasere/UPIMAPI#output) 17 | 5. [Additional parameters](https://github.com/iquasere/UPIMAPI#additional-parameters) 18 | 6. [Referencing UPIMAPI](https://github.com/iquasere/UPIMAPI#referencing-upimapi) 19 | 20 | ## Installing UPIMAPI 21 | 22 | To install UPIMAPI through Bioconda, run 23 | ``` 24 | conda install -c bioconda upimapi 25 | ``` 26 | To check if UPIMAPI was installed correctly, run 27 | ``` 28 | upimapi --version 29 | ``` 30 | 31 | ## Annotation with UPIMAPI 32 | 33 | UPIMAPI can be used to perform homology-based annotation with DIAMOND. The main advantages of using UPIMAPI are that it determines optimal values for the most important search parameters, and that it directly links annotation to UniProt ID mapping.
34 | To annotate protein sequences and get information from UniProt, UPIMAPI can be run as 35 | ``` 36 | upimapi -i path/to/sequences.fasta -o path/to/output_directory -db database -t threads 37 | ``` 38 | where: 39 | * ```sequences.fasta``` is a FASTA file with amino acid sequences of query proteins 40 | * ```output_directory``` can be any folder, existing or not 41 | * ```database``` can be either "uniprot" (default), "swissprot", "taxids" or the filename of a FASTA file with the reference sequences (see below). 42 | 43 | ### Reference database 44 | 45 | A few points to note about the reference database: 46 | * It must be either UniProt or a subsection of it (e.g. SwissProt, or all proteins of a specific taxon). UPIMAPI performs ID mapping with UniProt IDs, so the database must have those; 47 | * It can be supplied in either FASTA (.fasta) or DIAMOND (.dmnd) format. If in FASTA, UPIMAPI will create a new database in DIAMOND format for annotation; 48 | * There are four different ways to input reference databases to UPIMAPI: 49 | 50 | #### Use the entire UniProt (or just SwissProt) 51 | 52 | Using the UniProt database is a valid choice if the case study is a metagenome with a mostly unknown community composition. 53 | 54 | To use the entire UniProt database as reference for UPIMAPI, specify the database as ```--database uniprot```. 55 | 56 | If, alternatively, you only want to use SwissProt (the manually curated part of UniProt), specify the database as ```--database swissprot```. 57 | 58 | #### Input tax IDs to build a more specific database 59 | 60 | If, for both pure and mixed cultures, the taxonomic composition is known, UPIMAPI can build a database with the reference proteomes of the known taxa. 61 | 62 | To build a reference for specific taxa, specify the database as ```--database taxids```, and the tax IDs as ```--taxids taxid1,taxid2,taxid3 ...```. 63 | 64 | #### Input a custom database 65 | 66 | A custom database can be inputted if, for example, there is only interest in annotating proteins of a specific family (e.g. hydrogenases). Such a database must be manually built from UniProt. 67 | 68 | To input a custom database into UPIMAPI, specify it as ```--database path/to/database.fasta```. 69 | 70 | ## Information retrieval from UniProt 71 | 72 | ### Columns of information from UniProt 73 | 74 | UniProt provides many different fields of information and cross-references. For the user's convenience, a default selection is provided: ```Entry```, ```Entry Name```, ```Gene Names```, ```Protein names```, ```EC number```, ```Function [CC]```, ```Pathway```, ```Keywords```, ```Protein existence```, ```Gene Ontology (GO)```, ```Protein families```, ```Taxonomic lineage```, ```Organism```, ```Organism (ID)```, ```BioCyc```, ```BRENDA```, ```CDD```, ```eggNOG```, ```Ensembl```, ```InterPro```, ```KEGG```, ```Pfam```, ```Reactome```, ```RefSeq``` and ```UniPathway```. 75 | 76 | If another selection of columns/databases is desired, it can be specified, for example, as 77 | ``` 78 | --columns "Coiled coil&Compositional bias" 79 | ``` 80 | where ```--columns``` takes as input the names of the fields of information required. Valid values for the columns can be consulted at [UniProtKB return fields](https://www.uniprot.org/help/return_fields).
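The label-to-field-name correspondence can also be inspected programmatically. This is a minimal Python sketch, mirroring the request UPIMAPI itself makes at startup (the endpoint is the same one used in ```upimapi.py```; error handling is omitted for brevity):
```
import json
import requests

# Same endpoint UPIMAPI queries at startup to map column labels to API field names
res = requests.get('https://rest.uniprot.org/configure/uniprotkb/result-fields')
res.raise_for_status()
for group in json.loads(res.text):
    for field in group['fields']:
        print(f"{field['label']} -> {field['name']}")
```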
81 | 82 | #### Sometimes the return fields are not properly updated 83 | 84 | If the columns were correctly inputted according to the [return fields page](https://www.uniprot.org/help/return_fields) and UPIMAPI still complains that "\[COL] is not a valid column name for ID mapping", it may be that the values at the return fields page are not properly updated. If that happens, running `upimapi --show-available-fields` will present the user with the currently valid fields. 85 | 86 | #### UPIMAPI offers a few additional columns for taxonomic information 87 | 88 | Prior to the Summer 2022 UniProt release, the API provided fields for taxonomic information, but these have since been condensed into the ```Taxonomic lineage``` and ```Taxonomic lineage (Ids)``` columns. Since ```1.8.6```, UPIMAPI provides this information again, properly organized. The additional available columns for taxonomy are as follows: 89 | 90 | * ```Taxonomic lineage (LEVEL OF TAXONOMY)```: the taxonomic lineage of the organism, at the specified level of taxonomy. For example, ```--columns "Taxonomic lineage (SPECIES)"``` will return the species of the organism. Other possible values are ```SUPERKINGDOM```, ```PHYLUM```, ```CLASS```, ```ORDER```, ```FAMILY```, ```GENUS```, ```SPECIES```, [among others](https://en.wikipedia.org/wiki/Taxonomic_rank). 91 | 92 | * ```Taxonomic lineage IDs (LEVEL OF TAXONOMY)```: the TaxIDs of the organism, at the specified level of taxonomy. For example, ```--columns "Taxonomic lineage IDs (SPECIES)"``` will return the TaxID of the species of the organism. Other possible values are as above. 93 | 94 | ## ID mapping without annotation 95 | 96 | If only retrieval of information from UniProt is required (no annotation step), IDs can be inputted to UPIMAPI directly, through several different inputs. 97 | 98 | ### Annotation BLAST file 99 | 100 | The result of an annotation with some database with UniProt IDs can be directly inputted for ID mapping with the command 101 | ``` 102 | upimapi -i aligned.blast -o output_directory --blast 103 | ``` 104 | 105 | ### CSV file 106 | 107 | A CSV file with UniProt IDs (separated by commas) can be inputted to UPIMAPI with the command 108 | ``` 109 | upimapi -i ids.txt -o output_directory 110 | ``` 111 | This repo provides an [example](https://github.com/iquasere/UPIMAPI/blob/master/cicd/ids.csv) of this file. 112 | 113 | ### Directly from the command line 114 | 115 | IDs can also be inputted directly through the command line, by not specifying an input. They must be inputted as comma-separated values: 116 | ``` 117 | >>> upimapi -o output_directory 118 | 119 | IDs to perform mapping on (comma separated values): 120 | ``` 121 | 122 | ## Output 123 | 124 | Information obtained with UPIMAPI can come in two forms: 125 | 1. The **Base** (default) workflow obtains information for the list of columns and databases inputted. It produces the following outputs, in the output folder: 126 | * ```uniprotinfo.tsv```, which contains the information for the columns and databases specified 127 | * if annotation was performed, ```aligned.blast``` and ```unaligned.fasta```, which contain the annotated and unannotated proteins, respectively. 128 | 129 | 2. The **Fasta** workflow, specified with the ```--fasta``` argument, results in a FASTA file with the protein sequences corresponding to the input IDs
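As an illustration, the ```uniprotinfo.tsv``` table from the **Base** workflow can be loaded directly with pandas. A minimal sketch, assuming a run that used the default columns (the output path here is hypothetical):
```
import pandas as pd

# uniprotinfo.tsv is tab-separated, with one row per mapped ID
info = pd.read_csv('output_directory/uniprotinfo.tsv', sep='\t')
print(f"{info['Entry'].nunique()} unique UniProt entries retrieved")
```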
130 | 131 | ## From/To ID mapping 132 | 133 | The ID mapping available at https://www.uniprot.org/id-mapping, triggered when "From database" and "To database" are different from the default values - "UniProtKB AC/ID" and "UniProtKB" - is also implemented, since UPIMAPI `1.12`. 134 | 135 | As an example, this command would convert IDs from UniProtKB to EMBL/GenBank/DDBJ CDS: 136 | ``` 137 | upimapi -i ids.txt -o output_directory --from-db 'UniProtKB AC/ID' --to-db 'EMBL/GenBank/DDBJ CDS' 138 | ``` 139 | 140 | Possible values for the parameters `--from-db` and `--to-db` can be consulted through the browser (https://www.uniprot.org/id-mapping), at https://rest.uniprot.org/configure/idmapping/fields, or by inputting an invalid value to one of those parameters, which will make the possible options show up. 141 | 142 | This From/To ID mapping can't be combined with the ID mapping that obtains columns of information from UniProt; UPIMAPI will exit after performing it. 143 |
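Under the hood, this follows UniProt's asynchronous ID mapping flow (submit a job, poll its status, fetch the results), as implemented by ```submit_id_mapping``` and ```get_id_mapping_results``` in ```upimapi.py```. A condensed sketch of that flow is below; the internal database names ('UniProtKB_AC-ID', 'EMBL-GenBank-DDBJ_CDS') are assumptions for illustration, since UPIMAPI resolves the real names at runtime from https://rest.uniprot.org/configure/idmapping/fields:
```
import requests
from time import sleep

rest = 'https://rest.uniprot.org'
# Submit the job; the database names below are illustrative assumptions
job_id = requests.post(f'{rest}/idmapping/run', data={
    'from': 'UniProtKB_AC-ID', 'to': 'EMBL-GenBank-DDBJ_CDS', 'ids': 'P31946,P62258'}).json()['jobId']
# Poll until the job leaves the RUNNING state
status = requests.get(f'{rest}/idmapping/status/{job_id}').json()
while status.get('jobStatus') == 'RUNNING':
    sleep(3)
    status = requests.get(f'{rest}/idmapping/status/{job_id}').json()
# Once finished, the status request redirects to the results, which carry the mappings
print(status.get('results', status))
```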
144 | ## Additional parameters 145 | 146 | ``` 147 | -h, --help show this help message and exit 148 | -i INPUT, --input INPUT 149 | Input filename - can be: 1. a file containing a list of IDs (comma-separated values, no spaces) 2. a BLAST TSV result file (requires to be specified with the 150 | --blast parameter) 3. a protein FASTA file to be annotated (requires the -db parameter) 4. nothing! If so, will read input from command line, and parse as CSV 151 | (id1,id2,...) 152 | -o OUTPUT, --output OUTPUT 153 | Folder to store outputs 154 | -ot OUTPUT_TABLE, --output-table OUTPUT_TABLE 155 | Filename of table output, where UniProt info is stored. If set, will override 'output' parameter just for that specific file 156 | -rd RESOURCES_DIRECTORY, --resources-directory RESOURCES_DIRECTORY 157 | Directory to store resources of UPIMAPI [~/upimapi_resources] 158 | -cols COLUMNS, --columns COLUMNS 159 | List of UniProt columns to obtain information from (separated by &) 160 | --blast If input file is in BLAST TSV format (will consider one ID per line if not set) [false] 161 | --full-id FULL_ID If IDs in database are in 'full' format: tr|XXX|XXX [auto] 162 | --fasta Output will be generated in FASTA format [false] 163 | --step STEP How many IDs to submit per request to the API [1000] 164 | --max-tries MAX_TRIES 165 | How many times to try obtaining information from UniProt before giving up [3] 166 | --sleep SLEEP Time between requests (in seconds) [3] 167 | --no-annotation Do not perform annotation - input must be in one of BLAST result or TXT IDs file or STDIN [false] 168 | --local-id-mapping Perform local ID mapping of SwissProt IDs. Advisable if many IDs of SwissProt are present [false] 169 | --skip-id-mapping If true, UPIMAPI will not perform ID mapping [false] 170 | --skip-id-checking If true, UPIMAPI will not check if IDs are valid before mapping [false] 171 | --skip-db-check So UPIMAPI doesn't check for (FASTA) database existence [false] 172 | --mirror {expasy,uniprot,ebi} 173 | From where to download UniProt database [expasy] 174 | -v, --version show program's version number and exit 175 | 176 | DIAMOND arguments: 177 | -db DATABASE, --database DATABASE 178 | How the reference database is inputted to UPIMAPI. 1. uniprot - UPIMAPI will download the entire UniProt and use it as reference 2. swissprot - UPIMAPI will 179 | download SwissProt and use it as reference 3. taxids - Reference proteomes will be downloaded for the taxa specified with the --taxids, and those will be used as 180 | reference 4. a custom database - Input will be considered as the database, and will be used as reference 181 | -t THREADS, --threads THREADS 182 | Number of threads to use in annotation steps [all available] 183 | --evalue EVALUE Maximum e-value to report annotations for [1e-3] 184 | --pident PIDENT Minimum pident to report annotations for. 185 | --bitscore BITSCORE Minimum bit score to report annotations for (overrides e-value). 186 | -mts MAX_TARGET_SEQS, --max-target-seqs MAX_TARGET_SEQS 187 | Number of annotations to output per sequence inputted [1] 188 | -b BLOCK_SIZE, --block-size BLOCK_SIZE 189 | Billions of sequence letters to be processed at a time [memory / 20] 190 | -c INDEX_CHUNKS, --index-chunks INDEX_CHUNKS 191 | Number of chunks for processing the seed index [dependent on block size] 192 | --max-memory MAX_MEMORY 193 | Maximum memory to use (in Gb) [all available] 194 | --taxids TAXIDS Tax IDs to obtain protein sequences of, for building a reference database. 195 | --diamond-mode {fast,mid_sensitive,sensitive,more_sensitive,very_sensitive,ultra_sensitive} 196 | Mode to run DIAMOND with [fast] 197 | ``` 198 | 199 | ## Referencing UPIMAPI 200 | 201 | If you use UPIMAPI, please cite its [publication](https://www.sciencedirect.com/science/article/pii/S2001037022001179). -------------------------------------------------------------------------------- /cicd/ci_build.sh: -------------------------------------------------------------------------------- 1 | PREFIX="/opt/conda" 2 | mkdir -p "${PREFIX}/bin" 3 | cp UPIMAPI/upimapi.py "${PREFIX}/bin" 4 | chmod +x /opt/conda/bin/upimapi.py 5 | ln -s "${PREFIX}/bin/upimapi.py" "${PREFIX}/bin/upimapi" -------------------------------------------------------------------------------- /cicd/environment.yml: -------------------------------------------------------------------------------- 1 | name: upimapi 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - pandas 8 | - diamond 9 | - psutil 10 | - tqdm 11 | - requests 12 | - biopython 13 | - pyyaml -------------------------------------------------------------------------------- /cicd/full_ids.txt: -------------------------------------------------------------------------------- 1 | tr|A0A090I5T7|A0A090I5T7_METFO 2 | tr|A0A089ZJ62|A0A089ZJ62_METFO 3 | tr|A0A090I166|A0A090I166_METFO 4 | tr|A0A090I2M9|A0A090I2M9_METFO 5 | tr|A0A090I395|A0A090I395_METFO 6 | tr|A0A090I3B2|A0A090I3B2_METFO 7 | tr|A0A090I3H2|A0A090I3H2_METFO 8 | tr|A0A090I4Q7|A0A090I4Q7_METFO 9 | tr|A0A090I4T1|A0A090I4T1_METFO 10 | tr|A0A090I521|A0A090I521_METFO 11 | tr|A0A090I6C9|A0A090I6C9_METFO 12 | tr|A0A090I6I6|A0A090I6I6_METFO 13 | tr|A0A090I8P6|A0A090I8P6_METFO 14 | tr|A0A090I8T6|A0A090I8T6_METFO 15 | tr|A0A090I915|A0A090I915_METFO 16 | tr|A0A090IAB0|A0A090IAB0_METFO 17 | tr|A0A090JTG7|A0A090JTG7_METFO 18 | tr|A0A090JXV9|A0A090JXV9_METFO 19 | tr|A0A089Z9J6|A0A089Z9J6_METFO 20 | tr|A0A089ZCR8|A0A089ZCR8_METFO 21 | tr|A0A089ZDP0|A0A089ZDP0_METFO 22 | tr|A0A089ZDP3|A0A089ZDP3_METFO 23 | tr|A0A089ZGW2|A0A089ZGW2_METFO 24 | tr|A0A089ZH11|A0A089ZH11_METFO 25 | tr|A0A089ZHB0|A0A089ZHB0_METFO 26 | tr|A0A089ZHC6|A0A089ZHC6_METFO 27 | tr|A0A089ZHH1|A0A089ZHH1_METFO 28 | tr|A0A089ZVL0|A0A089ZVL0_METFO 29 | tr|A0A089ZVU4|A0A089ZVU4_METFO 30 | -------------------------------------------------------------------------------- /cicd/ids.blast: 
-------------------------------------------------------------------------------- 1 | sp|Q74FU6|SFRA_GEOSL sp|D7AF63|SFRA_GEOSK 100 844 0 0 1 844 1 844 0.0 1667 2 | sp|Q74FU5|SFRB_GEOSL sp|D7AF64|SFRB_GEOSK 100 672 0 0 1 672 1 672 0.0 1367 3 | sp|Q74DI8|PPNP_GEOSL tr|A0A0D5NBK3|A0A0D5NBK3_GEOSN 100 104 0 0 1 104 1 104 3.01e-70 214 4 | sp|P61422|THIED_GEOSL tr|A0A0D5N9M5|A0A0D5N9M5_GEOSN 100 490 0 0 1 490 1 490 0.0 932 5 | tr|Q74CH0|Q74CH0_GEOSL tr|A0A0D5NCT2|A0A0D5NCT2_GEOSN 100 363 0 0 1 363 1 363 3.75e-260 715 6 | sp|Q74GH5|GLMU_GEOSL tr|A0A0D5N8K0|A0A0D5N8K0_GEOSN 100 476 0 0 1 476 1 476 0.0 884 7 | tr|Q74BH3|Q74BH3_GEOSL tr|A0A0D5N307|A0A0D5N307_GEOSN 100 319 0 0 1 319 1 319 8.65e-221 612 8 | tr|Q74C72|Q74C72_GEOSL tr|A0A0D5NB94|A0A0D5NB94_GEOSN 100 519 0 0 1 519 1 519 0.0 992 9 | sp|Q74FW6|TSAL_GEOSL tr|A0A0D5N940|A0A0D5N940_GEOSN 100 402 0 0 1 402 1 402 2.43e-278 764 10 | sp|Q74BY3|PYRG_GEOSL tr|A0A0D5N2K6|A0A0D5N2K6_GEOSN 100 536 0 0 1 536 1 536 0.0 1080 11 | -------------------------------------------------------------------------------- /cicd/ids.csv: -------------------------------------------------------------------------------- 1 | P31946,P62258,ALBU_HUMAN,EFTU_ECOLI -------------------------------------------------------------------------------- /cicd/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "upimapi" %} 2 | {% set version = "1.12.1" %} 3 | {% set sha256 = "c806ba0804abf2eb482b75be3bd7312d8b117048ba800ebff788b06f430b188f" %} 4 | 5 | package: 6 | name: {{ name|lower }} 7 | version: {{ version }} 8 | 9 | source: 10 | url: https://github.com/iquasere/UPIMAPI/archive/{{ version }}.tar.gz 11 | sha256: {{ sha256 }} 12 | 13 | build: 14 | noarch: generic 15 | number: 0 16 | run_exports: 17 | - {{ pin_subpackage(name, max_pin="x.x") }} 18 | script: > 19 | mkdir -p $PREFIX/bin && 20 | cp upimapi.py $PREFIX/bin && 21 | chmod +x $PREFIX/bin/upimapi.py && 22 | ln -s $PREFIX/bin/upimapi.py $PREFIX/bin/upimapi 23 | 24 | requirements: 25 | run: 26 | - pandas 27 | - diamond 28 | - psutil 29 | - tqdm 30 | - requests 31 | - biopython 32 | - pyyaml 33 | 34 | test: 35 | commands: 36 | - upimapi -v 37 | 38 | about: 39 | home: https://github.com/iquasere/UPIMAPI 40 | license: BSD-3-Clause 41 | license_family: BSD 42 | license_file: LICENSE 43 | summary: 'UniProt Id Mapping through API' 44 | description: | 45 | UPIMAPI takes as input either a list of UniProt IDs or a BLAST file from 46 | annotation using the UniProt database as reference, and uses UniProt's API to 47 | retrieve information relative to those IDs. It is essentially a command 48 | line implementation of UniProt's ID mapping web service available at 49 | https://www.uniprot.org/uploadlists/, allowing for retrieval of information 50 | from thousands of IDs in one go, while still relying on the web service.
51 | doc_url: https://github.com/iquasere/UPIMAPI/blob/master/README.md 52 | dev_url: https://github.com/iquasere/UPIMAPI 53 | 54 | extra: 55 | recipe-maintainers: 56 | - iquasere 57 | -------------------------------------------------------------------------------- /cicd/proteomes.fasta: -------------------------------------------------------------------------------- 1 | >Q74FU6 2 | MVSLTIDGKDITVAKETTILDAAALLGITIPTLCWLKKVSPTGACRVCAVEIEGVDRPMTACNTPVKDGIKVTTQSEKLSRIRQKIMELMLVNHPLDCPVCDAGGECDLQNACYGLGAAKQEYGAVLERRKIRYDWPLIESDPNRCILCEKCVKVDHEIVGCNAIRVVNRGEATIIDTVDGNPLNCEFCGNCVAACPTGTLISKPFKFRGRPWAFTTTPSVCPFCATGCQIEYHSRNGRVERVTSDDSTYNSGNLCINGRFGYSYINSPDRLAEPMVKGQKADWNTAMGTAATALKQIVASHGADAVAGFGSPRVTNEDNYLFQKLMRSAIGTGNIDSEARLGFAATQKVLREMLGIAGASTTIDAIDRATAVLVVGCDLNAEATGMEYRVIKAATKNNAKLVLAAMRDIKLKKFANSHLKYRPGNETLLINALTKAVLEEGLENKEFCSANISNLSDLTAALAGVSIADAAAATGVTEADLRAAARLVGGKKGVAVIFGAELMRGGNTDAVKALINLALILGATAGDTGGLFPVYEKTNIRGLLDMGVAPDHFPGHQTDGTTFEKAWGKKLPAAAGKDLWQIIEGIEQGSVKALYLLGCDPVASFPEGERIRKALEKLELLIVQDPFPGEAAKMAHVVFPSSVAAEKNGTFTTIDGRVQPLAKAVAPSGDAREDWDILTELYNRLTGESRIHSPAAVLDEVAALVPAYASVGRTGGTITAQPRSGGLALAPVSARAVAGSPTTLLVGTILYHSGTTTTWSKNNLEIIPKGYIEIHPNDAAKLGIAEGGKVRLSAGSVKVEGTAKITPRVQPGLLFAPSHFRGMNVNALLSRDGGVVPVTVEKA 3 | >Q74FU5 4 | MAQVVFSSWGRTIVDNRKGGEAQDVSFRLPTTLDGERQIAAFMGWDGIILYDLKVDVPAMAAEYMKRVQTQYCCGKCTPGKKGTKVLADVLAAIIEGRATEADLDTIDDLADLLTNCKCTLCQSSTIPVLDAVKHFREDFLAYITGIRKPANVHRFIDKYTAPCMDRCPAHIDIPAYIEAIKEYRFDESLDIIRDNMPLPSVCGRVCPHPCETHCRRKNVDDSVNIMVLKRSASDYEWMHNAAPPMQPKPQKNKKVAIVGAGPAGLACAYYLALEGYPCTIYEALPEGYGGGMIAVGIPPYRQPRHLLQRDIDIISSMGVDIIYDTRIGKDISLEELKQKFDAVFLAPGAHRSKPMGVEGEDKGYKGFLKGGIDFLREAYMGRPTGMGKKVVVVGGGNTAIDCVRVALREGAEESTLLYRRSRKEMPADVWEVDGADEEGVRFEFQVLPTRVLVDENEQVTGVECVRMALGEPDASGRRRPEPVPGSEFVVECDTVIPAIGQDPDLSFIPDNLGIDITKWNTVVTKYVPLKDAAGKDLKDGMGNPLARVLITDLEGVFAGGDAEIGPLTVVACIGNAHRAARVIQRWLEEGKAYLTEDELMEDILTNMPVYDKNEKVPWLDSRERAHQAEVHGQERASKGNYQEVELGFVDTQAVEEAERCLRCYRVAMAAI 5 | >Q74DI8 6 | MSEFTNVTIIREANVYFDGGVVSRTVVFPDGTKKTLGIMQPGEYTFTTGAPEIMEILSGELDLKLPGSDAWNRVGGGESFDVPANSSFTMKVLSLTDYCCSFLG 7 | >P61422 8 | MASNGHTLRLVINRDKHDSVIRGLYLVTDHDDNLIPRVEAAIDGGARVVQYRNKNQDRESRLALGLELRELCRRRSIPFIVNDDLEMAVSLKADGLHLGQGDGDPREARRVLGPGKIIGVSTHTLSEALEAQAAGVDYIGLGAMFPSRSKEVEHVAGSELLAAIRSSISIPIVAIGGITRDNGASVIDAGADAVAVISAVLSHPDPALAATEIALLFNRRAPFPRGSVLTVAGSDSGGGAGIQADLKTVTLLGSYGSSVLTALTAQNTRGVSGIHGVPPAFVADQLDAVFSDIPVDVVKTGMLFSAETIVAIAAKLTEYRRRMVVVDPVMVAKGGANLIDRGAVSVLKERLFPLAYLVTPNIPEAERLTGANISDEESMREAARRLHRLGARNVLLKGGHLLAGDSVDILFDGAAFHRFVSPRILSKNTHGTGCTFASAIATYLAQGDPLREAIARAKRYITAAIRLAQPLGRGHGPVNHILAAEDVRDR 9 | >Q74CH0 10 | MARKVGILTGGGDCPGLNAVIRGVVKSSIIRRGWEVVGIRDGFDGLLYNNRIVPLGLNEVRGILPRGGTILGTSNRGNPFSYPVEADGKTVLTDVSDEVVANIKKQGIDALVAVGGDGSLKIALELMNKGIPVVGVPKTIDNDLMETDVTFGYNTALETATDALDKLHSTAESHHRVMIMEVMGRYAGWIALESGISGGADVILIPEIPYDISAVCRAVDERRRRGSSFSIIVVAEGAFPRGGNRVVQKRADETNTIERLGGIGQYVARQLGDCLDMDVRVMVLGHLQRGGSPSTFDRCLGSRFGVAAIDLIEQEQYGRMVCLRGRDIKSVSIERAVRKLKLVNPGGQMVTAAEELGIVVGRR 11 | >Q74GH5 12 | MDNLAAIILAAGKGTRMKSGIVKVMHPLAGAPMVAWPVAVARQAGAGRIVAVVGHQAERLREHFSNDADITLAVQEEQLGTGHAVACAAGDLSGFSGKVLILCGDVPLIRTETLRAMVTAHEATGAVLTVLTARQENPHGYGRIIRGFDGRVIRIVEEKDATPDERSRTEVNAGIYCAEASFLFDAVKRIGNDNAQGEYYLTDIITMANDRGLRCTAHPVADPVEVMGINDRVQLAEAARHARRRIAEEHMLNGVTLVDPAATYIDQGVVIGADTTIQPGVQIAGGCRVGEGCTIEAGAIIKGSELGDRCVVESRAVIRGCRLGSDVVIKAGTVMEDSTVMDHAAIGPMAHLRPGSELGAHVKIGNFVETKKIVMGEGSKASHLTYLGDATIGRNVNVGCGTITCNYDGVNKHRTVIGDDVFVGSDVQFVAPVTIGSNTLIAAGTTVTRDVPADSLAIARTPQINKEGWKLRKRDQ 13 | >Q74BH3 14 | 
>Q74BH3 14 | MKKIGILTSGGDCSGMNAAIRAAVRTAIRMNIEVVGFRKGYLGLMKGDAIPLDTKAVSGILHRGGTFLQSARSPEFKTPEGQRTALNNLKALGVEGMVVMGGDGSLTGALALNRLGLPVVGIPASIDNDIPFTDMALGVDTALNNIIYAVDCIKDTASSHARAFVIEVMGRHSGYLASISAIATGAEYALVPEREYDLAEICQQLRARYEEGRDNAIIILAEGAGHGHEIANSIKDAIGFETRVTVLGHYQRGGAPTVFDRLLASRLGKKSVELLVTGTWGVMVGLSCNAILATPLEDVIKGEKRPQDEVLRLAEVLGV -------------------------------------------------------------------------------- /upimapi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | UPIMAPI - UniProt Id Mapping through API 4 | 5 | By João Sequeira 6 | 7 | Mar 2020 8 | """ 9 | 10 | import json 11 | from argparse import ArgumentParser, ArgumentTypeError 12 | import os 13 | import sys 14 | from time import strftime, gmtime, time, sleep 15 | from subprocess import run, Popen, PIPE, check_output 16 | import requests 17 | import yaml 18 | from psutil import virtual_memory 19 | from pathlib import Path 20 | from multiprocessing import cpu_count, Pool, Manager 21 | from io import StringIO 22 | import pandas as pd 23 | import xml.etree.ElementTree as ET 24 | from tqdm import tqdm 25 | from datetime import datetime 26 | from Bio import SwissProt as SP 27 | import numpy as np 28 | from functools import partial 29 | import re 30 | 31 | __version__ = '1.13.2' 32 | 33 | 34 | def load_api_info(): 35 | return yaml.safe_load(requests.get('https://rest.uniprot.org/docs/uniprot-openapi3.yaml').text) 36 | 37 | 38 | def get_url(url, max_tries=3, **kwargs): 39 | tries = 0 40 | response = None 41 | while tries < max_tries: 42 | try: 43 | response = requests.get(url, **kwargs) 44 | if response.ok: 45 | return response 46 | except requests.RequestException: 47 | pass 48 | tries += 1 49 | sleep(5) 50 | sys.exit(f'Failed to GET {url} after {max_tries} tries' + (f' [{response.status_code}: {response.text}]' if response is not None else '')) 51 | 52 | 53 | def get_uniprot_columns(): 54 | res = get_url('https://rest.uniprot.org/configure/uniprotkb/result-fields') 55 | obj = json.loads(res.text) 56 | result = {} 57 | for i in range(len(obj)): 58 | for col in obj[i]['fields']: 59 | result[col['label']] = col['name'] 60 | return result 61 | 62 | 63 | def get_id_mapping_fields(): 64 | res = get_url('https://rest.uniprot.org/configure/idmapping/fields') 65 | obj = json.loads(res.text) 66 | froms, tos = {}, {} 67 | for group in range(len(obj['groups'])): 68 | for item in obj['groups'][group]['items']: 69 | if item['from']: 70 | froms[item['displayName']] = item['name'] 71 | if item['to']: 72 | tos[item['displayName']] = item['name'] 73 | return froms, tos 74 | 75 | 76 | api_info = load_api_info() 77 | columns_dict = get_uniprot_columns() 78 | from_fields, to_fields = get_id_mapping_fields() 79 | 80 | 81 | def get_arguments(): 82 | parser = ArgumentParser(description="UniProt Id Mapping through API", 83 | epilog="A tool for retrieving information from UniProt.") 84 | parser.add_argument( 85 | "-i", "--input", help="""Input filename - can be:\n 86 | \t1. a file containing a list of IDs (comma-separated values, no spaces)\n 87 | \t2. a BLAST TSV result file (requires to be specified with the --blast parameter)\n 88 | \t3. a protein FASTA file to be annotated (requires the -db parameter)\n 89 | \t4. nothing! If so, will read input from command line, and parse as CSV (id1,id2,...)""") 90 | parser.add_argument("-o", "--output", help="Folder to store outputs", default="UPIMAPI_output") 91 | parser.add_argument( 92 | "-ot", "--output-table", 93 | help="Filename of table output, where UniProt info is stored. 
If set, will override 'output' parameter " 94 | "just for that specific file") 95 | parser.add_argument( 96 | "-rd", "--resources-directory", default=os.path.expanduser("~/upimapi_resources"), 97 | help="Directory to store resources of UPIMAPI [~/upimapi_resources]") 98 | parser.add_argument( 99 | "-cols", "--columns", default=None, help="List of UniProt columns to obtain information from (separated by &)") 100 | parser.add_argument( 101 | "--from-db", default="UniProtKB AC/ID", choices=from_fields.keys(), 102 | help="Which database are the IDs from. If from UniProt, default is fine [UniProtKB AC/ID]") 103 | parser.add_argument( 104 | "--to-db", default="UniProtKB", choices=to_fields.keys(), 105 | help="To which database the IDs should be mapped. If only interested in columns information " 106 | "(which include cross-references), default is fine [UniProtKB]") 107 | parser.add_argument( 108 | "--blast", action="store_true", default=False, 109 | help="If input file is in BLAST TSV format (will consider one ID per line if not set) [false]") 110 | parser.add_argument( 111 | "--full-id", type=str2bool, default="auto", help="If IDs in database are in 'full' format: tr|XXX|XXX [auto]") 112 | parser.add_argument( 113 | "--fasta", help="Output will be generated in FASTA format [false]", action="store_true", default=False) 114 | parser.add_argument( 115 | "--step", type=int, default=1000, help="How many IDs to submit per request to the API [1000]") 116 | parser.add_argument( 117 | "--max-tries", default=3, type=int, 118 | help="How many times to try obtaining information from UniProt before giving up [3]") 119 | parser.add_argument("--sleep", default=3, type=int, help="Time between requests (in seconds) [3]") 120 | parser.add_argument( 121 | "--no-annotation", action="store_true", default=False, 122 | help="Do not perform annotation - input must be in one of BLAST result or TXT IDs file or STDIN [false]") 123 | parser.add_argument( 124 | "--local-id-mapping", action="store_true", default=False, 125 | help="Perform local ID mapping of SwissProt IDs. Advisable if many IDs of SwissProt are present [false]") 126 | parser.add_argument( 127 | "--skip-id-mapping", action="store_true", default=False, 128 | help="If true, UPIMAPI will not perform ID mapping [false]") 129 | parser.add_argument( 130 | "--skip-id-checking", action="store_true", default=False, 131 | help="If true, UPIMAPI will not check if IDs are valid before mapping [false]") 132 | parser.add_argument( 133 | "--skip-db-check", action="store_true", default=False, 134 | help="So UPIMAPI doesn't check for (FASTA) database existence [false]") 135 | parser.add_argument( 136 | "--mirror", choices=['expasy', 'uniprot', 'ebi'], default='expasy', 137 | help="From where to download UniProt database [expasy]") 138 | parser.add_argument('-v', '--version', action='version', version=f'UPIMAPI {__version__}') 139 | 140 | diamond_args = parser.add_argument_group('DIAMOND arguments') 141 | diamond_args.add_argument( 142 | "-db", "--database", default='uniprot', 143 | help="How the reference database is inputted to UPIMAPI.\n" 144 | "\t1. uniprot - UPIMAPI will download the entire UniProt and use it as reference\n" 145 | "\t2. swissprot - UPIMAPI will download SwissProt and use it as reference\n" 146 | "\t3. taxids - Reference proteomes will be downloaded for the taxa specified with the --taxids, and those " 147 | "will be used as reference\n" 148 | "\t4. 
a custom database - Input will be considered as the database, and will be used as reference") 149 | diamond_args.add_argument( 150 | "-t", "--threads", type=int, default=cpu_count(), 151 | help="Number of threads to use in annotation steps [all available]") 152 | diamond_args.add_argument( 153 | "--evalue", type=float, default=1e-3, help="Maximum e-value to report annotations for [1e-3]") 154 | diamond_args.add_argument( 155 | "--pident", type=float, default=None, help="Minimum pident to report annotations for.") 156 | diamond_args.add_argument( 157 | "--bitscore", type=float, default=None, help="Minimum bit score to report annotations for (overrides e-value).") 158 | diamond_args.add_argument( 159 | "-mts", "--max-target-seqs", type=int, default=1, 160 | help="Number of annotations to output per sequence inputted [1]") 161 | diamond_args.add_argument( 162 | "-b", "--block-size", type=float, 163 | help="Billions of sequence letters to be processed at a time [memory / 20]") 164 | diamond_args.add_argument( 165 | "-c", "--index-chunks", type=int, 166 | help="Number of chunks for processing the seed index [dependent on block size]") 167 | diamond_args.add_argument( 168 | "--max-memory", type=float, default=virtual_memory().available / (1024.0 ** 3), 169 | help="Maximum memory to use (in Gb) [all available]") 170 | diamond_args.add_argument( 171 | "--taxids", default=None, help="Tax IDs to obtain protein sequences of, for building a reference database.") 172 | diamond_args.add_argument( 173 | '--diamond-mode', help="Mode to run DIAMOND with [fast]", default='fast', 174 | choices=['fast', 'mid_sensitive', 'sensitive', 'more_sensitive', 'very_sensitive', 'ultra_sensitive']) 175 | 176 | special_functions = parser.add_argument_group('Special functions') 177 | special_functions.add_argument( 178 | "--show-available-fields", action="store_true", default=False, 179 | help="Outputs the fields available from the API.") 180 | 181 | args = parser.parse_args() 182 | if args.show_available_fields: 183 | sys.exit('\n'.join(columns_dict.keys())) 184 | 185 | args.output = args.output.rstrip('/') 186 | args.resources_directory = args.resources_directory.rstrip('/') 187 | args.columns = args.columns.split('&') if args.columns else None 188 | 189 | columns_fine = True 190 | if args.columns: 191 | for col in args.columns: 192 | if col not in columns_dict.keys() and not col.startswith('Taxonomic lineage'): 193 | print( 194 | f'ERR: [{col}] is not a valid column name for ID mapping. 
For more information, check ' 195 | f'https://github.com/iquasere/UPIMAPI/tree/master#sometimes-the-return-fields-are-not-properly-updated') 196 | columns_fine = False 197 | if not columns_fine: 198 | sys.exit(1) 199 | if args.taxids is not None: 200 | args.taxids = args.taxids.split(',') 201 | return args 202 | 203 | 204 | def timed_message(message): 205 | print(f'[{strftime("%Y-%m-%d %H:%M:%S", gmtime())}] {message}') 206 | 207 | 208 | def human_time(seconds): 209 | days = round(seconds // 86400) 210 | if days > 0: 211 | return strftime(f"{days}d%Hh%Mm%Ss", gmtime(seconds)) 212 | return strftime("%Hh%Mm%Ss", gmtime(seconds)) 213 | 214 | 215 | def str2bool(v): 216 | if v.lower() == 'auto': 217 | return 'auto' 218 | elif v.lower() in ('yes', 'true', 't', 'y', '1'): 219 | return True 220 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 221 | return False 222 | else: 223 | raise ArgumentTypeError('Boolean value expected.') 224 | 225 | 226 | def get_fasta_ids(filename): 227 | return [line[1:-1] for line in open(filename) if line.startswith('>')] 228 | 229 | 230 | def parse_blast(blast): 231 | result = pd.read_csv(blast, sep='\t', header=None) 232 | result.columns = [ 233 | 'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 234 | 'bitscore'] 235 | return result 236 | 237 | 238 | def string4mapping(columns=None): 239 | if columns is None or columns == []: # if no columns are inputted, UPIMAPI uses defaults 240 | return None 241 | valid_columns = [column for column in columns if column in columns_dict.keys()] 242 | invalid_columns = [column for column in columns if column not in columns_dict.keys()] 243 | for col in invalid_columns: 244 | print(f'WARNING: "{col}" is not a valid column name. ' 245 | f'Check https://www.uniprot.org/help/return_fields (Label* column) for valid column names ' 246 | f'or raise an issue at https://github.com/iquasere/UPIMAPI/issues') 247 | return ','.join([columns_dict[column] for column in valid_columns]) 248 | 249 | 250 | def parallelize(data, func, num_of_processes=8): 251 | data_split = np.array_split(data, num_of_processes) 252 | pool = Pool(num_of_processes) 253 | data = pd.concat(pool.map(func, data_split)) 254 | pool.close() 255 | pool.join() 256 | return data 257 | 258 | 259 | def run_on_subset(func, data_subset, **kwargs): 260 | return data_subset.apply(func, **kwargs) 261 | 262 | 263 | def parallelize_on_rows(data, func, num_of_processes=8, **kwargs): 264 | return parallelize(data, partial(run_on_subset, func, **kwargs), num_of_processes) 265 | 266 | 267 | def uniprot_request(ids, columns=None, output_format='tsv'): 268 | """ 269 | Input: 270 | ids: list of UniProt IDs to query 271 | columns: names of UniProt columns to get info on; if None, 272 | UPIMAPI's default selection of columns is requested 273 | output_format: format of response to get ('tsv' or 'fasta') 274 | Output: 275 | Returns the text content of the response from UniProt: 276 | a TSV table of the requested fields, or FASTA sequences, 277 | depending on output_format 278 | """ 279 | fields = f'&fields={string4mapping(columns=columns)}' if output_format == 'tsv' else '' 280 | WEBSITE_API = api_info['servers'][0]['url'] 281 | resp = get_url(f"{WEBSITE_API}/uniprotkb/accessions?accessions={','.join(ids)}{fields}&format={output_format}") 282 | return resp.text 283 | 284 | 285 | def split_list(a, n): 286 | k, m = divmod(len(a), n) 287 | return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)) 288 | 289 | 290 | 
def submit_id_mapping(from_db, to_db, ids): 291 | """ 292 | Get info from one database to the other 293 | :param from_db: Available options at https://rest.uniprot.org/configure/idmapping/fields 294 | :param to_db: Available options at https://rest.uniprot.org/configure/idmapping/fields 295 | :param ids: 296 | :return: 297 | """ 298 | data = {"from": from_fields[from_db], "to": to_fields[to_db], "ids": ids} 299 | r = requests.post(f"{api_info['servers'][0]['url']}/idmapping/run", data=data) 300 | r.raise_for_status() 301 | return r.json()["jobId"] 302 | 303 | 304 | def get_id_mapping_results(job_id): 305 | while True: 306 | r = get_url(f"{api_info['servers'][0]['url']}/idmapping/status/{job_id}") 307 | job = r.json() 308 | if "jobStatus" in job: 309 | if job["jobStatus"] == "RUNNING": 310 | sleep(3) 311 | else: 312 | return r 313 | 314 | 315 | def basic_idmapping(ids, from_db, to_db): 316 | """ 317 | Get info from one database to the other 318 | :param ids: 319 | :param from_db: Available options at https://rest.uniprot.org/configure/idmapping/fields 320 | :param to_db: Available options at https://rest.uniprot.org/configure/idmapping/fields 321 | :return: 322 | """ 323 | job_id = submit_id_mapping(from_db, to_db, ids) 324 | r = get_id_mapping_results(job_id) 325 | result = pd.DataFrame().from_dict(r.json()["results"]) 326 | while r.links.get("next", {}).get("url"): 327 | r = get_url(r.links["next"]["url"]) 328 | result = pd.concat([result, pd.DataFrame().from_dict(r.json()["results"])]) 329 | return result 330 | 331 | 332 | def basic_idmapping_batch(ids, from_db, to_db, step=1000): 333 | """ 334 | Allows to retrieve millions of IDs at once, there seems to be some limit causing UniProt's API to fail with 335 | "Request Entity Too Large for url". 336 | :param to_db: 337 | :param from_db: 338 | :param step: 339 | :param ids: 340 | :return: 341 | """ 342 | result = pd.DataFrame() 343 | for i in tqdm(range(0, len(ids), step), desc='Getting valid UniProt IDs', ascii=' >='): 344 | done = False 345 | while not done: 346 | j = min(i + step, len(ids)) 347 | try: 348 | result = pd.concat([result, basic_idmapping(ids[i:j], from_db, to_db)]) 349 | done = True 350 | except: 351 | sleep(3) 352 | return result 353 | 354 | 355 | def basic_idmapping_multiprocess(ids, output, from_db, to_db, step=1000, threads=15): 356 | result = pd.DataFrame() 357 | ids_groups = split_list(ids, threads) 358 | with Manager() as m: 359 | with m.Pool() as p: 360 | mapping_results = p.starmap(basic_idmapping_batch, [( 361 | ids_group, from_db, to_db, step) for ids_group in ids_groups]) 362 | for res in mapping_results: 363 | result = pd.concat([result, res]) 364 | timed_message(f'{result["from"].unique().size} IDs were successfully mapped.') 365 | result.to_csv(output, sep='\t', index=False) 366 | timed_message(f'Results saved at {output}.') 367 | 368 | 369 | def get_valid_entries(ids): 370 | job_id = submit_id_mapping("UniProtKB AC/ID", "UniProtKB", ids) 371 | r = get_id_mapping_results(job_id) 372 | valid_entries = [res["from"] for res in r.json()["results"] if '_' not in res["from"]] 373 | while r.links.get("next", {}).get("url"): 374 | r = get_url(r.links["next"]["url"]) 375 | valid_entries += [res["from"] for res in r.json()["results"] if '_' not in res["from"]] 376 | return valid_entries 377 | 378 | 379 | def get_valid_entries_batch(ids, step=1000): 380 | """ 381 | Allows to retrieve millions of IDs at once, there seems to be some limit causing UniProt's API to fail with 382 | "Request Entity Too Large for url". 
383 | :param step: 384 | :param ids: 385 | :return: 386 | """ 387 | valid_entries = [] 388 | for i in tqdm(range(0, len(ids), step), desc='Getting valid UniProt IDs', ascii=' >='): 389 | done = False 390 | while not done: 391 | j = min(i + step, len(ids)) 392 | try: 393 | valid_entries += get_valid_entries(ids[i:j]) 394 | done = True 395 | except: 396 | sleep(3) 397 | return valid_entries 398 | 399 | 400 | def get_valid_entries_multiprocess(ids, step=1000, threads=15): 401 | valid_entries = [] 402 | ids_groups = split_list(ids, threads) 403 | with Manager() as m: 404 | with m.Pool() as p: 405 | result = p.starmap(get_valid_entries_batch, [(ids_group, step) for ids_group in ids_groups]) 406 | for res in result: 407 | valid_entries += res 408 | not_valid = [ide for ide in ids if ide not in valid_entries] 409 | # take, from the valid IDs, the part after the dot, as this invalidates them 410 | valid_entries = [entry.split('.')[0] for entry in valid_entries] 411 | timed_message(f'{len(valid_entries)} UniProt IDs identified as valid.') 412 | return valid_entries, not_valid 413 | 414 | 415 | def get_uniprot_information(ids, step=1000, sleep_time=30, columns=None, max_tries=3): 416 | """ 417 | Input: 418 | ids: list of UniProt IDs to query 419 | step: INT, number of IDs to send per request 420 | sleep_time: INT, number of seconds to wait between requests 421 | columns: list - names of UniProt columns to get info on 422 | Output: 423 | pd.DataFrame will be returned with the information about the IDs queried. 424 | """ 425 | result = pd.DataFrame() 426 | for i in tqdm(range(0, len(ids), step), desc=f'Retrieving UniProt information from {len(ids)} IDs'): 427 | tries = 0 428 | done = False 429 | j = min(i + step, len(ids)) 430 | while not done and tries < max_tries: 431 | try: 432 | data = uniprot_request(ids[i:j], columns=columns) 433 | if len(data) > 0: 434 | uniprotinfo = pd.read_csv(StringIO(data), sep='\t') 435 | result = pd.concat([result, uniprotinfo[uniprotinfo.columns.tolist()]]) 436 | sleep(sleep_time) 437 | done = True 438 | except ConnectionError: 439 | print(f'ID mapping failed. Remaining tries: {max_tries - tries}') 440 | tries += 1 441 | sleep(10) 442 | return result 443 | 444 | 445 | def get_uniprot_fasta(ids, step=1000, sleep_time=30): 446 | """ 447 | Input: 448 | ids: list of UniProt IDs to query 449 | step: INT, number of IDs to send per request 450 | sleep_time: INT, number of seconds to wait between requests 451 | Output: 452 | str object containing the fasta sequences and headers 453 | of the proteis belonging to the IDs queried will be returned 454 | """ 455 | result = '' 456 | for i in tqdm(range(0, len(ids), step), desc=f"Building FASTA from {len(ids)} IDs."): 457 | j = min(i + step, len(ids)) 458 | data = uniprot_request(ids[i:j], output_format='fasta') 459 | if len(data) > 0: 460 | result += data 461 | sleep(sleep_time) 462 | return result 463 | 464 | 465 | def uniprot_fasta_workflow(all_ids, output, max_iter=5, step=1000, sleep_time=10): 466 | if os.path.isfile(output): 467 | print(f'{output} was found. Will perform mapping for the remaining IDs.') 468 | ids_done = get_fasta_ids(output) 469 | else: 470 | print(f'{output} not found. 
Will perform mapping for all IDs.') 471 | ids_done = [] 472 | ids_missing = list(set(all_ids) - set(ids_done)) 473 | 474 | tries = 0 475 | ids_done = ([ide.split('|')[1] for ide in get_fasta_ids(output)] if os.path.isfile(output) else []) 476 | while len(ids_done) < len(all_ids) and tries < max_iter: 477 | ids_missing = list(set([ide for ide in tqdm(all_ids, desc='Checking which IDs are missing information.') 478 | if ide not in ids_done])) 479 | print(f'Information already gathered for {int(len(ids_done) / 2)} ids. Still missing for {len(ids_missing)}.') 480 | uniprotinfo = get_uniprot_fasta(ids_missing, step=step, sleep_time=sleep_time) 481 | with open(output, 'a') as file: 482 | file.write(uniprotinfo) 483 | ids_done = [ide.split('|')[1] for ide in get_fasta_ids(output)] 484 | tries += 1 485 | if len(ids_done) == len(all_ids): 486 | print(f'Results for all IDs are available at {output}') 487 | else: 488 | ids_unmapped_output = f"{'/'.join(output.split('/')[:-1])}/ids_unmapped.txt" 489 | handler = open(ids_unmapped_output, 'w') 490 | handler.write('\n'.join(ids_missing)) 491 | print(f'Maximum iterations were made. Results related to {str(len(ids_missing))} IDs were not obtained. ' 492 | f'IDs with missing information are available at {ids_unmapped_output} and information obtained is ' 493 | f'available at {output}') 494 | 495 | 496 | def check_ids_already_done(output, ids): 497 | if os.path.isfile(output) and os.stat(output).st_size > 1: 498 | try: 499 | result = pd.read_csv(output, sep='\t', low_memory=False).drop_duplicates() 500 | print(f'{output} was found. Will perform mapping for the remaining IDs.') 501 | if 'Entry Name' not in result.columns: 502 | result['Entry Name'] = [np.nan] * len(result) 503 | ids_done = result['Entry'].unique().tolist() + result['Entry Name'].unique().tolist() 504 | except OSError: # file doesn't exist or is empty 505 | print(f'{output} was found. However, it could not be parsed. Will restart mapping.') 506 | result = pd.DataFrame() 507 | ids_done = [] 508 | else: 509 | print(f'{output} not found or empty. 
Will perform mapping for all IDs.') 510 | result = pd.DataFrame() 511 | ids_done = [] 512 | ids_missing = list(set(ids) - set(ids_done)) 513 | print(f'IDs present in uniprotinfo file: {int(len(ids_done) / 2)}') # entry and entry name count by 2 514 | print(f'IDs missing: {len(ids_missing)}') 515 | return ids_done, ids_missing, result 516 | 517 | 518 | def select_columns(columns): 519 | """ 520 | :param columns: list - of columns to retrieve information from, including taxonomic columns 521 | :return: new_cols: list - of columns to retrieve information from, without taxonomic columns added by UPIMAPI 522 | :return: tax_cols: list - of taxonomic columns to retrieve information from 523 | :return: taxids_cols: list - of taxid columns to retrieve information from 524 | """ 525 | if columns is None: 526 | columns = [ # default columns of UPIMAPI 527 | 'Entry', 'Entry Name', 'Organism', 'Organism (ID)', 'Taxonomic lineage', 'Taxonomic lineage (Ids)', 528 | 'Gene Names', 'Protein names', 'EC number', 'Function [CC]', 'Pathway', 'Keywords', 529 | 'Protein existence', 'Gene Ontology (GO)', 'Protein families', 'BRENDA', 'BioCyc', 'CDD', 'eggNOG', 530 | 'Ensembl', 'InterPro', 'KEGG', 'Pfam', 'Reactome', 'RefSeq', 'UniPathway', 531 | 'Taxonomic lineage (SUPERKINGDOM)', 'Taxonomic lineage (PHYLUM)', 'Taxonomic lineage (CLASS)', 532 | 'Taxonomic lineage (ORDER)', 'Taxonomic lineage (FAMILY)', 'Taxonomic lineage (GENUS)', 533 | 'Taxonomic lineage (SPECIES)', 'Taxonomic lineage IDs (SPECIES)'] 534 | tax_cols = [col for col in columns if ('Taxonomic lineage (' in col and col not in [ 535 | 'Taxonomic lineage (SPECIES)', 'Taxonomic lineage (Ids)'])] 536 | taxids_cols = [col for col in columns if ( 537 | 'Taxonomic lineage IDs (' in col and col not in 'Taxonomic lineage IDs (SPECIES)')] 538 | for col in ['Entry Name', 'Entry']: 539 | if col not in columns: 540 | columns.insert(0, col) 541 | new_cols = [col for col in columns if col not in tax_cols + taxids_cols + [ 542 | 'Taxonomic lineage (SPECIES)', 'Taxonomic lineage IDs (SPECIES)']] 543 | col_conversion = {'Organism': 'Taxonomic lineage (SPECIES)', 'Organism (ID)': 'Taxonomic lineage IDs (SPECIES)'} 544 | for k, v in col_conversion.items(): 545 | if v in columns and k not in new_cols: 546 | new_cols.append(k) 547 | conditions = { 548 | 'len(tax_cols) > 0 and "Taxonomic lineage" not in new_cols': 'Taxonomic lineage', 549 | 'len(taxids_cols) > 0 and "Taxonomic lineage (Ids)" not in new_cols': 'Taxonomic lineage (Ids)', 550 | '"Taxonomic lineage (SPECIES)" in columns and "Organism" not in new_cols': 'Organism', 551 | '"Taxonomic lineage IDs (SPECIES)" in columns and "Organism (ID)" not in new_cols': 'Organism (ID)'} 552 | for cond, col in conditions.items(): # check if cond (key) is True, then append or not the col (value) 553 | if eval(cond): 554 | new_cols.append(col) 555 | for col in ['Entry Name', 'Entry']: # UPIMAPI requires these two columns to be present 556 | if col not in new_cols: 557 | new_cols.insert(0, col) 558 | return columns, new_cols, tax_cols, taxids_cols 559 | 560 | 561 | def make_taxonomic_lineage_df(tax_lineage_col, prefix='Taxonomic lineage IDs'): 562 | """ 563 | Parses the taxonomic lineage column of the uniprotinfo dataframe and returns a dataframe with the taxonomic lineage 564 | separated in columns. 
559 | 
560 | 
561 | def make_taxonomic_lineage_df(tax_lineage_col, prefix='Taxonomic lineage IDs'):
562 |     """
563 |     Parses the taxonomic lineage column of the uniprotinfo dataframe and returns a dataframe with the taxonomic lineage
564 |     separated in columns.
565 |     :param tax_lineage_col: pd.Series with the taxonomic lineage column of the uniprotinfo dataframe
566 |     :param prefix: str, prefix to use for the columns of the new dataframe
567 |     :return: pd.DataFrame with the taxonomic lineage separated in columns
568 |     """
569 |     # First, split each record on every comma that follows a closing parenthesis
570 |     split_regex = r"(?<=\)),"
571 |     result = pd.DataFrame.from_records(tax_lineage_col.apply(lambda x: re.split(split_regex, x)).apply(
572 |         # Then, split each record by ' ('
573 |         lambda x: [part[:-1].split(' (') for part in x]).apply(
574 |         # Finally, build dictionary with the taxonomic level as key and the taxonomy as value. ' ('.join avoids cases
575 |         # where the taxonomy has a '(' in it (e.g. 'Clostridium scindens (strain JCM 10418 / VPI 12708) (species)')
576 |         lambda x: {part[-1]: ' ('.join(part[:-1]) for part in x if part[-1] != 'no rank'}))
577 |     # Rename columns in old UniProt fashion
578 |     result.rename(columns={col: f'{prefix} ({col.upper()})' for col in result.columns}, inplace=True)
579 |     for col in result.columns:
580 |         result[col] = result[col].str.lstrip()
581 |     return result
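# Hedged worked example of the lineage parsing above, on a string in the format of
# UniProt's 'Taxonomic lineage' field (value invented for illustration):
#     'Bacteria (superkingdom), Pseudomonadota (phylum)'
# re.split(r"(?<=\)),", ...)  ->  ['Bacteria (superkingdom)', ' Pseudomonadota (phylum)']
# part[:-1].split(' (')       ->  [['Bacteria', 'superkingdom'], [' Pseudomonadota', 'phylum']]
# dict comprehension          ->  {'superkingdom': 'Bacteria', 'phylum': ' Pseudomonadota'}
# After renaming and lstrip, this yields e.g. a 'Taxonomic lineage (SUPERKINGDOM)' column.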
582 | 
583 | 
584 | def uniprot_information_workflow(ids, output, max_iter=5, columns=None, step=1000, sleep_time=10):
585 |     ids_done, ids_missing, result = check_ids_already_done(output, ids)
586 |     tries, last_ids_missing, ids_unmapped_output = 0, None, f"{'/'.join(output.split('/')[:-1])}/ids_unmapped.txt"
587 |     columns, new_cols, tax_cols, taxids_cols = select_columns(columns)
588 |     # "columns" now holds the user's selection (or UPIMAPI's defaults); "new_cols" is what is queried from the API
589 |     uniprotinfo = pd.DataFrame()
590 |     while len(ids_missing) > 0 and tries < max_iter and ids_missing != last_ids_missing:
591 |         print(f'Information already gathered for {int(len(ids_done) / 2)} ids. Still missing for {len(ids_missing)}.')
592 |         last_ids_missing = ids_missing
593 |         info = get_uniprot_information(
594 |             ids_missing, step=step, columns=new_cols, max_tries=max_iter, sleep_time=sleep_time)
595 |         info.reset_index(inplace=True, drop=True)
596 |         if len(info) > 0:
597 |             ids_done += info['Entry'].unique().tolist() + info['Entry Name'].unique().tolist()
598 |             uniprotinfo = pd.concat([uniprotinfo, info], ignore_index=True)
599 |         ids_missing = list(set(last_ids_missing) - set(ids_done))
600 |         if len(ids_missing) > 0:
601 |             if last_ids_missing == ids_missing:
602 |                 print("Could not map additional IDs for this mapping. There were probably some outdated IDs. "
603 |                       "For more questions, please contact through https://github.com/iquasere/UPIMAPI/issues")
604 |             else:
605 |                 print('Failed to retrieve information for some IDs. Retrying request.')
606 |         tries += 1
607 |     if len(uniprotinfo) == 0:
608 |         return result
609 |     tax_df = pd.DataFrame()
610 |     if len(tax_cols) > 0:
611 |         tax_df = make_taxonomic_lineage_df(uniprotinfo['Taxonomic lineage'], prefix='Taxonomic lineage')
612 |     if len(taxids_cols) > 0:
613 |         tax_df = pd.concat([tax_df, make_taxonomic_lineage_df(
614 |             uniprotinfo['Taxonomic lineage (Ids)'], prefix='Taxonomic lineage IDs')], axis=1)
615 |     # rename columns to old UniProt fashion if those columns are to be outputted
616 |     # then remove the original columns if they are not to be outputted
617 |     col_conversion = {'Organism': 'Taxonomic lineage (SPECIES)', 'Organism (ID)': 'Taxonomic lineage IDs (SPECIES)'}
618 |     for k, v in col_conversion.items():
619 |         if v in columns:
620 |             uniprotinfo[v] = uniprotinfo[k]
621 |             if k not in columns:
622 |                 del uniprotinfo[k]
623 |     tax_df_gut_cols = [col for col in tax_df.columns if col not in col_conversion.values()]  # don't repeat columns that were added in the previous loop
624 |     uniprotinfo = pd.concat([uniprotinfo, tax_df[tax_df_gut_cols]], axis=1)
625 |     result = pd.concat([result, uniprotinfo[columns]], ignore_index=True)
626 |     if len(ids_missing) == 0:
627 |         print(f'Results for all IDs are available at {output}')
628 |     else:
629 |         Path(ids_unmapped_output).write_text('\n'.join(ids_missing))
630 |         print(f"Maximum iterations were made. Results related to {len(ids_missing)} IDs were not obtained. "
631 |               f"IDs with missing information are available at {ids_unmapped_output} and information obtained is "
632 |               f"available at {output}")
633 |     return result
634 | 
635 | 
636 | def determine_full_id(ids):
637 |     for ide in ids:
638 |         if '|' in ide:
639 |             return True
640 |     return False
641 | 
642 | 
643 | def parse_fasta(file):
644 |     with open(file) as f:
645 |         lines = [line.rstrip('\n') for line in f]
646 |     sequences = {}
647 |     name = None
648 |     for line in lines:
649 |         if line.startswith('>'):
650 |             name = line[1:]
651 |             sequences[name] = ''
652 |         elif name is not None:
653 |             # sequence lines are appended to the current header; anything
654 |             # before the first header is skipped, so the parse cannot stall
655 |             sequences[name] += line
656 |     return sequences
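# Hedged sketch of parse_fasta on a two-record FASTA (contents invented):
#     >sp|P0A796|PFKA_ECOLI
#     MIKKIGVLTSGGDAPG
#     MVNAVRA
#     >sp|P0A6F3|GLPK_ECOLI
#     MSEKKYIVALDQGTTSSR
# parse_fasta(...) -> {'sp|P0A796|PFKA_ECOLI': 'MIKKIGVLTSGGDAPGMVNAVRA',
#                      'sp|P0A6F3|GLPK_ECOLI': 'MSEKKYIVALDQGTTSSR'}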
657 | 
658 | 
659 | def get_ids(args_input, input_type, full_id='auto'):
660 |     if args_input.endswith(('.zip', '.tar', '.gz', '.bz2')):
661 |         exit('File seems to be compressed! If not, please change its extension.')
662 |     if input_type == 'blast':
663 |         ids = parse_blast(args_input)['sseqid'].tolist()
664 |     elif input_type == 'txt':
665 |         ids = []
666 |         with open(args_input) as f:
667 |             preids = f.read().split('\n')
668 |         for preid in preids:
669 |             ids += preid.split(',')
670 |     elif input_type == 'fasta':
671 |         ids = list(parse_fasta(args_input).keys())
672 |     else:  # if PIPE
673 |         ids = args_input.split(',')
674 |     if full_id == 'auto':
675 |         full_id = determine_full_id(ids)
676 |         print(f'Auto determined "full id" as: {full_id}')
677 |     if full_id:
678 |         return_ids = [ide.split('|')[1] for ide in ids if ide not in ['*', '']]
679 |         sp_ids = [ide.split('|')[1] for ide in ids if ide.startswith('sp')]
680 |         return return_ids, full_id, sp_ids
681 |     return_ids = [ide for ide in ids if ide not in ['*', '']]
682 |     return return_ids, full_id, return_ids
683 |     # the second return_ids is just a mock, so the output has the same shape as in the sp_ids case
684 | 
685 | 
686 | def run_command(bash_command, print_message=True):
687 |     if print_message:
688 |         print(bash_command)
689 |     run(bash_command.split(), check=True)
690 | 
691 | 
692 | def run_pipe_command(bash_command, output='', mode='w', print_message=True):
693 |     if print_message:
694 |         print(bash_command)
695 |     if output == '':
696 |         Popen(bash_command, stdin=PIPE, shell=True).communicate()
697 |     elif output == 'PIPE':
698 |         return Popen(bash_command, stdin=PIPE, shell=True,
699 |                      stdout=PIPE).communicate()[0].decode('utf8')
700 |     else:
701 |         with open(output, mode) as output_file:
702 |             Popen(bash_command, stdin=PIPE, shell=True, stdout=output_file).communicate()
703 | 
704 | 
705 | def make_diamond_database(fasta, dmnd):
706 |     run_command(f'diamond makedb --in {fasta} -d {dmnd}')
707 | 
708 | 
709 | def block_size_and_index_chunks(argsb, argsc, memory):
710 |     if argsb:
711 |         b = argsb
712 |     else:
713 |         b = memory / 20  # DIAMOND's block size (-b): roughly 1/20th of the available memory, in GB
714 |     if argsc:
715 |         return b, argsc
716 |     if b > 3:
717 |         return b, 1
718 |     if b > 2:
719 |         return b, 2
720 |     if b > 1:
721 |         return b, 3
722 |     return b, 4
723 | 
724 | 
725 | def run_diamond(query, aligned, unaligned, database, threads=12, max_target_seqs=50, b=1, c=4, e_value=0.01,
726 |                 bit_score=None, pident=None, mode='fast'):
727 |     command = (
728 |         f"diamond blastp --query {query} --out {aligned} --un {unaligned} --db {database} --outfmt 6 --unal 1 "
729 |         f"--threads {threads} --max-target-seqs {max_target_seqs} -b {b} -c {c} --evalue {e_value} "
730 |         f"--{mode.replace('_', '-')}")
731 |     if bit_score:
732 |         command += f' --min-score {bit_score}'
733 |     if pident:
734 |         command += f' --id {pident}'
735 |     run_command(command)
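# Hedged worked example for the sizing heuristic above (values illustrative): with
# 64 GB of memory and no explicit -b/-c, b = 64 / 20 = 3.2; since b > 3, one index
# chunk is used, so DIAMOND would be called with '-b 3.2 -c 1'.
#     >>> block_size_and_index_chunks(None, None, 64)
#     (3.2, 1)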
736 | 
737 | 
738 | def get_proteome_for_taxid_slow(taxid, max_tries=3):
739 |     """
740 |     Get proteome for taxid the "proper" way, following pagination. It is very slow, though, so it is not used.
741 |     :param taxid: int or str - NCBI taxonomy ID to fetch the proteome of
742 |     :param max_tries: int - number of times to retry a failed page request
743 |     :return: str - the proteome in FASTA format
744 |     """
745 |     tries = 0
746 |     res = requests.get(f'https://rest.uniprot.org/uniprotkb/search?format=fasta&query=%28taxonomy_id%3A{taxid}%29')
747 |     result = res.content.decode('utf8')
748 |     pages = 0
749 |     while tries < max_tries and res.links != {}:
750 |         try:
751 |             res = requests.get(res.links['next']['url'])
752 |             result += res.content.decode('utf8')
753 |             pages += 1
754 |         except requests.RequestException:
755 |             tries += 1
756 |             sleep(10)
757 |     return result
758 | 
759 | 
760 | def get_proteome_for_taxid(taxid, max_tries=3):
761 |     tries = 0
762 |     url = f'https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=taxonomy_id:{taxid}'
763 |     while tries < max_tries:
764 |         try:
765 |             return requests.get(url).content.decode('utf8')
766 |         except requests.RequestException:
767 |             print(f'Failed! {max_tries - tries - 1} tries remaining.')
768 |             tries += 1
769 |             sleep(10)
770 |     return ''  # all tries failed; return an empty string so callers can still write their output
771 | 
772 | def local_uniprot_is_outdated(local_reldate_file):
773 |     local = open(local_reldate_file).readlines()
774 |     [sp_date, tr_date] = [datetime.strptime(local[i][:-1].split()[-1], '%d-%b-%Y') for i in [1, 2]]
775 |     current = requests.get("https://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/complete/reldate.txt"
776 |                            ).content.decode('utf8').split('\n')
777 |     [c_sp_date, c_tr_date] = [datetime.strptime(current[i][:-1].split()[-1], '%d-%b-%Y') for i in [1, 2]]
778 |     return c_sp_date > sp_date or c_tr_date > tr_date  # the local copy is outdated when the current release is newer
779 | 
780 | 
781 | def download_with_progress_bar(url, output_folder):
782 |     # Streaming, so we can iterate over the response.
783 |     response = requests.get(url, stream=True)
784 |     total_size_in_bytes = int(response.headers.get('content-length', 0))
785 |     block_size = 102400  # 100 Kibibytes
786 |     progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True, desc=f'Downloading {url.split("/")[-1]}')
787 |     with open(f'{output_folder}/{url.split("/")[-1]}', 'wb') as file:
788 |         for data in response.iter_content(block_size):
789 |             progress_bar.update(len(data))
790 |             file.write(data)
791 |     progress_bar.close()
792 | 
793 | 
794 | def download_uniprot(output_folder, mirror='expasy'):
795 |     base_urls = {
796 |         'expasy': 'https://ftp.expasy.org',
797 |         'uniprot': 'https://ftp.uniprot.org/pub',
798 |         'ebi': 'https://ftp.ebi.ac.uk/pub'
799 |     }
800 |     for file in ["uniprot_sprot.fasta.gz", "uniprot_trembl.fasta.gz", "reldate.txt"]:
801 |         print(f'Downloading and writing: {file}')
802 |         download_with_progress_bar(
803 |             f'{base_urls[mirror]}/databases/uniprot/current_release/knowledgebase/complete/{file}', output_folder)
804 |     run_pipe_command(f'zcat {output_folder}/uniprot_trembl.fasta.gz {output_folder}/uniprot_sprot.fasta.gz > '
805 |                      f'{output_folder}/uniprot.fasta')
806 |     for file in [f'{output_folder}/uniprot_trembl.fasta.gz', f'{output_folder}/uniprot_sprot.fasta.gz']:
807 |         os.remove(file)
808 | 
809 | 
810 | def build_reference_database(database, output_folder, taxids=None, max_tries=3, mirror='expasy'):
811 |     if database == 'uniprot':
812 |         download_uniprot(output_folder, mirror=mirror)
813 |     elif database == 'swissprot':
814 |         download_with_progress_bar(
815 |             "https://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/complete/uniprot_sprot.fasta.gz",
816 |             output_folder)
817 |         run_command(f'gunzip {output_folder}/uniprot_sprot.fasta.gz')
818 |     elif database == 'taxids':
819 |         for taxid in tqdm(taxids, desc=f'Retrieving reference proteomes for {len(taxids)} taxa from UniProt.'):
820 |             with open(f'{output_folder}/taxids_database.fasta', 'a') as f:
821 |                 f.write(get_proteome_for_taxid(taxid, max_tries=max_tries))
822 | 
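# Hedged usage sketch for the 'taxids' database build above (83333 is the taxid of
# Escherichia coli K-12, used purely for illustration):
#     >>> fasta = get_proteome_for_taxid(83333)   # doctest: +SKIP
#     >>> fasta.startswith('>')   # a successful download starts with a FASTA header
#     True
# UniProt's 'stream' endpoint returns the whole result set in a single response,
# which is why it is preferred here over the paginated 'search' endpoint.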
823 | 
824 | def must_build_database(database, resources_folder):
825 |     db2suffix = {'uniprot': 'uniprot.fasta', 'swissprot': 'uniprot_sprot.fasta', 'taxids': 'taxids_database.fasta'}
826 |     if database in db2suffix.keys():
827 |         if os.path.isfile(f'{resources_folder}/{db2suffix[database]}'):
828 |             return str2bool(input(f'{resources_folder}/{db2suffix[database]} exists. Overwrite? [Y/N] '))
829 |     return True
830 | 
831 | 
832 | def get_tabular_taxonomy(output):
833 |     res = requests.get('https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/taxonomy.rdf.xz')
834 |     with open('taxonomy.rdf.xz', 'wb') as f:
835 |         f.write(res.content)
836 |     run_command('unxz taxonomy.rdf.xz')
837 |     print('Reading RDF taxonomy')
838 |     root = ET.parse('taxonomy.rdf').getroot()
839 |     elems = root.findall('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description')
840 |     with open(output, 'w') as f:
841 |         f.write('\t'.join(['taxid', 'name', 'rank', 'parent_taxid']) + '\n')
842 |         for elem in tqdm(elems, desc='Converting XML taxonomy.rdf to TSV format'):
843 |             info = [elem.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about').split('/')[-1]]
844 |             scientific_name = elem.find('{http://purl.uniprot.org/core/}scientificName')
845 |             info.append(scientific_name.text if scientific_name is not None else '')
846 |             rank_elem = elem.find('{http://purl.uniprot.org/core/}rank')
847 |             info.append(rank_elem.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource').split('/')[-1]
848 |                         if rank_elem is not None else '')
849 |             upper_taxon = elem.find('{http://www.w3.org/2000/01/rdf-schema#}subClassOf')
850 |             info.append(upper_taxon.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource').split('/')[-1]
851 |                         if upper_taxon is not None else '')
852 |             f.write('\t'.join(info) + '\n')
853 | 
854 | 
855 | def get_match_id(record, ids):
856 |     if record.entry_name in ids:
857 |         return record.entry_name
858 |     if record.accessions[0] in ids:
859 |         return record.accessions[0]
860 |     return None
861 | 
862 | 
863 | def count_on_file(expression, file, compressed=False):
864 |     return int(check_output(f"{'zgrep' if compressed else 'grep'} -c '{expression}' {file}", shell=True))
865 | 
866 | 
867 | def get_local_swissprot_data(sp_dat_filename, ids):
868 |     sp_dat = SP.parse(open(sp_dat_filename))
869 |     result, ids_found = [], []
870 |     i = 1
871 |     record = next(sp_dat)
872 |     number_of_entries = count_on_file('Reviewed;', sp_dat_filename)
873 |     while record is not None and len(ids_found) < len(ids):  # stop early once every queried ID has been matched
874 |         match_id = get_match_id(record, ids)
875 |         if match_id is not None:
876 |             result.append(record.__dict__)
877 |             ids_found.append(match_id)
878 |         if i % 100000 == 0:
879 |             print(f'[{i}/{number_of_entries}] SwissProt entries queried')
880 |         record = next(sp_dat, None)
881 |         i += 1
882 |     print(f'[{i}/{number_of_entries}] SwissProt entries queried')
883 |     return pd.DataFrame(result), ids_found
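# Hedged sketch: get_local_swissprot_data streams uniprot_sprot.dat with Biopython's
# SwissProt parser and keeps records whose entry name or primary accession is in
# 'ids'; count_on_file supplies the progress total by shelling out to grep -c.
#     >>> sp_data, found = get_local_swissprot_data('uniprot_sprot.dat', ['P12345'])   # doctest: +SKIP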
"Craniata") 896 | for i in range(len(match)): 897 | rank, taxid = match.iloc[i][["rank", "taxid"]] 898 | if type(rank) == str: 899 | l2c_result[f'Taxonomic lineage ({rank.upper()})'] = taxon 900 | l2c_taxids[f'Taxonomic identifier ({rank.upper()})'] = taxid 901 | l2c_result['Taxonomic lineage (ALL)'] = ', '.join(set(l2c_result.values())) 902 | l2c_taxids['Taxonomic identifier (ALL)'] = ', '.join(set(l2c_taxids.values())) 903 | l2c_result = {**l2c_result, **l2c_taxids, 'index': lineage} 904 | return l2c_result 905 | 906 | 907 | def lineages_to_columns(lineages, tax_tsv): 908 | """ 909 | Does the same as lineage_to_columns, but to all lineages, instead of a single one 910 | :param lineages: 911 | :param tax_tsv: 912 | :return: 913 | """ 914 | return [lineage_to_columns(lineage, tax_tsv) for lineage in lineages] 915 | 916 | 917 | def get_upper_taxids(taxid, tax_df): 918 | """ 919 | :param taxid: str - taxID to get upper taxIDs from 920 | :param tax_df: pd.DataFrame - of read taxonomy.tsv (from taxonomy.rdf) 921 | :returns list - of upper taxIDs 922 | """ 923 | if taxid == '0': 924 | return [] 925 | taxids = [] 926 | while taxid != '1' and taxid != 'Taxon': 927 | taxids.append(taxid) 928 | taxid = tax_df.loc[taxid]['parent_taxid'] 929 | return taxids 930 | 931 | 932 | def parse_taxonomy(data, tax_tsv_df, threads=15): 933 | tax_tsv_df.set_index('name', inplace=True) 934 | tax_tsv_df['taxid'] = tax_tsv_df['taxid'].astype(str) 935 | all_classifications = split(data['organism_classification'].drop_duplicates().tolist(), threads) 936 | with Manager() as m: 937 | with m.Pool() as p: 938 | result = p.starmap(lineages_to_columns, [(classifications, tax_tsv_df) 939 | for classifications in all_classifications]) 940 | decompacted = [] 941 | for res in result: 942 | decompacted += res 943 | return pd.DataFrame(decompacted).set_index('index') 944 | 945 | 946 | def parse_comments(sp_data): 947 | result = [] 948 | bpc_list = [] 949 | for comments in sp_data['comments']: 950 | partial = {key: '' for key in [ 951 | 'FUNCTION', 'SUBUNIT', 'INTERACTION', 'SUBCELLULAR LOCATION', 'ALTERNATIVE PRODUCTS', 'TISSUE SPECIFICITY', 952 | 'PTM', 'POLYMORPHISM', 'DISEASE', 'MISCELLANEOUS', 'SIMILARITY', 'CAUTION', 'SEQUENCE CAUTION', 953 | 'WEB RESOURCE', 'MASS SPECTROMETRY', 'RNA EDITING', 'CATALYTIC ACTIVITY', 'COFACTOR', 'ACTIVITY REGULATION', 954 | 'PATHWAY', 'DEVELOPMENTAL STAGE', 'INDUCTION', 'ALLERGEN', 'BIOTECHNOLOGY', 'DISRUPTION PHENOTYPE', 955 | 'PHARMACEUTICAL', 'TOXIC DOSE', 'DOMAIN']} 956 | bpc_dict = {'Kinetic parameters': []} 957 | for comment in comments: 958 | comment = comment.split(': ') 959 | if comment[0] in partial.keys(): 960 | partial[comment[0]] += f'{": ".join(comment)} ' 961 | else: 962 | if comment[0] in ['BIOPHYSICOCHEMICAL PROPERTIES']: 963 | if comment[1] not in bpc_dict.keys(): 964 | bpc_dict[comment[1]] = [f'{comment[0]}: {comment[1]}: {comment[2]}'] 965 | else: 966 | bpc_dict[comment[1]].append(f'{comment[0]}: {comment[1]}: {comment[2]}') 967 | else: 968 | print(f'Comment still not implemented: [{comment[0]}]') 969 | result.append(partial) 970 | bpc_dict['Kinetics'] = bpc_dict.pop('Kinetic parameters') 971 | bpc_list.append(bpc_dict) 972 | result = pd.DataFrame(result, columns=[ 973 | 'Function [CC]', 'Subunit structure [CC]', 'Interacts with', 'Subcellular location [CC]', 974 | 'Alternative products (isoforms)', 'Tissue specificity', 'Post-translational modification', 'Polymorphism', 975 | 'Involvement in disease', 'Miscellaneous [CC]', 'Sequence similarities', 'Caution', 'Sequence 
944 | 
945 | 
946 | def parse_comments(sp_data):
947 |     result = []
948 |     bpc_list = []
949 |     for comments in sp_data['comments']:
950 |         partial = {key: '' for key in [
951 |             'FUNCTION', 'SUBUNIT', 'INTERACTION', 'SUBCELLULAR LOCATION', 'ALTERNATIVE PRODUCTS', 'TISSUE SPECIFICITY',
952 |             'PTM', 'POLYMORPHISM', 'DISEASE', 'MISCELLANEOUS', 'SIMILARITY', 'CAUTION', 'SEQUENCE CAUTION',
953 |             'WEB RESOURCE', 'MASS SPECTROMETRY', 'RNA EDITING', 'CATALYTIC ACTIVITY', 'COFACTOR', 'ACTIVITY REGULATION',
954 |             'PATHWAY', 'DEVELOPMENTAL STAGE', 'INDUCTION', 'ALLERGEN', 'BIOTECHNOLOGY', 'DISRUPTION PHENOTYPE',
955 |             'PHARMACEUTICAL', 'TOXIC DOSE', 'DOMAIN']}
956 |         bpc_dict = {'Kinetic parameters': []}
957 |         for comment in comments:
958 |             comment = comment.split(': ')
959 |             if comment[0] in partial.keys():
960 |                 partial[comment[0]] += f'{": ".join(comment)} '
961 |             else:
962 |                 if comment[0] in ['BIOPHYSICOCHEMICAL PROPERTIES']:
963 |                     if comment[1] not in bpc_dict.keys():
964 |                         bpc_dict[comment[1]] = [f'{comment[0]}: {comment[1]}: {comment[2]}']
965 |                     else:
966 |                         bpc_dict[comment[1]].append(f'{comment[0]}: {comment[1]}: {comment[2]}')
967 |                 else:
968 |                     print(f'Comment still not implemented: [{comment[0]}]')
969 |         result.append(partial)
970 |         bpc_dict['Kinetics'] = bpc_dict.pop('Kinetic parameters')
971 |         bpc_list.append(bpc_dict)
972 |     result = pd.DataFrame(result)
973 |     result.columns = [  # rename from comment keys to old UniProt column names; order matches "partial" above
974 |         'Function [CC]', 'Subunit structure [CC]', 'Interacts with', 'Subcellular location [CC]',
975 |         'Alternative products (isoforms)', 'Tissue specificity', 'Post-translational modification', 'Polymorphism',
976 |         'Involvement in disease', 'Miscellaneous [CC]', 'Sequence similarities', 'Caution', 'Sequence caution',
977 |         'Web resources', 'Mass spectrometry', 'RNA editing', 'Catalytic activity', 'Cofactor', 'Activity regulation',
978 |         'Pathway', 'Developmental stage', 'Induction', 'Allergenic properties', 'Biotechnological use',
979 |         'Disruption phenotype', 'Pharmaceutical use', 'Toxic dose', 'Domain [CC]']
980 |     result['Erroneous gene model prediction'] = result['Sequence caution']
981 |     bpc_df = pd.DataFrame(bpc_list)
982 |     for col in bpc_df:
983 |         bpc_df[col] = bpc_df[col].apply(lambda x: '; '.join(x) if isinstance(x, list) else x)
984 |     return pd.concat([result, bpc_df], axis=1)
985 | 
986 | def add_to_dict(dictionary, key, value):
987 |     if key in dictionary.keys():
988 |         dictionary[key] += value
989 |     else:
990 |         dictionary[key] = value
991 | 
992 | 
993 | def cross_references_to_columns(cross_refs):
994 |     result = {}
995 |     go_dict = {}
996 |     go_rel = {'C': 'cellular component', 'F': 'molecular function', 'P': 'biological process'}
997 |     for ref in cross_refs:
998 |         if ref[0] == 'GO':
999 |             refie = ref[2].split(':')
1000 |             add_to_dict(go_dict, f'Gene ontology ({go_rel[refie[0]]})', f'{refie[1]} [{ref[1]}]; ')
1001 |             add_to_dict(go_dict, 'Gene ontology (GO)', f'{refie[1]} [{ref[1]}]; ')
1002 |             add_to_dict(go_dict, 'Gene ontology IDs', f'{ref[1]}; ')
1003 |         else:
1004 |             if ref[0] == 'Proteomes':
1005 |                 value = f'{ref[1]}: {ref[2]}'
1006 |             else:
1007 |                 value = f'{ref[1]};'
1008 |             add_to_dict(result, ref[0], value)
1009 |     return result, go_dict
1010 | 
1011 | 
1012 | def parse_cross_references(sp_data):
1013 |     ref_result = [cross_references_to_columns(cross_refs) for cross_refs in sp_data['cross_references']]
1014 |     ref_dict, go_dict = zip(*ref_result)
1015 |     ref_df = pd.DataFrame(ref_dict)
1016 |     ref_df.columns = map(lambda x: f'Cross-reference ({x})' if x != 'Proteomes' else x, ref_df.columns)
1017 |     go_df = pd.DataFrame(go_dict)
1018 |     ref_df = pd.concat([ref_df, go_df], axis=1)
1019 |     return ref_df
1020 | 
1021 | 
1022 | def gene_name_to_columns(genes):
1023 |     if genes == '':
1024 |         info = {}
1025 |     else:
1026 |         info = [pair.split('=') for pair in genes.rstrip(';').split('; ')]
1027 |         info = {pair[0]: pair[1] for pair in info}
1028 |     return {'Gene names': ' '.join(info.values()) if info != {} else '',
1029 |             'Gene names (ordered locus )': info['OrderedLocusNames'] if 'OrderedLocusNames' in info else '',
1030 |             'Gene names (ORF )': info['ORFNames'] if 'ORFNames' in info else '',
1031 |             'Gene names (primary )': info['Name'] if 'Name' in info else '',
1032 |             'Gene names (synonym )': info['Synonyms'] if 'Synonyms' in info else ''}
1033 | 
1034 | 
1035 | def parse_gene_names(sp_data):
1036 |     return pd.DataFrame([gene_name_to_columns(genes) for genes in sp_data['gene_name']])
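# Hedged example of gene_name_to_columns on a typical SwissProt GN line value
# (string invented for illustration):
#     >>> gene_name_to_columns('Name=thrB; Synonyms=thrB2; OrderedLocusNames=b0003;')['Gene names (primary )']
#     'thrB'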
1037 | 
1038 | 
1039 | def parse_description_text(description):
1040 |     result = {}
1041 |     parts = description[:-1].split('; ')
1042 |     i = 0
1043 |     while i < len(parts):
1044 |         parted = parts[i].split('=')
1045 |         if parted[0].startswith('RecName: Full'):
1046 |             result['RecName'] = {}
1047 |             result['RecName']['Full'] = parted[1]
1048 |             i += 1
1049 |             while i < len(parts) and ':' not in parts[i].split()[0]:
1050 |                 parted = parts[i].split('=')
1051 |                 result['RecName'][parted[0]] = parted[1]
1052 |                 i += 1
1053 |         elif parted[0].startswith('AltName: Full'):
1054 |             if 'AltName' not in result.keys():
1055 |                 result['AltName'] = []
1056 |             altname = {'Full': parted[1]}
1057 |             i += 1
1058 |             while i < len(parts) and ':' not in parts[i].split()[0]:
1059 |                 parted = parts[i].split('=')
1060 |                 altname[parted[0]] = parted[1]
1061 |                 i += 1
1062 |             result['AltName'].append(altname)
1063 |         elif parted[0].startswith('Contains: RecName'):
1064 |             if 'Contains' not in result.keys():
1065 |                 result['Contains'] = []
1066 |             contains = {'RecName': {'Full': parted[1]}}
1067 |             i += 1
1068 |             while i < len(parts) and ':' not in parts[i].split()[0]:
1069 |                 parted = parts[i].split('=')
1070 |                 contains['RecName'][parted[0]] = parted[1]
1071 |                 i += 1
1072 |             result['Contains'].append(contains)
1073 |         elif parts[i].startswith('Flags'):
1074 |             parted = parts[i].split(': ')
1075 |             if 'Flags' in result.keys():
1076 |                 result['Flags'].append(parted[1])
1077 |             else:
1078 |                 result['Flags'] = [parted[1]]
1079 |             i += 1
1080 |         else:
1081 |             # a description part UPIMAPI cannot yet handle; skipped silently
1082 |             # (add a print here when debugging new description formats)
1083 |             i += 1
1084 |     return result
1085 | 
1086 | 
1087 | def fix_term(term):
1088 |     return term if '{ECO:' not in term else ' '.join(term.split()[:-1])
1089 | 
1090 | 
1091 | def parse_descriptions(sp_data):
1092 |     desc_data_df = sp_data['description'].apply(parse_description_text)
1093 |     description_df = pd.DataFrame()
1094 |     description_df['Protein names'] = desc_data_df.apply(
1095 |         lambda x: '{}{}{}{}{}{}'.format(
1096 |             fix_term(x['RecName']['Full']), f" ({fix_term(x['RecName']['Short'])})" if 'Short' in x['RecName'].keys()
1097 |             else "", f" (EC {fix_term(x['RecName']['EC'])})" if 'EC' in x['RecName'].keys() else "",
1098 |             ' ' + ' '.join(' '.join([f"({fix_term(value)})" for value in altname.values()]) for altname in x['AltName'])
1099 |             if 'AltName' in x.keys() else "",
1100 |             f" [Cleaved into: {'; '.join([fix_term(v['RecName']['Full']) for v in x['Contains']])}]"
1101 |             if 'Contains' in x.keys() else "",
1102 |             ' '.join([f" ({flag})" for flag in x['Flags']]) if 'Flags' in x.keys() else ""))
1103 |     description_df['EC number'] = desc_data_df.apply(
1104 |         lambda x: x['RecName']['EC'].split()[0] if not isinstance(x, float) and 'RecName' in x.keys()
1105 |         and 'EC' in x['RecName'].keys() else np.nan)
1106 |     return description_df
1107 | 
1108 | 
1109 | def parse_feature(feature, position, qualifiers=True, ide=True):
1110 |     """
1111 |     :param feature: the feature object itself (exposes .type, .qualifiers and .id)
1112 |     :param position: str - position information
1113 |     :param qualifiers: bool - add qualifiers information?
1114 |     :param ide: bool - add id information?
1115 |     :return: str - the term to add
1116 |     """
1117 |     result = f'{feature.type} {position}'
1118 |     if qualifiers:
1119 |         result += ' ' + " ".join([f'/{key}="{value}";' for key, value in feature.qualifiers.items()])
1120 |     if ide:
1121 |         result += f' /id="{feature.id}";'
1122 |     return result
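# Hedged example of parse_description_text on a DE block flattened the way
# Biopython presents it (string invented for illustration):
#     >>> parse_description_text('RecName: Full=Aspartokinase; EC=2.7.2.4; Flags: Precursor;')
#     {'RecName': {'Full': 'Aspartokinase', 'EC': '2.7.2.4'}, 'Flags': ['Precursor']}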
1123 | 
1124 | 
1125 | def parse_features(sp_data):
1126 |     feats_list = []
1127 |     pos_funcs = {
1128 |         'all': lambda x: f'{"?" if x.location.start.position is None else x.location.start.position + 1}..'
1129 |                          f'{x.location.end.position}; ',
1130 |         'end': lambda x: f'{x.location.end.position}; '}
1131 |     prefix2info = {
1132 |         'VAR_SEQ': ('Alternative sequence', 'all', True, True),
1133 |         'VARIANT': ('Natural variant', 'end', True, True),
1134 |         'NON_CONS': ('Non-adjacent residues', 'all', True, False),
1135 |         'NON_STD': ('Non-standard residue', 'end', True, False),
1136 |         'NON_TER': ('Non-terminal residue', 'end', False, False),
1137 |         'CONFLICT': ('Sequence conflict', 'all', True, False),
1138 |         'UNSURE': ('Sequence uncertainty', 'end', True, False),
1139 |         'ACT_SITE': ('Active site', 'end', True, False),
1140 |         'BINDING': ('Binding site', 'end', True, False),
1141 |         'DNA_BIND': ('DNA binding', 'all', True, False),
1142 |         'METAL': ('Metal binding', 'end', True, False),
1143 |         'NP_BIND': ('Nucleotide binding', 'all', True, False),
1144 |         'SITE': ('Site', 'end', True, False),
1145 |         'INTRAMEM': ('Intramembrane', 'all', True, False),
1146 |         'TOPO_DOM': ('Topological domain', 'all', True, False),
1147 |         'TRANSMEM': ('Transmembrane', 'all', True, False),
1148 |         'CHAIN': ('Chain', 'all', True, True),
1149 |         'CROSSLNK': ('Cross-link', 'all', True, False),
1150 |         'DISULFID': ('Disulfide bond', 'all', True, False),
1151 |         'CARBOHYD': ('Glycosylation', 'end', True, False),
1152 |         'INIT_MET': ('Initiator methionine', 'end', True, False),
1153 |         'LIPID': ('Lipidation', 'end', True, False),
1154 |         'MOD_RES': ('Modified residue', 'end', True, False),
1155 |         'PEPTIDE': ('Peptide', 'all', True, True),
1156 |         'PROPEP': ('Propeptide', 'all', False, True),
1157 |         'SIGNAL': ('Signal peptide', 'all', True, False),
1158 |         'TRANSIT': ('Transit peptide', 'all', True, False),
1159 |         'STRAND': ('Beta strand', 'all', True, False),
1160 |         'HELIX': ('Helix', 'all', True, False),
1161 |         'TURN': ('Turn', 'all', True, False),
1162 |         'COILED': ('Coiled coil', 'all', True, False),
1163 |         'COMPBIAS': ('Compositional bias', 'all', True, False),
1164 |         'DOMAIN': ('Domain [FT]', 'all', True, False),
1165 |         'MOTIF': ('Motif', 'all', True, False),
1166 |         'REGION': ('Region', 'all', True, False),
1167 |         'REPEAT': ('Repeat', 'all', True, False),
1168 |         'ZN_FING': ('Zinc finger', 'all', True, False),
1169 |         'MUTAGEN': ('Mutagenesis', 'end', True, False),
1170 |         'CA_BIND': ('Calcium binding', 'all', True, False)}
1171 |     for features in sp_data['features']:
1172 |         count_features = {}  # reset per record, so the 'Features' counts are per entry instead of cumulative
1173 |         feats_dict = {}
1174 |         for feature in features:
1175 |             if feature.type in prefix2info.keys():
1176 |                 parameters = prefix2info[feature.type]
1177 |                 if parameters[0] not in feats_dict.keys():
1178 |                     feats_dict[parameters[0]] = parse_feature(
1179 |                         feature, pos_funcs[parameters[1]](feature), qualifiers=parameters[2], ide=parameters[3])
1180 |                     count_features[parameters[0]] = 1
1181 |                 else:
1182 |                     feats_dict[parameters[0]] += ' ' + parse_feature(
1183 |                         feature, pos_funcs[parameters[1]](feature), qualifiers=parameters[2], ide=parameters[3])
1184 |                     count_features[parameters[0]] += 1
1185 |             else:
1186 |                 print(f'A feature UPIMAPI cannot yet handle! [{feature.type}]')
1187 |         feats_dict['Features'] = '; '.join([f'{feat_type} ({count})' for feat_type, count in count_features.items()])
1188 |         feats_list.append(feats_dict)
1189 |     return pd.DataFrame(feats_list)
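# Hedged illustration of the per-entry 'Features' summary built above: an entry with
# one signal peptide and two transmembrane regions would yield something like
#     'Signal peptide (1); Transmembrane (2)'
# alongside per-type columns holding the formatted feature strings themselves.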
1190 | 
1191 | 
1192 | def parse_host_taxonomy_id(sp_data, tax_tsv):
1193 |     tax_tsv = tax_tsv.reset_index().set_index('taxid')
1194 |     return sp_data['host_taxonomy_id'].apply(
1195 |         lambda x: '; '.join([f'{tax_tsv.loc[tid, "name"]} [TaxID: {tid}]' for tid in x]) if len(x) > 0 else np.nan)
1196 | 
1197 | 
1198 | def parse_sp_data(sp_data, tax_tsv, threads=15):
1199 |     """
1200 |     Parses data from local ID mapping through DAT file
1201 |     :param sp_data: pandas.DataFrame - records read from the SwissProt DAT file
1202 |     :param tax_tsv: str - filename of taxonomy in TSV format
1203 |     :param threads: int - number of processes to use when parsing taxonomies
1204 |     :return: pandas.DataFrame - organized in same columns as data from UniProt's API
1205 |     """
1206 |     if len(sp_data) == 0:
1207 |         return pd.DataFrame()
1208 |     tax_tsv_df = pd.read_csv(tax_tsv, sep='\t', dtype={'taxid': str, 'name': str, 'rank': str, 'parent_taxid': str})
1209 |     tax_tsv_df = tax_tsv_df[tax_tsv_df.name.notnull()]
1210 |     result = pd.DataFrame()
1211 |     result['Entry'] = sp_data['accessions'].apply(lambda x: x[0])
1212 |     local2api = {
1213 |         'entry_name': 'Entry Name',
1214 |         'data_class': 'Status',
1215 |         'sequence_length': 'Length',
1216 |         'sequence': 'Sequence'
1217 |     }
1218 |     for k, v in local2api.items():
1219 |         if v not in [None, False]:  # defensive guard; all current mappings are real column names
1220 |             result[v] = sp_data[k]
1221 |     result['Organism ID'] = result['Taxonomic identifier (SPECIES)'] = \
1222 |         sp_data['taxonomy_id'].apply(lambda x: x[0] if len(x) > 0 else x)
1223 |     result['Virus hosts'] = sp_data['host_organism'].apply(lambda x: x[0] if len(x) > 0 else x)
1224 |     result['Keywords'] = sp_data['keywords'].apply(';'.join)
1225 |     result['Organism'] = sp_data['organism'].str.rstrip('.')
1226 |     result['Taxonomic lineage (SPECIES)'] = result['Organism'].apply(lambda x: ' '.join(x.split()[:2]))
1227 |     timed_message('Parsing taxonomy (this may take a while)')
1228 |     tax_df = parse_taxonomy(sp_data, tax_tsv_df, threads=threads).reset_index()
1229 |     rel_df = sp_data['organism_classification'].apply(','.join)
1230 |     tax_df['index'] = tax_df['index'].apply(','.join)
1231 |     rel_df = pd.merge(rel_df, tax_df, left_on='organism_classification', right_on='index', how='left')
1232 |     del rel_df['organism_classification']
1233 |     del rel_df['index']
1234 |     result['Virus hosts'] = parse_host_taxonomy_id(sp_data, tax_tsv_df)  # overrides the raw host names set above
1235 |     result = pd.concat([result, rel_df], axis=1)
1236 |     timed_message('Parsing genes')
1237 |     result = pd.concat([result, parse_gene_names(sp_data)], axis=1)
1238 |     timed_message('Parsing cross-references')
1239 |     result = pd.concat([result, parse_cross_references(sp_data)], axis=1)
1240 |     timed_message('Parsing comments')
1241 |     result = pd.concat([result, parse_comments(sp_data)], axis=1)
1242 |     timed_message('Parsing features')
1243 |     result = pd.concat([result, parse_features(sp_data)], axis=1)
1244 |     result = pd.concat([result, parse_descriptions(sp_data)], axis=1)
1245 |     result['Gene encoded by'] = sp_data['organelle'].str.rstrip('.')
1246 |     result['Mass'] = sp_data['seqinfo'].apply(lambda x: x[1])
1247 |     result['Date of creation'] = sp_data['created'].apply(
1248 |         lambda x: datetime.strptime(x[0], '%d-%b-%Y').strftime('%Y-%m-%d'))
1249 |     result['Date of last modification'] = sp_data['annotation_update'].apply(
1250 |         lambda x: datetime.strptime(x[0], '%d-%b-%Y').strftime('%Y-%m-%d'))
1251 |     result['Version (entry)'] = sp_data['annotation_update'].apply(lambda x: x[1])
1252 |     result['Date of last sequence modification'] = sp_data['sequence_update'].apply(
1253 |         lambda x: datetime.strptime(x[0], '%d-%b-%Y').strftime('%Y-%m-%d'))
1254 |     result['Version (sequence)'] = sp_data['sequence_update'].apply(lambda x: x[1])
1255 |     result['PubMed ID'] = sp_data['references'].apply(
1256 |         lambda x: '; '.join([ref.references[0][1] for ref in x if len(ref.references) > 0]))
1257 |     return result
1258 | 
1259 | 
1260 | def get_sprot_dat(sp_dat):
1261 |     run_command(
1262 |         f'wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/'
1263 |         f'uniprot_sprot.dat.gz -O {sp_dat}.gz')
1264 |     run_command(f'gunzip {sp_dat}.gz')
1265 | 
1266 | 
1267 | def local_id_mapping(ids, sp_dat, tax_tsv, output, columns=None, databases=None, threads=15):
1268 |     ids_done, ids_missing, result = check_ids_already_done(output, ids)
1269 |     if len(ids_missing) == 0:
1270 |         return set()
1271 |     if not os.path.isfile(sp_dat):
1272 |         timed_message(f'Creating {sp_dat}')
1273 |         get_sprot_dat(sp_dat)
1274 |     if not os.path.isfile(tax_tsv):
1275 |         timed_message(f'Creating {tax_tsv}')
1276 |         get_tabular_taxonomy(tax_tsv)
1277 |     timed_message('Searching for IDs in SwissProt DAT')
1278 |     sp_data, ids_found = get_local_swissprot_data(sp_dat, ids_missing)
1279 |     timed_message('Parsing SwissProt results')
1280 |     sp_parsed = parse_sp_data(sp_data, tax_tsv, threads=threads)
1281 |     result = pd.concat([result, sp_parsed])
1282 |     columns = [col for col in (columns or []) if col in result.columns.tolist()]
1283 |     databases = [db for db in (databases or []) if db in result.columns.tolist()]
1284 |     result[columns + databases].to_csv(output, sep='\t', index=False)
1285 |     return ids_found
1286 | 
1287 | 
1288 | def get_input_type(args_input, blast=True):
1289 |     if args_input is None:
1290 |         return input('IDs to perform mapping on (comma separated values): '), 'stdin'
1291 |     if blast:
1292 |         return args_input, 'blast'
1293 |     if check_output(f"head -c 1 {args_input}", shell=True).decode('utf8') == '>':
1294 |         return args_input, 'fasta'
1295 |     return args_input, 'txt'
1296 | 
1297 | 
1298 | def check_no_annotation(args_input, no_annotation):
1299 |     if args_input is None:
1300 |         is_fasta = False
1301 |     else:
1302 |         is_fasta = check_output(f"head -c 1 {args_input}", shell=True).decode('utf8') == '>'
1303 |     if not is_fasta:
1304 |         no_annotation = True
1305 |     return no_annotation
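# Hedged sketch of the input-type detection above (paths illustrative):
#     get_input_type(None)                          -> (<IDs typed by the user>, 'stdin')
#     get_input_type('aligned.blast', blast=True)   -> ('aligned.blast', 'blast')
#     get_input_type('proteome.fasta', blast=False) -> ('proteome.fasta', 'fasta')   # first byte is '>'
#     get_input_type('ids.txt', blast=False)        -> ('ids.txt', 'txt')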
1306 | 
1307 | 
1308 | def blast_consensus(alignment_file):
1309 |     blast = parse_blast(alignment_file)
1310 |     query_to_ref, ref_to_query, res = {}, {}, {}
1311 |     with open(alignment_file) as file:
1312 |         line = file.readline()
1313 |         while line:
1314 |             line = line.strip('\n').split('\t')
1315 |             query_seq, ref_seq, evalue = line[0], line[1], float(line[-2])
1316 |             if query_seq not in query_to_ref:
1317 |                 if ref_seq not in ref_to_query:
1318 |                     query_to_ref[query_seq] = {'ref_seq': ref_seq, 'evalue': evalue}
1319 |                     ref_to_query[ref_seq] = {'query_seq': query_seq, 'evalue': evalue}
1320 |                 else:
1321 |                     if ref_to_query[ref_seq]['evalue'] > evalue:
1322 |                         ref_to_query[ref_seq] = {'query_seq': query_seq, 'evalue': evalue}
1323 |                         query_to_ref[query_seq] = {'ref_seq': ref_seq, 'evalue': evalue}
1324 |             else:
1325 |                 if ref_seq not in ref_to_query:
1326 |                     if query_to_ref[query_seq]['evalue'] > evalue:
1327 |                         ref_to_query[ref_seq] = {'query_seq': query_seq, 'evalue': evalue}
1328 |                         query_to_ref[query_seq] = {'ref_seq': ref_seq, 'evalue': evalue}
1329 |                 else:
1330 |                     if ref_to_query[ref_seq]['evalue'] > evalue:
1331 |                         if query_to_ref[query_seq]['evalue'] > evalue:
1332 |                             ref_to_query[ref_seq] = {'query_seq': query_seq, 'evalue': evalue}
1333 |                             query_to_ref[query_seq] = {'ref_seq': ref_seq, 'evalue': evalue}
1334 |             line = file.readline()
1335 |     for query_seq in query_to_ref:
1336 |         ref_seq = query_to_ref[query_seq]['ref_seq']
1337 |         if query_seq == ref_to_query[ref_seq]['query_seq']:
1338 |             res[query_seq] = query_to_ref[query_seq]['ref_seq']
1339 |     res = pd.DataFrame.from_dict(res, orient='index').reset_index()
1340 |     res.columns = ['qseqid', 'sseqid']
1341 |     return blast.set_index(['qseqid', 'sseqid']).loc[res.set_index(['qseqid', 'sseqid']).index].reset_index()
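# Hedged summary of blast_consensus: a query-reference pair is kept only when it is
# a reciprocal best hit by e-value (pairs invented for illustration):
#     query1 -> refA (1e-50)  and  refA -> query1 (1e-50)   kept
#     query2 -> refA (1e-10)                                dropped; refA prefers query1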
1342 | 
1343 | 
1344 | def upimapi():
1345 |     args = get_arguments()
1346 |     Path(args.output).mkdir(parents=True, exist_ok=True)
1347 |     Path(args.resources_directory).mkdir(parents=True, exist_ok=True)
1348 |     args.no_annotation = check_no_annotation(args.input, args.no_annotation)
1349 | 
1350 |     # Annotation with DIAMOND
1351 |     if not args.no_annotation:
1352 |         db2file = {'uniprot': f'{args.resources_directory}/uniprot.fasta',
1353 |                    'swissprot': f'{args.resources_directory}/uniprot_sprot.fasta',
1354 |                    'taxids': f'{args.resources_directory}/taxids_database.fasta'}
1355 |         if args.database in db2file.keys():
1356 |             database = db2file[args.database]
1357 |         else:
1358 |             database = args.database
1359 | 
1360 |         if not args.skip_db_check:
1361 |             if must_build_database(args.database, args.resources_directory):
1362 |                 build_reference_database(
1363 |                     args.database, args.resources_directory, taxids=args.taxids, max_tries=args.max_tries,
1364 |                     mirror=args.mirror)
1365 |         if not database.endswith(".dmnd"):
1366 |             diamond_formatted = f"{'.'.join(database.split('.')[:-1])}.dmnd"
1367 |             if not os.path.isfile(diamond_formatted):
1368 |                 make_diamond_database(database, diamond_formatted)
1369 |             database = diamond_formatted
1370 |         (b, c) = block_size_and_index_chunks(
1371 |             argsb=args.block_size, argsc=args.index_chunks, memory=args.max_memory)
1372 |         run_diamond(
1373 |             args.input, f'{args.output}/aligned.blast', f'{args.output}/unaligned.blast', database,
1374 |             threads=args.threads, max_target_seqs=args.max_target_seqs, b=b, c=c, e_value=args.evalue,
1375 |             bit_score=args.bitscore, pident=args.pident, mode=args.diamond_mode)
1376 |         if args.max_target_seqs > 1:
1377 |             blast_consensus(f'{args.output}/aligned.blast').to_csv(
1378 |                 f'{args.output}/consensus.blast', sep='\t', index=False)
1379 |         args.input = f'{args.output}/aligned.blast'
1380 |         args.blast = True
1381 | 
1382 |     if args.skip_id_mapping:
1383 |         exit('Not performing ID mapping as specified.')
1384 | 
1385 |     timed_message('ID mapping has begun.')
1386 |     args_input, input_type = get_input_type(args.input, blast=args.blast)
1387 | 
1388 |     # Get the IDs
1389 |     ids, full_id, sp_ids = get_ids(args_input, input_type=input_type, full_id=args.full_id)
1390 | 
1391 |     if args.output_table:
1392 |         table_output = args.output_table
1393 |         print(f'Table output overridden to {table_output}')
1394 |         Path('/'.join(args.output_table.split('/')[:-1])).mkdir(parents=True, exist_ok=True)
1395 |     else:
1396 |         table_output = f'{args.output}/uniprotinfo.tsv'
1397 | 
1398 |     if args.from_db != 'UniProtKB AC/ID' or args.to_db != 'UniProtKB':
1399 |         basic_idmapping_multiprocess(ids, table_output, args.from_db, args.to_db, threads=args.threads)
1400 |         return
1401 | 
1402 |     if not args.skip_id_checking:
1403 |         # UniProt's API now fails if outdated IDs or entry names are submitted. This function removes those IDs.
1404 |         ids, not_valid = get_valid_entries_multiprocess(ids, threads=args.threads)
1405 |         with open(f'{args.output}/valid_ids.txt', 'w') as f:
1406 |             f.write('\n'.join(ids))
1407 |         with open(f'{args.output}/not_valid_ids.txt', 'w') as f:
1408 |             f.write('\n'.join(not_valid))
1409 | 
1410 |     # Get UniProt information
1411 |     if not args.fasta:
1412 |         # ID mapping through local SwissProt information
1413 |         if args.local_id_mapping:
1414 |             ids = set(ids) - set(local_id_mapping(
1415 |                 sp_ids, f'{args.resources_directory}/uniprot_sprot.dat', f'{args.resources_directory}/taxonomy.tsv',
1416 |                 table_output, columns=args.columns, databases=args.databases, threads=args.threads))
1417 | 
1418 |         # ID mapping through API
1419 |         result = uniprot_information_workflow(
1420 |             ids, table_output, columns=args.columns, step=args.step, max_iter=args.max_tries,
1421 |             sleep_time=args.sleep)
1422 |         result.to_csv(table_output, sep='\t', index=False)
1423 | 
1424 |         if not args.no_annotation:
1425 |             blast = parse_blast(f'{args.output}/aligned.blast')
1426 |             if full_id:
1427 |                 blast.sseqid = [ide.split('|')[1] if ide not in ['*', ''] else ide for ide in blast.sseqid]
1428 |             result = pd.merge(blast, result, left_on='sseqid', right_on='Entry')
1429 |         sort_columns = ['Entry'] if args.no_annotation else ['qseqid', 'evalue']
1430 |         result.sort_values(by=sort_columns, ascending=False).to_csv(
1431 |             f'{args.output}/UPIMAPI_results.tsv', index=False, sep='\t')
1432 |     else:
1433 |         uniprot_fasta_workflow(
1434 |             ids, f'{args.output}/uniprotinfo.fasta', step=args.step, sleep_time=args.sleep)
1435 | 
1436 | 
1437 | if __name__ == '__main__':
1438 |     start_time = time()
1439 |     upimapi()
1440 |     timed_message(f'UPIMAPI analysis finished in {human_time(time() - start_time)}')
1441 | 
--------------------------------------------------------------------------------