├── .gitignore ├── LICENSE ├── README.md ├── TODO.md ├── gpcrmining ├── __init__.py └── gpcrdb │ ├── __init__.py │ ├── __main__.py │ ├── sequence.py │ └── structure.py ├── setup.py └── tests ├── test_sequence_arrestin.py ├── test_sequence_gpcr.py └── test_sequence_gprotein.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Manual exclusions 2 | data* 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Dror Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPCR-mining 2 | Functions to scrape data about G protein-coupled receptors (GPCRs) from the web. 3 | 4 | The [__GPCRdb__](https://gpcrdb.org) provides a comprehensive overview for sequence information about a GPCR, including definitions of transmembrane helices and generic residue numbering. 5 | Looking up a large number of residues or including the conversion for a specific receptor into an automated workflow can become tedious. Here we provide code to download and display this data. 6 | 7 | 8 | ## Installation 9 | 10 | You can install the latest release of the python package via pip 11 | 12 | pip install gpcrmining 13 | 14 | or an editable installation from this repository 15 | 16 | git clone https://github.com/drorlab/GPCR-mining 17 | cd GPCR-mining 18 | pip install -e . 19 | 20 | 21 | ## Run within Python code 22 | 23 | To include the functions in your Python workflow, import the library via 24 | 25 | import gpcrmining.gpcrdb as db 26 | 27 | ### Sequence information 28 | 29 | You can download all information into a list of residues 30 | 31 | gpcr_name = 'acm2_human' 32 | res_info = db.get_residue_info(gpcr_name) 33 | 34 | ... convert sequential numbers to the generic GPCRdb numbers 35 | 36 | db_num = db.sequential_to_gpcrdb('acm2_human', [393, 194, 151, 154, 190, 68, 108]) 37 | print(db_num) 38 | 39 | ...
or the other way round 40 | 41 | seq_num = db.gpcrdb_to_sequential('acm2_human', ['6.41x41', '5.46x461', '4.56x56', '5.42x43']) 42 | print(seq_num) 43 | 44 | These conversion functions also work with the generic numbering schemes for signalling proteins (arrestins and G proteins). 45 | 46 | ### Structures 47 | 48 | You can obtain information about any GPCR structure from the PDB such as the entry names of receptor and signaling proteins, chain names, or details on the experimental method. Run the following to obtain a dictionary with the corresponding information: 49 | 50 | info = db.get_structure_info('6u1m') 51 | print(info) 52 | 53 | To download the PDB file to a new directory, run 54 | 55 | db.download_pdb_structure('6u1m', directory='structures') 56 | 57 | 58 | ## Run from the command line 59 | 60 | ### Obtain the entire sequence 61 | 62 | To obtain such a sequence and to save it in a more easily usable CSV file, run 63 | 64 | python -m gpcrmining.gpcrdb -n GPCR_NAME -d DIR 65 | 66 | with "GPCR_NAME" being the name of the GPCR as used in the corresponding GPCRdb URL. "DIR" is the directory where the data should be saved (default: data-gpcrdb), which is created if it does not exist. For example, 67 | 68 | python -m gpcrmining.gpcrdb -n adrb1_human -d my-data-from-gpcrmd 69 | 70 | writes the file _gpcrdb-residues_adrb1_human.csv_ into the directory _my-data-from-gpcrmd_. 71 | 72 | ### Select and print residues 73 | 74 | To select residues by their sequential number, use the option _-rn_. To select multiple residues, their IDs have to be separated by a whitespace and everything enclosed in quotation marks. 75 | 76 | python -m gpcrmining.gpcrdb -n adrb1_human -rn "230 231 232 233 313 339" 77 | 78 | To select residues by a generic residue numbering scheme, use the option _-id_. 79 | GPCRdb uses two similar [numbering systems](https://docs.gpcrdb.org/generic_numbering.html) (one sequence-based, following Ballesteros-Weinstein, Wootten,...
and one corrected for helix bulges). 80 | By default, the code will return the combined format. 81 | For input, both formats can be used (BW etc. with a dot as separator and the GPCRdb format with x) as well as the combined one. Numbering schemes can be mixed, e.g., 82 | 83 | python -m gpcrmining.gpcrdb -n adrb1_human -id "5.45 5x461 6.24 6.27 6.50x50" 84 | 85 | To select defined parts of the receptor, use the option _-p_. 86 | 87 | python -m gpcrmining.gpcrdb -n adrb1_human -p "N-term TM7 ICL2" 88 | 89 | If several selection flags are provided, only residues that fulfill all conditions will be printed. For example, 90 | 91 | python -m gpcrmining.gpcrdb -n adrb1_human -id "5.45 5x461 6.24 6.27 6.50x50" -rn "230 231 232 233 313 339" 92 | 93 | prints the following: 94 | 95 | Residue mapping for adrb1_human, using directory ./data-gpcrdb. 96 | TM5 231 V 5.45x46 97 | TM5 232 S 5.46x461 98 | TM6 313 R 6.24x24 99 | TM6 339 P 6.50x50 100 | 101 | To obtain analogous residues across receptors, use a multiple-entry string, just as for the residues: 102 | 103 | python -m gpcrmining.gpcrdb -n "adrb1_human adrb2_human" -id "5.45 5x461 6.24 6.27 6.50x50" 104 | 105 | 106 | ### Output formats 107 | 108 | Available output formats are 'plain' and 'drormd', with 'plain' (as above) being the default. 109 | 110 | If you would like to have another format added, you have two options: 111 | - open an issue with a description of what you have in mind or 112 | - fork the repo, implement your favorite format as an additional option, and open a pull request. 113 | 114 | The specific DrorMD format has an option to define one or multiple segment IDs. 115 | For example, 116 | 117 | python -m gpcrmining.gpcrdb -n adrb1_human -id "6.24 6.27 6.50" -f drormd -s 'P0 P1' 118 | 119 | prints the numbers in a format that can be directly copied into a DrorMD conditions file: 120 | 121 | Residue mapping for adrb1_human, using directory ./data-gpcrdb. 
122 | 'R6.24x24': 'segid P0 P1 and resid 313' 123 | 'A6.27x27': 'segid P0 P1 and resid 316' 124 | 'P6.50x50': 'segid P0 P1 and resid 339' 125 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # Tasks 2 | - [x] Use proper API for residues, see [API documentation](https://gpcrdb.org/services/reference/). 3 | - [x] Option for multiple receptors (as whitespace-separated string). 4 | - [x] Option to circumvent download (--no-write). 5 | - [ ] Option to select receptor via [Uniprot mapping](https://files.gpcrdb.org/uniprot_mapping.txt). 6 | - [x] Add example for using the library in python to the documentation. 7 | - [ ] Write proper unit tests. 8 | - [ ] Create doc pages? 9 | - [ ] Catch invalid inputs. 10 | - [ ] Mapping between PDB ID, Uniprot ID and GPCRdb ID of a given receptor. 11 | - [ ] Include information about structures from the AlphaFold-EMBL database. 12 | - [x] Make numbering conversion work for arrestins and G proteins 13 | -------------------------------------------------------------------------------- /gpcrmining/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drorlab/GPCR-mining/e1f3b2980629d83ac04e1904fdcd770c54e0d69c/gpcrmining/__init__.py -------------------------------------------------------------------------------- /gpcrmining/gpcrdb/__init__.py: -------------------------------------------------------------------------------- 1 | from .sequence import * 2 | from .structure import * 3 | -------------------------------------------------------------------------------- /gpcrmining/gpcrdb/__main__.py: -------------------------------------------------------------------------------- 1 | from .sequence import * 2 | 3 | 4 | def _split_argument(arg): 5 | return [i for i in arg.split(' ') if i] 6 | 7 | 8 | @click.command() 9 | @click.option('-n', 
'--name', type=str, required=True) 10 | @click.option('-d', '--directory', type=click.Path(exists=False), default='./data-gpcrdb') 11 | @click.option('-id', '--gpcrdb-id', type=str, default='') 12 | @click.option('-rn', '--res_num', type=str, default='') 13 | @click.option('-p', '--part', type=str, default='') 14 | @click.option('-f', '--fmt', type=str, default='plain') 15 | @click.option('-s', '--segid', type=str, default='R') 16 | @click.option('--write/--no-write', type=bool, default=True) 17 | def main(name, directory, gpcrdb_id, res_num, part, fmt, segid, write): 18 | 19 | # Create lists from string arguments. 20 | gpcrdb_id = _split_argument(gpcrdb_id) 21 | res_num = _split_argument(res_num) 22 | part = _split_argument(part) 23 | name = _split_argument(name) 24 | 25 | # Loop over all receptor entries. 26 | for entry in name: 27 | 28 | # Obtain information for all residues in this entry. 29 | msg = 'Residue mapping for '+entry 30 | if write: 31 | msg += ', using directory '+directory+'.' 32 | if not Path(directory).joinpath('gpcrdb-residues_'+entry+'.csv').is_file(): 33 | res_info = download_gpcrdb_residues(entry, directory=directory, show=False) 34 | res_info = load_as_array(entry, directory) if write else res_info 35 | else: 36 | msg += ', using no directory on disk.' 37 | res_info = download_gpcrdb_residues(entry, directory=None, show=False) 38 | 39 | # Select residues to print. 40 | if len(gpcrdb_id) > 0: 41 | res_info = select_by_gpcrdbnum(res_info, gpcrdb_id) 42 | if len(res_num) > 0: 43 | res_info = select_by_resnum(res_info, res_num) 44 | if len(part) > 0: 45 | res_info = select_by_part(res_info, part) 46 | 47 | # Print the residues. 
48 | print(msg) 49 | print_residues(res_info, fmt=fmt, segid=segid) 50 | 51 | 52 | if __name__ == "__main__": 53 | main() 54 | 55 | -------------------------------------------------------------------------------- /gpcrmining/gpcrdb/sequence.py: -------------------------------------------------------------------------------- 1 | import os 2 | import click 3 | import requests 4 | import numpy as np 5 | from pathlib import Path 6 | 7 | 8 | # Functions to obtain residue information from the GPCRdb 9 | 10 | 11 | def download_gpcrdb_residues(name, directory=None, show=False, scrape=False): 12 | """ 13 | Downloads a sequence with segment annotations and sequential+generic residue numbers from GPCRdb. 14 | Optionally saves it as CSV file or prints it to the command line. 15 | 16 | Parameters 17 | ---------- 18 | name : str 19 | Name of the GPCR to download (as in its GPCRdb URL). 20 | directory : str 21 | Target directory. 22 | The directory is created if it does not exist. 23 | show : bool 24 | Print the sequence to the terminal. 
def get_residue_info(name, num_scheme=None, verbose=False):
    """
    Gets residue info from the GPCRdb via its web services.

    Parameters
    ----------
        name : str
            Name of the protein to download (as in its GPCRdb URL).
        num_scheme : str
            Alternative numbering scheme to use.
            If None, use the system used for display in the GPCRdb.
            Only works for GPCRs, not for arrestins or G proteins.
        verbose : bool
            Print the numbering scheme of the protein entry.
            Only has an effect if num_scheme is None.

    Returns
    -------
        residue_info : list
            A list in which each item contains the following entries:
            - part of the residue
            - sequential residue number
            - amino acid 1-letter code
            - GPCRdb residue ID (empty string if not available)

    Raises
    ------
        requests.HTTPError
            If the GPCRdb answers a request with an error status.

    """
    services_url = 'https://gpcrdb.org/services/'
    # The protein entry is only needed to report the numbering scheme,
    # so the extra request is skipped unless its result is printed.
    if num_scheme is None and verbose:
        response = requests.get(services_url+'protein/'+name)
        response.raise_for_status()
        scheme = response.json()['residue_numbering_scheme']
        print('Numbering scheme: '+scheme)
    # Fetch the residue information
    response = requests.get(services_url+'residues/extended/'+name)
    # Fail with a clear HTTP error instead of an obscure JSON/key error later.
    response.raise_for_status()
    residue_data = response.json()
    # Extract info in array format
    residue_info = []
    for res in residue_data:
        if num_scheme is None:
            res_dbid = res['display_generic_number']
        else:
            res_dbid = ''
            # If the scheme is listed more than once, the last label wins.
            for num in res['alternative_generic_numbers']:
                if num['scheme'] == num_scheme:
                    res_dbid = num['label']
        # Residues without a generic number are reported with an empty ID.
        if res_dbid is None:
            res_dbid = ''
        residue_info.append([
            res['protein_segment'],
            res['sequence_number'],
            res['amino_acid'],
            res_dbid,
        ])
    return residue_info
120 | 121 | Returns 122 | ------- 123 | residue_info : list 124 | A list in which each item contains the following strings: 125 | - part of the residue 126 | - sequential residue number 127 | - amino acid 1-letter code 128 | - GPCRdb residue ID 129 | """ 130 | url = 'https://gpcrdb.org/protein/'+name+'/' 131 | req = requests.get(url, allow_redirects=True) 132 | txt = req.content.decode(req.encoding) 133 | residue_html = _extract_gpcrdb_residue_html(txt) 134 | residue_info = _extract_gpcrdb_residue_info(residue_html) 135 | return residue_info 136 | 137 | 138 | def _extract_gpcrdb_residue_html(txt): 139 | """ 140 | Extracts the relevant lines for all residues from a GPCRdb html entry. 141 | 142 | Parameters 143 | ---------- 144 | txt : str 145 | Content (html) of the website with the GPCRdb entry. 146 | 147 | Returns 148 | ------- 149 | residue_html : list 150 | A list in which each item contains the html lines for one residue. 151 | 152 | """ 153 | res_start_line = '