├── .gitignore
├── LICENSE
├── README.md
├── TODO.md
├── gpcrmining
    ├── __init__.py
    └── gpcrdb
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── sequence.py
    │   └── structure.py
├── setup.py
└── tests
    ├── test_sequence_arrestin.py
    ├── test_sequence_gpcr.py
    └── test_sequence_gprotein.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Manual exclusions
  2 | data*
  3 | 
  4 | # Byte-compiled / optimized / DLL files
  5 | __pycache__/
  6 | *.py[cod]
  7 | *$py.class
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | build/
 15 | develop-eggs/
 16 | dist/
 17 | downloads/
 18 | eggs/
 19 | .eggs/
 20 | lib/
 21 | lib64/
 22 | parts/
 23 | sdist/
 24 | var/
 25 | wheels/
 26 | pip-wheel-metadata/
 27 | share/python-wheels/
 28 | *.egg-info/
 29 | .installed.cfg
 30 | *.egg
 31 | MANIFEST
 32 | 
 33 | # PyInstaller
 34 | #  Usually these files are written by a python script from a template
 35 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 36 | *.manifest
 37 | *.spec
 38 | 
 39 | # Installer logs
 40 | pip-log.txt
 41 | pip-delete-this-directory.txt
 42 | 
 43 | # Unit test / coverage reports
 44 | htmlcov/
 45 | .tox/
 46 | .nox/
 47 | .coverage
 48 | .coverage.*
 49 | .cache
 50 | nosetests.xml
 51 | coverage.xml
 52 | *.cover
 53 | *.py,cover
 54 | .hypothesis/
 55 | .pytest_cache/
 56 | 
 57 | # Translations
 58 | *.mo
 59 | *.pot
 60 | 
 61 | # Django stuff:
 62 | *.log
 63 | local_settings.py
 64 | db.sqlite3
 65 | db.sqlite3-journal
 66 | 
 67 | # Flask stuff:
 68 | instance/
 69 | .webassets-cache
 70 | 
 71 | # Scrapy stuff:
 72 | .scrapy
 73 | 
 74 | # Sphinx documentation
 75 | docs/_build/
 76 | 
 77 | # PyBuilder
 78 | target/
 79 | 
 80 | # Jupyter Notebook
 81 | .ipynb_checkpoints
 82 | 
 83 | # IPython
 84 | profile_default/
 85 | ipython_config.py
 86 | 
 87 | # pyenv
 88 | .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 98 | __pypackages__/
 99 | 
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 | 
104 | # SageMath parsed files
105 | *.sage.py
106 | 
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Dror Lab
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # GPCR-mining
  2 | Functions to scrape data about G protein-coupled receptors (GPCRs) from the web.
  3 | 
  4 | The [__GPCRdb__](https://gpcrdb.org) provides a comprehensive overview for sequence information about a GPCR, including definitions of transmembrane helices and generic residue numbering.
  5 | Looking up a large number of residues or including the conversion for a specific receptor into an automated workflow can become tedious. Here we provide code to download and display this data.
  6 | 
  7 | 
  8 | ## Installation
  9 | 
 10 | You can install the latest release of the python package via pip
 11 | 
 12 |     pip install gpcrmining
 13 | 
 14 | or an editable installation from this repository
 15 | 
 16 |     git clone https://github.com/drorlab/GPCR-mining
 17 |     cd GPCR-mining
 18 |     pip install -e .
 19 | 
 20 | 
 21 | ## Run within Python code
 22 | 
 23 | To include the functions in your Python workflow, import the library via
 24 | 
 25 |     import gpcrmining.gpcrdb as db
 26 | 
 27 | ### Sequence information
 28 | 
 29 | You can download all information into a list of residues
 30 | 
 31 |     gpcr_name = 'acm2_human'
 32 |     res_info = db.get_residue_info(gpcr_name)
 33 | 
 34 | ... convert sequential numbers to the generic GPCRdb numbers
 35 | 
 36 |     db_num = db.sequential_to_gpcrdb('acm2_human', [393, 194, 151, 154, 190, 68, 108]))
 37 |     print(db_num)
 38 | 
 39 | ... or the other way round
 40 | 
 41 |     seq_num = db.gpcrdb_to_sequential('acm2_human', ['6.41x41', '5.46x461', '4.56x56', '5.42x43'])    
 42 |     print(seq_num)
 43 |     
 44 | These conversion functions also work with the generic numbering schemes for signalling proteins (arrestins and G proteins).
 45 | 
 46 | ### Structures
 47 | 
 48 | You can obtain information about any GPCR structure from the PDB such as the entry names of receptor and signaling proteins, chain names, or details on the experimental method. Run the following to obtain a dictionary with the corresponding information:
 49 | 
 50 |     info = db.get_structure_info('6u1m')
 51 |     print(info)
 52 |     
 53 | To download the PDB file to a new directory, run
 54 | 
 55 |     download_pdb_structure('6u1m', directory='structures')
 56 |     
 57 | 
 58 | ## Run from the command line
 59 | 
 60 | ### Obtain the entire sequence
 61 | 
 62 | To obtain such a sequence and to save it in a more easily usable CSV file, run
 63 | 
 64 |     python -m gpcrmining.gpcrdb -n GPCR_NAME -d DIR
 65 | 
 66 | with "GPCR_NAME" being the name of the GPCR as used in the corresponding GPCRmd URL. "DIR" is the directory where the data should be saved (default: data-gpcrmd), which is created if it does not exist. For example,
 67 | 
 68 |     python -m gpcrmining.gpcrdb -n adrb1_human -d my-data-from-gpcrmd
 69 | 
 70 | writes the file _gpcrdb-residues_adrb1_human.csv_ into the directory _my-data-from-gpcrmd_.
 71 | 
 72 | ### Select and print residues
 73 | 
 74 | To select residues by their sequential number, use the option _-rn_. To select multiple residues, their IDs have to be separated by a whitespace and everything enclosed in quotation marks.
 75 | 
 76 |     python -m gpcrmining.gpcrdb -n adrb1_human -rn "230 231 232 233 313 339" 
 77 |     
 78 | To select residues by a generic residue numbering scheme, use the option _-id_.
 79 | GPCRdb uses two similar [numbering systems](https://docs.gpcrdb.org/generic_numbering.html) (one sequence-based, following Ballesteros-Weistein, Wooten,... and one corrected for helix bulges).
 80 | By default, the code will return the combined format. 
 81 | For input, both formats can be used (BW etc. with a dot as separator and the GPCRdb format with x) as well as the combined one. Numbering schemes can be mixed, e.g.,
 82 | 
 83 |     python -m gpcrmining.gpcrdb -n adrb1_human -id "5.45 5x461 6.24 6.27 6.50x50"
 84 | 
 85 | To select defined parts of the receptor, use the option _-p_.
 86 | 
 87 |     python -m gpcrmining.gpcrdb -n adrb1_human -p "N-term TM7 ICL2"
 88 | 
 89 | If several selection flags are provided, only residues that fulfill all conditions will be printed. For example,
 90 | 
 91 |     python -m gpcrmining.gpcrdb -n adrb1_human -id "5.45 5x461 6.24 6.27 6.50x50" -rn "230 231 232 233 313 339"    
 92 |     
 93 | prints the following:
 94 | 
 95 |     Residue mapping for adrb1_human, using directory ./data-gpcrdb.
 96 |        TM5  231 V 5.45x46
 97 |        TM5  232 S 5.46x461
 98 |        TM6  313 R 6.24x24
 99 |        TM6  339 P 6.50x50
100 | 
101 | To obtain analogous residues across receptors, use a multiple-entry string, just as for the residues:
102 | 
103 |     python -m gpcrmining.gpcrdb -n "adrb1_human adrb2_human" -id "5.45 5x461 6.24 6.27 6.50x50"
104 | 
105 | 
106 | ### Output formats
107 | 
108 | Available output formats are 'plain' and 'drormd', with 'plain' (as above) being the default. 
109 | 
110 | If you would like to have another format added, you have two options:
111 | - open an issue with a description of what you have in mind or
112 | - fork the repo, implement your favorite format as an additional option, and open a pull request. 
113 | 
114 | The specific DrorMD format has an option to define one or multiple segment IDs.
115 | For example, 
116 | 
117 |     python -m gpcrmining.gpcrdb -n adrb1_human -id "6.24 6.27 6.50" -f drormd -s 'P0 P1'
118 | 
119 | prints the numbers in a format that can be directly copied into a DrorMD conditions file:
120 | 
121 |     Residue mapping for adrb1_human, using directory ./data-gpcrdb.
122 |     'R6.24x24': 'segid P0 P1 and resid 313'
123 |     'A6.27x27': 'segid P0 P1 and resid 316'
124 |     'P6.50x50': 'segid P0 P1 and resid 339'
125 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
 1 | # Tasks
 2 | - [x] Use proper API for residues, see [API documentation](https://gpcrdb.org/services/reference/).
 3 | - [x] Option for multiple receptors (as whitespace-separated string).
 4 | - [x] Option to circumvent download (--no-write).
 5 | - [ ] Option to select receptor via [Uniprot mapping](https://files.gpcrdb.org/uniprot_mapping.txt).
 6 | - [x] Add example for using the library in python to the documentation.
 7 | - [ ] Write proper unit tests.
 8 | - [ ] Create doc pages?
 9 | - [ ] Catch invalid inputs.
10 | - [ ] Mapping between PDB ID, Uniprot ID and GPCRdb ID of a given receptor.
11 | - [ ] Include information about structures from the AlphaFold-EMBL database.
12 | - [x] Make numbering conversion work for arrestins and G proteins
13 | 


--------------------------------------------------------------------------------
/gpcrmining/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/drorlab/GPCR-mining/e1f3b2980629d83ac04e1904fdcd770c54e0d69c/gpcrmining/__init__.py


--------------------------------------------------------------------------------
/gpcrmining/gpcrdb/__init__.py:
--------------------------------------------------------------------------------
1 | from .sequence import *
2 | from .structure import *
3 | 


--------------------------------------------------------------------------------
/gpcrmining/gpcrdb/__main__.py:
--------------------------------------------------------------------------------
 1 | from .sequence import *
 2 | 
 3 | 
 4 | def _split_argument(arg):
 5 |     return [i for i in arg.split(' ') if i] 
 6 | 
 7 | 
 8 | @click.command()
 9 | @click.option('-n', '--name', type=str, required=True)
10 | @click.option('-d', '--directory', type=click.Path(exists=False), default='./data-gpcrdb')
11 | @click.option('-id', '--gpcrdb-id', type=str, default='')
12 | @click.option('-rn', '--res_num', type=str, default='')
13 | @click.option('-p', '--part', type=str, default='')
14 | @click.option('-f', '--fmt', type=str, default='plain')
15 | @click.option('-s', '--segid', type=str, default='R')
16 | @click.option('--write/--no-write', type=bool, default=True)
17 | def main(name, directory, gpcrdb_id, res_num, part, fmt, segid, write):
18 | 
19 |     # Create lists from string arguments.
20 |     gpcrdb_id = _split_argument(gpcrdb_id)
21 |     res_num = _split_argument(res_num)
22 |     part = _split_argument(part)
23 |     name = _split_argument(name)
24 | 
25 |     # Loop over all receptor entries.
26 |     for entry in name:
27 |     
28 |         # Obtain information for all residues in this entry.
29 |         msg = 'Residue mapping for '+entry
30 |         if write:
31 |             msg += ', using directory '+directory+'.'
32 |             if not Path(directory).joinpath('gpcrdb-residues_'+entry+'.csv').is_file():
33 |                 res_info = download_gpcrdb_residues(entry, directory=directory, show=False)
34 |             res_info = load_as_array(entry, directory) if write else res_info
35 |         else:
36 |             msg += ', using no directory on disk.'
37 |             res_info = download_gpcrdb_residues(entry, directory=None, show=False)
38 |         
39 |         # Select residues to print.        
40 |         if len(gpcrdb_id) > 0:
41 |             res_info = select_by_gpcrdbnum(res_info, gpcrdb_id)
42 |         if len(res_num) > 0:
43 |             res_info = select_by_resnum(res_info, res_num)
44 |         if len(part) > 0:
45 |             res_info = select_by_part(res_info, part)
46 |             
47 |         # Print the residues.
48 |         print(msg)
49 |         print_residues(res_info, fmt=fmt, segid=segid)
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     main()
54 | 
55 | 


--------------------------------------------------------------------------------
/gpcrmining/gpcrdb/sequence.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import click
  3 | import requests
  4 | import numpy as np
  5 | from pathlib import Path
  6 | 
  7 | 
  8 | # Functions to obtain residue information from the GPCRdb
  9 | 
 10 | 
 11 | def download_gpcrdb_residues(name, directory=None, show=False, scrape=False):
 12 |     """
 13 |     Downloads a sequence with segment annotations and sequential+generic residue numbers from GPCRdb. 
 14 |     Optionally saves it as CSV file or prints it to the command line.
 15 |     
 16 |     Parameters
 17 |     ----------
 18 |         name : str
 19 |             Name of the GPCR to download (as in its GPCRdb URL). 
 20 |         directory : str
 21 |             Target directory.
 22 |             The directory is created if it does not exist.
 23 |         show : bool
 24 |             Print the sequence to the terminal.
 25 |             
 26 |     Returns
 27 |     -------
 28 |         residue_info : list
 29 |             A list in which each item contains the following strings:
 30 |             - part of the residue
 31 |             - sequential residue number
 32 |             - amino acid 1-letter code
 33 |             - GPCRdb residue ID
 34 |             
 35 |     """  
 36 |     # Fetch data from GPCRdb
 37 |     if scrape: 
 38 |         # scrape info from the website HTML (outdated, for test purposes only)
 39 |         residue_info = scrape_residue_info(name)
 40 |     else: 
 41 |         # retrieve info via the GPCRdb API
 42 |         residue_info = get_residue_info(name)
 43 |     # Write information to CSV file
 44 |     if directory is not None:
 45 |         Path(directory).mkdir(parents=True, exist_ok=True)
 46 |         out_filename = Path(directory).joinpath('gpcrdb-residues_'+name+'.csv')
 47 |         np.savetxt(out_filename, residue_info, delimiter=',', fmt='%s', 
 48 |                    header='Part, SeqID, Code, GPCRdbID')
 49 |     # Print information to screen
 50 |     if show:
 51 |         for res in residue_info:
 52 |             print('%6s %4s %1s %s'%(res[0],res[1],res[2],res[3]))
 53 |     return residue_info
 54 |     
 55 |     
 56 | def get_residue_info(name, num_scheme=None, verbose=False):
 57 |     """
 58 |     Gets residue info from the GPCRdb
 59 |     
 60 |     Parameters
 61 |     ----------
 62 |         name : str
 63 |             Name of the protein to download (as in its GPCRdb URL). 
 64 |         num_scheme : str
 65 |             Alternative numbering scheme to use. 
 66 |             If None, use the system used for display in the GPCRdb.
 67 |             Only works for GPCRs, not for arrestins or G proteins.
 68 |         verbose : bool
 69 |             Print info about the numbering scheme.
 70 |             
 71 |             
 72 |     Returns
 73 |     -------
 74 |         residue_info : list
 75 |             A list in which each item contains the following strings:
 76 |             - part of the residue
 77 |             - sequential residue number
 78 |             - amino acid 1-letter code
 79 |             - GPCRdb residue ID
 80 |             
 81 |     """
 82 |     # Fetch the protein 
 83 |     url = 'https://gpcrdb.org/services/protein/'+name
 84 |     response = requests.get(url)
 85 |     protein_data = response.json()
 86 |     # Determine the numbering scheme
 87 |     if num_scheme is None and verbose:
 88 |         scheme = protein_data['residue_numbering_scheme']
 89 |         print('Numbering scheme: '+scheme)
 90 |     # Fetch the residue information
 91 |     url = 'https://gpcrdb.org/services/residues/extended/'+name
 92 |     response = requests.get(url)
 93 |     residue_data = response.json()
 94 |     # Extract info in array format
 95 |     residue_info = []
 96 |     for res in residue_data:
 97 |         res_part = res['protein_segment']
 98 |         res_seqn = res['sequence_number']
 99 |         res_code = res['amino_acid']
100 |         if num_scheme == None:
101 |             res_dbid = res['display_generic_number']
102 |         else:
103 |             res_dbid = ''
104 |             for num in res['alternative_generic_numbers']:
105 |                 if num['scheme'] == num_scheme:
106 |                     res_dbid = num['label']
107 |         if res_dbid == None: res_dbid = ''
108 |         residue_info.append([res_part, res_seqn, res_code, res_dbid])
109 |     return residue_info
110 | 
111 | 
112 | def scrape_residue_info(name):
113 |     """
114 |     Scrapes residue info from the GPCRdb website html
115 |     
116 |     Parameters
117 |     ----------
118 |         name : str
119 |             Name of the GPCR to download (as in its GPCRdb URL). 
120 |             
121 |     Returns
122 |     -------
123 |         residue_info : list
124 |             A list in which each item contains the following strings:
125 |             - part of the residue
126 |             - sequential residue number
127 |             - amino acid 1-letter code
128 |             - GPCRdb residue ID
129 |     """
130 |     url = 'https://gpcrdb.org/protein/'+name+'/'
131 |     req = requests.get(url, allow_redirects=True)
132 |     txt = req.content.decode(req.encoding)
133 |     residue_html = _extract_gpcrdb_residue_html(txt)
134 |     residue_info = _extract_gpcrdb_residue_info(residue_html)
135 |     return residue_info
136 |     
137 |     
138 | def _extract_gpcrdb_residue_html(txt):
139 |     """
140 |     Extracts the relevant lines for all residues from a GPCRdb html entry.
141 |     
142 |     Parameters
143 |     ----------
144 |         txt : str
145 |             Content (html) of the website with the GPCRdb entry.
146 | 
147 |     Returns
148 |     -------
149 |         residue_html : list
150 |             A list in which each item contains the html lines for one residue.
151 |             
152 |     """  
153 |     res_start_line = '                        <td class="seqv seqv-sequence">'
154 |     res_end_line = '                        </td>'
155 |     spl = txt.split('\n')
156 |     residue_html = []
157 |     for lnum, line in enumerate(spl):
158 |         if line == res_start_line:
159 |             residue_lines = spl[lnum:lnum+12]
160 |             # Use fewer lines if the residue is shorter
161 |             # (i.e. has no GPCRdb number)
162 |             if residue_lines[-4] == res_end_line:
163 |                 residue_lines = residue_lines[:-3]
164 |             residue_html.append(residue_lines)
165 |     return residue_html
166 | 
167 | 
168 | def _extract_gpcrdb_residue_info(residue_html):
169 |     """
170 |     Extracts the relevant info from GPCRdb html entries of residues.
171 |     
172 |     Parameters
173 |     ----------
174 |         residue_html : list
175 |             A list in which each item contains the html lines for one residue.
176 | 
177 |     Returns
178 |     -------
179 |         residue_info : list
180 |             A list in which each item contains the following strings:
181 |             - part of the residue
182 |             - sequential residue number
183 |             - amino acid 1-letter code
184 |             - GPCRdb residue ID
185 |             
186 |     """
187 |     residue_info = []
188 |     for res in residue_html:
189 |         res_part = res[2].split('>')[1]
190 |         res_seqn = res[3].split(' # ')[1][1:]
191 |         res_code = res[-3].split(' ')[-1]
192 |         if len(res) == 12 and 'GPCRdb' in res[5]:
193 |             res_dbid = res[5].split(' # ')[-1][1:]
194 |         else:
195 |             res_dbid = ''
196 |         residue_info.append([res_part, res_seqn, res_code, res_dbid])
197 |     return residue_info
198 | 
199 |     
200 | 
201 | # Functions to load a receptor data file
202 | 
203 | 
204 | def load_as_array(name, directory):
205 |     filename = Path(directory).joinpath('gpcrdb-residues_'+name+'.csv')
206 |     return np.loadtxt(filename ,dtype=str, delimiter=',', skiprows=1)
207 | 
208 | 
209 | def load_as_dataframe(name, directory):
210 |     filename = Path(directory).joinpath('gpcrdb-residues_'+name+'.csv')
211 |     df = pd.read_csv(filename)
212 |     for i in range(len(df.columns)):
213 |         df.columns.values[i] = df.columns.values[i].split(' ')[-1]
214 |     return df
215 |     
216 |     
217 | # Functions to print and select residues
218 |     
219 | 
220 | def print_residues(ar, fmt='plain', segid='R'):    
221 |     for res in ar:      
222 |         # Read out residue data and build labels
223 |         resnum = res[1]
224 |         if res[3] == '':
225 |             reslabel = res[2]+res[1]
226 |         else:
227 |             reslabel = res[2]+res[3]            
228 |         # Print if a valid format is given
229 |         if fmt=='plain':
230 |             print('%6s %4s %1s %s'%(res[0],res[1],res[2],res[3]))
231 |         elif fmt=='drormd':
232 |             drorlabel=reslabel.split('x')[0].replace('.','x')
233 |             print("    '%s': 'segid %s and resid %s',"%(drorlabel, segid, resnum))           
234 |     return
235 |     
236 |     
237 | def select_by_gpcrdbnum(res_array, gpcrdb_num):
238 |     out_list = []
239 |     # Go through all residues
240 |     for res in res_array:
241 |         selected = False
242 |         # Read out residue data and build labels
243 |         resnum = res[1]
244 |         if res[3] == '':
245 |             reslabel = res[1]
246 |             if reslabel in gpcrdb_num:
247 |                 selected = True
248 |         else:
249 |             reslabel = res[3]
250 |             reslabel_bw = res[3].split('x')[0]
251 |             reslabel_db = res[3].split('.')[0]+'x'+res[3].split('x')[1]
252 |             if reslabel in gpcrdb_num:
253 |                 selected = True
254 |             if reslabel_bw in gpcrdb_num:
255 |                 selected = True
256 |             if reslabel_db in gpcrdb_num:
257 |                 selected = True
258 |         # Add residue info if it is in the list
259 |         if selected:
260 |             out_list.append(res)
261 |     return out_list
262 | 
263 | 
264 | def select_by_resnum(res_array, res_num):    
265 |     out_list = []   
266 |     # Go through all residues
267 |     for res in res_array:           
268 |         # Add residue info if it is in the list
269 |         if res[1] in res_num:
270 |             out_list.append(res)
271 |     return out_list
272 | 
273 | 
274 | def select_by_part(res_array, parts):
275 |     out_list = []
276 |     # Go through all residues
277 |     for res in res_array:
278 |         # Add residue info if it is in the list
279 |         if res[0] in parts:
280 |             out_list.append(res)
281 |     return out_list
282 | 
283 | 
284 | # Functions to convert residue numbers
285 | 
286 | 
287 | def sequential_to_gpcrdb(gpcr_name, resnums):
288 |     """
289 |     Gets GPCRdb number for residues provided as sequential residue number.
290 |     
291 |     Parameters
292 |     ----------
293 |     gpcr_name : str
294 |         Name of the GPCR as in the GPCRdb.
295 |     resnums : list of int
296 |         Sequential residue numbers.
297 |         
298 |     Returns
299 |     -------
300 |     dbnums : list of str
301 |         GPCRdb numbering of the residues.
302 |     """
303 |     res_array = get_residue_info(gpcr_name)
304 |     label3 = {int(res[1]):res[3] for res in res_array}
305 |     label0 = {int(res[1]):res[0] for res in res_array}
306 |     dbnums = []
307 |     for rn in resnums:
308 |         if rn in label3.keys():
309 |             label = label3[rn] if len(label3[rn])>0 else label0[rn]
310 |         else:
311 |             label = ''
312 |         dbnums.append(label)
313 |     return dbnums
314 | 
315 | 
316 | def gpcrdb_to_sequential(gpcr_name, resnums):
317 |     """
318 |     Gets sequential residue numbers for residues provided as GPCRdb numbers.
319 |     
320 |     Parameters
321 |     ----------
322 |     gpcr_name : str
323 |         Name of the GPCR as in the GPCRdb.
324 |     resnums : list of str
325 |         Residue numbers in GPCRdb format without AA name, e.g., '5.42x43'.
326 |         
327 |     Returns
328 |     -------
329 |     dbnums : list of int
330 |         Sequential numbers of the residues.
331 |     """
332 |     res_array = get_residue_info(gpcr_name)
333 |     label1 = {res[3]:int(res[1]) for res in res_array}
334 |     seqnums = []
335 |     for rn in resnums:
336 |         if rn == '':
337 |             label = None
338 |         elif rn in label1.keys():
339 |             label = label1[rn] 
340 |         else:
341 |             label = None
342 |         seqnums.append(label)
343 |     return seqnums
344 | 


--------------------------------------------------------------------------------
/gpcrmining/gpcrdb/structure.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import urllib
 3 | import io
 4 | import gzip
 5 | from pathlib import Path
 6 | 
 7 | 
 8 | def get_pdb_structure_info(pdbid):
 9 |     """
10 |     Gets info about a PDB structure from the GPCRdb
11 |     
12 |     Parameters
13 |     ----------
14 |         pdbid : str
15 |             PDB ID of the structure. Can be upper- or lowercase. 
16 |             
17 |     Returns
18 |     -------
19 |         structure_info : dict
20 |             A dictionary with information about the PDB structure.
21 |                 
22 |     """
23 |     # Convert to uppercase
24 |     pdbid = pdbid.upper()
25 |     # Fetch the structure 
26 |     url = 'https://gpcrdb.org/services/structure/'+pdbid
27 |     response = requests.get(url)
28 |     str_info = response.json()
29 |     return str_info
30 | 
31 | 
32 | def download_pdb_structure(pdbid, directory='.'):
33 |     """
34 |     Downloads a structure from the PDB.
35 |     
36 |     Parameters
37 |     ----------
38 |         pdbid : str
39 |             PDB ID of the structure. Can be upper- or lowercase. 
40 |         directory : str
41 |             Directory in which to save the structure
42 |             
43 |     Returns
44 |     -------
45 |         pdbid : str
46 |             PDB ID of the structure in lowercase.
47 |                 
48 |     """    
49 |     pdbid = pdbid.lower()
50 | 
51 |     Path(directory).mkdir(parents=True, exist_ok=True)
52 |     out_filename = Path(directory).joinpath(pdbid+'.pdb')
53 | 
54 |     # Fetch the structure 
55 |     url = 'https://files.rcsb.org/download/'+pdbid+'.pdb.gz'
56 |     response = urllib.request.urlopen(url)
57 |     compressed_file = io.BytesIO(response.read())
58 |     decompressed_file = gzip.GzipFile(fileobj=compressed_file)
59 | 
60 |     with open(out_filename, 'wb') as outfile:
61 |         outfile.write(decompressed_file.read())
62 | 
63 |     return pdbid
64 | 
65 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | with open("README.md", "r", encoding="utf-8") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setup(name='gpcrmining',
 7 |       version='0.2.0',
 8 |       description='Functions to scrape GPCR data from the web.',
 9 |       url='http://github.com/drorlab/GPCR-mining',
10 |       author='Martin Voegele',
11 |       author_email='martinvoegele1989@gmail.com',
12 |       license='MIT',
13 |       packages=find_packages(include=[
14 |         'gpcrmining',
15 |         'gpcrmining.gpcrdb',
16 |       ]),
17 |       zip_safe=False,
18 |       install_requires=[
19 |         'numpy',
20 |         'pandas',
21 |         'click',
22 |         'requests',
23 |       ],
24 |       classifiers=[
25 |         # How mature is this project? Common values are
26 |         #   3 - Alpha
27 |         #   4 - Beta
28 |         #   5 - Production/Stable
29 |         'Development Status :: 4 - Beta',
30 |         # license (should match "license" above)
31 |         'License :: OSI Approved :: MIT License',
32 |         # Supported Python versions
33 |         'Programming Language :: Python :: 3',
34 |       ],)
35 | 
36 | 


--------------------------------------------------------------------------------
/tests/test_sequence_arrestin.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import os
 3 | import importlib
 4 | import gpcrmining.gpcrdb as db
 5 | 
 6 | arr_name = 'arrs_human'
 7 | seq_num = [15, 23, 166, 234]
 8 | gen_num = ['N.ns1.15', 'N.s1s2.03', 'N.s9s10.08', 'C.s14s15.02']
 9 | 
10 | def test_get_residue_info():
11 |     res_info = db.get_residue_info(arr_name)
12 |     assert len(res_info) == 405
13 |     assert res_info[0] == ['ns1', 1, 'M', 'N.ns1.01']
14 |     assert res_info[99] == ['s6h1', 100, 'A', 'N.s6h1.07']
15 |     pass
16 | 
17 | def test_gpcrdb_to_sequential():
18 |     seq = db.gpcrdb_to_sequential(arr_name, gen_num)
19 |     assert seq == seq_num
20 |     pass
21 |     
22 | def test_sequential_to_gpcrdb():
23 |     gen = db.sequential_to_gpcrdb(arr_name, seq_num)
24 |     assert gen == gen_num
25 |     pass
26 |     
27 | 


--------------------------------------------------------------------------------
/tests/test_sequence_gpcr.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import os
 3 | import importlib
 4 | import gpcrmining.gpcrdb as db
 5 | 
 6 | gpcr_name = 'adrb1_human'
 7 | seq_num = [330, 232, 189, 228]
 8 | gen_num = ['6.41x41', '5.46x461', '4.56x56', '5.42x43']
 9 | 
10 | def test_get_residue_info():
11 |     res_info = db.get_residue_info(gpcr_name)
12 |     assert len(res_info) == 477
13 |     assert res_info[0] == ['N-term', 1, 'M', '']
14 |     assert res_info[99] == ['TM2', 100, 'L', '2.46x46']
15 |     pass
16 | 
17 | def test_gpcrdb_to_sequential():
18 |     seq = db.gpcrdb_to_sequential(gpcr_name, gen_num)
19 |     assert seq == seq_num
20 |     pass
21 |     
22 | def test_sequential_to_gpcrdb():
23 |     gen = db.sequential_to_gpcrdb(gpcr_name, seq_num)
24 |     assert gen == gen_num
25 |     pass
26 |     
27 | 


--------------------------------------------------------------------------------
/tests/test_sequence_gprotein.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import os
 3 | import importlib
 4 | import gpcrmining.gpcrdb as db
 5 | 
 6 | gp_name = 'gnal_human'
 7 | seq_num = [15, 23, 166, 234]
 8 | gen_num = ['G.HN.30', 'G.HN.38', 'H.HE.06', 'G.S4.05']
 9 | 
10 | def test_get_residue_info():
11 |     res_info = db.get_residue_info(gp_name)
12 |     assert len(res_info) == 381
13 |     assert res_info[0] == ['HN', 1, 'M', 'G.HN.01'] 
14 |     assert res_info[99] == ['HA', 100, 'I', 'H.HA.29']
15 |     pass
16 | 
17 | def test_gpcrdb_to_sequential():
18 |     seq = db.gpcrdb_to_sequential(gp_name, gen_num)
19 |     assert seq == seq_num
20 |     pass
21 |     
22 | def test_sequential_to_gpcrdb():
23 |     gen = db.sequential_to_gpcrdb(gp_name, seq_num)
24 |     assert gen == gen_num
25 |     pass
26 |     
27 | 


--------------------------------------------------------------------------------