├── .travis.yml ├── DEPLOY.md ├── LICENSE.txt ├── MANIFEST ├── README.md ├── dist ├── genepattern-python-1.0.3.tar.gz ├── genepattern-python-1.0.4.tar.gz ├── genepattern-python-1.0.5.tar.gz ├── genepattern-python-1.0.6.tar.gz ├── genepattern-python-1.0.7.tar.gz ├── genepattern-python-1.1.0.tar.gz ├── genepattern-python-1.1.1.tar.gz ├── genepattern-python-1.2.0.tar.gz ├── genepattern-python-1.2.1.tar.gz ├── genepattern-python-1.2.2.tar.gz └── genepattern-python-1.2.3.tar.gz ├── gp ├── __init__.py ├── core.py ├── data.py └── modules.py ├── setup.cfg ├── setup.py └── test └── test_data.py /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | - "3.7" 5 | install: 6 | - python setup.py install 7 | - pip install pandas 8 | script: 9 | - pytest 10 | notifications: 11 | - slack: genepattern:OAj2q4hee6Mk5v9r6SUCIacZ 12 | -------------------------------------------------------------------------------- /DEPLOY.md: -------------------------------------------------------------------------------- 1 | # How to Deploy to PyPi Test 2 | 3 | 1. Make sure the version numbers in setup.py and gp/__init__.py (`__version__`) are updated 4 | 2. cd to *genepattern-python* directory 5 | 3. Remove any residual build artifacts from the last time genepattern-python was built. This step is not necessary the first time the package is built. 6 | > rm dist/\*.tar.gz; rm dist/\*.whl 7 | 4. Build the sdist and wheel artifacts. 8 | > python -m build . 9 | 5. Upload the files by running: 10 | > twine upload -r pypitest dist/\*.tar.gz; twine upload -r pypitest dist/\*.whl 11 | 6. If the upload fails go to [https://testpypi.python.org/pypi](https://testpypi.python.org/pypi) and manually upload dist/genepattern-python-*.tar.gz. 12 | 7. Test the deploy by uninstalling and reinstalling the package: 13 | > pip uninstall genepattern-python; 14 | > pip install -i https://test.pypi.org/simple/ genepattern-python 15 | > 16 | # How to Deploy to Production PyPi 17 | 18 | 1. 
First deploy to test and ensure everything is working correctly (see above). 19 | 2. cd to *genepattern-python* directory 20 | 3. Remove any residual build artifacts from the last time genepattern-python was built. This step is not necessary the first time the package is built. 21 | > rm dist/\*.tar.gz; rm dist/\*.whl 22 | 4. Build the sdist and wheel artifacts. 23 | > python -m build . 24 | 5. Upload the files by running: 25 | > twine upload dist/\*.tar.gz; twine upload dist/\*.whl 26 | 6. If the upload fails go to [https://pypi.python.org/pypi](https://pypi.python.org/pypi) and manually upload dist/genepattern-python-*.tar.gz. 27 | 7. Test the deploy by uninstalling and reinstalling the package: 28 | > pip uninstall genepattern-python; 29 | > pip install genepattern-python 30 | > 31 | # How to Deploy to Conda 32 | 33 | 1. Deploy to Production PyPi 34 | 2. Navigate to Anaconda directory 35 | > cd ~/opt/anaconda3 36 | 3. Run the following, removing the existing directory if necessary: 37 | > conda skeleton pypi genepattern-python --version XXX 38 | 4. Build the package: 39 | > conda build genepattern-python 40 | 5. Converting this package to builds for other operating systems can be done as shown below. You will need to upload each 41 | built version using a separate upload command. 42 | > conda convert --platform all ./conda-bld/osx-64/genepattern-python-XXX-py37_0.tar.bz2 -o conda-bld/ 43 | 6. Upload the newly built package: 44 | > anaconda upload ./conda-bld/*/genepattern-python-XXX-py37_0.tar.bz2 -u genepattern 45 | 7. Log into the [Anaconda website](https://anaconda.org/) to make sure everything is good. -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2015-2023, Regents of the University of California & Broad Institute 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | setup.cfg 3 | setup.py 4 | gp/__init__.py 5 | gp/core.py 6 | gp/data.py 7 | gp/modules.py 8 | test/test_data.py 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Version](https://img.shields.io/pypi/v/genepattern-python.svg)](https://pypi.python.org/pypi/genepattern-python) 2 | [![Build](https://travis-ci.org/genepattern/genepattern-python.svg?branch=master)](https://travis-ci.org/genepattern/genepattern-python) 3 | [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://github.com/genepattern/example-notebooks/blob/master/GenePattern%20Python%20Tutorial.ipynb) 4 | 5 | # GenePattern Python Library 6 | 7 | This is a Python library for working with GenePattern programmatically. Behind the scenes, calls from this library execute the GenePattern REST API. 8 | 9 | ## Supported Python Versions 10 | 11 | This library requires Python 3.6+. The bundled data submodule `gp.data` also requires [pandas](http://pandas.pydata.org/), although the rest of the module does not. 12 | 13 | **Python 2 Support:** Support for Python 2 was removed in version 1.4.0. Python 2 users should use version 1.3.1. 14 | 15 | ## Installing 16 | 17 | It is recommended to install this library from PIP. 
Simply execute the command below: 18 | 19 | > pip install genepattern-python 20 | 21 | ## Upgrading 22 | 23 | To upgrade to the latest version of the library, execute the command below: 24 | 25 | > pip install genepattern-python --upgrade 26 | 27 | ## Tutorial 28 | 29 | A tutorial on how to use the GenePattern Python Library is [available here](https://github.com/genepattern/example-notebooks/blob/master/GenePattern%20Python%20Tutorial.ipynb). 30 | 31 | ## "Connection Reset by Peer" Error 32 | 33 | Connecting to the GenePattern public server now requires TLS 1.2+. Older versions of SSL and TLS will no longer work. If you're attempting to connect and receiving a "Connection Reset by Peer" error, you will need to update the OpenSSL library associated with your Python installation. 34 | -------------------------------------------------------------------------------- /dist/genepattern-python-1.0.3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genepattern/genepattern-python/ec6109095732c628785ee2edfcd6aa46ab3baf8b/dist/genepattern-python-1.0.3.tar.gz -------------------------------------------------------------------------------- /dist/genepattern-python-1.0.4.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genepattern/genepattern-python/ec6109095732c628785ee2edfcd6aa46ab3baf8b/dist/genepattern-python-1.0.4.tar.gz -------------------------------------------------------------------------------- /dist/genepattern-python-1.0.5.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genepattern/genepattern-python/ec6109095732c628785ee2edfcd6aa46ab3baf8b/dist/genepattern-python-1.0.5.tar.gz -------------------------------------------------------------------------------- /dist/genepattern-python-1.0.6.tar.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/genepattern/genepattern-python/ec6109095732c628785ee2edfcd6aa46ab3baf8b/dist/genepattern-python-1.0.6.tar.gz -------------------------------------------------------------------------------- /dist/genepattern-python-1.0.7.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genepattern/genepattern-python/ec6109095732c628785ee2edfcd6aa46ab3baf8b/dist/genepattern-python-1.0.7.tar.gz -------------------------------------------------------------------------------- /dist/genepattern-python-1.1.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genepattern/genepattern-python/ec6109095732c628785ee2edfcd6aa46ab3baf8b/dist/genepattern-python-1.1.0.tar.gz -------------------------------------------------------------------------------- /dist/genepattern-python-1.1.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genepattern/genepattern-python/ec6109095732c628785ee2edfcd6aa46ab3baf8b/dist/genepattern-python-1.1.1.tar.gz -------------------------------------------------------------------------------- /dist/genepattern-python-1.2.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genepattern/genepattern-python/ec6109095732c628785ee2edfcd6aa46ab3baf8b/dist/genepattern-python-1.2.0.tar.gz -------------------------------------------------------------------------------- /dist/genepattern-python-1.2.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genepattern/genepattern-python/ec6109095732c628785ee2edfcd6aa46ab3baf8b/dist/genepattern-python-1.2.1.tar.gz 
-------------------------------------------------------------------------------- /dist/genepattern-python-1.2.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genepattern/genepattern-python/ec6109095732c628785ee2edfcd6aa46ab3baf8b/dist/genepattern-python-1.2.2.tar.gz -------------------------------------------------------------------------------- /dist/genepattern-python-1.2.3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genepattern/genepattern-python/ec6109095732c628785ee2edfcd6aa46ab3baf8b/dist/genepattern-python-1.2.3.tar.gz -------------------------------------------------------------------------------- /gp/__init__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | """ 4 | GenePattern Python Client 5 | 6 | Compatible with Python 3.6+ 7 | """ 8 | 9 | __authors__ = ['Thorin Tabor', 'Chet Birger'] 10 | __copyright__ = 'Copyright 2014-2024, Regents of the University of California & Broad Institute' 11 | __version__ = '24.01' 12 | __status__ = 'Production' 13 | 14 | # Import core functionality 15 | from .core import GPException, GPFile, GPJob, GPJobSpec, GPResource, GPServer, GPTask, GPTaskParam, GPJSONEncoder 16 | -------------------------------------------------------------------------------- /gp/core.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import urllib 3 | import base64 4 | import json 5 | import time 6 | from contextlib import closing 7 | import urllib.request 8 | import urllib.parse 9 | import urllib.error 10 | 11 | 12 | GP_JOB_TAG = 'GenePattern Python Client' 13 | 14 | 15 | class GPServer(object): 16 | """ 17 | Wrapper for data needed to make server calls. 18 | 19 | Wraps the server url, username and password, and provides helper function 20 | to construct the authorization header. 
21 | """ 22 | 23 | def __init__(self, url, username, password): 24 | self.url = url 25 | self.username = username 26 | self.password = password 27 | self.token = None 28 | self.last_job = None 29 | 30 | def __str__(self): 31 | return self.url + ' ' + self.username 32 | 33 | def authorization_header(self): 34 | """ 35 | Returns a string containing the authorization header used to authenticate 36 | with GenePattern. This string is included in the header of subsequent 37 | requests sent to GenePattern. 38 | """ 39 | return 'Basic %s' % base64.b64encode(bytes(self.username + ':' + self.password, 'ascii')).decode('ascii') 40 | 41 | def system_message(self): 42 | url = f"{self.url}/rest/v1/config/system-message" 43 | request = urllib.request.Request(url) 44 | response = urllib.request.urlopen(request) 45 | return response.read().decode('utf-8') 46 | 47 | def login(self): 48 | """Log in to the OAuth2 endpoint""" 49 | safe_username = urllib.parse.quote(self.username) 50 | safe_password = urllib.parse.quote(self.password) 51 | url = f"{self.url}/rest/v1/oauth2/token?grant_type=password&username={safe_username}&password={safe_password}&client_id=GenePatternNotebook-{safe_username}" 52 | 53 | request = urllib.request.Request(url) 54 | response = urllib.request.urlopen(request, b'') 55 | if response.getcode() != 200: 56 | raise urllib.error.HTTPError(url, response.getcode(), 'Invalid username or password', response.getheaders(), None) 57 | self.token = json.loads(response.read())['access_token'] 58 | return self.token 59 | 60 | def upload_file(self, file_name, file_path): 61 | """ 62 | Upload a file to a server 63 | 64 | Attempts to upload a local file with path filepath, to the server, where it 65 | will be named filename. 66 | 67 | Args: 68 | :param file_name: The name that the uploaded file will be called on the server. 69 | :param file_path: The path of the local file to upload. 
70 | 71 | Returns: 72 | :return: A GPFile object that wraps the URI of the uploaded file, or None if the upload fails. 73 | """ 74 | 75 | request = urllib.request.Request(self.url + '/rest/v1/data/upload/job_input?name=' + urllib.parse.quote(file_name)) 76 | if self.authorization_header() is not None: 77 | request.add_header('Authorization', self.authorization_header()) 78 | request.add_header('User-Agent', 'GenePatternRest') 79 | with open(file_path, 'rb') as f: 80 | data = f.read() 81 | 82 | try: 83 | response = urllib.request.urlopen(request, data) 84 | except IOError: 85 | print("authentication failed") 86 | return None 87 | 88 | if response.getcode() != 201: 89 | print("file upload failed, status code = %i" % response.getcode()) 90 | return None 91 | 92 | return GPFile(self, response.info().get('Location')) 93 | 94 | def run_job(self, job_spec, wait_until_done=True): 95 | """ 96 | Runs a job defined by jobspec, optionally non-blocking. 97 | 98 | Takes a GPJobSpec object that defines a request to run a job, and makes the 99 | request to the server. By default blocks until the job is finished by 100 | polling the server, but can also run asynchronously. 101 | 102 | Args: 103 | :param job_spec: A GPJobSpec object that contains the data defining the job to be run. 104 | :param wait_until_done: Whether to wait until the job is finished before returning. 105 | :return: 106 | 107 | Returns: 108 | a GPJob object that refers to the running job on the server. If called 109 | synchronously, this object will contain the info associated with the 110 | completed job. Otherwise, it will just wrap the URI of the running job. 
111 | """ 112 | 113 | # names should be a list of names, 114 | # values should be a list of **lists** of values 115 | json_string = json.dumps({'lsid': job_spec.lsid, 'params': job_spec.params, 'tags': [GP_JOB_TAG]}, cls=GPJSONEncoder) 116 | if sys.version_info.major == 3: # Handle conversion to bytes for Python 3 117 | json_string = bytes(json_string, 'utf-8') 118 | request = urllib.request.Request(self.url + '/rest/v1/jobs') 119 | if self.authorization_header() is not None: 120 | request.add_header('Authorization', self.authorization_header()) 121 | request.add_header('Content-Type', 'application/json') 122 | request.add_header('User-Agent', 'GenePatternRest') 123 | try: 124 | response = urllib.request.urlopen(request, json_string) 125 | except urllib.error.HTTPError as e: 126 | if e.code == 403: 127 | print("job POST failed, your account is either over the data limit or you have too many jobs running") 128 | else: 129 | print(f" job POST failed, status code = {e.code}, {e.reason}") 130 | return None 131 | if response.getcode() != 201: 132 | print(" job POST failed, status code = %i" % response.getcode()) 133 | return None 134 | data = json.loads(response.read().decode('utf-8')) 135 | job = GPJob(self, data['jobId']) 136 | job.get_info() 137 | self.last_job = job # Set the last job 138 | if wait_until_done: 139 | job.wait_until_done() 140 | return job 141 | 142 | def get_token(self): 143 | """Return the authentication token, logging in to obtain it if necessary""" 144 | if self.token: return self.token 145 | else: return self.login() 146 | 147 | def get_job(self, job_number): 148 | job = GPJob(self, job_number) 149 | return job 150 | 151 | def get_task_list(self): 152 | """ 153 | Queries the GenePattern server and returns a list of GPTask objects, 154 | each representing one of the modules installed on the server. Useful 155 | for determining which are available on the server. 
156 | """ 157 | request = urllib.request.Request(self.url + '/rest/v1/tasks/all.json') 158 | if self.authorization_header() is not None: 159 | request.add_header('Authorization', self.authorization_header()) 160 | request.add_header('User-Agent', 'GenePatternRest') 161 | response = urllib.request.urlopen(request) 162 | response_string = response.read().decode('utf-8') 163 | category_and_tasks = json.loads(response_string) 164 | raw_list = category_and_tasks['all_modules'] 165 | task_list = [] 166 | for task_dict in raw_list: 167 | task = GPTask(self, task_dict['lsid'], task_dict) 168 | task_list.append(task) 169 | return task_list 170 | 171 | @staticmethod 172 | def wait_until_complete(job_list): 173 | """ 174 | Args: Accepts a list of GPJob objects 175 | 176 | This method will not return until all GPJob objects in the list have 177 | finished running. That us, they are either complete and have resulted in 178 | an error state. 179 | 180 | This method will occasionally query each job to see if it is finished. 181 | """ 182 | complete = [False] * len(job_list) 183 | wait = 1 184 | while not all(complete): 185 | time.sleep(wait) 186 | for i, job in enumerate(job_list): 187 | if not complete[i]: 188 | complete[i] = job.is_finished() 189 | if not complete[i]: 190 | break 191 | wait = min(wait * 2, 10) 192 | 193 | def get_recent_jobs(self, n_jobs=10): 194 | """ 195 | Returns the user's N most recently submitted jobs on the GenePattern server. 196 | 197 | Args: If not specified, n_jobs = 10. 198 | 199 | Returns: An array of GPJob objects. 
200 | """ 201 | 202 | # Query the server for the list of jobs 203 | request = urllib.request.Request(self.url + '/rest/v1/jobs/?pageSize=' + 204 | str(n_jobs) + '&userId=' + str(urllib.parse.quote(self.username)) + 205 | '&orderBy=-dateSubmitted') 206 | if self.authorization_header() is not None: 207 | request.add_header('Authorization', self.authorization_header()) 208 | request.add_header('User-Agent', 'GenePatternRest') 209 | response = urllib.request.urlopen(request) 210 | response_string = response.read().decode('utf-8') 211 | response_json = json.loads(response_string) 212 | 213 | # For each job in the JSON Array, build a GPJob object and add to the job list 214 | job_list = [] 215 | for job_json in response_json['items']: 216 | job_id = job_json['jobId'] 217 | job = GPJob(self, job_id) 218 | job.info = job_json 219 | job.load_info() 220 | job_list.append(job) 221 | 222 | return job_list 223 | 224 | 225 | class GPResource(object): 226 | """ 227 | Base class for resources on a Gene Pattern server. 228 | 229 | Wraps references to resources on a Gene Pattern server, which are all 230 | defined by a URI. Subclasses can implement custom logic appropriate for 231 | that resources such as downloading a file or info for a running or completed 232 | job. 233 | """ 234 | uri = None 235 | 236 | def __init__(self, uri): 237 | self.uri = uri 238 | 239 | 240 | class GPFile(GPResource): 241 | """ 242 | A file on a Gene Pattern server. 243 | 244 | Wraps the URI of the file, and contains methods to download the file. 
245 | """ 246 | server_data = None 247 | 248 | def __init__(self, server_data, uri): 249 | GPResource.__init__(self, uri) 250 | self.server_data = server_data 251 | 252 | def open(self): 253 | """ 254 | Opens the URL associated with the GPFile and returns a file-like object 255 | with three extra methods: 256 | 257 | * geturl() - return the ultimate URL (can be used to determine if a 258 | redirect was followed) 259 | 260 | * info() - return the meta-information of the page, such as headers 261 | 262 | * getcode() - return the HTTP status code of the response 263 | """ 264 | request = urllib.request.Request(self.uri) 265 | if self.server_data.authorization_header() is not None: 266 | request.add_header('Authorization', self.server_data.authorization_header()) 267 | request.add_header('User-Agent', 'GenePatternRest') 268 | try: 269 | return urllib.request.urlopen(request) 270 | except urllib.error.HTTPError as e: 271 | if e.geturl(): # Handle S3 redirects if one is encountered 272 | return urllib.request.urlopen(urllib.request.Request(e.geturl())) 273 | else: 274 | raise e 275 | 276 | def read(self): 277 | """ 278 | Reads the contents of the GPFile and returns the contents as a string (assumes UTF-8) 279 | """ 280 | with closing(self.open()) as f: 281 | data = f.read() 282 | return data.decode("utf-8") or None 283 | 284 | def get_url(self): 285 | """ 286 | Returns the URL to the GPFile 287 | """ 288 | return self.uri 289 | 290 | def get_name(self): 291 | """ 292 | Returns the file name of the output file 293 | """ 294 | return urllib.parse.unquote(self.get_url().split('/')[-1]) 295 | 296 | def __str__(self): 297 | return self.uri 298 | 299 | 300 | class GPJob(GPResource): 301 | """ 302 | A running or completed job on a Gene Pattern server. 303 | 304 | Contains methods to get the info of the job, and to wait on a running job by 305 | polling the server until the job is completed. 
306 | """ 307 | json = None # Define the backing JSON string 308 | info = None 309 | server_data = None 310 | task_name = None 311 | task_lsid = None 312 | user_id = None 313 | job_number = None 314 | status = None 315 | date_submitted = None 316 | log_files = None 317 | output_files = None 318 | num_output_files = None 319 | children = None 320 | input_params = None 321 | 322 | def __init__(self, server_data, uri): 323 | super(GPJob, self).__init__(str(uri)) 324 | self.info = None 325 | self.server_data = server_data 326 | self.job_number = uri 327 | 328 | def get_info(self): 329 | """ 330 | Query the GenePattern server for metadata regarding this job and assign 331 | that metadata to the properties on this GPJob object. Including: 332 | * Task Name 333 | * LSID 334 | * User ID 335 | * Job Number 336 | * Status 337 | * Date Submitted 338 | * URL of Log Files 339 | * URL of Output Files 340 | * Number of Output Files 341 | """ 342 | request = urllib.request.Request(self.server_data.url + "/rest/v1/jobs/" + self.uri + "?includeInputParams=true") 343 | if self.server_data.authorization_header() is not None: 344 | request.add_header('Authorization', self.server_data.authorization_header()) 345 | request.add_header('User-Agent', 'GenePatternRest') 346 | response = urllib.request.urlopen(request) 347 | 348 | self.json = response.read().decode('utf-8') 349 | self.info = json.loads(self.json) 350 | self.load_info() 351 | 352 | def load_info(self): 353 | """ 354 | Parses the JSON object stored at GPJob.info and assigns its metadata to 355 | properties of this GPJob object. 356 | 357 | Primarily intended to be called from GPJob.get_info(). 
358 | """ 359 | self.task_name = self.info['taskName'] 360 | self.task_lsid = self.info['taskLsid'] 361 | self.user_id = self.info['userId'] 362 | self.job_number = int(self.info['jobId']) 363 | self.status = self.get_status_message() 364 | self.date_submitted = self.info['dateSubmitted'] 365 | self.log_files = self.info['logFiles'] 366 | self.output_files = self.info['outputFiles'] 367 | self.num_output_files = self.info['numOutputFiles'] 368 | self.input_params = self.info['inputParams'] if 'inputParams' in self.info else None 369 | 370 | # Create children, if relevant 371 | self.children = self.get_child_jobs() 372 | 373 | def get_input_params(self): 374 | """Return the input parameters used to launch the job""" 375 | 376 | # Lazily load info 377 | if self.info is None: self.get_info() 378 | 379 | return { list(p.keys())[0]:list(p.values())[0] for p in self.input_params } 380 | 381 | def get_child_jobs(self): 382 | """ 383 | Queries the GenePattern server for child jobs of this job, creates GPJob 384 | objects representing each of them and assigns the list of them to the 385 | GPJob.children property. Then return this list. 386 | """ 387 | # Lazily load info 388 | if self.info is None: 389 | self.get_info() 390 | 391 | # Lazily load children 392 | if self.children: 393 | return self.children 394 | else: 395 | if 'children' in self.info: 396 | child_list = [] 397 | for child in self.info['children']['items']: 398 | child_job = GPJob(self.server_data, child['jobId']) 399 | child_job.info = child 400 | child_job.load_info() 401 | child_list.append(child_job) 402 | return child_list 403 | else: # No children? Return empty list 404 | return [] 405 | 406 | def is_finished(self): 407 | """ 408 | Queries the server to check if the job has been completed. 409 | Returns True or False. 
410 | """ 411 | self.get_info() 412 | 413 | if 'status' not in self.info: 414 | return False 415 | if 'isFinished' not in self.info['status']: 416 | return False 417 | 418 | return self.info['status']['isFinished'] 419 | 420 | def has_error(self): 421 | """ 422 | Queries the server to check if the job has an error. 423 | Returns True or False. 424 | """ 425 | self.get_info() 426 | 427 | if 'status' not in self.info: 428 | return False 429 | if 'hasError' not in self.info['status']: 430 | return False 431 | 432 | return self.info['status']['hasError'] 433 | 434 | def is_pending(self): 435 | """ 436 | Queries the server to check if the job is pending. 437 | Returns True or False. 438 | """ 439 | self.get_info() 440 | 441 | if 'status' not in self.info: 442 | return False 443 | if 'isPending' not in self.info['status']: 444 | return False 445 | 446 | return self.info['status']['isPending'] 447 | 448 | def get_status_message(self): 449 | """ 450 | Returns the status message for the job, querying the 451 | server if necessary. 452 | """ 453 | # Lazily load info 454 | if self.info is None: 455 | self.get_info() 456 | 457 | return self.info['status']['statusMessage'] 458 | 459 | def get_tags(self): 460 | """ 461 | Returns the tags for the job, querying the 462 | server if necessary. 463 | """ 464 | # Lazily load info 465 | if self.info is None: 466 | self.get_info() 467 | 468 | if 'tags' in self.info: 469 | return [structure['tag']['tag'] for structure in self.info['tags']] 470 | else: 471 | return [] 472 | 473 | def get_comments(self): 474 | """ 475 | Returns the comments for the job, querying the 476 | server if necessary. 
477 | """ 478 | # Lazily load info 479 | if self.info is None: 480 | self.get_info() 481 | 482 | if 'comments' in self.info: 483 | return [structure['text'] for structure in self.info['comments']['comments']] 484 | else: 485 | return [] 486 | 487 | def get_output_files(self): 488 | """ 489 | Returns a list of the files output by the job, querying the server if 490 | necessary. If the job has output no files, an empty list will be 491 | returned. 492 | """ 493 | # Lazily load info 494 | if self.info is None: 495 | self.get_info() 496 | 497 | if 'outputFiles' in self.info: 498 | return [GPFile(self.server_data, f['link']['href']) for f in self.info['outputFiles']] 499 | else: 500 | return [] 501 | 502 | def get_file(self, name): 503 | """ 504 | Returns the output file with the specified name, if no output files 505 | match, returns None. 506 | """ 507 | files = self.get_output_files() 508 | for f in files: 509 | if f.get_name() == name: 510 | return f 511 | return None 512 | 513 | def wait_until_done(self): 514 | """ 515 | This method will not return until the job is either complete or has 516 | reached an error state. This queries the server periodically to check 517 | for an update in status. 
518 | """ 519 | wait = 1 520 | while True: 521 | time.sleep(wait) 522 | self.get_info() 523 | if self.info['status']['isFinished']: 524 | break 525 | # implements a crude exponential back off 526 | wait = min(wait * 2, 60) 527 | 528 | def get_job_status_url(self): 529 | """ 530 | Returns the URL of the job's status page on the GenePattern server 531 | """ 532 | return self.server_data.url + "/pages/index.jsf?jobid=" + self.uri 533 | 534 | def get_permissions(self): 535 | """Get the permissions object for the GP job""" 536 | url = f'{self.server_data.url}/rest/v1/jobs/{self.job_number}/permissions' 537 | request = urllib.request.Request(url) 538 | if self.server_data.authorization_header() is not None: 539 | request.add_header('Authorization', self.server_data.authorization_header()) 540 | request.add_header('User-Agent', 'GenePatternRest') 541 | 542 | response = urllib.request.urlopen(request) 543 | return json.loads(response.read()) 544 | 545 | def set_permissions(self, permissions): 546 | """Set the group permissions for the job""" 547 | url = f'{self.server_data.url}/rest/v1/jobs/{self.job_number}/permissions' 548 | data = json.dumps(permissions).encode('utf8') 549 | request = urllib.request.Request(url, data=data, method='PUT') 550 | if self.server_data.authorization_header() is not None: 551 | request.add_header('Authorization', self.server_data.authorization_header()) 552 | request.add_header('User-Agent', 'GenePatternRest') 553 | urllib.request.urlopen(request) 554 | 555 | def terminate(self): 556 | """Terminate a running or pending job""" 557 | url = f'{self.server_data.url}/rest/v1/jobs/{self.job_number}/terminate' 558 | request = urllib.request.Request(url, method='DELETE') 559 | if self.server_data.authorization_header() is not None: 560 | request.add_header('Authorization', self.server_data.authorization_header()) 561 | request.add_header('User-Agent', 'GenePatternRest') 562 | return urllib.request.urlopen(request).code == 200 563 | 564 | 565 | class 
GPJobSpec(object): 566 | """ 567 | Data needed to make a request to perform a job on a Gene Pattern server 568 | 569 | Encapsulates the data needed to make a server call to run a job. This 570 | includes the LSID of the job, and the parameters. Helper methods set 571 | the LSID and parameters. 572 | """ 573 | 574 | def __init__(self, server_data, lsid): 575 | self.params = [] 576 | self.lsid = lsid 577 | self.server_data = server_data 578 | 579 | def set_parameter(self, name, values, group_id=None): 580 | """ 581 | Sets the value of a parameter for the GPJobSpec 582 | :param name: name of the parameter 583 | :param values: list of values for the parameter 584 | :param group_id: optional parameter group ID 585 | :return: 586 | """ 587 | if not isinstance(values, list): 588 | values = [values] 589 | if group_id is None: 590 | self.params.append({'name': name, 'values': values}) 591 | else: 592 | self.params.append({'name': name, 'groupId': group_id, 'values': values}) 593 | 594 | 595 | class GPTask(GPResource): 596 | """Describes a GenePattern task (module or pipeline). 597 | 598 | The constructor retrieves data transfer object (DTO) describing task from GenePattern server. 599 | The DTO contains general task information (LSID, Category, Description, Version comment), 600 | a parameter list and a list of initial values. Class includes getters for each of these 601 | components. 
602 | 603 | """ 604 | json = None # Define the backing JSON string 605 | server_data = None 606 | description = None 607 | name = None 608 | documentation = None 609 | lsid = None 610 | version = None 611 | params = None 612 | dto = None 613 | 614 | _params_loaded = False 615 | submit_json = None 616 | job_spec = None 617 | job = None 618 | job_number = None 619 | 620 | def __init__(self, server_data, name_or_lsid, task_dict=None): 621 | GPResource.__init__(self, name_or_lsid) 622 | self.server_data = server_data 623 | 624 | # Initialize descriptive attributes if available 625 | if task_dict is not None: 626 | if 'name' in task_dict: 627 | self.name = task_dict['name'] 628 | if 'lsid' in task_dict: 629 | self.lsid = task_dict['lsid'] 630 | if 'description' in task_dict: 631 | self.description = task_dict['description'] 632 | if 'documentation' in task_dict: 633 | self.documentation = task_dict['documentation'] 634 | if 'version' in task_dict: 635 | self.version = task_dict['version'] 636 | 637 | def param_load(self): 638 | """ 639 | Queries the server for the parameter information and other metadata associated with 640 | this task 641 | """ 642 | escaped_uri = urllib.parse.quote(self.uri) 643 | request = urllib.request.Request(self.server_data.url + '/rest/v1/tasks/' + escaped_uri) 644 | if self.server_data.authorization_header() is not None: 645 | request.add_header('Authorization', self.server_data.authorization_header()) 646 | request.add_header('User-Agent', 'GenePatternRest') 647 | response = urllib.request.urlopen(request) 648 | self.json = response.read().decode('utf-8') 649 | self.dto = json.loads(self.json) 650 | 651 | self.description = self.dto['description'] if 'description' in self.dto else "" 652 | self.name = self.dto['name'] 653 | self.documentation = self.dto['documentation'] if 'documentation' in self.dto else "" 654 | self.lsid = self.dto['lsid'] 655 | self.version = self.dto['version'] if 'version' in self.dto else "" 656 | self.params = [] 657 
class GPTaskParam(object):
    """
    Encapsulates the information describing a single task parameter.

    The constructor's input is the data transfer object (DTO) associated
    with a single task parameter (i.e., one element of the list returned by
    GPTask.get_parameters). The first key of the DTO is taken as the
    parameter name; the value stored under it holds the 'attributes' dict
    and, optionally, 'description' and 'choiceInfo' entries.
    """
    task = None
    dto = None
    name = None
    description = None
    attributes = None

    def __init__(self, task, dto):
        """
        :param task: the GPTask this parameter belongs to (used by get_choices
                     to reach the server's authorization header)
        :param dto: raw parameter object as returned by the server
        """
        self.task = task
        self.dto = dto
        self.name = list(dto)[0]
        details = dto[self.name]
        self.description = details['description'] if 'description' in details else ''
        self.attributes = details['attributes']

    def get_dto(self):
        """
        Returns a raw object representing the parameter. This is mostly used to
        initialize GPTaskParam objects
        """
        return self.dto

    def get_name(self):
        """
        :return: Returns the parameter name as a string
        """
        return self.name

    def is_optional(self):
        """
        Returns whether the parameter is optional or required
        :return: Return True if optional, False if required
        """
        attributes = self.attributes
        flagged_optional = 'optional' in attributes and bool(attributes['optional'].strip())
        zero_minimum = 'minValue' in attributes and attributes['minValue'] == 0
        return bool(flagged_optional or zero_minimum)

    def get_description(self):
        """
        :return: Returns the parameter description as a string
        """
        return self.description

    def get_type(self):
        """
        Returns either 'File' or 'String'.

        The type attribute (e.g., java.io.File, java.lang.Integer, java.lang.Float),
        which might give a hint as to what string should represent,
        is not enforced and not employed consistently across all tasks, so we ignore.
        Only the TYPE / MODE attribute pair is consulted.
        """
        attributes = self.attributes
        if 'TYPE' in attributes and 'MODE' in attributes:
            if attributes['TYPE'] == 'FILE' and attributes['MODE'] == 'IN':
                return 'File'
        return 'String'

    def is_password(self):
        """
        Indicates whether password flag associated with string parameter.

        If string parameter flagged as password, UI should not display
        parameter value on input field (e.g., mask out with asterisks).
        """
        return 'type' in self.attributes and self.attributes['type'] == 'PASSWORD'

    def allow_multiple(self):
        """
        Return whether the parameter allows multiple values or not
        :return: Return True if the parameter allows multiple values, otherwise False
        """
        attributes = self.attributes
        # note that maxValue means "max number of values", and is an integer, not a string
        if 'maxValue' in attributes and attributes['maxValue'] > 1:
            return True
        return 'numValues' in attributes and '+' in attributes['numValues']

    def get_default_value(self):
        """
        Return the default value for the parameter. If there is no default value, return None
        """
        return self._non_blank_attribute('default_value')

    def is_choice_param(self):
        """
        :return: Return True if this is a choice parameter, otherwise False
        """
        return 'choiceInfo' in self.dto[self.name]

    def get_choice_status(self):
        """
        Returns a (message, flag) tuple, which indicates whether choices are
        statically or dynamically defined, and whether a dynamic file
        selection loading error occurred.

        Throws an error if this is not a choice parameter.
        """
        status = self._choice_info()['status']
        return status['message'], status['flag']

    def get_choice_href(self):
        """
        Returns the HREF of a dynamic choice parameter.
        Throws an error if this is not a choice parameter.
        """
        return self._choice_info()['href']

    def get_choice_selected_value(self):
        """
        Returns the default selection from a choice menu, or None if the
        server did not report one.
        Throws an error if this is not a choice parameter.
        """
        choice_info = self._choice_info()
        if 'selectedValue' in choice_info:
            return choice_info['selectedValue']
        return None

    def allow_choice_custom_value(self):
        """
        Returns boolean indicating whether choice parameter supports custom value.

        If choice parameter supports custom value, user can provide parameter value
        other than those provided in choice list.
        Throws an error if this is not a choice parameter.
        """
        return self._is_string_true(self._choice_info()['choiceAllowCustom'])

    # this needs additional work - some kind of limited polling to give server time to assemble list
    def get_choices(self):
        """
        Returns a list of dictionary objects, one dictionary object per choice.

        Each object has two keys defined: 'value', 'label'.
        The 'label' entry is what should be displayed on the UI, the 'value' entry
        is what is written into GPJobSpec.

        The choice list is fetched from the parameter's choice HREF and the
        response replaces the cached 'choiceInfo' of this parameter.
        Throws an error if this is not a choice parameter.
        """
        self._choice_info()  # raises GPException when this is not a choice parameter
        if self.get_choice_status()[1] == "NOT_INITIALIZED":
            print(self.get_choice_status())
            print("choice status not initialized")

        request = urllib.request.Request(self.get_choice_href())
        authorization = self.task.server_data.authorization_header()
        if authorization is not None:
            request.add_header('Authorization', authorization)
        request.add_header('User-Agent', 'GenePatternRest')

        # Close the HTTP response deterministically instead of leaking the connection
        with urllib.request.urlopen(request) as response:
            payload = response.read().decode('utf-8')

        self.dto[self.name]['choiceInfo'] = json.loads(payload)
        return self.dto[self.name]['choiceInfo']['choices']

    def get_alt_name(self):
        """
        Returns the alternate name of a parameter.
        Only pipeline prompt-when-run parameters
        can have alternate names and alternate descriptions
        """
        return self._non_blank_attribute('altName')

    def get_alt_description(self):
        """
        Returns the alternate description of a parameter.
        Only pipeline prompt-when-run parameters
        can have alternate names and alternate descriptions
        """
        return self._non_blank_attribute('altDescription')

    def _choice_info(self):
        """
        Return the 'choiceInfo' entry of the DTO, raising GPException if the
        parameter has none (i.e. it is not a choice parameter)
        """
        details = self.dto[self.name]
        if 'choiceInfo' not in details:
            raise GPException('not a choice parameter')
        return details['choiceInfo']

    def _non_blank_attribute(self, key):
        """
        Return the attribute stored under key, or None when it is missing or
        contains only whitespace
        """
        if key in self.attributes and bool(self.attributes[key].strip()):
            return self.attributes[key]
        return None

    @staticmethod
    def _is_string_true(test):
        """
        Determines whether a string value is "True" for the purposes of GenePattern's
        parameter parsing. Boolean values are returned unchanged.
        """
        if isinstance(test, bool):
            return test
        return test.lower() in ('on', 'yes', 'true')
class CLS:
    """
    Parsed contents of a GenePattern CLS (class label) file.

    After construction the instance exposes num_samples, num_classes,
    class_names and class_assignments.
    """

    def __init__(self, cls_obj):
        """
        Create a CLS object with the contents of a CLS file

        For more information on the CLS format see:
        http://software.broadinstitute.org/cancer/software/genepattern/file-formats-guide

        :cls_obj: The CLS file. Accepts a file-like object, a file path, a URL to the file
                  or a string containing the raw data.
        :raises ValueError: if the file has fewer than three lines, a line is malformed,
                  or the declared counts do not match the names / assignments provided
        """
        # Header line: "<samples> <classes> 1". The named groups are read back below.
        hdr_line_re = re.compile(r"^(?P<samples>[0-9]+)\s+(?P<classes>[0-9]+)\s+1\s*$")
        assign_line_re = re.compile(r"^\s*(?:.+\s+)*.+\s*$", re.ASCII)

        # Handle all the various initialization types and get an IO object
        cls_io = _obtain_io(cls_obj)

        # Read the file as an array of lines, converting byte strings to unicode strings
        raw_lines = _bytes_to_str(cls_io.readlines())

        # The header, class-name and assignment lines are all required
        if len(raw_lines) < 3:
            raise ValueError("Bad format in {0}: expected at least 3 lines, found {1}".format(cls_obj, len(raw_lines)))

        # Validate the header line and the class names declared on the second line
        hdr_line_match = hdr_line_re.match(raw_lines[0])
        if not hdr_line_match:
            raise ValueError("Bad format in {0} for header line: {1}".format(cls_obj, raw_lines[0]))

        self.num_samples = int(hdr_line_match.group("samples"))
        self.num_classes = int(hdr_line_match.group("classes"))

        self.class_names = raw_lines[1].replace('#', '').split()
        if len(self.class_names) != self.num_classes:
            raise ValueError("Mismatch in {0} between number of class names declared ({1}) and number provided ({2})".format(cls_obj, self.num_classes, len(self.class_names)))

        # Validate the class assignments on the third line
        if not assign_line_re.match(raw_lines[2]):
            raise ValueError("Bad format in {0} for class assignment line: {1}".format(cls_obj, raw_lines[2]))

        self.class_assignments = raw_lines[2].split()
        if self.num_samples != len(self.class_assignments):
            raise ValueError("Mismatch in {0} between number of samples declared ({1}) and number of class assignments provided ({2})".format(cls_obj, self.num_samples, len(self.class_assignments)))
def write_odf(df, file_path, headers=None):
    """
    Writes the provided DataFrame to a ODF file.

    Assumes that the DataFrame matches the structure of those produced
    by the ODF() function in this library

    :param df: the DataFrame to write to ODF
    :param file_path: path to which to write the ODF file
    :param headers: A dict of ODF headers, if none are provided will attempt to read them
                    from the DataFrame's `headers` attribute
    :raises AttributeError: if no headers are given and the DataFrame carries none
    :return:
    """
    # Only fall back to the DataFrame when the caller supplied no headers;
    # explicitly provided headers must be used as-is rather than rejected.
    if headers is None:
        if not hasattr(df, 'headers'):
            raise AttributeError('ODF headers not provided')
        headers = df.headers

    with open(file_path, 'w') as file:
        file.write(_header_dict_to_str(headers))
        df.to_csv(file, sep='\t', header=False, index=False, mode='w+')
# optional port 187 | r'(?:/?|[/?]\S+)$', re.IGNORECASE) 188 | if regex.match(url) is not None: 189 | return True 190 | else: 191 | return False 192 | 193 | 194 | def _obtain_io(init_obj): 195 | io_obj = None 196 | 197 | # Check to see if init_obj is a GPFile object from the GenePattern Python Client 198 | if isinstance(init_obj, gp.GPFile): 199 | io_obj = init_obj.open() 200 | 201 | # Check to see if init_obj is a file-like object 202 | # Skip if a file-like object has already been obtained 203 | if hasattr(init_obj, 'read') and io_obj is None: 204 | io_obj = init_obj 205 | 206 | # Check to see if gct_obj is a string 207 | # Skip if a file-like object has already been obtained 208 | if isinstance(init_obj, str) and io_obj is None: 209 | 210 | # Check to see if the string contains multiple lines 211 | # If it does, it is likely raw data 212 | if '\n' in init_obj: 213 | # Wrap the raw data in a StringIO (file-like object) 214 | io_obj = io.StringIO(init_obj) 215 | 216 | # Check to see if the string contains a URL 217 | # Skip if a file-like object has already been obtained 218 | if _is_url(init_obj) and io_obj is None: 219 | io_obj = urllib.request.urlopen(init_obj) 220 | 221 | # Otherwise try treating the string as a file path 222 | # If this doesn't work throw an error, we don't know what to do with this string. 
223 | # Skip if a file-like object has already been obtained 224 | if io_obj is None: 225 | try: 226 | # Point gct_obj to file (read in the code below) 227 | io_obj = open(init_obj, 'r') 228 | except IOError: 229 | raise IOError('Input string not determined to be raw data, URL or readable file.') 230 | 231 | # If we still don't have a file-like object at this point, throw an error 232 | if io_obj is None: 233 | raise TypeError('Unknown type passed to GCT() or ODF()') 234 | 235 | # Return the io_obj 236 | return io_obj 237 | 238 | 239 | ######################### 240 | # ODF Utility Functions # 241 | ######################### 242 | 243 | 244 | def _header_dict_to_str(headers): 245 | # Define the list of headers to handle as special cases 246 | special = ['HeaderLines', 'COLUMN_NAMES', 'COLUMN_TYPES', 'Model', 'DataLines'] 247 | 248 | # Add the initial ODF version line 249 | combined = 'ODF 1.0\n' 250 | 251 | # Add HeaderLines 252 | combined += 'HeaderLines=' + str(len(headers)) + '\n' 253 | 254 | # Add column names, if available 255 | if 'COLUMN_NAMES' in headers: 256 | combined += 'COLUMN_NAMES:' + str(headers['COLUMN_NAMES']) + '\n' 257 | 258 | # Add column types, if available 259 | if 'COLUMN_TYPES' in headers: 260 | combined += 'COLUMN_TYPES:' + str(headers['COLUMN_TYPES']) + '\n' 261 | 262 | # Add model, if available 263 | if 'Model' in headers: 264 | combined += 'Model=' + str(headers['Model']) + '\n' 265 | 266 | # Add remaining headers 267 | for key, value in sorted(headers.items()): 268 | if key not in special: 269 | combined += str(key) + '=' + str(value) + '\n' 270 | 271 | # Add data lines, if available 272 | if 'DataLines' in headers: 273 | combined += 'DataLines=' + str(headers['DataLines']) + '\n' 274 | 275 | # Return the combined header string 276 | return combined 277 | 278 | 279 | def _apply_odf_properties(df, headers, model): 280 | """ 281 | Attach properties to the Dataframe to carry along ODF metadata 282 | 283 | :param df: The dataframe to be 
modified 284 | :param headers: The ODF header lines 285 | :param model: The ODF model type 286 | """ 287 | df.headers = headers 288 | df.model = model 289 | 290 | 291 | def _bytes_to_str(lines): 292 | """ 293 | Convert all lines from byte string to unicode string, if necessary 294 | """ 295 | if len(lines) >= 1 and hasattr(lines[0], 'decode'): 296 | return [line.decode('utf-8') for line in lines] 297 | else: 298 | return lines 299 | 300 | 301 | def _extract_header_value(line): 302 | """ 303 | Extracts a key / value pair from a header line in an ODF file 304 | """ 305 | 306 | # Skip blank lines, returning None 307 | if not line: 308 | return None 309 | 310 | # Attempt to split by equals sign 311 | halves = line.split('=') 312 | if len(halves) > 1: 313 | key = halves[0].strip() 314 | value = halves[1].strip() 315 | return {key: value} 316 | 317 | # Otherwise, attempt to split by colon 318 | else: 319 | halves = line.split(':') 320 | key = halves[0].strip() 321 | value = halves[1].strip() 322 | return {key: value} 323 | 324 | 325 | def _extract_column_names(headers): 326 | """ 327 | Return an array containing the column names, extracted from the headers 328 | """ 329 | if 'COLUMN_NAMES' in headers: 330 | name_string = headers['COLUMN_NAMES'] 331 | return name_string.split('\t') 332 | else: 333 | return None 334 | 335 | 336 | def _extract_model(headers): 337 | """ 338 | Return an array containing the column names, extracted from the headers 339 | """ 340 | return headers['Model'] 341 | 342 | 343 | def _extract_header_number(lines): 344 | """ 345 | Extracts the number of header lines from the second line of the ODF file 346 | """ 347 | pair = _extract_header_value(lines[1]) 348 | value_list = list(pair.values()) 349 | return int(value_list[0]) 350 | 351 | 352 | def _parse_header(lines): 353 | """ 354 | Parse the ODF header and return a dict of all key / value pairs 355 | """ 356 | header_count = _extract_header_number(lines) 357 | header_dict = {} 358 | for i in 
def _join_data_lines(lines, skip):
    """
    Join the data section of an ODF file into one newline-delimited string

    Every line is stripped of surrounding whitespace first. The data section
    begins after the two leading lines, the `skip` declared header lines and
    any blank lines counted within the header by count_header_blanks().

    :param lines: all lines of the ODF file, as strings
    :param skip: the declared number of header lines
    :return: the remaining lines joined with '\n'
    """
    stripped = [str.strip(row) for row in lines]
    first_data_row = skip + count_header_blanks(stripped, skip) + 2
    return '\n'.join(stripped[first_data_row:])
class CPU(StringEnum): 53 | ANY = "any" 54 | ALPHA = "alpha" 55 | INTEL = "intel" 56 | POWERPC = "powerpc" 57 | SPARC = "sparn" 58 | 59 | 60 | MANIFEST_FILE_NAME = "manifest" 61 | DEFAULT_LSID_AUTHORITY = 0 62 | 63 | 64 | class GPTaskSpec: 65 | """ 66 | Specification needed to create a new GenePattern module 67 | """ 68 | 69 | def __init__(self, name=None, description="", version_comment="", author="", institution="", 70 | categories=[], privacy=Privacy.PRIVATE, quality=Quality.DEVELOPMENT, 71 | file_format=[], os=OS.ANY, cpu=CPU.ANY, language="Python", 72 | user=None, support_files=[], documentation="", license="", 73 | lsid=None, version=1, lsid_authority=DEFAULT_LSID_AUTHORITY, command_line=None, parameters=[]): 74 | 75 | self.name = name 76 | self.description = description 77 | self.version_comment = version_comment 78 | self.author = author 79 | self.institution = institution 80 | 81 | self.categories = categories 82 | self.privacy = privacy 83 | self.quality = quality 84 | 85 | self.file_format = file_format 86 | self.os = os 87 | self.cpu = cpu 88 | self.language = language 89 | 90 | self.user = user 91 | self.support_files = support_files 92 | self.documentation = documentation 93 | self.license = license 94 | 95 | # Use None for no LSID checking 96 | self.lsid_authority = lsid_authority if lsid_authority != DEFAULT_LSID_AUTHORITY else LSIDAuthority() 97 | self.version = version 98 | self.lsid = lsid if lsid else self._get_lsid() 99 | self.command_line = command_line 100 | self.parameters = parameters 101 | 102 | def validate(self): 103 | """ 104 | Perform some basic checks to help ensure that the specification is valid. 105 | Throws an exception if an invalid value is found. 106 | Returns true if all checks were passed. 
107 | :return: boolean 108 | """ 109 | # Check all values for None 110 | for attr in self.__dict__: 111 | if self.__dict__[attr] is None: 112 | raise ValueError(attr + " is not set") 113 | 114 | # Validate name 115 | invalid_chars = GPTaskSpec.invalid_chars() 116 | if any(char in invalid_chars for char in self.name): 117 | raise ValueError("module name includes invalid characters: " + self.name) 118 | 119 | # Validate LSID 120 | self._valid_lsid() 121 | 122 | # Validate categories 123 | if not self.all_strings(self.categories): 124 | raise TypeError("categories contains non-string value: " + str(self.categories)) 125 | 126 | # Validate file formats 127 | if not self.all_strings(self.file_format): 128 | raise TypeError("file_format contains non-string value: " + str(self.file_format)) 129 | 130 | # Validate support files 131 | if not self.all_strings(self.support_files): 132 | raise TypeError("support_files contains non-string value: " + str(self.support_files)) 133 | 134 | # Validate parameter list 135 | if not self._all_params(self.parameters): 136 | raise TypeError("parameters contains non-GPParamSpec value: " + str(self.parameters)) 137 | 138 | # Validate individual parameters 139 | for param in self.parameters: 140 | param.validate() 141 | 142 | # Return that everything validates 143 | return True 144 | 145 | def create_zip(self, clean=True, increment_version=True, register=True): 146 | """ 147 | Creates a GenePattern module zip file for upload and installation on a GenePattern server 148 | :param clean: boolean 149 | :return: 150 | """ 151 | # First validate the attributes 152 | self.validate() 153 | 154 | # Check to see if an existing interferes with module creation 155 | if os.path.exists(MANIFEST_FILE_NAME): 156 | raise OSError("existing manifest blocks manifest file creation") 157 | 158 | # Write the manifest 159 | self.write_manifest() 160 | 161 | # Create the zip 162 | self._zip_files() 163 | 164 | # Increment the version of the module 165 | if 
increment_version: 166 | self.version += 1 167 | 168 | # Register the module with the LSID authority 169 | if register and self.lsid_authority: 170 | self.lsid_authority.register(self) 171 | 172 | # Clean up the manifest 173 | if clean: 174 | os.remove(MANIFEST_FILE_NAME) 175 | 176 | def _get_lsid(self): 177 | """ 178 | Assigns the module an LSID from the LSID authority 179 | :return: 180 | """ 181 | # If no LSID authority, skip LSID assignment 182 | if self.lsid_authority is None: 183 | return 184 | 185 | # Otherwise assign the LSID 186 | return self.lsid_authority.lsid() 187 | 188 | def _zip_files(self): 189 | """ 190 | Adds the manifest and all support files to the zip file 191 | :return: 192 | """ 193 | # Create the zip file 194 | zip = zipfile.ZipFile(self.name + '.zip', 'w', zipfile.ZIP_DEFLATED) 195 | 196 | # Add the manifest file to the zip 197 | zip.write(MANIFEST_FILE_NAME) 198 | 199 | # Add the support files to the zip 200 | for file in self.support_files: 201 | zip.write(file) 202 | 203 | # Close the zip file 204 | zip.close() 205 | 206 | def write_manifest(self, module_directory=""): 207 | """ 208 | Write a GenePattern manifest file for the module 209 | :param module_directory: optionally write to subdirectory 210 | :return: 211 | """ 212 | # First validate the spec 213 | self.validate() 214 | 215 | # Open the manifest file for writing 216 | manifest_file = open(os.path.join(module_directory, "manifest"), "w") 217 | 218 | # Write the header 219 | manifest_file.write("# " + self.name + "\n") 220 | manifest_file.write("# " + str(datetime.now()) + "\n") 221 | manifest_file.write("# Generated by Python Module Creator v" + __version__ + "\n") 222 | 223 | # Write initial attributes 224 | manifest_file.write("JVMLevel=\n") 225 | manifest_file.write("LSID=" + self.manifest_escape(self.lsid) + ':' + str(self.version) + "\n") 226 | manifest_file.write("author=" + self._author_line() + "\n") 227 | manifest_file.write("categories=" + ';'.join(self.categories) + 
"\n") 228 | manifest_file.write("commandLine=" + self.command_line + "\n") 229 | manifest_file.write("cpuType=" + str(self.cpu.value) + "\n") 230 | manifest_file.write("description=" + self.description + "\n") 231 | manifest_file.write("fileFormat=" + ';'.join(self.file_format) + "\n") 232 | manifest_file.write("language=" + self.language + "\n") 233 | manifest_file.write("license=" + self.license + "\n") 234 | manifest_file.write("name=" + self.name + "\n") 235 | manifest_file.write("os=" + str(self.os.value) + "\n") 236 | 237 | # Write parameter attributes 238 | for index, param in enumerate(self.parameters): 239 | manifest_file.write(param.manifest_repr(index+1)) 240 | 241 | # Write footer attributes 242 | manifest_file.write("privacy=" + str(self.privacy.value) + "\n") 243 | manifest_file.write("publicationDate=" + self._publication_date() + "\n") 244 | manifest_file.write("quality=" + str(self.quality.value) + "\n") 245 | manifest_file.write("taskDoc=" + self.documentation + "\n") 246 | manifest_file.write("taskType=" + self._task_type() + "\n") 247 | manifest_file.write("userid=" + self.user + "\n") 248 | manifest_file.write("version=" + self.version_comment + "\n") 249 | 250 | # Close the file 251 | manifest_file.close() 252 | 253 | def _author_line(self): 254 | """ 255 | Helper method to concatenate author and institution values, if necessary 256 | :return: string 257 | """ 258 | if self.author and self.institution: 259 | return self.author + ";" + self.institution 260 | elif self.author: 261 | return self.author 262 | else: 263 | return self.institution 264 | 265 | @staticmethod 266 | def _publication_date(): 267 | """ 268 | Helper method to return the publication date in the expected format 269 | :return: string 270 | """ 271 | return datetime.now().strftime("%m/%d/%Y %H\:%M") 272 | 273 | def _task_type(self): 274 | """ 275 | Helper method for extracting taskType from the categories list 276 | :return: 277 | """ 278 | if self.categories: 279 | return 
# NOTE(review): these are methods whose class header lies before the visible
# part of the file. GPParamSpec calls manifest_escape, all_strings and
# invalid_chars as GPTaskSpec.<name>, so the class is GPTaskSpec.

@staticmethod
def manifest_escape(string):
    """
    Escape colon and equals characters for inclusion in manifest file

    :param string: text to escape
    :return: string with every ':' and '=' preceded by a backslash
    """
    # "\\:" / "\\=" spell the backslash explicitly; the previous '\:' and '\='
    # were invalid escape sequences that only worked by accident
    return string.replace(':', '\\:').replace('=', '\\=')


@staticmethod
def all_strings(arr):
    """
    Ensures that the argument is a list that either is empty or contains only strings

    :param arr: list
    :return: True if every element is a str (or the list is empty), otherwise False
    :raises TypeError: if arr is not a list
    """
    # Check the argument itself; the old code tested a literal [] and so never raised
    if not isinstance(arr, list):
        raise TypeError("non-list value found where list is expected")
    return all(isinstance(x, str) for x in arr)


@staticmethod
def _all_params(arr):
    """
    Ensures that the argument is a list that either is empty or contains only GPParamSpec's

    :param arr: list
    :return: True if every element is a GPParamSpec (or the list is empty), otherwise False
    :raises TypeError: if arr is not a list
    """
    # Check the argument itself; the old code tested a literal [] and so never raised
    if not isinstance(arr, list):
        raise TypeError("non-list value found for parameters")
    return all(isinstance(x, GPParamSpec) for x in arr)


def _valid_lsid(self):
    """
    Performs some basic (non-comprehensive) LSID validation

    :return:
    :raises TypeError: if the lsid is not a string
    :raises ValueError: if the lsid does not contain exactly four colons, does not
                        begin with "urn", or is rejected by the LSID authority
    """
    if not isinstance(self.lsid, str):
        raise TypeError("lsid is not a string, string expected: " + str(self.lsid))

    if self.lsid.count(':') != 4:
        raise ValueError("lsid contains incorrect number of colons, 4 expected: " + str(self.lsid))

    if self.lsid.split(':')[0].lower() != 'urn':
        raise ValueError("lsid does not begin with urn: " + str(self.lsid))

    # If an LSID authority is specified, check with the authority. Existing
    # registrations are not treated as a conflict here (check_existing=False).
    if self.lsid_authority:
        if not self.lsid_authority.validate(self.lsid, check_existing=False):
            raise ValueError("lsid does not match the authority: " + str(self.lsid))


@staticmethod
def invalid_chars():
    """
    Returns a set of characters which are not valid in module or parameter names

    :return: set of all punctuation except '_' and '.', plus all whitespace characters
    """
    allowed_punctuation = "_."
    punctuation = "".join(c for c in string.punctuation if c not in allowed_punctuation)
    return set(punctuation + string.whitespace)
class Type(StringEnum):
    # Parameter types; the value is what manifest_repr writes to the pN_TYPE line
    FILE = "FILE"
    TEXT = "TEXT"
    INTEGER = "Integer"
    FLOATING_POINT = "Floating Point"
    DIRECTORY = "DIRECTORY"
    PASSWORD = "PASSWORD"


class JavaType(StringEnum):
    # Same member names as Type; looked up by name in GPParamSpec._java_type()
    # and written to the pN_type line of the manifest
    FILE = "java.io.File"
    TEXT = "java.lang.String"
    INTEGER = "java.lang.Integer"
    FLOATING_POINT = "java.lang.Float"
    DIRECTORY = "DIRECTORY"
    PASSWORD = "PASSWORD"


class Optional(StringEnum):
    # Value written to the pN_optional line of the manifest
    REQUIRED = ""
    OPTIONAL = "on"


class GPParamSpec:
    """
    Specification needed to create a parameter for a new GenePattern module
    """
    def __init__(self, name=None, description="", optional=Optional.REQUIRED,
                 type=Type.TEXT, choices=None, value="", default_value="",
                 file_format=None, min_values=0, max_values=1,
                 flag="", prefix_when_specified=False):
        """
        :param name: parameter name; must be set before validate() is called
        :param description: free text, escaped when written to the manifest
        :param optional: member of Optional
        :param type: member of Type
        :param choices: dict of choices; defaults to a new empty dict
        :param value: stored on the instance
        :param default_value: string written to the default_value line
        :param file_format: list of strings; defaults to a new empty list
        :param min_values: int, lower bound used for numValues
        :param max_values: int, None or float("inf"); None/inf mean unbounded
        :param flag: command line flag for the parameter
        :param prefix_when_specified: if true, the flag is also written as the prefix
        """
        self.name = name
        self.description = description
        self.optional = optional

        self.type = type
        # None stands in for "not given" so that every instance gets its own
        # container; a literal {} / [] default would be shared by all instances
        self.choices = {} if choices is None else choices
        self.value = value
        self.default_value = default_value

        self.file_format = [] if file_format is None else file_format
        self.min_values = min_values
        self.max_values = max_values

        self.flag = flag
        self.prefix_when_specified = prefix_when_specified

    def validate(self):
        """
        Validate the parameter specification

        :return: True if everything validates
        :raises ValueError: if an attribute other than max_values is None, the name
                            contains invalid characters, or min_values/max_values
                            have an unsupported type
        :raises TypeError: if file_format contains a non-string or choices is not a dict
        """
        # Check all values for None, only max_values is allowed to be None
        for attr, attr_value in self.__dict__.items():
            if attr_value is None and attr != "max_values":
                raise ValueError(attr + " is not set")

        # Validate name
        invalid_chars = GPTaskSpec.invalid_chars()
        if any(char in invalid_chars for char in self.name):
            raise ValueError("parameter name includes invalid characters: " + self.name)

        # Validate min_values
        if not isinstance(self.min_values, int):
            raise ValueError("min_values not an int in: " + self.name)

        # Validate max_values: an int, None and infinity are all accepted
        unbounded = self.max_values is None or self.max_values == float("inf")
        if not isinstance(self.max_values, int) and not unbounded:
            raise ValueError("max_values not an int, None or infinity in: " + self.name)

        # Validate file formats
        if not GPTaskSpec.all_strings(self.file_format):
            raise TypeError("file_format contains non-string value in parameter: " + self.name)

        # Validate choices dict
        if not isinstance(self.choices, dict):
            raise TypeError("choices is not dict in parameter: " + self.name)

        # Return that everything validates
        return True

    def manifest_repr(self, p_num):
        """
        Builds a manifest string representation of the parameters and returns it

        :param p_num: int, the parameter number used in the "p<N>_" line prefix
        :return: string
        """
        # Build the parameter prefix
        prefix = "p" + str(p_num) + "_"

        is_file = self.type == Type.FILE
        has_choices = len(self.choices) > 0
        # The flag doubles as the prefix, but only when prefix_when_specified is set
        flag_prefix = self.flag if self.prefix_when_specified else ""

        # Generate the manifest string
        manifest = prefix + "MODE=" + ("IN" if is_file else "") + "\n"
        manifest += prefix + "TYPE=" + str(self.type.value) + "\n"
        # File parameters carry their choices on a dedicated line...
        if is_file and has_choices:
            manifest += prefix + "choices=" + self._choices() + "\n"
        manifest += prefix + "default_value=" + self.default_value + "\n"
        manifest += prefix + "description=" + GPTaskSpec.manifest_escape(self.description) + "\n"
        manifest += prefix + "fileFormat=" + ';'.join(self.file_format) + "\n"
        manifest += prefix + "flag=" + self.flag + "\n"
        manifest += prefix + "name=" + self.name + "\n"
        manifest += prefix + "numValues=" + self._num_values() + "\n"
        manifest += prefix + "optional=" + str(self.optional.value) + "\n"
        manifest += prefix + "prefix=" + flag_prefix + "\n"
        manifest += prefix + "prefix_when_specified=" + flag_prefix + "\n"
        manifest += prefix + "type=" + self._java_type() + "\n"
        # ...all other parameter types carry them on the value line
        manifest += prefix + "value=" + (self._choices() if not is_file and has_choices else "") + "\n"

        # Return the manifest string
        return manifest

    def _choices(self):
        """
        Generate a string of choices as key/value pairs

        :return: string of ";"-joined "<dict value>=<dict key>" pairs, manifest-escaped
        """
        # Each pair is written as the dict value first, then the dict key
        pairs = [str(value) + "=" + str(key) for key, value in self.choices.items()]

        # Assemble into overall string and escape
        return GPTaskSpec.manifest_escape(";".join(pairs))

    def _num_values(self):
        """
        Generate a valid num_values string based off min_values and max_values

        :return: "<min>+" when max_values is None or infinity, otherwise "<min>..<max>"
        """
        # A falsy min_values (0, None) is written as "0"
        num_values = str(self.min_values) if self.min_values else "0"

        # Handle infinite max_values or finite max_values
        if self.max_values is None or self.max_values == float("inf"):
            return num_values + "+"
        return num_values + ".." + str(self.max_values)

    def _java_type(self):
        """
        Translates GenePattern type string to Java type string

        :return: string
        """
        # Type and JavaType share member names, so look up by name
        return JavaType[self.type.name].value
class LSIDAuthority:
    """
    Class representing a Life Science Identifier (LSID) authority used to assign an LSID to the
    GenePattern modules produced by this package.

    The authority is backed by a JSON file with three keys: ``base_lsid``,
    ``module_count`` and ``registered_modules`` (a mapping of LSID to module name).
    """
    # Path of the backing authority file; set once a file is loaded
    authority = None
    # LSID prefix shared by every LSID assigned by this authority
    base_lsid = None
    # Number of registered modules; the next LSID number is module_count + 1
    module_count = None
    # Mapping of registered LSID -> module name
    registered_modules = None

    def __init__(self, authority=None):
        """
        Initializes an LSID authority. Looks for an LSID authority file. If no such file is found,
        a file will be created with a base LSID built from the LSID domain (a settings file or
        the machine's fully qualified domain name) and the Jupyter or system user name.

        LSID authority standard location:
            ~/.genepattern/lsid_authority.json

        :param authority: Must be a file path (str or path-like) to the LSID authority file,
                          or a gpserver object (not implemented).
        :raises RuntimeError: if the path is not a file, lacks read/write permission,
                              or cannot be loaded
        :raises NotImplementedError: if a GPServer is given
        """
        # Handle default authority file locations
        if authority is None:
            authority = os.path.join(str(pathlib.Path.home()), '.genepattern', 'lsid_authority.json')
            if not os.path.isfile(authority):
                # No authority file found, lazily create file
                self._create_authority_file(authority)

        # Handle a file path as an LSID authority
        if isinstance(authority, (str, os.PathLike)):
            authority = os.fspath(authority)
            if not os.path.isfile(authority):
                raise RuntimeError("Provided LSID authority isn't a file")
            # The file is rewritten on every register(), so both read and write access are needed
            if not (os.access(authority, os.R_OK) and os.access(authority, os.W_OK)):
                raise RuntimeError("Missing permissions on provided LSID authority file")
            try:
                # Load the authority file
                self.authority = authority
                self._load_lsid_authority()
            except Exception as e:
                raise RuntimeError("Unable to read authority file due to: " + str(e)) from e

        # Handle a GenePattern server as the LSID authority
        elif isinstance(authority, GPServer):
            raise NotImplementedError("Support for GenePattern server as a remote LSID authority is not implemented.")

    @staticmethod
    def _generate_namespace():
        """
        Generate an LSID namespace based off Jupyter user or system user

        :return: string
        """
        # Prefer the Jupyter user (JPY_USER), fall back to the current system user
        raw_namespace = os.environ.get('JPY_USER') or getpass.getuser()

        # Remove illegal characters and return
        return re.sub(r'[^\w.-]', '-', raw_namespace)

    @staticmethod
    def _generate_domain():
        """
        Generate an LSID domain based off a setting file or the hostname

        :return: string
        """
        # Check for LSID domain setting file (~/.jupyter/lsid_domain)
        try:
            domain_path = os.path.join(str(pathlib.Path.home()), '.jupyter', 'lsid_domain')
            with open(domain_path, 'r') as domain_file:
                domain = domain_file.read().strip()
            if domain:
                return domain
        except Exception:
            # Best effort only: a missing or unreadable settings file simply means
            # falling back to the domain name. Unlike a bare "except:", this does
            # not swallow KeyboardInterrupt or SystemExit.
            pass

        # If this fails, return the fully qualified domain name
        return socket.getfqdn()

    def _generate_base_lsid(self):
        """
        Generates and returns a base LSID

        :return: string of the form "urn:lsid:<domain>:<namespace>"
        """
        domain = self._generate_domain()
        namespace = self._generate_namespace()

        # Return the base LSID
        return "urn:lsid:" + domain + ":" + namespace

    def _create_blank_authority(self):
        """
        Returns a dictionary structure representing a blank LSID authority file

        :return: dict
        """
        return {
            'base_lsid': self._generate_base_lsid(),
            'module_count': 0,
            'registered_modules': {},
        }

    def _create_authority_file(self, file_path):
        """
        Create a new LSID authority file at the indicated location

        :param file_path: location of LSID authority file
        """
        # Create the parent directory if it does not exist
        parent_dir = os.path.dirname(os.path.realpath(file_path))
        os.makedirs(parent_dir, exist_ok=True)

        # Write a blank LSID authority structure to the new authority file
        with open(file_path, 'w') as authority_file:
            json.dump(self._create_blank_authority(), authority_file,
                      sort_keys=True, indent=4, separators=(',', ': '))

    def _load_lsid_authority(self):
        """
        Load (or reload) the LSID authority file and set class attributes
        """
        with open(self.authority, 'r') as authority_file:
            authority_json = json.load(authority_file)

        self.base_lsid = authority_json['base_lsid']
        self.module_count = int(authority_json['module_count'])
        self.registered_modules = authority_json['registered_modules']

    def _next_lsid_number(self):
        """
        Return a string representing the next module number for this LSID authority

        :return: module_count + 1, zero-padded to at least four digits
        """
        if self.module_count is None:
            raise Exception("Module count in LSID authority not initialized")

        return str(self.module_count + 1).zfill(4)

    def _assemble_lsid(self, module_number):
        """
        Return an assembled LSID based off the provided module number and the authority's base LSID.
        Note: Never includes the module's version number.

        :param module_number:
        :return: string
        """
        if self.base_lsid is None:
            raise Exception("Base LSID in LSID authority not initialized")

        return self.base_lsid + ":" + str(module_number)

    def lsid(self):
        """
        Acquire a new LSID assigned by the LSID authority

        The module count is not advanced here; that happens in register().
        :return: string - assigned LSID
        """
        return self._assemble_lsid(self._next_lsid_number())

    def register(self, task_spec):
        """
        Registers a module specification with the LSID authority.
        Validates that it possesses an LSID assigned by the authority.
        Raises an exception if registration wasn't successful.

        :param task_spec: object with ``lsid`` and ``name`` attributes
        :return: boolean - True if registration was successful
        :raises RuntimeError: if the LSID is not valid with this authority
        """
        if not self.validate(task_spec.lsid):
            raise RuntimeError("Module LSID is not valid: " + str(task_spec.lsid))

        # Add the module name to the map
        self.registered_modules[task_spec.lsid] = task_spec.name

        # Increment module count
        self.module_count += 1

        # Write the updated LSID authority file and reload
        with open(self.authority, 'w') as authority_file:
            json.dump({
                'base_lsid': self.base_lsid,
                'module_count': self.module_count,
                'registered_modules': self.registered_modules,
            }, authority_file, sort_keys=True, indent=4, separators=(',', ': '))
        self._load_lsid_authority()

        return True

    def validate(self, lsid, check_existing=True):
        """
        Validates an LSID with the LSID authority.

        :param lsid:
        :param check_existing: if true, an LSID that is already registered is not valid
        :return: boolean - is the LSID valid with this authority?
        """
        # Base LSID matches. The trailing colon anchors the match at a component
        # boundary, so a base of "...:alice" does not accept "...:alice2:0001".
        if not lsid.startswith(self.base_lsid + ":"):
            return False

        # Module number isn't already taken
        if check_existing and lsid in self.registered_modules:
            return False

        # Everything checks out, return True
        return True

    def lookup(self, lsid):
        """
        Look up the name of a module by LSID assigned by the authority.
        Returns None if the LSID is not found.

        :param lsid:
        :return: string or none
        """
        if self.registered_modules is None:
            return None
        return self.registered_modules.get(lsid)
from setuptools import setup


# Package version; also used below to build the download URL
__version__ = '24.01'

# Read the long description as UTF-8 explicitly, so the build does not depend
# on the platform's default locale encoding
with open('README.md', encoding='utf-8') as f:
    long_description = f.read()

setup(
    name='genepattern-python',
    packages=['gp'],
    version=__version__,
    long_description=long_description,
    long_description_content_type="text/markdown",
    description='Library for programmatically interacting with GenePattern from Python.',
    author='Thorin Tabor',
    author_email='tmtabor@cloud.ucsd.edu',
    url='https://github.com/genepattern/genepattern-python',
    download_url='https://github.com/genepattern/genepattern-python/archive/' + __version__ + '.tar.gz',
    keywords=['genepattern', 'genomics', 'bioinformatics'],
    license='BSD'
)
"""
Tests for loading GCT and ODF files into pandas dataframes
"""
import pytest

import gp
import gp.data
import urllib.request


@pytest.fixture(scope="session", autouse=True)
def before_tests(request):
    """Download the GCT and ODF test files once per session and register the cleanup hook."""
    # Download files for local use
    urllib.request.urlretrieve("https://datasets.genepattern.org/data/all_aml/all_aml_test.gct", "all_aml_test.gct")
    urllib.request.urlretrieve("https://datasets.genepattern.org/data/all_aml/all_aml_test.preprocessed.comp.marker.odf",
                               "all_aml_test.preprocessed.comp.marker.odf")

    # Clean up after ourselves
    request.addfinalizer(after_tests)


def test_gct_load_gpfile():
    """Load a GCT from a GPFile object."""
    gpfile = gp.GPFile(gp.GPServer('http://genepattern.broadinstitute.org/gp', '', ''),
                       'https://datasets.genepattern.org/data/all_aml/all_aml_test.gct')
    gct = gp.data.GCT(gpfile)
    gct_asserts(gct)


def test_gct_load_file():
    """Load a GCT from an open file object."""
    # The context manager closes the handle even if loading or the asserts fail
    with open('all_aml_test.gct', 'r') as file:
        gct = gp.data.GCT(file)
        gct_asserts(gct)


def test_gct_load_url():
    """Load a GCT from a URL string."""
    gct = gp.data.GCT('https://datasets.genepattern.org/data/all_aml/all_aml_test.gct')
    gct_asserts(gct)


def test_gct_load_path():
    """Load a GCT from a local file path."""
    gct = gp.data.GCT('all_aml_test.gct')
    gct_asserts(gct)


def test_gct_load_string():
    """Load a GCT from the file's contents passed as a string."""
    with open('all_aml_test.gct', 'r') as file:
        file_str = file.read()
    gct = gp.data.GCT(file_str)
    gct_asserts(gct)


def test_odf_load_gpfile():
    """Load an ODF from a GPFile object."""
    gpfile = gp.GPFile(gp.GPServer('http://genepattern.broadinstitute.org/gp', '', ''),
                       'https://datasets.genepattern.org/data/all_aml/all_aml_test.preprocessed.comp.marker.odf')
    odf = gp.data.ODF(gpfile)
    odf_asserts(odf)


def test_odf_load_file():
    """Load an ODF from an open file object."""
    # The context manager closes the handle even if loading or the asserts fail
    with open('all_aml_test.preprocessed.comp.marker.odf', 'r') as file:
        odf = gp.data.ODF(file)
        odf_asserts(odf)


def test_odf_load_url():
    """Load an ODF from a URL string."""
    odf = gp.data.ODF('https://datasets.genepattern.org/data/all_aml/all_aml_test.preprocessed.comp.marker.odf')
    odf_asserts(odf)


def test_odf_load_path():
    """Load an ODF from a local file path."""
    odf = gp.data.ODF('all_aml_test.preprocessed.comp.marker.odf')
    odf_asserts(odf)


def test_odf_load_string():
    """Load an ODF from the file's contents passed as a string."""
    with open('all_aml_test.preprocessed.comp.marker.odf', 'r') as file:
        file_str = file.read()
    odf = gp.data.ODF(file_str)
    odf_asserts(odf)
def after_tests():
    """
    Remove the data files the test session downloaded into the working directory.

    Registered as the session finalizer; files that are already gone are ignored,
    so calling it more than once is harmless.
    """
    import os

    downloaded_files = (
        "all_aml_test.gct",
        "all_aml_test.preprocessed.comp.marker.odf",
    )
    for file_name in downloaded_files:
        try:
            os.remove(file_name)
        except FileNotFoundError:
            # Nothing to clean up, e.g. the download itself failed
            pass


#####################
# Utility functions #
#####################


def gct_asserts(odf):
    """Assert that the loaded GCT object reports a non-zero row and column count."""
    # NOTE: the parameter is named "odf" but receives the GCT object under test
    assert odf.row_count()
    assert odf.col_count()


def odf_asserts(odf):
    """Assert that the loaded ODF object has a model, headers and non-zero dimensions."""
    assert odf.model is not None
    assert odf.headers is not None
    assert odf.row_count()
    assert odf.col_count()