├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── TODO.md ├── build_and_upload.sh ├── pygbrowse ├── __init__.py ├── datasources.py ├── intervaloverlaps.py ├── plots.py ├── romannumerals.py └── utilities.py ├── pygbrowse_demonstration.ipynb ├── pygbrowse_logo_1_med_flat.png └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | 
.ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # What's New? 2 | 3 | ### 0.1.0 July 24, 2018 4 | * Initial Release 5 | ### 0.2.0 August 02, 2018 6 | * Major updates to gene model backend 7 | * Use of tabix to retrieve GFF3 data for major improvements in speed and memory consumption 8 | * Added ability to filter transcripts by support level (defaults to all) 9 | ### 0.3.0 August 05, 2018 10 | * Added automatic y-axis scaling for BedPlots 11 | * Improved ytick labeling for BedPlots. 12 | * Made WigPlots solid-filled by default. 13 | 14 | ### 0.3.1 August 09, 2018 15 | * Fixed bug where initial exons and five-prime UTRs would be missed on + strand genes. 16 | * Minor bugfixes 17 | 18 | ### X.X.X 19 | * Implemented alpha transparency for WigPlots -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Dylan Skola 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![alt pygbrowse](https://raw.githubusercontent.com/phageghost/python-genome-browser/master/pygbrowse_logo_1_med_flat.png) 2 | 3 | # python-genome-browser 4 | Tools for making plots of genomic datasets in a genome-browser-like format. 5 | 6 | ## What does it do? 7 | 8 | The python genome browser, AKA pygbrowse, makes it easy to generate aligned plots of genomic data. This is similar to software such as the [Integrated Genome Viewer (IGV) from the Broad Institute](http://software.broadinstitute.org/software/igv/) or webserver-based public genome browsers such as the [UCSC genome browser](http://genome.ucsc.edu) or the [WashU Epigenome Browser](http://epigenomegateway.wustl.edu/). 9 | 10 | ## Typical use cases: 11 | 12 | 1. Visualizing data inside a Jupyter notebook. 13 | 2. Preparing figures for presentation or publication. 14 | 15 | ## Why not just use those tools you just mentioned? 16 | 17 | For a couple of reasons: 18 | 19 | 1. Speed and ease of use: The workflow for converting data to the appropriate formats, uploading the data to a remote webserver (or providing URLs for hosted data), then selecting the region to visualize and waiting for it to be rendered is slow and cumbersome with many manual interventions needed. 
Although APIs exist for some of these tools, creating a flexible end-to-end automated visualizer that takes arbitrary data and displays it on your screen would require quite a bit of custom scripting. 20 | 2. Flexibility: Different genome browsers natively accept different subsets of the data types commonly used in genomics, and interconverting them is tedious. In some cases certain data types are not supported at all (e.g. long-range interactions such as HiC are not supported by the UCSC genome browser). Pygbrowse (will) natively support most of the most common data formats, removing one or more "data munging" operations from your workflow. 21 | 3. Transparency: Current genome browsers have very specific requirements for the format of their input data, not all of which are as well-documented as we might like. In addition, very little feedback is provided regarding such errors, leading the user into a painful trial-and-error process in order to load their data. Pygbrowse strives to be more flexible in the format of its input data and to provide helpful feedback when problems do occur. 22 | 4. Beauty: They say that beauty is in the eye of the beholder but to _this_ beholder the default outputs of the available genome browsers are aesthetically-lacking. They often require extensive manipulation in Adobe Illustrator or Inkscape to prepare them for publication or even to be legible in a presentation slide. Pygbrowse is designed from the ground up for generating static figures with proportions that can be easily scaled for common use cases. 23 | 24 | ## Wait, you're calling this a genome browser but I can't really browse around like I can with those other tools, can I? 25 | 26 | Well, you _can_ browse, in a sense, by calling the visualizer with different sets of coordinates. But it's not really designed for the kind of interactive, dynamic browsing experience provided by the other tools -- as mentioned earlier it is optimized for rendering static "browser snapshots". 
That being said, however, we may add controls to provide interactive browser functionality when used inside a Jupyter notebook in a future version . . . 27 | 28 | Logo makes use of clipart by Paula Washington at [AnimalsClipart.com](http://animalsclipart.com/pig-design/) 29 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # ToDo: 2 | 3 | 1. Gene Models 4 | 1. Ensure compatibility with all versions of Ensembl GFF3 5 | 2. Add GTF support 6 | 3. Add RefSeq support 7 | 4. Auto-scale heights of gene models 8 | 2. Vector data 9 | 1. Add support for stranded data 10 | 2. Add support for Wig files 11 | * Use pyBigWig package 12 | 3. Add BAM support 13 | 3. Interaction data 14 | 1. Add arbitrary interactions (not confined to bins). Plot "arbitrary bins" (start-end) 15 | 2. Ensure current parser is compatible with BED-PE 16 | 4. Interval data 17 | 1. Refactor class structure to match others (access using query() method, etc.) 18 | 5. Matrix data 19 | 1. Use tabix to search CSV matrices on disk and read only applicable rows 20 | 2. Plot diagonal cells 21 | * Generate image, then transform, instead of transforming data and generating image. We can 22 | probably do this transparently by passing a Transform object to the axes. 23 | 4. General 24 | 2. Shrink logo. 25 | 3. Look into eliminating requirement for seaborn (at this point I think we only use the styles). 26 | 4. Make HiC bins diagonal instead of square. 27 | 5. Add parameter for subplots to share yaxis limits. 28 | 6. Add installation instructions to GitHub README 29 | 7. Allow for querying entire chromosome by passing 0s as start and end arguments 30 | 8. Add automatic windowing function to reduce complexity of plots (and corresponding PDF file sizes). 
31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /build_and_upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo Did you remember to increment the version number? 3 | python setup.py sdist bdist_wheel 4 | twine upload --repository-url https://upload.pypi.org/legacy/ dist/* 5 | -------------------------------------------------------------------------------- /pygbrowse/__init__.py: -------------------------------------------------------------------------------- 1 | from . import datasources 2 | from . import plots 3 | from . import utilities 4 | 5 | name = 'pygbrowse' 6 | -------------------------------------------------------------------------------- /pygbrowse/datasources.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy 4 | import pandas 5 | import pysam 6 | from scipy.signal import convolve 7 | 8 | from . import utilities 9 | from .utilities import log_print 10 | 11 | DEFAULT_TAG_COUNT_NORMALIZATION_TARGET = 10000000 12 | DEFAULT_FEATURE_SOURCES = ('ensembl', 'havana', 'ensembl_havana') 13 | DEFAULT_GENE_TYPES = ( 14 | 'gene', 'RNA', 'mt_gene', 'lincRNA_gene', 'miRNA_gene', 'ncRNA_gene', 'rRNA_gene', 'snRNA_gene', 'snoRNA_gene', 15 | 'processed_transcript') 16 | DEFAULT_TRANSCRIPT_TYPES = ('mRNA', 'transcript', 'lincRNA', 'lnc_RNA', 'miRNA', 'ncRNA', 'snRNA', 'snoRNA') 17 | DEFAULT_COMPONENT_TYPES = ('CDS', 'three_prime_UTR', 'five_prime_UTR') 18 | # DEFAULT_MAXIMUM_TRANSCRIPT_SUPPORT = 5 19 | 20 | # ToDo: For each class, allow option of loading into memory or leaving on disk (where applicable) 21 | # ToDo: Add indexing of on-disk csv-like files 22 | # ToDo: Refactor vector data sources to transparently interpolate sparse vectors. Probably will have to drop dict-of-series interface. 
23 | 24 | class _ChromWrapper: 25 | def __init__(self, chrom, parent_data_source): 26 | self.chrom = chrom 27 | self.parent_data_source = parent_data_source 28 | 29 | def __getitem__(self, key): 30 | # print(key) 31 | # ToDo: Add support for step argument 32 | try: # See if key is a slice 33 | query_start = key.start 34 | query_end = key.stop 35 | except TypeError: # if not, treat as a scalar index 36 | query_start = key 37 | query_end = key + 1 38 | 39 | return self.parent_data_source.query(query_chrom=self.chrom, query_start=query_start, query_end=query_end) 40 | 41 | 42 | class _DataVector: 43 | def __init__(self, chrom, parent_data_source): 44 | self.loc = _ChromWrapper(chrom=chrom, parent_data_source=parent_data_source) 45 | 46 | 47 | class _VectorDataSource: 48 | # ToDo: Add methods for arithmetic and such, as done for old Pileups class 49 | def __init__(self, transform=None, smoothing_bandwidth=0): 50 | self.transform = transform 51 | if smoothing_bandwidth: 52 | self.convolution_kernel = utilities.gaussian_kernel(smoothing_bandwidth) 53 | else: 54 | self.convolution_kernel = None 55 | 56 | def _query(self, query_chrom, query_start, query_end): 57 | print('Stub method -- must be overridden by inheritors') 58 | 59 | def query(self, query_chrom, query_start, query_end): 60 | query_result = self._query(query_chrom=query_chrom, query_start=query_start, query_end=query_end) 61 | 62 | if self.convolution_kernel is not None: 63 | query_result = pandas.Series(convolve(query_result, self.convolution_kernel, mode='same'), index=query_result.index) 64 | if self.transform: 65 | query_result = self.transform(query_result) 66 | 67 | return query_result 68 | 69 | def __getitem__(self, key): 70 | return _DataVector(chrom=key, parent_data_source=self) 71 | 72 | 73 | class SparseVectors(_VectorDataSource): 74 | def __init__(self, series_dict, transform=None, convolution_kernel=None): 75 | self.data = series_dict 76 | self.transform = transform 77 | self.convolution_kernel = 
convolution_kernel 78 | 79 | 80 | def _query(self, query_chrom, query_start, query_end): 81 | this_chrom_vector = self.data[query_chrom] 82 | start_ipos = numpy.searchsorted(this_chrom_vector.keys(), query_start) - 1 83 | end_ipos = numpy.searchsorted(this_chrom_vector.keys(), query_end) + 1 84 | 85 | return this_chrom_vector.iloc[start_ipos:end_ipos] 86 | 87 | 88 | class TagDirectory(_VectorDataSource): 89 | tag_strand_translator = {0: '+', 1: '-'} 90 | 91 | def __init__(self, tag_directory_path, normalize_to=DEFAULT_TAG_COUNT_NORMALIZATION_TARGET, transform=None, 92 | smoothing_bandwidth=0): 93 | super(TagDirectory, self).__init__(transform=transform, smoothing_bandwidth=smoothing_bandwidth) 94 | 95 | self.tag_directory_path = tag_directory_path 96 | 97 | if normalize_to: 98 | # extract total tag count from tagInfo.txt 99 | tag_info_fname = os.path.join(tag_directory_path, 'tagInfo.txt') 100 | with open(tag_info_fname, 'rt') as tag_info_file: 101 | sizeline = tag_info_file.readlines()[1].strip().split('\t') 102 | num_tags = int(float(sizeline[2])) 103 | 104 | self.normalization_factor = normalize_to / num_tags 105 | 106 | def _query(self, query_chrom, query_start, query_end, read_handling='reads'): 107 | # ToDo: Add argument validation to all functions and methods with string parameters 108 | # ToDo: Add verbosity-based logging output 109 | # ToDo; Compare performance with memory-mapped pandas DataFrames 110 | query_result = pandas.Series(numpy.zeros(query_end - query_start), index=numpy.arange(query_start, query_end)) 111 | 112 | tag_filename = os.path.join(self.tag_directory_path, '{}.tags.tsv'.format(query_chrom)) 113 | start_offset = utilities.binary_search_tag_file(tag_filename=tag_filename, search_target=query_start + 1) 114 | 115 | done = False 116 | with open(tag_filename, 'rt') as tag_file: 117 | tag_file.seek(start_offset) 118 | # print(start_offset) 119 | while not done: 120 | line_fields = tag_file.readline().strip().split('\t') 121 | # 
print(line_fields) 122 | if len(line_fields) > 1: 123 | # chrom = line_fields[0] 124 | read_start = int(line_fields[1]) - 1 125 | # strand = self.tag_strand_translator[int(line_fields[2])] 126 | depth = float(line_fields[3]) 127 | 128 | if read_handling == 'starts': 129 | assert read_start > query_start 130 | if read_start < query_end: 131 | query_result.loc[read_start] += depth 132 | else: 133 | done = True 134 | 135 | elif read_handling == 'reads': 136 | # ToDo: Hard to do this in a streaming fashion because we don't know how far upstream to seek to capture left-overhanging reads. 137 | read_len = int(line_fields[4]) 138 | if query_start < read_start <= query_end or query_start < read_start + read_len <= query_end: 139 | # print(max(read_start, query_start), min(read_start + read_len, 140 | # query_end)) 141 | query_result.loc[max(read_start, query_start):min(read_start + read_len, 142 | query_end)] += depth # trim to visible vector 143 | else: 144 | done = True 145 | 146 | query_result *= self.normalization_factor 147 | 148 | return query_result 149 | 150 | 151 | class IntervalData: 152 | # HOMER_PEAKFILE_HEADER_ROW = 39 153 | # HOMER_PEAKFILE_COLUMN_RENAMER = {'chr': 'chrom', 'start': 'chromStart', 'end': 'chromEnd'} 154 | HOMER_PEAKFILE_NAMES = ('chrom', 'chromStart', 'chromEnd', 'strand', 'normed_tag_count') 155 | HOMER_ANNOTATEDPEAKS_COLUMN_RENAMER = {'Chr': 'chrom', 'Start': 'chromStart', 'End': 'chromEnd', 'Strand': 'strand'} 156 | 157 | def __init__(self, interval_data, format='bed'): 158 | """ 159 | Loads genomic interval information in various formats and stores them in a standardized form as a 160 | pandas.DataFrame in self.data. 161 | 162 | :param:`interval_data` should be a pandas.DataFrame representing BED-formatted genomic data, or, 163 | alternatively, a filename pointing to one of the following file formats: 164 | 165 | * A BED file 166 | * A HOMER peak file 167 | * A HOMER annotated peak file. 
168 | 169 | If a filename is passed instead of a DataFrame, :param:`format` should be specified. Allowed values are: 170 | 'bed', 'homer', 'homer_annotated' 171 | 172 | :param interval_data: 173 | :param format: 174 | """ 175 | try: 176 | _ = interval_data.loc[:, ['chrom', 'chromStart', 'chromEnd', 'strand']] 177 | 178 | except KeyError: # maybe it's a BED DataFrame without column names? 179 | log_print('Guessing this is a BED-style DataFrame without column names') 180 | 181 | assert interval_data.shape[1] >= 3, 'Not enough columns (got {})!'.format(interval_data.shape[1]) 182 | 183 | if interval_data.shape[1] >= 6: # assume name is still separate column 184 | self.data = interval_data.copy() 185 | self.data.columns = ['chrom', 'chromStart', 'chromEnd', 'name', 186 | 'score', 'strand'] + list(self.data.columns)[6:] 187 | self.data.index = self.data['name'] 188 | 189 | elif interval_data.shape[1] == 5: # assume name has been made the index and deleted from the columns 190 | self.data = interval_data.copy() 191 | self.data.columns = ['chrom', 'chromStart', 'chromEnd', 'score', 192 | 'strand'] 193 | else: 194 | self.data = interval_data.copy() 195 | self.data.columns = ['chrom', 'chromStart', 'chromEnd', 'score', 196 | 'strand'][:interval_data.shape[1] - 5] 197 | 198 | self.data.index.name = 'IntervalID' 199 | 200 | except (AttributeError,): # guessing it's a filename string 201 | log_print('Guessing {} is a filename'.format(interval_data)) 202 | # if format == 'auto': 203 | # extension = filename.split('.')[-1] 204 | # if extension.lower() == 'bed': 205 | # format = 'bed' 206 | # elif extension.lower() == 'homer': 207 | # # ToDo: Add more sophisticated methods of detecting formats since, e.g. .txt can refer to many. 
208 | # format = 'homer' 209 | 210 | if format == 'bed': 211 | self.data = pandas.read_csv(interval_data, sep='\t', index_col=3, comment='#', header=None, 212 | names=['chrom', 'chromStart', 'chromEnd', 'score', 'strand']) 213 | elif format == 'homer': 214 | self.data = pandas.read_csv(interval_data, sep='\t', index_col=0, comment='#', header=None) 215 | self.data.columns = list(self.HOMER_PEAKFILE_NAMES) + list(self.data.columns)[len(self.HOMER_PEAKFILE_NAMES):] 216 | self.data.index.name = 'peak_id' 217 | # self.data = self.data.rename(columns=self.HOMER_PEAKFILE_COLUMN_RENAMER) 218 | 219 | elif format == 'homer_annotated': 220 | self.data = pandas.read_csv(interval_data, index_col=0, sep='\t') 221 | self.data.index.name = self.data.index.name.split(' ')[0] 222 | self.data = self.data.rename(columns=self.HOMER_ANNOTATEDPEAKS_COLUMN_RENAMER) 223 | 224 | else: # seems to be a properly-formatted DataFrame so just store it 225 | self.data = interval_data 226 | 227 | self.data = self.data.sort_values(['chrom', 'chromStart']) 228 | 229 | 230 | class _GeneModels(): 231 | def __init__(self): 232 | """ 233 | Superclass for data sources that describe gene models (gene boundaries, transcript 234 | boundaries, exons, introns, UTRs, etc.). 
235 | """ 236 | pass 237 | 238 | def _query(self, query_chromosome, query_start, query_end): 239 | print('Must be overridden by inheritors!') 240 | 241 | def query(self, chromosome, start, end): 242 | return self._query(query_chromosome=chromosome, query_start=start, query_end=end) 243 | 244 | 245 | from pygbrowse.datasources import _GeneModels 246 | 247 | class Gff3Annotations(_GeneModels): 248 | def __init__(self, 249 | gff3_filename, 250 | incoming_chromosome_name_converter=lambda x: utilities.convert_chromosome_name(x, dialect='ensembl'), 251 | outgoing_chromosome_name_converter=lambda x: utilities.convert_chromosome_name(x, dialect='ucsc'), 252 | feature_sources=DEFAULT_FEATURE_SOURCES, 253 | gene_types=DEFAULT_GENE_TYPES, 254 | transcript_types=DEFAULT_TRANSCRIPT_TYPES, 255 | component_types=DEFAULT_COMPONENT_TYPES, 256 | # maximum_transcript_support=DEFAULT_MAXIMUM_TRANSCRIPT_SUPPORT 257 | ): 258 | 259 | super(Gff3Annotations, self).__init__() 260 | 261 | self.tabix_file = pysam.TabixFile(gff3_filename) 262 | self.incoming_chromosome_name_converter = incoming_chromosome_name_converter 263 | self.outgoing_chromosome_name_converter = outgoing_chromosome_name_converter 264 | self.feature_sources = feature_sources 265 | self.gene_types = gene_types 266 | self.transcript_types = transcript_types 267 | self.component_types = component_types 268 | # self.maximum_transcript_support = maximum_transcript_support 269 | 270 | def _query(self, query_chromosome, query_start, query_end): 271 | gene_names_to_ensembl_ids = {} 272 | genes = {} 273 | transcripts = {} 274 | components = {} 275 | component_num = 0 # serial index for components without IDs 276 | 277 | query_rows = self.tabix_file.fetch(self.incoming_chromosome_name_converter(query_chromosome), query_start, 278 | query_end) 279 | 280 | for line in query_rows: 281 | split_line = line.strip('\n').split('\t') 282 | source, feature_type = split_line[1], split_line[2] 283 | 284 | if source in self.feature_sources: 285 
| contig = split_line[0] 286 | start = int(split_line[3]) 287 | end = int(split_line[4]) 288 | strand = split_line[6] 289 | 290 | fields = dict(field_value_pair.split('=') for field_value_pair in split_line[8].split(';')) 291 | # print(line) 292 | 293 | if feature_type in self.gene_types: 294 | ensembl_id = fields['ID'] 295 | gene_name = fields['Name'] 296 | # assert ensembl_id not in genes, 'Duplicate entry for gene {} on line {}'.format(ensembl_id, 297 | # line_num) 298 | 299 | genes[ensembl_id] = {'contig': contig, 300 | 'start': start - 1, # convert 1-based to 0-based 301 | 'end': end, 302 | 'strand': strand, 303 | 'transcripts': []} 304 | 305 | genes[ensembl_id].update(fields) 306 | if gene_name not in gene_names_to_ensembl_ids: 307 | gene_names_to_ensembl_ids[gene_name] = [] 308 | gene_names_to_ensembl_ids[gene_name].append(ensembl_id) 309 | # print('\t added gene {}'.format(ensembl_id)) 310 | 311 | elif feature_type in self.transcript_types: 312 | # print('\ttranscript has gene parent {}. {}'.format(parent, parent in genes)) 313 | # try: 314 | # transcript_support_level = int(fields['transcript_support_level'].split(' ')[0]) 315 | # except ValueError: 316 | # passed_support_filter = False 317 | # else: 318 | # passed_support_filter = transcript_support_level < self.maximum_transcript_support 319 | ensembl_id = fields['ID'] 320 | transcripts[ensembl_id] = {'contig': contig, 321 | 'start': start - 1, # convert 1-based to 0-based 322 | 'end': end, 323 | 'strand': strand, 324 | 'components': []} 325 | transcripts[ensembl_id].update(fields) 326 | 327 | # print('\t added transcript {} with parent {}'.format(ensembl_id, parent)) 328 | 329 | 330 | elif feature_type in self.component_types: 331 | # print('\tcomponent has transcript parent {}. 
{}'.format(parent, parent in transcripts)) 332 | if 'exon_id' in fields: 333 | ensembl_id = fields['exon_id'] 334 | else: 335 | ensembl_id = str(component_num) 336 | component_num += 1 337 | 338 | components[ensembl_id] = {'contig': contig, 339 | 'start': start - 1, # convert 1-based to 0-based 340 | 'end': end, 341 | 'strand': strand, 342 | 'type': feature_type} 343 | components[ensembl_id].update(fields) 344 | 345 | for transcript_id, transcript_data in transcripts.items(): 346 | parent = transcript_data['Parent'] 347 | if parent in genes: 348 | genes[parent]['transcripts'].append(transcript_id) 349 | else: 350 | print('orphan transcript {} with missing parent {}!'.format(transcript_id, parent)) 351 | 352 | for component_id, component_data in components.items(): 353 | parent = component_data['Parent'] 354 | if parent in transcripts: 355 | transcripts[parent]['components'].append(component_id) 356 | else: 357 | print('orphan component {} with missing parent {}!'.format(component_id, parent)) 358 | 359 | return genes, transcripts, components, gene_names_to_ensembl_ids 360 | 361 | 362 | 363 | class _MatrixData: 364 | def __init__(self): 365 | pass 366 | 367 | def _query(self): 368 | print('Must be overridden by inheritors') 369 | 370 | def query(self, chrom, start, end): 371 | return self._query(query_chrom=chrom, query_start=start, query_end=end) 372 | 373 | 374 | class HicDataDir(_MatrixData): 375 | def __init__(self, 376 | fname_template='/home/dskola/projects/coupled_peaks/hic/c57_hic_corrs_{}.tsv', 377 | binsize=10000): 378 | self.fname_template = fname_template 379 | self.binsize = binsize 380 | 381 | def _query(self, query_chrom, query_start, query_end): 382 | this_chrom_fname = self.fname_template.format(query_chrom) 383 | this_chrom_data = pandas.read_csv(this_chrom_fname, sep='\t', index_col=0) 384 | 385 | rounded_start = utilities.roundto(query_start, binsize) 386 | rounded_end = utilities.roundto(query_end, binsize) 387 | 388 | return 
this_chrom_data.loc[rounded_start:rounded_end, rounded_start:rounded_end] 389 | 390 | 391 | class HicDataDict(_MatrixData): 392 | def __init__(self, data_dict, bin_size): 393 | # self.data_dict = {chrom:self.rename_hic_df(data_dict[chrom]) for chrom in data_dict} 394 | self.data_dict = data_dict 395 | self.bin_size = bin_size 396 | 397 | def _query(self, query_chrom, query_start, query_end): 398 | rounded_start = utilities.roundto(query_start, self.bin_size) 399 | rounded_end = utilities.roundto(query_end, self.bin_size) 400 | 401 | return self.data_dict[query_chrom].loc[rounded_start:rounded_end, rounded_start:rounded_end] 402 | 403 | @staticmethod 404 | def rename_hic_df(hic_df): 405 | hic_df.index = [int(name.split('-')[1]) for name in hic_df.index] 406 | hic_df.columns = [int(name.split('-')[1]) for name in hic_df.columns] 407 | return hic_df -------------------------------------------------------------------------------- /pygbrowse/intervaloverlaps.py: -------------------------------------------------------------------------------- 1 | def compute_interval_overlaps(A, B, min_overlap=0, is_sorted=False): 2 | """ 3 | Given two lists, A and B, of interval tuples in the form (name, start, end) 4 | return a list of tuples of the form: 5 | 6 | (A_name, B_name, overlap_start, overlap_end) 7 | 8 | for every pair of intervals that overlaps by at least 9 | """ 10 | overlaps = [] 11 | 12 | if not is_sorted: 13 | A = sorted(A, key=lambda x: x[1]) 14 | B = sorted(B, key=lambda x: x[1]) 15 | 16 | A_ptr = 0 17 | B_ptr = 0 18 | 19 | # Two conditions must hold to have an overlap: 20 | # B start <= A end 21 | # A start <= B end 22 | 23 | # initialize the loop by checking for overlaps at the start (all B intervals that overlap with the first interval in A) 24 | if B[B_ptr][1] <= A[A_ptr][2]: 25 | while B[B_ptr][1] <= A[A_ptr][2]: 26 | overlap_start = max(A[A_ptr][1], B[B_ptr][1]) 27 | overlap_end = min(A[A_ptr][2], B[B_ptr][2]) 28 | if overlap_end - overlap_start >= min_overlap: 
29 | overlaps.append((A[A_ptr][0], B[B_ptr][0], overlap_start, overlap_end)) 30 | if B_ptr < len(B) - 1: 31 | B_ptr += 1 32 | else: 33 | break 34 | B_ptr -= 1 35 | # print('initialized: {}, {}, {}, {} '.format(A_ptr, B_ptr, A[A_ptr], B[B_ptr])) 36 | # advance the A pointer until B start is upstream of A end 37 | while True: 38 | # print('\tnew run: {}, {}, {}, {} '.format(A_ptr, B_ptr, A[A_ptr], B[B_ptr])) 39 | if A_ptr < len(A) - 1: 40 | A_ptr += 1 41 | # print('\tadvanced A: {}, {}, {}, {} '.format(A_ptr, B_ptr, A[A_ptr], B[B_ptr])) 42 | else: 43 | break 44 | # advance the B pointer until A start is upstream of B end 45 | while A[A_ptr][1] > B[B_ptr][2]: 46 | if B_ptr < len(B) - 1: 47 | B_ptr += 1 48 | # print('\tadvanced B: {}, {}, {}, {} '.format(A_ptr, B_ptr, A[A_ptr], B[B_ptr])) 49 | else: 50 | break 51 | # print('aligned: {}, {}, {}, {} '.format(A_ptr, B_ptr, A[A_ptr], B[B_ptr])) 52 | # capture the overlaps in B until B start is no longer upstream of A end 53 | if B[B_ptr][1] <= A[A_ptr][2]: 54 | while B[B_ptr][1] <= A[A_ptr][2]: 55 | overlap_start = max(A[A_ptr][1], B[B_ptr][1]) 56 | overlap_end = min(A[A_ptr][2], B[B_ptr][2]) 57 | # print('grabbing: {}, {}, {}, {} '.format(A_ptr, B_ptr, A[A_ptr], B[B_ptr])) 58 | if overlap_end - overlap_start >= min_overlap: 59 | overlaps.append((A[A_ptr][0], B[B_ptr][0], 60 | overlap_start, overlap_end)) 61 | if B_ptr < len(B) - 1: 62 | B_ptr += 1 63 | # print('\tadvanced B: {}, {}, {}, {} '.format(A_ptr, B_ptr, A[A_ptr], B[B_ptr])) 64 | else: 65 | break 66 | # A_ptr += 1 67 | B_ptr -= 1 68 | return overlaps 69 | 70 | 71 | def test(): 72 | test_A = [('A_1', 4, 8), ('A_2', 11, 19)] 73 | test_B = [('B_1', 1, 6), ('B_2', 9, 10), ('B_3', 13, 15), ('B_4', 17, 20)] 74 | print(compute_interval_overlaps(test_A, test_B)) 75 | -------------------------------------------------------------------------------- /pygbrowse/plots.py: -------------------------------------------------------------------------------- 1 | import intervaltree 2 
import matplotlib
import matplotlib.pyplot as plt
import numpy
import pandas
import scipy
import scipy.signal
from scipy import ndimage
import seaborn

from . import utilities

DEFAULT_ARC_POINTS = 200
DEFAULT_YLABEL_PAD = 50
CHROMOSOME_DIALECT = 'ucsc'


# ToDo: Move this stuff to a separate module so we can delete wholesale when the Ellipse class gets fixed.
def compute_half_arc_points(center, a, b, theta1, theta2, num_points=DEFAULT_ARC_POINTS):
    """
    Compute the coordinates for component points of a polygonal approximation to
    an ellipse for a single quadrant.

    :param center: (x, y) center of the ellipse
    :param a: horizontal semi-axis
    :param b: vertical semi-axis
    :param theta1: starting angle (radians)
    :param theta2: ending angle (radians)
    :param num_points: number of points to generate
    :return: (x_coords, y_coords) as numpy arrays of length num_points
    """
    # ToDo: Add input validation to make sure we stay within a single quadrant.
    x_coords = numpy.empty(num_points)
    y_coords = numpy.empty(num_points)

    for i in range(num_points):
        # Interpolate theta linearly between theta1 and theta2.
        # max(..., 1) avoids division by zero when num_points == 1.
        theta = (theta2 - theta1) * (i / max(num_points - 1, 1)) + theta1
        # Remap the parameter so points are spaced by angle rather than by x.
        fi = numpy.pi / 2 - numpy.arctan(numpy.tan(theta))
        x_coords[i] = center[0] + a * numpy.cos(fi)
        y_coords[i] = center[1] + b * numpy.sin(fi)

    return x_coords, y_coords


def draw_arc(ax, center, height, width, theta1=0, theta2=numpy.pi, color='k', direction='down',
             num_points=DEFAULT_ARC_POINTS):
    """
    Since Matplotlib's Arc Patches are broken at the moment, we draw arcs using the ax.plot() method
    instead.

    The arc is an elliptical segment between angles theta1 and theta2 (which must lie
    between pi/2 and 3*pi/2, left to right), centered horizontally at ``center`` and
    rendered either above ('up') or below ('down') the vertical baseline center[1].
    """
    LEFT_END_THETA = numpy.pi / 2
    RIGHT_END_THETA = numpy.pi * 1.5
    MIDPOINT_THETA = numpy.pi

    vertical_baseline = center[1]

    assert LEFT_END_THETA <= theta1 <= theta2 <= RIGHT_END_THETA

    b = height
    a = width / 2

    # determine how to allocate points between the left and right quadrants
    left_angle_span = min(max(MIDPOINT_THETA - theta1, 0), theta2 - theta1)
    right_angle_span = min(max(theta2 - MIDPOINT_THETA, 0), theta2 - theta1)
    total_angle_span = left_angle_span + right_angle_span

    if total_angle_span == 0:
        # Degenerate arc (theta1 == theta2): nothing visible to draw.
        # Previously this fell through to a ZeroDivisionError below.
        return

    left_points = int(num_points * left_angle_span / total_angle_span)
    right_points = num_points - left_points

    x_coords = numpy.empty(num_points)
    y_coords = numpy.empty(num_points)

    if left_points:
        # plot upper left quadrant
        left_theta2 = theta1 + left_angle_span
        x, y = compute_half_arc_points(center=(center[0], 0),
                                       a=a, b=b,
                                       theta1=theta1, theta2=left_theta2,
                                       num_points=left_points)
        x_coords[:left_points] = x[:]
        y_coords[:left_points] = y[:]
    if right_points:
        # plot upper right quadrant
        right_theta1 = theta2 - right_angle_span
        x, y = compute_half_arc_points(center=(center[0], 0),
                                       a=a, b=b,
                                       theta1=right_theta1, theta2=theta2,
                                       num_points=right_points)
        x_coords[left_points:] = x[:]
        y_coords[left_points:] = y[:]

    if direction == 'down':
        y_coords = - y_coords

    y_coords += vertical_baseline

    ax.plot(x_coords, y_coords, color=color)


def draw_visible_arc(ax, center, height, width, ws, we,
                     color='k', direction='down', num_points=DEFAULT_ARC_POINTS):
    """
    Draws a 180 degree elliptical arc truncated by an interval of x coordinates.
    Does not truncate based on y coordinates.

    :param ws: left edge of the visible window (data coordinates)
    :param we: right edge of the visible window (data coordinates)
    """
    # ToDo: Subtract 1 pi from all coordinates
    # Epsilon keeps the endpoints strictly inside the valid angular range of draw_arc().
    LEFT_END_THETA = numpy.pi / 2 + 0.00001
    RIGHT_END_THETA = numpy.pi * 1.5 - 0.00001

    def infer_theta_cutoff(x, arc_center, arc_width):
        # Invert the point parameterization used in compute_half_arc_points()
        # to find the angle at which the arc crosses vertical line x.
        a = arc_width / 2
        fi = numpy.arccos((x - arc_center[0]) / a)
        theta = numpy.arctan(1 / numpy.tan(fi)) + numpy.pi
        return theta

    if ws > center[0] - width / 2:
        theta_start = infer_theta_cutoff(x=ws, arc_center=center, arc_width=width)
    else:
        theta_start = LEFT_END_THETA

    if we < center[0] + width / 2:
        theta_end = infer_theta_cutoff(x=we, arc_center=center, arc_width=width)
    else:
        theta_end = RIGHT_END_THETA

    draw_arc(ax=ax, center=center,
             height=height, width=width,
             theta1=theta_start, theta2=theta_end,
             color=color, direction=direction,
             num_points=num_points)


def draw_arc_interaction(ax,
                         xlim, ylim,
                         left_bin_center,
                         right_bin_center,
                         color='k',
                         baseline=0.0,
                         vertical_scaling_factor=1,
                         direction='down',
                         num_points=DEFAULT_ARC_POINTS):
    """
    Draw a single interaction arc between two anchor midpoints, clipped to ``xlim``.

    The arc height is proportional to the arc's width relative to the visible
    window, scaled by ``vertical_scaling_factor``.
    """
    arc_width = right_bin_center - left_bin_center

    if direction == 'down':
        vertical_span = baseline - ylim[0]
    else:
        vertical_span = ylim[1] - baseline

    arc_height = vertical_span * (arc_width / (xlim[1] - xlim[0])) * vertical_scaling_factor

    draw_visible_arc(ax=ax,
                     center=((left_bin_center + right_bin_center) / 2, baseline),
                     height=arc_height,
                     ws=xlim[0],
                     we=xlim[1],
                     width=arc_width,
                     color=color,
                     direction=direction,
                     num_points=num_points)
class _BrowserSubPlot:
    """
    Base class for genome-browser track renderers.

    Subclasses implement plot() to draw themselves onto a provided Axes for
    a given genomic window.
    """

    def __init__(self):
        pass

    def plot(self, ax, chrom, ws, we, fig_width, row_height):
        # Stub implementation; concrete subclasses draw their track here.
        print('Stub method -- must be overridden by inheritors')


class InteractionPlot(_BrowserSubPlot):
    """
    Draws pairwise genomic interactions (e.g. Plac-seq contacts) as arcs
    connecting the midpoints of the two anchor bins.
    """

    def __init__(self, interaction_df,
                 bin_size,
                 arc_color=(0.7, 0.3, 0.6),
                 direction='down',
                 baseline=None,
                 vertical_scaling_factor=1,
                 thickness_column=None,
                 show_bin_centers=True,
                 label='Plac-seq'):
        """
        :param interaction_df: DataFrame with columns chr1, start1, end1, chr2, start2, end2
        :param bin_size: size of the interaction bins (bp), used for tick placement
        :param arc_color: color for the interaction arcs
        :param direction: 'down' to hang arcs below the baseline, otherwise above
        :param baseline: y position the arcs spring from (defaults to 1 for 'down', else 0)
        :param vertical_scaling_factor: multiplier applied to arc heights
        :param thickness_column: reserved; currently stored but not used by plot()
        :param show_bin_centers: if True, draw x ticks at bin centers
        :param label: y-axis label
        """
        super(InteractionPlot, self).__init__()

        self.interaction_df = interaction_df
        self.bin_size = bin_size
        if baseline is None:
            baseline = 1 if direction == 'down' else 0
        self.baseline = baseline
        self.vertical_scaling_factor = vertical_scaling_factor
        self.arc_color = arc_color
        self.label = label
        self.direction = direction
        self.show_bin_centers = show_bin_centers
        self.thickness_column = thickness_column

    def plot(self, ax, chrom, ws, we, fig_width, row_height):
        # Filter the interaction DataFrame to interactions with at least one anchor point within the visible window.
        visible_interactions = self.interaction_df.loc[self.interaction_df['chr1'] == chrom]
        left_bin_midpoints = (visible_interactions['end1'] + visible_interactions['start1']) / 2
        right_bin_midpoints = (visible_interactions['end2'] + visible_interactions['start2']) / 2
        left_visible = (left_bin_midpoints >= ws) & (left_bin_midpoints <= we)
        right_visible = (right_bin_midpoints >= ws) & (right_bin_midpoints <= we)
        visible_interactions = visible_interactions.loc[left_visible | right_visible]

        original_ylim = ax.get_ylim()
        ax.set_xlim(ws, we)  # ToDo: Standardize this behavior across all subplot classes

        for interaction_id in visible_interactions.index:
            draw_arc_interaction(ax,
                                 left_bin_center=left_bin_midpoints.loc[interaction_id],
                                 right_bin_center=right_bin_midpoints.loc[interaction_id],
                                 xlim=(ws, we),
                                 ylim=original_ylim,
                                 color=self.arc_color,
                                 baseline=self.baseline,
                                 vertical_scaling_factor=self.vertical_scaling_factor,
                                 direction=self.direction)

        # Restore limits in case arc drawing autoscaled them.
        ax.set_xlim(ws, we)
        ax.set_ylim(original_ylim)

        if self.label:
            ax.set_ylabel(self.label)

        if self.show_bin_centers:
            # Snap tick positions to bin centers covering the visible window.
            leftmost_tick = numpy.ceil((ws - self.bin_size / 2) / self.bin_size) * self.bin_size + self.bin_size / 2
            rightmost_tick = numpy.floor(
                (we - self.bin_size / 2) / self.bin_size + 1) * self.bin_size + self.bin_size / 2

            ax.set_xticks(numpy.arange(leftmost_tick, rightmost_tick, self.bin_size))
            ax.set_xticklabels([])

            if self.direction == 'down':
                ax.xaxis.set_ticks_position('top')

        ax.set_yticks([])


class BedPlot(_BrowserSubPlot):
    """
    Draws a track of genomic intervals (BED-style) as rectangles, optionally
    colored by a numeric column and/or annotated with a displayed value.
    """

    DEFAULT_PATCH_KWARGS = {'linewidth': 1, 'edgecolor': 'k'}
    CHROM_COL_NUM = 0
    START_COL_NUM = 1
    END_COL_NUM = 2

    def __init__(self, interval_data,
                 label='',
                 cmap='RdBu_r',
                 baseline=0.5,
                 color='k',
                 color_by='',
                 display_value='',
                 patch_height=0.5,
                 pad_fraction=0.1,
                 patch_kwargs=None):
        """
        :param interval_data: an object exposing a ``.data`` DataFrame with at least
            chrom / chromStart / chromEnd columns (BED-style)
        :param label: y-axis tick label for the track
        :param cmap: colormap name used when ``color_by`` is set
        :param baseline: vertical center of the interval rectangles
        :param color: fill color used when ``color_by`` is not set
        :param color_by: optional numeric column name; intervals are colored by its
            value through a symmetric normalization around zero
        :param display_value: optional column name whose value is printed at the
            center of each interval
        :param patch_height: height of each interval rectangle (axes data units)
        :param pad_fraction: padding fraction passed to utilities.adjust_limits
        :param patch_kwargs: extra keyword arguments for the Rectangle patches
        """
        super(BedPlot, self).__init__()

        self.interval_data = interval_data.data
        self.color = color
        self.pad_fraction = pad_fraction

        self.color_by = color_by
        if self.color_by:
            assert self.color_by in self.interval_data.columns, 'Color-by column {} is not in the interval data!'.format(
                self.color_by)
            # Symmetric color scale centered on zero.
            extent = numpy.abs(self.interval_data[self.color_by]).max()

            self.color_mapper = matplotlib.cm.ScalarMappable(norm=matplotlib.colors.Normalize(vmin=-extent,
                                                                                              vmax=extent),
                                                             cmap=cmap)
        else:
            self.color_mapper = None

        self.display_value = display_value
        if self.display_value:
            assert self.display_value in self.interval_data.columns, 'Display value column {} is not in the interval data!'.format(
                self.display_value)
        self.label = label
        self.patch_height = patch_height

        self.baseline = baseline
        # Copy the class-level defaults before updating: assigning the class dict
        # directly and then calling .update() would mutate DEFAULT_PATCH_KWARGS
        # for every instance of BedPlot.
        self.patch_kwargs = dict(self.DEFAULT_PATCH_KWARGS)
        if patch_kwargs:
            self.patch_kwargs.update(patch_kwargs)

    def plot(self, ax, chrom, ws, we, fig_width, row_height):
        utilities.add_label(ax=ax, tick=self.baseline, tick_label=self.label, axis='y')

        # Keep intervals with either endpoint inside the visible window.
        visible_intervals = self.interval_data.loc[(self.interval_data.chrom == chrom) & (
                ((ws <= self.interval_data.chromStart) & (self.interval_data.chromStart <= we)) | (
                (ws <= self.interval_data.chromEnd) & (self.interval_data.chromEnd <= we)))]

        for interval_idx in range(visible_intervals.shape[0]):
            this_interval = visible_intervals.iloc[interval_idx]
            start_loc = this_interval['chromStart']
            end_loc = this_interval['chromEnd']
            assert end_loc > start_loc, 'interval end point must be greater than interval start!'

            if self.color_by:
                interval_color = self.color_mapper.to_rgba(this_interval[self.color_by])
            else:
                interval_color = self.color

            rec = matplotlib.patches.Rectangle(xy=(start_loc, self.baseline - self.patch_height / 2),
                                               width=end_loc - start_loc,
                                               height=self.patch_height,
                                               facecolor=interval_color,
                                               **self.patch_kwargs)
            ax.add_patch(rec)
            if self.display_value:
                ax.text(x=(start_loc + end_loc) / 2, y=self.baseline,
                        s='{:>0.2}'.format(this_interval[self.display_value]), ha='center')

        # Expand the y limits (with padding) so the full patch height is visible.
        utilities.adjust_limits(ax=ax, new_position=self.baseline + self.patch_height / 2,
                                axis='y', padding_fraction=self.pad_fraction)
        utilities.adjust_limits(ax=ax, new_position=self.baseline - self.patch_height / 2,
                                axis='y', padding_fraction=self.pad_fraction)
class WigPlot(_BrowserSubPlot):
    """
    Draws a continuous genomic signal (wiggle-style) as a filled or line plot.
    """
    # ToDo: Add support for stranded data

    def __init__(self, genomic_vector_data, label=None, color=None, solid=True, alpha=1.0,
                 center_vector=False, scale_vector_to_plot=False,
                 label_rotation=0,
                 smoothing_bandwidth=0):
        """
        :param genomic_vector_data: mapping of chromosome name -> pandas Series of
            signal values indexed by genomic position (assumed from usage in plot();
            confirm against the data source)
        :param label: y-axis label (also used as the plotted series name)
        :param color: line / fill color
        :param solid: if True, fill under the curve; otherwise draw a line
        :param alpha: alpha transparency for the fill or line
        :param center_vector: if True, subtract the mean and recenter on the axes midline
        :param scale_vector_to_plot: if True, rescale the vector's range to the axes span
        :param label_rotation: rotation (degrees) for the y-axis label
        :param smoothing_bandwidth: if nonzero, smooth the signal with a Gaussian
            kernel of this bandwidth
        """
        super(WigPlot, self).__init__()  # placeholder since currently the superclass constructor does nothing.
        self.data = genomic_vector_data
        self.color = color
        self.solid = solid
        self.alpha = alpha
        self.center = center_vector
        self.scale_vector_to_plot = scale_vector_to_plot
        self.label = label

        if smoothing_bandwidth:
            self.convolution_kernel = utilities.gaussian_kernel(smoothing_bandwidth)
        else:
            self.convolution_kernel = None

        self.label_rotation = label_rotation

    def plot(self, ax, chrom, ws, we, fig_width, row_height):
        ylim = ax.get_ylim()

        vert_span = (ylim[1] - ylim[0])
        vert_center = vert_span / 2 + ylim[0]

        this_plot_vector = self.data[chrom].loc[ws:we]

        if self.convolution_kernel is not None:
            # Smooth while preserving the original positional index.
            this_plot_vector = pandas.Series(
                scipy.signal.convolve(this_plot_vector, self.convolution_kernel, mode='same'),
                index=this_plot_vector.index)

        if self.scale_vector_to_plot:
            # NOTE(review): a constant (flat) vector makes this a division by zero —
            # confirm upstream data can never be flat in the visible window.
            this_plot_vector /= (this_plot_vector.max() - this_plot_vector.min())
            this_plot_vector *= vert_span

        if self.center:
            this_plot_vector -= this_plot_vector.mean()
            this_plot_vector += vert_center

        # Re-trim: smoothing/scaling may have been computed over a slightly wider slice.
        this_plot_vector = this_plot_vector.loc[(this_plot_vector.index >= ws) & (this_plot_vector.index < we)]
        this_plot_vector.name = self.label

        if self.solid:
            ax.fill_between(x=this_plot_vector.index, y1=this_plot_vector, color=self.color, alpha=self.alpha,
                            label=self.label)
        else:
            ax.plot(this_plot_vector.index, this_plot_vector, color=self.color, alpha=self.alpha, label=self.label)

        ax.autoscale(enable=True, axis='y')

        # ToDo: Allow labeling either by ylabel or by ax.legend
        if self.label:
            ax.set_ylabel(self.label, rotation=self.label_rotation, labelpad=DEFAULT_YLABEL_PAD)


class GeneModelPlot(_BrowserSubPlot):
    """
    Draws gene models (gene body line with strand chevrons, UTRs, CDS blocks and
    truncation arrows) from a gene annotation data source.
    """

    def __init__(self,
                 gene_annotation_data,
                 label='Genes',
                 color='k',
                 feature_height=0.12,
                 chevron_height=0.05,
                 chevron_width=0.04,
                 chevron_spacing=0.10,
                 truncation_size=0.10,
                 utr_endcap_width=0.04,
                 gene_name_fontsize=8,
                 genes_include=None):
        """
        :param gene_annotation_data: object exposing .query(chrom, ws, we) returning
            (genes, transcripts, components, ids_to_names)
        :param label: y-axis label
        :param color: color for gene features
        :param feature_height: in inches
        :param chevron_height: in inches
        :param chevron_width: in inches
        :param chevron_spacing: in inches
        :param truncation_size: in inches
        :param utr_endcap_width: in inches
        :param gene_name_fontsize: font size for gene name labels
        :param genes_include: optional list of gene names to restrict the plot to
            (empty / None means plot all genes)
        """
        super(GeneModelPlot, self).__init__()

        self.gene_annotation_data = gene_annotation_data

        self.color = color
        self.label = label
        self.feature_height = feature_height  # in inches
        self.chevron_height = chevron_height  # in inches
        self.chevron_width = chevron_width  # in inches
        self.chevron_spacing = chevron_spacing  # in inches
        self.truncation_size = truncation_size  # in inches
        self.utr_endcap_width = utr_endcap_width  # in inches
        self.gene_name_fontsize = gene_name_fontsize
        # Default was a mutable ([]) argument shared across instances; use None sentinel
        # and copy so caller-supplied lists are never mutated or aliased.
        self.genes_include = list(genes_include) if genes_include else []

    @staticmethod
    def _arrange_genes(gene_data_list):
        """
        Given an iterable of gene data dictionaries,
        returns a list of lists of gene names that
        should be displayed at various levels.

        Longer genes are placed first; each gene goes on the first level where it
        does not overlap an already-placed gene.
        """
        # NOTE(review): relies on a module-level `intervaltree` import that is not
        # visible in this chunk -- confirm it is present at the top of the file.
        gene_data_list = sorted(gene_data_list, key=lambda x: x['end'] - x['start'], reverse=True)

        display_levels = [intervaltree.IntervalTree(), ]

        for gene_data in gene_data_list:
            found_home = False
            level_idx = 0
            while not found_home:
                if level_idx >= len(display_levels):
                    display_levels.append(intervaltree.IntervalTree())
                if display_levels[level_idx].overlaps(gene_data['start'], gene_data['end']):
                    level_idx += 1
                else:
                    display_levels[level_idx].addi(gene_data['start'], gene_data['end'], data=gene_data)
                    found_home = True

        return [[gene_interval.data['ID'] for gene_interval in this_level] for this_level in display_levels]

    def _draw_utr(self, ax, component_data, gene_num, feature_height_dt, utr_endcap_width_dt, endcap_at_end):
        """
        Draw one UTR: a half-height "body" rectangle plus a double-height "endcap"
        rectangle at one edge.

        :param endcap_at_end: if True the endcap sits at the component's end
            coordinate, otherwise at its start coordinate.
        """
        body_width = max(component_data['end'] - component_data['start'] - utr_endcap_width_dt, 0)
        if endcap_at_end:
            body_x = component_data['start']
            endcap_x = component_data['end'] - utr_endcap_width_dt
        else:
            body_x = component_data['start'] + utr_endcap_width_dt
            endcap_x = component_data['start']

        utr_body = matplotlib.patches.Rectangle(xy=(body_x, gene_num - feature_height_dt / 2),
                                                width=body_width,
                                                height=feature_height_dt,
                                                facecolor=self.color)
        utr_endcap = matplotlib.patches.Rectangle(xy=(endcap_x, gene_num - feature_height_dt),
                                                  width=utr_endcap_width_dt,
                                                  height=feature_height_dt * 2,
                                                  facecolor=self.color)
        ax.add_patch(utr_body)
        ax.add_patch(utr_endcap)

    def plot(self, ax, chrom, ws, we, fig_width, row_height):
        # find overlapping genes
        overlapping_genes, overlapping_transcripts, overlapping_components, ids_to_names = \
            self.gene_annotation_data.query(chrom, ws, we)

        gene_display_levels = self._arrange_genes(overlapping_genes.values())
        ax.set_ylim((-0.5, len(gene_display_levels) - 1 + 0.5))

        # convert inches to data coordinates
        chevron_spacing_dt = (we - ws) / (fig_width / self.chevron_spacing)
        chevron_width_dt = (we - ws) / (fig_width / self.chevron_width)
        truncation_width_dt = (we - ws) / (fig_width / self.truncation_size)
        utr_endcap_width_dt = (we - ws) / (fig_width / self.utr_endcap_width)

        feature_height_dt = (ax.get_ylim()[1] - ax.get_ylim()[0]) / (row_height / self.feature_height)
        chevron_height_dt = (ax.get_ylim()[1] - ax.get_ylim()[0]) / (row_height / self.chevron_height)
        truncation_height_dt = (ax.get_ylim()[1] - ax.get_ylim()[0]) / (row_height / self.truncation_size)

        for gene_num, level_genes in enumerate(gene_display_levels):
            # ToDo: make this universal. Divide the gene body into non-overlapping segments,
            # each type of which has a template.
            for gene_id in level_genes:
                gene_data = overlapping_genes[gene_id]
                # Optional whitelist filter (substring match against requested names).
                if len(self.genes_include) > 0 and (not any(gene_data['Name'] in s for s in self.genes_include)):
                    continue

                left_truncated = gene_data['start'] < ws
                right_truncated = gene_data['end'] > we

                visible_gene_start = max(gene_data['start'], ws)
                if left_truncated:
                    # Leave room for the double truncation arrows.
                    visible_gene_start += truncation_width_dt * 2
                visible_gene_end = min(gene_data['end'], we)
                if right_truncated:
                    visible_gene_end -= truncation_width_dt * 2

                # Gene body line and name label.
                ax.plot((visible_gene_start, visible_gene_end), (gene_num, gene_num), color=self.color)
                ax.text(x=(visible_gene_start + visible_gene_end) / 2,
                        y=gene_num + feature_height_dt * 1.5,
                        s=gene_data['Name'],
                        ha='center',
                        fontsize=self.gene_name_fontsize)

                # Strand-direction chevrons, centered along the visible gene body.
                num_chevrons = int(max((visible_gene_end - visible_gene_start) / chevron_spacing_dt, 1))
                chevron_remainder = (visible_gene_end - visible_gene_start) - (num_chevrons - 1) * chevron_spacing_dt

                if gene_data['strand'] == '+':
                    chevron_x_delta = -chevron_width_dt
                else:
                    chevron_x_delta = chevron_width_dt

                for chevron_idx in range(num_chevrons):
                    chevron_x = visible_gene_start + chevron_idx * chevron_spacing_dt + chevron_remainder / 2

                    ax.plot((chevron_x, chevron_x + chevron_x_delta), (gene_num, gene_num + chevron_height_dt),
                            color=self.color)
                    ax.plot((chevron_x, chevron_x + chevron_x_delta), (gene_num, gene_num - chevron_height_dt),
                            color=self.color)

                if left_truncated:
                    # Two left-pointing open triangles marking an off-screen gene start.
                    y_points = [gene_num, gene_num - truncation_height_dt, gene_num + truncation_height_dt]
                    left_x_point = ws + 1
                    right_x_point = ws + truncation_width_dt + 1

                    x_points = numpy.array([left_x_point, right_x_point, right_x_point])

                    larr1 = matplotlib.patches.Polygon(numpy.vstack([x_points, y_points]).T,
                                                       edgecolor='k',
                                                       facecolor='w',
                                                       fill=True,
                                                       transform=ax.transData,
                                                       zorder=3)
                    larr2 = matplotlib.patches.Polygon(numpy.vstack([x_points + truncation_width_dt, y_points]).T,
                                                       edgecolor='k',
                                                       facecolor='w',
                                                       fill=True,
                                                       transform=ax.transData,
                                                       zorder=3)

                    ax.add_patch(larr1)
                    ax.add_patch(larr2)

                if right_truncated:
                    # Two right-pointing open triangles marking an off-screen gene end.
                    y_points = [gene_num, gene_num - truncation_height_dt, gene_num + truncation_height_dt]
                    left_x_point = we - truncation_width_dt - 1
                    right_x_point = we - 1

                    x_points = numpy.array([right_x_point, left_x_point, left_x_point])

                    rarr1 = matplotlib.patches.Polygon(xy=numpy.vstack([x_points, y_points]).T,
                                                       edgecolor='k',
                                                       facecolor='w',
                                                       fill=True,
                                                       transform=ax.transData,
                                                       zorder=3)
                    rarr2 = matplotlib.patches.Polygon(numpy.vstack([x_points - truncation_width_dt, y_points]).T,
                                                       edgecolor='k',
                                                       facecolor='w',
                                                       fill=True,
                                                       transform=ax.transData,
                                                       zorder=3)
                    ax.add_patch(rarr1)
                    ax.add_patch(rarr2)

                # plot components (UTRs and CDS blocks) overlapping the visible gene body
                for component_id in overlapping_components:
                    component_data = overlapping_components[component_id]
                    if not (((component_data['start'] >= visible_gene_start) and (
                            component_data['start'] <= visible_gene_end)) or (
                                    (component_data['end'] >= visible_gene_start) and (
                                    component_data['end'] <= visible_gene_end))):
                        continue

                    if component_data['type'] == 'five_prime_UTR':
                        # 5' UTR: endcap sits at the transcription-proximal edge
                        # (component end on '+' strand, component start on '-').
                        self._draw_utr(ax, component_data, gene_num, feature_height_dt, utr_endcap_width_dt,
                                       endcap_at_end=(gene_data['strand'] == '+'))

                    elif component_data['type'] == 'three_prime_UTR':
                        # 3' UTR mirrors the 5' UTR orientation. This also fixes a bug
                        # where the '+'-strand branch sized the body with the raw
                        # inches value (self.utr_endcap_width) instead of the
                        # data-coordinate width (utr_endcap_width_dt).
                        self._draw_utr(ax, component_data, gene_num, feature_height_dt, utr_endcap_width_dt,
                                       endcap_at_end=(gene_data['strand'] == '-'))

                    elif component_data['type'] == 'CDS':
                        cds = matplotlib.patches.Rectangle(
                            xy=(component_data['start'], gene_num - feature_height_dt),
                            width=component_data['end'] - component_data['start'],
                            height=feature_height_dt * 2,
                            facecolor=self.color)
                        ax.add_patch(cds)

        ax.set_yticks([])
        ax.set_ylabel(self.label)


def compute_ax_row_positions(row_heights, ax_spacing=0.1):
    """
    Given a sequence of row heights (in inches), and the size of the space to put between
    axes (in fractions of total row height), returns a list of bottom coordinates
    and heights for each row, suitable for passing to the fig.add_ax() method.

    :param row_heights: iterable of per-row heights in inches
    :param ax_spacing: inter-axes gap as a fraction of figure height
    :return: (bottoms, heights) lists in figure-fraction coordinates, top to bottom
    """
    bottoms = []
    heights = []
    total_canvas_height = numpy.sum(row_heights)
    fig_height = total_canvas_height * (1 + ax_spacing * len(row_heights))
    cur_vertical_pos = 1
    for row_idx in range(len(row_heights)):
        this_row_height = row_heights[row_idx] / fig_height
        cur_vertical_pos -= this_row_height
        heights.append(this_row_height)
        bottoms.append(cur_vertical_pos)
        cur_vertical_pos -= ax_spacing
    return bottoms, heights
class GenomeBrowser:
    """
    Composes a stack of genomic track subplots into a single browser-style figure.
    """

    VECTOR_LEGEND_LOC = 0

    def __init__(self, subplot_objects):
        """
        Given a 2D nested list of genomic subplot objects it will allow the user to call the .visualize() method
        to generate plots of genomic data.

        :param subplot_objects: 2D nested list; each inner list holds the subplot
            objects that share one axes row (each must implement .plot()).
        """
        self.subplot_objects = subplot_objects

    def visualize(self, chrom, start, end,
                  fig_width=12,
                  row_heights=1,
                  ax_spacing=0.05,
                  num_xticks=10,
                  seaborn_style=seaborn.axes_style(style='ticks',
                                                   rc={'axes.edgecolor': 'w', 'axes.facecolor': '#EAEAF2'})):
        """
        Generate, display and return a matplotlib.Figure object comprising one or more Axes representing the genomic
        data tracks specified at initialization.

        The region to plot is specified by the parameters chrom, start, and end.

        :param:`fig_width` is specified in inches

        :param:`row_heights` can be specified as a scalar value (in inches), in which case the same row height will be
        used for all subplots, or as an iterable, in which case the row heights will be applied to the subplots
        in order.

        NOTE(review): the seaborn_style default is evaluated once at import time;
        confirm seaborn.set_style does not mutate it between calls.

        :param chrom: chromosome name
        :param start: window start coordinate (bp)
        :param end: window end coordinate (bp)
        :param fig_width: figure width in inches
        :param row_heights: scalar or iterable of per-row heights in inches
        :param ax_spacing: inter-axes gap as a fraction of figure height
        :param num_xticks: approximate number of x-axis ticks on the bottom row
        :param seaborn_style: style dict passed to seaborn.set_style
        :return: the generated matplotlib Figure
        """
        # ToDo: Add gene (or other feature) lookup instead of specifying coordinates.
        start, end = int(start), int(end)

        assert end > start, 'Window end must be greater than window start! Got: {}, {}'.format(start, end)

        # if we receive a scalar here, use it as the height for all rows
        try:
            if len(row_heights) == 1:
                row_heights = row_heights * len(self.subplot_objects)  # treat as a uniform row height
        except TypeError:
            row_heights = [row_heights] * len(self.subplot_objects)  # treat as a uniform row height

        assert len(row_heights) == len(self.subplot_objects)

        # Choose a "round" tick increment near span / num_xticks.
        span = end - start
        xtick_increment = span / num_xticks
        rounding_increment = 5 * 10 ** numpy.round(numpy.log10(xtick_increment) - 1)
        xtick_increment = utilities.roundto(xtick_increment, rounding_increment)
        num_ticks = int(span / xtick_increment) + 1
        round_start = utilities.roundto(start, rounding_increment)

        seaborn.set_style(seaborn_style)

        fig = plt.figure(len(self.subplot_objects),
                         figsize=(fig_width, numpy.sum(row_heights) * (1 + ax_spacing * len(self.subplot_objects))))
        bottoms, heights = compute_ax_row_positions(row_heights=row_heights, ax_spacing=ax_spacing)

        for ax_idx in range(len(self.subplot_objects)):
            this_ax = fig.add_axes([0, bottoms[ax_idx], 1, heights[ax_idx]])

            if ax_idx == len(self.subplot_objects) - 1:
                # Bottom row carries the shared position axis.
                this_ax.set_xticks(numpy.arange(num_ticks) * xtick_increment + round_start)
                this_ax.ticklabel_format(axis='x', style='sci', scilimits=(-2, 2))
                this_ax.set_xlabel('{} position'.format(chrom))
            else:  # clear out xticks but plot objects can override this later
                this_ax.set_xlabel('')
                this_ax.set_xticks([])

            plot_object_subset = self.subplot_objects[ax_idx]

            # Set default plot limits (can be changed by client objects)
            this_ax.set_ylim(0, 1)
            this_ax.set_xlim(start, end)

            for plot_object in plot_object_subset:
                plot_object.plot(this_ax, chrom=chrom, ws=start, we=end, fig_width=fig_width,
                                 row_height=row_heights[ax_idx])

        # ToDo: Refactor legend code to get colors and names from objects not from axes handles.
        # if len(this_ax.get_legend_handles_labels()[1]):
        #     this_ax.legend(loc=self.VECTOR_LEGEND_LOC)

        return fig


class HicPlot:
    """
    Draws a Hi-C style contact matrix as a 45-degree-rotated triangle heatmap
    above the genome axis.
    """

    def __init__(self, data, label='', vertical_scale=1, cmap='YlOrRd', label_rotation=0,
                 transform=lambda x: x ** 2, max_masked_diag=2):
        """
        :param data: object exposing .bin_size and .query(chrom, start, end) -> DataFrame
        :param label: y-axis label
        :param vertical_scale: stored for vertical scaling (not used in plot() as written)
        :param cmap: colormap name for the heatmap
        :param transform: elementwise transform applied to the matrix before display
        :param max_masked_diag: number of diagonals (from the main) to zero out
        """
        self.data = data
        self.cmap = cmap
        self.transform = transform  # ToDo: Move the transform to the Data provider
        self.max_masked_diag = max_masked_diag
        self.label = label
        self.label_rotation = label_rotation
        self.vertical_scale = vertical_scale

    def plot(self, ax, chrom, ws, we, fig_width, row_height):
        visible_start_bin = utilities.roundto(ws, self.data.bin_size)
        visible_end_bin = utilities.roundto(we, self.data.bin_size)
        visible_span = visible_end_bin - visible_start_bin

        # Query extra flanking data so the rotated matrix fully covers the window.
        data_start_bin = visible_start_bin - visible_span // 2
        data_end_bin = visible_end_bin + visible_span // 2

        plot_data = self.data.query(chrom, data_start_bin, data_end_bin)

        # Zero out the strongest near-diagonal entries so they don't dominate the color scale.
        # NOTE(review): this writes into the DataFrame returned by query() -- confirm the
        # data source returns a fresh copy each call.
        for diag in range(self.max_masked_diag):
            plot_data.values[utilities.diag_indices(plot_data.shape[0], diag)] = 0

        plot_data = pandas.DataFrame(ndimage.rotate(plot_data, 45, reshape=False),
                                     index=plot_data.index, columns=plot_data.columns)

        # Trim back to visible area
        plot_data = plot_data.loc[visible_start_bin:visible_end_bin, visible_start_bin:visible_end_bin]

        # Only show upper diagonal
        plot_data = plot_data.iloc[:-plot_data.shape[0] // 2, :]

        plot_data = self.transform(plot_data)

        # Re-index plot_data to allow it to play nicely with ax limits
        ax.set_ylim(0, plot_data.shape[1])
        ax.imshow(plot_data, cmap=self.cmap, aspect='auto', extent=(ws, we, 0, plot_data.shape[1]))
        ax.set_xticks([])
        ax.set_ylabel(self.label, rotation=self.label_rotation, labelpad=DEFAULT_YLABEL_PAD)
        return plot_data


def match_ylims(fig, ax_nums):
    """
    Will make the upper ylim of each of the numbered axes of :param fig: listed in
    :param ax_nums: equal to the maximum found in any of the numbered axes.
    """
    max_extent = max([fig.get_axes()[ax_num].get_ylim()[1] for ax_num in ax_nums])
    for ax_num in ax_nums:
        fig.get_axes()[ax_num].set_ylim((0, max_extent))
def int_to_roman(input):
    """
    Convert an integer to Roman numerals.

    Examples:
    >>> int_to_roman(0)
    Traceback (most recent call last):
    ...
    ValueError: Argument must be between 1 and 3999

    >>> int_to_roman(-1)
    Traceback (most recent call last):
    ...
    ValueError: Argument must be between 1 and 3999

    >>> int_to_roman(1.5)
    Traceback (most recent call last):
    ...
    TypeError: expected integer, got <class 'float'>

    >>> [int_to_roman(i) for i in range(1, 11)]
    ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X']

    >>> int_to_roman(2000)
    'MM'
    >>> int_to_roman(1999)
    'MCMXCIX'
    """
    # Exact type check (not isinstance) preserves the original behavior of
    # rejecting bools and int subclasses.
    if type(input) is not int:
        raise TypeError("expected integer, got %s" % type(input))
    if not 0 < input < 4000:
        raise ValueError("Argument must be between 1 and 3999")
    ints = (1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1)
    nums = ('M', 'CM', 'D', 'CD', 'C', 'XC', 'L', 'XL', 'X', 'IX', 'V', 'IV', 'I')
    result = ""
    # Greedy subtraction, largest value first.
    for value, numeral in zip(ints, nums):
        count = input // value
        result += numeral * count
        input -= value * count
    return result


def roman_to_int(input):
    """
    Convert a roman numeral to an integer.

    >>> all(roman_to_int(int_to_roman(i)) == i for i in range(1, 4000))
    True

    >>> roman_to_int('VVVIV')
    Traceback (most recent call last):
    ...
    ValueError: input is not a valid roman numeral: VVVIV
    >>> roman_to_int(1)
    Traceback (most recent call last):
    ...
    TypeError: expected string, got <class 'int'>
    >>> roman_to_int('a')
    Traceback (most recent call last):
    ...
    ValueError: input is not a valid roman numeral: A
    >>> roman_to_int('IL')
    Traceback (most recent call last):
    ...
    ValueError: input is not a valid roman numeral: IL
    """
    if type(input) is not str:
        raise TypeError("expected string, got %s" % type(input))
    input = input.upper()
    nums = ['M', 'D', 'C', 'L', 'X', 'V', 'I']
    ints = [1000, 500, 100, 50, 10, 5, 1]
    for c in input:
        if c not in nums:
            raise ValueError("input is not a valid roman numeral: %s" % input)
    places = []
    for i, c in enumerate(input):
        value = ints[nums.index(c)]
        # Subtractive notation: a smaller numeral before a larger one is negative.
        if i + 1 < len(input) and ints[nums.index(input[i + 1])] > value:
            value = -value
        places.append(value)
    total = sum(places)
    # Easiest test for validity: the value must round-trip to the same numeral.
    if int_to_roman(total) == input:
        return total
    raise ValueError('input is not a valid roman numeral: %s' % input)
27 | 28 | :param chrom_string: 29 | :param source: 30 | :param dest: 31 | :return: 32 | """ 33 | try: 34 | chrom_string = str(romannumerals.roman_to_int(chrom_string)) 35 | except ValueError: 36 | pass 37 | 38 | if dialect == 'ensembl': 39 | if chrom_string == 'chrM': 40 | return 'dmel_mitochonrdion_genome' 41 | elif chrom_string[:3].lower() == 'chr': 42 | return chrom_string[3:] 43 | else: 44 | return chrom_string 45 | elif dialect == 'ucsc': 46 | if chrom_string == 'dmel_mitochondrion_genome': 47 | return 'chrM' 48 | elif chrom_string[:3].lower() == 'chr': 49 | return chrom_string 50 | else: 51 | return 'chr{}'.format(chrom_string) 52 | elif dialect == 'yeast': 53 | if chrom_string[:3].lower() == 'chr': 54 | chrom_string = chrom_string[3:] 55 | try: 56 | return romannumerals.int_to_roman(int(chrom_string)) 57 | except ValueError: 58 | return chrom_string 59 | else: 60 | raise ValueError('Unknown dialect {}'.format(dialect)) 61 | 62 | 63 | def binary_search_tag_file(tag_filename, search_target): 64 | """ 65 | Find the offset (in bytes) in :param:`tag_filename` that corresponds 66 | to the start of the first tag that is equal to or greater than :param:`search_target`. 67 | 68 | If none of the reads have a start position greater than :param:`search_target`, 69 | return None. 70 | 71 | Note that positions in tag files have a 1-based index. 
def binary_search_tag_file(tag_filename, search_target):
    """
    Find the offset (in bytes) in :param:`tag_filename` that corresponds
    to the start of the first tag that is equal to or greater than :param:`search_target`.

    If none of the reads have a start position greater than :param:`search_target`,
    return None.

    Note that positions in tag files have a 1-based index.
    """

    def get_read_start(file_offset):
        # Seek to an arbitrary byte offset, then (unless we are at the very start)
        # discard the likely-partial line so the next readline() yields a full record.
        tag_file.seek(file_offset)
        if file_offset > 0:
            _ = tag_file.readline()  # read forward to get to a line start
        this_line = tag_file.readline().strip()
        if tag_file.tell() >= filesize:
            # We've reached the end of the file and the reads are still upstream of the target
            return None
        else:
            # Field 2 of a tab-delimited tag line holds the read start position.
            return int(this_line.split('\t')[1])

    filesize = os.path.getsize(tag_filename)
    search_window_start = 0
    search_window_end = filesize - 1
    # Sentinel -1 means "no probe has been made yet" (checked after the loop).
    guess_genomic_start = -1
    guess = int((search_window_start + search_window_end) / 2)

    with open(tag_filename, 'rt') as tag_file:
        first_genomic_start = get_read_start(search_window_start)
        # last_genomic_start = get_read_position(search_window_end)

        # Target precedes the first read: answer is the beginning of the file.
        if search_target < first_genomic_start:
            return search_window_start

        # Binary search over *byte offsets* (not record indices): each probe
        # re-aligns itself to the next full line via get_read_start().
        while search_window_end - search_window_start > 1:
            guess = int((search_window_start + search_window_end) / 2)
            guess_genomic_start = get_read_start(guess)

            if guess_genomic_start == None:
                # Probe ran off the end of the file: every read starts before the target.
                return None

            # print(search_window_start, guess, search_window_end, guess_genomic_start)

            if guess_genomic_start < search_target:
                # print('\ttoo low!')
                search_window_start = guess

            elif guess_genomic_start > search_target:
                search_window_end = guess

                # print('\ttoo high!')
            else:
                # print('\tjust right!')
                break

        if guess_genomic_start == -1:
            # Loop never executed (window already <= 1 byte) and no probe was made.
            return None

        if guess_genomic_start < search_target:
            # Last probe landed just before the target; step past it.
            guess += 1

        # Re-align the final offset to the start of the next complete line.
        tag_file.seek(guess)
        _ = tag_file.readline()
        guess = tag_file.tell()

    return guess
def bgzip_gff(gff3_fname, bgzipped_fname):
    """
    Compress a GFF3 file in block-gzip format (requires that bgzip be accessible on the current path).

    The input is sorted by chromosome and start position (as tabix requires) before
    compression. If :param gff3_fname: ends with '.gz' assumes that the file is gzipped,
    otherwise assumes it is uncompressed.

    :param gff3_fname: path to the source GFF3 file (optionally gzipped)
    :param bgzipped_fname: path for the block-gzipped output file
    :return:
    """
    if bgzipped_fname == gff3_fname:
        # Bug fix: must bail out here. Previously execution fell through and ran
        # 'cat f | ... > f', which truncates the source file before reading it.
        log_print('Destination and source file cannot have the same name!')
        return

    cmd_line = '{} {} | sort -k1,1 -k4,4n | bgzip > {}'.format(('cat', 'zcat')[gff3_fname.endswith('.gz')],
                                                               gff3_fname, bgzipped_fname)
    try:
        # Explicit existence check instead of `assert` (asserts vanish under -O):
        # a missing input would otherwise fail silently since the shell pipeline
        # returns 0 even when cat/zcat errors.
        if not os.path.isfile(gff3_fname):
            raise FileNotFoundError(gff3_fname)
        subprocess.check_call(cmd_line, shell=True)

    except subprocess.CalledProcessError as cpe:
        log_print('Unsuccessful. Got return code {}'.format(cpe.returncode))

    except FileNotFoundError:
        log_print('{} not found!'.format(gff3_fname))

    else:
        log_print('Successfully generated block-gzipped file {} from {}'.format(bgzipped_fname, gff3_fname))


def generate_tabix_index(target_fname):
    """
    Index :param target_fname: with tabix. Requires that the directory in which
    :param target_fname: resides is writeable.

    :param target_fname: path to a block-gzipped, coordinate-sorted GFF3 file
    :return:
    """
    cmd_line = 'tabix -f -p gff {}'.format(target_fname)
    try:
        # check_call raises on nonzero exit status, so its return value is unneeded
        # (the previously captured `return_code` was never used).
        subprocess.check_call(cmd_line, shell=True)
    except subprocess.CalledProcessError as cpe:
        log_print('Unsuccessful. Got return code {}'.format(cpe.returncode))
    else:
        log_print('Successfully indexed block-gzipped file {}'.format(target_fname))


def pretty_now():
    """
    Return the current date/time as a nicely formatted string (without decimal seconds).
    """
    return datetime.datetime.strftime(datetime.datetime.now(), '%Y-%b-%d %H:%M:%S')
def log_print(message, tabs=1):
    """
    Print a chunk of text preceded by a timestamp and an optional number of tabs (default 1).

    :param message: text to print
    :param tabs: number of tab characters between the timestamp and the message
    :return:
    """
    print('{}{}{}'.format(pretty_now(), '\t' * tabs, message))


def gaussian_kernel(sd, sd_cutoff=3, normalize=False):
    """
    Generate and return a numpy.Array whose elements are proportional to the PDF of a
    normal distribution having standard deviation :param:`sd`, truncated at
    :param:`sd_cutoff` standard deviations on either side of the center.

    :param sd: standard deviation of the underlying normal distribution
    :param sd_cutoff: number of standard deviations at which to truncate the kernel
    :param normalize: if True, scale the kernel so its peak value equals 1
                      (note: peak-normalized, not area-normalized)
    :return: a numpy array of length 2 * sd_cutoff * sd + 1
    """
    midpoint = sd_cutoff * sd
    bw = midpoint * 2 + 1
    # Vectorized evaluation of the PDF over all offsets at once; replaces the
    # original per-element Python loop and produces identical values.
    kern = norm(scale=sd).pdf(numpy.arange(bw) - midpoint)
    if normalize:
        kern = kern / kern.max()
    return kern


def add_label(ax, tick, tick_label, axis='x'):
    """
    Updates the set of ticks and tick labels for the specified matplotlib.Axes object
    and axis.

    If the tick already exists, its label will be updated. If not, it will be created
    and labeled appropriately.

    :param ax: matplotlib Axes object to modify
    :param tick: tick position (data coordinate) to add or relabel
    :param tick_label: label text for the tick
    :param axis: 'x' or 'y'; any value other than 'y' is treated as 'x'
    """
    if axis == 'y':
        tick_getter, label_getter = ax.get_yticks, ax.get_yticklabels
        tick_setter, label_setter = ax.set_yticks, ax.set_yticklabels
    else:
        tick_getter, label_getter = ax.get_xticks, ax.get_xticklabels
        tick_setter, label_setter = ax.set_xticks, ax.set_xticklabels

    # Map tick position -> label, overwrite (or insert) the requested tick, then
    # write the ticks back in sorted positional order.
    labels = dict(zip(tick_getter(), label_getter()))
    labels[tick] = tick_label
    new_ticks, new_labels = zip(*sorted(labels.items()))
    tick_setter(new_ticks)
    label_setter(new_labels)


def adjust_limits(ax, new_position, axis='y', padding_fraction=0.1):
    """
    If necessary adjusts the limits for the specified :param axis: on
    :param ax: to accommodate :param new_position: according to the
    following scheme:

    1. Assumes that the current limits are the
    smallest and largest content item minus / plus a padding equal to
    :param padding_fraction: * the span between the smallest
    and largest content item.
    2. If :param new_position: is beyond the inferred content limits,
    adjust the padding to :param padding_fraction: * the new content
    span, then adjust the plot limits to the new content limits
    minus / plus the new padding.
    """
    assert padding_fraction < 0.5, 'padding_fraction must be below 0.5!'

    if axis == 'y':
        limit_getter = ax.get_ylim
        limit_setter = ax.set_ylim
    else:
        limit_getter = ax.get_xlim
        limit_setter = ax.set_xlim

    # Invert the padding scheme to recover the content (data) extent from the
    # current plot limits: plot_span = data_span * (1 + 2 * padding_fraction).
    current_plot_min, current_plot_max = limit_getter()
    current_plot_span = current_plot_max - current_plot_min
    current_data_span = current_plot_span / (1 + 2 * padding_fraction)
    current_pad = current_data_span * padding_fraction
    current_data_min = current_plot_min + current_pad
    current_data_max = current_plot_max - current_pad

    if new_position > current_data_max:
        new_data_min = current_data_min
        new_data_max = new_position

    elif new_position < current_data_min:
        new_data_min = new_position
        new_data_max = current_data_max
    else:
        # no changes needed
        return

    # Re-derive the padding from the enlarged content span and apply it.
    new_data_span = new_data_max - new_data_min
    new_pad = new_data_span * padding_fraction
    new_plot_min = new_data_min - new_pad
    new_plot_max = new_data_max + new_pad

    limit_setter((new_plot_min, new_plot_max))
def diag_indices(n, k=0):
    """
    Return the indices corresponding to the kth diagonal of an n X n array
    in the form of a tuple of (x coords, y coords).

    Created since numpy does not provide this functionality.
    """
    # A positive offset shifts the column index forward; a non-positive offset
    # shifts the row index back. Either way both coordinate vectors have
    # length n - |k|.
    if k > 0:
        rows = numpy.arange(n - k)
        cols = numpy.arange(k, n)
    else:
        rows = numpy.arange(-k, n)
        cols = numpy.arange(n + k)
    return rows, cols