├── .gitignore
├── CHANGES.txt
├── LICENSE
├── MANIFEST.in
├── README.md
├── docs
│   └── README.md
├── examples
│   ├── batchproducers
│   │   └── BatchProducersWithTileDB.ipynb
│   ├── bigwig_files_from_encode_for_label_comparison
│   │   ├── README.md
│   │   └── download_bigwigs.sh
│   ├── dbingest
│   │   ├── dbingest_examples.ipynb
│   │   ├── hg38.chrom.sizes
│   │   ├── run_db_ingest.sh
│   │   ├── run_db_ingest_single_threaded.sh
│   │   └── tier1.encode.dnase.tasks.tsv
│   ├── hg38.blacklist.bed.gz
│   ├── hg38.chrom.sizes
│   ├── hg38.chrom21.sizes
│   ├── labelgen
│   │   ├── bigwig_files_from_encode_for_label_comparison
│   │   ├── genomewide_labels_examples.py
│   │   ├── genomewide_labels_examples.sh
│   │   ├── hg38.chrom.sizes
│   │   ├── hg38.chrom21.sizes
│   │   ├── peak_files_from_encode_for_label_comparison
│   │   ├── save_source_labels_in_labelgen.sh
│   │   └── tasks.labelgen.tsv
│   ├── peak_files_from_encode_for_label_comparison
│   │   └── README
│   └── seqdataloader_examples.ipynb
├── requirements.txt
├── seqdataloader
│   ├── attrib_config.py
│   ├── batchproducers
│   │   ├── __init__.py
│   │   └── coordbased
│   │       ├── __init__.py
│   │       ├── coordbatchproducers.py
│   │       ├── coordbatchtransformers.py
│   │       ├── coordstovals
│   │       │   ├── __init__.py
│   │       │   ├── bigwig.py
│   │       │   ├── core.py
│   │       │   ├── fasta.py
│   │       │   ├── lookup.py
│   │       │   └── tiledb.py
│   │       └── core.py
│   ├── bounded_process_pool_executor.py
│   ├── dbingest
│   │   ├── README.md
│   │   └── __init__.py
│   ├── dbingest_single_threaded
│   │   └── __init__.py
│   ├── labelgen
│   │   ├── __init__.py
│   │   ├── classification_label_protocols.py
│   │   ├── regression_label_protocols.py
│   │   ├── rolling_average.py
│   │   └── utils.py
│   ├── queue_config.py
│   ├── tdb_config.py
│   └── utils.py
├── setup.py
└── tests
    ├── test_tiledb_coords_to_vals.benchmark.py
    └── test_tiledb_coords_to_vals.py
/.gitignore:
--------------------------------------------------------------------------------
1 | seqdataloader.egg-info/
2 | build/
3 | dist/
4 | *.hdf5
5 | *.bigWig
6 | *.gz
7 | *.pyc
8 | __pycache__
9 | 
--------------------------------------------------------------------------------
/CHANGES.txt:
--------------------------------------------------------------------------------
1 | v0.1,01/24/2019 -- Initial release.
2 | v0.11,01/30/2019 -- Added flags --chroms_to_keep, --chroms_to_exclude, --store_positives_only. Indices are stored as strings rather than tuples for more standard bed format output.
3 | v0.111,01/31/2019 -- Default value of False used for the flag --store_positives_only when genomewide_labels is called from a Python script rather than the command-line script. Changed the module name from genomewide_labels to seqdataloader to avoid user confusion when importing the code.
4 | v0.113,02/01/2019 -- Updated how CHROM, START, END are stored in the data frame to avoid problems with quoting in the output bed files. (v0.112 is skipped due to a problem uploading to pypi.)
5 | v0.114,02/01/2019 -- Removed a redundant code pass for storing positives only.
6 | v0.115,02/01/2019 -- Handled an edge case discovered by Soumya.
7 | v0.116,02/07/2019 -- More robust saving of data frames to output hdf5 format.
8 | v0.117,02/17/2019 -- Added minimum required versions for all dependency packages in setup.py.
9 | v0.118,02/17/2019 -- format=table for saving to hdf5 changed to optional, non-default.
10 | v0.120,02/26/2019 -- Functionality to add a bed file with ambiguous regions for each task.
     Saving as format=table in hdf5 is still optional, but is now the default, as this is desired in most cases.
11 | v0.121,02/27/2019 -- Ambiguous peaks labeled with np.nan.
12 | v0.122,02/28/2019 -- Write each chromosome to the output file individually to reduce RAM usage.
13 | v0.123,03/02/2019 -- ambig_bed default
14 | [missing documentation]
15 | v0.127,09/05/2019 -- (Av Shrikumar) Added functionality to load batches from a downsampled negative set and to retrieve labels via a lookup table; the Coordinate object is now a namedtuple rather than a full object.
16 | v0.128,09/10/2019 -- (Anna Shcherbina) Fixed an issue with the wheel structure in the pypi release.
17 | 
18 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 Kundaje Lab
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include . *.md
2 | recursive-include docs *
3 | recursive-include seqdataloader/ *
4 | include requirements.txt
5 | include examples/tasks.tsv
6 | include examples/genomewide_labels_examples.py
7 | include examples/genomewide_labels_examples.sh
8 | include examples/hg38.chrom.sizes
9 | include examples/bigwig_files_from_encode_for_label_comparison/download_bigwigs.sh
10 | recursive-exclude examples/ *bigWig
11 | recursive-exclude examples/ *bed.gz
12 | recursive-exclude examples/ *hdf5
13 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # seqdataloader
2 | Sequence data label generation and ingestion into deep learning models
3 | 
4 | ## Installation
5 | `pip install seqdataloader`
6 | 
7 | If you plan to modify the code, you can install it in development mode:
8 | `pip install -e seqdataloader`
9 | 
10 | Please note: the "dbingest" functionality in seqdataloader requires Python >= 3.7.
11 | 
12 | # Quick Start
13 | 
14 | ## labelgen
15 | The input for the labelgen submodule is a 4-column tab-delimited file with the following fields:
16 | 
17 | * "task" -- Required. User-specified task name.
18 | * "narrowPeak" -- Path to narrowPeak file. (Optional if "bigwig" is specified.)
19 | * "bigwig" -- Path to bigwig file. (Optional if "narrowPeak" is specified.)
20 | * "ambig" -- Bed file containing user-specified regions to label as ambiguous. (Optional.)
21 | 
22 | ```
23 | genomewide_labels --task_list tasks.tsv \
24 |     --outf classificationlabels.SummitWithin200bpCenter.tsv.gz \
25 |     --output_type gzip \ # (one of gzip, bz2, hdf5, pkl)
26 |     --chrom_sizes hg38.chrom.sizes \
27 |     --bin_stride 50 \
28 |     --left_flank 400 \
29 |     --right_flank 400 \
30 |     --bin_size 200 \
31 |     --task_threads 10 \
32 |     --chrom_threads 4 \
33 |     --allow_ambiguous \
34 |     --labeling_approach peak_summit_in_bin_classification
35 | ```
36 | And for regression:
37 | ```
38 | genomewide_labels --task_list tasks.tsv \
39 |     --outf regressionlabels.allbins.hg38.hdf5 \
40 |     --output_type hdf5 \
41 |     --chrom_sizes hg38.chrom.sizes \
42 |     --bin_stride 50 \
43 |     --left_flank 400 \
44 |     --right_flank 400 \
45 |     --chrom_threads 24 \
46 |     --task_threads 2 \
47 |     --label_transformer asinh \ # one of None, asinh, log10, log; default is asinh
48 |     --labeling_approach all_genome_bins_regression
49 | ```
50 | 
51 | labeling_approach can be one of:
52 | 
53 | "peak_summit_in_bin_classification"
54 | 
55 | "peak_percent_overlap_with_bin_classification"
56 | 
57 | "peak_summit_in_bin_regression"
58 | 
59 | "peak_percent_overlap_with_bin_regression"
60 | 
61 | "all_genome_bins_regression"
62 | 
63 | 
64 | ### How to run
65 | Sample datasets are included in the folders `examples/peak_files_from_encode_for_label_comparison` and `examples/bigwig_files_from_encode_for_label_comparison`.
66 | 
67 | ### Executing seqdataloader as a script:
68 | Execute the script:
69 | 
70 | `examples/labelgen/genomewide_labels_examples.sh` for examples of how to generate classification and regression labels on the sample datasets.
71 | The script generates binary classification labels (1, 0, or -1 for ambiguous) or continuous regression labels reflecting the bigWig coverage in each bin, in bed file format:
72 | 
73 | http://mitra.stanford.edu/kundaje/seqdataloader/classificationlabels.50PercentOverlap.tsv.gz
74 | 
75 | http://mitra.stanford.edu/kundaje/seqdataloader/classificationlabels.SummitWithin200bpCenter.tsv.gz
76 | 
77 | http://mitra.stanford.edu/kundaje/seqdataloader/regressionlabels.50PercentOverlap.tsv.gz
78 | 
79 | http://mitra.stanford.edu/kundaje/seqdataloader/regressionlabels.SummitWithin200bpCenter.tsv.gz
80 | 
81 | Corresponding WashU Browser tracks with the optimal narrowPeak and associated bin labels are here:
82 | http://epigenomegateway.wustl.edu/legacy/?genome=hg38&session=GDB2BTMGnB&statusId=1154897038
83 | 
84 | ### Calling seqdataloader as a Python function:
85 | ```
86 | from seqdataloader import *
87 | classification_params={
88 |     'task_list':"tasks.tsv",
89 |     'outf':"classificationlabels.SummitWithin200bpCenter.tsv.gz",
90 |     'output_type':'gzip',
91 |     'chrom_sizes':'hg38.chrom.sizes',
92 |     'chroms_to_keep':['chr21'],
93 |     "store_positives_only":True,
94 |     'bin_stride':50,
95 |     'left_flank':400,
96 |     'right_flank':400,
97 |     'bin_size':200,
98 |     'chrom_threads':10,
99 |     'task_threads':4,
100 |     'allow_ambiguous':True,
101 |     'labeling_approach':'peak_summit_in_bin_classification'
102 | }
103 | genomewide_labels(classification_params)
104 | 
105 | regression_params={
106 |     'task_list':"tasks.tsv",
107 |     'outf':"regressionlabels.all_genome_bins_regression.hdf5",
108 |     'output_type':'hdf5',
109 |     'chrom_sizes':'hg38.chrom.sizes',
110 |     'store_values_above_thresh': 0,
111 |     'chroms_to_keep':['chr21'],
112 |     'bin_stride':50,
113 |     'left_flank':400,
114 |     'right_flank':400,
115 |     'bin_size':200,
116 |     'chrom_threads':10,
117 |     'task_threads':4,
118 |     'labeling_approach':'all_genome_bins_regression',
119 |     'label_transformer':'log10',
120 |     'label_transformer_pseudocount':0.001
121 | }
122 | genomewide_labels(regression_params)
123 | ```
124 | ### Regression label transformations
125 | 
126 | In regression mode ("peak_summit_in_bin_regression", "peak_percent_overlap_with_bin_regression", "all_genome_bins_regression"), the generated labels can be transformed in one of several ways. Use the arguments `label_transformer` and `label_transformer_pseudocount` to specify the desired transformation. Allowed values are:
127 | 
128 | * asinh -- numpy.arcsinh(values) will be computed (this is the default)
129 | * None -- no label transformation will be performed
130 | * log10 -- numpy.log10(values + pseudocount) will be computed, using the pseudocount specified by the `label_transformer_pseudocount` argument. If this argument is not provided, a default pseudocount of 0.001 is used.
131 | * log -- numpy.log(values + pseudocount) will be computed, using a pseudocount as above.
132 | 
133 | ### A note on file outputs
134 | 
135 | The code supports several output types: `hdf5`, `gzip`, `pkl`, `bz2`.
136 | Specify your desired output type with the flag `--output_type`; the default is `gzip`.
137 | Please note that the largest bottleneck in the code is writing the files to disk. `hdf5` has negligible overhead, but using `gzip` or `bz2` may increase runtime. Timing benchmarks are provided in `examples/labelgen/genomewide_labels_examples.sh`.
138 | 
139 | You may speed up I/O by writing chromosome outputs to separate files in parallel. This is currently only supported for the `gzip` and `bz2` output types, as I/O is less of a bottleneck for the `hdf5` and `pkl` output formats. Use the flag `--split_output_by_chrom` to invoke this parallelized saving of chromosomes.
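
As a sanity check, the generated labels can be read back into a pandas dataframe. Below is a minimal sketch (not part of the package) that assumes an `hdf5` output holding a single pandas object with numeric task columns; it also spells out the label transformations described above as plain numpy operations:

```
import numpy as np
import pandas as pd

# pd.read_hdf needs no explicit key when the file contains a single object;
# if your output holds several objects, pass key= explicitly
labels = pd.read_hdf("regressionlabels.all_genome_bins_regression.hdf5")

# the transforms described above, on hypothetical coverage values
pseudocount = 0.001                    # default pseudocount for the log transforms
values = np.array([0.0, 0.5, 10.0])   # hypothetical bigWig coverage values
asinh_labels = np.arcsinh(values)               # label_transformer='asinh' (default)
log10_labels = np.log10(values + pseudocount)   # label_transformer='log10'
log_labels = np.log(values + pseudocount)       # label_transformer='log'

# asinh-transformed labels can be mapped back to raw coverage with sinh
raw_coverage = np.sinh(labels.values)
```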
140 | 
141 | ## dbingest
142 | 
143 | The input tsv file must have a subset of the columns corresponding to the supported configurations:
144 | ```
145 | * encode_config
146 | ** dataset
147 | ** fc_bigwig
148 | ** pval_bigwig
149 | ** count_bigwig_plus_5p
150 | ** count_bigwig_minus_5p
151 | ** count_bigwig_unstranded_5p
152 | ** idr_peak
153 | ** overlap_peak
154 | ** ambig_peak
155 | 
156 | * generic_bigwig
157 | ** bigwig_track
158 | 
159 | ```
160 | # Dependencies
161 | 
162 | Please make sure the following dependencies are installed on your system to use seqdataloader:
163 | * pybedtools
164 | * pyBigWig
165 | * pandas
166 | * numpy
167 | * multiprocessing
168 | 
169 | 
170 | ## Documentation and benchmarks
171 | 
172 | Testing, benchmarks, and documentation can be found in the `docs` folder
173 | 
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | Documentation for benchmarks and testing of seqdataloader
--------------------------------------------------------------------------------
/examples/batchproducers/BatchProducersWithTileDB.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "#load tutorial utilities \n",
10 |     "%reload_ext autoreload\n",
11 |     "%autoreload 2\n",
12 |     "%matplotlib inline\n",
13 |     "import warnings\n",
14 |     "warnings.filterwarnings('ignore')"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "code",
19 |    "execution_count": 2,
20 |    "metadata": {},
21 |    "outputs": [
22 |     {
23 |      "name": "stderr",
24 |      "output_type": "stream",
25 |      "text": [
26 |       "Using TensorFlow backend.\n"
27 |      ]
28 |     }
29 |    ],
30 |    "source": [
31 |     "#unit tests for class seqdataloader.batchproducers.coordbased.coordstovals.BasicTiledbProfileCoordsToVals\n",
32 |     "from seqdataloader.batchproducers.coordbased.coordstovals.tiledb import *\n",
33 |     "\n"
34 |    ]
35 |   },
36 |   {
37 |    "cell_type": "code",
38 |    "execution_count": 3,
39 |    "metadata": {},
40 |    "outputs": [],
41 |    "source": [
42 |     "#generate some test coords objects \n",
43 |     "from collections import namedtuple\n",
44 |     "Coord=namedtuple('Coord','chrom start end isplusstrand')\n",
45 |     "coords=[Coord('chr1',1000000,2000000,True),\n",
46 |     "        Coord('chr2',1000000,2000000,True),\n",
47 |     "        Coord('chr3',1000000,2000000,True),\n",
48 |     "        Coord('chr4',1000000,2000000,True),\n",
49 |     "        Coord('chr5',1000000,2000000,True),\n",
50 |     "        Coord('chr6',1000000,2000000,True),\n",
51 |     "        Coord('chr7',1000000,2000000,True),\n",
52 |     "        Coord('chr1',1000000,2000000,False),\n",
53 |     "        Coord('chr2',1000000,2000000,False),\n",
54 |     "        Coord('chr3',1000000,2000000,False),\n",
55 |     "        Coord('chr4',1000000,2000000,False),\n",
56 |     "        Coord('chr5',1000000,2000000,False),\n",
57 |     "        Coord('chr6',1000000,2000000,False),\n",
58 |     "        Coord('chr7',1000000,2000000,False)]\n",
59 |     "\n",
60 |     "\n",
61 |     "pos_label_source_attribute=\"fc_bigwig\"\n",
62 |     "neg_label_source_attribute=\"fc_bigwig\"\n",
63 |     "\n"
64 |    ]
65 |   },
66 |   {
67 |    "cell_type": "code",
68 |    "execution_count": 5,
69 |    "metadata": {},
70 |    "outputs": [
71 |     {
72 |      "data": {
73 |       "text/plain": [
74 |        "(14, 1000000)"
75 |       ]
76 |      },
77 |      "execution_count": 5,
78 |      "metadata": {},
79 |      "output_type": "execute_result"
80 |     }
81 |    ],
82 |    "source": [
83 |     "\n",
84 |     "#case 1: tiledb_paths is a string\n",
85 | 
"tiledb_paths=\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY\"\n", 86 | "ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths,\n", 87 | " pos_label_source_attribute=pos_label_source_attribute,\n", 88 | " neg_label_source_attribute=neg_label_source_attribute)\n", 89 | "string_vals=ctov.__call__(coords)\n", 90 | "string_vals.shape\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 9, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "coords=[Coord('chr1',1000,2000,True),\n", 100 | " Coord('chr2',1000,2000,True),\n", 101 | " Coord('chr3',1000,2000,True),\n", 102 | " Coord('chr4',1000,2000,True),\n", 103 | " Coord('chr5',1000,2000,True),\n", 104 | " Coord('chr6',1000,2000,True),\n", 105 | " Coord('chr7',1000,2000,True),\n", 106 | " Coord('chr1',1000,2000,False),\n", 107 | " Coord('chr2',1000,2000,False),\n", 108 | " Coord('chr3',1000,2000,False),\n", 109 | " Coord('chr4',1000,2000,False),\n", 110 | " Coord('chr5',1000,2000,False),\n", 111 | " Coord('chr6',1000,2000,False),\n", 112 | " Coord('chr7',1000,2000,False)]" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 10, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "string_vals=ctov.__call__(coords)\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 6, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "[array([[1.23467004, 1.22853994, 1.24536002, ..., 0.156 , 0.15591 ,\n", 133 | " 0.15591 ],\n", 134 | " [0.16829 , 0.16829 , 0.16829 , ..., 0.67333001, 0.67333001,\n", 135 | " 0.75756001],\n", 136 | " [0.67333001, 0.67333001, 0.67333001, ..., 0.25251999, 0.25251999,\n", 137 | " 0.25251999],\n", 138 | " ...,\n", 139 | " [0.92584997, 1.00989997, 1.00989997, ..., 0.28422001, 0.28422001,\n", 140 | " 0.30206999],\n", 141 | " [0. , 0. , 0. , ..., 1.17837 , 1.17837 ,\n", 142 | " 1.17837 ],\n", 143 | " [0.67333001, 0.67333001, 0.67333001, ..., 1.00989997, 1.00989997,\n", 144 | " 1.00989997]]),\n", 145 | " array([[1.23467004, 1.22853994, 1.24536002, ..., 0.156 , 0.15591 ,\n", 146 | " 0.15591 ],\n", 147 | " [0.16829 , 0.16829 , 0.16829 , ..., 0.67333001, 0.67333001,\n", 148 | " 0.75756001],\n", 149 | " [0.67333001, 0.67333001, 0.67333001, ..., 0.25251999, 0.25251999,\n", 150 | " 0.25251999],\n", 151 | " ...,\n", 152 | " [0.92584997, 1.00989997, 1.00989997, ..., 0.28422001, 0.28422001,\n", 153 | " 0.30206999],\n", 154 | " [0. , 0. , 0. , ..., 1.17837 , 1.17837 ,\n", 155 | " 1.17837 ],\n", 156 | " [0.67333001, 0.67333001, 0.67333001, ..., 1.00989997, 1.00989997,\n", 157 | " 1.00989997]]),\n", 158 | " array([[1.23467004, 1.22853994, 1.24536002, ..., 0.156 , 0.15591 ,\n", 159 | " 0.15591 ],\n", 160 | " [0.16829 , 0.16829 , 0.16829 , ..., 0.67333001, 0.67333001,\n", 161 | " 0.75756001],\n", 162 | " [0.67333001, 0.67333001, 0.67333001, ..., 0.25251999, 0.25251999,\n", 163 | " 0.25251999],\n", 164 | " ...,\n", 165 | " [0.92584997, 1.00989997, 1.00989997, ..., 0.28422001, 0.28422001,\n", 166 | " 0.30206999],\n", 167 | " [0. , 0. , 0. 
, ..., 1.17837 , 1.17837 ,\n", 168 | " 1.17837 ],\n", 169 | " [0.67333001, 0.67333001, 0.67333001, ..., 1.00989997, 1.00989997,\n", 170 | " 1.00989997]])]" 171 | ] 172 | }, 173 | "execution_count": 6, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "#case2: tiledb_paths is a list\n", 180 | "tiledb_paths=[\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY\",\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY\",\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY\"]\n", 181 | "ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths,\n", 182 | " pos_label_source_attribute=pos_label_source_attribute,\n", 183 | " neg_label_source_attribute=neg_label_source_attribute)\n", 184 | "list_vals=ctov.__call__(coords)\n", 185 | "list_vals" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 7, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "{'mode0': array([[1.23467004, 1.22853994, 1.24536002, ..., 0.156 , 0.15591 ,\n", 197 | " 0.15591 ],\n", 198 | " [0.16829 , 0.16829 , 0.16829 , ..., 0.67333001, 0.67333001,\n", 199 | " 0.75756001],\n", 200 | " [0.67333001, 0.67333001, 0.67333001, ..., 0.25251999, 0.25251999,\n", 201 | " 0.25251999],\n", 202 | " ...,\n", 203 | " [0.92584997, 1.00989997, 1.00989997, ..., 0.28422001, 0.28422001,\n", 204 | " 0.30206999],\n", 205 | " [0. , 0. , 0. , ..., 1.17837 , 1.17837 ,\n", 206 | " 1.17837 ],\n", 207 | " [0.67333001, 0.67333001, 0.67333001, ..., 1.00989997, 1.00989997,\n", 208 | " 1.00989997]]),\n", 209 | " 'mode1': array([[1.23467004, 1.22853994, 1.24536002, ..., 0.156 , 0.15591 ,\n", 210 | " 0.15591 ],\n", 211 | " [0.16829 , 0.16829 , 0.16829 , ..., 0.67333001, 0.67333001,\n", 212 | " 0.75756001],\n", 213 | " [0.67333001, 0.67333001, 0.67333001, ..., 0.25251999, 0.25251999,\n", 214 | " 0.25251999],\n", 215 | " ...,\n", 216 | " [0.92584997, 1.00989997, 1.00989997, ..., 0.28422001, 0.28422001,\n", 217 | " 0.30206999],\n", 218 | " [0. , 0. , 0. , ..., 1.17837 , 1.17837 ,\n", 219 | " 1.17837 ],\n", 220 | " [0.67333001, 0.67333001, 0.67333001, ..., 1.00989997, 1.00989997,\n", 221 | " 1.00989997]]),\n", 222 | " 'mode2': array([[1.23467004, 1.22853994, 1.24536002, ..., 0.156 , 0.15591 ,\n", 223 | " 0.15591 ],\n", 224 | " [0.16829 , 0.16829 , 0.16829 , ..., 0.67333001, 0.67333001,\n", 225 | " 0.75756001],\n", 226 | " [0.67333001, 0.67333001, 0.67333001, ..., 0.25251999, 0.25251999,\n", 227 | " 0.25251999],\n", 228 | " ...,\n", 229 | " [0.92584997, 1.00989997, 1.00989997, ..., 0.28422001, 0.28422001,\n", 230 | " 0.30206999],\n", 231 | " [0. , 0. , 0. 
, ..., 1.17837 , 1.17837 ,\n", 232 | " 1.17837 ],\n", 233 | " [0.67333001, 0.67333001, 0.67333001, ..., 1.00989997, 1.00989997,\n", 234 | " 1.00989997]])}" 235 | ] 236 | }, 237 | "execution_count": 7, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "#case3: tiledb_paths is a dict\n", 244 | "tiledb_paths={'mode0':\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY\",\n", 245 | " 'mode1':\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY\",\n", 246 | " 'mode2':\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY\"}\n", 247 | "\n", 248 | "ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths,\n", 249 | " pos_label_source_attribute=pos_label_source_attribute,\n", 250 | " neg_label_source_attribute=neg_label_source_attribute)\n", 251 | "dict_vals=ctov.__call__(coords)\n", 252 | "dict_vals\n" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [] 261 | } 262 | ], 263 | "metadata": { 264 | "kernelspec": { 265 | "display_name": "Python 3", 266 | "language": "python", 267 | "name": "python3" 268 | }, 269 | "language_info": { 270 | "codemirror_mode": { 271 | "name": "ipython", 272 | "version": 3 273 | }, 274 | "file_extension": ".py", 275 | "mimetype": "text/x-python", 276 | "name": "python", 277 | "nbconvert_exporter": "python", 278 | "pygments_lexer": "ipython3", 279 | "version": "3.7.0" 280 | } 281 | }, 282 | "nbformat": 4, 283 | "nbformat_minor": 2 284 | } 285 | -------------------------------------------------------------------------------- /examples/bigwig_files_from_encode_for_label_comparison/README.md: -------------------------------------------------------------------------------- 1 | #GM12878 (Rep 2) 2 | DNAse GM12878 Stam hg38 https://www.encodeproject.org/files/ENCFF743ULW/@@download/ENCFF743ULW.bigWig 3 | #Hepg2 (Rep2) 4 | DNAse Hepg2 Stam hg38 https://www.encodeproject.org/files/ENCFF842XRQ/@@download/ENCFF842XRQ.bigWig 5 | #(Rep1+Rep2, fc bigwig) 6 | Max K562 Snyder hg38 https://www.encodeproject.org/files/ENCFF796GHK/@@download/ENCFF796GHK.bigWig 7 | #(Rep1+Rep2, pval bigwig) 8 | Max K562 Snyder hg38 https://www.encodeproject.org/files/ENCFF412VKD/@@download/ENCFF412VKD.bigWig 9 | #(Rep1+Rep2, fc bigwig) 10 | Myc K562 Snyder hg38 https://www.encodeproject.org/files/ENCFF667QJZ/@@download/ENCFF667QJZ.bigWig 11 | #(Rep1+Rep2, pval bigwig) 12 | Myc K562 Snyder hg38 https://www.encodeproject.org/files/ENCFF139YUD/@@download/ENCFF139YUD.bigWig 13 | 14 | -------------------------------------------------------------------------------- /examples/bigwig_files_from_encode_for_label_comparison/download_bigwigs.sh: -------------------------------------------------------------------------------- 1 | wget https://www.encodeproject.org/files/ENCFF743ULW/@@download/ENCFF743ULW.bigWig 2 | wget https://www.encodeproject.org/files/ENCFF842XRQ/@@download/ENCFF842XRQ.bigWig 3 | wget https://www.encodeproject.org/files/ENCFF796GHK/@@download/ENCFF796GHK.bigWig 4 | wget https://www.encodeproject.org/files/ENCFF412VKD/@@download/ENCFF412VKD.bigWig 5 | wget https://www.encodeproject.org/files/ENCFF667QJZ/@@download/ENCFF667QJZ.bigWig 6 | wget https://www.encodeproject.org/files/ENCFF139YUD/@@download/ENCFF139YUD.bigWig 7 | 8 | -------------------------------------------------------------------------------- /examples/dbingest/hg38.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 248956422 2 | chr2 242193529 3 | chr3 198295559 4 | 
chr4 190214555 5 | chr5 181538259 6 | chr6 170805979 7 | chr7 159345973 8 | chr8 145138636 9 | chr9 138394717 10 | chr10 133797422 11 | chr11 135086622 12 | chr12 133275309 13 | chr13 114364328 14 | chr14 107043718 15 | chr15 101991189 16 | chr16 90338345 17 | chr17 83257441 18 | chr18 80373285 19 | chr19 58617616 20 | chr20 64444167 21 | chr21 46709983 22 | chr22 50818468 23 | chrX 156040895 24 | chrY 57227415 25 | chrM 16569 26 | -------------------------------------------------------------------------------- /examples/dbingest/run_db_ingest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | db_ingest --tiledb_metadata tier1.encode.dnase.tasks.tsv \ 3 | --array_name db/dnase \ 4 | --overwrite \ 5 | --chrom_sizes hg38.chrom.sizes \ 6 | --attribute_config encode_pipeline \ 7 | --coord_tile_size 10000 \ 8 | --task_tile_size 1 \ 9 | --write_chunk 30000000 \ 10 | --threads 20 \ 11 | --max_queue_size 50 \ 12 | --max_mem_g 200 13 | 14 | -------------------------------------------------------------------------------- /examples/dbingest/run_db_ingest_single_threaded.sh: -------------------------------------------------------------------------------- 1 | db_ingest_single_threaded --tiledb_metadata tier1.encode.dnase.tasks.tsv \ 2 | --tiledb_group db/dnase \ 3 | --overwrite \ 4 | --chrom_sizes hg38.chrom.sizes \ 5 | --tile_size 10000 \ 6 | --write_chunk 10000000 7 | -------------------------------------------------------------------------------- /examples/dbingest/tier1.encode.dnase.tasks.tsv: -------------------------------------------------------------------------------- 1 | dataset fc_bigwig pval_bigwig idr_peak overlap_peak ambig_peak count_bigwig_plus_5p count_bigwig_minus_5p count_bigwig_unstranded_5p 2 | ENCSR000EMT /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-macs2_signal_track/shard-0/execution/ENCSR000EMT.merged.nodup.fc.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-macs2_signal_track/shard-0/execution/ENCSR000EMT.merged.nodup.pval.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-reproducibility_idr/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz.ambiguous.bed.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-bowtie2/shard-0/execution/ENCSR000EMT.merged.bam.bpnet.plus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-bowtie2/shard-0/execution/ENCSR000EMT.merged.bam.bpnet.minus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-bowtie2/shard-0/execution/ENCSR000EMT.merged.bam.bpnet.unstranded.bw 3 | ENCSR000EMU /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-macs2_signal_track/shard-0/execution/ENCSR000EMU.merged.nodup.fc.signal.bigwig 
/oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-macs2_signal_track/shard-0/execution/ENCSR000EMU.merged.nodup.pval.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-reproducibility_idr/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz.ambiguous.bed.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-bowtie2/shard-0/execution/ENCSR000EMU.merged.bam.bpnet.plus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-bowtie2/shard-0/execution/ENCSR000EMU.merged.bam.bpnet.minus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-bowtie2/shard-0/execution/ENCSR000EMU.merged.bam.bpnet.unstranded.bw 4 | ENCSR000EOT /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-macs2_signal_track/shard-0/execution/ENCSR000EOT.merged.nodup.fc.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-macs2_signal_track/shard-0/execution/ENCSR000EOT.merged.nodup.pval.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-reproducibility_idr/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz.ambiguous.bed.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-bowtie2/shard-0/execution/ENCSR000EOT.merged.bam.bpnet.plus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-bowtie2/shard-0/execution/ENCSR000EOT.merged.bam.bpnet.minus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-bowtie2/shard-0/execution/ENCSR000EOT.merged.bam.bpnet.unstranded.bw 5 | ENCSR149XIL /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/39e50d95-1423-4dca-acd1-4b685ab94c4c/call-macs2_signal_track/shard-0/execution/ENCSR149XIL.merged.nodup.fc.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/39e50d95-1423-4dca-acd1-4b685ab94c4c/call-macs2_signal_track/shard-0/execution/ENCSR149XIL.merged.nodup.pval.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/ce805260-55f8-43c8-b2a1-a232b4a0e369/call-reproducibility_idr/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/ce805260-55f8-43c8-b2a1-a232b4a0e369/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz 
/oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/39e50d95-1423-4dca-acd1-4b685ab94c4c/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz.ambiguous.bed.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/38f0a76b-e6c6-444e-84e5-b5a98a554694/call-bowtie2/shard-0/execution/ENCSR149XIL.merged.bam.bpnet.plus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/38f0a76b-e6c6-444e-84e5-b5a98a554694/call-bowtie2/shard-0/execution/ENCSR149XIL.merged.bam.bpnet.minus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/38f0a76b-e6c6-444e-84e5-b5a98a554694/call-bowtie2/shard-0/execution/ENCSR149XIL.merged.bam.bpnet.unstranded.bw 6 | ENCSR477RTP /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/d754a34e-bc9f-4270-8020-bc37e8d195ba/call-macs2_signal_track/shard-0/execution/ENCSR477RTP.merged.nodup.fc.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/b9e61b7e-4173-4b8c-aa38-9e55d81fef0e/call-macs2_signal_track/shard-0/execution/ENCSR477RTP.merged.nodup.pval.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/d754a34e-bc9f-4270-8020-bc37e8d195ba/call-reproducibility_idr/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/d754a34e-bc9f-4270-8020-bc37e8d195ba/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/d754a34e-bc9f-4270-8020-bc37e8d195ba/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz.ambiguous.bed.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/f38bfd43-b57f-4c55-be06-b02d3f16512a/call-bowtie2/shard-0/execution/ENCSR477RTP.merged.bam.bpnet.plus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/f38bfd43-b57f-4c55-be06-b02d3f16512a/call-bowtie2/shard-0/execution/ENCSR477RTP.merged.bam.bpnet.minus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/f38bfd43-b57f-4c55-be06-b02d3f16512a/call-bowtie2/shard-0/execution/ENCSR477RTP.merged.bam.bpnet.unstranded.bw 7 | -------------------------------------------------------------------------------- /examples/hg38.blacklist.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kundajelab/seqdataloader/5c043e7d2e5296aa01e83c4a5febf7f5272468d2/examples/hg38.blacklist.bed.gz -------------------------------------------------------------------------------- /examples/hg38.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 248956422 2 | chr2 242193529 3 | chr3 198295559 4 | chr4 190214555 5 | chr5 181538259 6 | chr6 170805979 7 | chr7 159345973 8 | chr8 145138636 9 | chr9 138394717 10 | chr10 133797422 11 | chr11 135086622 12 | chr12 133275309 13 | chr13 114364328 14 | chr14 107043718 15 | chr15 101991189 16 | chr16 90338345 17 | chr17 83257441 18 | chr18 80373285 19 | chr19 58617616 20 | chr20 64444167 21 | chr21 46709983 22 | chr22 50818468 23 | chrX 156040895 24 | chrY 57227415 -------------------------------------------------------------------------------- /examples/hg38.chrom21.sizes: -------------------------------------------------------------------------------- 1 | chr21 46709983 2 | -------------------------------------------------------------------------------- /examples/labelgen/bigwig_files_from_encode_for_label_comparison: 
--------------------------------------------------------------------------------
1 | ../bigwig_files_from_encode_for_label_comparison
--------------------------------------------------------------------------------
/examples/labelgen/genomewide_labels_examples.py:
--------------------------------------------------------------------------------
1 | from seqdataloader.labelgen import *
2 | classification_params={
3 |     'task_list':"tasks.tsv",
4 |     'outf':"classificationlabels.SummitWithin200bpCenter.tsv.gz",
5 |     'output_type':'gzip',
6 |     'chrom_sizes':'hg38.chrom.sizes',
7 |     'chroms_to_keep':['chr21'],
8 |     "store_positives_only":True,
9 |     'bin_stride':50,
10 |     'left_flank':400,
11 |     'right_flank':400,
12 |     'bin_size':200,
13 |     'threads':10,
14 |     'subthreads':4,
15 |     'allow_ambiguous':True,
16 |     'labeling_approach':'peak_summit_in_bin_classification'
17 | }
18 | genomewide_labels(classification_params)
19 | 
20 | regression_params={
21 |     'task_list':"tasks.tsv",
22 |     'outf':"regressionlabels.all_genome_bins_regression.hdf5",
23 |     'output_type':'hdf5',
24 |     'chrom_sizes':'hg38.chrom.sizes',
25 |     'store_values_above_thresh': 0,
26 |     'chroms_to_keep':['chr21'],
27 |     'bin_stride':50,
28 |     'left_flank':400,
29 |     'right_flank':400,
30 |     'bin_size':200,
31 |     'threads':10,
32 |     'subthreads':4,
33 |     'labeling_approach':'all_genome_bins_regression'
34 | }
35 | genomewide_labels(regression_params)
36 | 
--------------------------------------------------------------------------------
/examples/labelgen/genomewide_labels_examples.sh:
--------------------------------------------------------------------------------
1 | #Classification Approach 1: Summit Must Lie Within 200 BP Bin
2 | ## Timing
3 | ## writing to gzip
4 | ## real 11m46.403s
5 | ## user 18m40.788s
6 | ## sys 6m18.136s
7 | 
8 | ## Writing to bz2:
9 | ## real 14m44.037s
10 | ## user 21m49.384s
11 | ## sys 6m21.000s
12 | 
13 | #genomewide_labels --task_list tasks.labelgen.tsv \
14 | #    --outf classificationlabels.SummitWithin200bpCenter.tsv.gz \
15 | #    --output_type gzip \
16 | #    --chrom_sizes hg38.chrom.sizes \
17 | #    --bin_stride 50 \
18 | #    --left_flank 400 \
19 | #    --right_flank 400 \
20 | #    --bin_size 200 \
21 | #    --chrom_threads 10 \
22 | #    --task_threads 4 \
23 | #    --allow_ambiguous \
24 | #    --labeling_approach peak_summit_in_bin_classification
25 | 
26 | #Example of restricting analysis to a single chromosome with the --chroms_to_keep flag
27 | #genomewide_labels --task_list tasks.labelgen.tsv \
28 | #    --outf classificationlabels.SummitWithin200bpCenter.tsv.gz \
29 | #    --output_type gzip \
30 | #    --chrom_sizes hg38.chrom.sizes \
31 | #    --chroms_to_keep chr21 \
32 | #    --bin_stride 50 \
33 | #    --left_flank 400 \
34 | #    --right_flank 400 \
35 | #    --bin_size 200 \
36 | #    --chrom_threads 10 \
37 | #    --task_threads 4 \
38 | #    --allow_ambiguous \
39 | #    --labeling_approach peak_summit_in_bin_classification
40 | 
41 | 
42 | #Example with only positives stored
43 | genomewide_labels --task_list tasks.labelgen.tsv \
44 |     --outf classificationlabels.SummitWithin200bpCenter.tsv.gz \
45 |     --output_type gzip \
46 |     --chrom_sizes hg38.chrom.sizes \
47 |     --chroms_to_keep chr21 \
48 |     --bin_stride 50 \
49 |     --left_flank 400 \
50 |     --right_flank 400 \
51 |     --bin_size 200 \
52 |     --chrom_threads 10 \
53 |     --task_threads 4 \
54 |     --allow_ambiguous \
55 |     --store_positives_only \
56 |     --labeling_approach peak_summit_in_bin_classification
57 | 
58 | 
59 | 
60 | ##Classification Approach 2: 50% Overlap Between Peak and 200 BP Bin (50% of the Smaller of the Two)
61 | ## Timing
62 | ## real 18m56.337s
63 | ## user 25m23.004s
64 | ## sys 7m58.104s
65 | 
66 | #genomewide_labels --task_list tasks.labelgen.tsv \
67 | #    --outf classificationlabels.50PercentOverlap.tsv.gz \
68 | #    --output_type gzip \
69 | #    --chrom_sizes hg38.chrom.sizes \
70 | #    --bin_stride 50 \
71 | #    --left_flank 400 \
72 | #    --right_flank 400 \
73 | #    --chrom_threads 10 \
74 | #    --task_threads 4 \
75 | #    --allow_ambiguous \
76 | #    --overlap_thresh 0.5 \
77 | #    --labeling_approach peak_percent_overlap_with_bin_classification
78 | 
79 | ##Regression Approach 1: Summit Must Lie Within 200 BP Bin
80 | ## Timing:
81 | ## real 18m15.728s
82 | ## user 24m25.028s
83 | ## sys 7m58.244s
84 | 
85 | #genomewide_labels --task_list tasks.labelgen.tsv \
86 | #    --outf regressionlabels.SummitWithin200bpCenter.tsv.gz \
87 | #    --output_type gzip \
88 | #    --chrom_sizes hg38.chrom.sizes \
89 | #    --bin_stride 50 \
90 | #    --left_flank 400 \
91 | #    --right_flank 400 \
92 | #    --bin_size 200 \
93 | #    --chrom_threads 10 \
94 | #    --task_threads 4 \
95 | #    --allow_ambiguous \
96 | #    --labeling_approach peak_summit_in_bin_regression
97 | #
98 | 
99 | #Regression Approach 2: 50% Overlap Between Peak and 200 BP Bin (50% of the Smaller of the Two)
100 | ## real 18m56.337s
101 | ## user 25m23.004s
102 | ## sys 7m58.104s
103 | 
104 | #genomewide_labels --task_list tasks.labelgen.tsv \
105 | #    --outf regressionlabels.50PercentOverlap.tsv.gz \
106 | #    --output_type gzip \
107 | #    --chrom_sizes hg38.chrom.sizes \
108 | #    --bin_stride 50 \
109 | #    --left_flank 400 \
110 | #    --right_flank 400 \
111 | #    --chrom_threads 10 \
112 | #    --task_threads 4 \
113 | #    --allow_ambiguous \
114 | #    --overlap_thresh 0.5 \
115 | #    --labeling_approach peak_percent_overlap_with_bin_regression
116 | 
117 | 
118 | ##Regression Approach 3: Provide bedtools coverage in the bigWig for every bin in the genome
119 | 
120 | ## Timing for hdf5 save
121 | ## real 8m51.275s
122 | ## user 17m38.576s
123 | ## sys 6m14.768s
124 | #genomewide_labels --task_list tasks.labelgen.tsv \
125 | #    --outf regressionlabels.allbins.hg38.hdf5 \
126 | #    --output_type hdf5 \
127 | #    --chrom_sizes hg38.chrom.sizes \
128 | #    --bin_stride 50 \
129 | #    --left_flank 400 \
130 | #    --right_flank 400 \
131 | #    --chrom_threads 24 \
132 | #    --task_threads 2 \
133 | #    --labeling_approach all_genome_bins_regression
134 | 
135 | ## Timing (pkl)
136 | ## real 23m10.448s
137 | ## user 31m55.056s
138 | ## sys 5m39.880s
139 | #genomewide_labels --task_list tasks.labelgen.tsv \
140 | #    --outf regressionlabels.allbins.hg38.pkl \
141 | #    --output_type pkl \
142 | #    --chrom_sizes hg38.chrom.sizes \
143 | #    --bin_stride 50 \
144 | #    --left_flank 400 \
145 | #    --right_flank 400 \
146 | #    --chrom_threads 24 \
147 | #    --task_threads 2 \
148 | #    --labeling_approach all_genome_bins_regression
149 | 
150 | 
151 | ## Timing for full data frame (gzip)
152 | ## real 29m50.597s
153 | ## user 38m2.020s
154 | ## sys 6m34.064s
155 | 
156 | ## Timing for chromosome-specific dataframes (gzip)
157 | ## real 21m35.525s
158 | ## user 51m55.496s
159 | ## sys 7m49.140s
160 | #genomewide_labels --task_list tasks.labelgen.tsv \
161 | #    --outf regressionlabels.allbins.hg38.tsv.gz \
162 | #    --output_type gzip \
163 | #    --chrom_sizes hg38.chrom.sizes \
164 | #    --bin_stride 50 \
165 | #    --left_flank 400 \
166 | #    --right_flank 400 \
167 | #    --chrom_threads 24 \
168 | #    --task_threads 2 \
169 | #    --split_output_by_chrom \
170 | #    --labeling_approach all_genome_bins_regression
171 | 
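
## Illustrative sketch: regression labels with a log10 transform and an explicit
## pseudocount, as described in the README's label_transformer documentation.
## (Assumes a --label_transformer_pseudocount flag mirroring the
## label_transformer_pseudocount python argument; verify against your installed
## version before running.)
#genomewide_labels --task_list tasks.labelgen.tsv \
#    --outf regressionlabels.allbins.log10.hg38.hdf5 \
#    --output_type hdf5 \
#    --chrom_sizes hg38.chrom.sizes \
#    --bin_stride 50 \
#    --left_flank 400 \
#    --right_flank 400 \
#    --chrom_threads 24 \
#    --task_threads 2 \
#    --label_transformer log10 \
#    --label_transformer_pseudocount 0.001 \
#    --labeling_approach all_genome_bins_regression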
-------------------------------------------------------------------------------- /examples/labelgen/hg38.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 248956422 2 | chr2 242193529 3 | chr3 198295559 4 | chr4 190214555 5 | chr5 181538259 6 | chr6 170805979 7 | chr7 159345973 8 | chr8 145138636 9 | chr9 138394717 10 | chr10 133797422 11 | chr11 135086622 12 | chr12 133275309 13 | chr13 114364328 14 | chr14 107043718 15 | chr15 101991189 16 | chr16 90338345 17 | chr17 83257441 18 | chr18 80373285 19 | chr19 58617616 20 | chr20 64444167 21 | chr21 46709983 22 | chr22 50818468 23 | chrX 156040895 24 | chrY 57227415 -------------------------------------------------------------------------------- /examples/labelgen/hg38.chrom21.sizes: -------------------------------------------------------------------------------- 1 | chr21 46709983 2 | -------------------------------------------------------------------------------- /examples/labelgen/peak_files_from_encode_for_label_comparison: -------------------------------------------------------------------------------- 1 | ../peak_files_from_encode_for_label_comparison -------------------------------------------------------------------------------- /examples/labelgen/save_source_labels_in_labelgen.sh: -------------------------------------------------------------------------------- 1 | #save output as tsv.gz 2 | genomewide_labels --task_list tasks.labelgen.tsv \ 3 | --outf classificationlabels.SummitWithin200bpCenter.tsv.gz \ 4 | --output_type gzip \ 5 | --chrom_sizes hg38.chrom.sizes \ 6 | --bin_stride 50 \ 7 | --left_flank 400 \ 8 | --right_flank 400 \ 9 | --bin_size 200 \ 10 | --chrom_threads 10 \ 11 | --task_threads 4 \ 12 | --allow_ambiguous \ 13 | --labeling_approach peak_summit_in_bin_classification \ 14 | --save_label_source 15 | #save output as hdf5 16 | genomewide_labels --task_list tasks.labelgen.tsv \ 17 | --outf classificationlabels.SummitWithin200bpCenter.hdf5 \ 18 | --output_type hdf5 \ 19 | --chrom_sizes hg38.chrom.sizes \ 20 | --bin_stride 50 \ 21 | --left_flank 400 \ 22 | --right_flank 400 \ 23 | --bin_size 200 \ 24 | --chrom_threads 10 \ 25 | --task_threads 4 \ 26 | --allow_ambiguous \ 27 | --labeling_approach peak_summit_in_bin_classification \ 28 | --save_label_source 29 | 30 | -------------------------------------------------------------------------------- /examples/labelgen/tasks.labelgen.tsv: -------------------------------------------------------------------------------- 1 | task narrowPeak bigwig ambig 2 | ENCFF209DJG peak_files_from_encode_for_label_comparison/ENCFF209DJG.bed.gz ./bigwig_files_from_encode_for_label_comparison/ENCFF842XRQ.bigWig hg38.blacklist.bed.gz 3 | ENCFF605WXD peak_files_from_encode_for_label_comparison/ENCFF605WXD.bed.gz ./bigwig_files_from_encode_for_label_comparison/ENCFF667QJZ.bigWig hg38.blacklist.bed.gz 4 | ENCFF073ORT peak_files_from_encode_for_label_comparison/ENCFF073ORT.bed.gz ./bigwig_files_from_encode_for_label_comparison/ENCFF743ULW.bigWig hg38.blacklist.bed.gz 5 | ENCFF618VMC peak_files_from_encode_for_label_comparison/ENCFF618VMC.bed.gz ./bigwig_files_from_encode_for_label_comparison/ENCFF796GHK.bigWig hg38.blacklist.bed.gz 6 | -------------------------------------------------------------------------------- /examples/peak_files_from_encode_for_label_comparison/README: -------------------------------------------------------------------------------- 1 | DNAse GM12878 Stam hg38 
https://www.encodeproject.org/files/ENCFF073ORT/@@download/ENCFF073ORT.bed.gz 2 | DNAse Hepg2 Stam hg38 https://www.encodeproject.org/files/ENCFF209DJG/@@download/ENCFF209DJG.bed.gz 3 | Max K562 Snyder hg38 https://www.encodeproject.org/files/ENCFF618VMC/@@download/ENCFF618VMC.bed.gz 4 | Myc K562 Snyder hg38 https://www.encodeproject.org/files/ENCFF605WXD/@@download/ENCFF605WXD.bed.gz 5 | -------------------------------------------------------------------------------- /examples/seqdataloader_examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#load tutorial utilities \n", 10 | "%reload_ext autoreload\n", 11 | "%autoreload 2\n", 12 | "%matplotlib inline\n", 13 | "import warnings\n", 14 | "warnings.filterwarnings('ignore')" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Ingesting data into tileDB " 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from seqdataloader.dbingest import * " 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "The header of the input task file should contain (one or more) of the following fields: \n", 38 | " * dataset (this one's required -- it's a unique label for your dataset) \n", 39 | " * pval_bigwig \n", 40 | " * fc_bigwig \n", 41 | " * count_bigwig_plus_5p \n", 42 | " * count_bigwig_minux_5p\n", 43 | " * idr_peak\n", 44 | " * overlap_peak \n", 45 | " * ambig_peak \n", 46 | " \n", 47 | "The file paths can be either local or web-based URL's. " 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "cat: tasks.dbingest.tsv: No such file or directory\r\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "!cat tasks.dbingest.tsv" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "You can run the ingest code as a python function: " 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "loaded tiledb metadata\n", 84 | "loaded chrom sizes\n", 85 | "tiledb group already exists\n", 86 | "got data dict\n", 87 | "parsed pool inputs\n", 88 | "made pool!\n", 89 | "warning: the array: hepg2_dnase_encode/ENCFF209DJG.chr21 already exists. 
You provided the --overwrite flag, so it will be updated/overwritten\n", 90 | "here\n", 91 | "store_summits:True\n", 92 | "summit_indicator:2\n", 93 | "got:idr_peak for chrom:chr21\n", 94 | "store_summits:False\n", 95 | "summit_indicator:None\n", 96 | "got:fc_bigwig for chrom:chr21\n", 97 | "store_summits:False\n", 98 | "summit_indicator:None\n", 99 | "got:ambig_peak for chrom:chr21\n", 100 | "starting to write output\n", 101 | "got cur vals\n", 102 | "idr_peak\n", 103 | "dict_to_write[key].shape:(46709983,)\n", 104 | "fc_bigwig\n", 105 | "dict_to_write[key].shape:(46709983,)\n", 106 | "ambig_peak\n", 107 | "dict_to_write[key].shape:(46709983,)\n", 108 | "updated data dict for writing\n", 109 | "finalizing the write\n", 110 | "0\n", 111 | "1000000\n", 112 | "2000000\n", 113 | "3000000\n", 114 | "4000000\n", 115 | "5000000\n", 116 | "6000000\n", 117 | "7000000\n", 118 | "8000000\n", 119 | "9000000\n", 120 | "10000000\n", 121 | "11000000\n", 122 | "12000000\n", 123 | "13000000\n", 124 | "14000000\n", 125 | "15000000\n", 126 | "16000000\n", 127 | "17000000\n", 128 | "18000000\n", 129 | "19000000\n", 130 | "20000000\n", 131 | "21000000\n", 132 | "22000000\n", 133 | "23000000\n", 134 | "24000000\n", 135 | "25000000\n", 136 | "26000000\n", 137 | "27000000\n", 138 | "28000000\n", 139 | "29000000\n", 140 | "30000000\n", 141 | "31000000\n", 142 | "32000000\n", 143 | "33000000\n", 144 | "34000000\n", 145 | "35000000\n", 146 | "36000000\n", 147 | "37000000\n", 148 | "38000000\n", 149 | "39000000\n", 150 | "40000000\n", 151 | "41000000\n", 152 | "42000000\n", 153 | "43000000\n", 154 | "44000000\n", 155 | "45000000\n", 156 | "46000000\n", 157 | "length of pool inputs:48\n", 158 | "made pool\n", 159 | "start:0, end:1000000\n", 160 | "start:1000000, end:2000000\n", 161 | "start:2000000, end:3000000\n", 162 | "start:3000000, end:4000000\n", 163 | "start:4000000, end:5000000\n", 164 | "start:5000000, end:6000000\n", 165 | "start:6000000, end:7000000\n", 166 | "start:7000000, end:8000000\n", 167 | "start:8000000, end:9000000\n", 168 | "start:9000000, end:10000000\n", 169 | "start:10000000, end:11000000\n", 170 | "start:11000000, end:12000000\n", 171 | "start:12000000, end:13000000\n", 172 | "start:13000000, end:14000000\n", 173 | "start:14000000, end:15000000\n", 174 | "start:15000000, end:16000000\n", 175 | "start:16000000, end:17000000\n", 176 | "start:17000000, end:18000000\n", 177 | "start:18000000, end:19000000\n", 178 | "start:19000000, end:20000000\n", 179 | "start:20000000, end:21000000\n", 180 | "start:21000000, end:22000000\n", 181 | "start:22000000, end:23000000\n", 182 | "start:23000000, end:24000000\n", 183 | "start:24000000, end:25000000\n", 184 | "start:25000000, end:26000000\n", 185 | "start:26000000, end:27000000\n", 186 | "start:27000000, end:28000000\n", 187 | "start:28000000, end:29000000\n", 188 | "start:29000000, end:30000000\n", 189 | "start:30000000, end:31000000\n", 190 | "start:31000000, end:32000000\n", 191 | "start:32000000, end:33000000\n", 192 | "start:33000000, end:34000000\n", 193 | "start:34000000, end:35000000\n", 194 | "start:35000000, end:36000000\n", 195 | "start:36000000, end:37000000\n", 196 | "start:37000000, end:38000000\n", 197 | "start:38000000, end:39000000\n", 198 | "start:39000000, end:40000000\n", 199 | "start:40000000, end:41000000\n", 200 | "start:41000000, end:42000000\n", 201 | "start:42000000, end:43000000\n", 202 | "start:43000000, end:44000000\n", 203 | "start:44000000, end:45000000\n", 204 | "start:45000000, end:46000000\n", 205 | "start:46000000, 
end:47000000\n", 206 | "start:47000000, end:46709983\n", 207 | "done writing\n", 208 | "wrote array to disk for dataset:hepg2_dnase_encode/ENCFF209DJG.chr21\n" 209 | ] 210 | }, 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "'done'" 215 | ] 216 | }, 217 | "execution_count": 4, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "args={\"tiledb_metadata\":\"tasks.dbingest.tsv\",\n", 224 | " \"tiledb_group\":\"hepg2_dnase_encode\",\n", 225 | " \"overwrite\":True,\n", 226 | " \"chrom_sizes\":\"hg38.chrom21.sizes\",\n", 227 | " \"chrom_threads\":1,\n", 228 | " \"task_threads\":1,\n", 229 | " \"write_threads\":1}\n", 230 | "\n", 231 | "ingest(args)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "Or you can run the code as a script: " 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 4, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "name": "stdout", 248 | "output_type": "stream", 249 | "text": [ 250 | "cat: tasks.dbingest.local.tsv: No such file or directory\r\n" 251 | ] 252 | } 253 | ], 254 | "source": [ 255 | "!cat ~/seqdataltasks.dbingest.local.tsv" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "!db_ingest --tiledb_metadata tasks.dbingest.local.tsv \\\n", 265 | " --tiledb_group hepg2_dnase_encode \\\n", 266 | " --overwrite \\\n", 267 | " --chrom_sizes hg38.chrom.sizes \\\n", 268 | " --chrom_threads 25 \\\n", 269 | " --attribute_config encode_pipeline \\\n", 270 | " --tile_size 9000 \\\n", 271 | " --batch_size 1000000\n" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 8, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "odict_keys(['pval_bigwig', 'fc_bigwig', 'count_bigwig_plus_5p', 'count_bigwig_minux_5p', 'idr_peak', 'overlap_peak', 'ambig_peak'])\n", 284 | "odict_keys(['pval_bigwig', 'fc_bigwig', 'count_bigwig_plus_5p', 'count_bigwig_minux_5p', 'idr_peak', 'overlap_peak', 'ambig_peak'])\n" 285 | ] 286 | } 287 | ], 288 | "source": [ 289 | "#we can examine the array \n", 290 | "import tiledb \n", 291 | "data=tiledb.DenseArray(\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY.chr1\",'r')\n", 292 | "subset=data[30000000:31000000]\n", 293 | "print(subset.keys())\n", 294 | "data=tiledb.DenseArray(\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY.chr21\",'r')\n", 295 | "subset=data[30000000:31000000]\n", 296 | "print(subset.keys())" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 6, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/plain": [ 307 | "array([0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 308 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 309 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 310 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 311 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 312 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 313 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 314 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 315 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 316 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 317 | " 0.0121522 , 0.0121522 , 0.0121522 , 
0.0121522 , 0.0121522 ,\n", 318 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 319 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 320 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 321 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 322 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 323 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 324 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 325 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 326 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 327 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 328 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 329 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 330 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 331 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 332 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 333 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 334 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 335 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 336 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 337 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 338 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 339 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 340 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 341 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 342 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 343 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 344 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 345 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 346 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 347 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 348 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 349 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 350 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 351 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 352 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 353 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 354 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 355 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 356 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 357 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 358 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 359 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 360 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 361 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 362 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 363 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 364 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 365 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 366 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 
0.00303804,\n", 367 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 368 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 369 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 370 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 371 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 372 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 373 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 374 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 375 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 376 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 377 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 378 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 379 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 380 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 381 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 382 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 383 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 384 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 385 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 386 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 387 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 388 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 389 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 390 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 391 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 392 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 393 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 394 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 395 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 396 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 397 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 398 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 399 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 400 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 401 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 402 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 403 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 404 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 405 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 406 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 407 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 408 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 409 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 410 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 411 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 412 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 413 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 414 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 415 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 416 | " 
0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 417 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 418 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 419 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 420 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 421 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 422 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 423 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 424 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 425 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 426 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 427 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 428 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 429 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 430 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 431 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 432 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 433 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 434 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 435 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 436 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 437 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 438 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 439 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 440 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 441 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 442 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 443 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 444 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 445 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 446 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 447 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 448 | " 0. , 0. , 0. , 0. , 0. ,\n", 449 | " 0. , 0. , 0. , 0. , 0. ,\n", 450 | " 0. , 0. , 0. , 0. , 0. ,\n", 451 | " 0. , 0. , 0. , 0. , 0. ,\n", 452 | " 0. , 0. , 0. , 0. , 0. ,\n", 453 | " 0. , 0. , 0. , 0. , 0. ,\n", 454 | " 0. , 0. , 0. , 0. , 0. ,\n", 455 | " 0. , 0. , 0. , 0. , 0. ,\n", 456 | " 0. , 0. , 0. , 0. , 0. ,\n", 457 | " 0. , 0. , 0. , 0. , 0. ,\n", 458 | " 0. , 0. , 0. , 0. , 0. ,\n", 459 | " 0. , 0. , 0. , 0. , 0. ,\n", 460 | " 0. , 0. , 0. , 0. , 0. ,\n", 461 | " 0. , 0. , 0. , 0. , 0. ,\n", 462 | " 0. , 0. , 0. , 0. , 0. ,\n", 463 | " 0. , 0. , 0. , 0. , 0. ,\n", 464 | " 0. , 0. , 0. , 0. , 0. ,\n", 465 | " 0. , 0. , 0. , 0. , 0. ,\n", 466 | " 0. , 0. , 0. , 0. , 0. ,\n", 467 | " 0. , 0. , 0. , 0. , 0. ,\n", 468 | " 0. , 0. , 0. , 0. , 0. ,\n", 469 | " 0. , 0. , 0. , 0. , 0. ,\n", 470 | " 0. , 0. , 0. , 0. , 0. ,\n", 471 | " 0. , 0. , 0. , 0. , 0. ,\n", 472 | " 0. , 0. , 0. , 0. , 0. ,\n", 473 | " 0. , 0. , 0. , 0. , 0. ,\n", 474 | " 0. , 0. , 0. , 0. , 0. ,\n", 475 | " 0. , 0. , 0. , 0. , 0. ,\n", 476 | " 0. , 0. , 0. , 0. , 0. ,\n", 477 | " 0. , 0. , 0. , 0. , 0. ,\n", 478 | " 0. , 0. , 0. , 0. , 0. ,\n", 479 | " 0. , 0. , 0. , 0. , 0. ,\n", 480 | " 0. , 0. , 0. , 0. , 0. ,\n", 481 | " 0. , 0. , 0. , 0. , 0. 
,\n", 482 | " 0. , 0. , 0. , 0. , 0. ,\n", 483 | " 0. , 0. , 0. , 0. , 0. ,\n", 484 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 485 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 486 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 487 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 488 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 489 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 490 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 491 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 492 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 493 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 494 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 495 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 496 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 497 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 498 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 499 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 500 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 501 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 502 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 503 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 504 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 505 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 506 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ],\n", 507 | " dtype=float32)" 508 | ] 509 | }, 510 | "execution_count": 6, 511 | "metadata": {}, 512 | "output_type": "execute_result" 513 | } 514 | ], 515 | "source": [ 516 | "subset['fc_bigwig'][0:1000]" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 7, 522 | "metadata": {}, 523 | "outputs": [ 524 | { 525 | "data": { 526 | "text/plain": [ 527 | "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 528 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 529 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 530 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 531 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 532 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 533 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 534 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 535 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 536 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 537 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 538 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 539 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 540 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 541 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 542 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 543 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 544 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 545 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 546 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 547 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 548 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 549 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 550 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 551 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 552 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 553 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 554 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 555 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 556 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 557 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 558 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 559 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 560 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 561 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 562 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 563 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 564 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 565 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 566 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 567 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 568 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 569 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 570 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 571 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 572 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])" 573 | ] 574 | }, 575 | "execution_count": 7, 576 | "metadata": {}, 577 | "output_type": "execute_result" 578 | } 579 | ], 580 | "source": [ 581 | "subset['idr_peak'][0:1000]" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "## Genomewide classification labels " 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "metadata": {}, 595 | "outputs": [], 596 | "source": [ 597 | "from seqdataloader.labelgen import *\n", 598 | "classification_params={\n", 599 | " 'task_list':\"tasks.labelgen.tsv\",\n", 600 | " 'outf':\"classificationlabels.SummitWithin200bpCenter.tsv.gz\",\n", 601 | " 'output_type':'gzip',\n", 602 | " 'chrom_sizes':'hg38.chrom.sizes',\n", 603 | " 'chroms_to_keep':['chr21'],\n", 604 | " \"store_positives_only\":True,\n", 605 | " 'bin_stride':50,\n", 606 | " 'left_flank':400,\n", 607 | " 'right_flank':400,\n", 608 | " 'bin_size':200,\n", 609 | " 'task_threads':10,\n", 610 | " 'chrom_threads':4,\n", 611 | " 'allow_ambiguous':True,\n", 612 | " 'labeling_approach':'peak_summit_in_bin_classification'\n", 613 | " }\n", 614 | "genomewide_labels(classification_params)\n", 615 | "\n" 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "## Genomewide regression labels " 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": {}, 629 | "outputs": 
[], 630 | "source": [ 631 | "regression_params={\n", 632 | " 'task_list':\"tasks.labelgen.tsv\",\n", 633 | " 'outf':\"regressionlabels.all_genome_bins_regression.hdf5\",\n", 634 | " 'output_type':'hdf5',\n", 635 | " 'chrom_sizes':'hg38.chrom.sizes',\n", 636 | " 'store_values_above_thresh': 0,\n", 637 | " 'chroms_to_keep':['chr21'],\n", 638 | " 'bin_stride':50,\n", 639 | " 'left_flank':400,\n", 640 | " 'right_flank':400,\n", 641 | " 'bin_size':200,\n", 642 | " 'threads':10,\n", 643 | " 'subthreads':4,\n", 644 | " 'labeling_approach':'all_genome_bins_regression'\n", 645 | " }\n", 646 | "genomewide_labels(regression_params)\n" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "Let's examine the output dataframe for the regression case: " 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": null, 659 | "metadata": {}, 660 | "outputs": [], 661 | "source": [ 662 | "import pandas as pd\nregression_data=pd.read_hdf(\"regressionlabels.all_genome_bins_regression.hdf5\")" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": null, 668 | "metadata": {}, 669 | "outputs": [], 670 | "source": [ 671 | "regression_data.head()" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "metadata": {}, 678 | "outputs": [], 679 | "source": [ 680 | "regression_negatives=pd.read_hdf(\"universal_negatives.regressionlabels.all_genome_bins_regression.hdf5\")\n", 681 | "regression_negatives.head()" 682 | ] 683 | }, 684 | { 685 | "cell_type": "markdown", 686 | "metadata": {}, 687 | "source": [ 688 | "For the classification case, we specified \"store_positives_only\", so the script generated two dataframes: \n", 689 | " * Universal negatives \n", 690 | " * Dataframe where each bin is >0 for at least one task " 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "metadata": {}, 697 | "outputs": [], 698 | "source": [ 699 | "classification_pos=pd.read_csv(\"classificationlabels.SummitWithin200bpCenter.tsv.gz\",sep='\\t',header=0)" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "metadata": {}, 706 | "outputs": [], 707 | "source": [ 708 | "classification_pos.head()" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "classification_neg=pd.read_csv(\"universal_negatives.classificationlabels.SummitWithin200bpCenter.tsv.gz\",sep='\\t',header=0)" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": null, 723 | "metadata": {}, 724 | "outputs": [], 725 | "source": [ 726 | "classification_neg.head()" 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": null, 732 | "metadata": {}, 733 | "outputs": [], 734 | "source": [] 735 | } 736 | ], 737 | "metadata": { 738 | "kernelspec": { 739 | "display_name": "Python 3", 740 | "language": "python", 741 | "name": "python3" 742 | }, 743 | "language_info": { 744 | "codemirror_mode": { 745 | "name": "ipython", 746 | "version": 3 747 | }, 748 | "file_extension": ".py", 749 | "mimetype": "text/x-python", 750 | "name": "python", 751 | "nbconvert_exporter": "python", 752 | "pygments_lexer": "ipython3", 753 | "version": "3.7.0" 754 | } 755 | }, 756 | "nbformat": 4, 757 | "nbformat_minor": 2 758 | } 759 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | numpy>=1.15 2 | pandas>=0.23.4 3 | cython>=0.27.3 4 | deeptools>=3.0.1 5 | psutil 6 | pybedtools>=0.7 7 | pyBigWig>=0.3.7 8 | pyfaidx 9 | tiledb>=0.4.4 10 | -------------------------------------------------------------------------------- /seqdataloader/attrib_config.py: -------------------------------------------------------------------------------- 1 | from .utils import * 2 | allowed_attributes={} 3 | allowed_attributes['bigwig']={'dtype':'float32', 4 | 'opener':open_bigwig_for_parsing, 5 | 'parser':parse_bigwig_chrom_vals, 6 | 'store_summits':False} 7 | allowed_attributes['bed_no_summit']={'dtype':'int', 8 | 'opener':open_csv_for_parsing, 9 | 'parser':parse_narrowPeak_chrom_vals, 10 | 'store_summits':False, 11 | 'summit_from_peak_center':False} 12 | allowed_attributes['bed_summit_from_peak_center']={'dtype':'int', 13 | 'opener':open_csv_for_parsing, 14 | 'parser':parse_narrowPeak_chrom_vals, 15 | 'store_summits':True, 16 | 'summit_indicator':2, 17 | 'summit_from_peak_center':True} 18 | allowed_attributes['bed_summit_from_last_col']={'dtype':'int', 19 | 'opener':open_csv_for_parsing, 20 | 'parser':parse_narrowPeak_chrom_vals, 21 | 'store_summits':True, 22 | 'summit_indicator':2, 23 | 'summit_from_peak_center':False} 24 | 25 | def get_generic_bigwig_config(): 26 | attrib_info=dict() 27 | attrib_info['bigwig_track']=allowed_attributes['bigwig'] 28 | attrib_info['ambig_peak']=allowed_attributes['bed_no_summit'] 29 | return attrib_info 30 | 31 | def get_encode_with_controls_config(): 32 | attrib_info=get_encode_config() 33 | #add the control count tracks 34 | attrib_info['control_count_bigwig_unstranded_5p']=allowed_attributes['bigwig'] 35 | attrib_info['control_count_bigwig_plus_5p']=allowed_attributes['bigwig'] 36 | attrib_info['control_count_bigwig_minus_5p']=allowed_attributes['bigwig'] 37 | return attrib_info 38 | 39 | 40 | def get_encode_config(): 41 | attrib_info=dict() 42 | 43 | attrib_info['pval_bigwig']=allowed_attributes['bigwig'] 44 | attrib_info['fc_bigwig']=allowed_attributes['bigwig'] 45 | attrib_info['count_bigwig_plus_5p']=allowed_attributes['bigwig'] 46 | attrib_info['count_bigwig_minus_5p']=allowed_attributes['bigwig'] 47 | attrib_info['count_bigwig_unstranded_5p']=allowed_attributes['bigwig'] 48 | attrib_info['idr_peak']=allowed_attributes['bed_summit_from_last_col'] 49 | attrib_info['overlap_peak']=allowed_attributes['bed_summit_from_last_col'] 50 | attrib_info['ambig_peak']=allowed_attributes['bed_no_summit'] 51 | return attrib_info 52 | 53 | def get_attribute_info_from_file(attribute_config_file): 54 | config_metadata=open(attribute_config_file,'r').read().strip().split('\n') 55 | attrib_info={} 56 | for line in config_metadata: 57 | tokens=line.split('\t') 58 | field_name=tokens[0] 59 | field_type=tokens[1] 60 | attrib_info[field_name]=allowed_attributes[field_type] 61 | return attrib_info 62 | 63 | def get_attribute_info(attribute_config,attribute_config_file): 64 | assert (attribute_config is None) or (attribute_config_file is None) 65 | if attribute_config_file is not None: 66 | return get_attribute_info_from_file(attribute_config_file) 67 | try: 68 | name_to_config=dict() 69 | name_to_config['encode_pipeline_with_controls']=get_encode_with_controls_config() 70 | name_to_config['encode_pipeline']=get_encode_config() 71 | name_to_config['generic_bigwig']=get_generic_bigwig_config() 72 | attrib_info=name_to_config[attribute_config] 73 | return attrib_info 74 | except Exception as e: 
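#unrecognized attribute_config name -- surface the lookup error (most likely a KeyError) to the caller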
75 | raise e 76 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | from . import coordbased 3 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | from . import coordstovals 3 | from . import coordbatchproducers 4 | from . import coordbatchtransformers 5 | from . import core 6 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordbatchproducers.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | import gzip 3 | from .core import Coordinates 4 | import numpy as np 5 | 6 | 7 | class KerasSequenceApiCoordsBatchProducer(object): 8 | 9 | """ 10 | Args: 11 | batch_size (int): note that if you apply some kind of augmentation, 12 | then this value will end up being half of the actual batch size. 13 | shuffle_before_epoch (boolean, optional): default False 14 | seed (int): default 1234; needed if shuffle_before_epoch=True 15 | """ 16 | def __init__(self, batch_size, shuffle_before_epoch, seed): 17 | self.coords_list = self._get_coordslist() 18 | self.batch_size = batch_size 19 | self.shuffle_before_epoch = shuffle_before_epoch 20 | self.seed = seed 21 | if (self.shuffle_before_epoch): 22 | self.rng = np.random.RandomState(self.seed) 23 | self._shuffle_coordslist() 24 | 25 | def _get_coordslist(self): 26 | raise NotImplementedError() 27 | 28 | def _shuffle_coordslist(self): 29 | self.rng.shuffle(self.coords_list) 30 | 31 | def __getitem__(self, index): 32 | """ 33 | Args: 34 | index (:obj:`int`): index of the batch 35 | 36 | Returns: 37 | :obj:`list`: the coordinates for a complete batch 38 | """ 39 | return self.coords_list[index*self.batch_size: 40 | (index+1)*self.batch_size] 41 | 42 | def __len__(self): 43 | """ 44 | Returns: 45 | The total number of batches to return 46 | """ 47 | return int(np.ceil(len(self.coords_list)/float(self.batch_size))) 48 | 49 | def on_epoch_end(self): 50 | """ 51 | Things to be executed after the epoch - like shuffling the coords 52 | """ 53 | if (self.shuffle_before_epoch): 54 | self._shuffle_coordslist() 55 | 56 | 57 | class BedFileObj(object): 58 | def __init__(self, bed_file, hastitle=False): 59 | print("Heads up: coordinates in bed file" 60 | +" are assumed to be on the positive strand;" 61 | +" if strand in the bed file is important to you, please" 62 | +" add that feature to SimpleCoordsBatchProducer") 63 | self.bed_file = bed_file 64 | self.hastitle = hastitle 65 | self.coords_list = self._read_bed_file() 66 | 67 | def _read_bed_file(self): 68 | coords_list = [] 69 | for linenum,line in enumerate((gzip.open(self.bed_file) if ".gz" 70 | in self.bed_file 71 | else open(self.bed_file, 'rb'))): #open in binary mode so .decode() works for both gzipped and plain files 72 | if (linenum > 0 or self.hastitle==False): 73 | (chrom, start_str, end_str) =\ 74 | line.decode("utf-8").rstrip().split("\t")[0:3] 75 | coords_list.append(Coordinates(chrom=chrom, 76 | start=int(start_str), 77 | end=int(end_str))) 78 | return coords_list 79 | 80 | def __len__(self): 81 | return len(self.coords_list) 82 | 83 | def 
get_strided_subsample(self, offset, stride): 84 | return self.coords_list[offset::stride] 85 | 86 | def assert_sorted(self): 87 | prev_entry = self.coords_list[0] 88 | for entry in self.coords_list[1:]: 89 | if entry.chrom==prev_entry.chrom: 90 | assert entry.start >= prev_entry.start, ("Bed file "+ 91 | self.bed_file+" is not sorted; "+str(entry) 92 | +" follows "+str(prev_entry)) 93 | prev_entry = entry 94 | 95 | 96 | class DownsampleNegativesCoordsBatchProducer( 97 | KerasSequenceApiCoordsBatchProducer): 98 | 99 | def __init__(self, pos_bed_file, neg_bed_file, 100 | target_proportion_positives, **kwargs): 101 | 102 | print("Reading in positive bed file") 103 | self.pos_bedfileobj = BedFileObj(bed_file=pos_bed_file) 104 | print("Got",len(self.pos_bedfileobj.coords_list), 105 | " coords in positive bed file") 106 | print("Reading in negative bed file") 107 | self.neg_bedfileobj = BedFileObj(bed_file=neg_bed_file) 108 | print("Got",len(self.neg_bedfileobj.coords_list), 109 | " coords in negative bed file") 110 | self.neg_bedfileobj.assert_sorted() 111 | 112 | self.target_proportion_positives = target_proportion_positives 113 | self.subsample_factor = int(np.ceil( 114 | (len(self.neg_bedfileobj.coords_list) 115 | *(self.target_proportion_positives/ 116 | (1-self.target_proportion_positives)) )/ 117 | len(self.pos_bedfileobj.coords_list))) 118 | print("The target proportion of positives of", 119 | self.target_proportion_positives,"requires the negative set" 120 | +" to be subsampled by a factor of",self.subsample_factor, 121 | "which will result in a #neg of", 122 | int(len(self.neg_bedfileobj.coords_list)/self.subsample_factor)) 123 | self.last_used_offset = -1 124 | super(DownsampleNegativesCoordsBatchProducer, self).__init__(**kwargs) 125 | 126 | def _shuffle_coordslist(self): 127 | self.rng.shuffle(self.subsampled_neg_coords) 128 | self.rng.shuffle(self.pos_coords) 129 | fracpos = len(self.pos_coords)/( 130 | len(self.pos_coords) + len(self.subsampled_neg_coords)) 131 | #interleave evenly 132 | pos_included = 0 133 | neg_included = 0 134 | new_coordslist = [] 135 | for i in range(len(self.pos_coords)+len(self.subsampled_neg_coords)): 136 | if (pos_included < (pos_included+neg_included)*(fracpos)): 137 | new_coordslist.append(self.pos_coords[pos_included]) 138 | pos_included += 1 139 | else: 140 | new_coordslist.append(self.subsampled_neg_coords[neg_included]) 141 | neg_included += 1 142 | assert pos_included==len(self.pos_coords) 143 | assert neg_included==len(self.subsampled_neg_coords) 144 | self.coords_list = new_coordslist 145 | 146 | def _get_coordslist(self): 147 | self.last_used_offset += 1 148 | self.last_used_offset = self.last_used_offset%self.subsample_factor 149 | print("Using an offset of ",self.last_used_offset," before striding") 150 | #the offset cycles through 0..subsample_factor-1, so a different negative subsample is used each epoch 151 | subsampled_neg_coords = self.neg_bedfileobj.get_strided_subsample( 152 | offset=self.last_used_offset, 153 | stride=self.subsample_factor) 154 | pos_coords = self.pos_bedfileobj.coords_list 155 | self.subsampled_neg_coords = subsampled_neg_coords 156 | self.pos_coords = pos_coords 157 | return pos_coords+subsampled_neg_coords 158 | 159 | def on_epoch_end(self): 160 | #get negative set with potentially different stride 161 | self.coords_list = self._get_coordslist() 162 | #perform shuffling as needed 163 | super(DownsampleNegativesCoordsBatchProducer, self).on_epoch_end() 164 | 165 | 166 | class SimpleCoordsBatchProducer(KerasSequenceApiCoordsBatchProducer): 167 | 168 | 
""" 169 | Args: 170 | bed_file (string): file with the bed coordinates. 171 | Assumes coordinates are on the positive strand. 172 | coord_batch_transformer (AbstracCoordBatchTransformer): does things 173 | like revcomp and random jitter 174 | """ 175 | def __init__(self, bed_file, 176 | hastitle=False, 177 | coord_batch_transformer=None, 178 | **kwargs): 179 | self.bed_file = BedFileObj(bed_file=bed_file, hastitle=hastitle) 180 | if (coord_batch_transformer is not None): 181 | raise DeprecationWarning( 182 | "Moving forward, coords_batch_transformer should be" 183 | +" specified as an argument to KerasBatchGenerator" 184 | +", not as an arugment to the CoordsBatchProducer." 185 | +" This is to allow different CoordsBatchProducer" 186 | +" implementations to be used with the same" 187 | +" coords_batch_transformer code.") 188 | self.coord_batch_transformer = coord_batch_transformer 189 | super(SimpleCoordsBatchProducer, self).__init__(**kwargs) 190 | 191 | def _get_coordslist(self): 192 | return [x for x in self.bed_file.coords_list] 193 | 194 | def __getitem__(self, index): 195 | orig_batch = self.coords_list[index*self.batch_size: 196 | (index+1)*self.batch_size] 197 | if (self.coord_batch_transformer is not None): 198 | return self.coord_batch_transformer(orig_batch) 199 | else: 200 | return orig_batch 201 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordbatchtransformers.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | from .core import Coordinates 3 | import numpy as np 4 | 5 | 6 | def get_revcomp(coordinate): 7 | return Coordinates(chrom=coordinate.chrom, 8 | start=coordinate.start, end=coordinate.end, 9 | isplusstrand=(coordinate.isplusstrand==False)) 10 | 11 | 12 | class AbstractCoordBatchTransformer(object): 13 | 14 | def __call__(self, coords): 15 | """ 16 | Args: 17 | coords (:obj:`list` of :obj:`Coordinates` objects): 18 | 19 | Returns: 20 | another :obj:`list` of :obj:`Coordinates` 21 | """ 22 | raise NotImplementedError() 23 | 24 | def chain(self, coord_batch_transformer): 25 | return lambda coords: coord_batch_transformer(self(coords)) 26 | 27 | 28 | class ReverseComplementAugmenter(AbstractCoordBatchTransformer): 29 | """ 30 | Returns a list of Coordinates twice the length of the 31 | original list by appending the reverse complements 32 | of the original coordinates at the end 33 | """ 34 | def __call__(self, coords): 35 | return coords + [get_revcomp(x) for x in coords] 36 | 37 | 38 | class UniformJitter(AbstractCoordBatchTransformer): 39 | 40 | def __init__(self, maxshift, seed=1234, chromsizes_file=None): 41 | """ 42 | Returns a list of Coordinates jittered relative to the original 43 | coordinates by a shift of up to +/- maxshift. Size of the 44 | shift is sampled from a uniform distribution. 45 | 46 | Args: 47 | maxshift (:obj:`int`): maximum possible shift to sample 48 | chromsizes (:obj:`string`): path to a chromsizes file. If 49 | specified, shifts will be adjusted so as to avoid going 50 | over the end of the chromosome. Default is None. 
51 | """ 52 | self.rng = np.random.RandomState(seed) 53 | self.maxshift = maxshift 54 | self.chromsizes = ( 55 | self._read_chromsizes(chromsizes_file=chromsizes_file) 56 | if chromsizes_file is not None else None) 57 | 58 | def _read_chromsizes(self, chromsizes_file): 59 | chrom_to_size = {} 60 | for row in open(chromsizes_file): 61 | chrom,chromlen = row.rstrip().split("\t") 62 | chromlen = int(chromlen) 63 | chrom_to_size[chrom] = chromlen 64 | return chrom_to_size 65 | 66 | def __call__(self, coords): 67 | a_list = [] 68 | for coord in coords: 69 | chrom = coord.chrom 70 | start = coord.start 71 | end = coord.end 72 | isplusstrand = coord.isplusstrand 73 | shift_size = int(self.rng.uniform(low=0, high=(2*self.maxshift + 1)) 74 | - self.maxshift) 75 | shift_size = max(-start, shift_size) 76 | if self.chromsizes is not None: 77 | shift_size = min(self.chromsizes[chrom]-end, shift_size) 78 | start = start + shift_size 79 | end = end + shift_size 80 | a_list.append(Coordinates(chrom=chrom, start=start, 81 | end=end, isplusstrand=isplusstrand)) 82 | return a_list 83 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordstovals/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | from . import bigwig 3 | from . import fasta 4 | from . import core 5 | from . import lookup 6 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordstovals/bigwig.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | import numpy as np 3 | import pyBigWig 4 | from .core import CoordsToVals, get_new_coors_around_center 5 | from ..core import Coordinates 6 | 7 | 8 | def rolling_window(a, window): 9 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 10 | strides = a.strides + (a.strides[-1],) 11 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 12 | 13 | 14 | def smooth_profiles(profiles, smoothing_window): 15 | assert len(profiles.shape)==3 16 | leftpadlen = int((smoothing_window-1)/2) 17 | rightpadlen =\ 18 | (smoothing_window-1)-int((smoothing_window-1)/2) 19 | padded_profiles = np.pad( 20 | array=profiles, 21 | pad_width=((0,0),(leftpadlen, rightpadlen), (0,0)), 22 | mode='edge') 23 | smoothed_profiles = np.mean(rolling_window( 24 | a=padded_profiles.transpose(0,2,1), 25 | window=smoothing_window), axis=-1).transpose((0,2,1)) 26 | return smoothed_profiles 27 | 28 | 29 | class BigWigReader(object): 30 | 31 | def __init__(self, bigwig_path): 32 | """ 33 | Args: 34 | bigwig_path (:obj:`str`): path to the .bw file 35 | """ 36 | self.bigwig_path = bigwig_path 37 | self.bw = pyBigWig.open(bigwig_path) 38 | 39 | def read_values(self, coors): 40 | """ 41 | Args: 42 | coords (:obj:`list` of :obj:Coordinates) 43 | 44 | Returns: 45 | ndarray of dims (nexamples x width). All the coordinates must be 46 | of the same length. 
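NaN values in the bigwig are converted to 0, and values for minus-strand coordinates are returned reversed.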
47 | """ 48 | to_return = [] 49 | for coor in coors: 50 | to_append = np.nan_to_num( 51 | x=self.bw.values(coor.chrom, coor.start, coor.end)) 52 | if (coor.isplusstrand==False): 53 | to_append = to_append[::-1] 54 | to_return.append(to_append) 55 | lengths = set([len(x) for x in to_return]) 56 | assert len(lengths)==1, ("All the sequences must be of the same" 57 | +" length, but lengths are "+str(lengths)) 58 | return np.array(to_return) 59 | 60 | 61 | class LogCountsAndProfile(CoordsToVals): 62 | 63 | def __init__(self, bigwig_path, counts_mode_name, 64 | profile_mode_name): 65 | self.reader = BigWigReader(bigwig_path=bigwig_path) 66 | self.counts_mode_name = counts_mode_name 67 | self.profile_mode_name = profile_mode_name 68 | 69 | def __call__(self, coors): 70 | profile_values = self.reader.read_values(coors=coors) 71 | counts = np.log(np.sum(profile_values, axis=-1)+1) 72 | to_return = {self.counts_mode_name: counts, 73 | self.profile_mode_name: profile_values} 74 | return to_return 75 | 76 | 77 | class AbstractCountAndProfileTransformer(object): 78 | 79 | def __call__(self, counts, profiles): 80 | raise NotImplementedError() 81 | 82 | def chain(self, count_and_profile_transformer): 83 | def chained_count_and_profile_transformer(counts, profiles): 84 | counts, profiles = self(counts=counts, profiles=profiles) 85 | return count_and_profile_transformer( 86 | counts=counts, profiles=profiles) 87 | return chained_count_and_profile_transformer 88 | 89 | 90 | class LogCountsPlusOne(AbstractCountAndProfileTransformer): 91 | 92 | def __call__(self, counts, profiles): 93 | return np.log(counts+1), profiles 94 | 95 | 96 | class SmoothProfiles(AbstractCountAndProfileTransformer): 97 | 98 | def __init__(self, smoothing_windows): 99 | self.smoothing_windows = smoothing_windows 100 | 101 | def __call__(self, counts, profiles): 102 | profiles_to_return = np.concatenate([ 103 | smooth_profiles(profiles=profiles, smoothing_window=x) 104 | for x in self.smoothing_windows], axis=-1) 105 | return counts, profiles_to_return 106 | 107 | 108 | class MultiTrackCountsAndProfile(CoordsToVals): 109 | 110 | def __init__(self, bigwig_paths, 111 | counts_and_profiles_transformer, 112 | counts_mode_name, 113 | profile_mode_name, center_size_to_use): 114 | self.bigwig_readers = [BigWigReader(bigwig_path=x) 115 | for x in bigwig_paths] 116 | self.counts_and_profiles_transformer = counts_and_profiles_transformer 117 | self.counts_mode_name = counts_mode_name 118 | self.profile_mode_name = profile_mode_name 119 | self.center_size_to_use = center_size_to_use 120 | 121 | def _get_counts_and_vals(self, coors): 122 | new_coors = get_new_coors_around_center( 123 | coors=coors, 124 | center_size_to_use=self.center_size_to_use) 125 | #concatenate the results of the bigwig readers along the last axis 126 | profiles = np.concatenate([ 127 | x.read_values(coors=new_coors)[:,:,None] 128 | for x in self.bigwig_readers], axis=-1) 129 | counts = np.sum(profiles, axis=1) 130 | return (counts, profiles) 131 | 132 | def __call__(self, coors): 133 | counts, profiles = self._get_counts_and_vals(coors=coors) 134 | counts_transformed, profile_transformed =\ 135 | self.counts_and_profiles_transformer( 136 | counts=counts, 137 | profiles=profiles) 138 | return {self.counts_mode_name: counts_transformed, 139 | self.profile_mode_name: profile_transformed} 140 | 141 | 142 | class AbstractPosAndNegStrandCountsAndProfile(CoordsToVals): 143 | 144 | def __init__(self, pos_strand_bigwig_path, neg_strand_bigwig_path, 145 | counts_mode_name, 
profile_mode_name, 146 | center_size_to_use): 147 | self.pos_strand_reader =\ 148 | BigWigReader(bigwig_path=pos_strand_bigwig_path) 149 | self.neg_strand_reader =\ 150 | BigWigReader(bigwig_path=neg_strand_bigwig_path) 151 | self.counts_mode_name = counts_mode_name 152 | self.profile_mode_name = profile_mode_name 153 | self.center_size_to_use = center_size_to_use 154 | 155 | def _get_pos_and_neg_counts_and_vals(self, coors): 156 | new_coors = get_new_coors_around_center( 157 | coors=coors, 158 | center_size_to_use=self.center_size_to_use) 159 | first_strand_profile_values = self.pos_strand_reader.read_values( 160 | coors=new_coors) 161 | second_strand_profile_values = np.abs( 162 | self.neg_strand_reader.read_values(coors=new_coors)) 163 | pos_profile_values = [] 164 | neg_profile_values = [] 165 | #need to swap the pos and neg strands if the strand is negative 166 | for (first_strand, second_strand, coor) in zip(first_strand_profile_values, 167 | second_strand_profile_values, 168 | coors): 169 | if (coor.isplusstrand==True): 170 | pos_profile_values.append(first_strand) 171 | neg_profile_values.append(second_strand) 172 | else: 173 | pos_profile_values.append(second_strand) 174 | neg_profile_values.append(first_strand) 175 | pos_profile_values = np.array(pos_profile_values) 176 | neg_profile_values = np.array(neg_profile_values) 177 | pos_counts = np.sum(pos_profile_values, axis=-1) 178 | neg_counts = np.sum(neg_profile_values, axis=-1) 179 | return (pos_counts, neg_counts, pos_profile_values, neg_profile_values) 180 | 181 | def combine_pos_and_neg_counts_and_vals(self, 182 | pos_counts, neg_counts, pos_profile_values, neg_profile_values): 183 | """ 184 | Returns: 185 | ndarray: combined/transformed counts 186 | ndarray: combined/transformed profile 187 | """ 188 | raise NotImplementedError() 189 | 190 | def __call__(self, coors): 191 | pos_counts, neg_counts, pos_profile_values, neg_profile_values =( 192 | self._get_pos_and_neg_counts_and_vals(coors=coors)) 193 | counts_ndarray, profile_ndarray =\ 194 | self.combine_pos_and_neg_counts_and_vals(pos_counts=pos_counts, neg_counts=neg_counts, 195 | pos_profile_values=pos_profile_values, 196 | neg_profile_values=neg_profile_values) 197 | return {self.counts_mode_name: counts_ndarray, 198 | self.profile_mode_name: profile_ndarray} 199 | 200 | 201 | class PosAndNegSeparateLogCounts(AbstractPosAndNegStrandCountsAndProfile): 202 | 203 | def __init__(self, **kwargs): 204 | super(PosAndNegSeparateLogCounts,self).__init__(**kwargs) 205 | 206 | def combine_pos_and_neg_counts_and_vals(self, 207 | pos_counts, neg_counts, pos_profile_values, neg_profile_values): 208 | 209 | return (np.concatenate([np.log(pos_counts+1)[:,None], 210 | np.log(neg_counts+1)[:,None]], axis=1), 211 | np.concatenate( 212 | [pos_profile_values[:,:,None], 213 | neg_profile_values[:,:,None]], axis=2)) 214 | 215 | 216 | class PosAndNegSmoothWindowCollapsedLogCounts( 217 | AbstractPosAndNegStrandCountsAndProfile): 218 | 219 | def __init__(self, smoothing_windows, **kwargs): 220 | super(PosAndNegSmoothWindowCollapsedLogCounts, self).__init__(**kwargs) 221 | self.smoothing_windows = smoothing_windows 222 | 223 | def combine_pos_and_neg_counts_and_vals(self, pos_counts, neg_counts, 224 | pos_profile_values, neg_profile_values): 225 | 226 | profile_sum = ( 227 | pos_profile_values[:,:]+ 228 | neg_profile_values[:,:]) 229 | 230 | smoothed_profiles = [] 231 | for smoothing_window in self.smoothing_windows: 232 | padded_profile = smooth_profiles(profiles=profile_sum[:,:,None], 233 | 
smoothing_window=smoothing_window) 234 | smoothed_profiles.append(padded_profile) 235 | 236 | smoothed_profiles = np.concatenate(smoothed_profiles, axis=2) 237 | 238 | return (np.log(pos_counts+neg_counts+1), smoothed_profiles) 239 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordstovals/core.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | from ..core import Coordinates 3 | 4 | 5 | def get_new_coors_around_center(coors, center_size_to_use): 6 | new_coors = [] 7 | for coor in coors: 8 | coor_center = int(0.5*(coor.start + coor.end)) 9 | left_flank = int(0.5*center_size_to_use) 10 | right_flank = center_size_to_use - left_flank 11 | new_start = coor_center-left_flank 12 | new_end = coor_center+right_flank 13 | new_coors.append(Coordinates(chrom=coor.chrom, 14 | start=new_start, end=new_end, 15 | isplusstrand=coor.isplusstrand)) 16 | return new_coors 17 | 18 | 19 | class CoordsToVals(object): 20 | 21 | def __call__(self, coors): 22 | """ 23 | Args: 24 | coors (:obj:`list` of :obj:`Coordinates`): 25 | 26 | Returns: 27 | numpy ndarray OR list of ndarrays OR a dict of mode_name->ndarray. 28 | Returns a list of ndarrays if returning multiple modes. 29 | Alternatively, returns a dict where key is the mode name 30 | and the value is the ndarray for the mode. 31 | """ 32 | raise NotImplementedError() 33 | 34 | 35 | class CoordsToValsJoiner(CoordsToVals): 36 | 37 | def __init__(self, coordstovals_list): 38 | """ 39 | Joins batches returned by other CoordsToVals objects 40 | 41 | Args: 42 | coordstovals_list (:obj:`list` of :obj:`CoordsToVals`): List of 43 | CoordsToVals whose values to combine 44 | """ 45 | self.coordstovals_list = coordstovals_list 46 | 47 | def __call__(self, coors): 48 | batch_to_return = None 49 | for idx,coordstovals_obj in enumerate(self.coordstovals_list): 50 | the_batch = coordstovals_obj(coors=coors) 51 | assert the_batch is not None 52 | if isinstance(the_batch, dict): 53 | assert ((batch_to_return is None) or 54 | (isinstance(batch_to_return, dict))), ( 55 | "coordstovals object at idx"+str(idx) 56 | +" returned a dict, but previous coordstovals" 57 | +" objects had a return type incompatible with this") 58 | if (batch_to_return is None): 59 | batch_to_return = {} 60 | for key in the_batch: 61 | assert key not in batch_to_return, ( 62 | "coordstovals object at idx"+str(idx) 63 | +" returned a dict with a key of "+key 64 | +", which collides with a pre-existing key returned by" 65 | +" another coordstovals object") 66 | batch_to_return.update(the_batch) 67 | else: 68 | assert ((batch_to_return is None) or 69 | (isinstance(batch_to_return, list))), ( 70 | "coordstovals object at idx"+str(idx) 71 | +" returned a type incompatible with dict, but previous" 72 | +" coordstovals objects had a return type of dict") 73 | if (isinstance(the_batch, list)==False): 74 | the_batch = [the_batch] 75 | if (batch_to_return is None): 76 | batch_to_return = [] 77 | batch_to_return.extend(the_batch) 78 | if (batch_to_return is None): 79 | batch_to_return = [] 80 | return batch_to_return 81 | 82 | 83 | class AbstractSingleNdarrayCoordsToVals(CoordsToVals): 84 | 85 | def __init__(self, mode_name=None): 86 | """ 87 | Args: 88 | mode_name (:obj:`str`, optional): default None. If None, then 89 | the return of __call__ will be a numpy ndarray.
Otherwise, it 90 | will be a dictionary with a key of mode_name and a value being 91 | the numpy ndarray. 92 | """ 93 | self.mode_name = mode_name 94 | 95 | def _get_ndarray(self, coors): 96 | """ 97 | Args: 98 | coors (:obj:`list` of :obj:`Coordinates`): 99 | 100 | Returns: 101 | numpy ndarray 102 | """ 103 | raise NotImplementedError() 104 | 105 | def __call__(self, coors): 106 | ndarray = self._get_ndarray(coors) 107 | if (self.mode_name is None): 108 | return ndarray 109 | else: 110 | return {self.mode_name: ndarray} 111 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordstovals/fasta.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | import numpy as np 3 | from pyfaidx import Fasta 4 | from .core import AbstractSingleNdarrayCoordsToVals 5 | 6 | 7 | ltrdict = { 8 | 'a':[1,0,0,0],'c':[0,1,0,0],'g':[0,0,1,0],'t':[0,0,0,1], 9 | 'n':[0,0,0,0],'A':[1,0,0,0],'C':[0,1,0,0],'G':[0,0,1,0], 10 | 'T':[0,0,0,1],'N':[0,0,0,0]} 11 | 12 | 13 | def onehot_encoder(seq): 14 | return np.array([ltrdict.get(x,[0,0,0,0]) for x in seq]) 15 | 16 | 17 | class PyfaidxCoordsToVals(AbstractSingleNdarrayCoordsToVals): 18 | 19 | def __init__(self, genome_fasta_path, center_size_to_use=None, **kwargs): 20 | """ 21 | Args: 22 | genome_fasta_path (:obj:`str`): path to the genome .fa file 23 | **kwargs: arguments for :obj:`AbstractSingleNdarrayCoordsToVals` 24 | """ 25 | super(PyfaidxCoordsToVals, self).__init__(**kwargs) 26 | self.center_size_to_use = center_size_to_use 27 | self.genome_fasta = genome_fasta_path 28 | 29 | def _get_ndarray(self, coors): 30 | """ 31 | Args: 32 | coors (:obj:`list` of :obj:`Coordinates`): if 33 | center_size_to_use is not specified, all the 34 | coordinates must be of the same length 35 | 36 | Returns: 37 | numpy ndarray of dims (nexamples x width x 4) 38 | """ 39 | genome_object = Fasta(self.genome_fasta) 40 | seqs = [] #(coor, sequence) pairs; coords whose chrom is absent from the fasta are skipped 41 | for coor in coors: 42 | if (self.center_size_to_use is not None): 43 | the_center = int((coor.start + coor.end)*0.5) 44 | if (coor.chrom in genome_object): 45 | seqs.append((coor, genome_object[coor.chrom][ 46 | the_center-int(0.5*self.center_size_to_use): 47 | the_center+(self.center_size_to_use 48 | -int(0.5*self.center_size_to_use))])) 49 | else: 50 | print(coor.chrom+" not in "+self.genome_fasta) 51 | else: 52 | if (coor.chrom in genome_object): 53 | seqs.append((coor, genome_object[coor.chrom][coor.start:coor.end])) 54 | else: 55 | print(coor.chrom+" not in "+self.genome_fasta) 56 | genome_object.close() 57 | 58 | onehot_seqs = [] 59 | for coor,seq in seqs: #pairing each sequence with its coord keeps the strand info aligned even when coords were skipped 60 | onehot = onehot_encoder(seq=seq.seq) 61 | if (coor.isplusstrand==False): 62 | onehot = onehot[::-1, ::-1] 63 | onehot_seqs.append(onehot) 64 | lengths = set([len(x) for x in onehot_seqs]) 65 | if (len(lengths) > 0): 66 | assert len(lengths)==1, ("All the sequences must be of the same" 67 | +" length, but lengths are "+str(lengths)) 68 | return np.array(onehot_seqs) 69 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordstovals/lookup.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | import numpy as np 3 | from .core import AbstractSingleNdarrayCoordsToVals 4 | from ..core import Coordinates 5 | import gzip 6 | 7 | 8 | class 
SimpleLookup(AbstractSingleNdarrayCoordsToVals): 9 | 10 | def __init__(self, lookup_file, 11 | transformation=None, 12 | default_returnval=0.0, **kwargs): 13 | super(SimpleLookup, self).__init__(**kwargs) 14 | self.lookup_file = lookup_file 15 | self.transformation = transformation 16 | self.default_returnval = default_returnval 17 | self.lookup = {} 18 | self.num_labels = None 19 | for line in (gzip.open(self.lookup_file) if ".gz" 20 | in self.lookup_file else open(self.lookup_file, 'rb')): #binary mode so .decode() works for both gzipped and plain files 21 | (chrom, start_str, end_str, *labels) =\ 22 | line.decode("utf-8").rstrip().split("\t") 23 | coord = Coordinates(chrom=chrom, 24 | start=int(start_str), 25 | end=int(end_str)) 26 | labels = [(self.transformation(float(x)) 27 | if self.transformation is not None else float(x)) 28 | for x in labels] 29 | self.lookup[(coord.chrom, coord.start, coord.end)] = labels 30 | if (self.num_labels is None): 31 | self.num_labels = len(labels) 32 | else: 33 | assert len(labels)==self.num_labels,( 34 | "Unequal label lengths; "+str((len(labels), self.num_labels))) 35 | 36 | def _get_ndarray(self, coors): 37 | to_return = [] 38 | for coor in coors: 39 | if (coor.chrom, coor.start, coor.end) not in self.lookup: 40 | to_return.append(np.ones(self.num_labels) 41 | *self.default_returnval) 42 | else: 43 | to_return.append( 44 | self.lookup[(coor.chrom, coor.start, coor.end)]) 45 | return np.array(to_return) 46 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordstovals/tiledb.py: -------------------------------------------------------------------------------- 1 | import tiledb 2 | import numpy as np 3 | from .core import CoordsToVals 4 | 5 | class BasicTiledbProfileCoordsToVals(CoordsToVals): 6 | def __init__(self, tiledb_paths, pos_label_source_attribute, neg_label_source_attribute=None, center_size_to_use=None, **kwargs): 7 | ''' 8 | tiledb_paths can be a single string or a list of strings or a dictionary mapping from mode name to string.
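pos_label_source_attribute (and, for stranded data, neg_label_source_attribute) name the tileDB attributes to read from; each chromosome is assumed to live in its own DenseArray named <tiledb_path>.<chrom>.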
9 | ''' 10 | self.tiledb_paths=tiledb_paths 11 | #identify the data type of tiledb_paths 12 | self.type_tiledb_paths=type(self.tiledb_paths) 13 | #identify the corresponding function to use for querying tiledb 14 | self.call_function=self.get_call_function() 15 | #positive and negative strand values may correspond to different attributes of tiledb database 16 | self.pos_label_source_attribute=pos_label_source_attribute 17 | self.neg_label_source_attribute=neg_label_source_attribute 18 | 19 | def get_call_function(self): 20 | ''' 21 | determines function to use for querying coord values based 22 | on the data type of tiledb_paths attribute 23 | ''' 24 | if self.type_tiledb_paths == str: 25 | return self.__call__string 26 | elif self.type_tiledb_paths == list: 27 | return self.__call__list 28 | elif self.type_tiledb_paths == dict: 29 | return self.__call__dict 30 | else: 31 | raise Exception("Unsupported data type for BasicTiledbProfileCoordsToVals:"+str(self.type_tiledb_paths)) 32 | 33 | def __call__dict(self,coords): 34 | ''' 35 | self.tiledb_paths is a dictionary mapping from mode name to string 36 | ''' 37 | vals={} 38 | for mode_name in self.tiledb_paths: 39 | cur_tiledb_path=self.tiledb_paths[mode_name] 40 | vals[mode_name]=self.query_tiledb(cur_tiledb_path,coords) 41 | return vals 42 | 43 | def __call__list(self,coords): 44 | ''' 45 | self.tiledb_paths is a list of strings 46 | ''' 47 | vals=[self.query_tiledb(cur_tiledb_path,coords) for cur_tiledb_path in self.tiledb_paths] 48 | return vals 49 | 50 | def __call__string(self,coords): 51 | ''' 52 | self.tiledb_paths is a string 53 | ''' 54 | vals=self.query_tiledb(self.tiledb_paths,coords) 55 | return vals 56 | 57 | def __call__(self,coords): 58 | ''' 59 | coords is a list of named tuples : .chrom, .start, .end, .isplusstrand 60 | returns a numpy array of values associated with coordinates 61 | ''' 62 | assert len(coords)>0 63 | self.ctx = tiledb.Ctx() 64 | return self.call_function(coords) 65 | 66 | def query_tiledb(self,cur_tiledb_path,coords): 67 | ''' 68 | queries tiledb database for a specific batch of coordinates for a single dataset/task.
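assumes all coordinates in the batch share the same width; returns an array of shape (len(coords), coords[0].end-coords[0].start).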
69 | ''' 70 | labels=np.zeros((len(coords),coords[0].end-coords[0].start)) 71 | for i in range(len(coords)): 72 | coord=coords[i] 73 | #open the tiledb for access in a pre-defined context 74 | with tiledb.DenseArray('.'.join([cur_tiledb_path,coord.chrom]), mode='r',ctx=self.ctx) as cur_array: 75 | if coord.isplusstrand: 76 | #query positive strand (or non-stranded entity) 77 | cur_vals=cur_array[coord.start:coord.end][self.pos_label_source_attribute] 78 | else: 79 | #query negative strand , make sure to reverse the values 80 | cur_vals=cur_array[coord.start:coord.end][self.neg_label_source_attribute][::-1] 81 | labels[i]=cur_vals 82 | return labels 83 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/core.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | from collections import namedtuple 3 | import keras 4 | 5 | 6 | Coordinates = namedtuple("Coordinates", 7 | ["chrom", "start", "end", "isplusstrand"]) 8 | Coordinates.__new__.__defaults__ = (True,) 9 | 10 | 11 | def apply_mask(tomask, mask): 12 | if isinstance(tomask, dict): 13 | return dict([(key, val[mask]) for key,val in tomask.items()]) 14 | elif isinstance(tomask, list): 15 | return [x[mask] for x in tomask] 16 | else: 17 | return tomask[mask] 18 | 19 | 20 | class KerasBatchGenerator(keras.utils.Sequence): 21 | 22 | """ 23 | Args: 24 | coordsbatch_producer (KerasSequenceApiCoordsBatchProducer) 25 | inputs_coordstovals (CoordsToVals) 26 | targets_coordstovals (CoordsToVals) 27 | sampleweights_coordstovals (CoordsToVals) 28 | coordsbatch_transformer (AbstractCoordBatchTransformer) 29 | qc_func (callable): function that can be used to filter 30 | out poor-quality sequences. 31 | sampleweights_coordstovals: either this argument or 32 | sampleweights_from_inputstargets could be used to 33 | specify sample weights. sampleweights_coordstovals 34 | takes a batch of coords as inputs. 35 | sampleweights_from_inputstargets: either this argument or 36 | sampleweights_coordstovals could be used to 37 | specify sample weights. sampleweights_from_inputstargets 38 | takes the inputs and targets values to generate the weights.
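qc_func is expected to return a boolean mask with one entry per example; the mask is applied to the inputs and, when present, the targets.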
39 | """ 40 | def __init__(self, coordsbatch_producer, 41 | inputs_coordstovals, 42 | targets_coordstovals, 43 | coordsbatch_transformer=None, 44 | qc_func=None, 45 | sampleweights_coordstovals=None, 46 | sampleweights_from_inputstargets=None): 47 | self.coordsbatch_producer = coordsbatch_producer 48 | self.inputs_coordstovals = inputs_coordstovals 49 | self.targets_coordstovals = targets_coordstovals 50 | self.coordsbatch_transformer = coordsbatch_transformer 51 | self.sampleweights_coordstovals = sampleweights_coordstovals 52 | self.sampleweights_from_inputstargets =\ 53 | sampleweights_from_inputstargets 54 | if sampleweights_coordstovals is not None: 55 | assert sampleweights_from_inputstargets is None 56 | if sampleweights_from_inputstargets is not None: 57 | assert sampleweights_coordstovals is None 58 | self.qc_func = qc_func 59 | 60 | def __getitem__(self, index): 61 | coords_batch = self.coordsbatch_producer[index] 62 | if (self.coordsbatch_transformer is not None): 63 | coords_batch = self.coordsbatch_transformer(coords_batch) 64 | inputs = self.inputs_coordstovals(coords_batch) 65 | if (self.targets_coordstovals is not None): 66 | targets = self.targets_coordstovals(coords_batch) 67 | else: 68 | targets=None 69 | if (self.qc_func is not None): 70 | qc_mask = self.qc_func(inputs=inputs, targets=targets) 71 | inputs = apply_mask(tomask=inputs, mask=qc_mask) 72 | if (targets is not None): 73 | targets = apply_mask(tomask=targets, mask=qc_mask) 74 | else: 75 | qc_mask = None 76 | if (self.sampleweights_coordstovals is not None): 77 | sample_weights = self.sampleweights_coordstovals(coords_batch) 78 | return (inputs, targets, sample_weights) 79 | elif (self.sampleweights_from_inputstargets is not None): 80 | sample_weights = self.sampleweights_from_inputstargets( 81 | inputs=inputs, targets=targets) 82 | return (inputs, targets, sample_weights) 83 | else: 84 | if (self.targets_coordstovals is not None): 85 | return (inputs, targets) 86 | else: 87 | return inputs 88 | 89 | def __len__(self): 90 | return len(self.coordsbatch_producer) 91 | 92 | def on_epoch_end(self): 93 | self.coordsbatch_producer.on_epoch_end() 94 | -------------------------------------------------------------------------------- /seqdataloader/bounded_process_pool_executor.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import concurrent.futures 3 | 4 | name = 'bounded_pool_executor' 5 | class _BoundedPoolExecutor: 6 | 7 | semaphore = None 8 | 9 | def acquire(self): 10 | self.semaphore.acquire() 11 | 12 | def release(self, fn): #invoked as a done-callback; fn is the completed future 13 | self.semaphore.release() 14 | 15 | def submit(self, fn, *args, **kwargs): 16 | self.acquire() 17 | future = super().submit(fn, *args, **kwargs) 18 | future.add_done_callback(self.release) 19 | 20 | return future 21 | 22 | 23 | class BoundedProcessPoolExecutor(_BoundedPoolExecutor, concurrent.futures.ProcessPoolExecutor): 24 | def __init__(self, max_workers=None,mp_context=None, initializer=None, initargs=()): 25 | super().__init__(max_workers,mp_context,initializer,initargs) 26 | self.semaphore = multiprocessing.BoundedSemaphore(max_workers) 27 | 28 | -------------------------------------------------------------------------------- /seqdataloader/dbingest/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Example command: 3 | 4 | To ingest a microglia dataset with an unstranded bigwig, IDR peaks, overlap peaks, 5 | a blacklist of genomic regions to avoid, and negative 
(non-peak) regions that are gc-matched to the idr peak set,
6 | we would run dbingest as follows:
7 | 
8 | contents of metadata.tsv:
9 | 
10 | ```
11 | dataset	idr_peak	overlap_peak	ambig_peak	negatives_peak	count_bigwig_unstranded_5p
12 | microglia	microglia.idr.optimal.narrowPeak	microglia.overlap.optimal.narrowPeak	blacklist/GRch38/GRch38_unified_blacklist.bed	microglia.gc.matched.negatives.bed	../data/microglia.unstranded.bw
13 | ```
14 | 
15 | contents of attributes.txt:
16 | 
17 | ```
18 | idr_peak	bed_summit_from_last_col
19 | overlap_peak	bed_summit_from_last_col
20 | ambig_peak	bed_no_summit
21 | negatives_peak	bed_no_summit
22 | count_bigwig_unstranded_5p	bigwig
23 | ```
24 | 
25 | The attributes file indicates how each column in the metadata.tsv file should be parsed. Supported values are:
26 | 
27 | * bed_summit_from_last_col -- this assumes the input file is in narrowPeak (or similar) format, where the summit offset from the start coordinate is in the last column of the file. The file is stored as an array of 0 (no peak), 1 (peak), and 2 (summit).
28 | 
29 | * bed_no_summit -- this assumes the input file is a bed file without summit information; peak intervals are centered on (start+end)/2, and no summits are calculated (i.e. the file is stored as an array of 0 (no peak) and 1 (peak), with no value of 2 to indicate a summit).
30 | 
31 | * bigwig -- treat the input file as a bigwig
32 | 
33 | The command to run to ingest the metadata.tsv file into tiledb is:
34 | 
35 | ```
36 | db_ingest --tiledb_metadata metadata.tsv \
37 |           --array_name microglia_db \
38 |           --overwrite \
39 |           --chrom_sizes hg38.chrom.sizes \
40 |           --attribute_config_file attributes.txt \
41 |           --coord_tile_size 10000 \
42 |           --task_tile_size 1 \
43 |           --write_chunk 30000000 \
44 |           --threads 40 \
45 |           --max_queue_size 50 \
46 |           --max_mem_g 200
47 | ```
48 | 
49 | 
--------------------------------------------------------------------------------
/seqdataloader/dbingest/__init__.py:
--------------------------------------------------------------------------------
1 | ## helper functions to ingest bigwig and narrowPeak data files into a tileDB instance.
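## Example (sketch): ingest() below also accepts a plain python dict, which is
## converted to an argparse.Namespace via args_object_from_args_dict, e.g.
##     from seqdataloader.dbingest import ingest
##     ingest({"tiledb_metadata":"metadata.tsv",          #illustrative file names
##             "array_name":"microglia_db",
##             "chrom_sizes":"hg38.chrom.sizes",
##             "attribute_config_file":"attributes.txt"})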
2 | ## tileDB instances are indexed by coordinate
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 | import math
7 | import psutil
8 | from multiprocessing import Pool, Process, Queue
9 | import os
10 | import signal
11 | import tiledb
12 | import pickle
13 | import argparse
14 | import pandas as pd
15 | import numpy as np
16 | from collections import OrderedDict
17 | from ..attrib_config import *
18 | from ..queue_config import *
19 | from ..utils import *
20 | from ..tdb_config import *
21 | import gc
22 | import time
23 | import sys
24 | 
25 | def args_object_from_args_dict(args_dict):
26 |     #create an argparse.Namespace from the dictionary of inputs
27 |     args_object=argparse.Namespace()
28 |     #set the defaults
29 |     vars(args_object)['overwrite']=False
30 |     vars(args_object)['coord_tile_size']=10000
31 |     vars(args_object)['task_tile_size']=1
32 |     vars(args_object)['attribute_config']=None
33 |     vars(args_object)['attribute_config_file']=None
34 |     vars(args_object)['write_chunk']=30000000
35 |     vars(args_object)['threads']=1
36 |     vars(args_object)['max_queue_size']=30
37 |     vars(args_object)['max_mem_g']=100
38 |     for key in args_dict:
39 |         vars(args_object)[key]=args_dict[key]
40 |     #user-supplied values override the defaults set above
41 |     args=args_object
42 |     return args
43 | 
44 | def parse_args():
45 |     parser=argparse.ArgumentParser(description="ingest data into tileDB")
46 |     parser.add_argument("--tiledb_metadata",help="each row is a dataset, each column corresponds to an attribute")
47 |     parser.add_argument("--array_name")
48 |     parser.add_argument("--overwrite",default=False,action="store_true")
49 |     parser.add_argument("--chrom_sizes",help="2 column tab-separated file. Column 1 = chromosome name; Column 2 = chromosome size")
50 |     parser.add_argument("--coord_tile_size",type=int,default=10000,help="coordinate axis tile size")
51 |     parser.add_argument("--task_tile_size",type=int,default=1,help="task axis tile size")
52 |     parser.add_argument("--attribute_config",default=None,help="the following are supported: encode_pipeline, encode_pipeline_with_controls, generic_bigwig")
53 |     parser.add_argument("--attribute_config_file",default=None,help="file with 2 columns; first column indicates attribute name; 2nd column indicates attribute type, which is one of bigwig, bed_no_summit, bed_summit_from_peak_center, bed_summit_from_last_col")
54 |     parser.add_argument("--write_chunk",type=int,default=30000000,help="number of bases to write to disk in one tileDB DenseArray write operation")
55 |     parser.add_argument("--threads",type=int,default=1,help="number of chunks to process in parallel")
56 |     parser.add_argument("--max_queue_size",type=int,default=30)
57 |     parser.add_argument("--max_mem_g",type=int,default=100,help="maximum memory usage in Gigabytes")
58 |     return parser.parse_args()
59 | 
60 | 
61 | def init_worker():
62 |     signal.signal(signal.SIGINT, signal.SIG_IGN)
63 | 
64 | def kill_child_processes(parent_pid, sig=signal.SIGTERM):
65 |     try:
66 |         parent = psutil.Process(parent_pid)
67 |     except psutil.NoSuchProcess:
68 |         return
69 |     children = parent.children(recursive=True)
70 |     for process in children:
71 |         process.send_signal(sig)
72 | 
73 | 
74 | def create_new_array(tdb_Context,
75 |                      size,
76 |                      array_out_name,
77 |                      coord_tile_size,
78 |                      task_tile_size,
79 |                      attribute_config,
80 |                      attribute_config_file,
81 |                      compressor='gzip',
82 |                      compression_level=-1,
83 |                      var=False):
84 |     '''
85 |     Creates an empty tileDB array
86 |     size=
tuple(num_indices,num_tasks)
87 |     '''
88 |     coord_tile_size=min(size[0],coord_tile_size)
89 |     task_tile_size=max([1,min(size[1],task_tile_size)])
90 |     tiledb_dim_coords = tiledb.Dim(
91 |         name='genome_coordinate',
92 |         domain=(0, size[0]),
93 |         tile=coord_tile_size,
94 |         dtype='uint32')
95 |     tiledb_dim_tasks=tiledb.Dim(
96 |         name='task',
97 |         domain=(0,size[1]),
98 |         tile=task_tile_size,
99 |         dtype='uint32')
100 |     tiledb_dom = tiledb.Domain(tiledb_dim_coords,tiledb_dim_tasks,ctx=tdb_Context)
101 | 
102 |     #generate the attribute information
103 |     attribute_info=get_attribute_info(attribute_config,attribute_config_file)
104 |     attribs=[]
105 |     for key in attribute_info:
106 |         attribs.append(tiledb.Attr(
107 |             name=key,
108 |             var=var,
109 |             filters=tiledb.FilterList([tiledb.GzipFilter()]),
110 |             dtype=attribute_info[key]['dtype']))
111 | 
112 |     tiledb_schema = tiledb.ArraySchema(
113 |         domain=tiledb_dom,
114 |         attrs=tuple(attribs),
115 |         cell_order='row-major',
116 |         tile_order='row-major')
117 | 
118 |     tiledb.DenseArray.create(array_out_name, tiledb_schema)
119 |     print("created empty array on disk")
120 |     return
121 | 
122 | 
123 | 
124 | def extract_metadata_field(row,field):
125 |     dataset=row['dataset']
126 |     try:
127 |         return row[field]
128 |     except:
129 |         print("tiledb_metadata has no column "+field+" for dataset:"+str(dataset))
130 |         return None
131 | 
132 | def open_data_for_parsing(row,attribute_info):
133 |     try:
134 |         data_dict={}
135 |         cols=list(row.index)
136 |         if 'dataset' in cols:
137 |             cols.remove('dataset')
138 |         for col in cols:
139 |             cur_fname=extract_metadata_field(row,col)
140 |             if isinstance(cur_fname,str):
141 |                 assert os.path.exists(cur_fname), "The path:"+str(cur_fname)+" does not exist. If you meant to skip this column, leave it empty in the metadata sheet."
142 |             elif cur_fname is None:
143 |                 continue #check for None before attempting the float cast below
144 |             elif math.isnan(float(cur_fname)):
145 |                 continue
146 |             data_dict[col]=attribute_info[col]['opener'](cur_fname,parallel=True)
147 |         return data_dict
148 |     except Exception as e:
149 |         print(repr(e))
150 |         kill_child_processes(os.getpid())
151 |         raise
152 | 
153 | def ingest(args):
154 |     if type(args)==type({}):
155 |         args=args_object_from_args_dict(args)
156 |     if args.write_chunk > max_write_chunk:
157 |         print("WARNING: You have specified a write_chunk size of "+str(args.write_chunk)+" but the maximum supported with python serialization is "+str(max_write_chunk)+". 
It will be reset to "+str(max_write_chunk))
158 |         args.write_chunk=max_write_chunk
159 | 
160 |     #create a queue to write the array
161 |     global write_queue
162 |     write_queue=Queue(maxsize=args.max_queue_size)
163 | 
164 |     #config
165 |     tdb_Config=tiledb.Config(tdb_config_params)
166 |     tdb_write_Context=tiledb.Ctx(config=tdb_Config)
167 |     tdb_read_Context=tiledb.Ctx(config=tdb_Config)
168 | 
169 |     overwrite=args.overwrite
170 |     coord_tile_size=args.coord_tile_size
171 |     task_tile_size=args.task_tile_size
172 |     attribute_config=args.attribute_config
173 |     attribute_config_file=args.attribute_config_file
174 |     updating=False
175 | 
176 |     attribute_info=get_attribute_info(args.attribute_config,args.attribute_config_file)
177 |     tiledb_metadata=pd.read_csv(args.tiledb_metadata,header=0,sep='\t')
178 |     num_tasks=tiledb_metadata.shape[0]
179 |     print("num_tasks:"+str(num_tasks))
180 | 
181 |     print("loaded tiledb metadata")
182 |     chrom_sizes=pd.read_csv(args.chrom_sizes,header=None,sep='\t')
183 |     print("loaded chrom sizes")
184 |     chrom_indices,num_indices=transform_chrom_size_to_indices(chrom_sizes)
185 |     print("num_indices:"+str(num_indices))
186 |     array_out_name=args.array_name
187 |     if tiledb.object_type(array_out_name) == "array":
188 |         if overwrite==False:
189 |             raise Exception("array:"+str(array_out_name)+" already exists; use the --overwrite flag to overwrite it. Exiting")
190 |         else:
191 |             print("warning: the array: "+str(array_out_name)+" already exists. You provided the --overwrite flag, so it will be updated/overwritten")
192 |             updating=True
193 |     else:
194 |         #create the array:
195 |         create_new_array(tdb_Context=tdb_write_Context,
196 |                          size=(num_indices,num_tasks-1),
197 |                          attribute_config=attribute_config,
198 |                          attribute_config_file=attribute_config_file,
199 |                          array_out_name=array_out_name,
200 |                          coord_tile_size=coord_tile_size,
201 |                          task_tile_size=task_tile_size,
202 |                          var=False)
203 |         print("created new array:"+str(array_out_name))
204 |     #create metadata array
205 |     metadata_dict={}
206 |     metadata_dict['tasks']=[i for i in tiledb_metadata['dataset']]
207 |     metadata_dict['chroms']=[i for i in chrom_indices.keys()]
208 |     metadata_dict['sizes']=[i[2] for i in list(chrom_indices.values())]
209 |     metadata_dict['offsets']=[i[0] for i in list(chrom_indices.values())]
210 |     num_tasks=tiledb_metadata['dataset'].shape[0]
211 | 
212 |     num_chroms=len(chrom_indices.keys())
213 |     with tiledb.DenseArray(array_out_name,ctx=tdb_write_Context,mode='w') as cur_array:
214 |         cur_array.meta['num_tasks']=num_tasks
215 |         cur_array.meta['num_chroms']=num_chroms
216 |         for task_index in range(num_tasks):
217 |             cur_array.meta['_'.join(['task',str(task_index)])]=metadata_dict['tasks'][task_index]
218 |         for chrom_index in range(num_chroms):
219 |             cur_array.meta['_'.join(['chrom',str(chrom_index)])]=metadata_dict['chroms'][chrom_index]
220 |             cur_array.meta['_'.join(['size',str(chrom_index)])]=metadata_dict['sizes'][chrom_index]
221 |             cur_array.meta['_'.join(['offset',str(chrom_index)])]=metadata_dict['offsets'][chrom_index]
222 |     print("created tiledb metadata")
223 |     pool=Pool(processes=args.threads,initializer=init_worker)
224 |     print("made pool")
225 |     pool_inputs=[]
226 |     for task_index,task_row in tiledb_metadata.iterrows():
227 |         dataset=task_row['dataset']
228 |         #read in filenames for bigwigs
229 |         data_dict=open_data_for_parsing(task_row,attribute_info)
230 |         for start_chunk_index in range(0,num_indices,args.write_chunk):
231 |             end_chunk_index=min([num_indices,start_chunk_index+args.write_chunk])
232 | 
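#worked example of the index transform below (illustrative sizes): if chr1
            #spans global indices 0-10 and chr2 spans 10-18, a chunk covering global
            #indices 8-12 yields two coordinate sets of the form
            #(chrom,start_pos,end_pos,start_index,end_index), roughly
            #('chr1',8,10,8,10) and ('chr2',0,2,10,12)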
#convert global indices to chrom+pos indices
233 |             chunk_chrom_coords=transform_indices_to_chrom_coords(start_chunk_index,end_chunk_index,chrom_indices)
234 |             if chunk_chrom_coords is None:
235 |                 raise Exception("failed to transform indices:"+str(start_chunk_index)+"-"+str(end_chunk_index)+" to chrom coords;"+str(chrom_indices))
236 |             for coord_set in chunk_chrom_coords:
237 |                 pool_inputs.append((task_index,data_dict,attribute_info,coord_set,args))
238 |     pool_feed_chunk_start=0
239 |     pool_feed_chunk_max=len(pool_inputs)
240 |     chunks_to_process=len(pool_inputs)
241 |     array_writer=Process(target=write_array,args=([args,updating,chunks_to_process]))
242 |     try:
243 |         array_writer.start()
244 |     except Exception as e:
245 |         raise e
246 | 
247 |     try:
248 |         while pool_feed_chunk_start < pool_feed_chunk_max:
249 |             pool_feed_chunk_end=min([pool_feed_chunk_start+queue_feed_chunk_size,pool_feed_chunk_max])
250 |             #only do mapping if queue size is not exceeded & total memory consumption is not exceeded
251 |             write_queue_size=write_queue.qsize()
252 |             mem_used=psutil.virtual_memory().used / (10**9)
253 |             print("mapping to pool, queue size:"+str(write_queue_size))
254 |             print("mapping to pool, mem used:"+str(mem_used))
255 |             while (write_queue.qsize() >= args.max_queue_size) or (psutil.virtual_memory().used / (10**9) >= args.max_mem_g): #re-check the limits on every iteration so the wait can end
256 |                 time.sleep(10)
257 |             print("sending to pool:"+str(pool_feed_chunk_start)+"-"+str(pool_feed_chunk_end)+"/"+str(chunks_to_process))
258 |             pool.map(process_chunk,pool_inputs[pool_feed_chunk_start:pool_feed_chunk_end])
259 |             pool_feed_chunk_start+=queue_feed_chunk_size
260 |         time.sleep(60)
261 |         pool.close()
262 |     except KeyboardInterrupt:
263 |         kill_child_processes(os.getpid())
264 |         pool.terminate()
265 |         raise
266 |     except Exception as e:
267 |         print(e)
268 |         kill_child_processes(os.getpid())
269 |         raise
270 | 
271 |     #wait until we're done writing to the tiledb array
272 |     array_writer.join()
273 |     print("array_writer.join() is complete")
274 |     print("shutting down pool")
275 |     pool.join()
276 |     print('done!')
277 | 
278 | def process_chunk(inputs):
279 |     try:
280 |         task_index=inputs[0]
281 |         data_dict=inputs[1]
282 |         attribute_info=inputs[2]
283 |         coord_set=inputs[3]
284 |         args=inputs[4]
285 | 
286 |         attribute_config=args.attribute_config
287 |         dict_to_write=OrderedDict()
288 |         chrom=coord_set[0]
289 |         start_pos=coord_set[1]
290 |         end_pos=coord_set[2]
291 |         start_index=coord_set[3]
292 |         end_index=coord_set[4]
293 |         for attribute in data_dict:
294 |             cur_parser=attribute_info[attribute]['parser']
295 |             cur_vals=cur_parser([data_dict[attribute],chrom,start_pos,end_pos,attribute_info[attribute]])
296 |             dict_to_write[attribute]=cur_vals[-1] #the last entry in the tuple is the actual numpy array of values; the first entries store start and end blocks
297 |         payload=pickle.dumps([task_index,start_index,end_index,dict_to_write],pickle.HIGHEST_PROTOCOL)
298 |         write_queue.put(payload)
299 |         gc.collect()
300 |     except:
301 |         kill_child_processes(os.getpid())
302 |         raise
303 | 
304 | def write_array(args, updating, chunks_to_process):
305 |     try:
306 |         #config
307 |         tdb_Config=tiledb.Config(tdb_config_params)
308 |         tdb_write_Context=tiledb.Ctx(config=tdb_Config)
309 | 
310 |         if updating is True:
311 |             tdb_read_Context=tiledb.Ctx(config=tdb_Config)
312 |             cur_array_toread=tiledb.DenseArray(args.array_name,ctx=tdb_read_Context,mode='r')
313 |         cur_array_towrite=tiledb.DenseArray(args.array_name,ctx=tdb_write_Context,mode='w')
314 |         chunks_processed=0
315 |         while chunks_processed < chunks_to_process:
316 |             while 
write_queue.empty():
317 |                 time.sleep(10)
318 |             processed_chunk=write_queue.get()
319 |             processed_chunk_unpickled=pickle.loads(processed_chunk)
320 |             task_index=processed_chunk_unpickled[0]
321 |             start_index=processed_chunk_unpickled[1]
322 |             end_index=processed_chunk_unpickled[2]
323 |             dict_to_write=processed_chunk_unpickled[3]
324 |             if updating is True:
325 |                 #we are only updating some attributes in the array
326 |                 cur_vals=cur_array_toread[start_index:end_index,task_index]
327 |                 #print("got cur vals for task "+str(task_index)+" for "+str(start_index)+":"+str(end_index))
328 |                 for key in dict_to_write:
329 |                     cur_vals[key]=dict_to_write[key]
330 |                 dict_to_write=cur_vals
331 |                 print("updated data dict for writing:"+args.array_name)
332 |             else:
333 |                 #we are writing for the first time, make sure all attributes are provided, if some are not, use a nan array
334 |                 required_attrib=list(get_attribute_info(args.attribute_config,args.attribute_config_file).keys())
335 |                 #print(str(required_attrib))
336 |                 for attrib in required_attrib:
337 |                     if attrib not in dict_to_write:
338 |                         print("augmenting")
339 |                         dict_to_write[attrib]=np.full(end_index-start_index,np.nan)
340 |             #write in chunks
341 |             cur_array_towrite[start_index:end_index,task_index]=dict_to_write
342 |             print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
343 |             gc.collect()
344 |             chunks_processed+=1
345 |             print("wrote to disk "+str(task_index)+" for "+str(start_index)+":"+str(end_index)+";"+str(chunks_processed)+"/"+str(chunks_to_process))
346 |         assert chunks_processed >= chunks_to_process
347 |         print("closing arrays")
348 |         if updating is True:
349 |             cur_array_toread.close()
350 |         cur_array_towrite.close()
351 |         return
352 | 
353 |     except KeyboardInterrupt:
354 |         kill_child_processes(os.getpid())
355 |         #try to delete all tmp files
356 |         raise
357 |     except Exception as e:
358 |         print(e)
359 |         kill_child_processes(os.getpid())
360 |         raise #re-raise the original exception (Exception objects have no .message attribute in python 3)
361 | 
362 | 
363 | def main():
364 |     args=parse_args()
365 |     ingest(args)
366 | 
367 | if __name__=="__main__":
368 |     main()
369 | 
370 | 
371 | 
--------------------------------------------------------------------------------
/seqdataloader/dbingest_single_threaded/__init__.py:
--------------------------------------------------------------------------------
1 | ## helper functions to ingest bigwig and narrowPeak data files into a tileDB instance.
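## Example (sketch): like dbingest, ingest_single_threaded() below accepts a
## plain dict of arguments; note that the array name is passed as "tiledb_group"
## in this module, e.g.
##     from seqdataloader.dbingest_single_threaded import ingest_single_threaded
##     ingest_single_threaded({"tiledb_metadata":"metadata.tsv",   #illustrative names
##                             "tiledb_group":"microglia_db",
##                             "chrom_sizes":"hg38.chrom.sizes",
##                             "write_chunk":2500000})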
2 | ## tileDB instances are indexed by coordinate
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 | import psutil
7 | #import multiprocessing as mp
8 | #mpx = mp.get_context('spawn')
9 | import tiledb
10 | import pdb
11 | import argparse
12 | import pandas as pd
13 | import numpy as np
14 | from collections import OrderedDict
15 | from ..attrib_config import *
16 | from ..utils import *
17 | from ..tdb_config import *
18 | import gc
19 | 
20 | 
21 | def args_object_from_args_dict(args_dict):
22 |     #create an argparse.Namespace from the dictionary of inputs
23 |     args_object=argparse.Namespace()
24 |     #set the defaults
25 |     vars(args_object)['overwrite']=False
26 |     vars(args_object)['coord_tile_size']=10000
27 |     vars(args_object)['task_tile_size']=1
28 |     vars(args_object)['attribute_config']='encode_pipeline'
29 |     vars(args_object)['write_chunk']=30000000 #write_chunk is used as a range() step below, so it must be an integer; default matches the multithreaded dbingest module
30 |     for key in args_dict:
31 |         vars(args_object)[key]=args_dict[key]
32 |     #user-supplied values override the defaults set above
33 |     args=args_object
34 |     return args
35 | 
36 | def parse_args():
37 |     parser=argparse.ArgumentParser(description="ingest data into tileDB")
38 |     parser.add_argument("--tiledb_metadata",help="fields are: dataset, fc_bigwig, pval_bigwig, count_bigwig_plus_5p, count_bigwig_minus_5p, count_bigwig_unstranded_5p, idr_peak, overlap_peak, ambig_peak")
39 |     parser.add_argument("--tiledb_group")
40 |     parser.add_argument("--overwrite",default=False,action="store_true")
41 |     parser.add_argument("--chrom_sizes",help="2 column tab-separated file. Column 1 = chromosome name; Column 2 = chromosome size")
42 |     parser.add_argument("--coord_tile_size",type=int,default=10000,help="coordinate axis tile size")
43 |     parser.add_argument("--task_tile_size",type=int,default=1,help="task axis tile size")
44 |     parser.add_argument("--attribute_config",default='encode_pipeline',help="the following are supported: encode_pipeline, generic_bigwig")
45 |     parser.add_argument("--write_chunk",type=int,default=30000000,help="number of bases to write to disk in one tileDB DenseArray write operation")
46 |     return parser.parse_args()
47 | 
48 | def create_new_array(tdb_Context,
49 |                      size,
50 |                      array_out_name,
51 |                      coord_tile_size,
52 |                      task_tile_size,
53 |                      attribute_config,
54 |                      compressor='gzip',
55 |                      compression_level=-1,
56 |                      var=False):
57 |     '''
58 |     Creates an empty tileDB array
59 |     size= tuple(num_indices,num_tasks)
60 |     '''
61 |     coord_tile_size=min(size[0],coord_tile_size)
62 |     task_tile_size=min(size[1],task_tile_size)
63 |     tiledb_dim_coords = tiledb.Dim(
64 |         name='genome_coordinate',
65 |         domain=(0, size[0]),
66 |         tile=coord_tile_size,
67 |         dtype='uint32')
68 |     tiledb_dim_tasks=tiledb.Dim(
69 |         name='task',
70 |         domain=(0,size[1]),
71 |         tile=task_tile_size,
72 |         dtype='uint32')
73 |     tiledb_dom = tiledb.Domain(tiledb_dim_coords,tiledb_dim_tasks,ctx=tdb_Context)
74 | 
75 |     #generate the attribute information
76 |     attribute_info=get_attribute_info(attribute_config)
77 |     attribs=[]
78 |     for key in attribute_info:
79 |         attribs.append(tiledb.Attr(
80 |             name=key,
81 |             var=var,
82 |             filters=tiledb.FilterList([tiledb.GzipFilter()]),
83 |             dtype=attribute_info[key]['dtype']))
84 | 
85 |     tiledb_schema = tiledb.ArraySchema(
86 |         domain=tiledb_dom,
87 |         attrs=tuple(attribs),
88 |         cell_order='row-major',
89 |         tile_order='row-major')
90 | 
91 |     tiledb.DenseArray.create(array_out_name, tiledb_schema)
92 |     print("created empty array on disk")
93 |     return
94 | 
95 | 
96 | 
97 | def extract_metadata_field(row,field):
98 | 
dataset=row['dataset'] 99 | try: 100 | return row[field] 101 | except: 102 | print("tiledb_metadata has no column "+field+" for dataset:"+str(dataset)) 103 | return None 104 | 105 | def open_data_for_parsing(row,attribute_info): 106 | try: 107 | data_dict={} 108 | cols=list(row.index) 109 | if 'dataset' in cols: 110 | cols.remove('dataset') 111 | for col in cols: 112 | cur_fname=extract_metadata_field(row,col) 113 | if cur_fname is not None: 114 | data_dict[col]=attribute_info[col]['opener'](cur_fname) 115 | return data_dict 116 | except Exception as e: 117 | print(repr(e)) 118 | raise e 119 | 120 | def get_subdict(full_dict,start,end): 121 | subdict=dict() 122 | for key in full_dict: 123 | subdict[key]=full_dict[key][start:end] 124 | print(subdict.keys()) 125 | return subdict 126 | 127 | def ingest_single_threaded(args): 128 | if type(args)==type({}): 129 | args=args_object_from_args_dict(args) 130 | 131 | #config 132 | tdb_Config=tiledb.Config(tdb_config_params) 133 | tdb_write_Context=tiledb.Ctx(config=tdb_Config) 134 | tdb_read_Context=tiledb.Ctx(config=tdb_Config) 135 | 136 | overwrite=args.overwrite 137 | coord_tile_size=args.coord_tile_size 138 | task_tile_size=args.task_tile_size 139 | attribute_config=args.attribute_config 140 | updating=False 141 | 142 | attribute_info=get_attribute_info(args.attribute_config) 143 | tiledb_metadata=pd.read_csv(args.tiledb_metadata,header=0,sep='\t') 144 | num_tasks=tiledb_metadata.shape[0] 145 | 146 | print("loaded tiledb metadata") 147 | chrom_sizes=pd.read_csv(args.chrom_sizes,header=None,sep='\t') 148 | print("loaded chrom sizes") 149 | chrom_indices,num_indices=transform_chrom_size_to_indices(chrom_sizes) 150 | print("num_indices:"+str(num_indices)) 151 | array_out_name=args.tiledb_group 152 | if tiledb.object_type(array_out_name) == "array": 153 | if overwrite==False: 154 | raise Exception("array:"+str(array_out_name) + "already exists; use the --overwrite flag to overwrite it. Exiting") 155 | else: 156 | print("warning: the array: "+str(array_out_name)+" already exists. 
You provided the --overwrite flag, so it will be updated/overwritten")
157 |             updating=True
158 |     else:
159 |         #create the array:
160 |         create_new_array(tdb_Context=tdb_write_Context,
161 |                          size=(num_indices,num_tasks),
162 |                          attribute_config=attribute_config,
163 |                          array_out_name=array_out_name,
164 |                          coord_tile_size=coord_tile_size,
165 |                          task_tile_size=task_tile_size,
166 |                          var=False)
167 |         print("created new array:"+str(array_out_name))
168 |     #create metadata array
169 |     metadata_dict={}
170 |     metadata_dict['tasks']=[i for i in tiledb_metadata['dataset']]
171 |     metadata_dict['chroms']=[i for i in chrom_indices.keys()]
172 |     metadata_dict['sizes']=[i[2] for i in list(chrom_indices.values())]
173 |     metadata_dict['offsets']=[i[0] for i in list(chrom_indices.values())]
174 |     num_tasks=tiledb_metadata['dataset'].shape[0]
175 |     num_chroms=len(chrom_indices.keys())
176 |     with tiledb.DenseArray(array_out_name,ctx=tdb_write_Context,mode='w') as cur_array:
177 |         cur_array.meta['num_tasks']=num_tasks
178 |         cur_array.meta['num_chroms']=num_chroms
179 |         for task_index in range(num_tasks):
180 |             cur_array.meta['_'.join(['task',str(task_index)])]=metadata_dict['tasks'][task_index]
181 |         for chrom_index in range(num_chroms):
182 |             cur_array.meta['_'.join(['chrom',str(chrom_index)])]=metadata_dict['chroms'][chrom_index]
183 |             cur_array.meta['_'.join(['size',str(chrom_index)])]=metadata_dict['sizes'][chrom_index]
184 |             cur_array.meta['_'.join(['offset',str(chrom_index)])]=metadata_dict['offsets'][chrom_index]
185 |     print("created tiledb metadata")
186 |     if updating is True:
187 |         cur_array_toread=tiledb.DenseArray(array_out_name,ctx=tdb_read_Context,mode='r')
188 |     else:
189 |         cur_array_toread=None
190 |     cur_array_towrite=tiledb.DenseArray(array_out_name,ctx=tdb_write_Context,mode='w')
191 |     for task_index,task_row in tiledb_metadata.iterrows():
192 |         dataset=task_row['dataset']
193 |         print(dataset)
194 |         #read in filenames for bigwigs
195 |         data_dict=open_data_for_parsing(task_row,attribute_info)
196 |         for start_chunk_index in range(0,num_indices,args.write_chunk):
197 |             print(str(start_chunk_index)+'/'+str(num_indices))
198 |             end_chunk_index=min([num_indices,start_chunk_index+args.write_chunk])
199 |             print("end chunk index:"+str(end_chunk_index))
200 |             #convert global indices to chrom+pos indices
201 |             chunk_chrom_coords=transform_indices_to_chrom_coords(start_chunk_index,end_chunk_index,chrom_indices)
202 |             print("processing:"+str(chunk_chrom_coords))
203 |             for coord_set in chunk_chrom_coords:
204 |                 print("\t"+"coord_set:"+str(coord_set))
205 |                 process_chunk(task_index,data_dict,attribute_info,coord_set,updating,args,cur_array_toread,cur_array_towrite)
206 |                 print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
207 |             print("wrote chrom array for task:"+str(dataset)+" for index:"+str(start_chunk_index))
208 |     print("closing arrays")
209 |     if cur_array_toread is not None:
210 |         cur_array_toread.close()
211 |     cur_array_towrite.close()
212 |     print('done!')
213 | 
214 | def process_chunk(task_index, data_dict, attribute_info, coord_set, updating, args, cur_array_toread, cur_array_towrite):
215 |     attribute_config=args.attribute_config
216 |     dict_to_write=OrderedDict()
217 |     chrom=coord_set[0]
218 |     start_pos=coord_set[1]
219 |     end_pos=coord_set[2]
220 |     start_index=coord_set[3]
221 |     end_index=coord_set[4]
222 |     for attribute in data_dict:
223 |         cur_parser=attribute_info[attribute]['parser']
224 | 
cur_vals=cur_parser([data_dict[attribute],chrom,start_pos,end_pos,attribute_info[attribute]])
225 |         dict_to_write[attribute]=cur_vals[-1] #the last entry in the tuple is the actual numpy array of values; the first entries store start and end blocks
226 |         print("got:"+str(attribute)+" for task "+str(task_index)+" for "+str(chrom)+":"+str(start_pos)+"-"+str(end_pos))
227 | 
228 |     if updating is True:
229 |         #we are only updating some attributes in the array
230 |         cur_vals=cur_array_toread[start_index:end_index,task_index]
231 |         print("got cur vals for task "+str(task_index)+" for "+str(chrom)+":"+str(start_pos)+"-"+str(end_pos))
232 |         for key in dict_to_write:
233 |             cur_vals[key]=dict_to_write[key]
234 |         dict_to_write=cur_vals
235 |         print("updated data dict for writing:"+args.tiledb_group)
236 |     else:
237 |         #we are writing for the first time, make sure all attributes are provided, if some are not, use a nan array
238 |         required_attrib=list(get_attribute_info(attribute_config).keys())
239 |         for attrib in required_attrib:
240 |             if attrib not in dict_to_write:
241 |                 dict_to_write[attrib]=np.full(end_pos-start_pos,np.nan)
242 | 
243 |     #write in chunks
244 |     cur_array_towrite[start_index:end_index,task_index]=dict_to_write
245 |     print("wrote to disk "+str(task_index)+" for "+str(chrom)+":"+str(start_pos)+"-"+str(end_pos))
246 |     gc.collect()
247 | 
248 | def main():
249 |     args=parse_args()
250 |     ingest_single_threaded(args)
251 | 
252 | if __name__=="__main__":
253 |     main()
254 | 
255 | 
256 | 
--------------------------------------------------------------------------------
/seqdataloader/labelgen/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import division, print_function, absolute_import
2 | import argparse
3 | from pybedtools import BedTool
4 | import pyBigWig
5 | import pandas as pd
6 | import numpy as np
7 | import pdb
8 | import csv
9 | import sys
10 | from .classification_label_protocols import *
11 | from .regression_label_protocols import *
12 | import gzip
13 | import os
14 | import multiprocessing #needed for set_start_method in the __main__ block below
15 | from concurrent.futures import *
16 | #graceful shutdown
17 | import psutil
18 | import signal
19 | import gc
20 | import string
21 | import random
22 | import pickle
23 | 
24 | #Approaches to determining classification labels
25 | #Others can be added here (imported from classification_label_protocols)
26 | labeling_approaches={
27 |     "peak_summit_in_bin_classification":peak_summit_in_bin_classification,
28 |     "peak_percent_overlap_with_bin_classification":peak_percent_overlap_with_bin_classification,
29 |     "peak_summit_in_bin_regression":peak_summit_in_bin_regression,
30 |     "peak_percent_overlap_with_bin_regression":peak_percent_overlap_with_bin_regression,
31 |     "all_genome_bins_regression":all_genome_bins_regression
32 |     }
33 | 
34 | def randomString(stringLength=16):
35 |     """Generate a random string of fixed length """
36 |     letters = string.ascii_lowercase
37 |     return ''.join(random.choice(letters) for i in range(stringLength))
38 | 
39 | def init_worker():
40 |     signal.signal(signal.SIGINT, signal.SIG_IGN)
41 | 
42 | def kill_child_processes(parent_pid, sig=signal.SIGTERM):
43 |     try:
44 |         parent = psutil.Process(parent_pid)
45 |     except psutil.NoSuchProcess:
46 |         return
47 |     children = parent.children(recursive=True)
48 |     for process in children:
49 |         process.send_signal(sig)
50 | 
51 | def add_filename_prefix(fname,prefix):
52 |     splits=fname.split('/')
53 |     if len(splits)==1:
54 |         #local path
55 |         return 
prefix+'.'+fname
56 |     else:
57 |         cur_dir='/'.join(splits[0:-1])
58 |         cur_fname=splits[-1]
59 |         modified_fname=prefix+'.'+cur_fname
60 |         return '/'.join([cur_dir,modified_fname])
61 | 
62 | def parse_args():
63 |     parser=argparse.ArgumentParser(description="Generate genome-wide labeled bins for a set of narrowPeak task files")
64 |     parser.add_argument("--task_list",help="this is a tab-separated file with the name of the task in the first column, the path to the corresponding narrowPeak(.gz) file in the second column (optionally), and the path to the corresponding bigWig file in the third column (optionally, for regression)")
65 |     parser.add_argument("--task_list_sep",default='\t')
66 |     parser.add_argument("--outf",help="output filename that labeled bed file will be saved to.")
67 |     parser.add_argument("--output_type",choices=['gzip','hdf5','pkl','bz2'],default='gzip',help="format to save output, one of gzip, hdf5, pkl, bz2")
68 |     parser.add_argument("--split_output_by_chrom",action="store_true",default=False)
69 |     parser.add_argument("--split_output_by_task",action="store_true",default=False,help="creates a separate output file for each task's labels")
70 |     parser.add_argument("--chrom_sizes",help="chromsizes file for the reference genome. First column is chrom name; second column is chrom size")
71 |     parser.add_argument("--chroms_to_keep",nargs="+",default=None,help="list of chromosomes, as defined in the --chrom_sizes file, to include in label generation. All chromosomes will be used if this argument is not provided. This is most useful if generating a train/test/validate split for deep learning models")
72 |     parser.add_argument("--chroms_to_exclude",nargs="+",default=None,help="list of chromosomes, as defined in the --chrom_sizes file, to exclude in label generation. No chromosomes will be excluded if this argument is not provided. This is most useful if generating a train/test/validate split for deep learning models")
73 |     parser.add_argument("--bin_stride",type=int,default=50,help="bin_stride to shift adjacent bins by")
74 |     parser.add_argument("--left_flank",type=int,default=400,help="left flank")
75 |     parser.add_argument("--right_flank",type=int,default=400,help="right flank")
76 |     parser.add_argument("--bin_size",type=int,default=200,help="flank around bin center where peak summit falls in a positive bin")
77 | 
78 |     parser.add_argument("--task_threads",type=int,default=1,help="Number of tasks to process for a given chromosome.")
79 |     parser.add_argument("--chrom_threads",type=int,default=4,help="Number of chromosomes to process at once.")
80 |     parser.add_argument("--bigwig_stats",choices=['mean','min','max','coverage','std'],default='mean',help="Value to extract from bigwig file")
81 |     parser.add_argument("--overlap_thresh",type=float,default=0.5,help="minimum percent of bin that must overlap a peak for a positive label")
82 |     parser.add_argument("--allow_ambiguous",default=False,action="store_true")
83 |     parser.add_argument("--store_positives_only",default=False,action="store_true")
84 |     parser.add_argument("--store_values_above_thresh",default=None,type=float,help="for the regression case, determine the minimum row value to include in the output data frame (i.e., 
remove bins that are 0 for all tasks by setting this to 0") 85 | parser.add_argument("--labeling_approach",choices=["peak_summit_in_bin_classification", 86 | "peak_percent_overlap_with_bin_classification", 87 | "peak_summit_in_bin_regression", 88 | "peak_percent_overlap_with_bin_regression", 89 | "all_genome_bins_regression"]) 90 | parser.add_argument("--label_transformer",default="asinh",help="type of transformation to apply to the labels; one of None, asinh, log10, log") 91 | parser.add_argument("--label_transformer_pseudocount",type=float,default=0.001,help="pseudocount to add to values if using log10 or log label transformations") 92 | parser.add_argument("--temp_dir",default="/tmp") 93 | parser.add_argument("--save_label_source",default=False,action='store_true',help='a separate dataframe is created that stores the source file, peak region, and (if available) peak name for each genome bin, or NA') 94 | if len(sys.argv)==1: 95 | parser.print_help(sys.stderr) 96 | sys.exit(1) 97 | return parser.parse_args() 98 | 99 | def get_labels_one_task(inputs): 100 | #unravel the inputs 101 | task_name=inputs[0] 102 | task_bed=inputs[1] 103 | task_bigwig=inputs[2] 104 | task_ambig=inputs[3] 105 | chrom=inputs[4] 106 | first_coord=inputs[5] 107 | final_coord=inputs[6] 108 | args=inputs[7] 109 | #determine the appropriate labeling approach 110 | print("in get_labels_one_task") 111 | return labeling_approaches[args.labeling_approach](task_name,task_bed,task_bigwig,task_ambig,chrom,first_coord,final_coord,args) 112 | 113 | def get_chrom_labels(inputs): 114 | #print(inputs) 115 | #unravel inputs 116 | chrom=inputs[0] 117 | chrom_size=inputs[1] 118 | bed_and_bigwig_dict=inputs[2] 119 | tasks=inputs[3] 120 | args=inputs[4] 121 | #pre-allocate a pandas data frame to store bin labels for the current chromosome. 
Fill with zeros
122 |     #determine the index tuple values
123 |     try:
124 |         chroms,all_start_pos,all_end_pos,first_bin_start,final_bin_start=get_indices(chrom,chrom_size,args)
125 |     except:
126 |         return (chrom,None,None)
127 |     columns=['CHR','START','END']+list(tasks['task'])
128 |     num_entries=len(chroms.values)
129 |     chrom_df = pd.DataFrame(0,index=np.arange(num_entries),columns=columns)
130 |     chrom_df['CHR']=chroms.values
131 |     chrom_df['START']=all_start_pos.values
132 |     chrom_df['END']=all_end_pos.values
133 |     if args.save_label_source is True:
134 |         chrom_label_source_dict={}
135 |     print("pre-allocated df for chrom:"+str(chrom)+" with dimensions:"+str(chrom_df.shape))
136 | 
137 |     #create a thread pool to label bins, each task gets assigned a thread
138 |     pool_inputs=[]
139 |     for task_name in bed_and_bigwig_dict:
140 |         task_bed=bed_and_bigwig_dict[task_name]['bed']
141 |         task_bigwig=bed_and_bigwig_dict[task_name]['bigwig']
142 |         task_ambig=bed_and_bigwig_dict[task_name]['ambig']
143 |         pool_inputs.append((task_name,task_bed,task_bigwig,task_ambig,chrom,first_bin_start,final_bin_start,args))
144 |     try:
145 |         with ProcessPoolExecutor(max_workers=args.task_threads,initializer=init_worker) as pool:
146 |             bin_values=pool.map(get_labels_one_task,pool_inputs)
147 |             pool.shutdown(wait=True)
148 |     except KeyboardInterrupt:
149 |         print('detected keyboard interrupt')
150 |         #shutdown the pool
151 |         pool.shutdown(wait=False)
152 |         # Kill remaining child processes
153 |         kill_child_processes(os.getpid())
154 |         raise
155 |     except Exception as e:
156 |         print(repr(e))
157 |         #shutdown the pool
158 |         pool.shutdown(wait=False)
159 |         # Kill remaining child processes
160 |         kill_child_processes(os.getpid())
161 |         raise e
162 | 
163 |     for task_name,task_labels,label_source_dict in bin_values:
164 |         if task_labels is None:
165 |             continue
166 |         chrom_df[task_name]=task_labels
167 |         if args.save_label_source is True:
168 |             chrom_label_source_dict.update(label_source_dict)
169 | 
170 |     #convert label source dictionary to dataframe
171 |     if args.save_label_source is True:
172 |         chrom_label_source_df=pd.DataFrame.from_dict(chrom_label_source_dict,orient='index')
173 |         cols=list(chrom_label_source_df.columns)
174 |         chrom_label_source_df['CHR']=chrom_df['CHR'][chrom_label_source_df.index]
175 |         chrom_label_source_df['START']=chrom_df['START'][chrom_label_source_df.index]
176 |         chrom_label_source_df['END']=chrom_df['END'][chrom_label_source_df.index]
177 |         #reorder so that chr,start,end are at the front, sort by bin start position
178 |         ordered_cols=['CHR','START','END']+cols
179 |         chrom_label_source_df=chrom_label_source_df[ordered_cols].sort_values(by='START')
180 | 
181 |     else:
182 |         chrom_label_source_df=None
183 |     if args.split_output_by_chrom==True:
184 |         outf=add_filename_prefix(args.outf,chrom)
185 |         if args.output_type in ["gzip","bz2"]:
186 |             chrom_df.to_csv(outf,sep='\t',float_format="%.2f",header=True,index=False,mode='wb',compression=args.output_type,chunksize=1000000)
187 |         elif args.output_type == "hdf5":
188 |             chrom_df=chrom_df.set_index(['CHR','START','END'])
189 |             chrom_df.to_hdf(args.outf+"."+chrom,key="data",mode='w', append=True, format='table',min_itemsize=30)
190 |         if args.save_label_source is True:
191 |             outf_labels=add_filename_prefix(args.outf,'label_source.'+chrom)
192 |             if args.output_type in ["gzip","bz2"]:
193 |                 chrom_label_source_df.to_csv(outf_labels,sep='\t',float_format="%.2f",header=True,index=False,mode='wb',compression=args.output_type,chunksize=1000000)
194 |             elif args.output_type=="hdf5":
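                #sketch: hdf5 outputs written with key="data" below can be read back
                #with pandas, e.g. pd.read_hdf(outf_labels, key="data")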
195 | chrom_label_source_df=chrom_label_source_df.set_index(['CHR','START','END']) 196 | chrom_label_source_df.to_hdf(outf_labels,key='data',mode='w',append=True,format='table',min_itemsize=30) 197 | return (chrom, None, None) 198 | else: 199 | #dump to tmp file -- needed to avoid passing very large objects between processes 200 | pickle_name=randomString() 201 | pickle_path='/'.join([args.temp_dir,pickle_name]) 202 | print("dumping chrom outputs to pickle:"+pickle_path) 203 | with open(pickle_path,'wb') as f: 204 | pickle.dump(chrom_df,f) 205 | return (chrom,pickle_path,chrom_label_source_df) 206 | 207 | 208 | def get_bed_and_bigwig_dict(tasks): 209 | print("creating dictionary of bed files and bigwig files for each task:") 210 | bed_and_bigwig_dict=dict() 211 | for index,row in tasks.iterrows(): 212 | task_name=row['task'] 213 | print(task_name) 214 | bed_and_bigwig_dict[task_name]=dict() 215 | 216 | #get the peak file associated with the task (if provided) 217 | if "narrowPeak" not in row: 218 | task_bed=None 219 | else: 220 | print(row['narrowPeak']) 221 | assert os.path.exists(row["narrowPeak"]) 222 | task_bed=row['narrowPeak'] 223 | bed_and_bigwig_dict[task_name]['bed']=task_bed 224 | 225 | #get the BigWig file associated with the task (if provided) 226 | if "bigwig" not in row: 227 | task_bigwig=None 228 | else: 229 | print(row['bigwig']) 230 | assert os.path.exists(row["bigwig"]) 231 | task_bigwig=row['bigwig'] 232 | bed_and_bigwig_dict[task_name]['bigwig']=task_bigwig 233 | 234 | #get the ambiguous peaks 235 | if "ambig" not in row: 236 | ambig_bed=None 237 | else: 238 | assert os.path.exists(row["ambig"]) 239 | ambig_bed=row['ambig'] 240 | bed_and_bigwig_dict[task_name]['ambig']=ambig_bed 241 | 242 | return bed_and_bigwig_dict 243 | 244 | def get_indices(chrom,chrom_size,args): 245 | print("getting indices") 246 | final_bin_start=((chrom_size-args.right_flank-args.bin_size)//args.bin_stride)*args.bin_stride 247 | #final_coord=(chrom_size//args.bin_stride)*args.bin_stride 248 | first_bin_start=args.left_flank 249 | if final_bin_start<=first_bin_start: 250 | print("the chromosome"+chrom+" is too short for the specified settings of --left_flank, --right_flank, --bin_size, skipping") 251 | return None 252 | chroms=[] 253 | start_pos=[] 254 | end_pos=[] 255 | for index in range(first_bin_start,final_bin_start+1,args.bin_stride): 256 | chroms.append(chrom) 257 | start_pos.append(index-args.left_flank) 258 | end_pos.append(index+args.bin_size+args.right_flank) 259 | return pd.Series(chroms),pd.Series(start_pos),pd.Series(end_pos),first_bin_start,final_bin_start 260 | 261 | 262 | def write_output(task_names,full_df,first_chrom,args,mode='w',task_split_engaged=False,outf=None,labels=False): 263 | ''' 264 | Save genome-wide labels to disk in gzip, hdf5, or pkl format 265 | ''' 266 | 267 | if (args.split_output_by_task==True) and (task_split_engaged==False) : 268 | for task in task_names: 269 | task_df=full_df[['CHR','START','END',task]] 270 | cur_outf=add_filename_prefix(args.outf,task.replace('/','.')) 271 | write_output([task],task_df,first_chrom,args,mode=mode,task_split_engaged=True,outf=cur_outf) 272 | return 273 | if outf==None: 274 | outf=args.outf 275 | if labels==True: 276 | outf=add_filename_prefix(outf,'label_source') 277 | all_negative_df=None 278 | if (args.store_positives_only==True) and (labels==False): 279 | #find regions with at least one positive entry per task 280 | all_negative_df=full_df[['CHR','START','END']][(full_df[task_names]<=0).all(1)] 281 | 
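#rows that are non-positive for every task were collected into all_negative_df
        #above and are written to a separate "universal_negatives" file below; only
        #rows with at least one positive label are kept in the main output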
full_df=full_df[(full_df[task_names]>0).any(1)]
282 |     if (args.store_values_above_thresh is not None) and (labels==False):
283 |         all_negative_df=full_df[['CHR','START','END']][(full_df[task_names]<=args.store_values_above_thresh).all(1)]
284 |         full_df=full_df[(full_df[task_names]>args.store_values_above_thresh).any(1)]
285 | 
286 |     #determine if header needs to be stored
287 |     if first_chrom is True:
288 |         header=True
289 |     else:
290 |         header=False
291 | 
292 |     #get the universal negatives file name
293 |     if all_negative_df is not None:
294 |         if outf.startswith('/'):
295 |             basename_outf=outf.split('/')[-1]
296 |             prefix_outf='/'.join(outf.split('/')[0:-1])
297 |             universal_negatives_outf='.'.join(['/'.join([prefix_outf,"universal_negatives"]),basename_outf])
298 |         else:
299 |             universal_negatives_outf='.'.join([outf,"universal_negatives"])
300 |     if args.output_type=="gzip":
301 |         try:
302 |             full_df.to_csv(outf,sep='\t',header=header,index=False,mode=mode+'b',compression='gzip',chunksize=1000000)
303 |             if all_negative_df is not None:
304 |                 all_negative_df.to_csv(universal_negatives_outf,sep='\t',header=header,index=False,mode=mode+'b',compression='gzip',chunksize=1000000)
305 |         except:
306 |             print("warning! some chromosomes in your file are too small to produce values, skipping")
307 |             pass
308 | 
309 |     elif args.output_type=="bz2":
310 |         try:
311 |             full_df.to_csv(outf,sep='\t',header=header,index=False,mode=mode+'b',compression='bz2',chunksize=1000000)
312 |             if all_negative_df is not None:
313 |                 all_negative_df.to_csv(universal_negatives_outf,sep='\t',header=header,index=False,mode=mode+'b',compression='bz2',chunksize=1000000)
314 |         except:
315 |             print("warning! some chromosomes in your file are too small to produce values, skipping")
316 |             pass
317 |     elif args.output_type=="hdf5":
318 |         full_df=full_df.set_index(['CHR','START','END'])
319 |         if mode=='w':
320 |             append=False
321 |         else:
322 |             append=True
323 |         try:
324 |             full_df.to_hdf(outf,key="data",mode=mode, append=append, format='table',min_itemsize=30)
325 |             if all_negative_df is not None:
326 |                 all_negative_df=all_negative_df.set_index(['CHR','START','END']) #set_index returns a new frame, so the result must be assigned
327 |                 all_negative_df.to_hdf(universal_negatives_outf,key="data",mode=mode, append=append, format='table',min_itemsize=30)
328 |         except:
329 |             print("warning! some chromosomes in your file are too small to produce values, skipping")
330 |             pass
331 |     elif args.output_type=="pkl":
332 |         full_df=full_df.set_index(['CHR','START','END'])
333 |         try:
334 |             full_df.to_pickle(outf,compression="gzip")
335 |             if all_negative_df is not None:
336 |                 all_negative_df=all_negative_df.set_index(['CHR','START','END']) #set_index returns a new frame, so the result must be assigned
337 |                 all_negative_df.to_pickle(universal_negatives_outf,compression="gzip")
338 |         except:
339 |             print("warning! 
some chromosomes in your file are too small to produce values, skipping") 340 | pass 341 | def args_object_from_args_dict(args_dict): 342 | #create an argparse.Namespace from the dictionary of inputs 343 | args_object=argparse.Namespace() 344 | #set the defaults 345 | vars(args_object)['split_output_by_chrom']=False 346 | vars(args_object)['split_output_by_task']=False 347 | vars(args_object)['chroms_to_keep']=None 348 | vars(args_object)['chroms_to_exclude']=None 349 | vars(args_object)['bin_stride']=50 350 | vars(args_object)['left_flank']=400 351 | vars(args_object)['right_flank']=400 352 | vars(args_object)['bin_size']=200 353 | vars(args_object)['chrom_threads']=4 354 | vars(args_object)['task_threads']=1 355 | vars(args_object)['overlap_thresh']=0.5 356 | vars(args_object)['allow_ambiguous']=True 357 | vars(args_object)['store_positives_only']=False 358 | vars(args_object)['store_values_above_thresh']=None 359 | vars(args_object)['output_hdf5_low_mem']=False 360 | vars(args_object)['task_list_sep']='\t' 361 | vars(args_object)['bigwig_stats']='mean' 362 | vars(args_object)['label_transformer']='asinh' 363 | vars(args_object)['label_transformer_pseudocount']=0.001 364 | vars(args_object)['temp_dir']='/tmp' 365 | vars(args_object)['save_label_source']=False 366 | for key in args_dict: 367 | vars(args_object)[key]=args_dict[key] 368 | #set any defaults that are unset 369 | args=args_object 370 | return args 371 | 372 | def genomewide_labels(args): 373 | if type(args)==type({}): 374 | args=args_object_from_args_dict(args) 375 | 376 | #read in the metadata file with: 377 | #task names in column 1, 378 | #path to peak file in column 2, 379 | #path to bigWig file in column 3 380 | #path to ambiguous peaks in column 4 (bed) 381 | tasks=pd.read_csv(args.task_list,sep=args.task_list_sep,header=0) 382 | bed_and_bigwig_dict=get_bed_and_bigwig_dict(tasks) 383 | chrom_sizes=pd.read_csv(args.chrom_sizes,sep='\t',header=None) 384 | 385 | processed_first_chrom=False 386 | #create a Pool to process chromosomes in parallel 387 | pool_args=[] 388 | chrom_order=[] 389 | for index,row in chrom_sizes.iterrows(): 390 | chrom=row[0] 391 | 392 | #determine whether this chromosome should be included in the label file 393 | if args.chroms_to_keep!=None: 394 | if chrom not in args.chroms_to_keep: 395 | continue 396 | if args.chroms_to_exclude!=None: 397 | if chrom in args.chroms_to_exclude: 398 | continue 399 | chrom_order.append(chrom) 400 | chrom_size=row[1] 401 | pool_args.append((chrom,chrom_size,bed_and_bigwig_dict,tasks,args)) 402 | print("creating chromosome thread pool") 403 | try: 404 | #with ThreadPool(args.chrom_threads) as pool: 405 | with ProcessPoolExecutor(max_workers=args.chrom_threads,initializer=init_worker) as pool: 406 | processed_chrom_outputs=pool.map(get_chrom_labels,pool_args) 407 | pool.shutdown(wait=True) 408 | 409 | except KeyboardInterrupt: 410 | print('detected keyboard interrupt') 411 | #shutdown the pool 412 | pool.shutdown(wait=False) 413 | # Kill remaining child processes 414 | kill_child_processes(os.getpid()) 415 | raise 416 | except Exception as e: 417 | print(repr(e)) 418 | #shutdown the pool 419 | pool.shutdown(wait=False) 420 | # Kill remaining child processes 421 | kill_child_processes(os.getpid()) 422 | raise e 423 | 424 | #if the user is happy with separate files for each chromosome, these have already been written to disk. 
We are done
425 |     if args.split_output_by_chrom==True:
426 |         return
427 |     mode='w'
428 |     first_chrom=True
429 |     for chrom, pickle_path,chrom_label_source_df in processed_chrom_outputs:
430 |         #write to output file!
431 |         if pickle_path is None:
432 |             continue
433 |         print("loading temp file with chromosome data:")
434 |         with open(pickle_path,'rb') as f:
435 |             chrom_df=pickle.load(f)
436 |         print("writing output chromosomes:"+str(chrom))
437 |         if chrom_label_source_df is not None:
438 |             write_output(tasks['task'],chrom_label_source_df,first_chrom,args,mode=mode,labels=True)
439 |         write_output(tasks['task'],chrom_df,first_chrom,args,mode=mode)
440 |         #delete the temp file
441 |         os.remove(pickle_path)
442 |         first_chrom=False
443 |         mode='a'
444 |     print("done!")
445 | 
446 | def main():
447 |     args=parse_args()
448 |     genomewide_labels(args)
449 | 
450 | if __name__=="__main__":
451 |     try:
452 |         multiprocessing.set_start_method('forkserver')
453 |     except RuntimeError:
454 |         print("context already set")
455 |     main()
456 | 
--------------------------------------------------------------------------------
/seqdataloader/labelgen/classification_label_protocols.py:
--------------------------------------------------------------------------------
1 | from math import floor,ceil
2 | import pandas as pd
3 | from multiprocessing.pool import ThreadPool
4 | from .utils import rolling_window
5 | import pdb
6 | import numpy as np
7 | from pybedtools import BedTool
8 | 
9 | 
10 | def peak_summit_in_bin_classification(task_name,task_bed,task_bigwig,task_ambig,chrom,first_bin_start,final_bin_start,args):
11 |     '''
12 |     For each peak, the summit position is determined.
13 | 
14 |     The minimum bin considered is args.bin_size upstream of the summit;
15 |     the maximum bin considered is args.bin_size downstream of the summit.
16 | 
17 |     Within this range, bin centers are shifted by args.bin_stride.
18 | 
19 |     If args.allow_ambiguous is specified, the bins adjacent to the two
20 |     extremes are marked as ambiguous.
21 |     '''
22 |     #get the peaks for the current chromosome by intersecting the task_bed with the chromosome coordinates
23 |     task_bed=BedTool(task_bed)
24 |     if task_ambig is not None:
25 |         task_ambig=BedTool(task_ambig)
26 |     min_chrom_coord=first_bin_start
27 |     max_chrom_coord=final_bin_start
28 |     if min_chrom_coord >= max_chrom_coord:
29 |         print("the chromosome "+chrom+" is too short for the specified settings of --left_flank, --right_flank, --bin_size, skipping")
30 |         return task_name,None,None
31 |     chrom_coords=chrom+'\t'+str(min_chrom_coord)+'\t'+str(max_chrom_coord)
32 |     chrom_bed=BedTool(chrom_coords,from_string=True)
33 |     chrom_task_bed=task_bed.intersect(chrom_bed)
34 |     chrom_ambig_bed=None
35 |     if ((args.allow_ambiguous==True) and (task_ambig!=None)):
36 |         chrom_ambig_bed=task_ambig.intersect(chrom_bed)
37 |     print("got peak subset for chrom:"+str(chrom)+" for task:"+str(task_name))
38 | 
39 |     #pre-allocate a numpy array of 0's
40 |     num_bins=(final_bin_start-first_bin_start)//args.bin_stride+1
41 |     coverage_vals=np.zeros(num_bins)
42 |     if args.save_label_source is True:
43 |         label_source_dict=dict()
44 |     else:
45 |         label_source_dict=None
46 | 
47 |     for entry in chrom_task_bed:
48 |         chrom=entry[0]
49 |         peak_start=int(entry[1])
50 |         peak_end=int(entry[2])
51 |         summit=peak_start+int(entry[-1])
52 | 
53 |         chromosome_min_bin_index=ceil((summit-args.bin_size-first_bin_start)/args.bin_stride)
54 |         min_bin_start=chromosome_min_bin_index*args.bin_stride
55 | 
chromosome_max_bin_index=floor((summit-first_bin_start)/args.bin_stride)
56 |         max_bin_start=chromosome_max_bin_index*args.bin_stride
57 | 
58 |         #mark each bin in the range computed above as positive
59 |         index_coverage_vals=chromosome_min_bin_index
60 |         for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
61 |             if index_coverage_vals >= 0 and index_coverage_vals <= (num_bins - 1):
62 |                 coverage_vals[index_coverage_vals]=1
63 |                 if args.save_label_source is True:
64 |                     label_source_dict[index_coverage_vals]={}
65 |                     label_source_dict[index_coverage_vals][task_name+".CHR"]=chrom
66 |                     label_source_dict[index_coverage_vals][task_name+".START"]=peak_start
67 |                     label_source_dict[index_coverage_vals][task_name+".END"]=peak_end
68 |             index_coverage_vals+=1
69 | 
70 |         #if allow_ambiguous supplied by user, shift 1 bin left and 1 bin right
71 |         if args.allow_ambiguous==True:
72 |             chromosome_min_bin_index-=1
73 |             if chromosome_min_bin_index > 0 and chromosome_min_bin_index <= (num_bins - 1):
74 |                 coverage_vals[chromosome_min_bin_index]=np.nan
75 |                 if (args.save_label_source is True) and (chromosome_min_bin_index in label_source_dict):
76 |                     del label_source_dict[chromosome_min_bin_index]
77 |             chromosome_max_bin_index+=1
78 |             if chromosome_max_bin_index >= 0 and chromosome_max_bin_index < (num_bins - 1):
79 |                 coverage_vals[chromosome_max_bin_index]=np.nan
80 |                 if (args.save_label_source is True) and (chromosome_max_bin_index in label_source_dict):
81 |                     del label_source_dict[chromosome_max_bin_index]
82 | 
83 |     #if a bed file of ambiguous labels is specified, label the overlapping bins with np.nan
84 |     if ((args.allow_ambiguous==True) and (chrom_ambig_bed!=None)):
85 |         for entry in chrom_ambig_bed:
86 |             chrom=entry[0]
87 |             peak_start=int(entry[1])
88 |             peak_end=int(entry[2])
89 |             summit=peak_start+int(entry[-1])
90 | 
91 |             chromosome_min_bin_index=ceil((summit-args.bin_size-first_bin_start)/args.bin_stride)
92 |             min_bin_start=chromosome_min_bin_index*args.bin_stride
93 |             chromosome_max_bin_index=floor((summit-first_bin_start)/args.bin_stride)
94 |             max_bin_start=chromosome_max_bin_index*args.bin_stride
95 | 
96 |             #mark each bin in the range computed above as ambiguous
97 |             index_coverage_vals=chromosome_min_bin_index
98 |             for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
99 |                 if index_coverage_vals >= 0 and index_coverage_vals <= (num_bins - 1):
100 |                     coverage_vals[index_coverage_vals]=np.nan
101 |                     if args.save_label_source is True:
102 |                         if index_coverage_vals in label_source_dict:
103 |                             del label_source_dict[index_coverage_vals]
104 |                 index_coverage_vals+=1
105 | 
106 | 
107 |     print("finished chromosome:"+str(chrom)+" for task:"+str(task_name))
108 |     return task_name,coverage_vals,label_source_dict
109 | 
110 | def peak_percent_overlap_with_bin_classification(task_name,task_bed,task_bigwig,task_ambig,chrom,first_bin_start,final_bin_start,args):
111 |     '''
112 |     At least args.overlap_thresh (50% by default) of the bin must overlap with the peak for a positive label
113 |     '''
114 |     #get the peaks for the current chromosome by intersecting the task_bed with the chromosome coordinates
115 |     task_bed=BedTool(task_bed)
116 |     if task_ambig is not None:
117 |         task_ambig=BedTool(task_ambig)
118 |     min_chrom_coord=first_bin_start
119 |     max_chrom_coord=final_bin_start
120 |     if min_chrom_coord >= max_chrom_coord:
121 |         print("the chromosome "+chrom+" is too short for the specified settings of --left_flank, --right_flank, --bin_size, skipping")
122 |         return task_name, None, None
123 | 
chrom_coords=chrom+'\t'+str(min_chrom_coord)+'\t'+str(max_chrom_coord) 124 | chrom_bed=BedTool(chrom_coords,from_string=True) 125 | chrom_task_bed=task_bed.intersect(chrom_bed) 126 | chrom_ambig_bed=None 127 | if ((args.allow_ambiguous==True) and (task_ambig!=None)): 128 | chrom_ambig_bed=task_ambig.intersect(chrom_bed) 129 | print("got peak subset for chrom:"+str(chrom)+" for task:"+str(task_name)) 130 | #pre-allocate a numpy array of 0's 131 | num_bins=(final_bin_start-first_bin_start)//args.bin_stride+1 132 | coverage_vals=np.zeros(num_bins) 133 | 134 | if args.save_label_source is True: 135 | label_source_dict=dict() 136 | else: 137 | label_source_dict=None 138 | 139 | for entry in chrom_task_bed: 140 | chrom=entry[0] 141 | peak_start=int(entry[1]) 142 | peak_end=int(entry[2]) 143 | min_overlap=int(round(args.overlap_thresh*min(args.bin_size, (peak_end-peak_start)))) 144 | 145 | #get the bin indices that overlap the peak 146 | chromosome_min_bin_index=ceil((peak_start-(args.bin_size-min_overlap)-first_bin_start)/args.bin_stride) 147 | min_bin_start=chromosome_min_bin_index*args.bin_stride 148 | chromosome_max_bin_index=floor((peak_end-min_overlap-first_bin_start)/args.bin_stride) 149 | max_bin_start=chromosome_max_bin_index*args.bin_stride 150 | 151 | #get mean coverage in bigwig for each bin specified above 152 | index_coverage_vals=chromosome_min_bin_index 153 | for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride): 154 | if index_coverage_vals >= 0 and index_coverage_vals <= (num_bins - 1): 155 | coverage_vals[index_coverage_vals]=1 156 | if args.save_label_source is True: 157 | label_source_dict[index_coverage_vals]={} 158 | label_source_dict[index_coverage_vals][task_name+".CHR"]=chrom 159 | label_source_dict[index_coverage_vals][task_name+".START"]=peak_start 160 | label_source_dict[index_coverage_vals][task_name+".END"]=peak_end 161 | index_coverage_vals+=1 162 | 163 | #if allow_ambiguous supplied by user, shift 1 bin left and 1 bin right 164 | if args.allow_ambiguous==True: 165 | if chromosome_min_bin_index > 0 and chromosome_min_bin_index <= (num_bins - 1): 166 | chromosome_min_bin_index-=1 167 | coverage_vals[chromosome_min_bin_index]=np.nan 168 | if (args.save_label_source is True) and (chromosome_min_bin_index in label_source_dict): 169 | del label_source_dict[chromosome_min_bin_index] 170 | 171 | if chromosome_max_bin_index >= 0 and chromosome_max_bin_index < (num_bins - 1): 172 | chromosome_max_bin_index+=1 173 | coverage_vals[chromosome_max_bin_index]=np.nan 174 | if (args.save_label_source is True) and (chromosome_max_bin_index in label_source_dict): 175 | del label_source_dict[chromosome_max_bin_index] 176 | 177 | 178 | if ((args.allow_ambiguous==True) and (task_ambig!=None)): 179 | for entry in chrom_ambig_bed: 180 | chrom=entry[0] 181 | peak_start=int(entry[1]) 182 | peak_end=int(entry[2]) 183 | min_overlap=int(round(args.overlap_thresh*min(args.bin_size, (peak_end-peak_start)))) 184 | 185 | #get the bin indices that overlap the peak 186 | chromosome_min_bin_index=ceil((peak_start-(args.bin_size-min_overlap)-first_bin_start)/args.bin_stride) 187 | min_bin_start=chromosome_min_bin_index*args.bin_stride 188 | chromosome_max_bin_index=floor((peak_end-min_overlap-first_bin_start)/args.bin_stride) 189 | max_bin_start=chromosome_max_bin_index*args.bin_stride 190 | 191 | #get mean coverage in bigwig for each bin specified above 192 | index_coverage_vals=chromosome_min_bin_index 193 | for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride): 194 
            #mark each bin identified above as ambiguous
            index_coverage_vals=chromosome_min_bin_index
            for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
                if index_coverage_vals >= 0 and index_coverage_vals <= (num_bins-1):
                    coverage_vals[index_coverage_vals]=np.nan
                    if (args.save_label_source is True) and (index_coverage_vals in label_source_dict):
                        del label_source_dict[index_coverage_vals]
                index_coverage_vals+=1

    print("finished chromosome:"+str(chrom)+" for task:"+str(task_name))
    return task_name, coverage_vals, label_source_dict
--------------------------------------------------------------------------------
/seqdataloader/labelgen/regression_label_protocols.py:
--------------------------------------------------------------------------------
from math import floor,ceil
import pandas as pd
from .utils import rolling_window
import pdb
import numpy as np
from pybedtools import BedTool
import pyBigWig

def transform_label_vals(labels,label_transformer,pseudocount=0.001):
    if label_transformer is None:
        return labels
    elif label_transformer=="None":
        return labels
    elif label_transformer == 'asinh':
        return np.arcsinh(labels)
    elif label_transformer == 'log10':
        return np.log10(labels+pseudocount)
    elif label_transformer == 'log':
        return np.log(labels+pseudocount)
    else:
        raise Exception("label_transformer must be one of None, asinh, log10, log; you provided:"+str(label_transformer))
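
# Quick sanity check for transform_label_vals (hypothetical values, kept as
# comments so the module has no import-time side effects):
#   transform_label_vals(np.array([0.0, 1.0, 10.0]), 'log10')
#   -> approximately [-3.0, 0.00043, 1.00004]   (i.e. log10 of x + 0.001)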

def peak_summit_in_bin_regression(task_name,task_bed,task_bigwig,task_ambig,chrom,first_bin_start,final_bin_start,args):
    '''
    For each peak, the summit position is determined.

    The left-most labeled bin starts args.bin_size upstream of the summit;
    the right-most labeled bin starts at the summit itself (and so extends
    args.bin_size downstream of it).

    Within this range, bin starts are shifted by args.bin_stride.

    If args.allow_ambiguous is set, coverage is also computed for the bins
    adjacent to the two extremes, and those bins are marked as ambiguous.
    '''
    print("starting chromosome:"+str(chrom)+" for task:"+str(task_name))
    task_bigwig=pyBigWig.open(task_bigwig)
    task_bed=BedTool(task_bed)
    if task_ambig is not None:
        task_ambig=BedTool(task_ambig)
    #get the peaks for the current chromosome by intersecting the task_bed with the chromosome coordinates
    min_chrom_coord=first_bin_start
    max_chrom_coord=final_bin_start
    if min_chrom_coord >= max_chrom_coord:
        print("the chromosome "+chrom+" is too short for the specified settings of --left_flank, --right_flank, --bin_size; skipping")
        return task_name, None, None
    chrom_coords=chrom+'\t'+str(min_chrom_coord)+'\t'+str(max_chrom_coord)
    chrom_bed=BedTool(chrom_coords,from_string=True)
    chrom_task_bed=task_bed.intersect(chrom_bed)
    chrom_ambig_bed=None
    if (args.allow_ambiguous==True) and (task_ambig is not None):
        chrom_ambig_bed=task_ambig.intersect(chrom_bed)
    print("got peak subset for chrom:"+str(chrom)+" for task:"+str(task_name))

    #pre-allocate a numpy array of 0's
    num_bins=(final_bin_start-first_bin_start)//args.bin_stride+1
    coverage_vals=np.zeros(num_bins)
    if args.save_label_source is True:
        label_source_dict=dict()
    else:
        label_source_dict=None

    for entry in chrom_task_bed:
        chrom=entry[0]
        peak_start=int(entry[1])
        peak_end=int(entry[2])
        #narrowPeak format stores the summit offset relative to peak_start in the last column
        summit=peak_start+int(entry[-1])

        chromosome_min_bin_index=ceil((summit-args.bin_size)/args.bin_stride)
        min_bin_start=chromosome_min_bin_index*args.bin_stride
        chromosome_max_bin_index=floor(summit/args.bin_stride)
        max_bin_start=chromosome_max_bin_index*args.bin_stride

        #if allow_ambiguous supplied by user, shift 1 bin left and 1 bin right
        if args.allow_ambiguous==True:
            min_bin_start-=args.bin_stride
            chromosome_min_bin_index-=1
            max_bin_start+=args.bin_stride
            chromosome_max_bin_index+=1
        #get mean coverage in bigwig for each bin specified above
        index_coverage_vals=chromosome_min_bin_index
        for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
            if index_coverage_vals>=0 and index_coverage_vals < num_bins:
                try:
                    coverage_vals[index_coverage_vals]=task_bigwig.stats(chrom,bin_start,bin_start+args.bin_size,type=args.bigwig_stats)[0]
                    if args.save_label_source is True:
                        label_source_dict[index_coverage_vals]={}
                        label_source_dict[index_coverage_vals][task_name+".CHR"]=chrom
                        label_source_dict[index_coverage_vals][task_name+".START"]=peak_start
                        label_source_dict[index_coverage_vals][task_name+".END"]=peak_end
                except Exception:
                    print("could not get coverage:"+str(chrom)+":"+str(bin_start)+"-"+str(bin_start+args.bin_size)+" for task:"+str(task_name))
            index_coverage_vals+=1

    print("checking ambig")
    if chrom_ambig_bed is not None:
        for entry in chrom_ambig_bed:
            chrom=entry[0]
            peak_start=int(entry[1])
            peak_end=int(entry[2])
            summit=peak_start+int(entry[-1])

            chromosome_min_bin_index=ceil((summit-args.bin_size)/args.bin_stride)
            min_bin_start=chromosome_min_bin_index*args.bin_stride
            chromosome_max_bin_index=floor(summit/args.bin_stride)
            max_bin_start=chromosome_max_bin_index*args.bin_stride

            #mark each bin identified above as ambiguous
            index_coverage_vals=chromosome_min_bin_index
            for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
                if index_coverage_vals>=0 and index_coverage_vals < num_bins:
                    coverage_vals[index_coverage_vals]=np.nan
                    if args.save_label_source is True:
                        if index_coverage_vals in label_source_dict:
                            del label_source_dict[index_coverage_vals]
                index_coverage_vals+=1

    print("finished chromosome:"+str(chrom)+" for task:"+str(task_name))
    transformed_vals=transform_label_vals(coverage_vals,args.label_transformer,args.label_transformer_pseudocount)
    return task_name,transformed_vals,label_source_dict
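
# Worked example of the summit-centered binning above (hypothetical numbers):
# with bin_size=200 and bin_stride=50, a summit at position 1000 gives
# chromosome_min_bin_index=ceil((1000-200)/50)=16 and
# chromosome_max_bin_index=floor(1000/50)=20, i.e. bins starting at
# 800,850,900,950,1000 -- the stride-aligned 200bp windows that reach the summit.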

def peak_percent_overlap_with_bin_regression(task_name,task_bed,task_bigwig,task_ambig,chrom,first_bin_start,final_bin_start,args):
    '''
    At least args.overlap_thresh of the bin (e.g. 50% of the central 200bp region
    of a 1kb input) must overlap with the peak for coverage to be computed from
    the provided bigWig.
    '''
    #get the peaks for the current chromosome by intersecting the task_bed with the chromosome coordinates
    print("starting chromosome:"+str(chrom)+" for task:"+str(task_name))
    task_bigwig=pyBigWig.open(task_bigwig)
    task_bed=BedTool(task_bed)
    if task_ambig is not None:
        task_ambig=BedTool(task_ambig)
    min_chrom_coord=first_bin_start
    max_chrom_coord=final_bin_start
    if min_chrom_coord >= max_chrom_coord:
        print("the chromosome "+chrom+" is too short for the specified settings of --left_flank, --right_flank, --bin_size; skipping")
        return task_name,None,None
    chrom_coords=chrom+'\t'+str(min_chrom_coord)+'\t'+str(max_chrom_coord)
    chrom_bed=BedTool(chrom_coords,from_string=True)
    chrom_task_bed=task_bed.intersect(chrom_bed)
    chrom_ambig_bed=None
    if (args.allow_ambiguous==True) and (task_ambig is not None):
        chrom_ambig_bed=task_ambig.intersect(chrom_bed)

    print("got peak subset for chrom:"+str(chrom)+" for task:"+str(task_name))
    #pre-allocate a numpy array of 0's
    num_bins=(final_bin_start-first_bin_start)//args.bin_stride+1
    coverage_vals=np.zeros(num_bins)
    if args.save_label_source is True:
        label_source_dict=dict()
    else:
        label_source_dict=None

    for entry in chrom_task_bed:
        chrom=entry[0]
        peak_start=int(entry[1])
        peak_end=int(entry[2])
        min_overlap=int(round(args.overlap_thresh*args.bin_size))

        #get the bin indices that overlap the peak
        chromosome_min_bin_index=(peak_start-min_overlap-first_bin_start)//args.bin_stride
        min_bin_start=chromosome_min_bin_index*args.bin_stride
        chromosome_max_bin_index=(peak_end-min_overlap-first_bin_start)//args.bin_stride
        max_bin_start=chromosome_max_bin_index*args.bin_stride

        #if allow_ambiguous supplied by user, shift 1 bin left and 1 bin right
        if args.allow_ambiguous==True:
            min_bin_start-=args.bin_stride
            chromosome_min_bin_index-=1
            max_bin_start+=args.bin_stride
            chromosome_max_bin_index+=1

        #get mean coverage in bigwig for each bin specified above
        index_coverage_vals=chromosome_min_bin_index
        for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
            if index_coverage_vals>=0 and index_coverage_vals < num_bins:
                try:
                    coverage_vals[index_coverage_vals]=task_bigwig.stats(chrom,bin_start,bin_start+args.bin_size,type=args.bigwig_stats)[0]
                    if args.save_label_source is True:
                        label_source_dict[index_coverage_vals]={}
                        label_source_dict[index_coverage_vals][task_name+".CHR"]=chrom
                        label_source_dict[index_coverage_vals][task_name+".START"]=peak_start
                        label_source_dict[index_coverage_vals][task_name+".END"]=peak_end
                except Exception:
                    print("could not get coverage:"+str(chrom)+":"+str(bin_start)+"-"+str(bin_start+args.bin_size)+" for task:"+str(task_name))
            index_coverage_vals+=1
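
    # Worked example (hypothetical numbers): bin_size=200, bin_stride=50,
    # overlap_thresh=0.5 and first_bin_start=0 give min_overlap=100; a peak at
    # 1000-1400 then labels bin indices (1000-100)//50=18 through (1400-100)//50=26,
    # i.e. bin starts 900..1300, each sharing at least 100bp with the peak.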

    if (args.allow_ambiguous==True) and (task_ambig is not None):
        for entry in chrom_ambig_bed:
            chrom=entry[0]
            peak_start=int(entry[1])
            peak_end=int(entry[2])
            min_overlap=int(round(args.overlap_thresh*args.bin_size))

            #get the bin indices that overlap the ambiguous region
            chromosome_min_bin_index=(peak_start-min_overlap-first_bin_start)//args.bin_stride
            min_bin_start=chromosome_min_bin_index*args.bin_stride
            chromosome_max_bin_index=(peak_end-min_overlap-first_bin_start)//args.bin_stride
            max_bin_start=chromosome_max_bin_index*args.bin_stride
            #mark each bin identified above as ambiguous
            index_coverage_vals=chromosome_min_bin_index
            for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
                if index_coverage_vals>=0 and index_coverage_vals < num_bins:
                    coverage_vals[index_coverage_vals]=np.nan
                    if args.save_label_source is True:
                        if index_coverage_vals in label_source_dict:
                            del label_source_dict[index_coverage_vals]
                index_coverage_vals+=1

    print("finished chromosome:"+str(chrom)+" for task:"+str(task_name))
    transformed_vals=transform_label_vals(coverage_vals,args.label_transformer,args.label_transformer_pseudocount)
    return task_name,transformed_vals,label_source_dict

def all_genome_bins_regression(task_name,task_bed,task_bigwig,task_ambig,chrom,first_bin_start,final_bin_start,args):
    '''
    compute bigWig coverage for all bins in the chromosome, regardless of whether a called peak overlaps the bin
    '''
    print("starting chromosome:"+str(chrom)+" for task:"+str(task_name))
    task_bigwig=pyBigWig.open(task_bigwig)
    if task_ambig is not None:
        task_ambig=BedTool(task_ambig)

    #get the BigWig value at each position along the chromosome (cutting off anything that extends beyond final_coord)
    try:
        values=task_bigwig.values(chrom,first_bin_start,final_bin_start+args.bin_size,numpy=True)
    except Exception:
        print("Warning! Chromosome:"+str(chrom)+" appears not to be present in the bigWig file for task:"+task_name)
        return task_name,None,None
    #replace nan values with 0
    values=np.nan_to_num(values)
    #reshape the values such that the number of columns is equal to the bin_stride
    values=np.reshape(values,((final_bin_start+args.bin_size-first_bin_start)//args.bin_stride,args.bin_stride))
    #sum across the columns to get one coverage sum per stride
    strided_sums=np.sum(values,axis=1)

    #compute rolling average for each bin
    bin_means=np.sum(rolling_window(strided_sums,args.bin_size//args.bin_stride),-1)/args.bin_size
    norm_bin_means=transform_label_vals(bin_means,args.label_transformer,args.label_transformer_pseudocount)
    num_bins=norm_bin_means.shape[0]
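
    # How the vectorized binning above works (hypothetical numbers): with
    # bin_stride=50 and bin_size=200, each row of the reshaped array covers 50
    # consecutive bases, strided_sums holds one coverage sum per 50bp block, and
    # rolling_window(strided_sums,4) groups 4 adjacent blocks, so
    # bin_means[i] == sum(values over [i*50, i*50+200)) / 200 without an explicit loop.
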
    #add in ambiguous bins
    chrom_ambig_bed=None
    if (args.allow_ambiguous==True) and (task_ambig is not None):
        min_chrom_coord=first_bin_start
        max_chrom_coord=final_bin_start
        if min_chrom_coord >= max_chrom_coord:
            print("the chromosome "+chrom+" is too short for the specified settings of --left_flank, --right_flank, --bin_size; skipping")
            return task_name,None,None
        chrom_coords=chrom+'\t'+str(min_chrom_coord)+'\t'+str(max_chrom_coord)
        chrom_bed=BedTool(chrom_coords,from_string=True)
        chrom_ambig_bed=task_ambig.intersect(chrom_bed)
        for entry in chrom_ambig_bed:
            chrom=entry[0]
            peak_start=int(entry[1])
            peak_end=int(entry[2])
            summit=peak_start+int(entry[-1])
            chromosome_min_bin_index=ceil((summit-args.bin_size)/args.bin_stride)
            min_bin_start=chromosome_min_bin_index*args.bin_stride
            chromosome_max_bin_index=floor(summit/args.bin_stride)
            max_bin_start=chromosome_max_bin_index*args.bin_stride

            #mark each bin overlapping the ambiguous summit as ambiguous
            index_coverage_vals=chromosome_min_bin_index
            for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
                if index_coverage_vals>=0 and index_coverage_vals < num_bins:
                    norm_bin_means[index_coverage_vals]=np.nan
                index_coverage_vals+=1

    label_source_dict=None
    if args.save_label_source is True:
        assert task_bed is not None
        print("getting source peaks for genome bins")
        label_source_dict=dict()
        task_bed=BedTool(task_bed)
        min_chrom_coord=first_bin_start
        max_chrom_coord=final_bin_start
        if min_chrom_coord >= max_chrom_coord:
            return task_name, norm_bin_means, None
        chrom_coords=chrom+'\t'+str(min_chrom_coord)+'\t'+str(max_chrom_coord)
        chrom_bed=BedTool(chrom_coords,from_string=True)
        chrom_task_bed=task_bed.intersect(chrom_bed)
        for entry in chrom_task_bed:
            chrom=entry[0]
            peak_start=int(entry[1])
            peak_end=int(entry[2])
            summit=peak_start+int(entry[-1])

            chromosome_min_bin_index=ceil((summit-args.bin_size)/args.bin_stride)
            min_bin_start=chromosome_min_bin_index*args.bin_stride
            chromosome_max_bin_index=floor(summit/args.bin_stride)
            max_bin_start=chromosome_max_bin_index*args.bin_stride

            #record the source peak for each non-ambiguous bin
            index_coverage_vals=chromosome_min_bin_index
            for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
                if index_coverage_vals>=0 and index_coverage_vals < num_bins:
                    if not np.isnan(norm_bin_means[index_coverage_vals]):
                        label_source_dict[index_coverage_vals]={}
                        label_source_dict[index_coverage_vals][task_name+".CHR"]=chrom
                        label_source_dict[index_coverage_vals][task_name+".START"]=peak_start
                        label_source_dict[index_coverage_vals][task_name+".END"]=peak_end
                index_coverage_vals+=1

    print("finished chromosome:"+str(chrom)+" for task:"+str(task_name))
    return task_name,norm_bin_means,label_source_dict
--------------------------------------------------------------------------------
/seqdataloader/labelgen/rolling_average.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import pyBigWig
import pdb
from math import floor

def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)


task_bigwig=pyBigWig.open("./bigwig_files_from_encode_for_label_comparison/ENCFF842XRQ.bigWig")
chromsizes=pd.read_csv("hg38.chrom.sizes",header=None,sep='\t')
bin_size=200
bin_stride=50
left_flank=400
right_flank=400
seq_size=left_flank+right_flank+bin_size
task_name="test"
for index,row in chromsizes.iterrows():
    chrom=row[0]
    chromsize=row[1]
    nbins=chromsize//bin_stride
    final_coord=nbins*bin_stride
    print(final_coord)
    print(chromsize)
    values=task_bigwig.values(chrom,0,final_coord,numpy=True)
    print("got values")
    cols=bin_stride
    rows=final_coord//cols
    values=np.reshape(values,(rows,cols))
    print("completed reshape!")
    #sum the bins
    binsums=np.sum(values,axis=1)
    print("completed bin sums")
    bin_means=np.sum(rolling_window(binsums,bin_size//bin_stride),-1)/bin_size
    print("rolled")
    non_zero_inds=np.nonzero(bin_means)[0]
    non_zero_seq_start=non_zero_inds*bin_stride-left_flank
    non_zero_seq_end=non_zero_seq_start+seq_size
    non_zero_bins=dict()
    for i in range(non_zero_inds.shape[0]):
        bin_index=non_zero_inds[i]
        cur_bin_mean=bin_means[bin_index]
        non_zero_bins[(chrom,non_zero_seq_start[i],non_zero_seq_end[i])]=dict()
        non_zero_bins[(chrom,non_zero_seq_start[i],non_zero_seq_end[i])][task_name]=cur_bin_mean
    print("finished chrom:"+str(chrom)+" for task:"+str(task_name))
--------------------------------------------------------------------------------
/seqdataloader/labelgen/utils.py:
--------------------------------------------------------------------------------
import numpy as np

def rolling_window(a, window):
    #view a 1-d array as a 2-d array of overlapping length-`window` slices, without copying
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
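
# Example of the stride trick (hypothetical input): rolling_window returns
# overlapping views, not copies, so no extra memory is allocated:
#   rolling_window(np.arange(5), 3)
#   -> array([[0, 1, 2],
#             [1, 2, 3],
#             [2, 3, 4]])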
--------------------------------------------------------------------------------
/seqdataloader/queue_config.py:
--------------------------------------------------------------------------------
queue_feed_chunk_size=50 #number of items to queue into pool at once for processing
max_write_chunk=30000000
--------------------------------------------------------------------------------
/seqdataloader/tdb_config.py:
--------------------------------------------------------------------------------
tdb_config_params={"sm.check_coord_dups":False,
                   "sm.check_coord_oob":False,
                   "sm.check_global_order":False,
                   "sm.num_writer_threads":50,
                   "sm.num_reader_threads":50,
                   "sm.num_async_threads":50,
                   "vfs.num_threads":50}
# "sm.memory_budget":"5000000000"
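
# Sketch of how these settings are typically consumed (an assumption for
# illustration -- the actual call site lives in the dbingest modules):
#   import tiledb
#   ctx = tiledb.Ctx(tiledb.Config(tdb_config_params))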
--------------------------------------------------------------------------------
/seqdataloader/utils.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import pyBigWig
from pybedtools import BedTool
from itertools import islice
from collections import OrderedDict

def open_bigwig_for_parsing(fname,parallel=False):
    if not parallel:
        return pyBigWig.open(fname)
    else:
        #pybigwig objects cannot be pickled, so return the filename and open it inside the worker
        return fname

def open_csv_for_parsing(fname,parallel=False):
    #BedTool objects can be handed to workers directly, so parallel mode needs no special handling
    return BedTool(fname)

def parse_bigwig_chrom_vals(entry):
    bigwig_object=entry[0]
    if isinstance(bigwig_object,str):
        bigwig_object=pyBigWig.open(bigwig_object)
    chrom=entry[1]
    start=entry[2]
    end=entry[3]
    cur_attribute_info=entry[4]
    #note: pybigwig uses NA in place of 0 where there are no reads; replace with 0.
    bw_chroms=bigwig_object.chroms().keys()
    if chrom not in bw_chroms:
        #chromosome not in bigwig: return all NA's & warn that the chromosome is not present in the dataset
        print("WARNING: chromosome:"+str(chrom)+" was not found in the bigwig file:"+str(bigwig_object))
        size=end-start
        signal_data=np.full(size,np.nan)
    else:
        try:
            signal_data=np.nan_to_num(bigwig_object.values(chrom,start,end))
        except Exception as e:
            print(chrom+"\t"+str(start)+"\t"+str(end)+str(cur_attribute_info))
            raise e
    return start, end, signal_data


def parse_narrowPeak_chrom_vals(entry):
    task_bed=entry[0]
    chrom=entry[1]
    start=entry[2]
    end=entry[3]
    num_entries=end-start
    chrom_coords=chrom+'\t'+str(start)+'\t'+str(end)
    chrom_bed=BedTool(chrom_coords,from_string=True)
    cur_bed=task_bed.intersect(chrom_bed)
    cur_attribute_info=entry[4]
    store_summits=None
    summit_indicator=None
    summit_from_peak_center=None
    if 'store_summits' in cur_attribute_info:
        store_summits=cur_attribute_info['store_summits']
        if store_summits is True:
            summit_from_peak_center=cur_attribute_info['summit_from_peak_center']
            summit_indicator=cur_attribute_info['summit_indicator']
    signal_data = np.zeros(num_entries, dtype=int)
    summits=[]
    for entry in cur_bed:
        #offset relative to start position of the interval
        entry_start=int(entry[1])-start
        entry_end=int(entry[2])-start
        signal_data[entry_start:entry_end]=1
        #add in summits in a separate step to avoid overwriting them with "1's" for overlapping peak coordinates;
        #the overwriting issue is particularly relevant for pseudobulk data.
        if store_summits is True:
            if summit_from_peak_center is True:
                summit_pos=int(entry_start+(entry_end-entry_start)*0.5)
            else:
                try:
                    summit_pos=entry_start+int(entry[-1])
                except Exception:
                    print("WARNING: could not add summit position from last column of narrowPeak file, falling back to peak center:"+str(entry))
                    summit_pos=int(entry_start+(entry_end-entry_start)*0.5)
            if (summit_pos < entry_end) and (summit_pos > entry_start):
                summits.append(summit_pos)
            else:
                print("WARNING: summit position outside peak region, skipping:"+str(entry))
    if store_summits is True:
        signal_data[summits]=summit_indicator
    return start, end, signal_data

def chunkify(iterable,chunk):
    it=iter(iterable)
    while True:
        piece=list(islice(it,chunk))
        if piece:
            yield piece
        else:
            return
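
# Example (hypothetical input): chunkify lazily slices any iterable into
# fixed-size pieces, with a short final piece if the length is not divisible:
#   list(chunkify(range(7), 3)) -> [[0, 1, 2], [3, 4, 5], [6]]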

def transform_indices_to_chrom_coords(start_chunk_index,end_chunk_index,chrom_indices):
    '''
    map a range of global (genome-concatenated) indices back to per-chromosome
    coordinates, splitting the range across chromosome boundaries as needed
    '''
    chrom_coords=[]
    while True:
        found=False
        for chrom in chrom_indices:
            cur_chrom_start_index=chrom_indices[chrom][0]
            cur_chrom_end_index=chrom_indices[chrom][1]
            cur_chrom_size=chrom_indices[chrom][2]
            if start_chunk_index >= cur_chrom_start_index:
                if start_chunk_index < cur_chrom_end_index:
                    found=True
                    #start position is on this chromosome
                    chrom_coord_start=start_chunk_index-cur_chrom_start_index
                    #check if end coordinate falls on the same chromosome
                    if end_chunk_index < cur_chrom_end_index:
                        #the whole range lies on one chromosome
                        chrom_coord_end=end_chunk_index-cur_chrom_start_index
                        chrom_coords.append((chrom,chrom_coord_start,chrom_coord_end,start_chunk_index,end_chunk_index))
                        return chrom_coords
                    else:
                        #the range spills onto the next chromosome: emit this chromosome's piece and keep going
                        chrom_coord_end=cur_chrom_size
                        chrom_coords.append((chrom,chrom_coord_start,chrom_coord_end,start_chunk_index,cur_chrom_end_index))
                        #update start_chunk_index
                        start_chunk_index=cur_chrom_end_index
        if found is False:
            if len(chrom_coords)==0:
                raise Exception("failed to transform indices:"+str(start_chunk_index)+"-"+str(end_chunk_index)+" to chrom coords;"+str(chrom_indices))
            else:
                return chrom_coords

def transform_chrom_size_to_indices(chrom_sizes):
    '''
    chrom_sizes is a dataframe
    get 0-based tdb coordinates for the start & end of each chromosome
    '''
    start_coord=0
    chrom_indices=OrderedDict()
    for index,row in chrom_sizes.iterrows():
        chrom=row[0]
        size=row[1]
        chrom_indices[chrom]=[start_coord,start_coord+size,size]
        start_coord=start_coord+size
    return chrom_indices,start_coord
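
# Worked example (hypothetical chrom sizes): a dataframe with rows
# ('chr1', 1000) and ('chr2', 500) yields
#   chrom_indices == OrderedDict([('chr1', [0, 1000, 1000]), ('chr2', [1000, 1500, 500])])
# and total size 1500; transform_indices_to_chrom_coords(990, 1010, chrom_indices)
# then splits the range across the boundary into
#   [('chr1', 990, 1000, 990, 1000), ('chr2', 0, 10, 1000, 1010)].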
tiledb_paths="/mnt/data/tiledb/encode/dnase/ENCSR000EOY" 30 | ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths, 31 | pos_label_source_attribute=pos_label_source_attribute, 32 | neg_label_source_attribute=neg_label_source_attribute) 33 | string_vals=ctov.__call__(coords) 34 | 35 | -------------------------------------------------------------------------------- /tests/test_tiledb_coords_to_vals.py: -------------------------------------------------------------------------------- 1 | #unit tests for class seqdataloader.batchproducers.coordbased.coordstovals.BasicTiledbProfileCoordsToVals 2 | import pdb 3 | from seqdataloader.batchproducers.coordbased.coordstovals.tiledb import * 4 | 5 | #generate some test coords objects 6 | from collections import namedtuple 7 | Coord=namedtuple('Coord','chrom start end isplusstrand') 8 | coords=[Coord('chr1',1000000,2000000,True), 9 | Coord('chr2',1000000,2000000,True), 10 | Coord('chr3',1000000,2000000,True), 11 | Coord('chr4',1000000,2000000,True), 12 | Coord('chr5',1000000,2000000,True), 13 | Coord('chr6',1000000,2000000,True), 14 | Coord('chr7',1000000,2000000,True), 15 | Coord('chr1',1000000,2000000,False), 16 | Coord('chr2',1000000,2000000,False), 17 | Coord('chr3',1000000,2000000,False), 18 | Coord('chr4',1000000,2000000,False), 19 | Coord('chr5',1000000,2000000,False), 20 | Coord('chr6',1000000,2000000,False), 21 | Coord('chr7',1000000,2000000,False)] 22 | 23 | 24 | pos_label_source_attribute="fc_bigwig" 25 | neg_label_source_attribute="fc_bigwig" 26 | 27 | 28 | #case 1: tiledb_paths is a string 29 | tiledb_paths="/mnt/data/tiledb/encode/dnase/ENCSR000EOY" 30 | ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths, 31 | pos_label_source_attribute=pos_label_source_attribute, 32 | neg_label_source_attribute=neg_label_source_attribute) 33 | string_vals=ctov.__call__(coords) 34 | pdb.set_trace() 35 | 36 | #case2: tiledb_paths is a list 37 | tiledb_paths=["/mnt/data/tiledb/encode/dnase/ENCSR000EOY","/mnt/data/tiledb/encode/dnase/ENCSR000EOY","/mnt/data/tiledb/encode/dnase/ENCSR000EOY"] 38 | ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths, 39 | pos_label_source_attribute=pos_label_source_attribute, 40 | neg_label_source_attribute=neg_label_source_attribute) 41 | list_vals=ctov.__call__(coords) 42 | pdb.set_trace() 43 | 44 | #case3: tiledb_paths is a dict 45 | tiledb_paths={'mode0':"/mnt/data/tiledb/encode/dnase/ENCSR000EOY", 46 | 'mode1':"/mnt/data/tiledb/encode/dnase/ENCSR000EOY", 47 | 'mode2':"/mnt/data/tiledb/encode/dnase/ENCSR000EOY"} 48 | 49 | ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths, 50 | pos_label_source_attribute=pos_label_source_attribute, 51 | neg_label_source_attribute=neg_label_source_attribute) 52 | dict_vals=ctov.__call__(coords) 53 | pdb.set_trace() 54 | 55 | --------------------------------------------------------------------------------