├── .gitignore
├── CHANGES.txt
├── LICENSE
├── MANIFEST.in
├── README.md
├── docs
│   └── README.md
├── examples
│   ├── batchproducers
│   │   └── BatchProducersWithTileDB.ipynb
│   ├── bigwig_files_from_encode_for_label_comparison
│   │   ├── README.md
│   │   └── download_bigwigs.sh
│   ├── dbingest
│   │   ├── dbingest_examples.ipynb
│   │   ├── hg38.chrom.sizes
│   │   ├── run_db_ingest.sh
│   │   ├── run_db_ingest_single_threaded.sh
│   │   └── tier1.encode.dnase.tasks.tsv
│   ├── hg38.blacklist.bed.gz
│   ├── hg38.chrom.sizes
│   ├── hg38.chrom21.sizes
│   ├── labelgen
│   │   ├── bigwig_files_from_encode_for_label_comparison
│   │   ├── genomewide_labels_examples.py
│   │   ├── genomewide_labels_examples.sh
│   │   ├── hg38.chrom.sizes
│   │   ├── hg38.chrom21.sizes
│   │   ├── peak_files_from_encode_for_label_comparison
│   │   ├── save_source_labels_in_labelgen.sh
│   │   └── tasks.labelgen.tsv
│   ├── peak_files_from_encode_for_label_comparison
│   │   └── README
│   └── seqdataloader_examples.ipynb
├── requirements.txt
├── seqdataloader
│   ├── attrib_config.py
│   ├── batchproducers
│   │   ├── __init__.py
│   │   └── coordbased
│   │       ├── __init__.py
│   │       ├── coordbatchproducers.py
│   │       ├── coordbatchtransformers.py
│   │       ├── coordstovals
│   │       │   ├── __init__.py
│   │       │   ├── bigwig.py
│   │       │   ├── core.py
│   │       │   ├── fasta.py
│   │       │   ├── lookup.py
│   │       │   └── tiledb.py
│   │       └── core.py
│   ├── bounded_process_pool_executor.py
│   ├── dbingest
│   │   ├── README.md
│   │   └── __init__.py
│   ├── dbingest_single_threaded
│   │   └── __init__.py
│   ├── labelgen
│   │   ├── __init__.py
│   │   ├── classification_label_protocols.py
│   │   ├── regression_label_protocols.py
│   │   ├── rolling_average.py
│   │   └── utils.py
│   ├── queue_config.py
│   ├── tdb_config.py
│   └── utils.py
├── setup.py
└── tests
    ├── test_tiledb_coords_to_vals.benchmark.py
    └── test_tiledb_coords_to_vals.py
/.gitignore:
--------------------------------------------------------------------------------
1 | seqdataloader.egg-info/
2 | build/
3 | dist/
4 | *.hdf5
5 | *.bigWig
6 | *.gz
7 | *.pyc
8 | __pycache__
9 | 
--------------------------------------------------------------------------------
/CHANGES.txt:
--------------------------------------------------------------------------------
1 | v0.1,01/24/2019 -- Initial release.
2 | v0.11,01/30/2019 -- Added flags --chroms_to_keep, --chroms_to_exclude, --store_positives_only. Indices are stored as strings rather than tuples for more standard bed format output.
3 | v0.111,01/31/2019 -- Default value of False used for the flag --store_positives_only when genomewide_labels is called from a Python script rather than the command-line script. Changed the module name from genomewide_labels to seqdataloader to avoid user confusion when importing the code.
4 | v0.113,02/01/2019 -- Updated how CHROM, START, END are stored in the data frame to avoid problems with quoting in the output bed files. (v0.112 is skipped due to a problem uploading to pypi.)
5 | v0.114,02/01/2019 -- Removed a redundant code pass for storing positives only.
6 | v0.115,02/01/2019 -- Handled an edge case discovered by Soumya.
7 | v0.116,02/07/2019 -- More robust saving of data frames to output hdf5 format.
8 | v0.117,02/17/2019 -- Added minimum required versions for all dependency packages in setup.py.
9 | v0.118,02/17/2019 -- format=table for saving to hdf5 changed to optional, non-default.
10 | v0.120,02/26/2019 -- Functionality to add a bed file with ambiguous regions for each task.
     Saving as format=table in hdf5 is still optional, but is now the default, as this is desired in most cases.
11 | v0.121,02/27/2019 -- Ambiguous peaks labeled with np.nan.
12 | v0.122,02/28/2019 -- Write each chromosome to the output file individually to reduce RAM usage.
13 | v0.123,03/02/2019 -- ambig_bed default
14 | [missing documentation]
15 | v0.127,09/05/2019 -- (Av Shrikumar) Added functionality to load batches from a downsampled negative set and to retrieve labels via a lookup table; the Coordinate object is now a namedtuple rather than a full object.
16 | v0.128,09/10/2019 -- (Anna Shcherbina) Fixed an issue with the wheel structure in the pypi release.
17 | 
18 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 Kundaje Lab
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include . *.md
2 | recursive-include docs *
3 | recursive-include seqdataloader/ *
4 | include requirements.txt
5 | include examples/tasks.tsv
6 | include examples/genomewide_labels_examples.py
7 | include examples/genomewide_labels_examples.sh
8 | include examples/hg38.chrom.sizes
9 | include examples/bigwig_files_from_encode_for_label_comparison/download_bigwigs.sh
10 | recursive-exclude examples/ *bigWig
11 | recursive-exclude examples/ *bed.gz
12 | recursive-exclude examples/ *hdf5
13 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # seqdataloader
2 | Sequence data label generation and ingestion into deep learning models
3 | 
4 | ## Installation
5 | `pip install seqdataloader`
6 | 
7 | If you plan to modify the code, you can install it in development mode:
8 | `pip install -e seqdataloader`
9 | 
10 | Please note: the "dbingest" functionality in seqdataloader requires Python >= 3.7.
11 | 
12 | # Quick Start
13 | 
14 | ## labelgen
15 | The input for the labelgen submodule is a 4-column tab-delimited file with the following fields:
16 | 
17 | * "task" -- Required. User-specified task name.
18 | * "narrowPeak" -- Path to narrowPeak file. (Optional if "bigwig" is specified.)
19 | * "bigwig" -- Path to bigwig file. (Optional if "narrowPeak" is specified.)
20 | * "ambig" -- Bed file containing user-specified regions to label as ambiguous. (Optional.)
21 | 
22 | ```
23 | genomewide_labels --task_list tasks.tsv \
24 |     --outf classificationlabels.SummitWithin200bpCenter.tsv.gz \
25 |     --output_type gzip \ # (one of gzip, bz2, hdf5, pkl)
26 |     --chrom_sizes hg38.chrom.sizes \
27 |     --bin_stride 50 \
28 |     --left_flank 400 \
29 |     --right_flank 400 \
30 |     --bin_size 200 \
31 |     --task_threads 10 \
32 |     --chrom_threads 4 \
33 |     --allow_ambiguous \
34 |     --labeling_approach peak_summit_in_bin_classification
35 | ```
36 | And for regression:
37 | ```
38 | genomewide_labels --task_list tasks.tsv \
39 |     --outf regressionlabels.allbins.hg38.hdf5 \
40 |     --output_type hdf5 \
41 |     --chrom_sizes hg38.chrom.sizes \
42 |     --bin_stride 50 \
43 |     --left_flank 400 \
44 |     --right_flank 400 \
45 |     --chrom_threads 24 \
46 |     --task_threads 2 \
47 |     --label_transformer asinh \ # one of None, asinh, log10, log; default is asinh
48 |     --labeling_approach all_genome_bins_regression
49 | ```
50 | 
51 | labeling_approach can be one of:
52 | 
53 | "peak_summit_in_bin_classification"
54 | 
55 | "peak_percent_overlap_with_bin_classification"
56 | 
57 | "peak_summit_in_bin_regression"
58 | 
59 | "peak_percent_overlap_with_bin_regression"
60 | 
61 | "all_genome_bins_regression"
62 | 
63 | 
64 | ### How to run
65 | Sample datasets are included in the folders `examples/peak_files_from_encode_for_label_comparison` and `examples/bigwig_files_from_encode_for_label_comparison`.
66 | 
67 | ### Executing seqdataloader as a script:
68 | Execute the script:
69 | 
70 | `examples/labelgen/genomewide_labels_examples.sh` for examples of how to generate classification and regression labels on the sample datasets.
71 | The script generates binary classification labels (1, 0, or -1 for ambiguous) or continuous regression labels reflecting the bigWig coverage in each bin, in bed file format:
72 | 
73 | http://mitra.stanford.edu/kundaje/seqdataloader/classificationlabels.50PercentOverlap.tsv.gz
74 | 
75 | http://mitra.stanford.edu/kundaje/seqdataloader/classificationlabels.SummitWithin200bpCenter.tsv.gz
76 | 
77 | http://mitra.stanford.edu/kundaje/seqdataloader/regressionlabels.50PercentOverlap.tsv.gz
78 | 
79 | http://mitra.stanford.edu/kundaje/seqdataloader/regressionlabels.SummitWithin200bpCenter.tsv.gz
80 | 
81 | Corresponding WashU Browser tracks with the optimal narrowPeak and associated bin labels are here:
82 | http://epigenomegateway.wustl.edu/legacy/?genome=hg38&session=GDB2BTMGnB&statusId=1154897038
83 | 
84 | ### Calling seqdataloader as a Python function:
85 | ```
86 | from seqdataloader import *
87 | classification_params={
88 |     'task_list':"tasks.tsv",
89 |     'outf':"classificationlabels.SummitWithin200bpCenter.tsv.gz",
90 |     'output_type':'gzip',
91 |     'chrom_sizes':'hg38.chrom.sizes',
92 |     'chroms_to_keep':['chr21'],
93 |     "store_positives_only":True,
94 |     'bin_stride':50,
95 |     'left_flank':400,
96 |     'right_flank':400,
97 |     'bin_size':200,
98 |     'chrom_threads':10,
99 |     'task_threads':4,
100 |     'allow_ambiguous':True,
101 |     'labeling_approach':'peak_summit_in_bin_classification'
102 | }
103 | genomewide_labels(classification_params)
104 | 
105 | regression_params={
106 |     'task_list':"tasks.tsv",
107 |     'outf':"regressionlabels.all_genome_bins_regression.hdf5",
108 |     'output_type':'hdf5',
109 |     'chrom_sizes':'hg38.chrom.sizes',
110 |     'store_values_above_thresh': 0,
111 |     'chroms_to_keep':['chr21'],
112 |     'bin_stride':50,
113 |     'left_flank':400,
114 |     'right_flank':400,
115 |     'bin_size':200,
116 |     'chrom_threads':10,
117 |     'task_threads':4,
118 |     'labeling_approach':'all_genome_bins_regression',
119 |     'label_transformer':'log10',
120 |     'label_transformer_pseudocount':0.001
121 | }
122 | genomewide_labels(regression_params)
123 | ```
124 | ### Regression label transformations
125 | 
126 | In regression mode ("peak_summit_in_bin_regression", "peak_percent_overlap_with_bin_regression", "all_genome_bins_regression"), the generated labels can be transformed in one of several ways. Use the arguments `label_transformer` and `label_transformer_pseudocount` to specify the desired transformation. Allowed values are:
127 | 
128 | * asinh -- numpy.arcsinh(values) will be computed (this is the default)
129 | * None -- no label transformation will be performed
130 | * log10 -- numpy.log10(values + pseudocount) will be computed, using the pseudocount specified by the `label_transformer_pseudocount` argument. If this argument is not provided, a default pseudocount of 0.001 is used.
131 | * log -- numpy.log(values + pseudocount) will be computed, using a pseudocount as above.
132 | 
133 | ### A note on file outputs
134 | 
135 | The code supports several output types: `hdf5`, `gzip`, `pkl`, `bz2`.
136 | Specify your desired output type with the flag `--output_type`; the default is `gzip`.
137 | Please note that the largest bottleneck in the code is writing the files to disk. `hdf5` has negligible overhead, but using `gzip` or `bz2` may increase runtime. Timing benchmarks are provided in `examples/labelgen/genomewide_labels_examples.sh`.
138 | 
139 | You may speed up I/O by writing chromosome outputs to separate files in parallel. This is currently only supported for the `gzip` and `bz2` output types, as I/O is less of a bottleneck for the `hdf5` and `pkl` output formats. Use the flag `--split_output_by_chrom` to invoke this parallelized saving of chromosomes.
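
As a sanity check, the generated labels can be read back into a pandas dataframe. Below is a minimal sketch (not part of the package) that assumes an `hdf5` output holding a single pandas object with numeric task columns; it also spells out the label transformations described above as plain numpy operations:

```
import numpy as np
import pandas as pd

# pd.read_hdf needs no explicit key when the file contains a single object;
# if your output holds several objects, pass key= explicitly
labels = pd.read_hdf("regressionlabels.all_genome_bins_regression.hdf5")

# the transforms described above, on hypothetical coverage values
pseudocount = 0.001                    # default pseudocount for the log transforms
values = np.array([0.0, 0.5, 10.0])   # hypothetical bigWig coverage values
asinh_labels = np.arcsinh(values)               # label_transformer='asinh' (default)
log10_labels = np.log10(values + pseudocount)   # label_transformer='log10'
log_labels = np.log(values + pseudocount)       # label_transformer='log'

# asinh-transformed labels can be mapped back to raw coverage with sinh
raw_coverage = np.sinh(labels.values)
```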
140 | 
141 | ## dbingest
142 | 
143 | The input tsv file must have a subset of the columns corresponding to the supported configurations:
144 | ```
145 | * encode_config
146 | ** dataset
147 | ** fc_bigwig
148 | ** pval_bigwig
149 | ** count_bigwig_plus_5p
150 | ** count_bigwig_minus_5p
151 | ** count_bigwig_unstranded_5p
152 | ** idr_peak
153 | ** overlap_peak
154 | ** ambig_peak
155 | 
156 | * generic_bigwig
157 | ** bigwig_track
158 | 
159 | ```
160 | # Dependencies
161 | 
162 | Please make sure the following dependencies are installed on your system to use seqdataloader:
163 | * pybedtools
164 | * pyBigWig
165 | * pandas
166 | * numpy
167 | * multiprocessing
168 | 
169 | 
170 | ## Documentation and benchmarks
171 | 
172 | Testing, benchmarks, and documentation can be found in the `docs` folder
173 | 
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | Documentation for benchmarks and testing of seqdataloader
--------------------------------------------------------------------------------
/examples/batchproducers/BatchProducersWithTileDB.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "#load tutorial utilities \n",
10 |     "%reload_ext autoreload\n",
11 |     "%autoreload 2\n",
12 |     "%matplotlib inline\n",
13 |     "import warnings\n",
14 |     "warnings.filterwarnings('ignore')"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "code",
19 |    "execution_count": 2,
20 |    "metadata": {},
21 |    "outputs": [
22 |     {
23 |      "name": "stderr",
24 |      "output_type": "stream",
25 |      "text": [
26 |       "Using TensorFlow backend.\n"
27 |      ]
28 |     }
29 |    ],
30 |    "source": [
31 |     "#unit tests for class seqdataloader.batchproducers.coordbased.coordstovals.BasicTiledbProfileCoordsToVals\n",
32 |     "from seqdataloader.batchproducers.coordbased.coordstovals.tiledb import *\n",
33 |     "\n"
34 |    ]
35 |   },
36 |   {
37 |    "cell_type": "code",
38 |    "execution_count": 3,
39 |    "metadata": {},
40 |    "outputs": [],
41 |    "source": [
42 |     "#generate some test coords objects \n",
43 |     "from collections import namedtuple\n",
44 |     "Coord=namedtuple('Coord','chrom start end isplusstrand')\n",
45 |     "coords=[Coord('chr1',1000000,2000000,True),\n",
46 |     "        Coord('chr2',1000000,2000000,True),\n",
47 |     "        Coord('chr3',1000000,2000000,True),\n",
48 |     "        Coord('chr4',1000000,2000000,True),\n",
49 |     "        Coord('chr5',1000000,2000000,True),\n",
50 |     "        Coord('chr6',1000000,2000000,True),\n",
51 |     "        Coord('chr7',1000000,2000000,True),\n",
52 |     "        Coord('chr1',1000000,2000000,False),\n",
53 |     "        Coord('chr2',1000000,2000000,False),\n",
54 |     "        Coord('chr3',1000000,2000000,False),\n",
55 |     "        Coord('chr4',1000000,2000000,False),\n",
56 |     "        Coord('chr5',1000000,2000000,False),\n",
57 |     "        Coord('chr6',1000000,2000000,False),\n",
58 |     "        Coord('chr7',1000000,2000000,False)]\n",
59 |     "\n",
60 |     "\n",
61 |     "pos_label_source_attribute=\"fc_bigwig\"\n",
62 |     "neg_label_source_attribute=\"fc_bigwig\"\n",
63 |     "\n"
64 |    ]
65 |   },
66 |   {
67 |    "cell_type": "code",
68 |    "execution_count": 5,
69 |    "metadata": {},
70 |    "outputs": [
71 |     {
72 |      "data": {
73 |       "text/plain": [
74 |        "(14, 1000000)"
75 |       ]
76 |      },
77 |      "execution_count": 5,
78 |      "metadata": {},
79 |      "output_type": "execute_result"
80 |     }
81 |    ],
82 |    "source": [
83 |     "\n",
84 |     "#case 1: tiledb_paths is a string\n",
85 | 
"tiledb_paths=\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY\"\n", 86 | "ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths,\n", 87 | " pos_label_source_attribute=pos_label_source_attribute,\n", 88 | " neg_label_source_attribute=neg_label_source_attribute)\n", 89 | "string_vals=ctov.__call__(coords)\n", 90 | "string_vals.shape\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 9, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "coords=[Coord('chr1',1000,2000,True),\n", 100 | " Coord('chr2',1000,2000,True),\n", 101 | " Coord('chr3',1000,2000,True),\n", 102 | " Coord('chr4',1000,2000,True),\n", 103 | " Coord('chr5',1000,2000,True),\n", 104 | " Coord('chr6',1000,2000,True),\n", 105 | " Coord('chr7',1000,2000,True),\n", 106 | " Coord('chr1',1000,2000,False),\n", 107 | " Coord('chr2',1000,2000,False),\n", 108 | " Coord('chr3',1000,2000,False),\n", 109 | " Coord('chr4',1000,2000,False),\n", 110 | " Coord('chr5',1000,2000,False),\n", 111 | " Coord('chr6',1000,2000,False),\n", 112 | " Coord('chr7',1000,2000,False)]" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 10, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "string_vals=ctov.__call__(coords)\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 6, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "[array([[1.23467004, 1.22853994, 1.24536002, ..., 0.156 , 0.15591 ,\n", 133 | " 0.15591 ],\n", 134 | " [0.16829 , 0.16829 , 0.16829 , ..., 0.67333001, 0.67333001,\n", 135 | " 0.75756001],\n", 136 | " [0.67333001, 0.67333001, 0.67333001, ..., 0.25251999, 0.25251999,\n", 137 | " 0.25251999],\n", 138 | " ...,\n", 139 | " [0.92584997, 1.00989997, 1.00989997, ..., 0.28422001, 0.28422001,\n", 140 | " 0.30206999],\n", 141 | " [0. , 0. , 0. , ..., 1.17837 , 1.17837 ,\n", 142 | " 1.17837 ],\n", 143 | " [0.67333001, 0.67333001, 0.67333001, ..., 1.00989997, 1.00989997,\n", 144 | " 1.00989997]]),\n", 145 | " array([[1.23467004, 1.22853994, 1.24536002, ..., 0.156 , 0.15591 ,\n", 146 | " 0.15591 ],\n", 147 | " [0.16829 , 0.16829 , 0.16829 , ..., 0.67333001, 0.67333001,\n", 148 | " 0.75756001],\n", 149 | " [0.67333001, 0.67333001, 0.67333001, ..., 0.25251999, 0.25251999,\n", 150 | " 0.25251999],\n", 151 | " ...,\n", 152 | " [0.92584997, 1.00989997, 1.00989997, ..., 0.28422001, 0.28422001,\n", 153 | " 0.30206999],\n", 154 | " [0. , 0. , 0. , ..., 1.17837 , 1.17837 ,\n", 155 | " 1.17837 ],\n", 156 | " [0.67333001, 0.67333001, 0.67333001, ..., 1.00989997, 1.00989997,\n", 157 | " 1.00989997]]),\n", 158 | " array([[1.23467004, 1.22853994, 1.24536002, ..., 0.156 , 0.15591 ,\n", 159 | " 0.15591 ],\n", 160 | " [0.16829 , 0.16829 , 0.16829 , ..., 0.67333001, 0.67333001,\n", 161 | " 0.75756001],\n", 162 | " [0.67333001, 0.67333001, 0.67333001, ..., 0.25251999, 0.25251999,\n", 163 | " 0.25251999],\n", 164 | " ...,\n", 165 | " [0.92584997, 1.00989997, 1.00989997, ..., 0.28422001, 0.28422001,\n", 166 | " 0.30206999],\n", 167 | " [0. , 0. , 0. 
, ..., 1.17837 , 1.17837 ,\n", 168 | " 1.17837 ],\n", 169 | " [0.67333001, 0.67333001, 0.67333001, ..., 1.00989997, 1.00989997,\n", 170 | " 1.00989997]])]" 171 | ] 172 | }, 173 | "execution_count": 6, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "#case2: tiledb_paths is a list\n", 180 | "tiledb_paths=[\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY\",\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY\",\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY\"]\n", 181 | "ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths,\n", 182 | " pos_label_source_attribute=pos_label_source_attribute,\n", 183 | " neg_label_source_attribute=neg_label_source_attribute)\n", 184 | "list_vals=ctov.__call__(coords)\n", 185 | "list_vals" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 7, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "{'mode0': array([[1.23467004, 1.22853994, 1.24536002, ..., 0.156 , 0.15591 ,\n", 197 | " 0.15591 ],\n", 198 | " [0.16829 , 0.16829 , 0.16829 , ..., 0.67333001, 0.67333001,\n", 199 | " 0.75756001],\n", 200 | " [0.67333001, 0.67333001, 0.67333001, ..., 0.25251999, 0.25251999,\n", 201 | " 0.25251999],\n", 202 | " ...,\n", 203 | " [0.92584997, 1.00989997, 1.00989997, ..., 0.28422001, 0.28422001,\n", 204 | " 0.30206999],\n", 205 | " [0. , 0. , 0. , ..., 1.17837 , 1.17837 ,\n", 206 | " 1.17837 ],\n", 207 | " [0.67333001, 0.67333001, 0.67333001, ..., 1.00989997, 1.00989997,\n", 208 | " 1.00989997]]),\n", 209 | " 'mode1': array([[1.23467004, 1.22853994, 1.24536002, ..., 0.156 , 0.15591 ,\n", 210 | " 0.15591 ],\n", 211 | " [0.16829 , 0.16829 , 0.16829 , ..., 0.67333001, 0.67333001,\n", 212 | " 0.75756001],\n", 213 | " [0.67333001, 0.67333001, 0.67333001, ..., 0.25251999, 0.25251999,\n", 214 | " 0.25251999],\n", 215 | " ...,\n", 216 | " [0.92584997, 1.00989997, 1.00989997, ..., 0.28422001, 0.28422001,\n", 217 | " 0.30206999],\n", 218 | " [0. , 0. , 0. , ..., 1.17837 , 1.17837 ,\n", 219 | " 1.17837 ],\n", 220 | " [0.67333001, 0.67333001, 0.67333001, ..., 1.00989997, 1.00989997,\n", 221 | " 1.00989997]]),\n", 222 | " 'mode2': array([[1.23467004, 1.22853994, 1.24536002, ..., 0.156 , 0.15591 ,\n", 223 | " 0.15591 ],\n", 224 | " [0.16829 , 0.16829 , 0.16829 , ..., 0.67333001, 0.67333001,\n", 225 | " 0.75756001],\n", 226 | " [0.67333001, 0.67333001, 0.67333001, ..., 0.25251999, 0.25251999,\n", 227 | " 0.25251999],\n", 228 | " ...,\n", 229 | " [0.92584997, 1.00989997, 1.00989997, ..., 0.28422001, 0.28422001,\n", 230 | " 0.30206999],\n", 231 | " [0. , 0. , 0. 
, ..., 1.17837 , 1.17837 ,\n", 232 | " 1.17837 ],\n", 233 | " [0.67333001, 0.67333001, 0.67333001, ..., 1.00989997, 1.00989997,\n", 234 | " 1.00989997]])}" 235 | ] 236 | }, 237 | "execution_count": 7, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "#case3: tiledb_paths is a dict\n", 244 | "tiledb_paths={'mode0':\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY\",\n", 245 | " 'mode1':\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY\",\n", 246 | " 'mode2':\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY\"}\n", 247 | "\n", 248 | "ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths,\n", 249 | " pos_label_source_attribute=pos_label_source_attribute,\n", 250 | " neg_label_source_attribute=neg_label_source_attribute)\n", 251 | "dict_vals=ctov.__call__(coords)\n", 252 | "dict_vals\n" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [] 261 | } 262 | ], 263 | "metadata": { 264 | "kernelspec": { 265 | "display_name": "Python 3", 266 | "language": "python", 267 | "name": "python3" 268 | }, 269 | "language_info": { 270 | "codemirror_mode": { 271 | "name": "ipython", 272 | "version": 3 273 | }, 274 | "file_extension": ".py", 275 | "mimetype": "text/x-python", 276 | "name": "python", 277 | "nbconvert_exporter": "python", 278 | "pygments_lexer": "ipython3", 279 | "version": "3.7.0" 280 | } 281 | }, 282 | "nbformat": 4, 283 | "nbformat_minor": 2 284 | } 285 | -------------------------------------------------------------------------------- /examples/bigwig_files_from_encode_for_label_comparison/README.md: -------------------------------------------------------------------------------- 1 | #GM12878 (Rep 2) 2 | DNAse GM12878 Stam hg38 https://www.encodeproject.org/files/ENCFF743ULW/@@download/ENCFF743ULW.bigWig 3 | #Hepg2 (Rep2) 4 | DNAse Hepg2 Stam hg38 https://www.encodeproject.org/files/ENCFF842XRQ/@@download/ENCFF842XRQ.bigWig 5 | #(Rep1+Rep2, fc bigwig) 6 | Max K562 Snyder hg38 https://www.encodeproject.org/files/ENCFF796GHK/@@download/ENCFF796GHK.bigWig 7 | #(Rep1+Rep2, pval bigwig) 8 | Max K562 Snyder hg38 https://www.encodeproject.org/files/ENCFF412VKD/@@download/ENCFF412VKD.bigWig 9 | #(Rep1+Rep2, fc bigwig) 10 | Myc K562 Snyder hg38 https://www.encodeproject.org/files/ENCFF667QJZ/@@download/ENCFF667QJZ.bigWig 11 | #(Rep1+Rep2, pval bigwig) 12 | Myc K562 Snyder hg38 https://www.encodeproject.org/files/ENCFF139YUD/@@download/ENCFF139YUD.bigWig 13 | 14 | -------------------------------------------------------------------------------- /examples/bigwig_files_from_encode_for_label_comparison/download_bigwigs.sh: -------------------------------------------------------------------------------- 1 | wget https://www.encodeproject.org/files/ENCFF743ULW/@@download/ENCFF743ULW.bigWig 2 | wget https://www.encodeproject.org/files/ENCFF842XRQ/@@download/ENCFF842XRQ.bigWig 3 | wget https://www.encodeproject.org/files/ENCFF796GHK/@@download/ENCFF796GHK.bigWig 4 | wget https://www.encodeproject.org/files/ENCFF412VKD/@@download/ENCFF412VKD.bigWig 5 | wget https://www.encodeproject.org/files/ENCFF667QJZ/@@download/ENCFF667QJZ.bigWig 6 | wget https://www.encodeproject.org/files/ENCFF139YUD/@@download/ENCFF139YUD.bigWig 7 | 8 | -------------------------------------------------------------------------------- /examples/dbingest/hg38.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 248956422 2 | chr2 242193529 3 | chr3 198295559 4 | 
chr4 190214555 5 | chr5 181538259 6 | chr6 170805979 7 | chr7 159345973 8 | chr8 145138636 9 | chr9 138394717 10 | chr10 133797422 11 | chr11 135086622 12 | chr12 133275309 13 | chr13 114364328 14 | chr14 107043718 15 | chr15 101991189 16 | chr16 90338345 17 | chr17 83257441 18 | chr18 80373285 19 | chr19 58617616 20 | chr20 64444167 21 | chr21 46709983 22 | chr22 50818468 23 | chrX 156040895 24 | chrY 57227415 25 | chrM 16569 26 | -------------------------------------------------------------------------------- /examples/dbingest/run_db_ingest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | db_ingest --tiledb_metadata tier1.encode.dnase.tasks.tsv \ 3 | --array_name db/dnase \ 4 | --overwrite \ 5 | --chrom_sizes hg38.chrom.sizes \ 6 | --attribute_config encode_pipeline \ 7 | --coord_tile_size 10000 \ 8 | --task_tile_size 1 \ 9 | --write_chunk 30000000 \ 10 | --threads 20 \ 11 | --max_queue_size 50 \ 12 | --max_mem_g 200 13 | 14 | -------------------------------------------------------------------------------- /examples/dbingest/run_db_ingest_single_threaded.sh: -------------------------------------------------------------------------------- 1 | db_ingest_single_threaded --tiledb_metadata tier1.encode.dnase.tasks.tsv \ 2 | --tiledb_group db/dnase \ 3 | --overwrite \ 4 | --chrom_sizes hg38.chrom.sizes \ 5 | --tile_size 10000 \ 6 | --write_chunk 10000000 7 | -------------------------------------------------------------------------------- /examples/dbingest/tier1.encode.dnase.tasks.tsv: -------------------------------------------------------------------------------- 1 | dataset fc_bigwig pval_bigwig idr_peak overlap_peak ambig_peak count_bigwig_plus_5p count_bigwig_minus_5p count_bigwig_unstranded_5p 2 | ENCSR000EMT /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-macs2_signal_track/shard-0/execution/ENCSR000EMT.merged.nodup.fc.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-macs2_signal_track/shard-0/execution/ENCSR000EMT.merged.nodup.pval.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-reproducibility_idr/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz.ambiguous.bed.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-bowtie2/shard-0/execution/ENCSR000EMT.merged.bam.bpnet.plus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-bowtie2/shard-0/execution/ENCSR000EMT.merged.bam.bpnet.minus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/13da5ebe-0941-4855-8599-40bbcc5c58b4/call-bowtie2/shard-0/execution/ENCSR000EMT.merged.bam.bpnet.unstranded.bw 3 | ENCSR000EMU /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-macs2_signal_track/shard-0/execution/ENCSR000EMU.merged.nodup.fc.signal.bigwig 
/oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-macs2_signal_track/shard-0/execution/ENCSR000EMU.merged.nodup.pval.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-reproducibility_idr/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz.ambiguous.bed.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-bowtie2/shard-0/execution/ENCSR000EMU.merged.bam.bpnet.plus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-bowtie2/shard-0/execution/ENCSR000EMU.merged.bam.bpnet.minus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/c9ef8473-1374-41ef-9fab-8f07288e94e7/call-bowtie2/shard-0/execution/ENCSR000EMU.merged.bam.bpnet.unstranded.bw 4 | ENCSR000EOT /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-macs2_signal_track/shard-0/execution/ENCSR000EOT.merged.nodup.fc.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-macs2_signal_track/shard-0/execution/ENCSR000EOT.merged.nodup.pval.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-reproducibility_idr/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz.ambiguous.bed.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-bowtie2/shard-0/execution/ENCSR000EOT.merged.bam.bpnet.plus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-bowtie2/shard-0/execution/ENCSR000EOT.merged.bam.bpnet.minus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/09ce5f39-5360-411b-88dd-b86f4a1286a7/call-bowtie2/shard-0/execution/ENCSR000EOT.merged.bam.bpnet.unstranded.bw 5 | ENCSR149XIL /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/39e50d95-1423-4dca-acd1-4b685ab94c4c/call-macs2_signal_track/shard-0/execution/ENCSR149XIL.merged.nodup.fc.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/39e50d95-1423-4dca-acd1-4b685ab94c4c/call-macs2_signal_track/shard-0/execution/ENCSR149XIL.merged.nodup.pval.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/ce805260-55f8-43c8-b2a1-a232b4a0e369/call-reproducibility_idr/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/ce805260-55f8-43c8-b2a1-a232b4a0e369/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz 
/oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/39e50d95-1423-4dca-acd1-4b685ab94c4c/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz.ambiguous.bed.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/38f0a76b-e6c6-444e-84e5-b5a98a554694/call-bowtie2/shard-0/execution/ENCSR149XIL.merged.bam.bpnet.plus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/38f0a76b-e6c6-444e-84e5-b5a98a554694/call-bowtie2/shard-0/execution/ENCSR149XIL.merged.bam.bpnet.minus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/38f0a76b-e6c6-444e-84e5-b5a98a554694/call-bowtie2/shard-0/execution/ENCSR149XIL.merged.bam.bpnet.unstranded.bw 6 | ENCSR477RTP /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/d754a34e-bc9f-4270-8020-bc37e8d195ba/call-macs2_signal_track/shard-0/execution/ENCSR477RTP.merged.nodup.fc.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/b9e61b7e-4173-4b8c-aa38-9e55d81fef0e/call-macs2_signal_track/shard-0/execution/ENCSR477RTP.merged.nodup.pval.signal.bigwig /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/d754a34e-bc9f-4270-8020-bc37e8d195ba/call-reproducibility_idr/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/d754a34e-bc9f-4270-8020-bc37e8d195ba/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/d754a34e-bc9f-4270-8020-bc37e8d195ba/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz.ambiguous.bed.gz /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/f38bfd43-b57f-4c55-be06-b02d3f16512a/call-bowtie2/shard-0/execution/ENCSR477RTP.merged.bam.bpnet.plus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/f38bfd43-b57f-4c55-be06-b02d3f16512a/call-bowtie2/shard-0/execution/ENCSR477RTP.merged.bam.bpnet.minus.bw /oak/stanford/groups/akundaje/projects/atlas/dnase_processed/atac/f38bfd43-b57f-4c55-be06-b02d3f16512a/call-bowtie2/shard-0/execution/ENCSR477RTP.merged.bam.bpnet.unstranded.bw 7 | -------------------------------------------------------------------------------- /examples/hg38.blacklist.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kundajelab/seqdataloader/5c043e7d2e5296aa01e83c4a5febf7f5272468d2/examples/hg38.blacklist.bed.gz -------------------------------------------------------------------------------- /examples/hg38.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 248956422 2 | chr2 242193529 3 | chr3 198295559 4 | chr4 190214555 5 | chr5 181538259 6 | chr6 170805979 7 | chr7 159345973 8 | chr8 145138636 9 | chr9 138394717 10 | chr10 133797422 11 | chr11 135086622 12 | chr12 133275309 13 | chr13 114364328 14 | chr14 107043718 15 | chr15 101991189 16 | chr16 90338345 17 | chr17 83257441 18 | chr18 80373285 19 | chr19 58617616 20 | chr20 64444167 21 | chr21 46709983 22 | chr22 50818468 23 | chrX 156040895 24 | chrY 57227415 -------------------------------------------------------------------------------- /examples/hg38.chrom21.sizes: -------------------------------------------------------------------------------- 1 | chr21 46709983 2 | -------------------------------------------------------------------------------- /examples/labelgen/bigwig_files_from_encode_for_label_comparison: 
--------------------------------------------------------------------------------
1 | ../bigwig_files_from_encode_for_label_comparison
--------------------------------------------------------------------------------
/examples/labelgen/genomewide_labels_examples.py:
--------------------------------------------------------------------------------
1 | from seqdataloader.labelgen import *
2 | classification_params={
3 |     'task_list':"tasks.tsv",
4 |     'outf':"classificationlabels.SummitWithin200bpCenter.tsv.gz",
5 |     'output_type':'gzip',
6 |     'chrom_sizes':'hg38.chrom.sizes',
7 |     'chroms_to_keep':['chr21'],
8 |     "store_positives_only":True,
9 |     'bin_stride':50,
10 |     'left_flank':400,
11 |     'right_flank':400,
12 |     'bin_size':200,
13 |     'threads':10,
14 |     'subthreads':4,
15 |     'allow_ambiguous':True,
16 |     'labeling_approach':'peak_summit_in_bin_classification'
17 | }
18 | genomewide_labels(classification_params)
19 | 
20 | regression_params={
21 |     'task_list':"tasks.tsv",
22 |     'outf':"regressionlabels.all_genome_bins_regression.hdf5",
23 |     'output_type':'hdf5',
24 |     'chrom_sizes':'hg38.chrom.sizes',
25 |     'store_values_above_thresh': 0,
26 |     'chroms_to_keep':['chr21'],
27 |     'bin_stride':50,
28 |     'left_flank':400,
29 |     'right_flank':400,
30 |     'bin_size':200,
31 |     'threads':10,
32 |     'subthreads':4,
33 |     'labeling_approach':'all_genome_bins_regression'
34 | }
35 | genomewide_labels(regression_params)
36 | 
--------------------------------------------------------------------------------
/examples/labelgen/genomewide_labels_examples.sh:
--------------------------------------------------------------------------------
1 | #Classification Approach 1: Summit Must Lie Within 200 BP Bin
2 | ## Timing
3 | ## writing to gzip
4 | ## real 11m46.403s
5 | ## user 18m40.788s
6 | ## sys 6m18.136s
7 | 
8 | ## Writing to bz2:
9 | ## real 14m44.037s
10 | ## user 21m49.384s
11 | ## sys 6m21.000s
12 | 
13 | #genomewide_labels --task_list tasks.labelgen.tsv \
14 | #    --outf classificationlabels.SummitWithin200bpCenter.tsv.gz \
15 | #    --output_type gzip \
16 | #    --chrom_sizes hg38.chrom.sizes \
17 | #    --bin_stride 50 \
18 | #    --left_flank 400 \
19 | #    --right_flank 400 \
20 | #    --bin_size 200 \
21 | #    --chrom_threads 10 \
22 | #    --task_threads 4 \
23 | #    --allow_ambiguous \
24 | #    --labeling_approach peak_summit_in_bin_classification
25 | 
26 | #Example of restricting analysis to a single chromosome with the --chroms_to_keep flag
27 | #genomewide_labels --task_list tasks.labelgen.tsv \
28 | #    --outf classificationlabels.SummitWithin200bpCenter.tsv.gz \
29 | #    --output_type gzip \
30 | #    --chrom_sizes hg38.chrom.sizes \
31 | #    --chroms_to_keep chr21 \
32 | #    --bin_stride 50 \
33 | #    --left_flank 400 \
34 | #    --right_flank 400 \
35 | #    --bin_size 200 \
36 | #    --chrom_threads 10 \
37 | #    --task_threads 4 \
38 | #    --allow_ambiguous \
39 | #    --labeling_approach peak_summit_in_bin_classification
40 | 
41 | 
42 | #Example with only positives stored
43 | genomewide_labels --task_list tasks.labelgen.tsv \
44 |     --outf classificationlabels.SummitWithin200bpCenter.tsv.gz \
45 |     --output_type gzip \
46 |     --chrom_sizes hg38.chrom.sizes \
47 |     --chroms_to_keep chr21 \
48 |     --bin_stride 50 \
49 |     --left_flank 400 \
50 |     --right_flank 400 \
51 |     --bin_size 200 \
52 |     --chrom_threads 10 \
53 |     --task_threads 4 \
54 |     --allow_ambiguous \
55 |     --store_positives_only \
56 |     --labeling_approach peak_summit_in_bin_classification
57 | 
58 | 
59 | 
60 | ##Classification Approach 2: 50% Overlap Between Peak and 200 BP Bin (50% of the Smaller of the Two)
61 | ## Timing
62 | ## real 18m56.337s
63 | ## user 25m23.004s
64 | ## sys 7m58.104s
65 | 
66 | #genomewide_labels --task_list tasks.labelgen.tsv \
67 | #    --outf classificationlabels.50PercentOverlap.tsv.gz \
68 | #    --output_type gzip \
69 | #    --chrom_sizes hg38.chrom.sizes \
70 | #    --bin_stride 50 \
71 | #    --left_flank 400 \
72 | #    --right_flank 400 \
73 | #    --chrom_threads 10 \
74 | #    --task_threads 4 \
75 | #    --allow_ambiguous \
76 | #    --overlap_thresh 0.5 \
77 | #    --labeling_approach peak_percent_overlap_with_bin_classification
78 | 
79 | ##Regression Approach 1: Summit Must Lie Within 200 BP Bin
80 | ## Timing:
81 | ## real 18m15.728s
82 | ## user 24m25.028s
83 | ## sys 7m58.244s
84 | 
85 | #genomewide_labels --task_list tasks.labelgen.tsv \
86 | #    --outf regressionlabels.SummitWithin200bpCenter.tsv.gz \
87 | #    --output_type gzip \
88 | #    --chrom_sizes hg38.chrom.sizes \
89 | #    --bin_stride 50 \
90 | #    --left_flank 400 \
91 | #    --right_flank 400 \
92 | #    --bin_size 200 \
93 | #    --chrom_threads 10 \
94 | #    --task_threads 4 \
95 | #    --allow_ambiguous \
96 | #    --labeling_approach peak_summit_in_bin_regression
97 | #
98 | 
99 | #Regression Approach 2: 50% Overlap Between Peak and 200 BP Bin (50% of the Smaller of the Two)
100 | ## real 18m56.337s
101 | ## user 25m23.004s
102 | ## sys 7m58.104s
103 | 
104 | #genomewide_labels --task_list tasks.labelgen.tsv \
105 | #    --outf regressionlabels.50PercentOverlap.tsv.gz \
106 | #    --output_type gzip \
107 | #    --chrom_sizes hg38.chrom.sizes \
108 | #    --bin_stride 50 \
109 | #    --left_flank 400 \
110 | #    --right_flank 400 \
111 | #    --chrom_threads 10 \
112 | #    --task_threads 4 \
113 | #    --allow_ambiguous \
114 | #    --overlap_thresh 0.5 \
115 | #    --labeling_approach peak_percent_overlap_with_bin_regression
116 | 
117 | 
118 | ##Regression Approach 3: Provide bedtools coverage in the bigWig for every bin in the genome
119 | 
120 | ## Timing for hdf5 save
121 | ## real 8m51.275s
122 | ## user 17m38.576s
123 | ## sys 6m14.768s
124 | #genomewide_labels --task_list tasks.labelgen.tsv \
125 | #    --outf regressionlabels.allbins.hg38.hdf5 \
126 | #    --output_type hdf5 \
127 | #    --chrom_sizes hg38.chrom.sizes \
128 | #    --bin_stride 50 \
129 | #    --left_flank 400 \
130 | #    --right_flank 400 \
131 | #    --chrom_threads 24 \
132 | #    --task_threads 2 \
133 | #    --labeling_approach all_genome_bins_regression
134 | 
135 | ## Timing (pkl)
136 | ## real 23m10.448s
137 | ## user 31m55.056s
138 | ## sys 5m39.880s
139 | #genomewide_labels --task_list tasks.labelgen.tsv \
140 | #    --outf regressionlabels.allbins.hg38.pkl \
141 | #    --output_type pkl \
142 | #    --chrom_sizes hg38.chrom.sizes \
143 | #    --bin_stride 50 \
144 | #    --left_flank 400 \
145 | #    --right_flank 400 \
146 | #    --chrom_threads 24 \
147 | #    --task_threads 2 \
148 | #    --labeling_approach all_genome_bins_regression
149 | 
150 | 
151 | ## Timing for full data frame (gzip)
152 | ## real 29m50.597s
153 | ## user 38m2.020s
154 | ## sys 6m34.064s
155 | 
156 | ## Timing for chromosome-specific dataframes (gzip)
157 | ## real 21m35.525s
158 | ## user 51m55.496s
159 | ## sys 7m49.140s
160 | #genomewide_labels --task_list tasks.labelgen.tsv \
161 | #    --outf regressionlabels.allbins.hg38.tsv.gz \
162 | #    --output_type gzip \
163 | #    --chrom_sizes hg38.chrom.sizes \
164 | #    --bin_stride 50 \
165 | #    --left_flank 400 \
166 | #    --right_flank 400 \
167 | #    --chrom_threads 24 \
168 | #    --task_threads 2 \
169 | #    --split_output_by_chrom \
170 | #    --labeling_approach all_genome_bins_regression
171 | 
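
## Illustrative sketch: regression labels with a log10 transform and an explicit
## pseudocount, as described in the README's label_transformer documentation.
## (Assumes a --label_transformer_pseudocount flag mirroring the
## label_transformer_pseudocount python argument; verify against your installed
## version before running.)
#genomewide_labels --task_list tasks.labelgen.tsv \
#    --outf regressionlabels.allbins.log10.hg38.hdf5 \
#    --output_type hdf5 \
#    --chrom_sizes hg38.chrom.sizes \
#    --bin_stride 50 \
#    --left_flank 400 \
#    --right_flank 400 \
#    --chrom_threads 24 \
#    --task_threads 2 \
#    --label_transformer log10 \
#    --label_transformer_pseudocount 0.001 \
#    --labeling_approach all_genome_bins_regression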
-------------------------------------------------------------------------------- /examples/labelgen/hg38.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 248956422 2 | chr2 242193529 3 | chr3 198295559 4 | chr4 190214555 5 | chr5 181538259 6 | chr6 170805979 7 | chr7 159345973 8 | chr8 145138636 9 | chr9 138394717 10 | chr10 133797422 11 | chr11 135086622 12 | chr12 133275309 13 | chr13 114364328 14 | chr14 107043718 15 | chr15 101991189 16 | chr16 90338345 17 | chr17 83257441 18 | chr18 80373285 19 | chr19 58617616 20 | chr20 64444167 21 | chr21 46709983 22 | chr22 50818468 23 | chrX 156040895 24 | chrY 57227415 -------------------------------------------------------------------------------- /examples/labelgen/hg38.chrom21.sizes: -------------------------------------------------------------------------------- 1 | chr21 46709983 2 | -------------------------------------------------------------------------------- /examples/labelgen/peak_files_from_encode_for_label_comparison: -------------------------------------------------------------------------------- 1 | ../peak_files_from_encode_for_label_comparison -------------------------------------------------------------------------------- /examples/labelgen/save_source_labels_in_labelgen.sh: -------------------------------------------------------------------------------- 1 | #save output as tsv.gz 2 | genomewide_labels --task_list tasks.labelgen.tsv \ 3 | --outf classificationlabels.SummitWithin200bpCenter.tsv.gz \ 4 | --output_type gzip \ 5 | --chrom_sizes hg38.chrom.sizes \ 6 | --bin_stride 50 \ 7 | --left_flank 400 \ 8 | --right_flank 400 \ 9 | --bin_size 200 \ 10 | --chrom_threads 10 \ 11 | --task_threads 4 \ 12 | --allow_ambiguous \ 13 | --labeling_approach peak_summit_in_bin_classification \ 14 | --save_label_source 15 | #save output as hdf5 16 | genomewide_labels --task_list tasks.labelgen.tsv \ 17 | --outf classificationlabels.SummitWithin200bpCenter.hdf5 \ 18 | --output_type hdf5 \ 19 | --chrom_sizes hg38.chrom.sizes \ 20 | --bin_stride 50 \ 21 | --left_flank 400 \ 22 | --right_flank 400 \ 23 | --bin_size 200 \ 24 | --chrom_threads 10 \ 25 | --task_threads 4 \ 26 | --allow_ambiguous \ 27 | --labeling_approach peak_summit_in_bin_classification \ 28 | --save_label_source 29 | 30 | -------------------------------------------------------------------------------- /examples/labelgen/tasks.labelgen.tsv: -------------------------------------------------------------------------------- 1 | task narrowPeak bigwig ambig 2 | ENCFF209DJG peak_files_from_encode_for_label_comparison/ENCFF209DJG.bed.gz ./bigwig_files_from_encode_for_label_comparison/ENCFF842XRQ.bigWig hg38.blacklist.bed.gz 3 | ENCFF605WXD peak_files_from_encode_for_label_comparison/ENCFF605WXD.bed.gz ./bigwig_files_from_encode_for_label_comparison/ENCFF667QJZ.bigWig hg38.blacklist.bed.gz 4 | ENCFF073ORT peak_files_from_encode_for_label_comparison/ENCFF073ORT.bed.gz ./bigwig_files_from_encode_for_label_comparison/ENCFF743ULW.bigWig hg38.blacklist.bed.gz 5 | ENCFF618VMC peak_files_from_encode_for_label_comparison/ENCFF618VMC.bed.gz ./bigwig_files_from_encode_for_label_comparison/ENCFF796GHK.bigWig hg38.blacklist.bed.gz 6 | -------------------------------------------------------------------------------- /examples/peak_files_from_encode_for_label_comparison/README: -------------------------------------------------------------------------------- 1 | DNAse GM12878 Stam hg38 
https://www.encodeproject.org/files/ENCFF073ORT/@@download/ENCFF073ORT.bed.gz 2 | DNAse Hepg2 Stam hg38 https://www.encodeproject.org/files/ENCFF209DJG/@@download/ENCFF209DJG.bed.gz 3 | Max K562 Snyder hg38 https://www.encodeproject.org/files/ENCFF618VMC/@@download/ENCFF618VMC.bed.gz 4 | Myc K562 Snyder hg38 https://www.encodeproject.org/files/ENCFF605WXD/@@download/ENCFF605WXD.bed.gz 5 | -------------------------------------------------------------------------------- /examples/seqdataloader_examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#load tutorial utilities \n", 10 | "%reload_ext autoreload\n", 11 | "%autoreload 2\n", 12 | "%matplotlib inline\n", 13 | "import warnings\n", 14 | "warnings.filterwarnings('ignore')" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Ingesting data into tileDB " 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from seqdataloader.dbingest import * " 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "The header of the input task file should contain (one or more) of the following fields: \n", 38 | " * dataset (this one's required -- it's a unique label for your dataset) \n", 39 | " * pval_bigwig \n", 40 | " * fc_bigwig \n", 41 | " * count_bigwig_plus_5p \n", 42 | " * count_bigwig_minux_5p\n", 43 | " * idr_peak\n", 44 | " * overlap_peak \n", 45 | " * ambig_peak \n", 46 | " \n", 47 | "The file paths can be either local or web-based URL's. " 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "cat: tasks.dbingest.tsv: No such file or directory\r\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "!cat tasks.dbingest.tsv" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "You can run the ingest code as a python function: " 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "loaded tiledb metadata\n", 84 | "loaded chrom sizes\n", 85 | "tiledb group already exists\n", 86 | "got data dict\n", 87 | "parsed pool inputs\n", 88 | "made pool!\n", 89 | "warning: the array: hepg2_dnase_encode/ENCFF209DJG.chr21 already exists. 
You provided the --overwrite flag, so it will be updated/overwritten\n", 90 | "here\n", 91 | "store_summits:True\n", 92 | "summit_indicator:2\n", 93 | "got:idr_peak for chrom:chr21\n", 94 | "store_summits:False\n", 95 | "summit_indicator:None\n", 96 | "got:fc_bigwig for chrom:chr21\n", 97 | "store_summits:False\n", 98 | "summit_indicator:None\n", 99 | "got:ambig_peak for chrom:chr21\n", 100 | "starting to write output\n", 101 | "got cur vals\n", 102 | "idr_peak\n", 103 | "dict_to_write[key].shape:(46709983,)\n", 104 | "fc_bigwig\n", 105 | "dict_to_write[key].shape:(46709983,)\n", 106 | "ambig_peak\n", 107 | "dict_to_write[key].shape:(46709983,)\n", 108 | "updated data dict for writing\n", 109 | "finalizing the write\n", 110 | "0\n", 111 | "1000000\n", 112 | "2000000\n", 113 | "3000000\n", 114 | "4000000\n", 115 | "5000000\n", 116 | "6000000\n", 117 | "7000000\n", 118 | "8000000\n", 119 | "9000000\n", 120 | "10000000\n", 121 | "11000000\n", 122 | "12000000\n", 123 | "13000000\n", 124 | "14000000\n", 125 | "15000000\n", 126 | "16000000\n", 127 | "17000000\n", 128 | "18000000\n", 129 | "19000000\n", 130 | "20000000\n", 131 | "21000000\n", 132 | "22000000\n", 133 | "23000000\n", 134 | "24000000\n", 135 | "25000000\n", 136 | "26000000\n", 137 | "27000000\n", 138 | "28000000\n", 139 | "29000000\n", 140 | "30000000\n", 141 | "31000000\n", 142 | "32000000\n", 143 | "33000000\n", 144 | "34000000\n", 145 | "35000000\n", 146 | "36000000\n", 147 | "37000000\n", 148 | "38000000\n", 149 | "39000000\n", 150 | "40000000\n", 151 | "41000000\n", 152 | "42000000\n", 153 | "43000000\n", 154 | "44000000\n", 155 | "45000000\n", 156 | "46000000\n", 157 | "length of pool inputs:48\n", 158 | "made pool\n", 159 | "start:0, end:1000000\n", 160 | "start:1000000, end:2000000\n", 161 | "start:2000000, end:3000000\n", 162 | "start:3000000, end:4000000\n", 163 | "start:4000000, end:5000000\n", 164 | "start:5000000, end:6000000\n", 165 | "start:6000000, end:7000000\n", 166 | "start:7000000, end:8000000\n", 167 | "start:8000000, end:9000000\n", 168 | "start:9000000, end:10000000\n", 169 | "start:10000000, end:11000000\n", 170 | "start:11000000, end:12000000\n", 171 | "start:12000000, end:13000000\n", 172 | "start:13000000, end:14000000\n", 173 | "start:14000000, end:15000000\n", 174 | "start:15000000, end:16000000\n", 175 | "start:16000000, end:17000000\n", 176 | "start:17000000, end:18000000\n", 177 | "start:18000000, end:19000000\n", 178 | "start:19000000, end:20000000\n", 179 | "start:20000000, end:21000000\n", 180 | "start:21000000, end:22000000\n", 181 | "start:22000000, end:23000000\n", 182 | "start:23000000, end:24000000\n", 183 | "start:24000000, end:25000000\n", 184 | "start:25000000, end:26000000\n", 185 | "start:26000000, end:27000000\n", 186 | "start:27000000, end:28000000\n", 187 | "start:28000000, end:29000000\n", 188 | "start:29000000, end:30000000\n", 189 | "start:30000000, end:31000000\n", 190 | "start:31000000, end:32000000\n", 191 | "start:32000000, end:33000000\n", 192 | "start:33000000, end:34000000\n", 193 | "start:34000000, end:35000000\n", 194 | "start:35000000, end:36000000\n", 195 | "start:36000000, end:37000000\n", 196 | "start:37000000, end:38000000\n", 197 | "start:38000000, end:39000000\n", 198 | "start:39000000, end:40000000\n", 199 | "start:40000000, end:41000000\n", 200 | "start:41000000, end:42000000\n", 201 | "start:42000000, end:43000000\n", 202 | "start:43000000, end:44000000\n", 203 | "start:44000000, end:45000000\n", 204 | "start:45000000, end:46000000\n", 205 | "start:46000000, 
end:47000000\n", 206 | "start:47000000, end:46709983\n", 207 | "done writing\n", 208 | "wrote array to disk for dataset:hepg2_dnase_encode/ENCFF209DJG.chr21\n" 209 | ] 210 | }, 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "'done'" 215 | ] 216 | }, 217 | "execution_count": 4, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "args={\"tiledb_metadata\":\"tasks.dbingest.tsv\",\n", 224 | " \"tiledb_group\":\"hepg2_dnase_encode\",\n", 225 | " \"overwrite\":True,\n", 226 | " \"chrom_sizes\":\"hg38.chrom21.sizes\",\n", 227 | " \"chrom_threads\":1,\n", 228 | " \"task_threads\":1,\n", 229 | " \"write_threads\":1}\n", 230 | "\n", 231 | "ingest(args)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "Or you can run the code as a script: " 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 4, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "name": "stdout", 248 | "output_type": "stream", 249 | "text": [ 250 | "cat: tasks.dbingest.local.tsv: No such file or directory\r\n" 251 | ] 252 | } 253 | ], 254 | "source": [ 255 | "!cat ~/seqdataltasks.dbingest.local.tsv" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "!db_ingest --tiledb_metadata tasks.dbingest.local.tsv \\\n", 265 | " --tiledb_group hepg2_dnase_encode \\\n", 266 | " --overwrite \\\n", 267 | " --chrom_sizes hg38.chrom.sizes \\\n", 268 | " --chrom_threads 25 \\\n", 269 | " --attribute_config encode_pipeline \\\n", 270 | " --tile_size 9000 \\\n", 271 | " --batch_size 1000000\n" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 8, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "odict_keys(['pval_bigwig', 'fc_bigwig', 'count_bigwig_plus_5p', 'count_bigwig_minux_5p', 'idr_peak', 'overlap_peak', 'ambig_peak'])\n", 284 | "odict_keys(['pval_bigwig', 'fc_bigwig', 'count_bigwig_plus_5p', 'count_bigwig_minux_5p', 'idr_peak', 'overlap_peak', 'ambig_peak'])\n" 285 | ] 286 | } 287 | ], 288 | "source": [ 289 | "#we can examine the array \n", 290 | "import tiledb \n", 291 | "data=tiledb.DenseArray(\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY.chr1\",'r')\n", 292 | "subset=data[30000000:31000000]\n", 293 | "print(subset.keys())\n", 294 | "data=tiledb.DenseArray(\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY.chr21\",'r')\n", 295 | "subset=data[30000000:31000000]\n", 296 | "print(subset.keys())" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 6, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/plain": [ 307 | "array([0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 308 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 309 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 310 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 311 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 312 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 313 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 314 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 315 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 316 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 317 | " 0.0121522 , 0.0121522 , 0.0121522 , 
0.0121522 , 0.0121522 ,\n", 318 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 319 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 320 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 321 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 322 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 323 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 324 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 325 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 326 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 327 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 328 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 329 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 330 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 331 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 332 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 333 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 334 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 335 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 336 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 337 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 338 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 339 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 340 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 341 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 342 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 343 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 344 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 345 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 346 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 347 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 348 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 349 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 350 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 351 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 352 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 353 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 354 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 355 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 356 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 357 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 358 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 359 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 360 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 361 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 362 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 363 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 364 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 365 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 366 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 
0.00303804,\n", 367 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 368 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 369 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 370 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 371 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 372 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 373 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 374 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 375 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 376 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 377 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 378 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 379 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 380 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 381 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 382 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 383 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 384 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 385 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 386 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 387 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 388 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 389 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 390 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 391 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 392 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 393 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 394 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 395 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 396 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 397 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 398 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 399 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 400 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 401 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 402 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 403 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 404 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 405 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 406 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 407 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 408 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 409 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 410 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 411 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 412 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 413 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 414 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 415 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 416 | " 
0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 417 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 418 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 419 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 420 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 421 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 422 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 423 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 424 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 425 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 426 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 427 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 428 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 429 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 430 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 431 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 432 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 433 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 434 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 435 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 436 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 437 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 438 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 439 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 440 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 441 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 442 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 443 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 444 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 445 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 446 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 447 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 448 | " 0. , 0. , 0. , 0. , 0. ,\n", 449 | " 0. , 0. , 0. , 0. , 0. ,\n", 450 | " 0. , 0. , 0. , 0. , 0. ,\n", 451 | " 0. , 0. , 0. , 0. , 0. ,\n", 452 | " 0. , 0. , 0. , 0. , 0. ,\n", 453 | " 0. , 0. , 0. , 0. , 0. ,\n", 454 | " 0. , 0. , 0. , 0. , 0. ,\n", 455 | " 0. , 0. , 0. , 0. , 0. ,\n", 456 | " 0. , 0. , 0. , 0. , 0. ,\n", 457 | " 0. , 0. , 0. , 0. , 0. ,\n", 458 | " 0. , 0. , 0. , 0. , 0. ,\n", 459 | " 0. , 0. , 0. , 0. , 0. ,\n", 460 | " 0. , 0. , 0. , 0. , 0. ,\n", 461 | " 0. , 0. , 0. , 0. , 0. ,\n", 462 | " 0. , 0. , 0. , 0. , 0. ,\n", 463 | " 0. , 0. , 0. , 0. , 0. ,\n", 464 | " 0. , 0. , 0. , 0. , 0. ,\n", 465 | " 0. , 0. , 0. , 0. , 0. ,\n", 466 | " 0. , 0. , 0. , 0. , 0. ,\n", 467 | " 0. , 0. , 0. , 0. , 0. ,\n", 468 | " 0. , 0. , 0. , 0. , 0. ,\n", 469 | " 0. , 0. , 0. , 0. , 0. ,\n", 470 | " 0. , 0. , 0. , 0. , 0. ,\n", 471 | " 0. , 0. , 0. , 0. , 0. ,\n", 472 | " 0. , 0. , 0. , 0. , 0. ,\n", 473 | " 0. , 0. , 0. , 0. , 0. ,\n", 474 | " 0. , 0. , 0. , 0. , 0. ,\n", 475 | " 0. , 0. , 0. , 0. , 0. ,\n", 476 | " 0. , 0. , 0. , 0. , 0. ,\n", 477 | " 0. , 0. , 0. , 0. , 0. ,\n", 478 | " 0. , 0. , 0. , 0. , 0. ,\n", 479 | " 0. , 0. , 0. , 0. , 0. ,\n", 480 | " 0. , 0. , 0. , 0. , 0. ,\n", 481 | " 0. , 0. , 0. , 0. , 0. 
,\n", 482 | " 0. , 0. , 0. , 0. , 0. ,\n", 483 | " 0. , 0. , 0. , 0. , 0. ,\n", 484 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 485 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 486 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 487 | " 0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n", 488 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 489 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 490 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 491 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 492 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 493 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 494 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 495 | " 0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n", 496 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 497 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 498 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 499 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 500 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 501 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 502 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 503 | " 0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n", 504 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 505 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n", 506 | " 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ],\n", 507 | " dtype=float32)" 508 | ] 509 | }, 510 | "execution_count": 6, 511 | "metadata": {}, 512 | "output_type": "execute_result" 513 | } 514 | ], 515 | "source": [ 516 | "subset['fc_bigwig'][0:1000]" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 7, 522 | "metadata": {}, 523 | "outputs": [ 524 | { 525 | "data": { 526 | "text/plain": [ 527 | "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 528 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 529 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 530 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 531 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 532 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 533 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 534 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 535 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 536 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 537 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 538 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 539 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 540 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 541 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 542 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 543 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 544 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 545 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 546 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 547 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 548 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 549 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 550 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 551 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 552 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 553 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 554 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 555 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 556 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 557 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 558 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 559 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 560 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 561 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 562 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 563 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 564 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 565 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 566 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 567 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 568 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 569 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 570 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 571 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 572 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])" 573 | ] 574 | }, 575 | "execution_count": 7, 576 | "metadata": {}, 577 | "output_type": "execute_result" 578 | } 579 | ], 580 | "source": [ 581 | "subset['idr_peak'][0:1000]" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "## Genomewide classification labels " 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "metadata": {}, 595 | "outputs": [], 596 | "source": [ 597 | "from seqdataloader.labelgen import *\n", 598 | "classification_params={\n", 599 | " 'task_list':\"tasks.labelgen.tsv\",\n", 600 | " 'outf':\"classificationlabels.SummitWithin200bpCenter.tsv.gz\",\n", 601 | " 'output_type':'gzip',\n", 602 | " 'chrom_sizes':'hg38.chrom.sizes',\n", 603 | " 'chroms_to_keep':['chr21'],\n", 604 | " \"store_positives_only\":True,\n", 605 | " 'bin_stride':50,\n", 606 | " 'left_flank':400,\n", 607 | " 'right_flank':400,\n", 608 | " 'bin_size':200,\n", 609 | " 'task_threads':10,\n", 610 | " 'chrom_threads':4,\n", 611 | " 'allow_ambiguous':True,\n", 612 | " 'labeling_approach':'peak_summit_in_bin_classification'\n", 613 | " }\n", 614 | "genomewide_labels(classification_params)\n", 615 | "\n" 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "## Genomewide regression labels " 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": {}, 629 | "outputs": 
[], 630 | "source": [ 631 | "regression_params={\n", 632 | " 'task_list':\"tasks.labelgen.tsv\",\n", 633 | " 'outf':\"regressionlabels.all_genome_bins_regression.hdf5\",\n", 634 | " 'output_type':'hdf5',\n", 635 | " 'chrom_sizes':'hg38.chrom.sizes',\n", 636 | " 'store_values_above_thresh': 0,\n", 637 | " 'chroms_to_keep':['chr21'],\n", 638 | " 'bin_stride':50,\n", 639 | " 'left_flank':400,\n", 640 | " 'right_flank':400,\n", 641 | " 'bin_size':200,\n", 642 | " 'threads':10,\n", 643 | " 'subthreads':4,\n", 644 | " 'labeling_approach':'all_genome_bins_regression'\n", 645 | " }\n", 646 | "genomewide_labels(regression_params)\n" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "Let's examine the output dataframe for the regression case: " 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": null, 659 | "metadata": {}, 660 | "outputs": [], 661 | "source": [ 662 | "import pandas as pd\nregression_data=pd.read_hdf(\"regressionlabels.all_genome_bins_regression.hdf5\")" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": null, 668 | "metadata": {}, 669 | "outputs": [], 670 | "source": [ 671 | "regression_data.head()" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "metadata": {}, 678 | "outputs": [], 679 | "source": [ 680 | "regression_negatives=pd.read_hdf(\"universal_negatives.regressionlabels.all_genome_bins_regression.hdf5\")\n", 681 | "regression_negatives.head()" 682 | ] 683 | }, 684 | { 685 | "cell_type": "markdown", 686 | "metadata": {}, 687 | "source": [ 688 | "For the classification case, we specified \"store_positives_only\", so the script generated two dataframes: \n", 689 | " * Universal negatives \n", 690 | " * Dataframe where each bin is >0 for at least one task " 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "metadata": {}, 697 | "outputs": [], 698 | "source": [ 699 | "classification_pos=pd.read_csv(\"classificationlabels.SummitWithin200bpCenter.tsv.gz\",sep='\\t',header=0)" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "metadata": {}, 706 | "outputs": [], 707 | "source": [ 708 | "classification_pos.head()" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "classification_neg=pd.read_csv(\"universal_negatives.classificationlabels.SummitWithin200bpCenter.tsv.gz\",sep='\\t',header=0)" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": null, 723 | "metadata": {}, 724 | "outputs": [], 725 | "source": [ 726 | "classification_neg.head()" 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": null, 732 | "metadata": {}, 733 | "outputs": [], 734 | "source": [] 735 | } 736 | ], 737 | "metadata": { 738 | "kernelspec": { 739 | "display_name": "Python 3", 740 | "language": "python", 741 | "name": "python3" 742 | }, 743 | "language_info": { 744 | "codemirror_mode": { 745 | "name": "ipython", 746 | "version": 3 747 | }, 748 | "file_extension": ".py", 749 | "mimetype": "text/x-python", 750 | "name": "python", 751 | "nbconvert_exporter": "python", 752 | "pygments_lexer": "ipython3", 753 | "version": "3.7.0" 754 | } 755 | }, 756 | "nbformat": 4, 757 | "nbformat_minor": 2 758 | } 759 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | numpy>=1.15 2 | pandas>=0.23.4 3 | cython>=0.27.3 4 | deeptools>=3.0.1 5 | psutil 6 | pybedtools>=0.7 7 | pyBigWig>=0.3.7 8 | pyfaidx 9 | tiledb>=0.4.4 10 | -------------------------------------------------------------------------------- /seqdataloader/attrib_config.py: -------------------------------------------------------------------------------- 1 | from .utils import * 2 | allowed_attributes={} 3 | allowed_attributes['bigwig']={'dtype':'float32', 4 | 'opener':open_bigwig_for_parsing, 5 | 'parser':parse_bigwig_chrom_vals, 6 | 'store_summits':False} 7 | allowed_attributes['bed_no_summit']={'dtype':'int', 8 | 'opener':open_csv_for_parsing, 9 | 'parser':parse_narrowPeak_chrom_vals, 10 | 'store_summits':False, 11 | 'summit_from_peak_center':False} 12 | allowed_attributes['bed_summit_from_peak_center']={'dtype':'int', 13 | 'opener':open_csv_for_parsing, 14 | 'parser':parse_narrowPeak_chrom_vals, 15 | 'store_summits':True, 16 | 'summit_indicator':2, 17 | 'summit_from_peak_center':True} 18 | allowed_attributes['bed_summit_from_last_col']={'dtype':'int', 19 | 'opener':open_csv_for_parsing, 20 | 'parser':parse_narrowPeak_chrom_vals, 21 | 'store_summits':True, 22 | 'summit_indicator':2, 23 | 'summit_from_peak_center':False} 24 | 25 | def get_generic_bigwig_config(): 26 | attrib_info=dict() 27 | attrib_info['bigwig_track']=allowed_attributes['bigwig'] 28 | attrib_info['ambig_peak']=allowed_attributes['bed_no_summit'] 29 | return attrib_info 30 | 31 | def get_encode_with_controls_config(): 32 | attrib_info=get_encode_config() 33 | #add the control count tracks 34 | attrib_info['control_count_bigwig_unstranded_5p']=allowed_attributes['bigwig'] 35 | attrib_info['control_count_bigwig_plus_5p']=allowed_attributes['bigwig'] 36 | attrib_info['control_count_bigwig_minus_5p']=allowed_attributes['bigwig'] 37 | return attrib_info 38 | 39 | 40 | def get_encode_config(): 41 | attrib_info=dict() 42 | 43 | attrib_info['pval_bigwig']=allowed_attributes['bigwig'] 44 | attrib_info['fc_bigwig']=allowed_attributes['bigwig'] 45 | attrib_info['count_bigwig_plus_5p']=allowed_attributes['bigwig'] 46 | attrib_info['count_bigwig_minus_5p']=allowed_attributes['bigwig'] 47 | attrib_info['count_bigwig_unstranded_5p']=allowed_attributes['bigwig'] 48 | attrib_info['idr_peak']=allowed_attributes['bed_summit_from_last_col'] 49 | attrib_info['overlap_peak']=allowed_attributes['bed_summit_from_last_col'] 50 | attrib_info['ambig_peak']=allowed_attributes['bed_no_summit'] 51 | return attrib_info 52 | 53 | def get_attribute_info_from_file(attribute_config_file): 54 | config_metadata=open(attribute_config_file,'r').read().strip().split('\n') 55 | attrib_info={} 56 | for line in config_metadata: 57 | tokens=line.split('\t') 58 | field_name=tokens[0] 59 | field_type=tokens[1] 60 | attrib_info[field_name]=allowed_attributes[field_type] 61 | return attrib_info 62 | 63 | def get_attribute_info(attribute_config,attribute_config_file): 64 | assert (attribute_config is None) or (attribute_config_file is None) 65 | if attribute_config_file is not None: 66 | return get_attribute_info_from_file(attribute_config_file) 67 | try: 68 | name_to_config=dict() 69 | name_to_config['encode_pipeline_with_controls']=get_encode_with_controls_config() 70 | name_to_config['encode_pipeline']=get_encode_config() 71 | name_to_config['generic_bigwig']=get_generic_bigwig_config() 72 | attrib_info=name_to_config[attribute_config] 73 | return attrib_info 74 | except Exception as e: 
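#unrecognized attribute_config name -- surface the lookup error (most likely a KeyError) to the caller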
75 | raise e 76 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | from . import coordbased 3 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | from . import coordstovals 3 | from . import coordbatchproducers 4 | from . import coordbatchtransformers 5 | from . import core 6 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordbatchproducers.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | import gzip 3 | from .core import Coordinates 4 | import numpy as np 5 | 6 | 7 | class KerasSequenceApiCoordsBatchProducer(object): 8 | 9 | """ 10 | Args: 11 | batch_size (int): note that if you apply some kind of augmentation, 12 | then this value will end up being half of the actual batch size. 13 | shuffle_before_epoch (boolean, optional): default False 14 | seed (int): default 1234; needed if shuffle_before_epoch=True 15 | """ 16 | def __init__(self, batch_size, shuffle_before_epoch, seed): 17 | self.coords_list = self._get_coordslist() 18 | self.batch_size = batch_size 19 | self.shuffle_before_epoch = shuffle_before_epoch 20 | self.seed = seed 21 | if (self.shuffle_before_epoch): 22 | self.rng = np.random.RandomState(self.seed) 23 | self._shuffle_coordslist() 24 | 25 | def _get_coordslist(self): 26 | raise NotImplementedError() 27 | 28 | def _shuffle_coordslist(self): 29 | self.rng.shuffle(self.coords_list) 30 | 31 | def __getitem__(self, index): 32 | """ 33 | Args: 34 | index (:obj:`int`): index of the batch 35 | 36 | Returns: 37 | :obj:`list`: the coordinates for a complete batch 38 | """ 39 | return self.coords_list[index*self.batch_size: 40 | (index+1)*self.batch_size] 41 | 42 | def __len__(self): 43 | """ 44 | Returns: 45 | The total number of batches to return 46 | """ 47 | return int(np.ceil(len(self.coords_list)/float(self.batch_size))) 48 | 49 | def on_epoch_end(self): 50 | """ 51 | Things to be executed after the epoch - like shuffling the coords 52 | """ 53 | if (self.shuffle_before_epoch): 54 | self._shuffle_coordslist() 55 | 56 | 57 | class BedFileObj(object): 58 | def __init__(self, bed_file, hastitle=False): 59 | print("Heads up: coordinates in bed file" 60 | +" are assumed to be on the positive strand;" 61 | +" if strand in the bed file is important to you, please" 62 | +" add that feature to SimpleCoordsBatchProducer") 63 | self.bed_file = bed_file 64 | self.hastitle = hastitle 65 | self.coords_list = self._read_bed_file() 66 | 67 | def _read_bed_file(self): 68 | coords_list = [] 69 | for linenum,line in enumerate((gzip.open(self.bed_file) if ".gz" 70 | in self.bed_file 71 | else open(self.bed_file, 'rb'))): #open in binary mode so .decode() works for both gzipped and plain files 72 | if (linenum > 0 or self.hastitle==False): 73 | (chrom, start_str, end_str) =\ 74 | line.decode("utf-8").rstrip().split("\t")[0:3] 75 | coords_list.append(Coordinates(chrom=chrom, 76 | start=int(start_str), 77 | end=int(end_str))) 78 | return coords_list 79 | 80 | def __len__(self): 81 | return len(self.coords_list) 82 | 83 | def 
get_strided_subsample(self, offset, stride): 84 | return self.coords_list[offset::stride] 85 | 86 | def assert_sorted(self): 87 | prev_entry = self.coords_list[0] 88 | for entry in self.coords_list[1:]: 89 | if entry.chrom==prev_entry.chrom: 90 | assert entry.start >= prev_entry.start, ("Bed file "+ 91 | self.bed_file+" is not sorted; "+str(entry) 92 | +" follows "+str(prev_entry)) 93 | prev_entry = entry 94 | 95 | 96 | class DownsampleNegativesCoordsBatchProducer( 97 | KerasSequenceApiCoordsBatchProducer): 98 | 99 | def __init__(self, pos_bed_file, neg_bed_file, 100 | target_proportion_positives, **kwargs): 101 | 102 | print("Reading in positive bed file") 103 | self.pos_bedfileobj = BedFileObj(bed_file=pos_bed_file) 104 | print("Got",len(self.pos_bedfileobj.coords_list), 105 | " coords in positive bed file") 106 | print("Reading in negative bed file") 107 | self.neg_bedfileobj = BedFileObj(bed_file=neg_bed_file) 108 | print("Got",len(self.neg_bedfileobj.coords_list), 109 | " coords in negative bed file") 110 | self.neg_bedfileobj.assert_sorted() 111 | 112 | self.target_proportion_positives = target_proportion_positives 113 | self.subsample_factor = int(np.ceil( 114 | (len(self.neg_bedfileobj.coords_list) 115 | *(self.target_proportion_positives/ 116 | (1-self.target_proportion_positives)) )/ 117 | len(self.pos_bedfileobj.coords_list))) 118 | print("The target proportion of positives of", 119 | self.target_proportion_positives,"requires the negative set" 120 | +" to be subsampled by a factor of",self.subsample_factor, 121 | "which will result in a #neg of", 122 | int(len(self.neg_bedfileobj.coords_list)/self.subsample_factor)) 123 | self.last_used_offset = -1 124 | super(DownsampleNegativesCoordsBatchProducer, self).__init__(**kwargs) 125 | 126 | def _shuffle_coordslist(self): 127 | self.rng.shuffle(self.subsampled_neg_coords) 128 | self.rng.shuffle(self.pos_coords) 129 | fracpos = len(self.pos_coords)/( 130 | len(self.pos_coords) + len(self.subsampled_neg_coords)) 131 | #interleave evenly 132 | pos_included = 0 133 | neg_included = 0 134 | new_coordslist = [] 135 | for i in range(len(self.pos_coords)+len(self.subsampled_neg_coords)): 136 | if (pos_included < (pos_included+neg_included)*(fracpos)): 137 | new_coordslist.append(self.pos_coords[pos_included]) 138 | pos_included += 1 139 | else: 140 | new_coordslist.append(self.subsampled_neg_coords[neg_included]) 141 | neg_included += 1 142 | assert pos_included==len(self.pos_coords) 143 | assert neg_included==len(self.subsampled_neg_coords) 144 | self.coords_list = new_coordslist 145 | 146 | def _get_coordslist(self): 147 | self.last_used_offset += 1 148 | self.last_used_offset = self.last_used_offset%self.subsample_factor 149 | print("Using an offset of ",self.last_used_offset," before striding") 150 | #the offset cycles through 0..subsample_factor-1, so a different negative subsample is used each epoch 151 | subsampled_neg_coords = self.neg_bedfileobj.get_strided_subsample( 152 | offset=self.last_used_offset, 153 | stride=self.subsample_factor) 154 | pos_coords = self.pos_bedfileobj.coords_list 155 | self.subsampled_neg_coords = subsampled_neg_coords 156 | self.pos_coords = pos_coords 157 | return pos_coords+subsampled_neg_coords 158 | 159 | def on_epoch_end(self): 160 | #get negative set with potentially different stride 161 | self.coords_list = self._get_coordslist() 162 | #perform shuffling as needed 163 | super(DownsampleNegativesCoordsBatchProducer, self).on_epoch_end() 164 | 165 | 166 | class SimpleCoordsBatchProducer(KerasSequenceApiCoordsBatchProducer): 167 | 168 | 
""" 169 | Args: 170 | bed_file (string): file with the bed coordinates. 171 | Assumes coordinates are on the positive strand. 172 | coord_batch_transformer (AbstracCoordBatchTransformer): does things 173 | like revcomp and random jitter 174 | """ 175 | def __init__(self, bed_file, 176 | hastitle=False, 177 | coord_batch_transformer=None, 178 | **kwargs): 179 | self.bed_file = BedFileObj(bed_file=bed_file, hastitle=hastitle) 180 | if (coord_batch_transformer is not None): 181 | raise DeprecationWarning( 182 | "Moving forward, coords_batch_transformer should be" 183 | +" specified as an argument to KerasBatchGenerator" 184 | +", not as an arugment to the CoordsBatchProducer." 185 | +" This is to allow different CoordsBatchProducer" 186 | +" implementations to be used with the same" 187 | +" coords_batch_transformer code.") 188 | self.coord_batch_transformer = coord_batch_transformer 189 | super(SimpleCoordsBatchProducer, self).__init__(**kwargs) 190 | 191 | def _get_coordslist(self): 192 | return [x for x in self.bed_file.coords_list] 193 | 194 | def __getitem__(self, index): 195 | orig_batch = self.coords_list[index*self.batch_size: 196 | (index+1)*self.batch_size] 197 | if (self.coord_batch_transformer is not None): 198 | return self.coord_batch_transformer(orig_batch) 199 | else: 200 | return orig_batch 201 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordbatchtransformers.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | from .core import Coordinates 3 | import numpy as np 4 | 5 | 6 | def get_revcomp(coordinate): 7 | return Coordinates(chrom=coordinate.chrom, 8 | start=coordinate.start, end=coordinate.end, 9 | isplusstrand=(coordinate.isplusstrand==False)) 10 | 11 | 12 | class AbstractCoordBatchTransformer(object): 13 | 14 | def __call__(self, coords): 15 | """ 16 | Args: 17 | coords (:obj:`list` of :obj:`Coordinates` objects): 18 | 19 | Returns: 20 | another :obj:`list` of :obj:`Coordinates` 21 | """ 22 | raise NotImplementedError() 23 | 24 | def chain(self, coord_batch_transformer): 25 | return lambda coords: coord_batch_transformer(self(coords)) 26 | 27 | 28 | class ReverseComplementAugmenter(AbstractCoordBatchTransformer): 29 | """ 30 | Returns a list of Coordinates twice the length of the 31 | original list by appending the reverse complements 32 | of the original coordinates at the end 33 | """ 34 | def __call__(self, coords): 35 | return coords + [get_revcomp(x) for x in coords] 36 | 37 | 38 | class UniformJitter(AbstractCoordBatchTransformer): 39 | 40 | def __init__(self, maxshift, seed=1234, chromsizes_file=None): 41 | """ 42 | Returns a list of Coordinates jittered relative to the original 43 | coordinates by a shift of up to +/- maxshift. Size of the 44 | shift is sampled from a uniform distribution. 45 | 46 | Args: 47 | maxshift (:obj:`int`): maximum possible shift to sample 48 | chromsizes (:obj:`string`): path to a chromsizes file. If 49 | specified, shifts will be adjusted so as to avoid going 50 | over the end of the chromosome. Default is None. 
51 | """ 52 | self.rng = np.random.RandomState(seed) 53 | self.maxshift = maxshift 54 | self.chromsizes = ( 55 | self._read_chromsizes(chromsizes_file=chromsizes_file) 56 | if chromsizes_file is not None else None) 57 | 58 | def _read_chromsizes(self, chromsizes_file): 59 | chrom_to_size = {} 60 | for row in open(chromsizes_file): 61 | chrom,chromlen = row.rstrip().split("\t") 62 | chromlen = int(chromlen) 63 | chrom_to_size[chrom] = chromlen 64 | return chrom_to_size 65 | 66 | def __call__(self, coords): 67 | a_list = [] 68 | for coord in coords: 69 | chrom = coord.chrom 70 | start = coord.start 71 | end = coord.end 72 | isplusstrand = coord.isplusstrand 73 | shift_size = int(self.rng.uniform(low=0, high=(2*self.maxshift + 1)) 74 | - self.maxshift) 75 | shift_size = max(-start, shift_size) 76 | if self.chromsizes is not None: 77 | shift_size = min(self.chromsizes[chrom]-end, shift_size) 78 | start = start + shift_size 79 | end = end + shift_size 80 | a_list.append(Coordinates(chrom=chrom, start=start, 81 | end=end, isplusstrand=isplusstrand)) 82 | return a_list 83 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordstovals/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | from . import bigwig 3 | from . import fasta 4 | from . import core 5 | from . import lookup 6 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordstovals/bigwig.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | import numpy as np 3 | import pyBigWig 4 | from .core import CoordsToVals, get_new_coors_around_center 5 | from ..core import Coordinates 6 | 7 | 8 | def rolling_window(a, window): 9 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 10 | strides = a.strides + (a.strides[-1],) 11 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 12 | 13 | 14 | def smooth_profiles(profiles, smoothing_window): 15 | assert len(profiles.shape)==3 16 | leftpadlen = int((smoothing_window-1)/2) 17 | rightpadlen =\ 18 | (smoothing_window-1)-int((smoothing_window-1)/2) 19 | padded_profiles = np.pad( 20 | array=profiles, 21 | pad_width=((0,0),(leftpadlen, rightpadlen), (0,0)), 22 | mode='edge') 23 | smoothed_profiles = np.mean(rolling_window( 24 | a=padded_profiles.transpose(0,2,1), 25 | window=smoothing_window), axis=-1).transpose((0,2,1)) 26 | return smoothed_profiles 27 | 28 | 29 | class BigWigReader(object): 30 | 31 | def __init__(self, bigwig_path): 32 | """ 33 | Args: 34 | bigwig_path (:obj:`str`): path to the .bw file 35 | """ 36 | self.bigwig_path = bigwig_path 37 | self.bw = pyBigWig.open(bigwig_path) 38 | 39 | def read_values(self, coors): 40 | """ 41 | Args: 42 | coords (:obj:`list` of :obj:Coordinates) 43 | 44 | Returns: 45 | ndarray of dims (nexamples x width). All the coordinates must be 46 | of the same length. 
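NaN values in the bigwig are converted to 0, and values for minus-strand coordinates are returned reversed.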
47 | """ 48 | to_return = [] 49 | for coor in coors: 50 | to_append = np.nan_to_num( 51 | x=self.bw.values(coor.chrom, coor.start, coor.end)) 52 | if (coor.isplusstrand==False): 53 | to_append = to_append[::-1] 54 | to_return.append(to_append) 55 | lengths = set([len(x) for x in to_return]) 56 | assert len(lengths)==1, ("All the sequences must be of the same" 57 | +" length, but lengths are "+str(lengths)) 58 | return np.array(to_return) 59 | 60 | 61 | class LogCountsAndProfile(CoordsToVals): 62 | 63 | def __init__(self, bigwig_path, counts_mode_name, 64 | profile_mode_name): 65 | self.reader = BigWigReader(bigwig_path=bigwig_path) 66 | self.counts_mode_name = counts_mode_name 67 | self.profile_mode_name = profile_mode_name 68 | 69 | def __call__(self, coors): 70 | profile_values = self.reader.read_values(coors=coors) 71 | counts = np.log(np.sum(profile_values, axis=-1)+1) 72 | to_return = {self.counts_mode_name: counts, 73 | self.profile_mode_name: profile_values} 74 | return to_return 75 | 76 | 77 | class AbstractCountAndProfileTransformer(object): 78 | 79 | def __call__(self, counts, profiles): 80 | raise NotImplementedError() 81 | 82 | def chain(self, count_and_profile_transformer): 83 | def chained_count_and_profile_transformer(counts, profiles): 84 | counts, profiles = self(counts=counts, profiles=profiles) 85 | return count_and_profile_transformer( 86 | counts=counts, profiles=profiles) 87 | return chained_count_and_profile_transformer 88 | 89 | 90 | class LogCountsPlusOne(AbstractCountAndProfileTransformer): 91 | 92 | def __call__(self, counts, profiles): 93 | return np.log(counts+1), profiles 94 | 95 | 96 | class SmoothProfiles(AbstractCountAndProfileTransformer): 97 | 98 | def __init__(self, smoothing_windows): 99 | self.smoothing_windows = smoothing_windows 100 | 101 | def __call__(self, counts, profiles): 102 | profiles_to_return = np.concatenate([ 103 | smooth_profiles(profiles=profiles, smoothing_window=x) 104 | for x in self.smoothing_windows], axis=-1) 105 | return counts, profiles_to_return 106 | 107 | 108 | class MultiTrackCountsAndProfile(CoordsToVals): 109 | 110 | def __init__(self, bigwig_paths, 111 | counts_and_profiles_transformer, 112 | counts_mode_name, 113 | profile_mode_name, center_size_to_use): 114 | self.bigwig_readers = [BigWigReader(bigwig_path=x) 115 | for x in bigwig_paths] 116 | self.counts_and_profiles_transformer = counts_and_profiles_transformer 117 | self.counts_mode_name = counts_mode_name 118 | self.profile_mode_name = profile_mode_name 119 | self.center_size_to_use = center_size_to_use 120 | 121 | def _get_counts_and_vals(self, coors): 122 | new_coors = get_new_coors_around_center( 123 | coors=coors, 124 | center_size_to_use=self.center_size_to_use) 125 | #concatenate the results of the bigwig readers along the last axis 126 | profiles = np.concatenate([ 127 | x.read_values(coors=new_coors)[:,:,None] 128 | for x in self.bigwig_readers], axis=-1) 129 | counts = np.sum(profiles, axis=1) 130 | return (counts, profiles) 131 | 132 | def __call__(self, coors): 133 | counts, profiles = self._get_counts_and_vals(coors=coors) 134 | counts_transformed, profile_transformed =\ 135 | self.counts_and_profiles_transformer( 136 | counts=counts, 137 | profiles=profiles) 138 | return {self.counts_mode_name: counts_transformed, 139 | self.profile_mode_name: profile_transformed} 140 | 141 | 142 | class AbstractPosAndNegStrandCountsAndProfile(CoordsToVals): 143 | 144 | def __init__(self, pos_strand_bigwig_path, neg_strand_bigwig_path, 145 | counts_mode_name, 
profile_mode_name, 146 | center_size_to_use): 147 | self.pos_strand_reader =\ 148 | BigWigReader(bigwig_path=pos_strand_bigwig_path) 149 | self.neg_strand_reader =\ 150 | BigWigReader(bigwig_path=neg_strand_bigwig_path) 151 | self.counts_mode_name = counts_mode_name 152 | self.profile_mode_name = profile_mode_name 153 | self.center_size_to_use = center_size_to_use 154 | 155 | def _get_pos_and_neg_counts_and_vals(self, coors): 156 | new_coors = get_new_coors_around_center( 157 | coors=coors, 158 | center_size_to_use=self.center_size_to_use) 159 | first_strand_profile_values = self.pos_strand_reader.read_values( 160 | coors=new_coors) 161 | second_strand_profile_values = np.abs( 162 | self.neg_strand_reader.read_values(coors=new_coors)) 163 | pos_profile_values = [] 164 | neg_profile_values = [] 165 | #need to swap the pos and neg strands if the strand is negative 166 | for (first_strand, second_strand, coor) in zip(first_strand_profile_values, 167 | second_strand_profile_values, 168 | coors): 169 | if (coor.isplusstrand==True): 170 | pos_profile_values.append(first_strand) 171 | neg_profile_values.append(second_strand) 172 | else: 173 | pos_profile_values.append(second_strand) 174 | neg_profile_values.append(first_strand) 175 | pos_profile_values = np.array(pos_profile_values) 176 | neg_profile_values = np.array(neg_profile_values) 177 | pos_counts = np.sum(pos_profile_values, axis=-1) 178 | neg_counts = np.sum(neg_profile_values, axis=-1) 179 | return (pos_counts, neg_counts, pos_profile_values, neg_profile_values) 180 | 181 | def combine_pos_and_neg_counts_and_vals(self, 182 | pos_counts, neg_counts, pos_profile_values, neg_profile_values): 183 | """ 184 | Returns: 185 | ndarray: combined/transformed counts 186 | ndarray: combined/transformed profile 187 | """ 188 | raise NotImplementedError() 189 | 190 | def __call__(self, coors): 191 | pos_counts, neg_counts, pos_profile_values, neg_profile_values =( 192 | self._get_pos_and_neg_counts_and_vals(coors=coors)) 193 | counts_ndarray, profile_ndarray =\ 194 | self.combine_pos_and_neg_counts_and_vals(pos_counts=pos_counts, neg_counts=neg_counts, 195 | pos_profile_values=pos_profile_values, 196 | neg_profile_values=neg_profile_values) 197 | return {self.counts_mode_name: counts_ndarray, 198 | self.profile_mode_name: profile_ndarray} 199 | 200 | 201 | class PosAndNegSeparateLogCounts(AbstractPosAndNegStrandCountsAndProfile): 202 | 203 | def __init__(self, **kwargs): 204 | super(PosAndNegSeparateLogCounts,self).__init__(**kwargs) 205 | 206 | def combine_pos_and_neg_counts_and_vals(self, 207 | pos_counts, neg_counts, pos_profile_values, neg_profile_values): 208 | 209 | return (np.concatenate([np.log(pos_counts+1)[:,None], 210 | np.log(neg_counts+1)[:,None]], axis=1), 211 | np.concatenate( 212 | [pos_profile_values[:,:,None], 213 | neg_profile_values[:,:,None]], axis=2)) 214 | 215 | 216 | class PosAndNegSmoothWindowCollapsedLogCounts( 217 | AbstractPosAndNegStrandCountsAndProfile): 218 | 219 | def __init__(self, smoothing_windows, **kwargs): 220 | super(PosAndNegSmoothWindowCollapsedLogCounts, self).__init__(**kwargs) 221 | self.smoothing_windows = smoothing_windows 222 | 223 | def combine_pos_and_neg_counts_and_vals(self, pos_counts, neg_counts, 224 | pos_profile_values, neg_profile_values): 225 | 226 | profile_sum = ( 227 | pos_profile_values[:,:]+ 228 | neg_profile_values[:,:]) 229 | 230 | smoothed_profiles = [] 231 | for smoothing_window in self.smoothing_windows: 232 | padded_profile = smooth_profiles(profiles=profile_sum[:,:,None], 233 | 
smoothing_window=smoothing_window) 234 | smoothed_profiles.append(padded_profile) 235 | 236 | smoothed_profiles = np.concatenate(smoothed_profiles, axis=2) 237 | 238 | return (np.log(pos_counts+neg_counts+1), smoothed_profiles) 239 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordstovals/core.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | from ..core import Coordinates 3 | 4 | 5 | def get_new_coors_around_center(coors, center_size_to_use): 6 | new_coors = [] 7 | for coor in coors: 8 | coor_center = int(0.5*(coor.start + coor.end)) 9 | left_flank = int(0.5*center_size_to_use) 10 | right_flank = center_size_to_use - left_flank 11 | new_start = coor_center-left_flank 12 | new_end = coor_center+right_flank 13 | new_coors.append(Coordinates(chrom=coor.chrom, 14 | start=new_start, end=new_end, 15 | isplusstrand=coor.isplusstrand)) 16 | return new_coors 17 | 18 | 19 | class CoordsToVals(object): 20 | 21 | def __call__(self, coors): 22 | """ 23 | Args: 24 | coors (:obj:`list` of :obj:`Coordinates`): 25 | 26 | Returns: 27 | numpy ndarray OR list of ndarrays OR a dict of mode_name->ndarray. 28 | Returns a list of ndarrays if returning multiple modes. 29 | Alternatively, returns a dict where key is the mode name 30 | and the value is the ndarray for the mode. 31 | """ 32 | raise NotImplementedError() 33 | 34 | 35 | class CoordsToValsJoiner(CoordsToVals): 36 | 37 | def __init__(self, coordstovals_list): 38 | """ 39 | Joins batches returned by other CoordsToVals objects 40 | 41 | Args: 42 | coordstovals_list (:obj:`list` of :obj:`CoordsToVals`): List of 43 | CoordsToVals whose values to combine 44 | """ 45 | self.coordstovals_list = coordstovals_list 46 | 47 | def __call__(self, coors): 48 | batch_to_return = None 49 | for idx,coordstovals_obj in enumerate(self.coordstovals_list): 50 | the_batch = coordstovals_obj(coors=coors) 51 | assert the_batch is not None 52 | if isinstance(the_batch, dict): 53 | assert ((batch_to_return is None) or 54 | (isinstance(batch_to_return, dict))), ( 55 | "coordstovals object at idx"+str(idx) 56 | +" returned a dict, but previous coordstovals" 57 | +" objects had a return type incompatible with this") 58 | if (batch_to_return is None): 59 | batch_to_return = {} 60 | for key in the_batch: 61 | assert key not in batch_to_return, ( 62 | "coordstovals object at idx"+str(idx) 63 | +" returned a dict with a key of "+key 64 | +", which collides with a pre-existing key returned by" 65 | +" another coordstovals object") 66 | batch_to_return.update(the_batch) 67 | else: 68 | assert ((batch_to_return is None) or 69 | (isinstance(batch_to_return, list))), ( 70 | "coordstovals object at idx"+str(idx) 71 | +" returned a type incompatible with dict, but previous" 72 | +" coordstovals objects had a return type of dict") 73 | if (isinstance(the_batch, list)==False): 74 | the_batch = [the_batch] 75 | if (batch_to_return is None): 76 | batch_to_return = [] 77 | batch_to_return.extend(the_batch) 78 | if (batch_to_return is None): 79 | batch_to_return = [] 80 | return batch_to_return 81 | 82 | 83 | class AbstractSingleNdarrayCoordsToVals(CoordsToVals): 84 | 85 | def __init__(self, mode_name=None): 86 | """ 87 | Args: 88 | mode_name (:obj:`str`, optional): default None. If None, then 89 | the return of __call__ will be a numpy ndarray.
Otherwise, it 90 | will be a dictionary with a key of mode_name and a value being 91 | the numpy ndarray. 92 | """ 93 | self.mode_name = mode_name 94 | 95 | def _get_ndarray(self, coors): 96 | """ 97 | Args: 98 | coors (:obj:`list` of :obj:`Coordinates`): 99 | 100 | Returns: 101 | numpy ndarray 102 | """ 103 | raise NotImplementedError() 104 | 105 | def __call__(self, coors): 106 | ndarray = self._get_ndarray(coors) 107 | if (self.mode_name is None): 108 | return ndarray 109 | else: 110 | return {self.mode_name: ndarray} 111 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordstovals/fasta.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | import numpy as np 3 | from pyfaidx import Fasta 4 | from .core import AbstractSingleNdarrayCoordsToVals 5 | 6 | 7 | ltrdict = { 8 | 'a':[1,0,0,0],'c':[0,1,0,0],'g':[0,0,1,0],'t':[0,0,0,1], 9 | 'n':[0,0,0,0],'A':[1,0,0,0],'C':[0,1,0,0],'G':[0,0,1,0], 10 | 'T':[0,0,0,1],'N':[0,0,0,0]} 11 | 12 | 13 | def onehot_encoder(seq): 14 | return np.array([ltrdict.get(x,[0,0,0,0]) for x in seq]) 15 | 16 | 17 | class PyfaidxCoordsToVals(AbstractSingleNdarrayCoordsToVals): 18 | 19 | def __init__(self, genome_fasta_path, center_size_to_use=None, **kwargs): 20 | """ 21 | Args: 22 | genome_fasta_path (:obj:`str`): path to the genome .fa file 23 | **kwargs: arguments for :obj:`AbstractSingleNdarrayCoordsToVals` 24 | """ 25 | super(PyfaidxCoordsToVals, self).__init__(**kwargs) 26 | self.center_size_to_use = center_size_to_use 27 | self.genome_fasta = genome_fasta_path 28 | 29 | def _get_ndarray(self, coors): 30 | """ 31 | Args: 32 | coors (:obj:`list` of :obj:`Coordinates`): if 33 | center_size_to_use is not specified, all the 34 | coordinates must be of the same length 35 | 36 | Returns: 37 | numpy ndarray of dims (nexamples x width x 4) 38 | """ 39 | genome_object = Fasta(self.genome_fasta) 40 | seqs = [] #(coor, sequence) pairs; coords whose chrom is absent from the fasta are skipped 41 | for coor in coors: 42 | if (self.center_size_to_use is not None): 43 | the_center = int((coor.start + coor.end)*0.5) 44 | if (coor.chrom in genome_object): 45 | seqs.append((coor, genome_object[coor.chrom][ 46 | the_center-int(0.5*self.center_size_to_use): 47 | the_center+(self.center_size_to_use 48 | -int(0.5*self.center_size_to_use))])) 49 | else: 50 | print(coor.chrom+" not in "+self.genome_fasta) 51 | else: 52 | if (coor.chrom in genome_object): 53 | seqs.append((coor, genome_object[coor.chrom][coor.start:coor.end])) 54 | else: 55 | print(coor.chrom+" not in "+self.genome_fasta) 56 | genome_object.close() 57 | 58 | onehot_seqs = [] 59 | for coor,seq in seqs: #pairing each sequence with its coord keeps the strand info aligned even when coords were skipped 60 | onehot = onehot_encoder(seq=seq.seq) 61 | if (coor.isplusstrand==False): 62 | onehot = onehot[::-1, ::-1] 63 | onehot_seqs.append(onehot) 64 | lengths = set([len(x) for x in onehot_seqs]) 65 | if (len(lengths) > 0): 66 | assert len(lengths)==1, ("All the sequences must be of the same" 67 | +" length, but lengths are "+str(lengths)) 68 | return np.array(onehot_seqs) 69 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordstovals/lookup.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | import numpy as np 3 | from .core import AbstractSingleNdarrayCoordsToVals 4 | from ..core import Coordinates 5 | import gzip 6 | 7 | 8 | class 
SimpleLookup(AbstractSingleNdarrayCoordsToVals): 9 | 10 | def __init__(self, lookup_file, 11 | transformation=None, 12 | default_returnval=0.0, **kwargs): 13 | super(SimpleLookup, self).__init__(**kwargs) 14 | self.lookup_file = lookup_file 15 | self.transformation = transformation 16 | self.default_returnval = default_returnval 17 | self.lookup = {} 18 | self.num_labels = None 19 | for line in (gzip.open(self.lookup_file) if ".gz" 20 | in self.lookup_file else open(self.lookup_file, 'rb')): #binary mode so .decode() works for both gzipped and plain files 21 | (chrom, start_str, end_str, *labels) =\ 22 | line.decode("utf-8").rstrip().split("\t") 23 | coord = Coordinates(chrom=chrom, 24 | start=int(start_str), 25 | end=int(end_str)) 26 | labels = [(self.transformation(float(x)) 27 | if self.transformation is not None else float(x)) 28 | for x in labels] 29 | self.lookup[(coord.chrom, coord.start, coord.end)] = labels 30 | if (self.num_labels is None): 31 | self.num_labels = len(labels) 32 | else: 33 | assert len(labels)==self.num_labels,( 34 | "Unequal label lengths; "+str((len(labels), self.num_labels))) 35 | 36 | def _get_ndarray(self, coors): 37 | to_return = [] 38 | for coor in coors: 39 | if (coor.chrom, coor.start, coor.end) not in self.lookup: 40 | to_return.append(np.ones(self.num_labels) 41 | *self.default_returnval) 42 | else: 43 | to_return.append( 44 | self.lookup[(coor.chrom, coor.start, coor.end)]) 45 | return np.array(to_return) 46 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/coordstovals/tiledb.py: -------------------------------------------------------------------------------- 1 | import tiledb 2 | import numpy as np 3 | from .core import CoordsToVals 4 | 5 | class BasicTiledbProfileCoordsToVals(CoordsToVals): 6 | def __init__(self, tiledb_paths, pos_label_source_attribute, neg_label_source_attribute=None, center_size_to_use=None, **kwargs): 7 | ''' 8 | tiledb_paths can be a single string or a list of strings or a dictionary mapping from mode name to string.
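pos_label_source_attribute (and, for stranded data, neg_label_source_attribute) name the tileDB attributes to read from; each chromosome is assumed to live in its own DenseArray named <tiledb_path>.<chrom>.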
9 | ''' 10 | self.tiledb_paths=tiledb_paths 11 | #identify the data type of tiledb_paths 12 | self.type_tiledb_paths=type(self.tiledb_paths) 13 | #identify the corresponding function to use for querying tiledb 14 | self.call_function=self.get_call_function() 15 | #positive and negative strand values may correspond to different attributes of tiledb database 16 | self.pos_label_source_attribute=pos_label_source_attribute 17 | self.neg_label_source_attribute=neg_label_source_attribute 18 | 19 | def get_call_function(self): 20 | ''' 21 | determines function to use for querying coord values based 22 | on the data type of tiledb_paths attribute 23 | ''' 24 | if self.type_tiledb_paths == str: 25 | return self.__call__string 26 | elif self.type_tiledb_paths == list: 27 | return self.__call__list 28 | elif self.type_tiledb_paths == dict: 29 | return self.__call__dict 30 | else: 31 | raise Exception("Unsupported data type for BasicTiledbProfileCoordsToVals:"+str(self.type_tiledb_paths)) 32 | 33 | def __call__dict(self,coords): 34 | ''' 35 | self.tiledb_paths is a dictionary mapping from mode name to string 36 | ''' 37 | vals={} 38 | for mode_name in self.tiledb_paths: 39 | cur_tiledb_path=self.tiledb_paths[mode_name] 40 | vals[mode_name]=self.query_tiledb(cur_tiledb_path,coords) 41 | return vals 42 | 43 | def __call__list(self,coords): 44 | ''' 45 | self.tiledb_paths is a list of strings 46 | ''' 47 | vals=[self.query_tiledb(cur_tiledb_path,coords) for cur_tiledb_path in self.tiledb_paths] 48 | return vals 49 | 50 | def __call__string(self,coords): 51 | ''' 52 | self.tiledb_paths is a string 53 | ''' 54 | vals=self.query_tiledb(self.tiledb_paths,coords) 55 | return vals 56 | 57 | def __call__(self,coords): 58 | ''' 59 | coords is a list of named tuples : .chrom, .start, .end, .isplusstrand 60 | returns a numpy array of values associated with coordinates 61 | ''' 62 | assert len(coords)>0 63 | self.ctx = tiledb.Ctx() 64 | return self.call_function(coords) 65 | 66 | def query_tiledb(self,cur_tiledb_path,coords): 67 | ''' 68 | queries tiledb database for a specific batch of coordinates for a single dataset/task.
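assumes all coordinates in the batch share the same width; returns an array of shape (len(coords), coords[0].end-coords[0].start).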
69 | ''' 70 | labels=np.zeros((len(coords),coords[0].end-coords[0].start)) 71 | for i in range(len(coords)): 72 | coord=coords[i] 73 | #open the tiledb for access in a pre-defined context 74 | with tiledb.DenseArray('.'.join([cur_tiledb_path,coord.chrom]), mode='r',ctx=self.ctx) as cur_array: 75 | if coord.isplusstrand: 76 | #query positive strand (or non-stranded entity) 77 | cur_vals=cur_array[coord.start:coord.end][self.pos_label_source_attribute] 78 | else: 79 | #query negative strand , make sure to reverse the values 80 | cur_vals=cur_array[coord.start:coord.end][self.neg_label_source_attribute][::-1] 81 | labels[i]=cur_vals 82 | return labels 83 | -------------------------------------------------------------------------------- /seqdataloader/batchproducers/coordbased/core.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | from collections import namedtuple 3 | import keras 4 | 5 | 6 | Coordinates = namedtuple("Coordinates", 7 | ["chrom", "start", "end", "isplusstrand"]) 8 | Coordinates.__new__.__defaults__ = (True,) 9 | 10 | 11 | def apply_mask(tomask, mask): 12 | if isinstance(tomask, dict): 13 | return dict([(key, val[mask]) for key,val in tomask.items()]) 14 | elif isinstance(tomask, list): 15 | return [x[mask] for x in tomask] 16 | else: 17 | return tomask[mask] 18 | 19 | 20 | class KerasBatchGenerator(keras.utils.Sequence): 21 | 22 | """ 23 | Args: 24 | coordsbatch_producer (KerasSequenceApiCoordsBatchProducer) 25 | inputs_coordstovals (CoordsToVals) 26 | targets_coordstovals (CoordsToVals) 27 | sampleweights_coordstovals (CoordsToVals) 28 | coordsbatch_transformer (AbstractCoordBatchTransformer) 29 | qc_func (callable): function that can be used to filter 30 | out poor-quality sequences. 31 | sampleweights_coordstovals: either this argument or 32 | sampleweights_from_inputstargets could be used to 33 | specify sample weights. sampleweights_coordstovals 34 | takes a batch of coords as inputs. 35 | sampleweights_from_inputstargets: either this argument or 36 | sampleweights_coordstovals could be used to 37 | specify sample weights. sampleweights_from_inputstargets 38 | takes the inputs and targets values to generate the weights.
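qc_func is expected to return a boolean mask with one entry per example; the mask is applied to the inputs and, when present, the targets.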
39 | """ 40 | def __init__(self, coordsbatch_producer, 41 | inputs_coordstovals, 42 | targets_coordstovals, 43 | coordsbatch_transformer=None, 44 | qc_func=None, 45 | sampleweights_coordstovals=None, 46 | sampleweights_from_inputstargets=None): 47 | self.coordsbatch_producer = coordsbatch_producer 48 | self.inputs_coordstovals = inputs_coordstovals 49 | self.targets_coordstovals = targets_coordstovals 50 | self.coordsbatch_transformer = coordsbatch_transformer 51 | self.sampleweights_coordstovals = sampleweights_coordstovals 52 | self.sampleweights_from_inputstargets =\ 53 | sampleweights_from_inputstargets 54 | if sampleweights_coordstovals is not None: 55 | assert sampleweights_from_inputstargets is None 56 | if sampleweights_from_inputstargets is not None: 57 | assert sampleweights_coordstovals is None 58 | self.qc_func = qc_func 59 | 60 | def __getitem__(self, index): 61 | coords_batch = self.coordsbatch_producer[index] 62 | if (self.coordsbatch_transformer is not None): 63 | coords_batch = self.coordsbatch_transformer(coords_batch) 64 | inputs = self.inputs_coordstovals(coords_batch) 65 | if (self.targets_coordstovals is not None): 66 | targets = self.targets_coordstovals(coords_batch) 67 | else: 68 | targets=None 69 | if (self.qc_func is not None): 70 | qc_mask = self.qc_func(inputs=inputs, targets=targets) 71 | inputs = apply_mask(tomask=inputs, mask=qc_mask) 72 | if (targets is not None): 73 | targets = apply_mask(tomask=targets, mask=qc_mask) 74 | else: 75 | qc_mask = None 76 | if (self.sampleweights_coordstovals is not None): 77 | sample_weights = self.sampleweights_coordstovals(coords_batch) 78 | return (inputs, targets, sample_weights) 79 | elif (self.sampleweights_from_inputstargets is not None): 80 | sample_weights = self.sampleweights_from_inputstargets( 81 | inputs=inputs, targets=targets) 82 | return (inputs, targets, sample_weights) 83 | else: 84 | if (self.targets_coordstovals is not None): 85 | return (inputs, targets) 86 | else: 87 | return inputs 88 | 89 | def __len__(self): 90 | return len(self.coordsbatch_producer) 91 | 92 | def on_epoch_end(self): 93 | self.coordsbatch_producer.on_epoch_end() 94 | -------------------------------------------------------------------------------- /seqdataloader/bounded_process_pool_executor.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import concurrent.futures 3 | 4 | name = 'bounded_pool_executor' 5 | class _BoundedPoolExecutor: 6 | 7 | semaphore = None 8 | 9 | def acquire(self): 10 | self.semaphore.acquire() 11 | 12 | def release(self, fn): #invoked as a done-callback; fn is the completed future 13 | self.semaphore.release() 14 | 15 | def submit(self, fn, *args, **kwargs): 16 | self.acquire() 17 | future = super().submit(fn, *args, **kwargs) 18 | future.add_done_callback(self.release) 19 | 20 | return future 21 | 22 | 23 | class BoundedProcessPoolExecutor(_BoundedPoolExecutor, concurrent.futures.ProcessPoolExecutor): 24 | def __init__(self, max_workers=None,mp_context=None, initializer=None, initargs=()): 25 | super().__init__(max_workers,mp_context,initializer,initargs) 26 | self.semaphore = multiprocessing.BoundedSemaphore(max_workers) 27 | 28 | -------------------------------------------------------------------------------- /seqdataloader/dbingest/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Example command: 3 | 4 | To ingest a microglia dataset with an unstranded bigwig, IDR peaks, overlap peaks, 5 | a blacklist of genomic regions to avoid, and negative 
(non-peak) regions that are gc-matched to the idr peak set,
6 | we would run dbingest as follows:
7 | 
8 | contents of metadata.tsv:
9 | 
10 | ```
11 | dataset	idr_peak	overlap_peak	ambig_peak	negatives_peak	count_bigwig_unstranded_5p
12 | microglia	microglia.idr.optimal.narrowPeak	microglia.overlap.optimal.narrowPeak	blacklist/GRch38/GRch38_unified_blacklist.bed	microglia.gc.matched.negatives.bed	../data/microglia.unstranded.bw
13 | ```
14 | 
15 | contents of attributes.txt:
16 | 
17 | ```
18 | idr_peak	bed_summit_from_last_col
19 | overlap_peak	bed_summit_from_last_col
20 | ambig_peak	bed_no_summit
21 | negatives_peak	bed_no_summit
22 | count_bigwig_unstranded_5p	bigwig
23 | ```
24 | 
25 | The attributes file indicates how each column in the metadata.tsv file should be parsed. Supported values are:
26 | 
27 | * bed_summit_from_last_col -- this assumes the input file is in narrowPeak (or similar) format, where the summit offset from the start coordinate is in the last column of the file. The file is stored as an array of 0 (no peak), 1 (peak), and 2 (summit).
28 | 
29 | * bed_no_summit -- this assumes the input file is a bed file without summit information; peak intervals are centered on (start+end)/2, and no summits are calculated (i.e. the file is stored as an array of 0 (no peak) and 1 (peak), with no value of 2 to indicate a summit).
30 | 
31 | * bigwig -- treat the input file as a bigwig
32 | 
33 | The command to run to ingest the metadata.tsv file into tiledb is:
34 | 
35 | ```
36 | db_ingest --tiledb_metadata metadata.tsv \
37 |           --array_name microglia_db \
38 |           --overwrite \
39 |           --chrom_sizes hg38.chrom.sizes \
40 |           --attribute_config_file attributes.txt \
41 |           --coord_tile_size 10000 \
42 |           --task_tile_size 1 \
43 |           --write_chunk 30000000 \
44 |           --threads 40 \
45 |           --max_queue_size 50 \
46 |           --max_mem_g 200
47 | ```
48 | 
49 | 
--------------------------------------------------------------------------------
/seqdataloader/dbingest/__init__.py:
--------------------------------------------------------------------------------
1 | ## helper functions to ingest bigwig and narrowPeak data files into a tileDB instance.
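## Example (sketch): ingest() below also accepts a plain python dict, which is
## converted to an argparse.Namespace via args_object_from_args_dict, e.g.
##     from seqdataloader.dbingest import ingest
##     ingest({"tiledb_metadata":"metadata.tsv",          #illustrative file names
##             "array_name":"microglia_db",
##             "chrom_sizes":"hg38.chrom.sizes",
##             "attribute_config_file":"attributes.txt"})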
2 | ## tileDB instances are indexed by coordinate
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 | import math
7 | import psutil
8 | from multiprocessing import Pool, Process, Queue
9 | import os
10 | import signal
11 | import tiledb
12 | import pickle
13 | import argparse
14 | import pandas as pd
15 | import numpy as np
16 | from collections import OrderedDict
17 | from ..attrib_config import *
18 | from ..queue_config import *
19 | from ..utils import *
20 | from ..tdb_config import *
21 | import gc
22 | import time
23 | import sys
24 | 
25 | def args_object_from_args_dict(args_dict):
26 |     #create an argparse.Namespace from the dictionary of inputs
27 |     args_object=argparse.Namespace()
28 |     #set the defaults
29 |     vars(args_object)['overwrite']=False
30 |     vars(args_object)['coord_tile_size']=10000
31 |     vars(args_object)['task_tile_size']=1
32 |     vars(args_object)['attribute_config']=None
33 |     vars(args_object)['attribute_config_file']=None
34 |     vars(args_object)['write_chunk']=30000000
35 |     vars(args_object)['threads']=1
36 |     vars(args_object)['max_queue_size']=30
37 |     vars(args_object)['max_mem_g']=100
38 |     for key in args_dict:
39 |         vars(args_object)[key]=args_dict[key]
40 |     #user-supplied values override the defaults set above
41 |     args=args_object
42 |     return args
43 | 
44 | def parse_args():
45 |     parser=argparse.ArgumentParser(description="ingest data into tileDB")
46 |     parser.add_argument("--tiledb_metadata",help="each row is a dataset, each column corresponds to an attribute")
47 |     parser.add_argument("--array_name")
48 |     parser.add_argument("--overwrite",default=False,action="store_true")
49 |     parser.add_argument("--chrom_sizes",help="2 column tab-separated file. Column 1 = chromosome name; Column 2 = chromosome size")
50 |     parser.add_argument("--coord_tile_size",type=int,default=10000,help="coordinate axis tile size")
51 |     parser.add_argument("--task_tile_size",type=int,default=1,help="task axis tile size")
52 |     parser.add_argument("--attribute_config",default=None,help="the following are supported: encode_pipeline, encode_pipeline_with_controls, generic_bigwig")
53 |     parser.add_argument("--attribute_config_file",default=None,help="file with 2 columns; first column indicates attribute name; 2nd column indicates attribute type, which is one of bigwig, bed_no_summit, bed_summit_from_peak_center, bed_summit_from_last_col")
54 |     parser.add_argument("--write_chunk",type=int,default=30000000,help="number of bases to write to disk in one tileDB DenseArray write operation")
55 |     parser.add_argument("--threads",type=int,default=1,help="number of chunks to process in parallel")
56 |     parser.add_argument("--max_queue_size",type=int,default=30)
57 |     parser.add_argument("--max_mem_g",type=int,default=100,help="maximum memory usage in Gigabytes")
58 |     return parser.parse_args()
59 | 
60 | 
61 | def init_worker():
62 |     signal.signal(signal.SIGINT, signal.SIG_IGN)
63 | 
64 | def kill_child_processes(parent_pid, sig=signal.SIGTERM):
65 |     try:
66 |         parent = psutil.Process(parent_pid)
67 |     except psutil.NoSuchProcess:
68 |         return
69 |     children = parent.children(recursive=True)
70 |     for process in children:
71 |         process.send_signal(sig)
72 | 
73 | 
74 | def create_new_array(tdb_Context,
75 |                      size,
76 |                      array_out_name,
77 |                      coord_tile_size,
78 |                      task_tile_size,
79 |                      attribute_config,
80 |                      attribute_config_file,
81 |                      compressor='gzip',
82 |                      compression_level=-1,
83 |                      var=False):
84 |     '''
85 |     Creates an empty tileDB array
86 |     size=
tuple(num_indices,num_tasks)
87 |     '''
88 |     coord_tile_size=min(size[0],coord_tile_size)
89 |     task_tile_size=max([1,min(size[1],task_tile_size)])
90 |     tiledb_dim_coords = tiledb.Dim(
91 |         name='genome_coordinate',
92 |         domain=(0, size[0]),
93 |         tile=coord_tile_size,
94 |         dtype='uint32')
95 |     tiledb_dim_tasks=tiledb.Dim(
96 |         name='task',
97 |         domain=(0,size[1]),
98 |         tile=task_tile_size,
99 |         dtype='uint32')
100 |     tiledb_dom = tiledb.Domain(tiledb_dim_coords,tiledb_dim_tasks,ctx=tdb_Context)
101 | 
102 |     #generate the attribute information
103 |     attribute_info=get_attribute_info(attribute_config,attribute_config_file)
104 |     attribs=[]
105 |     for key in attribute_info:
106 |         attribs.append(tiledb.Attr(
107 |             name=key,
108 |             var=var,
109 |             filters=tiledb.FilterList([tiledb.GzipFilter()]),
110 |             dtype=attribute_info[key]['dtype']))
111 | 
112 |     tiledb_schema = tiledb.ArraySchema(
113 |         domain=tiledb_dom,
114 |         attrs=tuple(attribs),
115 |         cell_order='row-major',
116 |         tile_order='row-major')
117 | 
118 |     tiledb.DenseArray.create(array_out_name, tiledb_schema)
119 |     print("created empty array on disk")
120 |     return
121 | 
122 | 
123 | 
124 | def extract_metadata_field(row,field):
125 |     dataset=row['dataset']
126 |     try:
127 |         return row[field]
128 |     except:
129 |         print("tiledb_metadata has no column "+field+" for dataset:"+str(dataset))
130 |         return None
131 | 
132 | def open_data_for_parsing(row,attribute_info):
133 |     try:
134 |         data_dict={}
135 |         cols=list(row.index)
136 |         if 'dataset' in cols:
137 |             cols.remove('dataset')
138 |         for col in cols:
139 |             cur_fname=extract_metadata_field(row,col)
140 |             if isinstance(cur_fname,str):
141 |                 assert os.path.exists(cur_fname), "The path:"+str(cur_fname)+" does not exist. If you meant to skip this column, leave it empty in the metadata sheet."
142 |             elif cur_fname is None:
143 |                 continue #check for None before attempting the float cast below
144 |             elif math.isnan(float(cur_fname)):
145 |                 continue
146 |             data_dict[col]=attribute_info[col]['opener'](cur_fname,parallel=True)
147 |         return data_dict
148 |     except Exception as e:
149 |         print(repr(e))
150 |         kill_child_processes(os.getpid())
151 |         raise
152 | 
153 | def ingest(args):
154 |     if type(args)==type({}):
155 |         args=args_object_from_args_dict(args)
156 |     if args.write_chunk > max_write_chunk:
157 |         print("WARNING: You have specified a write_chunk size of "+str(args.write_chunk)+" but the maximum supported with python serialization is "+str(max_write_chunk)+". 
It will be reset to "+str(max_write_chunk))
158 |         args.write_chunk=max_write_chunk
159 | 
160 |     #create a queue to write the array
161 |     global write_queue
162 |     write_queue=Queue(maxsize=args.max_queue_size)
163 | 
164 |     #config
165 |     tdb_Config=tiledb.Config(tdb_config_params)
166 |     tdb_write_Context=tiledb.Ctx(config=tdb_Config)
167 |     tdb_read_Context=tiledb.Ctx(config=tdb_Config)
168 | 
169 |     overwrite=args.overwrite
170 |     coord_tile_size=args.coord_tile_size
171 |     task_tile_size=args.task_tile_size
172 |     attribute_config=args.attribute_config
173 |     attribute_config_file=args.attribute_config_file
174 |     updating=False
175 | 
176 |     attribute_info=get_attribute_info(args.attribute_config,args.attribute_config_file)
177 |     tiledb_metadata=pd.read_csv(args.tiledb_metadata,header=0,sep='\t')
178 |     num_tasks=tiledb_metadata.shape[0]
179 |     print("num_tasks:"+str(num_tasks))
180 | 
181 |     print("loaded tiledb metadata")
182 |     chrom_sizes=pd.read_csv(args.chrom_sizes,header=None,sep='\t')
183 |     print("loaded chrom sizes")
184 |     chrom_indices,num_indices=transform_chrom_size_to_indices(chrom_sizes)
185 |     print("num_indices:"+str(num_indices))
186 |     array_out_name=args.array_name
187 |     if tiledb.object_type(array_out_name) == "array":
188 |         if overwrite==False:
189 |             raise Exception("array:"+str(array_out_name)+" already exists; use the --overwrite flag to overwrite it. Exiting")
190 |         else:
191 |             print("warning: the array: "+str(array_out_name)+" already exists. You provided the --overwrite flag, so it will be updated/overwritten")
192 |             updating=True
193 |     else:
194 |         #create the array:
195 |         create_new_array(tdb_Context=tdb_write_Context,
196 |                          size=(num_indices,num_tasks-1),
197 |                          attribute_config=attribute_config,
198 |                          attribute_config_file=attribute_config_file,
199 |                          array_out_name=array_out_name,
200 |                          coord_tile_size=coord_tile_size,
201 |                          task_tile_size=task_tile_size,
202 |                          var=False)
203 |         print("created new array:"+str(array_out_name))
204 |     #create metadata array
205 |     metadata_dict={}
206 |     metadata_dict['tasks']=[i for i in tiledb_metadata['dataset']]
207 |     metadata_dict['chroms']=[i for i in chrom_indices.keys()]
208 |     metadata_dict['sizes']=[i[2] for i in list(chrom_indices.values())]
209 |     metadata_dict['offsets']=[i[0] for i in list(chrom_indices.values())]
210 |     num_tasks=tiledb_metadata['dataset'].shape[0]
211 | 
212 |     num_chroms=len(chrom_indices.keys())
213 |     with tiledb.DenseArray(array_out_name,ctx=tdb_write_Context,mode='w') as cur_array:
214 |         cur_array.meta['num_tasks']=num_tasks
215 |         cur_array.meta['num_chroms']=num_chroms
216 |         for task_index in range(num_tasks):
217 |             cur_array.meta['_'.join(['task',str(task_index)])]=metadata_dict['tasks'][task_index]
218 |         for chrom_index in range(num_chroms):
219 |             cur_array.meta['_'.join(['chrom',str(chrom_index)])]=metadata_dict['chroms'][chrom_index]
220 |             cur_array.meta['_'.join(['size',str(chrom_index)])]=metadata_dict['sizes'][chrom_index]
221 |             cur_array.meta['_'.join(['offset',str(chrom_index)])]=metadata_dict['offsets'][chrom_index]
222 |     print("created tiledb metadata")
223 |     pool=Pool(processes=args.threads,initializer=init_worker)
224 |     print("made pool")
225 |     pool_inputs=[]
226 |     for task_index,task_row in tiledb_metadata.iterrows():
227 |         dataset=task_row['dataset']
228 |         #read in filenames for bigwigs
229 |         data_dict=open_data_for_parsing(task_row,attribute_info)
230 |         for start_chunk_index in range(0,num_indices,args.write_chunk):
231 |             end_chunk_index=min([num_indices,start_chunk_index+args.write_chunk])
232 | 
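#worked example of the index transform below (illustrative sizes): if chr1
            #spans global indices 0-10 and chr2 spans 10-18, a chunk covering global
            #indices 8-12 yields two coordinate sets of the form
            #(chrom,start_pos,end_pos,start_index,end_index), roughly
            #('chr1',8,10,8,10) and ('chr2',0,2,10,12)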
#convert global indices to chrom+pos indices
233 |             chunk_chrom_coords=transform_indices_to_chrom_coords(start_chunk_index,end_chunk_index,chrom_indices)
234 |             if chunk_chrom_coords is None:
235 |                 raise Exception("failed to transform indices:"+str(start_chunk_index)+"-"+str(end_chunk_index)+" to chrom coords;"+str(chrom_indices))
236 |             for coord_set in chunk_chrom_coords:
237 |                 pool_inputs.append((task_index,data_dict,attribute_info,coord_set,args))
238 |     pool_feed_chunk_start=0
239 |     pool_feed_chunk_max=len(pool_inputs)
240 |     chunks_to_process=len(pool_inputs)
241 |     array_writer=Process(target=write_array,args=([args,updating,chunks_to_process]))
242 |     try:
243 |         array_writer.start()
244 |     except Exception as e:
245 |         raise e
246 | 
247 |     try:
248 |         while pool_feed_chunk_start < pool_feed_chunk_max:
249 |             pool_feed_chunk_end=min([pool_feed_chunk_start+queue_feed_chunk_size,pool_feed_chunk_max])
250 |             #only do mapping if queue size is not exceeded & total memory consumption is not exceeded
251 |             write_queue_size=write_queue.qsize()
252 |             mem_used=psutil.virtual_memory().used / (10**9)
253 |             print("mapping to pool, queue size:"+str(write_queue_size))
254 |             print("mapping to pool, mem used:"+str(mem_used))
255 |             while (write_queue.qsize() >= args.max_queue_size) or (psutil.virtual_memory().used / (10**9) >= args.max_mem_g): #re-check the limits on every iteration so the wait can end
256 |                 time.sleep(10)
257 |             print("sending to pool:"+str(pool_feed_chunk_start)+"-"+str(pool_feed_chunk_end)+"/"+str(chunks_to_process))
258 |             pool.map(process_chunk,pool_inputs[pool_feed_chunk_start:pool_feed_chunk_end])
259 |             pool_feed_chunk_start+=queue_feed_chunk_size
260 |         time.sleep(60)
261 |         pool.close()
262 |     except KeyboardInterrupt:
263 |         kill_child_processes(os.getpid())
264 |         pool.terminate()
265 |         raise
266 |     except Exception as e:
267 |         print(e)
268 |         kill_child_processes(os.getpid())
269 |         raise
270 | 
271 |     #wait until we're done writing to the tiledb array
272 |     array_writer.join()
273 |     print("array_writer.join() is complete")
274 |     print("shutting down pool")
275 |     pool.join()
276 |     print('done!')
277 | 
278 | def process_chunk(inputs):
279 |     try:
280 |         task_index=inputs[0]
281 |         data_dict=inputs[1]
282 |         attribute_info=inputs[2]
283 |         coord_set=inputs[3]
284 |         args=inputs[4]
285 | 
286 |         attribute_config=args.attribute_config
287 |         dict_to_write=OrderedDict()
288 |         chrom=coord_set[0]
289 |         start_pos=coord_set[1]
290 |         end_pos=coord_set[2]
291 |         start_index=coord_set[3]
292 |         end_index=coord_set[4]
293 |         for attribute in data_dict:
294 |             cur_parser=attribute_info[attribute]['parser']
295 |             cur_vals=cur_parser([data_dict[attribute],chrom,start_pos,end_pos,attribute_info[attribute]])
296 |             dict_to_write[attribute]=cur_vals[-1] #the last entry in the tuple is the actual numpy array of values; the first entries store start and end blocks
297 |         payload=pickle.dumps([task_index,start_index,end_index,dict_to_write],pickle.HIGHEST_PROTOCOL)
298 |         write_queue.put(payload)
299 |         gc.collect()
300 |     except:
301 |         kill_child_processes(os.getpid())
302 |         raise
303 | 
304 | def write_array(args, updating, chunks_to_process):
305 |     try:
306 |         #config
307 |         tdb_Config=tiledb.Config(tdb_config_params)
308 |         tdb_write_Context=tiledb.Ctx(config=tdb_Config)
309 | 
310 |         if updating is True:
311 |             tdb_read_Context=tiledb.Ctx(config=tdb_Config)
312 |             cur_array_toread=tiledb.DenseArray(args.array_name,ctx=tdb_read_Context,mode='r')
313 |         cur_array_towrite=tiledb.DenseArray(args.array_name,ctx=tdb_write_Context,mode='w')
314 |         chunks_processed=0
315 |         while chunks_processed < chunks_to_process:
316 |             while 
write_queue.empty():
317 |                 time.sleep(10)
318 |             processed_chunk=write_queue.get()
319 |             processed_chunk_unpickled=pickle.loads(processed_chunk)
320 |             task_index=processed_chunk_unpickled[0]
321 |             start_index=processed_chunk_unpickled[1]
322 |             end_index=processed_chunk_unpickled[2]
323 |             dict_to_write=processed_chunk_unpickled[3]
324 |             if updating is True:
325 |                 #we are only updating some attributes in the array
326 |                 cur_vals=cur_array_toread[start_index:end_index,task_index]
327 |                 #print("got cur vals for task "+str(task_index)+" for "+str(start_index)+":"+str(end_index))
328 |                 for key in dict_to_write:
329 |                     cur_vals[key]=dict_to_write[key]
330 |                 dict_to_write=cur_vals
331 |                 print("updated data dict for writing:"+args.array_name)
332 |             else:
333 |                 #we are writing for the first time, make sure all attributes are provided, if some are not, use a nan array
334 |                 required_attrib=list(get_attribute_info(args.attribute_config,args.attribute_config_file).keys())
335 |                 #print(str(required_attrib))
336 |                 for attrib in required_attrib:
337 |                     if attrib not in dict_to_write:
338 |                         print("augmenting")
339 |                         dict_to_write[attrib]=np.full(end_index-start_index,np.nan)
340 |             #write in chunks
341 |             cur_array_towrite[start_index:end_index,task_index]=dict_to_write
342 |             print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
343 |             gc.collect()
344 |             chunks_processed+=1
345 |             print("wrote to disk "+str(task_index)+" for "+str(start_index)+":"+str(end_index)+";"+str(chunks_processed)+"/"+str(chunks_to_process))
346 |         assert chunks_processed >= chunks_to_process
347 |         print("closing arrays")
348 |         if updating is True:
349 |             cur_array_toread.close()
350 |         cur_array_towrite.close()
351 |         return
352 | 
353 |     except KeyboardInterrupt:
354 |         kill_child_processes(os.getpid())
355 |         #try to delete all tmp files
356 |         raise
357 |     except Exception as e:
358 |         print(e)
359 |         kill_child_processes(os.getpid())
360 |         raise #re-raise the original exception (Exception objects have no .message attribute in python 3)
361 | 
362 | 
363 | def main():
364 |     args=parse_args()
365 |     ingest(args)
366 | 
367 | if __name__=="__main__":
368 |     main()
369 | 
370 | 
371 | 
--------------------------------------------------------------------------------
/seqdataloader/dbingest_single_threaded/__init__.py:
--------------------------------------------------------------------------------
1 | ## helper functions to ingest bigwig and narrowPeak data files into a tileDB instance.
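## Example (sketch): like dbingest, ingest_single_threaded() below accepts a
## plain dict of arguments; note that the array name is passed as "tiledb_group"
## in this module, e.g.
##     from seqdataloader.dbingest_single_threaded import ingest_single_threaded
##     ingest_single_threaded({"tiledb_metadata":"metadata.tsv",   #illustrative names
##                             "tiledb_group":"microglia_db",
##                             "chrom_sizes":"hg38.chrom.sizes",
##                             "write_chunk":2500000})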
2 | ## tileDB instances are indexed by coordinate
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 | import psutil
7 | #import multiprocessing as mp
8 | #mpx = mp.get_context('spawn')
9 | import tiledb
10 | import pdb
11 | import argparse
12 | import pandas as pd
13 | import numpy as np
14 | from collections import OrderedDict
15 | from ..attrib_config import *
16 | from ..utils import *
17 | from ..tdb_config import *
18 | import gc
19 | 
20 | 
21 | def args_object_from_args_dict(args_dict):
22 |     #create an argparse.Namespace from the dictionary of inputs
23 |     args_object=argparse.Namespace()
24 |     #set the defaults
25 |     vars(args_object)['overwrite']=False
26 |     vars(args_object)['coord_tile_size']=10000
27 |     vars(args_object)['task_tile_size']=1
28 |     vars(args_object)['attribute_config']='encode_pipeline'
29 |     vars(args_object)['write_chunk']=30000000 #write_chunk is used as a range() step below, so it must be an integer; default matches the multithreaded dbingest module
30 |     for key in args_dict:
31 |         vars(args_object)[key]=args_dict[key]
32 |     #user-supplied values override the defaults set above
33 |     args=args_object
34 |     return args
35 | 
36 | def parse_args():
37 |     parser=argparse.ArgumentParser(description="ingest data into tileDB")
38 |     parser.add_argument("--tiledb_metadata",help="fields are: dataset, fc_bigwig, pval_bigwig, count_bigwig_plus_5p, count_bigwig_minus_5p, count_bigwig_unstranded_5p, idr_peak, overlap_peak, ambig_peak")
39 |     parser.add_argument("--tiledb_group")
40 |     parser.add_argument("--overwrite",default=False,action="store_true")
41 |     parser.add_argument("--chrom_sizes",help="2 column tab-separated file. Column 1 = chromosome name; Column 2 = chromosome size")
42 |     parser.add_argument("--coord_tile_size",type=int,default=10000,help="coordinate axis tile size")
43 |     parser.add_argument("--task_tile_size",type=int,default=1,help="task axis tile size")
44 |     parser.add_argument("--attribute_config",default='encode_pipeline',help="the following are supported: encode_pipeline, generic_bigwig")
45 |     parser.add_argument("--write_chunk",type=int,default=30000000,help="number of bases to write to disk in one tileDB DenseArray write operation")
46 |     return parser.parse_args()
47 | 
48 | def create_new_array(tdb_Context,
49 |                      size,
50 |                      array_out_name,
51 |                      coord_tile_size,
52 |                      task_tile_size,
53 |                      attribute_config,
54 |                      compressor='gzip',
55 |                      compression_level=-1,
56 |                      var=False):
57 |     '''
58 |     Creates an empty tileDB array
59 |     size= tuple(num_indices,num_tasks)
60 |     '''
61 |     coord_tile_size=min(size[0],coord_tile_size)
62 |     task_tile_size=min(size[1],task_tile_size)
63 |     tiledb_dim_coords = tiledb.Dim(
64 |         name='genome_coordinate',
65 |         domain=(0, size[0]),
66 |         tile=coord_tile_size,
67 |         dtype='uint32')
68 |     tiledb_dim_tasks=tiledb.Dim(
69 |         name='task',
70 |         domain=(0,size[1]),
71 |         tile=task_tile_size,
72 |         dtype='uint32')
73 |     tiledb_dom = tiledb.Domain(tiledb_dim_coords,tiledb_dim_tasks,ctx=tdb_Context)
74 | 
75 |     #generate the attribute information
76 |     attribute_info=get_attribute_info(attribute_config)
77 |     attribs=[]
78 |     for key in attribute_info:
79 |         attribs.append(tiledb.Attr(
80 |             name=key,
81 |             var=var,
82 |             filters=tiledb.FilterList([tiledb.GzipFilter()]),
83 |             dtype=attribute_info[key]['dtype']))
84 | 
85 |     tiledb_schema = tiledb.ArraySchema(
86 |         domain=tiledb_dom,
87 |         attrs=tuple(attribs),
88 |         cell_order='row-major',
89 |         tile_order='row-major')
90 | 
91 |     tiledb.DenseArray.create(array_out_name, tiledb_schema)
92 |     print("created empty array on disk")
93 |     return
94 | 
95 | 
96 | 
97 | def extract_metadata_field(row,field):
98 | 
dataset=row['dataset'] 99 | try: 100 | return row[field] 101 | except: 102 | print("tiledb_metadata has no column "+field+" for dataset:"+str(dataset)) 103 | return None 104 | 105 | def open_data_for_parsing(row,attribute_info): 106 | try: 107 | data_dict={} 108 | cols=list(row.index) 109 | if 'dataset' in cols: 110 | cols.remove('dataset') 111 | for col in cols: 112 | cur_fname=extract_metadata_field(row,col) 113 | if cur_fname is not None: 114 | data_dict[col]=attribute_info[col]['opener'](cur_fname) 115 | return data_dict 116 | except Exception as e: 117 | print(repr(e)) 118 | raise e 119 | 120 | def get_subdict(full_dict,start,end): 121 | subdict=dict() 122 | for key in full_dict: 123 | subdict[key]=full_dict[key][start:end] 124 | print(subdict.keys()) 125 | return subdict 126 | 127 | def ingest_single_threaded(args): 128 | if type(args)==type({}): 129 | args=args_object_from_args_dict(args) 130 | 131 | #config 132 | tdb_Config=tiledb.Config(tdb_config_params) 133 | tdb_write_Context=tiledb.Ctx(config=tdb_Config) 134 | tdb_read_Context=tiledb.Ctx(config=tdb_Config) 135 | 136 | overwrite=args.overwrite 137 | coord_tile_size=args.coord_tile_size 138 | task_tile_size=args.task_tile_size 139 | attribute_config=args.attribute_config 140 | updating=False 141 | 142 | attribute_info=get_attribute_info(args.attribute_config) 143 | tiledb_metadata=pd.read_csv(args.tiledb_metadata,header=0,sep='\t') 144 | num_tasks=tiledb_metadata.shape[0] 145 | 146 | print("loaded tiledb metadata") 147 | chrom_sizes=pd.read_csv(args.chrom_sizes,header=None,sep='\t') 148 | print("loaded chrom sizes") 149 | chrom_indices,num_indices=transform_chrom_size_to_indices(chrom_sizes) 150 | print("num_indices:"+str(num_indices)) 151 | array_out_name=args.tiledb_group 152 | if tiledb.object_type(array_out_name) == "array": 153 | if overwrite==False: 154 | raise Exception("array:"+str(array_out_name) + "already exists; use the --overwrite flag to overwrite it. Exiting") 155 | else: 156 | print("warning: the array: "+str(array_out_name)+" already exists. 
You provided the --overwrite flag, so it will be updated/overwritten")
157 |             updating=True
158 |     else:
159 |         #create the array:
160 |         create_new_array(tdb_Context=tdb_write_Context,
161 |                          size=(num_indices,num_tasks),
162 |                          attribute_config=attribute_config,
163 |                          array_out_name=array_out_name,
164 |                          coord_tile_size=coord_tile_size,
165 |                          task_tile_size=task_tile_size,
166 |                          var=False)
167 |         print("created new array:"+str(array_out_name))
168 |     #create metadata array
169 |     metadata_dict={}
170 |     metadata_dict['tasks']=[i for i in tiledb_metadata['dataset']]
171 |     metadata_dict['chroms']=[i for i in chrom_indices.keys()]
172 |     metadata_dict['sizes']=[i[2] for i in list(chrom_indices.values())]
173 |     metadata_dict['offsets']=[i[0] for i in list(chrom_indices.values())]
174 |     num_tasks=tiledb_metadata['dataset'].shape[0]
175 |     num_chroms=len(chrom_indices.keys())
176 |     with tiledb.DenseArray(array_out_name,ctx=tdb_write_Context,mode='w') as cur_array:
177 |         cur_array.meta['num_tasks']=num_tasks
178 |         cur_array.meta['num_chroms']=num_chroms
179 |         for task_index in range(num_tasks):
180 |             cur_array.meta['_'.join(['task',str(task_index)])]=metadata_dict['tasks'][task_index]
181 |         for chrom_index in range(num_chroms):
182 |             cur_array.meta['_'.join(['chrom',str(chrom_index)])]=metadata_dict['chroms'][chrom_index]
183 |             cur_array.meta['_'.join(['size',str(chrom_index)])]=metadata_dict['sizes'][chrom_index]
184 |             cur_array.meta['_'.join(['offset',str(chrom_index)])]=metadata_dict['offsets'][chrom_index]
185 |     print("created tiledb metadata")
186 |     if updating is True:
187 |         cur_array_toread=tiledb.DenseArray(array_out_name,ctx=tdb_read_Context,mode='r')
188 |     else:
189 |         cur_array_toread=None
190 |     cur_array_towrite=tiledb.DenseArray(array_out_name,ctx=tdb_write_Context,mode='w')
191 |     for task_index,task_row in tiledb_metadata.iterrows():
192 |         dataset=task_row['dataset']
193 |         print(dataset)
194 |         #read in filenames for bigwigs
195 |         data_dict=open_data_for_parsing(task_row,attribute_info)
196 |         for start_chunk_index in range(0,num_indices,args.write_chunk):
197 |             print(str(start_chunk_index)+'/'+str(num_indices))
198 |             end_chunk_index=min([num_indices,start_chunk_index+args.write_chunk])
199 |             print("end chunk index:"+str(end_chunk_index))
200 |             #convert global indices to chrom+pos indices
201 |             chunk_chrom_coords=transform_indices_to_chrom_coords(start_chunk_index,end_chunk_index,chrom_indices)
202 |             print("processing:"+str(chunk_chrom_coords))
203 |             for coord_set in chunk_chrom_coords:
204 |                 print("\t"+"coord_set:"+str(coord_set))
205 |                 process_chunk(task_index,data_dict,attribute_info,coord_set,updating,args,cur_array_toread,cur_array_towrite)
206 |                 print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
207 |             print("wrote chrom array for task:"+str(dataset)+" for index:"+str(start_chunk_index))
208 |     print("closing arrays")
209 |     if cur_array_toread is not None:
210 |         cur_array_toread.close()
211 |     cur_array_towrite.close()
212 |     print('done!')
213 | 
214 | def process_chunk(task_index, data_dict, attribute_info, coord_set, updating, args, cur_array_toread, cur_array_towrite):
215 |     attribute_config=args.attribute_config
216 |     dict_to_write=OrderedDict()
217 |     chrom=coord_set[0]
218 |     start_pos=coord_set[1]
219 |     end_pos=coord_set[2]
220 |     start_index=coord_set[3]
221 |     end_index=coord_set[4]
222 |     for attribute in data_dict:
223 |         cur_parser=attribute_info[attribute]['parser']
224 | 
cur_vals=cur_parser([data_dict[attribute],chrom,start_pos,end_pos,attribute_info[attribute]])
225 |         dict_to_write[attribute]=cur_vals[-1] #the last entry in the tuple is the actual numpy array of values; the first entries store start and end blocks
226 |         print("got:"+str(attribute)+" for task "+str(task_index)+" for "+str(chrom)+":"+str(start_pos)+"-"+str(end_pos))
227 | 
228 |     if updating is True:
229 |         #we are only updating some attributes in the array
230 |         cur_vals=cur_array_toread[start_index:end_index,task_index]
231 |         print("got cur vals for task "+str(task_index)+" for "+str(chrom)+":"+str(start_pos)+"-"+str(end_pos))
232 |         for key in dict_to_write:
233 |             cur_vals[key]=dict_to_write[key]
234 |         dict_to_write=cur_vals
235 |         print("updated data dict for writing:"+args.tiledb_group)
236 |     else:
237 |         #we are writing for the first time, make sure all attributes are provided, if some are not, use a nan array
238 |         required_attrib=list(get_attribute_info(attribute_config).keys())
239 |         for attrib in required_attrib:
240 |             if attrib not in dict_to_write:
241 |                 dict_to_write[attrib]=np.full(end_pos-start_pos,np.nan)
242 | 
243 |     #write in chunks
244 |     cur_array_towrite[start_index:end_index,task_index]=dict_to_write
245 |     print("wrote to disk "+str(task_index)+" for "+str(chrom)+":"+str(start_pos)+"-"+str(end_pos))
246 |     gc.collect()
247 | 
248 | def main():
249 |     args=parse_args()
250 |     ingest_single_threaded(args)
251 | 
252 | if __name__=="__main__":
253 |     main()
254 | 
255 | 
256 | 
--------------------------------------------------------------------------------
/seqdataloader/labelgen/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import division, print_function, absolute_import
2 | import argparse
3 | from pybedtools import BedTool
4 | import pyBigWig
5 | import pandas as pd
6 | import numpy as np
7 | import pdb
8 | import csv
9 | import sys
10 | from .classification_label_protocols import *
11 | from .regression_label_protocols import *
12 | import gzip
13 | import os
14 | import multiprocessing #needed for set_start_method in the __main__ block below
15 | from concurrent.futures import *
16 | #graceful shutdown
17 | import psutil
18 | import signal
19 | import gc
20 | import string
21 | import random
22 | import pickle
23 | 
24 | #Approaches to determining classification labels
25 | #Others can be added here (imported from classification_label_protocols)
26 | labeling_approaches={
27 |     "peak_summit_in_bin_classification":peak_summit_in_bin_classification,
28 |     "peak_percent_overlap_with_bin_classification":peak_percent_overlap_with_bin_classification,
29 |     "peak_summit_in_bin_regression":peak_summit_in_bin_regression,
30 |     "peak_percent_overlap_with_bin_regression":peak_percent_overlap_with_bin_regression,
31 |     "all_genome_bins_regression":all_genome_bins_regression
32 |     }
33 | 
34 | def randomString(stringLength=16):
35 |     """Generate a random string of fixed length """
36 |     letters = string.ascii_lowercase
37 |     return ''.join(random.choice(letters) for i in range(stringLength))
38 | 
39 | def init_worker():
40 |     signal.signal(signal.SIGINT, signal.SIG_IGN)
41 | 
42 | def kill_child_processes(parent_pid, sig=signal.SIGTERM):
43 |     try:
44 |         parent = psutil.Process(parent_pid)
45 |     except psutil.NoSuchProcess:
46 |         return
47 |     children = parent.children(recursive=True)
48 |     for process in children:
49 |         process.send_signal(sig)
50 | 
51 | def add_filename_prefix(fname,prefix):
52 |     splits=fname.split('/')
53 |     if len(splits)==1:
54 |         #local path
55 |         return 
prefix+'.'+fname
56 |     else:
57 |         cur_dir='/'.join(splits[0:-1])
58 |         cur_fname=splits[-1]
59 |         modified_fname=prefix+'.'+cur_fname
60 |         return '/'.join([cur_dir,modified_fname])
61 | 
62 | def parse_args():
63 |     parser=argparse.ArgumentParser(description="Generate genome-wide labeled bins for a set of narrowPeak task files")
64 |     parser.add_argument("--task_list",help="this is a tab-separated file with the name of the task in the first column, the path to the corresponding narrowPeak(.gz) file in the second column (optionally), and the path to the corresponding bigWig file in the third column (optionally, for regression)")
65 |     parser.add_argument("--task_list_sep",default='\t')
66 |     parser.add_argument("--outf",help="output filename that labeled bed file will be saved to.")
67 |     parser.add_argument("--output_type",choices=['gzip','hdf5','pkl','bz2'],default='gzip',help="format to save output, one of gzip, hdf5, pkl, bz2")
68 |     parser.add_argument("--split_output_by_chrom",action="store_true",default=False)
69 |     parser.add_argument("--split_output_by_task",action="store_true",default=False,help="creates a separate output file for each task's labels")
70 |     parser.add_argument("--chrom_sizes",help="chromsizes file for the reference genome. First column is chrom name; second column is chrom size")
71 |     parser.add_argument("--chroms_to_keep",nargs="+",default=None,help="list of chromosomes, as defined in the --chrom_sizes file, to include in label generation. All chromosomes will be used if this argument is not provided. This is most useful if generating a train/test/validate split for deep learning models")
72 |     parser.add_argument("--chroms_to_exclude",nargs="+",default=None,help="list of chromosomes, as defined in the --chrom_sizes file, to exclude in label generation. No chromosomes will be excluded if this argument is not provided. This is most useful if generating a train/test/validate split for deep learning models")
73 |     parser.add_argument("--bin_stride",type=int,default=50,help="bin_stride to shift adjacent bins by")
74 |     parser.add_argument("--left_flank",type=int,default=400,help="left flank")
75 |     parser.add_argument("--right_flank",type=int,default=400,help="right flank")
76 |     parser.add_argument("--bin_size",type=int,default=200,help="flank around bin center where peak summit falls in a positive bin")
77 | 
78 |     parser.add_argument("--task_threads",type=int,default=1,help="Number of tasks to process for a given chromosome.")
79 |     parser.add_argument("--chrom_threads",type=int,default=4,help="Number of chromosomes to process at once.")
80 |     parser.add_argument("--bigwig_stats",choices=['mean','min','max','coverage','std'],default='mean',help="Value to extract from bigwig file")
81 |     parser.add_argument("--overlap_thresh",type=float,default=0.5,help="minimum percent of bin that must overlap a peak for a positive label")
82 |     parser.add_argument("--allow_ambiguous",default=False,action="store_true")
83 |     parser.add_argument("--store_positives_only",default=False,action="store_true")
84 |     parser.add_argument("--store_values_above_thresh",default=None,type=float,help="for the regression case, determine the minimum row value to include in the output data frame (i.e., 
remove bins that are 0 for all tasks by setting this to 0") 85 | parser.add_argument("--labeling_approach",choices=["peak_summit_in_bin_classification", 86 | "peak_percent_overlap_with_bin_classification", 87 | "peak_summit_in_bin_regression", 88 | "peak_percent_overlap_with_bin_regression", 89 | "all_genome_bins_regression"]) 90 | parser.add_argument("--label_transformer",default="asinh",help="type of transformation to apply to the labels; one of None, asinh, log10, log") 91 | parser.add_argument("--label_transformer_pseudocount",type=float,default=0.001,help="pseudocount to add to values if using log10 or log label transformations") 92 | parser.add_argument("--temp_dir",default="/tmp") 93 | parser.add_argument("--save_label_source",default=False,action='store_true',help='a separate dataframe is created that stores the source file, peak region, and (if available) peak name for each genome bin, or NA') 94 | if len(sys.argv)==1: 95 | parser.print_help(sys.stderr) 96 | sys.exit(1) 97 | return parser.parse_args() 98 | 99 | def get_labels_one_task(inputs): 100 | #unravel the inputs 101 | task_name=inputs[0] 102 | task_bed=inputs[1] 103 | task_bigwig=inputs[2] 104 | task_ambig=inputs[3] 105 | chrom=inputs[4] 106 | first_coord=inputs[5] 107 | final_coord=inputs[6] 108 | args=inputs[7] 109 | #determine the appropriate labeling approach 110 | print("in get_labels_one_task") 111 | return labeling_approaches[args.labeling_approach](task_name,task_bed,task_bigwig,task_ambig,chrom,first_coord,final_coord,args) 112 | 113 | def get_chrom_labels(inputs): 114 | #print(inputs) 115 | #unravel inputs 116 | chrom=inputs[0] 117 | chrom_size=inputs[1] 118 | bed_and_bigwig_dict=inputs[2] 119 | tasks=inputs[3] 120 | args=inputs[4] 121 | #pre-allocate a pandas data frame to store bin labels for the current chromosome. 
Fill with zeros
122 |     #determine the index tuple values
123 |     try:
124 |         chroms,all_start_pos,all_end_pos,first_bin_start,final_bin_start=get_indices(chrom,chrom_size,args)
125 |     except:
126 |         return (chrom,None,None)
127 |     columns=['CHR','START','END']+list(tasks['task'])
128 |     num_entries=len(chroms.values)
129 |     chrom_df = pd.DataFrame(0,index=np.arange(num_entries),columns=columns)
130 |     chrom_df['CHR']=chroms.values
131 |     chrom_df['START']=all_start_pos.values
132 |     chrom_df['END']=all_end_pos.values
133 |     if args.save_label_source is True:
134 |         chrom_label_source_dict={}
135 |     print("pre-allocated df for chrom:"+str(chrom)+" with dimensions:"+str(chrom_df.shape))
136 | 
137 |     #create a thread pool to label bins, each task gets assigned a thread
138 |     pool_inputs=[]
139 |     for task_name in bed_and_bigwig_dict:
140 |         task_bed=bed_and_bigwig_dict[task_name]['bed']
141 |         task_bigwig=bed_and_bigwig_dict[task_name]['bigwig']
142 |         task_ambig=bed_and_bigwig_dict[task_name]['ambig']
143 |         pool_inputs.append((task_name,task_bed,task_bigwig,task_ambig,chrom,first_bin_start,final_bin_start,args))
144 |     try:
145 |         with ProcessPoolExecutor(max_workers=args.task_threads,initializer=init_worker) as pool:
146 |             bin_values=pool.map(get_labels_one_task,pool_inputs)
147 |             pool.shutdown(wait=True)
148 |     except KeyboardInterrupt:
149 |         print('detected keyboard interrupt')
150 |         #shutdown the pool
151 |         pool.shutdown(wait=False)
152 |         # Kill remaining child processes
153 |         kill_child_processes(os.getpid())
154 |         raise
155 |     except Exception as e:
156 |         print(repr(e))
157 |         #shutdown the pool
158 |         pool.shutdown(wait=False)
159 |         # Kill remaining child processes
160 |         kill_child_processes(os.getpid())
161 |         raise e
162 | 
163 |     for task_name,task_labels,label_source_dict in bin_values:
164 |         if task_labels is None:
165 |             continue
166 |         chrom_df[task_name]=task_labels
167 |         if args.save_label_source is True:
168 |             chrom_label_source_dict.update(label_source_dict)
169 | 
170 |     #convert label source dictionary to dataframe
171 |     if args.save_label_source is True:
172 |         chrom_label_source_df=pd.DataFrame.from_dict(chrom_label_source_dict,orient='index')
173 |         cols=list(chrom_label_source_df.columns)
174 |         chrom_label_source_df['CHR']=chrom_df['CHR'][chrom_label_source_df.index]
175 |         chrom_label_source_df['START']=chrom_df['START'][chrom_label_source_df.index]
176 |         chrom_label_source_df['END']=chrom_df['END'][chrom_label_source_df.index]
177 |         #reorder so that chr,start,end are at the front, sort by bin start position
178 |         ordered_cols=['CHR','START','END']+cols
179 |         chrom_label_source_df=chrom_label_source_df[ordered_cols].sort_values(by='START')
180 | 
181 |     else:
182 |         chrom_label_source_df=None
183 |     if args.split_output_by_chrom==True:
184 |         outf=add_filename_prefix(args.outf,chrom)
185 |         if args.output_type in ["gzip","bz2"]:
186 |             chrom_df.to_csv(outf,sep='\t',float_format="%.2f",header=True,index=False,mode='wb',compression=args.output_type,chunksize=1000000)
187 |         elif args.output_type == "hdf5":
188 |             chrom_df=chrom_df.set_index(['CHR','START','END'])
189 |             chrom_df.to_hdf(args.outf+"."+chrom,key="data",mode='w', append=True, format='table',min_itemsize=30)
190 |         if args.save_label_source is True:
191 |             outf_labels=add_filename_prefix(args.outf,'label_source.'+chrom)
192 |             if args.output_type in ["gzip","bz2"]:
193 |                 chrom_label_source_df.to_csv(outf_labels,sep='\t',float_format="%.2f",header=True,index=False,mode='wb',compression=args.output_type,chunksize=1000000)
194 |             elif args.output_type=="hdf5":
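                #sketch: hdf5 outputs written with key="data" below can be read back
                #with pandas, e.g. pd.read_hdf(outf_labels, key="data")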
195 | chrom_label_source_df=chrom_label_source_df.set_index(['CHR','START','END']) 196 | chrom_label_source_df.to_hdf(outf_labels,key='data',mode='w',append=True,format='table',min_itemsize=30) 197 | return (chrom, None, None) 198 | else: 199 | #dump to tmp file -- needed to avoid passing very large objects between processes 200 | pickle_name=randomString() 201 | pickle_path='/'.join([args.temp_dir,pickle_name]) 202 | print("dumping chrom outputs to pickle:"+pickle_path) 203 | with open(pickle_path,'wb') as f: 204 | pickle.dump(chrom_df,f) 205 | return (chrom,pickle_path,chrom_label_source_df) 206 | 207 | 208 | def get_bed_and_bigwig_dict(tasks): 209 | print("creating dictionary of bed files and bigwig files for each task:") 210 | bed_and_bigwig_dict=dict() 211 | for index,row in tasks.iterrows(): 212 | task_name=row['task'] 213 | print(task_name) 214 | bed_and_bigwig_dict[task_name]=dict() 215 | 216 | #get the peak file associated with the task (if provided) 217 | if "narrowPeak" not in row: 218 | task_bed=None 219 | else: 220 | print(row['narrowPeak']) 221 | assert os.path.exists(row["narrowPeak"]) 222 | task_bed=row['narrowPeak'] 223 | bed_and_bigwig_dict[task_name]['bed']=task_bed 224 | 225 | #get the BigWig file associated with the task (if provided) 226 | if "bigwig" not in row: 227 | task_bigwig=None 228 | else: 229 | print(row['bigwig']) 230 | assert os.path.exists(row["bigwig"]) 231 | task_bigwig=row['bigwig'] 232 | bed_and_bigwig_dict[task_name]['bigwig']=task_bigwig 233 | 234 | #get the ambiguous peaks 235 | if "ambig" not in row: 236 | ambig_bed=None 237 | else: 238 | assert os.path.exists(row["ambig"]) 239 | ambig_bed=row['ambig'] 240 | bed_and_bigwig_dict[task_name]['ambig']=ambig_bed 241 | 242 | return bed_and_bigwig_dict 243 | 244 | def get_indices(chrom,chrom_size,args): 245 | print("getting indices") 246 | final_bin_start=((chrom_size-args.right_flank-args.bin_size)//args.bin_stride)*args.bin_stride 247 | #final_coord=(chrom_size//args.bin_stride)*args.bin_stride 248 | first_bin_start=args.left_flank 249 | if final_bin_start<=first_bin_start: 250 | print("the chromosome"+chrom+" is too short for the specified settings of --left_flank, --right_flank, --bin_size, skipping") 251 | return None 252 | chroms=[] 253 | start_pos=[] 254 | end_pos=[] 255 | for index in range(first_bin_start,final_bin_start+1,args.bin_stride): 256 | chroms.append(chrom) 257 | start_pos.append(index-args.left_flank) 258 | end_pos.append(index+args.bin_size+args.right_flank) 259 | return pd.Series(chroms),pd.Series(start_pos),pd.Series(end_pos),first_bin_start,final_bin_start 260 | 261 | 262 | def write_output(task_names,full_df,first_chrom,args,mode='w',task_split_engaged=False,outf=None,labels=False): 263 | ''' 264 | Save genome-wide labels to disk in gzip, hdf5, or pkl format 265 | ''' 266 | 267 | if (args.split_output_by_task==True) and (task_split_engaged==False) : 268 | for task in task_names: 269 | task_df=full_df[['CHR','START','END',task]] 270 | cur_outf=add_filename_prefix(args.outf,task.replace('/','.')) 271 | write_output([task],task_df,first_chrom,args,mode=mode,task_split_engaged=True,outf=cur_outf) 272 | return 273 | if outf==None: 274 | outf=args.outf 275 | if labels==True: 276 | outf=add_filename_prefix(outf,'label_source') 277 | all_negative_df=None 278 | if (args.store_positives_only==True) and (labels==False): 279 | #find regions with at least one positive entry per task 280 | all_negative_df=full_df[['CHR','START','END']][(full_df[task_names]<=0).all(1)] 281 | 
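#rows that are non-positive for every task were collected into all_negative_df
        #above and are written to a separate "universal_negatives" file below; only
        #rows with at least one positive label are kept in the main output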
full_df=full_df[(full_df[task_names]>0).any(1)]
282 |     if (args.store_values_above_thresh is not None) and (labels==False):
283 |         all_negative_df=full_df[['CHR','START','END']][(full_df[task_names]<=args.store_values_above_thresh).all(1)]
284 |         full_df=full_df[(full_df[task_names]>args.store_values_above_thresh).any(1)]
285 | 
286 |     #determine if header needs to be stored
287 |     if first_chrom is True:
288 |         header=True
289 |     else:
290 |         header=False
291 | 
292 |     #get the universal negatives file name
293 |     if all_negative_df is not None:
294 |         if outf.startswith('/'):
295 |             basename_outf=outf.split('/')[-1]
296 |             prefix_outf='/'.join(outf.split('/')[0:-1])
297 |             universal_negatives_outf='.'.join(['/'.join([prefix_outf,"universal_negatives"]),basename_outf])
298 |         else:
299 |             universal_negatives_outf='.'.join([outf,"universal_negatives"])
300 |     if args.output_type=="gzip":
301 |         try:
302 |             full_df.to_csv(outf,sep='\t',header=header,index=False,mode=mode+'b',compression='gzip',chunksize=1000000)
303 |             if all_negative_df is not None:
304 |                 all_negative_df.to_csv(universal_negatives_outf,sep='\t',header=header,index=False,mode=mode+'b',compression='gzip',chunksize=1000000)
305 |         except:
306 |             print("warning! some chromosomes in your file are too small to produce values, skipping")
307 |             pass
308 | 
309 |     elif args.output_type=="bz2":
310 |         try:
311 |             full_df.to_csv(outf,sep='\t',header=header,index=False,mode=mode+'b',compression='bz2',chunksize=1000000)
312 |             if all_negative_df is not None:
313 |                 all_negative_df.to_csv(universal_negatives_outf,sep='\t',header=header,index=False,mode=mode+'b',compression='bz2',chunksize=1000000)
314 |         except:
315 |             print("warning! some chromosomes in your file are too small to produce values, skipping")
316 |             pass
317 |     elif args.output_type=="hdf5":
318 |         full_df=full_df.set_index(['CHR','START','END'])
319 |         if mode=='w':
320 |             append=False
321 |         else:
322 |             append=True
323 |         try:
324 |             full_df.to_hdf(outf,key="data",mode=mode, append=append, format='table',min_itemsize=30)
325 |             if all_negative_df is not None:
326 |                 all_negative_df=all_negative_df.set_index(['CHR','START','END']) #set_index returns a new frame, so the result must be assigned
327 |                 all_negative_df.to_hdf(universal_negatives_outf,key="data",mode=mode, append=append, format='table',min_itemsize=30)
328 |         except:
329 |             print("warning! some chromosomes in your file are too small to produce values, skipping")
330 |             pass
331 |     elif args.output_type=="pkl":
332 |         full_df=full_df.set_index(['CHR','START','END'])
333 |         try:
334 |             full_df.to_pickle(outf,compression="gzip")
335 |             if all_negative_df is not None:
336 |                 all_negative_df=all_negative_df.set_index(['CHR','START','END']) #set_index returns a new frame, so the result must be assigned
337 |                 all_negative_df.to_pickle(universal_negatives_outf,compression="gzip")
338 |         except:
339 |             print("warning! 
some chromosomes in your file are too small to produce values, skipping") 340 | pass 341 | def args_object_from_args_dict(args_dict): 342 | #create an argparse.Namespace from the dictionary of inputs 343 | args_object=argparse.Namespace() 344 | #set the defaults 345 | vars(args_object)['split_output_by_chrom']=False 346 | vars(args_object)['split_output_by_task']=False 347 | vars(args_object)['chroms_to_keep']=None 348 | vars(args_object)['chroms_to_exclude']=None 349 | vars(args_object)['bin_stride']=50 350 | vars(args_object)['left_flank']=400 351 | vars(args_object)['right_flank']=400 352 | vars(args_object)['bin_size']=200 353 | vars(args_object)['chrom_threads']=4 354 | vars(args_object)['task_threads']=1 355 | vars(args_object)['overlap_thresh']=0.5 356 | vars(args_object)['allow_ambiguous']=True 357 | vars(args_object)['store_positives_only']=False 358 | vars(args_object)['store_values_above_thresh']=None 359 | vars(args_object)['output_hdf5_low_mem']=False 360 | vars(args_object)['task_list_sep']='\t' 361 | vars(args_object)['bigwig_stats']='mean' 362 | vars(args_object)['label_transformer']='asinh' 363 | vars(args_object)['label_transformer_pseudocount']=0.001 364 | vars(args_object)['temp_dir']='/tmp' 365 | vars(args_object)['save_label_source']=False 366 | for key in args_dict: 367 | vars(args_object)[key]=args_dict[key] 368 | #set any defaults that are unset 369 | args=args_object 370 | return args 371 | 372 | def genomewide_labels(args): 373 | if type(args)==type({}): 374 | args=args_object_from_args_dict(args) 375 | 376 | #read in the metadata file with: 377 | #task names in column 1, 378 | #path to peak file in column 2, 379 | #path to bigWig file in column 3 380 | #path to ambiguous peaks in column 4 (bed) 381 | tasks=pd.read_csv(args.task_list,sep=args.task_list_sep,header=0) 382 | bed_and_bigwig_dict=get_bed_and_bigwig_dict(tasks) 383 | chrom_sizes=pd.read_csv(args.chrom_sizes,sep='\t',header=None) 384 | 385 | processed_first_chrom=False 386 | #create a Pool to process chromosomes in parallel 387 | pool_args=[] 388 | chrom_order=[] 389 | for index,row in chrom_sizes.iterrows(): 390 | chrom=row[0] 391 | 392 | #determine whether this chromosome should be included in the label file 393 | if args.chroms_to_keep!=None: 394 | if chrom not in args.chroms_to_keep: 395 | continue 396 | if args.chroms_to_exclude!=None: 397 | if chrom in args.chroms_to_exclude: 398 | continue 399 | chrom_order.append(chrom) 400 | chrom_size=row[1] 401 | pool_args.append((chrom,chrom_size,bed_and_bigwig_dict,tasks,args)) 402 | print("creating chromosome thread pool") 403 | try: 404 | #with ThreadPool(args.chrom_threads) as pool: 405 | with ProcessPoolExecutor(max_workers=args.chrom_threads,initializer=init_worker) as pool: 406 | processed_chrom_outputs=pool.map(get_chrom_labels,pool_args) 407 | pool.shutdown(wait=True) 408 | 409 | except KeyboardInterrupt: 410 | print('detected keyboard interrupt') 411 | #shutdown the pool 412 | pool.shutdown(wait=False) 413 | # Kill remaining child processes 414 | kill_child_processes(os.getpid()) 415 | raise 416 | except Exception as e: 417 | print(repr(e)) 418 | #shutdown the pool 419 | pool.shutdown(wait=False) 420 | # Kill remaining child processes 421 | kill_child_processes(os.getpid()) 422 | raise e 423 | 424 | #if the user is happy with separate files for each chromosome, these have already been written to disk. 
We are done
425 |     if args.split_output_by_chrom==True:
426 |         return
427 |     mode='w'
428 |     first_chrom=True
429 |     for chrom, pickle_path,chrom_label_source_df in processed_chrom_outputs:
430 |         #write to output file!
431 |         if pickle_path is None:
432 |             continue
433 |         print("loading temp file with chromosome data:")
434 |         with open(pickle_path,'rb') as f:
435 |             chrom_df=pickle.load(f)
436 |         print("writing output chromosomes:"+str(chrom))
437 |         if chrom_label_source_df is not None:
438 |             write_output(tasks['task'],chrom_label_source_df,first_chrom,args,mode=mode,labels=True)
439 |         write_output(tasks['task'],chrom_df,first_chrom,args,mode=mode)
440 |         #delete the temp file
441 |         os.remove(pickle_path)
442 |         first_chrom=False
443 |         mode='a'
444 |     print("done!")
445 | 
446 | def main():
447 |     args=parse_args()
448 |     genomewide_labels(args)
449 | 
450 | if __name__=="__main__":
451 |     try:
452 |         multiprocessing.set_start_method('forkserver')
453 |     except RuntimeError:
454 |         print("context already set")
455 |     main()
456 | 
--------------------------------------------------------------------------------
/seqdataloader/labelgen/classification_label_protocols.py:
--------------------------------------------------------------------------------
1 | from math import floor,ceil
2 | import pandas as pd
3 | from multiprocessing.pool import ThreadPool
4 | from .utils import rolling_window
5 | import pdb
6 | import numpy as np
7 | from pybedtools import BedTool
8 | 
9 | 
10 | def peak_summit_in_bin_classification(task_name,task_bed,task_bigwig,task_ambig,chrom,first_bin_start,final_bin_start,args):
11 |     '''
12 |     For each peak, the summit position is determined.
13 | 
14 |     The minimum bin considered is args.bin_size upstream of the summit;
15 |     the maximum bin considered is args.bin_size downstream of the summit.
16 | 
17 |     Within this range, bin centers are shifted by args.bin_stride.
18 | 
19 |     If args.allow_ambiguous is specified, the bins adjacent to the two
20 |     extremes are marked as ambiguous.
21 |     '''
22 |     #get the peaks for the current chromosome by intersecting the task_bed with the chromosome coordinates
23 |     task_bed=BedTool(task_bed)
24 |     if task_ambig is not None:
25 |         task_ambig=BedTool(task_ambig)
26 |     min_chrom_coord=first_bin_start
27 |     max_chrom_coord=final_bin_start
28 |     if min_chrom_coord >= max_chrom_coord:
29 |         print("the chromosome "+chrom+" is too short for the specified settings of --left_flank, --right_flank, --bin_size, skipping")
30 |         return task_name,None,None
31 |     chrom_coords=chrom+'\t'+str(min_chrom_coord)+'\t'+str(max_chrom_coord)
32 |     chrom_bed=BedTool(chrom_coords,from_string=True)
33 |     chrom_task_bed=task_bed.intersect(chrom_bed)
34 |     chrom_ambig_bed=None
35 |     if ((args.allow_ambiguous==True) and (task_ambig!=None)):
36 |         chrom_ambig_bed=task_ambig.intersect(chrom_bed)
37 |     print("got peak subset for chrom:"+str(chrom)+" for task:"+str(task_name))
38 | 
39 |     #pre-allocate a numpy array of 0's
40 |     num_bins=(final_bin_start-first_bin_start)//args.bin_stride+1
41 |     coverage_vals=np.zeros(num_bins)
42 |     if args.save_label_source is True:
43 |         label_source_dict=dict()
44 |     else:
45 |         label_source_dict=None
46 | 
47 |     for entry in chrom_task_bed:
48 |         chrom=entry[0]
49 |         peak_start=int(entry[1])
50 |         peak_end=int(entry[2])
51 |         summit=peak_start+int(entry[-1])
52 | 
53 |         chromosome_min_bin_index=ceil((summit-args.bin_size-first_bin_start)/args.bin_stride)
54 |         min_bin_start=chromosome_min_bin_index*args.bin_stride
55 | 
chromosome_max_bin_index=floor((summit-first_bin_start)/args.bin_stride)
56 |         max_bin_start=chromosome_max_bin_index*args.bin_stride
57 | 
58 |         #mark each bin in the range computed above as positive
59 |         index_coverage_vals=chromosome_min_bin_index
60 |         for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
61 |             if index_coverage_vals >= 0 and index_coverage_vals <= (num_bins - 1):
62 |                 coverage_vals[index_coverage_vals]=1
63 |                 if args.save_label_source is True:
64 |                     label_source_dict[index_coverage_vals]={}
65 |                     label_source_dict[index_coverage_vals][task_name+".CHR"]=chrom
66 |                     label_source_dict[index_coverage_vals][task_name+".START"]=peak_start
67 |                     label_source_dict[index_coverage_vals][task_name+".END"]=peak_end
68 |             index_coverage_vals+=1
69 | 
70 |         #if allow_ambiguous supplied by user, shift 1 bin left and 1 bin right
71 |         if args.allow_ambiguous==True:
72 |             chromosome_min_bin_index-=1
73 |             if chromosome_min_bin_index > 0 and chromosome_min_bin_index <= (num_bins - 1):
74 |                 coverage_vals[chromosome_min_bin_index]=np.nan
75 |                 if (args.save_label_source is True) and (chromosome_min_bin_index in label_source_dict):
76 |                     del label_source_dict[chromosome_min_bin_index]
77 |             chromosome_max_bin_index+=1
78 |             if chromosome_max_bin_index >= 0 and chromosome_max_bin_index < (num_bins - 1):
79 |                 coverage_vals[chromosome_max_bin_index]=np.nan
80 |                 if (args.save_label_source is True) and (chromosome_max_bin_index in label_source_dict):
81 |                     del label_source_dict[chromosome_max_bin_index]
82 | 
83 |     #if a bed file of ambiguous labels is specified, label the overlapping bins with np.nan
84 |     if ((args.allow_ambiguous==True) and (chrom_ambig_bed!=None)):
85 |         for entry in chrom_ambig_bed:
86 |             chrom=entry[0]
87 |             peak_start=int(entry[1])
88 |             peak_end=int(entry[2])
89 |             summit=peak_start+int(entry[-1])
90 | 
91 |             chromosome_min_bin_index=ceil((summit-args.bin_size-first_bin_start)/args.bin_stride)
92 |             min_bin_start=chromosome_min_bin_index*args.bin_stride
93 |             chromosome_max_bin_index=floor((summit-first_bin_start)/args.bin_stride)
94 |             max_bin_start=chromosome_max_bin_index*args.bin_stride
95 | 
96 |             #mark each bin in the range computed above as ambiguous
97 |             index_coverage_vals=chromosome_min_bin_index
98 |             for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
99 |                 if index_coverage_vals >= 0 and index_coverage_vals <= (num_bins - 1):
100 |                     coverage_vals[index_coverage_vals]=np.nan
101 |                     if args.save_label_source is True:
102 |                         if index_coverage_vals in label_source_dict:
103 |                             del label_source_dict[index_coverage_vals]
104 |                 index_coverage_vals+=1
105 | 
106 | 
107 |     print("finished chromosome:"+str(chrom)+" for task:"+str(task_name))
108 |     return task_name,coverage_vals,label_source_dict
109 | 
110 | def peak_percent_overlap_with_bin_classification(task_name,task_bed,task_bigwig,task_ambig,chrom,first_bin_start,final_bin_start,args):
111 |     '''
112 |     At least args.overlap_thresh (50% by default) of the bin must overlap with the peak for a positive label
113 |     '''
114 |     #get the peaks for the current chromosome by intersecting the task_bed with the chromosome coordinates
115 |     task_bed=BedTool(task_bed)
116 |     if task_ambig is not None:
117 |         task_ambig=BedTool(task_ambig)
118 |     min_chrom_coord=first_bin_start
119 |     max_chrom_coord=final_bin_start
120 |     if min_chrom_coord >= max_chrom_coord:
121 |         print("the chromosome "+chrom+" is too short for the specified settings of --left_flank, --right_flank, --bin_size, skipping")
122 |         return task_name, None, None
123 | 
chrom_coords=chrom+'\t'+str(min_chrom_coord)+'\t'+str(max_chrom_coord) 124 | chrom_bed=BedTool(chrom_coords,from_string=True) 125 | chrom_task_bed=task_bed.intersect(chrom_bed) 126 | chrom_ambig_bed=None 127 | if ((args.allow_ambiguous==True) and (task_ambig!=None)): 128 | chrom_ambig_bed=task_ambig.intersect(chrom_bed) 129 | print("got peak subset for chrom:"+str(chrom)+" for task:"+str(task_name)) 130 | #pre-allocate a numpy array of 0's 131 | num_bins=(final_bin_start-first_bin_start)//args.bin_stride+1 132 | coverage_vals=np.zeros(num_bins) 133 | 134 | if args.save_label_source is True: 135 | label_source_dict=dict() 136 | else: 137 | label_source_dict=None 138 | 139 | for entry in chrom_task_bed: 140 | chrom=entry[0] 141 | peak_start=int(entry[1]) 142 | peak_end=int(entry[2]) 143 | min_overlap=int(round(args.overlap_thresh*min(args.bin_size, (peak_end-peak_start)))) 144 | 145 | #get the bin indices that overlap the peak 146 | chromosome_min_bin_index=ceil((peak_start-(args.bin_size-min_overlap)-first_bin_start)/args.bin_stride) 147 | min_bin_start=chromosome_min_bin_index*args.bin_stride 148 | chromosome_max_bin_index=floor((peak_end-min_overlap-first_bin_start)/args.bin_stride) 149 | max_bin_start=chromosome_max_bin_index*args.bin_stride 150 | 151 | #get mean coverage in bigwig for each bin specified above 152 | index_coverage_vals=chromosome_min_bin_index 153 | for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride): 154 | if index_coverage_vals >= 0 and index_coverage_vals <= (num_bins - 1): 155 | coverage_vals[index_coverage_vals]=1 156 | if args.save_label_source is True: 157 | label_source_dict[index_coverage_vals]={} 158 | label_source_dict[index_coverage_vals][task_name+".CHR"]=chrom 159 | label_source_dict[index_coverage_vals][task_name+".START"]=peak_start 160 | label_source_dict[index_coverage_vals][task_name+".END"]=peak_end 161 | index_coverage_vals+=1 162 | 163 | #if allow_ambiguous supplied by user, shift 1 bin left and 1 bin right 164 | if args.allow_ambiguous==True: 165 | if chromosome_min_bin_index > 0 and chromosome_min_bin_index <= (num_bins - 1): 166 | chromosome_min_bin_index-=1 167 | coverage_vals[chromosome_min_bin_index]=np.nan 168 | if (args.save_label_source is True) and (chromosome_min_bin_index in label_source_dict): 169 | del label_source_dict[chromosome_min_bin_index] 170 | 171 | if chromosome_max_bin_index >= 0 and chromosome_max_bin_index < (num_bins - 1): 172 | chromosome_max_bin_index+=1 173 | coverage_vals[chromosome_max_bin_index]=np.nan 174 | if (args.save_label_source is True) and (chromosome_max_bin_index in label_source_dict): 175 | del label_source_dict[chromosome_max_bin_index] 176 | 177 | 178 | if ((args.allow_ambiguous==True) and (task_ambig!=None)): 179 | for entry in chrom_ambig_bed: 180 | chrom=entry[0] 181 | peak_start=int(entry[1]) 182 | peak_end=int(entry[2]) 183 | min_overlap=int(round(args.overlap_thresh*min(args.bin_size, (peak_end-peak_start)))) 184 | 185 | #get the bin indices that overlap the peak 186 | chromosome_min_bin_index=ceil((peak_start-(args.bin_size-min_overlap)-first_bin_start)/args.bin_stride) 187 | min_bin_start=chromosome_min_bin_index*args.bin_stride 188 | chromosome_max_bin_index=floor((peak_end-min_overlap-first_bin_start)/args.bin_stride) 189 | max_bin_start=chromosome_max_bin_index*args.bin_stride 190 | 191 | #get mean coverage in bigwig for each bin specified above 192 | index_coverage_vals=chromosome_min_bin_index 193 | for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride): 194 
            #mark each bin identified above as ambiguous
            index_coverage_vals=chromosome_min_bin_index
            for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
                if index_coverage_vals >= 0 and index_coverage_vals <= (num_bins-1):
                    coverage_vals[index_coverage_vals]=np.nan
                    if (args.save_label_source is True) and (index_coverage_vals in label_source_dict):
                        del label_source_dict[index_coverage_vals]
                index_coverage_vals+=1

    print("finished chromosome:"+str(chrom)+" for task:"+str(task_name))
    return task_name, coverage_vals, label_source_dict
--------------------------------------------------------------------------------
/seqdataloader/labelgen/regression_label_protocols.py:
--------------------------------------------------------------------------------
from math import floor,ceil
import pandas as pd
from .utils import rolling_window
import pdb
import numpy as np
from pybedtools import BedTool
import pyBigWig

def transform_label_vals(labels,label_transformer,pseudocount=0.001):
    if label_transformer is None:
        return labels
    elif label_transformer=="None":
        return labels
    elif label_transformer == 'asinh':
        return np.arcsinh(labels)
    elif label_transformer == 'log10':
        return np.log10(labels+pseudocount)
    elif label_transformer == 'log':
        return np.log(labels+pseudocount)
    else:
        raise Exception("label_transformer must be one of None, asinh, log10, log; you provided:"+str(label_transformer))
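
# Quick sanity check for transform_label_vals (hypothetical values, kept as
# comments so the module has no import-time side effects):
#   transform_label_vals(np.array([0.0, 1.0, 10.0]), 'log10')
#   -> approximately [-3.0, 0.00043, 1.00004]   (i.e. log10 of x + 0.001)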

def peak_summit_in_bin_regression(task_name,task_bed,task_bigwig,task_ambig,chrom,first_bin_start,final_bin_start,args):
    '''
    For each peak, the summit position is determined.

    The left-most labeled bin starts args.bin_size upstream of the summit;
    the right-most labeled bin starts at the summit itself (and so extends
    args.bin_size downstream of it).

    Within this range, bin starts are shifted by args.bin_stride.

    If args.allow_ambiguous is set, coverage is also computed for the bins
    adjacent to the two extremes, and those bins are marked as ambiguous.
    '''
    print("starting chromosome:"+str(chrom)+" for task:"+str(task_name))
    task_bigwig=pyBigWig.open(task_bigwig)
    task_bed=BedTool(task_bed)
    if task_ambig is not None:
        task_ambig=BedTool(task_ambig)
    #get the peaks for the current chromosome by intersecting the task_bed with the chromosome coordinates
    min_chrom_coord=first_bin_start
    max_chrom_coord=final_bin_start
    if min_chrom_coord >= max_chrom_coord:
        print("the chromosome "+chrom+" is too short for the specified settings of --left_flank, --right_flank, --bin_size; skipping")
        return task_name, None, None
    chrom_coords=chrom+'\t'+str(min_chrom_coord)+'\t'+str(max_chrom_coord)
    chrom_bed=BedTool(chrom_coords,from_string=True)
    chrom_task_bed=task_bed.intersect(chrom_bed)
    chrom_ambig_bed=None
    if (args.allow_ambiguous==True) and (task_ambig is not None):
        chrom_ambig_bed=task_ambig.intersect(chrom_bed)
    print("got peak subset for chrom:"+str(chrom)+" for task:"+str(task_name))

    #pre-allocate a numpy array of 0's
    num_bins=(final_bin_start-first_bin_start)//args.bin_stride+1
    coverage_vals=np.zeros(num_bins)
    if args.save_label_source is True:
        label_source_dict=dict()
    else:
        label_source_dict=None

    for entry in chrom_task_bed:
        chrom=entry[0]
        peak_start=int(entry[1])
        peak_end=int(entry[2])
        #narrowPeak format stores the summit offset relative to peak_start in the last column
        summit=peak_start+int(entry[-1])

        chromosome_min_bin_index=ceil((summit-args.bin_size)/args.bin_stride)
        min_bin_start=chromosome_min_bin_index*args.bin_stride
        chromosome_max_bin_index=floor(summit/args.bin_stride)
        max_bin_start=chromosome_max_bin_index*args.bin_stride

        #if allow_ambiguous supplied by user, shift 1 bin left and 1 bin right
        if args.allow_ambiguous==True:
            min_bin_start-=args.bin_stride
            chromosome_min_bin_index-=1
            max_bin_start+=args.bin_stride
            chromosome_max_bin_index+=1
        #get mean coverage in bigwig for each bin specified above
        index_coverage_vals=chromosome_min_bin_index
        for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
            if index_coverage_vals>=0 and index_coverage_vals < num_bins:
                try:
                    coverage_vals[index_coverage_vals]=task_bigwig.stats(chrom,bin_start,bin_start+args.bin_size,type=args.bigwig_stats)[0]
                    if args.save_label_source is True:
                        label_source_dict[index_coverage_vals]={}
                        label_source_dict[index_coverage_vals][task_name+".CHR"]=chrom
                        label_source_dict[index_coverage_vals][task_name+".START"]=peak_start
                        label_source_dict[index_coverage_vals][task_name+".END"]=peak_end
                except Exception:
                    print("could not get coverage:"+str(chrom)+":"+str(bin_start)+"-"+str(bin_start+args.bin_size)+" for task:"+str(task_name))
            index_coverage_vals+=1

    print("checking ambig")
    if chrom_ambig_bed is not None:
        for entry in chrom_ambig_bed:
            chrom=entry[0]
            peak_start=int(entry[1])
            peak_end=int(entry[2])
            summit=peak_start+int(entry[-1])

            chromosome_min_bin_index=ceil((summit-args.bin_size)/args.bin_stride)
            min_bin_start=chromosome_min_bin_index*args.bin_stride
            chromosome_max_bin_index=floor(summit/args.bin_stride)
            max_bin_start=chromosome_max_bin_index*args.bin_stride

            #mark each bin identified above as ambiguous
            index_coverage_vals=chromosome_min_bin_index
            for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
                if index_coverage_vals>=0 and index_coverage_vals < num_bins:
                    coverage_vals[index_coverage_vals]=np.nan
                    if args.save_label_source is True:
                        if index_coverage_vals in label_source_dict:
                            del label_source_dict[index_coverage_vals]
                index_coverage_vals+=1

    print("finished chromosome:"+str(chrom)+" for task:"+str(task_name))
    transformed_vals=transform_label_vals(coverage_vals,args.label_transformer,args.label_transformer_pseudocount)
    return task_name,transformed_vals,label_source_dict
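
# Worked example of the summit-centered binning above (hypothetical numbers):
# with bin_size=200 and bin_stride=50, a summit at position 1000 gives
# chromosome_min_bin_index=ceil((1000-200)/50)=16 and
# chromosome_max_bin_index=floor(1000/50)=20, i.e. bins starting at
# 800,850,900,950,1000 -- the stride-aligned 200bp windows that reach the summit.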

def peak_percent_overlap_with_bin_regression(task_name,task_bed,task_bigwig,task_ambig,chrom,first_bin_start,final_bin_start,args):
    '''
    At least args.overlap_thresh of the bin (e.g. 50% of the central 200bp region
    of a 1kb input) must overlap with the peak for coverage to be computed from
    the provided bigWig.
    '''
    #get the peaks for the current chromosome by intersecting the task_bed with the chromosome coordinates
    print("starting chromosome:"+str(chrom)+" for task:"+str(task_name))
    task_bigwig=pyBigWig.open(task_bigwig)
    task_bed=BedTool(task_bed)
    if task_ambig is not None:
        task_ambig=BedTool(task_ambig)
    min_chrom_coord=first_bin_start
    max_chrom_coord=final_bin_start
    if min_chrom_coord >= max_chrom_coord:
        print("the chromosome "+chrom+" is too short for the specified settings of --left_flank, --right_flank, --bin_size; skipping")
        return task_name,None,None
    chrom_coords=chrom+'\t'+str(min_chrom_coord)+'\t'+str(max_chrom_coord)
    chrom_bed=BedTool(chrom_coords,from_string=True)
    chrom_task_bed=task_bed.intersect(chrom_bed)
    chrom_ambig_bed=None
    if (args.allow_ambiguous==True) and (task_ambig is not None):
        chrom_ambig_bed=task_ambig.intersect(chrom_bed)

    print("got peak subset for chrom:"+str(chrom)+" for task:"+str(task_name))
    #pre-allocate a numpy array of 0's
    num_bins=(final_bin_start-first_bin_start)//args.bin_stride+1
    coverage_vals=np.zeros(num_bins)
    if args.save_label_source is True:
        label_source_dict=dict()
    else:
        label_source_dict=None

    for entry in chrom_task_bed:
        chrom=entry[0]
        peak_start=int(entry[1])
        peak_end=int(entry[2])
        min_overlap=int(round(args.overlap_thresh*args.bin_size))

        #get the bin indices that overlap the peak
        chromosome_min_bin_index=(peak_start-min_overlap-first_bin_start)//args.bin_stride
        min_bin_start=chromosome_min_bin_index*args.bin_stride
        chromosome_max_bin_index=(peak_end-min_overlap-first_bin_start)//args.bin_stride
        max_bin_start=chromosome_max_bin_index*args.bin_stride

        #if allow_ambiguous supplied by user, shift 1 bin left and 1 bin right
        if args.allow_ambiguous==True:
            min_bin_start-=args.bin_stride
            chromosome_min_bin_index-=1
            max_bin_start+=args.bin_stride
            chromosome_max_bin_index+=1

        #get mean coverage in bigwig for each bin specified above
        index_coverage_vals=chromosome_min_bin_index
        for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
            if index_coverage_vals>=0 and index_coverage_vals < num_bins:
                try:
                    coverage_vals[index_coverage_vals]=task_bigwig.stats(chrom,bin_start,bin_start+args.bin_size,type=args.bigwig_stats)[0]
                    if args.save_label_source is True:
                        label_source_dict[index_coverage_vals]={}
                        label_source_dict[index_coverage_vals][task_name+".CHR"]=chrom
                        label_source_dict[index_coverage_vals][task_name+".START"]=peak_start
                        label_source_dict[index_coverage_vals][task_name+".END"]=peak_end
                except Exception:
                    print("could not get coverage:"+str(chrom)+":"+str(bin_start)+"-"+str(bin_start+args.bin_size)+" for task:"+str(task_name))
            index_coverage_vals+=1
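
    # Worked example (hypothetical numbers): bin_size=200, bin_stride=50,
    # overlap_thresh=0.5 and first_bin_start=0 give min_overlap=100; a peak at
    # 1000-1400 then labels bin indices (1000-100)//50=18 through (1400-100)//50=26,
    # i.e. bin starts 900..1300, each sharing at least 100bp with the peak.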

    if (args.allow_ambiguous==True) and (task_ambig is not None):
        for entry in chrom_ambig_bed:
            chrom=entry[0]
            peak_start=int(entry[1])
            peak_end=int(entry[2])
            min_overlap=int(round(args.overlap_thresh*args.bin_size))

            #get the bin indices that overlap the ambiguous region
            chromosome_min_bin_index=(peak_start-min_overlap-first_bin_start)//args.bin_stride
            min_bin_start=chromosome_min_bin_index*args.bin_stride
            chromosome_max_bin_index=(peak_end-min_overlap-first_bin_start)//args.bin_stride
            max_bin_start=chromosome_max_bin_index*args.bin_stride
            #mark each bin identified above as ambiguous
            index_coverage_vals=chromosome_min_bin_index
            for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
                if index_coverage_vals>=0 and index_coverage_vals < num_bins:
                    coverage_vals[index_coverage_vals]=np.nan
                    if args.save_label_source is True:
                        if index_coverage_vals in label_source_dict:
                            del label_source_dict[index_coverage_vals]
                index_coverage_vals+=1

    print("finished chromosome:"+str(chrom)+" for task:"+str(task_name))
    transformed_vals=transform_label_vals(coverage_vals,args.label_transformer,args.label_transformer_pseudocount)
    return task_name,transformed_vals,label_source_dict

def all_genome_bins_regression(task_name,task_bed,task_bigwig,task_ambig,chrom,first_bin_start,final_bin_start,args):
    '''
    compute bigWig coverage for all bins in the chromosome, regardless of whether a called peak overlaps the bin
    '''
    print("starting chromosome:"+str(chrom)+" for task:"+str(task_name))
    task_bigwig=pyBigWig.open(task_bigwig)
    if task_ambig is not None:
        task_ambig=BedTool(task_ambig)

    #get the BigWig value at each position along the chromosome (cutting off anything that extends beyond final_coord)
    try:
        values=task_bigwig.values(chrom,first_bin_start,final_bin_start+args.bin_size,numpy=True)
    except Exception:
        print("Warning! Chromosome:"+str(chrom)+" appears not to be present in the bigWig file for task:"+task_name)
        return task_name,None,None
    #replace nan values with 0
    values=np.nan_to_num(values)
    #reshape the values such that the number of columns is equal to the bin_stride
    values=np.reshape(values,((final_bin_start+args.bin_size-first_bin_start)//args.bin_stride,args.bin_stride))
    #sum across the columns to get one coverage sum per stride
    strided_sums=np.sum(values,axis=1)

    #compute rolling average for each bin
    bin_means=np.sum(rolling_window(strided_sums,args.bin_size//args.bin_stride),-1)/args.bin_size
    norm_bin_means=transform_label_vals(bin_means,args.label_transformer,args.label_transformer_pseudocount)
    num_bins=norm_bin_means.shape[0]
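
    # How the vectorized binning above works (hypothetical numbers): with
    # bin_stride=50 and bin_size=200, each row of the reshaped array covers 50
    # consecutive bases, strided_sums holds one coverage sum per 50bp block, and
    # rolling_window(strided_sums,4) groups 4 adjacent blocks, so
    # bin_means[i] == sum(values over [i*50, i*50+200)) / 200 without an explicit loop.
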
    #add in ambiguous bins
    chrom_ambig_bed=None
    if (args.allow_ambiguous==True) and (task_ambig is not None):
        min_chrom_coord=first_bin_start
        max_chrom_coord=final_bin_start
        if min_chrom_coord >= max_chrom_coord:
            print("the chromosome "+chrom+" is too short for the specified settings of --left_flank, --right_flank, --bin_size; skipping")
            return task_name,None,None
        chrom_coords=chrom+'\t'+str(min_chrom_coord)+'\t'+str(max_chrom_coord)
        chrom_bed=BedTool(chrom_coords,from_string=True)
        chrom_ambig_bed=task_ambig.intersect(chrom_bed)
        for entry in chrom_ambig_bed:
            chrom=entry[0]
            peak_start=int(entry[1])
            peak_end=int(entry[2])
            summit=peak_start+int(entry[-1])
            chromosome_min_bin_index=ceil((summit-args.bin_size)/args.bin_stride)
            min_bin_start=chromosome_min_bin_index*args.bin_stride
            chromosome_max_bin_index=floor(summit/args.bin_stride)
            max_bin_start=chromosome_max_bin_index*args.bin_stride

            #mark each bin overlapping the ambiguous summit as ambiguous
            index_coverage_vals=chromosome_min_bin_index
            for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
                if index_coverage_vals>=0 and index_coverage_vals < num_bins:
                    norm_bin_means[index_coverage_vals]=np.nan
                index_coverage_vals+=1

    label_source_dict=None
    if args.save_label_source is True:
        assert task_bed is not None
        print("getting source peaks for genome bins")
        label_source_dict=dict()
        task_bed=BedTool(task_bed)
        min_chrom_coord=first_bin_start
        max_chrom_coord=final_bin_start
        if min_chrom_coord >= max_chrom_coord:
            return task_name, norm_bin_means, None
        chrom_coords=chrom+'\t'+str(min_chrom_coord)+'\t'+str(max_chrom_coord)
        chrom_bed=BedTool(chrom_coords,from_string=True)
        chrom_task_bed=task_bed.intersect(chrom_bed)
        for entry in chrom_task_bed:
            chrom=entry[0]
            peak_start=int(entry[1])
            peak_end=int(entry[2])
            summit=peak_start+int(entry[-1])

            chromosome_min_bin_index=ceil((summit-args.bin_size)/args.bin_stride)
            min_bin_start=chromosome_min_bin_index*args.bin_stride
            chromosome_max_bin_index=floor(summit/args.bin_stride)
            max_bin_start=chromosome_max_bin_index*args.bin_stride

            #record the source peak for each non-ambiguous bin
            index_coverage_vals=chromosome_min_bin_index
            for bin_start in range(min_bin_start,max_bin_start+1,args.bin_stride):
                if index_coverage_vals>=0 and index_coverage_vals < num_bins:
                    if not np.isnan(norm_bin_means[index_coverage_vals]):
                        label_source_dict[index_coverage_vals]={}
                        label_source_dict[index_coverage_vals][task_name+".CHR"]=chrom
                        label_source_dict[index_coverage_vals][task_name+".START"]=peak_start
                        label_source_dict[index_coverage_vals][task_name+".END"]=peak_end
                index_coverage_vals+=1

    print("finished chromosome:"+str(chrom)+" for task:"+str(task_name))
    return task_name,norm_bin_means,label_source_dict
--------------------------------------------------------------------------------
/seqdataloader/labelgen/rolling_average.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import pyBigWig
import pdb
from math import floor

def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)


task_bigwig=pyBigWig.open("./bigwig_files_from_encode_for_label_comparison/ENCFF842XRQ.bigWig")
chromsizes=pd.read_csv("hg38.chrom.sizes",header=None,sep='\t')
bin_size=200
bin_stride=50
left_flank=400
right_flank=400
seq_size=left_flank+right_flank+bin_size
task_name="test"
for index,row in chromsizes.iterrows():
    chrom=row[0]
    chromsize=row[1]
    nbins=chromsize//bin_stride
    final_coord=nbins*bin_stride
    print(final_coord)
    print(chromsize)
    values=task_bigwig.values(chrom,0,final_coord,numpy=True)
    print("got values")
    cols=bin_stride
    rows=final_coord//cols
    values=np.reshape(values,(rows,cols))
    print("completed reshape!")
    #sum the bins
    binsums=np.sum(values,axis=1)
    print("completed bin sums")
    bin_means=np.sum(rolling_window(binsums,bin_size//bin_stride),-1)/bin_size
    print("rolled")
    non_zero_inds=np.nonzero(bin_means)[0]
    non_zero_seq_start=non_zero_inds*bin_stride-left_flank
    non_zero_seq_end=non_zero_seq_start+seq_size
    non_zero_bins=dict()
    for i in range(non_zero_inds.shape[0]):
        bin_index=non_zero_inds[i]
        cur_bin_mean=bin_means[bin_index]
        non_zero_bins[(chrom,non_zero_seq_start[i],non_zero_seq_end[i])]=dict()
        non_zero_bins[(chrom,non_zero_seq_start[i],non_zero_seq_end[i])][task_name]=cur_bin_mean
    print("finished chrom:"+str(chrom)+" for task:"+str(task_name))
--------------------------------------------------------------------------------
/seqdataloader/labelgen/utils.py:
--------------------------------------------------------------------------------
import numpy as np

def rolling_window(a, window):
    #view a 1-d array as a 2-d array of overlapping length-`window` slices, without copying
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
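
# Example of the stride trick (hypothetical input): rolling_window returns
# overlapping views, not copies, so no extra memory is allocated:
#   rolling_window(np.arange(5), 3)
#   -> array([[0, 1, 2],
#             [1, 2, 3],
#             [2, 3, 4]])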
--------------------------------------------------------------------------------
/seqdataloader/queue_config.py:
--------------------------------------------------------------------------------
queue_feed_chunk_size=50 #number of items to queue into pool at once for processing
max_write_chunk=30000000
--------------------------------------------------------------------------------
/seqdataloader/tdb_config.py:
--------------------------------------------------------------------------------
tdb_config_params={"sm.check_coord_dups":False,
                   "sm.check_coord_oob":False,
                   "sm.check_global_order":False,
                   "sm.num_writer_threads":50,
                   "sm.num_reader_threads":50,
                   "sm.num_async_threads":50,
                   "vfs.num_threads":50}
# "sm.memory_budget":"5000000000"
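
# Sketch of how these settings are typically consumed (an assumption for
# illustration -- the actual call site lives in the dbingest modules):
#   import tiledb
#   ctx = tiledb.Ctx(tiledb.Config(tdb_config_params))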
--------------------------------------------------------------------------------
/seqdataloader/utils.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import pyBigWig
from pybedtools import BedTool
from itertools import islice
from collections import OrderedDict

def open_bigwig_for_parsing(fname,parallel=False):
    if not parallel:
        return pyBigWig.open(fname)
    else:
        #pybigwig objects cannot be pickled, so return the filename and open it inside the worker
        return fname

def open_csv_for_parsing(fname,parallel=False):
    #BedTool objects can be handed to workers directly, so parallel mode needs no special handling
    return BedTool(fname)

def parse_bigwig_chrom_vals(entry):
    bigwig_object=entry[0]
    if isinstance(bigwig_object,str):
        bigwig_object=pyBigWig.open(bigwig_object)
    chrom=entry[1]
    start=entry[2]
    end=entry[3]
    cur_attribute_info=entry[4]
    #note: pybigwig uses NA in place of 0 where there are no reads; replace with 0.
    bw_chroms=bigwig_object.chroms().keys()
    if chrom not in bw_chroms:
        #chromosome not in bigwig: return all NA's & warn that the chromosome is not present in the dataset
        print("WARNING: chromosome:"+str(chrom)+" was not found in the bigwig file:"+str(bigwig_object))
        size=end-start
        signal_data=np.full(size,np.nan)
    else:
        try:
            signal_data=np.nan_to_num(bigwig_object.values(chrom,start,end))
        except Exception as e:
            print(chrom+"\t"+str(start)+"\t"+str(end)+str(cur_attribute_info))
            raise e
    return start, end, signal_data


def parse_narrowPeak_chrom_vals(entry):
    task_bed=entry[0]
    chrom=entry[1]
    start=entry[2]
    end=entry[3]
    num_entries=end-start
    chrom_coords=chrom+'\t'+str(start)+'\t'+str(end)
    chrom_bed=BedTool(chrom_coords,from_string=True)
    cur_bed=task_bed.intersect(chrom_bed)
    cur_attribute_info=entry[4]
    store_summits=None
    summit_indicator=None
    summit_from_peak_center=None
    if 'store_summits' in cur_attribute_info:
        store_summits=cur_attribute_info['store_summits']
        if store_summits is True:
            summit_from_peak_center=cur_attribute_info['summit_from_peak_center']
            summit_indicator=cur_attribute_info['summit_indicator']
    signal_data = np.zeros(num_entries, dtype=int)
    summits=[]
    for entry in cur_bed:
        #offset relative to start position of the interval
        entry_start=int(entry[1])-start
        entry_end=int(entry[2])-start
        signal_data[entry_start:entry_end]=1
        #add in summits in a separate step to avoid overwriting them with "1's" for overlapping peak coordinates;
        #the overwriting issue is particularly relevant for pseudobulk data.
        if store_summits is True:
            if summit_from_peak_center is True:
                summit_pos=int(entry_start+(entry_end-entry_start)*0.5)
            else:
                try:
                    summit_pos=entry_start+int(entry[-1])
                except Exception:
                    print("WARNING: could not add summit position from last column of narrowPeak file, falling back to peak center:"+str(entry))
                    summit_pos=int(entry_start+(entry_end-entry_start)*0.5)
            if (summit_pos < entry_end) and (summit_pos > entry_start):
                summits.append(summit_pos)
            else:
                print("WARNING: summit position outside peak region, skipping:"+str(entry))
    if store_summits is True:
        signal_data[summits]=summit_indicator
    return start, end, signal_data

def chunkify(iterable,chunk):
    it=iter(iterable)
    while True:
        piece=list(islice(it,chunk))
        if piece:
            yield piece
        else:
            return
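
# Example (hypothetical input): chunkify lazily slices any iterable into
# fixed-size pieces, with a short final piece if the length is not divisible:
#   list(chunkify(range(7), 3)) -> [[0, 1, 2], [3, 4, 5], [6]]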

def transform_indices_to_chrom_coords(start_chunk_index,end_chunk_index,chrom_indices):
    '''
    map a range of global (genome-concatenated) indices back to per-chromosome
    coordinates, splitting the range across chromosome boundaries as needed
    '''
    chrom_coords=[]
    while True:
        found=False
        for chrom in chrom_indices:
            cur_chrom_start_index=chrom_indices[chrom][0]
            cur_chrom_end_index=chrom_indices[chrom][1]
            cur_chrom_size=chrom_indices[chrom][2]
            if start_chunk_index >= cur_chrom_start_index:
                if start_chunk_index < cur_chrom_end_index:
                    found=True
                    #start position is on this chromosome
                    chrom_coord_start=start_chunk_index-cur_chrom_start_index
                    #check if end coordinate falls on the same chromosome
                    if end_chunk_index < cur_chrom_end_index:
                        #the whole range lies on one chromosome
                        chrom_coord_end=end_chunk_index-cur_chrom_start_index
                        chrom_coords.append((chrom,chrom_coord_start,chrom_coord_end,start_chunk_index,end_chunk_index))
                        return chrom_coords
                    else:
                        #the range spills onto the next chromosome: emit this chromosome's piece and keep going
                        chrom_coord_end=cur_chrom_size
                        chrom_coords.append((chrom,chrom_coord_start,chrom_coord_end,start_chunk_index,cur_chrom_end_index))
                        #update start_chunk_index
                        start_chunk_index=cur_chrom_end_index
        if found is False:
            if len(chrom_coords)==0:
                raise Exception("failed to transform indices:"+str(start_chunk_index)+"-"+str(end_chunk_index)+" to chrom coords;"+str(chrom_indices))
            else:
                return chrom_coords

def transform_chrom_size_to_indices(chrom_sizes):
    '''
    chrom_sizes is a dataframe
    get 0-based tdb coordinates for the start & end of each chromosome
    '''
    start_coord=0
    chrom_indices=OrderedDict()
    for index,row in chrom_sizes.iterrows():
        chrom=row[0]
        size=row[1]
        chrom_indices[chrom]=[start_coord,start_coord+size,size]
        start_coord=start_coord+size
    return chrom_indices,start_coord
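
# Worked example (hypothetical chrom sizes): a dataframe with rows
# ('chr1', 1000) and ('chr2', 500) yields
#   chrom_indices == OrderedDict([('chr1', [0, 1000, 1000]), ('chr2', [1000, 1500, 500])])
# and total size 1500; transform_indices_to_chrom_coords(990, 1010, chrom_indices)
# then splits the range across the boundary into
#   [('chr1', 990, 1000, 990, 1000), ('chr2', 0, 10, 1000, 1010)].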
tiledb_paths="/mnt/data/tiledb/encode/dnase/ENCSR000EOY" 30 | ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths, 31 | pos_label_source_attribute=pos_label_source_attribute, 32 | neg_label_source_attribute=neg_label_source_attribute) 33 | string_vals=ctov.__call__(coords) 34 | 35 | -------------------------------------------------------------------------------- /tests/test_tiledb_coords_to_vals.py: -------------------------------------------------------------------------------- 1 | #unit tests for class seqdataloader.batchproducers.coordbased.coordstovals.BasicTiledbProfileCoordsToVals 2 | import pdb 3 | from seqdataloader.batchproducers.coordbased.coordstovals.tiledb import * 4 | 5 | #generate some test coords objects 6 | from collections import namedtuple 7 | Coord=namedtuple('Coord','chrom start end isplusstrand') 8 | coords=[Coord('chr1',1000000,2000000,True), 9 | Coord('chr2',1000000,2000000,True), 10 | Coord('chr3',1000000,2000000,True), 11 | Coord('chr4',1000000,2000000,True), 12 | Coord('chr5',1000000,2000000,True), 13 | Coord('chr6',1000000,2000000,True), 14 | Coord('chr7',1000000,2000000,True), 15 | Coord('chr1',1000000,2000000,False), 16 | Coord('chr2',1000000,2000000,False), 17 | Coord('chr3',1000000,2000000,False), 18 | Coord('chr4',1000000,2000000,False), 19 | Coord('chr5',1000000,2000000,False), 20 | Coord('chr6',1000000,2000000,False), 21 | Coord('chr7',1000000,2000000,False)] 22 | 23 | 24 | pos_label_source_attribute="fc_bigwig" 25 | neg_label_source_attribute="fc_bigwig" 26 | 27 | 28 | #case 1: tiledb_paths is a string 29 | tiledb_paths="/mnt/data/tiledb/encode/dnase/ENCSR000EOY" 30 | ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths, 31 | pos_label_source_attribute=pos_label_source_attribute, 32 | neg_label_source_attribute=neg_label_source_attribute) 33 | string_vals=ctov.__call__(coords) 34 | pdb.set_trace() 35 | 36 | #case2: tiledb_paths is a list 37 | tiledb_paths=["/mnt/data/tiledb/encode/dnase/ENCSR000EOY","/mnt/data/tiledb/encode/dnase/ENCSR000EOY","/mnt/data/tiledb/encode/dnase/ENCSR000EOY"] 38 | ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths, 39 | pos_label_source_attribute=pos_label_source_attribute, 40 | neg_label_source_attribute=neg_label_source_attribute) 41 | list_vals=ctov.__call__(coords) 42 | pdb.set_trace() 43 | 44 | #case3: tiledb_paths is a dict 45 | tiledb_paths={'mode0':"/mnt/data/tiledb/encode/dnase/ENCSR000EOY", 46 | 'mode1':"/mnt/data/tiledb/encode/dnase/ENCSR000EOY", 47 | 'mode2':"/mnt/data/tiledb/encode/dnase/ENCSR000EOY"} 48 | 49 | ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths, 50 | pos_label_source_attribute=pos_label_source_attribute, 51 | neg_label_source_attribute=neg_label_source_attribute) 52 | dict_vals=ctov.__call__(coords) 53 | pdb.set_trace() 54 | 55 | --------------------------------------------------------------------------------