├── .environmentLinux.yaml ├── .github └── workflows │ ├── build.yml │ └── pypi.yml ├── .gitignore ├── .gitmodules ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── libBigWig ├── LICENSE ├── README.md ├── bigWig.h ├── bigWigIO.h ├── bwCommon.h ├── bwRead.c ├── bwStats.c ├── bwValues.c ├── bwValues.h ├── bwWrite.c └── io.c ├── pyBigWig.c ├── pyBigWig.h ├── pyBigWigTest ├── __init__.py ├── test.bigBed ├── test.bw └── test.py ├── pyproject.toml ├── setup.cfg └── setup.py /.environmentLinux.yaml: -------------------------------------------------------------------------------- 1 | name: foo 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - default 6 | dependencies: 7 | - gcc_linux-64 8 | - curl 9 | - zlib 10 | - python = 3.9 11 | - pip 12 | - numpy 13 | - pytest -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: 3 | pull_request: 4 | push: 5 | 6 | jobs: 7 | testLinux: 8 | name: Test Conda Linux 9 | runs-on: "ubuntu-latest" 10 | defaults: 11 | run: 12 | shell: bash -l {0} 13 | steps: 14 | - uses: actions/checkout@v2 15 | - uses: conda-incubator/setup-miniconda@v2 16 | with: 17 | activate-environment: foo 18 | environment-file: .environmentLinux.yaml 19 | python-version: 3.9 20 | auto-activate-base: false 21 | - run: | 22 | pip install . 
23 | pytest pyBigWigTest/test.py 24 | 25 | test-builds: 26 | runs-on: ubuntu-latest 27 | 28 | steps: 29 | - uses: actions/checkout@v3 30 | with: 31 | fetch-depth: 0 32 | - name: Set up Python 33 | uses: actions/setup-python@v4 34 | with: 35 | python-version: '3.9' 36 | - name: Install build prerequisites 37 | run: | 38 | python -m pip install --upgrade build numpy 39 | - name: Install cibuildwheel 40 | run: | 41 | python -m pip install --upgrade cibuildwheel 42 | - name: Build wheel(s) 43 | run: | 44 | python -m cibuildwheel --output-dir wheelhouse 45 | - name: Build sdist 46 | run: | 47 | python -m build --sdist 48 | - uses: actions/upload-artifact@v3 49 | with: 50 | name: pyBigWig-build 51 | path: | 52 | wheelhouse/* 53 | dist/pyBigWig*.tar.gz 54 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: pypi 2 | on: [push] 3 | jobs: 4 | pypi: 5 | name: upload to pypi 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v3 9 | with: 10 | fetch-depth: 0 11 | - name: Set up Python 12 | uses: actions/setup-python@v4 13 | with: 14 | python-version: '3.9' 15 | - name: Install build prerequisites 16 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 17 | run: | 18 | python -m pip install --upgrade twine build cibuildwheel numpy 19 | - name: sdist 20 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 21 | run: | 22 | python -m build --sdist 23 | - name: wheel 24 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 25 | run: | 26 | python -m cibuildwheel --output-dir wheelhouse 27 | - name: upload 28 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 29 | env: 30 | TWINE_USERNAME: "__token__" 31 | TWINE_PASSWORD: ${{ secrets.pypi_password }} 32 | run: | 33 | twine upload dist/* 34 | twine upload wheelhouse/* 
35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .coverage.* 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | *,cover 44 | 45 | # Translations 46 | *.mo 47 | *.pot 48 | 49 | # Django stuff: 50 | *.log 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | # PyBuilder 56 | target/ 57 | 58 | *.o 59 | #./setup.py sdist creates this 60 | MANIFEST 61 | 62 | *.swp 63 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeptools/pyBigWig/7300b0a4599e7f72085c3c27c19b375e3a2c2cc0/.gitmodules -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Devon Ryan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to 
use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.h 2 | include **/*.h 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PyPI version](https://badge.fury.io/py/pyBigWig.svg)](https://badge.fury.io/py/pyBigWig) [![Travis-CI status](https://travis-ci.org/deeptools/pyBigWig.svg?branch=master)](https://travis-ci.org/dpryan79/pyBigWig.svg?branch=master) [![bioconda-badge](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io) [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.45238.svg)](http://dx.doi.org/10.5281/zenodo.45238) 2 | 3 | # pyBigWig 4 | A python extension, written in C, for quick access to bigBed files and access to and creation of bigWig files. This extension uses [libBigWig](https://github.com/dpryan79/libBigWig) for local and remote file access. 
5 | 6 | Table of Contents 7 | ================= 8 | 9 | * [Installation](#installation) 10 | * [Requirements](#requirements) 11 | * [Usage](#usage) 12 | * [Load the extension](#load-the-extension) 13 | * [Open a bigWig or bigBed file](#open-a-bigwig-or-bigbed-file) 14 | * [Determining the file type](#determining-the-file-type) 15 | * [Access the list of chromosomes and their lengths](#access-the-list-of-chromosomes-and-their-lengths) 16 | * [Print the header](#print-the-header) 17 | * [Compute summary information on a range](#compute-summary-information-on-a-range) 18 | * [A note on statistics and zoom levels](#a-note-on-statistics-and-zoom-levels) 19 | * [Retrieve values for individual bases in a range](#retrieve-values-for-individual-bases-in-a-range) 20 | * [Retrieve all intervals in a range](#retrieve-all-intervals-in-a-range) 21 | * [Retrieving bigBed entries](#retrieving-bigbed-entries) 22 | * [Add a header to a bigWig file](#add-a-header-to-a-bigwig-file) 23 | * [Adding entries to a bigWig file](#adding-entries-to-a-bigwig-file) 24 | * [Close a bigWig or bigBed file](#close-a-bigwig-or-bigbed-file) 25 | * [Numpy](#numpy) 26 | * [Remote file access](#remote-file-access) 27 | * [Empty files](#empty-files) 28 | * [A note on coordinates](#a-note-on-coordinates) 29 | * [Galaxy](#galaxy) 30 | 31 | # Installation 32 | You can install this extension from PyPI with: 33 | 34 | pip install pyBigWig 35 | 36 | or with conda 37 | 38 | conda install pybigwig -c conda-forge -c bioconda 39 | 40 | ## Requirements 41 | 42 | The following non-python requirements must be installed: 43 | 44 | - libcurl (and the `curl-config` config) 45 | - zlib 46 | 47 | The headers and libraries for these are required. 48 | 49 | # Usage 50 | Basic usage is as follows: 51 | 52 | ## Load the extension 53 | 54 | >>> import pyBigWig 55 | 56 | ## Open a bigWig or bigBed file 57 | 58 | This will work if your working directory is the pyBigWig source code directory.
59 | 60 | >>> bw = pyBigWig.open("pyBigWigTest/test.bw") 61 | 62 | Note that if the file doesn't exist you'll see an error message and `None` will be returned. By default, all files are opened for reading and not writing. You can alter this by passing a mode containing `w`: 63 | 64 | >>> bw = pyBigWig.open("test/output.bw", "w") 65 | 66 | Note that a file opened for writing can't be queried for its intervals or statistics, it can *only* be written to. If you open a file for writing then you will next need to add a header (see the section on this below). 67 | 68 | Local and remote bigBed read access is also supported: 69 | 70 | >>> bb = pyBigWig.open("https://www.encodeproject.org/files/ENCFF001JBR/@@download/ENCFF001JBR.bigBed") 71 | 72 | While you can specify a mode for bigBed files, it is ignored. The object returned by `pyBigWig.open()` is the same regardless of whether you're opening a bigWig or bigBed file. 73 | 74 | ## Determining the file type 75 | 76 | Since bigWig and bigBed files can both be opened, it may be necessary to determine whether a given `bigWigFile` object points to a bigWig or bigBed file. To that end, one can use the `isBigWig()` and `isBigBed()` functions: 77 | 78 | >>> bw = pyBigWig.open("pyBigWigTest/test.bw") 79 | >>> bw.isBigWig() 80 | True 81 | >>> bw.isBigBed() 82 | False 83 | 84 | ## Access the list of chromosomes and their lengths 85 | 86 | `bigWigFile` objects contain a dictionary holding the chromosome lengths, which can be accessed with the `chroms()` accessor. 87 | 88 | >>> bw.chroms() 89 | dict_proxy({'1': 195471971L, '10': 130694993L}) 90 | 91 | You can also directly query a particular chromosome. 92 | 93 | >>> bw.chroms("1") 94 | 195471971L 95 | 96 | The lengths are stored as the "long" integer type, which is why there's an `L` suffix. If you specify a non-existent chromosome then nothing is output. 97 | 98 | >>> bw.chroms("c") 99 | >>> 100 | 101 | ## Print the header 102 | 103 | It's sometimes useful to print a bigWig's header.
This is presented here as a python dictionary containing: the version (typically `4`), the number of zoom levels (`nLevels`), the number of bases described (`nBasesCovered`), the minimum value (`minVal`), the maximum value (`maxVal`), the sum of all values (`sumData`), and the sum of all squared values (`sumSquared`). The last two of these are needed for determining the mean and standard deviation. 104 | 105 | >>> bw.header() 106 | {'maxVal': 2L, 'sumData': 272L, 'minVal': 0L, 'version': 4L, 'sumSquared': 500L, 'nLevels': 1L, 'nBasesCovered': 154L} 107 | 108 | Note that this is also possible for bigBed files and the same dictionary keys will be present. Entries such as `maxVal`, `sumData`, `minVal`, and `sumSquared` are then largely not meaningful. 109 | 110 | ## Compute summary information on a range 111 | 112 | bigWig files are used to store values associated with positions and ranges of them. Typically we want to quickly access the average value over a range, which is very simple: 113 | 114 | >>> bw.stats("1", 0, 3) 115 | [0.2000000054637591] 116 | 117 | Suppose instead of the mean value, we instead wanted the maximum value: 118 | 119 | >>> bw.stats("1", 0, 3, type="max") 120 | [0.30000001192092896] 121 | 122 | Other options are "min" (the minimum value), "coverage" (the fraction of bases covered), and "std" (the standard deviation of the values). 123 | 124 | It's often the case that we would instead like to compute values of some number of evenly spaced bins in a given interval, which is also simple: 125 | 126 | >>> bw.stats("1",99, 200, type="max", nBins=2) 127 | [1.399999976158142, 1.5] 128 | 129 | `nBins` defaults to 1, just as `type` defaults to `mean`. 
130 | 131 | If the start and end positions are omitted then the entire chromosome is used: 132 | 133 | >>> bw.stats("1") 134 | [1.3351851569281683] 135 | 136 | ### A note on statistics and zoom levels 137 | 138 | > A note to the lay reader: This section is rather technical and included only for the sake of completeness. The summary is that if you need exact mean/max/etc. summary values for an interval or intervals and a small trade-off in speed is acceptable, you should use the `exact=True` option in the `stats()` function. 139 | 140 | By default, there are some unintuitive aspects to computing statistics on ranges in a bigWig file. The bigWig format was originally created in the context of genome browsers. There, computing exact summary statistics for a given interval is less important than quickly being able to compute an approximate statistic (after all, browsers need to be able to quickly display a number of contiguous intervals and support scrolling/zooming). Because of this, bigWig files contain not only interval-value associations, but also `sum of values`/`sum of squared values`/`minimum value`/`maximum value`/`number of bases covered` for equally sized bins of various sizes. These different sizes are referred to as "zoom levels". The smallest zoom level has bins that are 16 times the mean interval size in the file and each subsequent zoom level has bins 4 times larger than the previous. This methodology is used in Kent's tools and, therefore, likely used in almost every currently existing bigWig file. 141 | 142 | When a bigWig file is queried for a summary statistic, the size of the interval is used to determine whether to use a zoom level and, if so, which one. The optimal zoom level is that which has the largest bins no more than half the width of the desired interval. If no such zoom level exists, the original intervals are instead used for the calculation.
143 | 144 | For the sake of consistency with other tools, pyBigWig adopts this same methodology. However, since this is (A) unintuitive and (B) undesirable in some applications, pyBigWig enables computation of exact summary statistics regardless of the interval size (i.e., it allows ignoring the zoom levels). This was originally proposed [here](https://github.com/dpryan79/pyBigWig/issues/12) and an example is below: 145 | 146 | >>> import pyBigWig 147 | >>> from numpy import mean 148 | >>> bw = pyBigWig.open("http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeCrgMapabilityAlign75mer.bigWig") 149 | >>> bw.stats('chr1', 89294, 91629) 150 | [0.20120902053804418] 151 | >>> mean(bw.values('chr1', 89294, 91629)) 152 | 0.22213841940688142 153 | >>> bw.stats('chr1', 89294, 91629, exact=True) 154 | [0.22213841940688142] 155 | 156 | ## Retrieve values for individual bases in a range 157 | 158 | While the `stats()` method **can** be used to retrieve the original values for each base (e.g., by setting `nBins` to the number of bases), it's preferable to instead use the `values()` accessor. 159 | 160 | >>> bw.values("1", 0, 3) 161 | [0.10000000149011612, 0.20000000298023224, 0.30000001192092896] 162 | 163 | The list produced will always contain one value for every base in the range specified. If a particular base has no associated value in the bigWig file then the returned value will be `nan`. 164 | 165 | >>> bw.values("1", 0, 4) 166 | [0.10000000149011612, 0.20000000298023224, 0.30000001192092896, nan] 167 | 168 | ## Retrieve all intervals in a range 169 | 170 | Sometimes it's convenient to retrieve all entries overlapping some range. This can be done with the `intervals()` function: 171 | 172 | >>> bw.intervals("1", 0, 3) 173 | ((0, 1, 0.10000000149011612), (1, 2, 0.20000000298023224), (2, 3, 0.30000001192092896)) 174 | 175 | What's returned is a tuple of tuples containing: the start position, the end position, and the value.
Thus, the example above has values of `0.1`, `0.2`, and `0.3` at positions `0`, `1`, and `2`, respectively. 176 | 177 | If the start and end position are omitted then all intervals on the chromosome specified are returned: 178 | 179 | >>> bw.intervals("1") 180 | ((0, 1, 0.10000000149011612), (1, 2, 0.20000000298023224), (2, 3, 0.30000001192092896), (100, 150, 1.399999976158142), (150, 151, 1.5)) 181 | 182 | ## Retrieving bigBed entries 183 | 184 | As opposed to bigWig files, bigBed files hold entries, which are intervals with an associated string. You can access these entries using the `entries()` function: 185 | 186 | >>> bb = pyBigWig.open("https://www.encodeproject.org/files/ENCFF001JBR/@@download/ENCFF001JBR.bigBed") 187 | >>> bb.entries('chr1', 10000000, 10020000) 188 | [(10009333, 10009640, '61035\t130\t-\t0.026\t0.42\t404'), (10014007, 10014289, '61047\t136\t-\t0.029\t0.42\t404'), (10014373, 10024307, '61048\t630\t-\t5.420\t0.00\t2672399')] 189 | 190 | The output is a list of entry tuples. The tuple elements are the `start` and `end` position of each entry, followed by its associated `string`. The string is returned exactly as it's held in the bigBed file, so parsing it is left to you. To determine what the various fields are in these strings, consult the SQL string: 191 | 192 | >>> bb.SQL() 193 | table RnaElements 194 | "BED6 + 3 scores for RNA Elements data" 195 | ( 196 | string chrom; "Reference sequence chromosome or scaffold" 197 | uint chromStart; "Start position in chromosome" 198 | uint chromEnd; "End position in chromosome" 199 | string name; "Name of item" 200 | uint score; "Normalized score from 0-1000" 201 | char[1] strand; "+ or - or . for unknown" 202 | float level; "Expression level such as RPKM or FPKM. Set to -1 for no data." 203 | float signif; "Statistical significance such as IDR. Set to -1 for no data." 204 | uint score2; "Additional measurement/count e.g. number of reads. Set to 0 for no data."
205 | ) 206 | 207 | Note that the first three entries in the SQL string are not part of the string. 208 | 209 | If you only need to know where entries are and not their associated values, you can save memory by additionally specifying `withString=False` in `entries()`: 210 | 211 | >>> bb.entries('chr1', 10000000, 10020000, withString=False) 212 | [(10009333, 10009640), (10014007, 10014289), (10014373, 10024307)] 213 | 214 | ## Add a header to a bigWig file 215 | 216 | If you've opened a file for writing then you'll need to give it a header before you can add any entries. The header contains all of the chromosomes, **in order**, and their sizes. If your genome has two chromosomes, chr1 and chr2, of lengths 1 and 1.5 million bases, then the following would add an appropriate header: 217 | 218 | >>> bw.addHeader([("chr1", 1000000), ("chr2", 1500000)]) 219 | 220 | bigWig headers are case-sensitive, so `chr1` and `Chr1` are different. Likewise, `1` and `chr1` are not the same, so you can't mix Ensembl and UCSC chromosome names. After adding a header, you can then add entries. 221 | 222 | By default, up to 10 "zoom levels" are constructed for bigWig files. You can change this default number with the `maxZooms` optional argument. A common use of this is to create a bigWig file that simply holds intervals and no zoom levels: 223 | 224 | >>> bw.addHeader([("chr1", 1000000), ("chr2", 1500000)], maxZooms=0) 225 | 226 | If you set `maxZooms=0`, please note that IGV and many other tools WILL NOT WORK as they assume that at least one zoom level will be present. You are advised to use the default unless you do not expect the bigWig files to be used by other packages. 227 | 228 | ## Adding entries to a bigWig file 229 | 230 | Assuming you've opened a file for writing and added a header, you can then add entries. Note that the entries **must** be added in order, as bigWig files always contain ordered intervals.
There are three formats that bigWig files can use internally to store entries. The most commonly observed format is identical to a [bedGraph](https://genome.ucsc.edu/goldenpath/help/bedgraph.html) file: 231 | 232 | chr1 0 100 0.0 233 | chr1 100 120 1.0 234 | chr1 125 126 200.0 235 | 236 | These entries would be added as follows: 237 | 238 | >>> bw.addEntries(["chr1", "chr1", "chr1"], [0, 100, 125], ends=[100, 120, 126], values=[0.0, 1.0, 200.0]) 239 | 240 | Each entry occupies 12 bytes before compression. 241 | 242 | The second format uses a fixed span, but a variable step size between entries. These can be represented in a [wiggle](http://genome.ucsc.edu/goldenpath/help/wiggle.html) file as: 243 | 244 | variableStep chrom=chr1 span=20 245 | 500 -2.0 246 | 600 150.0 247 | 635 25.0 248 | 249 | The above entries describe (1-based) positions 501-520, 601-620 and 636-655. These would be added as follows: 250 | 251 | >>> bw.addEntries("chr1", [500, 600, 635], values=[-2.0, 150.0, 25.0], span=20) 252 | 253 | Each entry of this type occupies 8 bytes before compression. 254 | 255 | The final format uses a fixed step and span for each entry, corresponding to the fixedStep [wiggle format](http://genome.ucsc.edu/goldenpath/help/wiggle.html): 256 | 257 | fixedStep chrom=chr1 step=30 span=20 258 | -5.0 259 | -20.0 260 | 25.0 261 | 262 | The above entries describe (1-based) bases 901-920, 931-950 and 961-980 and would be added as follows: 263 | 264 | >>> bw.addEntries("chr1", 900, values=[-5.0, -20.0, 25.0], span=20, step=30) 265 | 266 | Each entry of this type occupies 4 bytes. 267 | 268 | Note that pyBigWig will try to prevent you from adding entries in an incorrect order. This, however, requires additional overhead.
Should that not be acceptable, you can simply specify `validate=False` when adding entries: 269 | 270 | >>> bw.addEntries(["chr1", "chr1", "chr1"], [100, 0, 125], ends=[120, 5, 126], values=[0.0, 1.0, 200.0], validate=False) 271 | 272 | You're obviously then responsible for ensuring that you **do not** add entries out of order. The resulting files would otherwise largely not be usable. 273 | 274 | ## Close a bigWig or bigBed file 275 | 276 | A file can be closed with a simple `bw.close()`, as is commonly done with other file types. For files opened for writing, closing a file writes any buffered entries to disk, constructs and writes the file index, and constructs zoom levels. Consequently, this can take a bit of time. 277 | 278 | # Numpy 279 | 280 | As of version 0.3.0, pyBigWig supports input of coordinates using numpy integers and vectors in some functions **if numpy was installed prior to installing pyBigWig**. You can determine whether pyBigWig was installed with numpy support by checking the `numpy` accessor: 281 | 282 | >>> import pyBigWig 283 | >>> pyBigWig.numpy 284 | 1 285 | 286 | If `pyBigWig.numpy` is `1`, then pyBigWig was compiled with numpy support.
This means that `addEntries()` can accept numpy coordinates: 287 | 288 | >>> import pyBigWig 289 | >>> import numpy as np 290 | >>> bw = pyBigWig.open("/tmp/delete.bw", "w") 291 | >>> bw.addHeader([("1", 1000)], maxZooms=0) 292 | >>> chroms = np.array(["1"] * 10) 293 | >>> starts = np.array([0, 10, 20, 30, 40, 50, 60, 70, 80, 90], dtype=np.int64) 294 | >>> ends = np.array([5, 15, 25, 35, 45, 55, 65, 75, 85, 95], dtype=np.int64) 295 | >>> values0 = np.array(np.random.random_sample(10), dtype=np.float64) 296 | >>> bw.addEntries(chroms, starts, ends=ends, values=values0) 297 | >>> bw.close() 298 | 299 | Additionally, `values()` can directly output a numpy vector: 300 | 301 | >>> bw = pyBigWig.open("/tmp/delete.bw") 302 | >>> bw.values('1', 0, 10, numpy=True) 303 | [ 0.74336642 0.74336642 0.74336642 0.74336642 0.74336642 nan 304 | nan nan nan nan] 305 | >>> type(bw.values('1', 0, 10, numpy=True)) 306 | 307 | 308 | # Remote file access 309 | 310 | If you do not have curl installed, pyBigWig will be installed without the ability to access remote files. You can determine if you will be able to access remote files with `pyBigWig.remote`. If that returns 1, then you can access remote files. If it returns 0 then you can't. 311 | 312 | # Empty files 313 | 314 | As of version 0.3.5, pyBigWig is able to read and write bigWig files lacking entries. Please note that such files are generally not compatible with other programs, since there's no definition of how a bigWig file with no entries should look. For such a file, the `intervals()` accessor will return `None`, the `stats()` function will return a list of `None` of the desired length, and `values()` will return `[]` (an empty list). This should generally allow programs utilizing pyBigWig to continue without issue. 315 | 316 | For those wishing to mimic the functionality of pyBigWig/libBigWig in this regard, please note that it looks at the number of bases covered (as reported in the file header) to check for "empty" files.
317 | 318 | # A note on coordinates 319 | 320 | Wiggle, bigWig, and bigBed files use 0-based half-open coordinates, which are also used by this extension. So to access the value for the first base on `chr1`, one would specify the starting position as `0` and the end position as `1`. Similarly, bases 100 to 115 would have a start of `99` and an end of `115`. This is simply for the sake of consistency with the underlying bigWig file and may change in the future. 321 | 322 | # Galaxy 323 | 324 | pyBigWig is also available as a package in [Galaxy](http://www.usegalaxy.org). You can find it in the toolshed and the [IUC](https://wiki.galaxyproject.org/IUC) is currently hosting the XML definition of this on [github](https://github.com/galaxyproject/tools-iuc/tree/master/packages/package_python_2_7_10_pybigwig_0_2_8). 325 | -------------------------------------------------------------------------------- /libBigWig/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Devon Ryan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /libBigWig/README.md: -------------------------------------------------------------------------------- 1 | ![Master build status](https://travis-ci.org/dpryan79/libBigWig.svg?branch=master) [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.45278.svg)](http://dx.doi.org/10.5281/zenodo.45278) 2 | 3 | A C library for reading/parsing local and remote bigWig and bigBed files. While Kent's source code is free to use for these purposes, it's really inappropriate as library code since it has the unfortunate habit of calling `exit()` whenever there's an error. If that's then used inside of something like python then the python interpreter gets killed. This library is aimed at resolving these sorts of issues and should also use more standard things like curl and has a friendlier license to boot. 4 | 5 | Documentation is automatically generated by doxygen and can be found under `docs/html` or online [here](https://cdn.rawgit.com/dpryan79/libBigWig/master/docs/html/index.html). 6 | 7 | # Example 8 | 9 | The only functions and structures that end users need to care about are in "bigWig.h". Below is a commented example. You can see the files under `test/` for further examples. 
10 | 11 | #include "bigWig.h" 12 | int main(int argc, char *argv[]) { 13 | bigWigFile_t *fp = NULL; 14 | bwOverlappingIntervals_t *intervals = NULL; 15 | double *stats = NULL; 16 | if(argc != 2) { 17 | fprintf(stderr, "Usage: %s {file.bw|URL://path/file.bw}\n", argv[0]); 18 | return 1; 19 | } 20 | 21 | //Initialize enough space to hold 128KiB (1<<17) of data at a time 22 | if(bwInit(1<<17) != 0) { 23 | fprintf(stderr, "Received an error in bwInit\n"); 24 | return 1; 25 | } 26 | 27 | //Open the local/remote file 28 | fp = bwOpen(argv[1], NULL, "r"); 29 | if(!fp) { 30 | fprintf(stderr, "An error occurred while opening %s\n", argv[1]); 31 | return 1; 32 | } 33 | 34 | //Get values in a range (0-based, half open) without NAs 35 | intervals = bwGetValues(fp, "chr1", 10000000, 10000100, 0); 36 | bwDestroyOverlappingIntervals(intervals); //Free allocated memory 37 | 38 | //Get values in a range (0-based, half open) with NAs 39 | intervals = bwGetValues(fp, "chr1", 10000000, 10000100, 1); 40 | bwDestroyOverlappingIntervals(intervals); //Free allocated memory 41 | 42 | //Get the full intervals that overlap 43 | intervals = bwGetOverlappingIntervals(fp, "chr1", 10000000, 10000100); 44 | bwDestroyOverlappingIntervals(intervals); 45 | 46 | //Get an example statistic - standard deviation 47 | //We want ~4 bins in the range 48 | stats = bwStats(fp, "chr1", 10000000, 10000100, 4, dev); 49 | if(stats) { 50 | printf("chr1:10000000-10000100 std. dev.: %f %f %f %f\n", stats[0], stats[1], stats[2], stats[3]); 51 | free(stats); 52 | } 53 | 54 | bwClose(fp); 55 | bwCleanup(); 56 | return 0; 57 | } 58 | 59 | ##Writing example 60 | 61 | N.B., creation of bigBed files is not supported (there are no plans to change this). 62 | 63 | Below is an example of how to write bigWig files. You can also find this file under `test/exampleWrite.c`. Unlike with Kent's tools, you can create bigWig files entry by entry without needing an intermediate wiggle or bedGraph file. 
Entries in bigWig files are stored in blocks with each entry in a block referring to the same chromosome and having the same type, of which there are three (see the [wiggle specification](http://genome.ucsc.edu/goldenpath/help/wiggle.html) for more information on this). 64 | 65 | #include "bigWig.h" 66 | 67 | int main(int argc, char *argv[]) { 68 | bigWigFile_t *fp = NULL; 69 | char *chroms[] = {"1", "2"}; 70 | char *chromsUse[] = {"1", "1", "1"}; 71 | uint32_t chrLens[] = {1000000, 1500000}; 72 | uint32_t starts[] = {0, 100, 125, 73 | 200, 220, 230, 74 | 500, 600, 625, 75 | 700, 800, 850}; 76 | uint32_t ends[] = {5, 120, 126, 77 | 205, 226, 231}; 78 | float values[] = {0.0f, 1.0f, 200.0f, 79 | -2.0f, 150.0f, 25.0f, 80 | 0.0f, 1.0f, 200.0f, 81 | -2.0f, 150.0f, 25.0f, 82 | -5.0f, -20.0f, 25.0f, 83 | -5.0f, -20.0f, 25.0f}; 84 | 85 | if(bwInit(1<<17) != 0) { 86 | fprintf(stderr, "Received an error in bwInit\n"); 87 | return 1; 88 | } 89 | 90 | fp = bwOpen("example_output.bw", NULL, "w"); 91 | if(!fp) { 92 | fprintf(stderr, "An error occurred while opening example_output.bw for writing\n"); 93 | return 1; 94 | } 95 | 96 | //Allow up to 10 zoom levels, though fewer will be used in practice 97 | if(bwCreateHdr(fp, 10)) goto error; 98 | 99 | //Create the chromosome lists 100 | fp->cl = bwCreateChromList(chroms, chrLens, 2); 101 | if(!fp->cl) goto error; 102 | 103 | //Write the header 104 | if(bwWriteHdr(fp)) goto error; 105 | 106 | //Some example bedGraph-like entries 107 | if(bwAddIntervals(fp, chromsUse, starts, ends, values, 3)) goto error; 108 | //We can continue appending similarly formatted entries 109 | //N.B. you can't append a different chromosome (those always go into different blocks) 110 | if(bwAppendIntervals(fp, starts+3, ends+3, values+3, 3)) goto error; 111 | 112 | //Add a new block of entries with a span.
Since bwAdd/AppendIntervals was just used we MUST create a new block 113 | if(bwAddIntervalSpans(fp, "1", starts+6, 20, values+6, 3)) goto error; 114 | //We can continue appending similarly formatted entries 115 | if(bwAppendIntervalSpans(fp, starts+9, values+9, 3)) goto error; 116 | 117 | //Add a new block of fixed-step entries 118 | if(bwAddIntervalSpanSteps(fp, "1", 900, 20, 30, values+12, 3)) goto error; 119 | //The start is then 760, since that's where the previous step ended 120 | if(bwAppendIntervalSpanSteps(fp, values+15, 3)) goto error; 121 | 122 | //Add a new chromosome 123 | chromsUse[0] = "2"; 124 | chromsUse[1] = "2"; 125 | chromsUse[2] = "2"; 126 | if(bwAddIntervals(fp, chromsUse, starts, ends, values, 3)) goto error; 127 | 128 | //Closing the file causes the zoom levels to be created 129 | bwClose(fp); 130 | bwCleanup(); 131 | 132 | return 0; 133 | 134 | error: 135 | fprintf(stderr, "Received an error somewhere!\n"); 136 | bwClose(fp); 137 | bwCleanup(); 138 | return 1; 139 | } 140 | 141 | # Testing file types 142 | 143 | As of version 0.3.0, this library supports accessing bigBed files, which are related to bigWig files. Applications that need to support both bigWig and bigBed input can use the `bwIsBigWig` and `bbIsBigBed` functions to determine if their inputs are bigWig/bigBed files: 144 | 145 | ...code... 146 | if(bwIsBigWig(input_file_name, NULL)) { 147 | //do something 148 | } else if(bbIsBigBed(input_file_name, NULL)) { 149 | //do something else 150 | } else { 151 | //handle unknown input 152 | } 153 | 154 | Note that these two functions rely on the "magic number" at the beginning of each file, which differs between bigWig and bigBed files. 155 | 156 | # bigBed support 157 | 158 | Support for accessing bigBed files was added in version 0.3.0. The function names used for accessing bigBed files are similar to those used for bigWig files. 
159 | 160 | Function | Use 161 | --- | --- 162 | bbOpen | Opens a bigBed file 163 | bbGetSQL | Returns the SQL string (if it exists) in a bigBed file 164 | bbGetOverlappingEntries | Returns all entries overlapping an interval (either with or without their associated strings) 165 | bbDestroyOverlappingEntries | Frees memory allocated by the above function 166 | 167 | Other functions, such as `bwClose` and `bwInit`, are shared between bigWig and bigBed files. See `test/testBigBed.c` for a full example. 168 | 169 | # A note on bigBed entries 170 | 171 | Inside bigBed files, entries are stored as chromosome, start, and end coordinates with an (optional) associated string. For example, a "bedRNAElements" file from ENCODE has name, score, strand, "level", "significance", and "score2" values associated with each entry. These are stored inside the bigBed files as a single tab-separated character vector (char \*), which makes parsing difficult. The names of the various fields inside of bigBed files are stored as an SQL string, for example: 172 | 173 | table RnaElements 174 | "BED6 + 3 scores for RNA Elements data " 175 | ( 176 | string chrom; "Reference sequence chromosome or scaffold" 177 | uint chromStart; "Start position in chromosome" 178 | uint chromEnd; "End position in chromosome" 179 | string name; "Name of item" 180 | uint score; "Normalized score from 0-1000" 181 | char[1] strand; "+ or - or . for unknown" 182 | float level; "Expression level such as RPKM or FPKM. Set to -1 for no data." 183 | float signif; "Statistical significance such as IDR. Set to -1 for no data." 184 | uint score2; "Additional measurement/count e.g. number of reads. Set to 0 for no data."
185 | ) 186 | 187 | Entries will then be of the form (one per line): 188 | 189 | 59426 115 - 0.021 0.48 218 190 | 51 209 + 0.071 0.74 130 191 | 52 170 + 0.045 0.61 171 192 | 59433 178 - 0.049 0.34 296 193 | 53 156 + 0.038 0.19 593 194 | 59436 186 - 0.054 0.15 1010 195 | 59437 506 - 1.560 0.00 430611 196 | 197 | Note that chromosome and start/end intervals are stored separately, so there's no need to parse them out of the string. libBigWig can return these entries, either with or without the above associated strings. Parsing these strings is left to the application requiring them and is currently outside the scope of this library. 198 | 199 | # Interval/Entry iterators 200 | 201 | Sometimes it is desirable to request a large number of intervals from a bigWig file or entries from a bigBed file, but not hold them all in memory at once (e.g., to save memory). To support this, libBigWig (since version 0.3.0) supports two kinds of iterators. The general process of using iterators is: (1) iterator creation, (2) traversal, and finally (3) iterator destruction. Only iterator creation differs between bigWig and bigBed files. 202 | 203 | Importantly, iterators return results one or more blocks at a time. This is for convenience, since bigWig intervals and bigBed entries are stored together in fixed-size groups, called blocks. The number of blocks of entries returned, therefore, is an option that can be specified to balance performance and memory usage. 204 | 205 | ## Iterator creation 206 | 207 | For bigWig files, iterators are created with `bwOverlappingIntervalsIterator()`. This function takes chromosomal bounds (chromosome name, start, and end position) as well as a number of blocks. The equivalent function for bigBed files is `bbOverlappingEntriesIterator()`, which additionally takes a `withString` argument, which dictates whether the returned entries include the associated string values or not.
208 | 209 | Each of the aforementioned functions returns a pointer to a `bwOverlapIterator_t` object. The only important parts of this structure for end users are the following members: `entries`, `intervals`, and `data`. `entries` is a pointer to a `bbOverlappingEntries_t` object, or `NULL` if a bigWig file is being used. Likewise, `intervals` is a pointer to a `bwOverlappingIntervals_t` object, or `NULL` if a bigBed file is being used. `data` is a special pointer, used to signify the end of iteration. Thus, when `data` is a `NULL` pointer, iteration has ended. 210 | 211 | ## Iterator traversal 212 | 213 | Regardless of whether a bigWig or bigBed file is being used, the `bwIteratorNext()` function will free currently used memory and load the appropriate intervals or entries for the next block(s). On error, this will return a NULL pointer (memory is already internally freed in this case). 214 | 215 | ## Iterator destruction 216 | 217 | `bwOverlapIterator_t` objects MUST be destroyed after use. This can be done with the `bwIteratorDestroy()` function. 218 | 219 | ## Example 220 | 221 | A full example is provided in `test/testIterator.c`, but a small example of iterating over all bigWig intervals in `chr1:0-10000000` in chunks of 5 blocks follows: 222 | 223 | iter = bwOverlappingIntervalsIterator(fp, "chr1", 0, 10000000, 5); 224 | while(iter->data) { 225 | //Do stuff with iter->intervals 226 | iter = bwIteratorNext(iter); 227 | } 228 | bwIteratorDestroy(iter); 229 | 230 | # A note on bigWig statistics 231 | 232 | The results of `min`, `max`, and `mean` should be the same as those from `BigWigSummary`. `stdev` and `coverage`, however, may differ due to Kent's tools producing incorrect results (at least for `coverage`, though the same appears to be the case for `stdev`).
233 | 234 | # Python interface 235 | 236 | There are currently two python interfaces that make use of libBigWig: [pyBigWig](https://github.com/dpryan79/pyBigWig) by me and [bw-python](https://github.com/brentp/bw-python) by Brent Pederson. Those interested are encouraged to give both a try! 237 | -------------------------------------------------------------------------------- /libBigWig/bigWig.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBBIGWIG_H 2 | #define LIBBIGWIG_H 3 | 4 | #include "bigWigIO.h" 5 | #include "bwValues.h" 6 | #include 7 | #include 8 | 9 | #ifdef __cplusplus 10 | extern "C" { 11 | #endif 12 | 13 | /*! \mainpage libBigWig 14 | * 15 | * \section Introduction 16 | * 17 | * libBigWig is a C library for parsing local/remote bigWig and bigBed files. This is similar to Kent's library from UCSC, except 18 | * * The license is much more liberal 19 | * * This code doesn't call `exit()` on error, thereby killing the calling application. 20 | * 21 | * External files are accessed using [curl](http://curl.haxx.se/). 22 | * 23 | * Please submit issues and pull requests [here](https://github.com/dpryan79/libBigWig). 24 | * 25 | * \section Compilation 26 | * 27 | * Assuming you already have the curl libraries installed (not just the curl binary!): 28 | * 29 | * make install prefix=/some/path 30 | * 31 | * \section Writing bigWig files 32 | * 33 | * There are three methods for storing values in a bigWig file, further described in the [wiggle format](http://genome.ucsc.edu/goldenpath/help/wiggle.html). The entries within the file are grouped into "blocks" and each such block is limited to storing entries of a single type. So, it is unwise to use a single bedGraph-like endtry followed by a single fixed-step entry followed by a variable-step entry, as that would require three separate blocks, with additional space required for each. 
34 | * 35 | * \section Testing file types 36 | * 37 | * As of version 0.3.0, libBigWig supports reading bigBed files. If an application needs to support both bigBed and bigWig input, then the `bwIsBigWig` and `bbIsBigBed` functions can be used to determine the file type. These both use the "magic" number at the beginning of the file to determine the file type. 38 | * 39 | * \section Interval and entry iterators 40 | * 41 | * As of version 0.3.0, libBigWig supports iterating over intervals in bigWig files and entries in bigBed files. The number of intervals/entries returned with each iteration can be controlled by setting the number of blocks processed in each iteration (intervals and entries are group inside of bigWig and bigBed files into blocks of entries). See `test/testIterator.c` for an example. 42 | * 43 | * \section Examples 44 | * 45 | * Please see [README.md](README.md) and the files under `test/` for examples. 46 | */ 47 | 48 | 49 | /*! \file bigWig.h 50 | * 51 | * These are the functions and structured that should be used by external users. While I don't particularly recommend dealing with some of the structures (e.g., a bigWigHdr_t), they're described here in case you need them. 52 | * 53 | * BTW, this library doesn't switch endianness as appropriate, since I kind of assume that there's only one type produced these days. 54 | */ 55 | 56 | /*! 57 | * The library version number 58 | */ 59 | #define LIBBIGWIG_VERSION 0.4.8 60 | 61 | /*! 62 | * If 1, then this library was compiled with remote file support. 63 | */ 64 | #ifdef NOCURL 65 | #define LIBBIGWIG_CURL 0 66 | #ifndef CURLTYPE_DEFINED 67 | #define CURLTYPE_DEFINED 68 | typedef int CURLcode; 69 | typedef void CURL; 70 | #endif 71 | #else 72 | #define LIBBIGWIG_CURL 1 73 | #endif 74 | 75 | /*! 76 | * The magic number of a bigWig file. 77 | */ 78 | #define BIGWIG_MAGIC 0x888FFC26 79 | /*! 80 | * The magic number of a bigBed file. 81 | */ 82 | #define BIGBED_MAGIC 0x8789F2EB 83 | /*! 
84 | * The magic number of a "cirTree" block in a file. 85 | */ 86 | #define CIRTREE_MAGIC 0x78ca8c91 87 | /*! 88 | * The magic number of an index block in a file. 89 | */ 90 | #define IDX_MAGIC 0x2468ace0 91 | /*! 92 | * The default number of children per block. 93 | */ 94 | #define DEFAULT_nCHILDREN 64 95 | /*! 96 | * The default decompression buffer size in bytes. This is used to determin 97 | */ 98 | #define DEFAULT_BLOCKSIZE 32768 99 | 100 | /*! 101 | * An enum that dictates the type of statistic to fetch for a given interval 102 | */ 103 | enum bwStatsType { 104 | doesNotExist = -1, /*!< This does nothing */ 105 | mean = 0, /*!< The mean value */ 106 | average = 0, /*!< The mean value */ 107 | stdev = 1, /*!< The standard deviation of the values */ 108 | dev = 1, /*!< The standard deviation of the values */ 109 | max = 2, /*!< The maximum value */ 110 | min = 3, /*!< The minimum value */ 111 | cov = 4, /*!< The number of bases covered */ 112 | coverage = 4, /*!bufSize*/ 198 | bwLL *firstIndexNode; /** 65535 will result in a maximum of 10. 485 | * @return 0 on success. 486 | */ 487 | int bwCreateHdr(bigWigFile_t *fp, int32_t maxZooms); 488 | 489 | /*! 490 | * @brief Take a list of chromosome names and lengths and return a pointer to a chromList_t 491 | * This MUST be run before `bwWriteHdr()`. Note that the input is NOT free()d! 492 | * @param chroms A list of chromosomes. 493 | * @param lengths The length of each chromosome. 494 | * @param n The number of chromosomes (thus, the length of `chroms` and `lengths`) 495 | * @return A pointer to a chromList_t or NULL on error. 496 | */ 497 | chromList_t *bwCreateChromList(const char* const* chroms, const uint32_t *lengths, int64_t n); 498 | 499 | /*! 500 | * @brief Write a the header to a bigWig file. 501 | * You must have already opened the output file, created a header and a chromosome list. 502 | * @param bw The output bigWigFile_t pointer. 
503 | * @see bwCreateHdr 504 | * @see bwCreateChromList 505 | */ 506 | int bwWriteHdr(bigWigFile_t *bw); 507 | 508 | /*! 509 | * @brief Write a new block of bedGraph-like intervals to a bigWig file 510 | * Adds entries of the form: 511 | * chromosome start end value 512 | * to the file. These will always be added in a new block, so you may have previously used a different storage type. 513 | * 514 | * In general it's more efficient to use the bwAppend* functions, but then you MUST know that the previously written block is of the same type. In other words, you can only use bwAppendIntervals() after bwAddIntervals() or a previous bwAppendIntervals(). 515 | * @param fp The output file pointer. 516 | * @param chrom A list of chromosomes, of length `n`. 517 | * @param start A list of start positions of length `n`. 518 | * @param end A list of end positions of length `n`. 519 | * @param values A list of values of length `n`. 520 | * @param n The length of the aforementioned lists. 521 | * @return 0 on success and another value on error. 522 | * @see bwAppendIntervals 523 | */ 524 | int bwAddIntervals(bigWigFile_t *fp, const char* const* chrom, const uint32_t *start, const uint32_t *end, const float *values, uint32_t n); 525 | 526 | /*! 527 | * @brief Append bedGraph-like intervals to a previous block of bedGraph-like intervals in a bigWig file. 528 | * If you have previously used bwAddIntervals() then this will append additional entries into the previous block (or start a new one if needed). 529 | * @param fp The output file pointer. 530 | * @param start A list of start positions of length `n`. 531 | * @param end A list of end positions of length `n`. 532 | * @param values A list of values of length `n`. 533 | * @param n The length of the aforementioned lists. 534 | * @return 0 on success and another value on error. 535 | * @warning Do NOT use this after `bwAddIntervalSpans()`, `bwAppendIntervalSpans()`, `bwAddIntervalSpanSteps()`, or `bwAppendIntervalSpanSteps()`.
536 | * @see bwAddIntervals 537 | */ 538 | int bwAppendIntervals(bigWigFile_t *fp, const uint32_t *start, const uint32_t *end, const float *values, uint32_t n); 539 | 540 | /*! 541 | * @brief Add a new block of variable-step entries to a bigWig file 542 | * Adds entries of the form 543 | * chromosome start value 544 | * to the file. Each block of such entries has an associated "span", so each value describes the region chromosome:start-(start+span) 545 | * 546 | * This will always start a new block of values. 547 | * @param fp The output file pointer. 548 | * @param chrom The chromosome that the entries describe. 549 | * @param start A list of start positions of length `n`. 550 | * @param span The span of each entry (they must all be the same). 551 | * @param values A list of values of length `n`. 552 | * @param n The length of the aforementioned lists. 553 | * @return 0 on success and another value on error. 554 | * @see bwAppendIntervalSpans 555 | */ 556 | int bwAddIntervalSpans(bigWigFile_t *fp, const char *chrom, const uint32_t *start, uint32_t span, const float *values, uint32_t n); 557 | 558 | /*! 559 | * @brief Append to a previous block of variable-step entries. 560 | * If you previously used `bwAddIntervalSpans()`, this will continue appending more values to the block(s) it created. 561 | * @param fp The output file pointer. 562 | * @param start A list of start positions of length `n`. 563 | * @param values A list of values of length `n`. 564 | * @param n The length of the aforementioned lists. 565 | * @return 0 on success and another value on error. 566 | * @warning Do NOT use this after `bwAddIntervals()`, `bwAppendIntervals()`, `bwAddIntervalSpanSteps()` or `bwAppendIntervalSpanSteps()` 567 | * @see bwAddIntervalSpans 568 | */ 569 | int bwAppendIntervalSpans(bigWigFile_t *fp, const uint32_t *start, const float *values, uint32_t n); 570 | 571 | /*!
572 | * @brief Add a new block of fixed-step entries to a bigWig file 573 | * Adds entries of the form 574 | * value 575 | * to the file. Each block of such entries has an associated "span", "step", chromosome and start position. See the wiggle format for more details. 576 | * 577 | * This will always start a new block of values. 578 | * @param fp The output file pointer. 579 | * @param chrom The chromosome that the entries describe. 580 | * @param start The starting position of the block of entries. 581 | * @param span The span of each entry (i.e., the number of bases it describes). 582 | * @param step The step between entry start positions. 583 | * @param values A list of values of length `n`. 584 | * @param n The length of the aforementioned list. 585 | * @return 0 on success and another value on error. 586 | * @see bwAppendIntervalSpanSteps 587 | */ 588 | int bwAddIntervalSpanSteps(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t span, uint32_t step, const float *values, uint32_t n); 589 | 590 | /*! 591 | * @brief Append to a previous block of fixed-step entries. 592 | * If you previously used `bwAddIntervalSpanSteps()`, this will continue appending more values to the block(s) it created. 593 | * @param fp The output file pointer. 594 | * @param values A list of values of length `n`. 595 | * @param n The length of the aforementioned list. 596 | * @return 0 on success and another value on error.
597 | * @warning Do NOT use this after `bwAddIntervals()`, `bwAppendIntervals()`, `bwAddIntervalSpans()` or `bwAppendIntervalSpans()` 598 | * @see bwAddIntervalSpanSteps 599 | */ 600 | int bwAppendIntervalSpanSteps(bigWigFile_t *fp, const float *values, uint32_t n); 601 | 602 | #ifdef __cplusplus 603 | } 604 | #endif 605 | 606 | #endif // LIBBIGWIG_H 607 | -------------------------------------------------------------------------------- /libBigWig/bigWigIO.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBBIGWIG_IO_H 2 | #define LIBBIGWIG_IO_H 3 | 4 | #ifndef NOCURL 5 | #include 6 | #else 7 | #include 8 | #ifndef CURLTYPE_DEFINED 9 | #define CURLTYPE_DEFINED 10 | typedef int CURLcode; 11 | typedef void CURL; 12 | #endif 13 | #define CURLE_OK 0 14 | #define CURLE_FAILED_INIT 1 15 | #endif 16 | /*! \file bigWigIO.h 17 | * These are (typically internal) IO functions, so there's generally no need for you to directly use them! 18 | */ 19 | 20 | /*! 21 | * The size of the buffer used for remote files. 22 | */ 23 | extern size_t GLOBAL_DEFAULTBUFFERSIZE; 24 | 25 | /*! 26 | * The enumerated values that indicate the connection type used to access a file. 27 | */ 28 | enum bigWigFile_type_enum { 29 | BWG_FILE = 0, 30 | BWG_HTTP = 1, 31 | BWG_HTTPS = 2, 32 | BWG_FTP = 3 33 | }; 34 | 35 | /*! 36 | * @brief This structure holds the file pointers and buffers needed for raw access to local and remote files. 37 | */ 38 | typedef struct { 39 | union { 40 | #ifndef NOCURL 41 | CURL *curl; /**1, the number of members fully copied (this is equivalent to `fread`). 26 | */ 27 | size_t bwRead(void *data, size_t sz, size_t nmemb, bigWigFile_t *fp); 28 | 29 | /*! 30 | * @brief Determine what the file position indicator say. 31 | * This is equivalent to `ftell` for local or remote files. 32 | * @param fp The file. 33 | * @return The position in the file. 34 | */ 35 | long bwTell(bigWigFile_t *fp); 36 | 37 | /*! 
38 | * @brief Reads a data index (either full data or a zoom level) from a bigWig file. 39 | * There is little reason for end users to use this function. This must be freed with `bwDestroyIndex` 40 | * @param fp A valid bigWigFile_t pointer 41 | * @param offset The file offset where the index begins 42 | * @return A bwRTree_t pointer or NULL on error. 43 | */ 44 | bwRTree_t *bwReadIndex(bigWigFile_t *fp, uint64_t offset); 45 | 46 | /*! 47 | * @brief Destroy an bwRTreeNode_t and all of its children. 48 | * @param node The node to destroy. 49 | */ 50 | void bwDestroyIndexNode(bwRTreeNode_t *node); 51 | 52 | /*! 53 | * @brief Frees space allocated by `bwReadIndex` 54 | * There is generally little reason to use this, since end users should typically not need to run `bwReadIndex` themselves. 55 | * @param idx A bwRTree_t pointer allocated by `bwReadIndex`. 56 | */ 57 | void bwDestroyIndex(bwRTree_t *idx); 58 | 59 | /// @cond SKIP 60 | bwOverlapBlock_t *walkRTreeNodes(bigWigFile_t *bw, bwRTreeNode_t *root, uint32_t tid, uint32_t start, uint32_t end); 61 | void destroyBWOverlapBlock(bwOverlapBlock_t *b); 62 | /// @endcond 63 | 64 | /*! 
65 | * @brief Finishes what's needed to write a bigWigFile 66 | * Flushes the buffer, converts the index linked list to a tree, writes that to disk, handles zoom level stuff, writes magic at the end 67 | * @param fp A valid bigWigFile_t pointer 68 | * @return 0 on success 69 | */ 70 | int bwFinalize(bigWigFile_t *fp); 71 | 72 | /// @cond SKIP 73 | char *bwStrdup(const char *s); 74 | /// @endcond 75 | -------------------------------------------------------------------------------- /libBigWig/bwRead.c: -------------------------------------------------------------------------------- 1 | #include "bigWig.h" 2 | #include "bwCommon.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | static uint64_t readChromBlock(bigWigFile_t *bw, chromList_t *cl, uint32_t keySize); 9 | 10 | //Return the position in the file 11 | long bwTell(bigWigFile_t *fp) { 12 | if(fp->URL->type == BWG_FILE) return ftell(fp->URL->x.fp); 13 | return (long) (fp->URL->filePos + fp->URL->bufPos); 14 | } 15 | 16 | //Seek to a given position, always from the beginning of the file 17 | //Return 0 on success and -1 on error 18 | //To do, use the return code of urlSeek() in a more useful way. 19 | int bwSetPos(bigWigFile_t *fp, size_t pos) { 20 | CURLcode rv = urlSeek(fp->URL, pos); 21 | if(rv == CURLE_OK) return 0; 22 | return -1; 23 | } 24 | 25 | //returns the number of full members read (nmemb on success, something less on error) 26 | size_t bwRead(void *data, size_t sz, size_t nmemb, bigWigFile_t *fp) { 27 | size_t i, rv; 28 | for(i=0; iURL, data+i*sz, sz); 30 | if(rv != sz) return i; 31 | } 32 | return nmemb; 33 | } 34 | 35 | //Initializes curl and sets global variables 36 | //Returns 0 on success and 1 on error 37 | //This should be called only once and bwCleanup() must be called when finished. 38 | int bwInit(size_t defaultBufSize) { 39 | //set the buffer size, number of iterations, sleep time between iterations, etc. 
40 | GLOBAL_DEFAULTBUFFERSIZE = defaultBufSize; 41 | 42 | //call curl_global_init() 43 | #ifndef NOCURL 44 | CURLcode rv; 45 | rv = curl_global_init(CURL_GLOBAL_ALL); 46 | if(rv != CURLE_OK) return 1; 47 | #endif 48 | return 0; 49 | } 50 | 51 | //This should be called before quiting, to release memory acquired by curl 52 | void bwCleanup() { 53 | #ifndef NOCURL 54 | curl_global_cleanup(); 55 | #endif 56 | } 57 | 58 | static bwZoomHdr_t *bwReadZoomHdrs(bigWigFile_t *bw) { 59 | if(bw->isWrite) return NULL; 60 | uint16_t i; 61 | bwZoomHdr_t *zhdr = malloc(sizeof(bwZoomHdr_t)); 62 | if(!zhdr) return NULL; 63 | uint32_t *level = malloc(bw->hdr->nLevels * sizeof(uint64_t)); 64 | if(!level) { 65 | free(zhdr); 66 | return NULL; 67 | } 68 | uint32_t padding = 0; 69 | uint64_t *dataOffset = malloc(sizeof(uint64_t) * bw->hdr->nLevels); 70 | if(!dataOffset) { 71 | free(zhdr); 72 | free(level); 73 | return NULL; 74 | } 75 | uint64_t *indexOffset = malloc(sizeof(uint64_t) * bw->hdr->nLevels); 76 | if(!indexOffset) { 77 | free(zhdr); 78 | free(level); 79 | free(dataOffset); 80 | return NULL; 81 | } 82 | 83 | for(i=0; ihdr->nLevels; i++) { 84 | if(bwRead((void*) &(level[i]), sizeof(uint32_t), 1, bw) != 1) goto error; 85 | if(bwRead((void*) &padding, sizeof(uint32_t), 1, bw) != 1) goto error; 86 | if(bwRead((void*) &(dataOffset[i]), sizeof(uint64_t), 1, bw) != 1) goto error; 87 | if(bwRead((void*) &(indexOffset[i]), sizeof(uint64_t), 1, bw) != 1) goto error; 88 | } 89 | 90 | zhdr->level = level; 91 | zhdr->dataOffset = dataOffset; 92 | zhdr->indexOffset = indexOffset; 93 | zhdr->idx = calloc(bw->hdr->nLevels, sizeof(bwRTree_t*)); 94 | if(!zhdr->idx) goto error; 95 | 96 | return zhdr; 97 | 98 | error: 99 | for(i=0; ihdr->nLevels; i++) { 100 | if(zhdr->idx[i]) bwDestroyIndex(zhdr->idx[i]); 101 | } 102 | free(zhdr); 103 | free(level); 104 | free(dataOffset); 105 | free(indexOffset); 106 | return NULL; 107 | } 108 | 109 | static void bwHdrDestroy(bigWigHdr_t *hdr) { 110 | int i; 111 | 
if(hdr->zoomHdrs) { 112 | free(hdr->zoomHdrs->level); 113 | free(hdr->zoomHdrs->dataOffset); 114 | free(hdr->zoomHdrs->indexOffset); 115 | for(i=0; inLevels; i++) { 116 | if(hdr->zoomHdrs->idx[i]) bwDestroyIndex(hdr->zoomHdrs->idx[i]); 117 | } 118 | free(hdr->zoomHdrs->idx); 119 | free(hdr->zoomHdrs); 120 | } 121 | free(hdr); 122 | } 123 | 124 | static void bwHdrRead(bigWigFile_t *bw) { 125 | uint32_t magic; 126 | if(bw->isWrite) return; 127 | bw->hdr = calloc(1, sizeof(bigWigHdr_t)); 128 | if(!bw->hdr) return; 129 | 130 | if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error; //0x0 131 | if(magic != BIGWIG_MAGIC && magic != BIGBED_MAGIC) goto error; 132 | 133 | if(bwRead((void*) &(bw->hdr->version), sizeof(uint16_t), 1, bw) != 1) goto error; //0x4 134 | if(bwRead((void*) &(bw->hdr->nLevels), sizeof(uint16_t), 1, bw) != 1) goto error; //0x6 135 | if(bwRead((void*) &(bw->hdr->ctOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x8 136 | if(bwRead((void*) &(bw->hdr->dataOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x10 137 | if(bwRead((void*) &(bw->hdr->indexOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x18 138 | if(bwRead((void*) &(bw->hdr->fieldCount), sizeof(uint16_t), 1, bw) != 1) goto error; //0x20 139 | if(bwRead((void*) &(bw->hdr->definedFieldCount), sizeof(uint16_t), 1, bw) != 1) goto error; //0x22 140 | if(bwRead((void*) &(bw->hdr->sqlOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x24 141 | if(bwRead((void*) &(bw->hdr->summaryOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x2c 142 | if(bwRead((void*) &(bw->hdr->bufSize), sizeof(uint32_t), 1, bw) != 1) goto error; //0x34 143 | if(bwRead((void*) &(bw->hdr->extensionOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x38 144 | 145 | //zoom headers 146 | if(bw->hdr->nLevels) { 147 | if(!(bw->hdr->zoomHdrs = bwReadZoomHdrs(bw))) goto error; 148 | } 149 | 150 | //File summary information 151 | if(bw->hdr->summaryOffset) { 152 | if(urlSeek(bw->URL, 
bw->hdr->summaryOffset) != CURLE_OK) goto error; 153 | if(bwRead((void*) &(bw->hdr->nBasesCovered), sizeof(uint64_t), 1, bw) != 1) goto error; 154 | if(bwRead((void*) &(bw->hdr->minVal), sizeof(uint64_t), 1, bw) != 1) goto error; 155 | if(bwRead((void*) &(bw->hdr->maxVal), sizeof(uint64_t), 1, bw) != 1) goto error; 156 | if(bwRead((void*) &(bw->hdr->sumData), sizeof(uint64_t), 1, bw) != 1) goto error; 157 | if(bwRead((void*) &(bw->hdr->sumSquared), sizeof(uint64_t), 1, bw) != 1) goto error; 158 | } 159 | 160 | //In case of uncompressed remote files, let the IO functions know to request larger chunks 161 | bw->URL->isCompressed = (bw->hdr->bufSize > 0)?1:0; 162 | 163 | return; 164 | 165 | error: 166 | bwHdrDestroy(bw->hdr); 167 | fprintf(stderr, "[bwHdrRead] There was an error while reading in the header!\n"); 168 | bw->hdr = NULL; 169 | } 170 | 171 | static void destroyChromList(chromList_t *cl) { 172 | uint32_t i; 173 | if(!cl) return; 174 | if(cl->nKeys && cl->chrom) { 175 | for(i=0; inKeys; i++) { 176 | if(cl->chrom[i]) free(cl->chrom[i]); 177 | } 178 | } 179 | if(cl->chrom) free(cl->chrom); 180 | if(cl->len) free(cl->len); 181 | free(cl); 182 | } 183 | 184 | static uint64_t readChromLeaf(bigWigFile_t *bw, chromList_t *cl, uint32_t valueSize) { 185 | uint16_t nVals, i; 186 | uint32_t idx; 187 | char *chrom = NULL; 188 | 189 | if(bwRead((void*) &nVals, sizeof(uint16_t), 1, bw) != 1) return -1; 190 | chrom = calloc(valueSize+1, sizeof(char)); 191 | if(!chrom) return -1; 192 | 193 | for(i=0; ilen[idx]), sizeof(uint32_t), 1, bw) != 1) goto error; 197 | cl->chrom[idx] = bwStrdup(chrom); 198 | if(!(cl->chrom[idx])) goto error; 199 | } 200 | 201 | free(chrom); 202 | return nVals; 203 | 204 | error: 205 | free(chrom); 206 | return -1; 207 | } 208 | 209 | static uint64_t readChromNonLeaf(bigWigFile_t *bw, chromList_t *cl, uint32_t keySize) { 210 | uint64_t offset , rv = 0, previous; 211 | uint16_t nVals, i; 212 | 213 | if(bwRead((void*) &nVals, sizeof(uint16_t), 1, bw) 
!= 1) return -1; 214 | 215 | previous = bwTell(bw) + keySize; 216 | for(i=0; iisWrite) return NULL; 245 | if(bwSetPos(bw, bw->hdr->ctOffset)) return NULL; 246 | 247 | cl = calloc(1, sizeof(chromList_t)); 248 | if(!cl) return NULL; 249 | 250 | if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error; 251 | if(magic != CIRTREE_MAGIC) goto error; 252 | 253 | if(bwRead((void*) &itemsPerBlock, sizeof(uint32_t), 1, bw) != 1) goto error; 254 | if(bwRead((void*) &keySize, sizeof(uint32_t), 1, bw) != 1) goto error; 255 | if(bwRead((void*) &valueSize, sizeof(uint32_t), 1, bw) != 1) goto error; 256 | if(bwRead((void*) &itemCount, sizeof(uint64_t), 1, bw) != 1) goto error; 257 | 258 | cl->nKeys = itemCount; 259 | cl->chrom = calloc(itemCount, sizeof(char*)); 260 | cl->len = calloc(itemCount, sizeof(uint32_t)); 261 | if(!cl->chrom) goto error; 262 | if(!cl->len) goto error; 263 | 264 | if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error; 265 | if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error; 266 | 267 | //Read in the blocks 268 | rv = readChromBlock(bw, cl, keySize); 269 | if(rv == (uint64_t) -1) goto error; 270 | if(rv != itemCount) goto error; 271 | 272 | return cl; 273 | 274 | error: 275 | destroyChromList(cl); 276 | return NULL; 277 | } 278 | 279 | //This is here mostly for convenience 280 | static void bwDestroyWriteBuffer(bwWriteBuffer_t *wb) { 281 | if(wb->p) free(wb->p); 282 | if(wb->compressP) free(wb->compressP); 283 | if(wb->firstZoomBuffer) free(wb->firstZoomBuffer); 284 | if(wb->lastZoomBuffer) free(wb->lastZoomBuffer); 285 | if(wb->nNodes) free(wb->nNodes); 286 | free(wb); 287 | } 288 | 289 | void bwClose(bigWigFile_t *fp) { 290 | if(!fp) return; 291 | if(bwFinalize(fp)) { 292 | fprintf(stderr, "[bwClose] There was an error while finishing writing a bigWig file! 
The output is likely truncated.\n"); 293 | } 294 | if(fp->URL) urlClose(fp->URL); 295 | if(fp->hdr) bwHdrDestroy(fp->hdr); 296 | if(fp->cl) destroyChromList(fp->cl); 297 | if(fp->idx) bwDestroyIndex(fp->idx); 298 | if(fp->writeBuffer) bwDestroyWriteBuffer(fp->writeBuffer); 299 | free(fp); 300 | } 301 | 302 | int bwIsBigWig(const char *fname, CURLcode (*callBack) (CURL*)) { 303 | uint32_t magic = 0; 304 | URL_t *URL = NULL; 305 | 306 | URL = urlOpen(fname, *callBack, NULL); 307 | 308 | if(!URL) return 0; 309 | if(urlRead(URL, (void*) &magic, sizeof(uint32_t)) != sizeof(uint32_t)) magic = 0; 310 | urlClose(URL); 311 | if(magic == BIGWIG_MAGIC) return 1; 312 | return 0; 313 | } 314 | 315 | char *bbGetSQL(bigWigFile_t *fp) { 316 | char *o = NULL; 317 | uint64_t len; 318 | if(!fp->hdr->sqlOffset) return NULL; 319 | len = fp->hdr->summaryOffset - fp->hdr->sqlOffset; //This includes the NULL terminator 320 | o = malloc(sizeof(char) * len); 321 | if(!o) goto error; 322 | if(bwSetPos(fp, fp->hdr->sqlOffset)) goto error; 323 | if(bwRead((void*) o, len, 1, fp) != 1) goto error; 324 | return o; 325 | 326 | error: 327 | if(o) free(o); 328 | printf("Got an error in bbGetSQL!\n"); 329 | return NULL; 330 | } 331 | 332 | int bbIsBigBed(const char *fname, CURLcode (*callBack) (CURL*)) { 333 | uint32_t magic = 0; 334 | URL_t *URL = NULL; 335 | 336 | URL = urlOpen(fname, *callBack, NULL); 337 | 338 | if(!URL) return 0; 339 | if(urlRead(URL, (void*) &magic, sizeof(uint32_t)) != sizeof(uint32_t)) magic = 0; 340 | urlClose(URL); 341 | if(magic == BIGBED_MAGIC) return 1; 342 | return 0; 343 | } 344 | 345 | bigWigFile_t *bwOpen(const char *fname, CURLcode (*callBack) (CURL*), const char *mode) { 346 | bigWigFile_t *bwg = calloc(1, sizeof(bigWigFile_t)); 347 | if(!bwg) { 348 | fprintf(stderr, "[bwOpen] Couldn't allocate space to create the output object!\n"); 349 | return NULL; 350 | } 351 | if((!mode) || (strchr(mode, 'w') == NULL)) { 352 | bwg->isWrite = 0; 353 | bwg->URL = urlOpen(fname, 
*callBack, NULL); 354 | if(!bwg->URL) { 355 | fprintf(stderr, "[bwOpen] urlOpen is NULL!\n"); 356 | goto error; 357 | } 358 | 359 | //Attempt to read in the fixed header 360 | bwHdrRead(bwg); 361 | if(!bwg->hdr) { 362 | fprintf(stderr, "[bwOpen] bwg->hdr is NULL!\n"); 363 | goto error; 364 | } 365 | 366 | //Read in the chromosome list 367 | bwg->cl = bwReadChromList(bwg); 368 | if(!bwg->cl) { 369 | fprintf(stderr, "[bwOpen] bwg->cl is NULL (%s)!\n", fname); 370 | goto error; 371 | } 372 | 373 | //Read in the index 374 | if(bwg->hdr->indexOffset) { 375 | bwg->idx = bwReadIndex(bwg, 0); 376 | if(!bwg->idx) { 377 | fprintf(stderr, "[bwOpen] bwg->idx is NULL bwg->hdr->dataOffset 0x%"PRIx64"!\n", bwg->hdr->dataOffset); 378 | goto error; 379 | } 380 | } 381 | } else { 382 | bwg->isWrite = 1; 383 | bwg->URL = urlOpen(fname, NULL, "w+"); 384 | if(!bwg->URL) goto error; 385 | bwg->writeBuffer = calloc(1,sizeof(bwWriteBuffer_t)); 386 | if(!bwg->writeBuffer) goto error; 387 | bwg->writeBuffer->l = 24; 388 | } 389 | 390 | return bwg; 391 | 392 | error: 393 | bwClose(bwg); 394 | return NULL; 395 | } 396 | 397 | bigWigFile_t *bbOpen(const char *fname, CURLcode (*callBack) (CURL*)) { 398 | bigWigFile_t *bb = calloc(1, sizeof(bigWigFile_t)); 399 | if(!bb) { 400 | fprintf(stderr, "[bbOpen] Couldn't allocate space to create the output object!\n"); 401 | return NULL; 402 | } 403 | 404 | //Set the type to 1 for bigBed 405 | bb->type = 1; 406 | 407 | bb->URL = urlOpen(fname, *callBack, NULL); 408 | if(!bb->URL) goto error; 409 | 410 | //Attempt to read in the fixed header 411 | bwHdrRead(bb); 412 | if(!bb->hdr) goto error; 413 | 414 | //Read in the chromosome list 415 | bb->cl = bwReadChromList(bb); 416 | if(!bb->cl) goto error; 417 | 418 | //Read in the index 419 | bb->idx = bwReadIndex(bb, 0); 420 | if(!bb->idx) goto error; 421 | 422 | return bb; 423 | 424 | error: 425 | bwClose(bb); 426 | return NULL; 427 | } 428 | 429 | 430 | //Implementation taken from musl: 431 | 
//https://git.musl-libc.org/cgit/musl/tree/src/string/strdup.c 432 | //License: https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT 433 | char* bwStrdup(const char *s) { 434 | size_t l = strlen(s); 435 | char *d = malloc(l+1); 436 | if (!d) return NULL; 437 | return memcpy(d, s, l+1); 438 | } 439 | -------------------------------------------------------------------------------- /libBigWig/bwStats.c: -------------------------------------------------------------------------------- 1 | #include "bigWig.h" 2 | #include "bwCommon.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | //Returns -1 if there are no applicable levels, otherwise an integer indicating the most appropriate level. 10 | //Like Kent's library, this divides the desired bin size by 2 to minimize the effect of blocks overlapping multiple bins 11 | static int32_t determineZoomLevel(const bigWigFile_t *fp, int basesPerBin) { 12 | int32_t out = -1; 13 | int64_t diff; 14 | uint32_t bestDiff = -1; 15 | uint16_t i; 16 | 17 | basesPerBin/=2; 18 | for(i=0; ihdr->nLevels; i++) { 19 | diff = basesPerBin - (int64_t) fp->hdr->zoomHdrs->level[i]; 20 | if(diff >= 0 && diff < bestDiff) { 21 | bestDiff = diff; 22 | out = i; 23 | } 24 | } 25 | return out; 26 | } 27 | 28 | /// @cond SKIP 29 | struct val_t { 30 | uint32_t nBases; 31 | float min, max, sum, sumsq; 32 | double scalar; 33 | }; 34 | 35 | struct vals_t { 36 | uint32_t n; 37 | struct val_t **vals; 38 | }; 39 | /// @endcond 40 | 41 | void destroyVals_t(struct vals_t *v) { 42 | uint32_t i; 43 | if(!v) return; 44 | for(i=0; in; i++) free(v->vals[i]); 45 | if(v->vals) free(v->vals); 46 | free(v); 47 | } 48 | 49 | //Determine the base-pair overlap between an interval and a block 50 | double getScalar(uint32_t i_start, uint32_t i_end, uint32_t b_start, uint32_t b_end) { 51 | double rv = 0.0; 52 | if(b_start <= i_start) { 53 | if(b_end > i_start) rv = ((double)(b_end - i_start))/(b_end-b_start); 54 | } else if(b_start < i_end) { 55 | if(b_end < 
i_end) rv = ((double)(b_end - b_start))/(b_end-b_start); 56 | else rv = ((double)(i_end - b_start))/(b_end-b_start); 57 | } 58 | 59 | return rv; 60 | } 61 | 62 | //Returns NULL on error 63 | static struct vals_t *getVals(bigWigFile_t *fp, bwOverlapBlock_t *o, int i, uint32_t tid, uint32_t start, uint32_t end) { 64 | void *buf = NULL, *compBuf = NULL; 65 | uLongf sz = fp->hdr->bufSize; 66 | int compressed = 0, rv; 67 | uint32_t *p, vtid, vstart, vend; 68 | struct vals_t *vals = NULL; 69 | struct val_t *v = NULL; 70 | 71 | if(sz) { 72 | compressed = 1; 73 | buf = malloc(sz); 74 | } 75 | sz = 0; //This is now the size of the compressed buffer 76 | 77 | if(bwSetPos(fp, o->offset[i])) goto error; 78 | 79 | vals = calloc(1,sizeof(struct vals_t)); 80 | if(!vals) goto error; 81 | 82 | v = malloc(sizeof(struct val_t)); 83 | if(!v) goto error; 84 | 85 | if(sz < o->size[i]) compBuf = malloc(o->size[i]); 86 | if(!compBuf) goto error; 87 | 88 | if(bwRead(compBuf, o->size[i], 1, fp) != 1) goto error; 89 | if(compressed) { 90 | sz = fp->hdr->bufSize; 91 | rv = uncompress(buf, &sz, compBuf, o->size[i]); 92 | if(rv != Z_OK) goto error; 93 | } else { 94 | buf = compBuf; 95 | sz = o->size[i]; 96 | } 97 | 98 | p = buf; 99 | while(((uLongf) ((char*)p - (char*)buf)) < sz) { 100 | vtid = p[0]; 101 | vstart = p[1]; 102 | vend = p[2]; 103 | v->nBases = p[3]; 104 | v->min = ((float*) p)[4]; 105 | v->max = ((float*) p)[5]; 106 | v->sum = ((float*) p)[6]; 107 | v->sumsq = ((float*) p)[7]; 108 | v->scalar = getScalar(start, end, vstart, vend); 109 | 110 | if(tid == vtid) { 111 | if((start <= vstart && end > vstart) || (start < vend && start >= vstart)) { 112 | vals->vals = realloc(vals->vals, sizeof(struct val_t*)*(vals->n+1)); 113 | if(!vals->vals) goto error; 114 | vals->vals[vals->n++] = v; 115 | v = malloc(sizeof(struct val_t)); 116 | if(!v) goto error; 117 | } 118 | if(vstart > end) break; 119 | } else if(vtid > tid) { 120 | break; 121 | } 122 | p+=8; 123 | } 124 | 125 | free(v); 126 | 
free(buf); 127 | if(compressed) free(compBuf); 128 | return vals; 129 | 130 | error: 131 | if(buf) free(buf); 132 | if(compBuf && compressed) free(compBuf); 133 | if(v) free(v); 134 | destroyVals_t(vals); 135 | return NULL; 136 | } 137 | 138 | //On error, errno is set to ENOMEM and NaN is returned (though NaN can be returned normally) 139 | static double blockMean(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { 140 | uint32_t i, j; 141 | double output = 0.0, coverage = 0.0; 142 | struct vals_t *v = NULL; 143 | 144 | if(!blocks->n) return strtod("NaN", NULL); 145 | 146 | //Iterate over the blocks 147 | for(i=0; in; i++) { 148 | v = getVals(fp, blocks, i, tid, start, end); 149 | if(!v) goto error; 150 | for(j=0; jn; j++) { 151 | output += v->vals[j]->sum * v->vals[j]->scalar; 152 | coverage += v->vals[j]->nBases * v->vals[j]->scalar; 153 | } 154 | destroyVals_t(v); 155 | } 156 | 157 | 158 | if(!coverage) return strtod("NaN", NULL); 159 | 160 | return output/coverage; 161 | 162 | error: 163 | if(v) free(v); 164 | errno = ENOMEM; 165 | return strtod("NaN", NULL); 166 | } 167 | 168 | static double intMean(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) { 169 | double sum = 0.0; 170 | uint32_t nBases = 0, i, start_use, end_use; 171 | 172 | if(!ints->l) return strtod("NaN", NULL); 173 | 174 | for(i=0; il; i++) { 175 | start_use = ints->start[i]; 176 | end_use = ints->end[i]; 177 | if(ints->start[i] < start) start_use = start; 178 | if(ints->end[i] > end) end_use = end; 179 | nBases += end_use-start_use; 180 | sum += (end_use-start_use)*((double) ints->value[i]); 181 | } 182 | 183 | return sum/nBases; 184 | } 185 | 186 | //Does UCSC compensate for partial block/range overlap? 
187 | static double blockDev(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { 188 | uint32_t i, j; 189 | double mean = 0.0, ssq = 0.0, coverage = 0.0, diff; 190 | struct vals_t *v = NULL; 191 | 192 | if(!blocks->n) return strtod("NaN", NULL); 193 | 194 | //Iterate over the blocks 195 | for(i=0; in; i++) { 196 | v = getVals(fp, blocks, i, tid, start, end); 197 | if(!v) goto error; 198 | for(j=0; jn; j++) { 199 | coverage += v->vals[j]->nBases * v->vals[j]->scalar; 200 | mean += v->vals[j]->sum * v->vals[j]->scalar; 201 | ssq += v->vals[j]->sumsq * v->vals[j]->scalar; 202 | } 203 | destroyVals_t(v); 204 | v = NULL; 205 | } 206 | 207 | if(coverage<=1.0) return strtod("NaN", NULL); 208 | diff = ssq-mean*mean/coverage; 209 | if(coverage > 1.0) diff /= coverage-1; 210 | if(fabs(diff) > 1e-8) { //Ignore floating point differences 211 | return sqrt(diff); 212 | } else { 213 | return 0.0; 214 | } 215 | 216 | error: 217 | if(v) destroyVals_t(v); 218 | errno = ENOMEM; 219 | return strtod("NaN", NULL); 220 | } 221 | 222 | //This uses compensated summation to account for finite precision math 223 | static double intDev(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) { 224 | double v1 = 0.0, mean, rv; 225 | uint32_t nBases = 0, i, start_use, end_use; 226 | 227 | if(!ints->l) return strtod("NaN", NULL); 228 | mean = intMean(ints, start, end); 229 | 230 | for(i=0; il; i++) { 231 | start_use = ints->start[i]; 232 | end_use = ints->end[i]; 233 | if(ints->start[i] < start) start_use = start; 234 | if(ints->end[i] > end) end_use = end; 235 | nBases += end_use-start_use; 236 | v1 += (end_use-start_use) * pow(ints->value[i]-mean, 2.0); //running sum of squared difference 237 | } 238 | 239 | if(nBases>=2) rv = sqrt(v1/(nBases-1)); 240 | else if(nBases==1) rv = sqrt(v1); 241 | else rv = strtod("NaN", NULL); 242 | 243 | return rv; 244 | } 245 | 246 | static double blockMax(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, 
uint32_t start, uint32_t end) { 247 | uint32_t i, j, isNA = 1; 248 | double o = strtod("NaN", NULL); 249 | struct vals_t *v = NULL; 250 | 251 | if(!blocks->n) return o; 252 | 253 | //Iterate the blocks 254 | for(i=0; in; i++) { 255 | v = getVals(fp, blocks, i, tid, start, end); 256 | if(!v) goto error; 257 | for(j=0; jn; j++) { 258 | if(isNA) { 259 | o = v->vals[j]->max; 260 | isNA = 0; 261 | } else if(v->vals[j]->max > o) { 262 | o = v->vals[j]->max; 263 | } 264 | } 265 | destroyVals_t(v); 266 | } 267 | 268 | return o; 269 | 270 | error: 271 | destroyVals_t(v); 272 | errno = ENOMEM; 273 | return strtod("NaN", NULL); 274 | } 275 | 276 | static double intMax(bwOverlappingIntervals_t* ints) { 277 | uint32_t i; 278 | double o; 279 | 280 | if(ints->l < 1) return strtod("NaN", NULL); 281 | 282 | o = ints->value[0]; 283 | for(i=1; il; i++) { 284 | if(ints->value[i] > o) o = ints->value[i]; 285 | } 286 | 287 | return o; 288 | } 289 | 290 | static double blockMin(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { 291 | uint32_t i, j, isNA = 1; 292 | double o = strtod("NaN", NULL); 293 | struct vals_t *v = NULL; 294 | 295 | if(!blocks->n) return o; 296 | 297 | //Iterate the blocks 298 | for(i=0; in; i++) { 299 | v = getVals(fp, blocks, i, tid, start, end); 300 | if(!v) goto error; 301 | for(j=0; jn; j++) { 302 | if(isNA) { 303 | o = v->vals[j]->min; 304 | isNA = 0; 305 | } else if(v->vals[j]->min < o) o = v->vals[j]->min; 306 | } 307 | destroyVals_t(v); 308 | } 309 | 310 | return o; 311 | 312 | error: 313 | destroyVals_t(v); 314 | errno = ENOMEM; 315 | return strtod("NaN", NULL); 316 | } 317 | 318 | static double intMin(bwOverlappingIntervals_t* ints) { 319 | uint32_t i; 320 | double o; 321 | 322 | if(ints->l < 1) return strtod("NaN", NULL); 323 | 324 | o = ints->value[0]; 325 | for(i=1; il; i++) { 326 | if(ints->value[i] < o) o = ints->value[i]; 327 | } 328 | 329 | return o; 330 | } 331 | 332 | //Does UCSC compensate for only partial 
block/interval overlap? 333 | static double blockCoverage(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { 334 | uint32_t i, j; 335 | double o = 0.0; 336 | struct vals_t *v = NULL; 337 | 338 | if(!blocks->n) return strtod("NaN", NULL); 339 | 340 | //Iterate over the blocks 341 | for(i=0; in; i++) { 342 | v = getVals(fp, blocks, i, tid, start, end); 343 | if(!v) goto error; 344 | for(j=0; jn; j++) { 345 | o+= v->vals[j]->nBases * v->vals[j]->scalar; 346 | } 347 | destroyVals_t(v); 348 | } 349 | 350 | if(o == 0.0) return strtod("NaN", NULL); 351 | return o; 352 | 353 | error: 354 | destroyVals_t(v); 355 | errno = ENOMEM; 356 | return strtod("NaN", NULL); 357 | } 358 | 359 | static double intCoverage(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) { 360 | uint32_t i, start_use, end_use; 361 | double o = 0.0; 362 | 363 | if(!ints->l) return strtod("NaN", NULL); 364 | 365 | for(i=0; il; i++) { 366 | start_use = ints->start[i]; 367 | end_use = ints->end[i]; 368 | if(start_use < start) start_use = start; 369 | if(end_use > end) end_use = end; 370 | o += end_use - start_use; 371 | } 372 | 373 | return o/(end-start); 374 | } 375 | 376 | static double blockSum(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { 377 | uint32_t i, j, sizeUse; 378 | double o = 0.0; 379 | struct vals_t *v = NULL; 380 | 381 | if(!blocks->n) return strtod("NaN", NULL); 382 | 383 | //Iterate over the blocks 384 | for(i=0; in; i++) { 385 | v = getVals(fp, blocks, i, tid, start, end); 386 | if(!v) goto error; 387 | for(j=0; jn; j++) { 388 | //Multiply the block average by min(bases covered, block overlap with interval) 389 | sizeUse = v->vals[j]->scalar; 390 | if(sizeUse > v->vals[j]->nBases) sizeUse = v->vals[j]->nBases; 391 | o+= (v->vals[j]->sum * sizeUse) / v->vals[j]->nBases; 392 | } 393 | destroyVals_t(v); 394 | } 395 | 396 | if(o == 0.0) return strtod("NaN", NULL); 397 | return o; 398 | 399 | error: 
400 | destroyVals_t(v); 401 | errno = ENOMEM; 402 | return strtod("NaN", NULL); 403 | } 404 | 405 | static double intSum(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) { 406 | uint32_t i, start_use, end_use; 407 | double o = 0.0; 408 | 409 | if(!ints->l) return strtod("NaN", NULL); 410 | 411 | for(i=0; il; i++) { 412 | start_use = ints->start[i]; 413 | end_use = ints->end[i]; 414 | if(start_use < start) start_use = start; 415 | if(end_use > end) end_use = end; 416 | o += (end_use - start_use) * ints->value[i]; 417 | } 418 | 419 | return o; 420 | } 421 | 422 | //Returns NULL on error, otherwise a double* that needs to be free()d 423 | static double *bwStatsFromZoom(bigWigFile_t *fp, int32_t level, uint32_t tid, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) { 424 | bwOverlapBlock_t *blocks = NULL; 425 | double *output = NULL; 426 | uint32_t pos = start, i, end2; 427 | 428 | if(!fp->hdr->zoomHdrs->idx[level]) { 429 | fp->hdr->zoomHdrs->idx[level] = bwReadIndex(fp, fp->hdr->zoomHdrs->indexOffset[level]); 430 | if(!fp->hdr->zoomHdrs->idx[level]) return NULL; 431 | } 432 | errno = 0; //Sometimes libCurls sets and then doesn't unset errno on errors 433 | 434 | output = malloc(sizeof(double)*nBins); 435 | if(!output) return NULL; 436 | 437 | for(i=0, pos=start; ihdr->zoomHdrs->idx[level]->root, tid, pos, end2); 440 | if(!blocks) goto error; 441 | 442 | switch(type) { 443 | case 0: 444 | //mean 445 | output[i] = blockMean(fp, blocks, tid, pos, end2); 446 | break; 447 | case 1: 448 | //stdev 449 | output[i] = blockDev(fp, blocks, tid, pos, end2); 450 | break; 451 | case 2: 452 | //max 453 | output[i] = blockMax(fp, blocks, tid, pos, end2); 454 | break; 455 | case 3: 456 | //min 457 | output[i] = blockMin(fp, blocks, tid, pos, end2); 458 | break; 459 | case 4: 460 | //cov 461 | output[i] = blockCoverage(fp, blocks, tid, pos, end2)/(end2-pos); 462 | break; 463 | case 5: 464 | //sum 465 | output[i] = blockSum(fp, blocks, tid, pos, end2); 
466 | break; 467 | default: 468 | goto error; 469 | break; 470 | } 471 | if(errno) goto error; 472 | destroyBWOverlapBlock(blocks); 473 | pos = end2; 474 | } 475 | 476 | return output; 477 | 478 | error: 479 | fprintf(stderr, "got an error in bwStatsFromZoom in the range %"PRIu32"-%"PRIu32": %s\n", pos, end2, strerror(errno)); 480 | if(blocks) destroyBWOverlapBlock(blocks); 481 | if(output) free(output); 482 | return NULL; 483 | } 484 | 485 | double *bwStatsFromFull(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) { 486 | bwOverlappingIntervals_t *ints = NULL; 487 | double *output = malloc(sizeof(double)*nBins); 488 | uint32_t i, pos = start, end2; 489 | if(!output) return NULL; 490 | 491 | for(i=0; i 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | static uint32_t roundup(uint32_t v) { 10 | v--; 11 | v |= v >> 1; 12 | v |= v >> 2; 13 | v |= v >> 4; 14 | v |= v >> 8; 15 | v |= v >> 16; 16 | v++; 17 | return v; 18 | } 19 | 20 | //Returns the root node on success and NULL on error 21 | static bwRTree_t *readRTreeIdx(bigWigFile_t *fp, uint64_t offset) { 22 | uint32_t magic; 23 | bwRTree_t *node; 24 | 25 | if(!offset) { 26 | if(bwSetPos(fp, fp->hdr->indexOffset)) return NULL; 27 | } else { 28 | if(bwSetPos(fp, offset)) return NULL; 29 | } 30 | 31 | if(bwRead(&magic, sizeof(uint32_t), 1, fp) != 1) return NULL; 32 | if(magic != IDX_MAGIC) { 33 | fprintf(stderr, "[readRTreeIdx] Mismatch in the magic number!\n"); 34 | return NULL; 35 | } 36 | 37 | node = calloc(1, sizeof(bwRTree_t)); 38 | if(!node) return NULL; 39 | 40 | if(bwRead(&(node->blockSize), sizeof(uint32_t), 1, fp) != 1) goto error; 41 | if(bwRead(&(node->nItems), sizeof(uint64_t), 1, fp) != 1) goto error; 42 | if(bwRead(&(node->chrIdxStart), sizeof(uint32_t), 1, fp) != 1) goto error; 43 | if(bwRead(&(node->baseStart), sizeof(uint32_t), 1, fp) != 1) goto error; 44 | if(bwRead(&(node->chrIdxEnd), sizeof(uint32_t), 1, fp) != 1) goto error; 45 | 
if(bwRead(&(node->baseEnd), sizeof(uint32_t), 1, fp) != 1) goto error; 46 | if(bwRead(&(node->idxSize), sizeof(uint64_t), 1, fp) != 1) goto error; 47 | if(bwRead(&(node->nItemsPerSlot), sizeof(uint32_t), 1, fp) != 1) goto error; 48 | //Padding 49 | if(bwRead(&(node->blockSize), sizeof(uint32_t), 1, fp) != 1) goto error; 50 | node->rootOffset = bwTell(fp); 51 | 52 | //For remote files, libCurl sometimes sets errno to 115 and doesn't clear it 53 | errno = 0; 54 | 55 | return node; 56 | 57 | error: 58 | free(node); 59 | return NULL; 60 | } 61 | 62 | //Returns a bwRTreeNode_t on success and NULL on an error 63 | //For the root node, set offset to 0 64 | static bwRTreeNode_t *bwGetRTreeNode(bigWigFile_t *fp, uint64_t offset) { 65 | bwRTreeNode_t *node = NULL; 66 | uint8_t padding; 67 | uint16_t i; 68 | if(offset) { 69 | if(bwSetPos(fp, offset)) return NULL; 70 | } else { 71 | //seek 72 | if(bwSetPos(fp, fp->idx->rootOffset)) return NULL; 73 | } 74 | 75 | node = calloc(1, sizeof(bwRTreeNode_t)); 76 | if(!node) return NULL; 77 | 78 | if(bwRead(&(node->isLeaf), sizeof(uint8_t), 1, fp) != 1) goto error; 79 | if(bwRead(&padding, sizeof(uint8_t), 1, fp) != 1) goto error; 80 | if(bwRead(&(node->nChildren), sizeof(uint16_t), 1, fp) != 1) goto error; 81 | 82 | node->chrIdxStart = malloc(sizeof(uint32_t)*(node->nChildren)); 83 | if(!node->chrIdxStart) goto error; 84 | node->baseStart = malloc(sizeof(uint32_t)*(node->nChildren)); 85 | if(!node->baseStart) goto error; 86 | node->chrIdxEnd = malloc(sizeof(uint32_t)*(node->nChildren)); 87 | if(!node->chrIdxEnd) goto error; 88 | node->baseEnd = malloc(sizeof(uint32_t)*(node->nChildren)); 89 | if(!node->baseEnd) goto error; 90 | node->dataOffset = malloc(sizeof(uint64_t)*(node->nChildren)); 91 | if(!node->dataOffset) goto error; 92 | if(node->isLeaf) { 93 | node->x.size = malloc(node->nChildren * sizeof(uint64_t)); 94 | if(!node->x.size) goto error; 95 | } else { 96 | node->x.child = calloc(node->nChildren, sizeof(struct bwRTreeNode_t 
*)); 97 | if(!node->x.child) goto error; 98 | } 99 | for(i=0; inChildren; i++) { 100 | if(bwRead(&(node->chrIdxStart[i]), sizeof(uint32_t), 1, fp) != 1) goto error; 101 | if(bwRead(&(node->baseStart[i]), sizeof(uint32_t), 1, fp) != 1) goto error; 102 | if(bwRead(&(node->chrIdxEnd[i]), sizeof(uint32_t), 1, fp) != 1) goto error; 103 | if(bwRead(&(node->baseEnd[i]), sizeof(uint32_t), 1, fp) != 1) goto error; 104 | if(bwRead(&(node->dataOffset[i]), sizeof(uint64_t), 1, fp) != 1) goto error; 105 | if(node->isLeaf) { 106 | if(bwRead(&(node->x.size[i]), sizeof(uint64_t), 1, fp) != 1) goto error; 107 | } 108 | } 109 | 110 | return node; 111 | 112 | error: 113 | if(node->chrIdxStart) free(node->chrIdxStart); 114 | if(node->baseStart) free(node->baseStart); 115 | if(node->chrIdxEnd) free(node->chrIdxEnd); 116 | if(node->baseEnd) free(node->baseEnd); 117 | if(node->dataOffset) free(node->dataOffset); 118 | if(node->isLeaf && node->x.size) free(node->x.size); 119 | else if((!node->isLeaf) && node->x.child) free(node->x.child); 120 | free(node); 121 | return NULL; 122 | } 123 | 124 | void destroyBWOverlapBlock(bwOverlapBlock_t *b) { 125 | if(!b) return; 126 | if(b->size) free(b->size); 127 | if(b->offset) free(b->offset); 128 | free(b); 129 | } 130 | 131 | //Returns a bwOverlapBlock_t * object or NULL on error. 132 | static bwOverlapBlock_t *overlapsLeaf(bwRTreeNode_t *node, uint32_t tid, uint32_t start, uint32_t end) { 133 | uint16_t i, idx = 0; 134 | bwOverlapBlock_t *o = calloc(1, sizeof(bwOverlapBlock_t)); 135 | if(!o) return NULL; 136 | 137 | for(i=0; inChildren; i++) { 138 | if(tid < node->chrIdxStart[i] || tid > node->chrIdxEnd[i]) continue; 139 | 140 | /* 141 | The individual blocks can theoretically span multiple contigs. 
142 | So if we treat the first/last contig in the range as special 143 | but anything in the middle is a guaranteed match 144 | */ 145 | if(node->chrIdxStart[i] != node->chrIdxEnd[i]) { 146 | if(tid == node->chrIdxStart[i]) { 147 | if(node->baseStart[i] >= end) break; 148 | } else if(tid == node->chrIdxEnd[i]) { 149 | if(node->baseEnd[i] <= start) continue; 150 | } 151 | } else { 152 | if(node->baseStart[i] >= end || node->baseEnd[i] <= start) continue; 153 | } 154 | o->n++; 155 | } 156 | 157 | if(o->n) { 158 | o->offset = malloc(sizeof(uint64_t) * (o->n)); 159 | if(!o->offset) goto error; 160 | o->size = malloc(sizeof(uint64_t) * (o->n)); 161 | if(!o->size) goto error; 162 | 163 | for(i=0; inChildren; i++) { 164 | if(tid < node->chrIdxStart[i] || tid > node->chrIdxEnd[i]) continue; 165 | if(node->chrIdxStart[i] != node->chrIdxEnd[i]) { 166 | if(tid == node->chrIdxStart[i]) { 167 | if(node->baseStart[i] >= end) continue; 168 | } else if(tid == node->chrIdxEnd[i]) { 169 | if(node->baseEnd[i] <= start) continue; 170 | } 171 | } else { 172 | if(node->baseStart[i] >= end || node->baseEnd[i] <= start) continue; 173 | } 174 | o->offset[idx] = node->dataOffset[i]; 175 | o->size[idx++] = node->x.size[i]; 176 | if(idx >= o->n) break; 177 | } 178 | } 179 | 180 | if(idx != o->n) { //This should never happen 181 | fprintf(stderr, "[overlapsLeaf] Mismatch between number of overlaps calculated and found!\n"); 182 | goto error; 183 | } 184 | 185 | return o; 186 | 187 | error: 188 | if(o) destroyBWOverlapBlock(o); 189 | return NULL; 190 | } 191 | 192 | //This will free l2 unless there's an error! 
193 | //Returns NULL on error, otherwise the merged lists 194 | static bwOverlapBlock_t *mergeOverlapBlocks(bwOverlapBlock_t *b1, bwOverlapBlock_t *b2) { 195 | uint64_t i,j; 196 | if(!b2) return b1; 197 | if(!b2->n) { 198 | destroyBWOverlapBlock(b2); 199 | return b1; 200 | } 201 | if(!b1->n) { 202 | destroyBWOverlapBlock(b1); 203 | return b2; 204 | } 205 | j = b1->n; 206 | b1->n += b2->n; 207 | b1->offset = realloc(b1->offset, sizeof(uint64_t) * (b1->n+b2->n)); 208 | if(!b1->offset) goto error; 209 | b1->size = realloc(b1->size, sizeof(uint64_t) * (b1->n+b2->n)); 210 | if(!b1->size) goto error; 211 | 212 | for(i=0; in; i++) { 213 | b1->offset[j+i] = b2->offset[i]; 214 | b1->size[j+i] = b2->size[i]; 215 | } 216 | destroyBWOverlapBlock(b2); 217 | return b1; 218 | 219 | error: 220 | destroyBWOverlapBlock(b1); 221 | return NULL; 222 | } 223 | 224 | //Returns NULL and sets nOverlaps to >0 on error, otherwise nOverlaps is the number of file offsets returned 225 | //The output needs to be free()d if not NULL (likewise with *sizes) 226 | static bwOverlapBlock_t *overlapsNonLeaf(bigWigFile_t *fp, bwRTreeNode_t *node, uint32_t tid, uint32_t start, uint32_t end) { 227 | uint16_t i; 228 | bwOverlapBlock_t *nodeBlocks, *output = calloc(1, sizeof(bwOverlapBlock_t)); 229 | if(!output) return NULL; 230 | 231 | for(i=0; inChildren; i++) { 232 | if(tid < node->chrIdxStart[i]) break; 233 | if(tid < node->chrIdxStart[i] || tid > node->chrIdxEnd[i]) continue; 234 | if(node->chrIdxStart[i] != node->chrIdxEnd[i]) { //child spans contigs 235 | if(tid == node->chrIdxStart[i]) { 236 | if(node->baseStart[i] >= end) continue; 237 | } else if(tid == node->chrIdxEnd[i]) { 238 | if(node->baseEnd[i] <= start) continue; 239 | } 240 | } else { 241 | if(end <= node->baseStart[i] || start >= node->baseEnd[i]) continue; 242 | } 243 | 244 | //We have an overlap! 
245 | if(!node->x.child[i]) 246 | node->x.child[i] = bwGetRTreeNode(fp, node->dataOffset[i]); 247 | if(!node->x.child[i]) goto error; 248 | 249 | if(node->x.child[i]->isLeaf) { //leaf 250 | nodeBlocks = overlapsLeaf(node->x.child[i], tid, start, end); 251 | } else { //non-leaf 252 | nodeBlocks = overlapsNonLeaf(fp, node->x.child[i], tid, start, end); 253 | } 254 | 255 | //The output is processed the same regardless of leaf/non-leaf 256 | if(!nodeBlocks) goto error; 257 | else { 258 | output = mergeOverlapBlocks(output, nodeBlocks); 259 | if(!output) { 260 | destroyBWOverlapBlock(nodeBlocks); 261 | goto error; 262 | } 263 | } 264 | } 265 | 266 | return output; 267 | 268 | error: 269 | destroyBWOverlapBlock(output); 270 | return NULL; 271 | } 272 | 273 | //Returns NULL and sets nOverlaps to >0 on error, otherwise nOverlaps is the number of file offsets returned 274 | //The output must be free()d 275 | bwOverlapBlock_t *walkRTreeNodes(bigWigFile_t *bw, bwRTreeNode_t *root, uint32_t tid, uint32_t start, uint32_t end) { 276 | if(root->isLeaf) return overlapsLeaf(root, tid, start, end); 277 | return overlapsNonLeaf(bw, root, tid, start, end); 278 | } 279 | 280 | //In reality, a hash or some sort of tree structure is probably faster... 281 | //Return -1 (AKA 0xFFFFFFFF...) on "not there", so we can hold (2^32)-1 items. 
282 | uint32_t bwGetTid(const bigWigFile_t *fp, const char *chrom) { 283 | uint32_t i; 284 | if(!chrom) return -1; 285 | for(i=0; icl->nKeys; i++) { 286 | if(strcmp(chrom, fp->cl->chrom[i]) == 0) return i; 287 | } 288 | return -1; 289 | } 290 | 291 | static bwOverlapBlock_t *bwGetOverlappingBlocks(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end) { 292 | uint32_t tid = bwGetTid(fp, chrom); 293 | 294 | if(tid == (uint32_t) -1) { 295 | fprintf(stderr, "[bwGetOverlappingBlocks] Non-existent contig: %s\n", chrom); 296 | return NULL; 297 | } 298 | 299 | //Get the info if needed 300 | if(!fp->idx) { 301 | fp->idx = readRTreeIdx(fp, fp->hdr->indexOffset); 302 | if(!fp->idx) { 303 | return NULL; 304 | } 305 | } 306 | 307 | if(!fp->idx->root) fp->idx->root = bwGetRTreeNode(fp, 0); 308 | if(!fp->idx->root) return NULL; 309 | 310 | return walkRTreeNodes(fp, fp->idx->root, tid, start, end); 311 | } 312 | 313 | void bwFillDataHdr(bwDataHeader_t *hdr, void *b) { 314 | hdr->tid = ((uint32_t*)b)[0]; 315 | hdr->start = ((uint32_t*)b)[1]; 316 | hdr->end = ((uint32_t*)b)[2]; 317 | hdr->step = ((uint32_t*)b)[3]; 318 | hdr->span = ((uint32_t*)b)[4]; 319 | hdr->type = ((uint8_t*)b)[20]; 320 | hdr->nItems = ((uint16_t*)b)[11]; 321 | } 322 | 323 | void bwDestroyOverlappingIntervals(bwOverlappingIntervals_t *o) { 324 | if(!o) return; 325 | if(o->start) free(o->start); 326 | if(o->end) free(o->end); 327 | if(o->value) free(o->value); 328 | free(o); 329 | } 330 | 331 | void bbDestroyOverlappingEntries(bbOverlappingEntries_t *o) { 332 | uint32_t i; 333 | if(!o) return; 334 | if(o->start) free(o->start); 335 | if(o->end) free(o->end); 336 | if(o->str) { 337 | for(i=0; il; i++) { 338 | if(o->str[i]) free(o->str[i]); 339 | } 340 | free(o->str); 341 | } 342 | free(o); 343 | } 344 | 345 | //Returns NULL on error, in which case o has been free()d 346 | static bwOverlappingIntervals_t *pushIntervals(bwOverlappingIntervals_t *o, uint32_t start, uint32_t end, float value) { 347 | 
if(o->l+1 >= o->m) { 348 | o->m = roundup(o->l+1); 349 | o->start = realloc(o->start, o->m * sizeof(uint32_t)); 350 | if(!o->start) goto error; 351 | o->end = realloc(o->end, o->m * sizeof(uint32_t)); 352 | if(!o->end) goto error; 353 | o->value = realloc(o->value, o->m * sizeof(float)); 354 | if(!o->value) goto error; 355 | } 356 | o->start[o->l] = start; 357 | o->end[o->l] = end; 358 | o->value[o->l++] = value; 359 | return o; 360 | 361 | error: 362 | bwDestroyOverlappingIntervals(o); 363 | return NULL; 364 | } 365 | 366 | static bbOverlappingEntries_t *pushBBIntervals(bbOverlappingEntries_t *o, uint32_t start, uint32_t end, char *str, int withString) { 367 | if(o->l+1 >= o->m) { 368 | o->m = roundup(o->l+1); 369 | o->start = realloc(o->start, o->m * sizeof(uint32_t)); 370 | if(!o->start) goto error; 371 | o->end = realloc(o->end, o->m * sizeof(uint32_t)); 372 | if(!o->end) goto error; 373 | if(withString) { 374 | o->str = realloc(o->str, o->m * sizeof(char**)); 375 | if(!o->str) goto error; 376 | } 377 | } 378 | o->start[o->l] = start; 379 | o->end[o->l] = end; 380 | if(withString) o->str[o->l] = bwStrdup(str); 381 | o->l++; 382 | return o; 383 | 384 | error: 385 | bbDestroyOverlappingEntries(o); 386 | return NULL; 387 | } 388 | 389 | //Returns NULL on error 390 | bwOverlappingIntervals_t *bwGetOverlappingIntervalsCore(bigWigFile_t *fp, bwOverlapBlock_t *o, uint32_t tid, uint32_t ostart, uint32_t oend) { 391 | uint64_t i; 392 | uint16_t j; 393 | int compressed = 0, rv; 394 | uLongf sz = fp->hdr->bufSize, tmp; 395 | void *buf = NULL, *compBuf = NULL; 396 | uint32_t start = 0, end , *p; 397 | float value; 398 | bwDataHeader_t hdr; 399 | bwOverlappingIntervals_t *output = calloc(1, sizeof(bwOverlappingIntervals_t)); 400 | 401 | if(!output) goto error; 402 | 403 | if(!o) return output; 404 | if(!o->n) return output; 405 | 406 | if(sz) { 407 | compressed = 1; 408 | buf = malloc(sz); 409 | } 410 | sz = 0; //This is now the size of the compressed buffer 411 | 412 | 
for(i=0; in; i++) { 413 | if(bwSetPos(fp, o->offset[i])) goto error; 414 | 415 | if(sz < o->size[i]) { 416 | compBuf = realloc(compBuf, o->size[i]); 417 | sz = o->size[i]; 418 | } 419 | if(!compBuf) goto error; 420 | 421 | if(bwRead(compBuf, o->size[i], 1, fp) != 1) goto error; 422 | if(compressed) { 423 | tmp = fp->hdr->bufSize; //This gets over-written by uncompress 424 | rv = uncompress(buf, (uLongf *) &tmp, compBuf, o->size[i]); 425 | if(rv != Z_OK) goto error; 426 | } else { 427 | buf = compBuf; 428 | } 429 | 430 | //TODO: ensure that tmp is large enough! 431 | bwFillDataHdr(&hdr, buf); 432 | 433 | p = ((uint32_t*) buf); 434 | p += 6; 435 | if(hdr.tid != tid) continue; 436 | 437 | if(hdr.type == 3) start = hdr.start - hdr.step; 438 | 439 | //FIXME: We should ensure that sz is large enough to hold nItems of the given type 440 | for(j=0; j= oend) continue; 469 | //Push the overlap 470 | if(!pushIntervals(output, start, end, value)) goto error; 471 | } 472 | } 473 | 474 | if(compressed && buf) free(buf); 475 | if(compBuf) free(compBuf); 476 | return output; 477 | 478 | error: 479 | fprintf(stderr, "[bwGetOverlappingIntervalsCore] Got an error\n"); 480 | if(output) bwDestroyOverlappingIntervals(output); 481 | if(compressed && buf) free(buf); 482 | if(compBuf) free(compBuf); 483 | return NULL; 484 | } 485 | 486 | bbOverlappingEntries_t *bbGetOverlappingEntriesCore(bigWigFile_t *fp, bwOverlapBlock_t *o, uint32_t tid, uint32_t ostart, uint32_t oend, int withString) { 487 | uint64_t i; 488 | int compressed = 0, rv, slen; 489 | uLongf sz = fp->hdr->bufSize, tmp = 0; 490 | void *buf = NULL, *bufEnd = NULL, *compBuf = NULL; 491 | uint32_t entryTid = 0, start = 0, end; 492 | char *str; 493 | bbOverlappingEntries_t *output = calloc(1, sizeof(bbOverlappingEntries_t)); 494 | 495 | if(!output) goto error; 496 | 497 | if(!o) return output; 498 | if(!o->n) return output; 499 | 500 | if(sz) { 501 | compressed = 1; 502 | buf = malloc(sz); 503 | } 504 | sz = 0; //This is now the 
size of the compressed buffer 505 | 506 | for(i=0; in; i++) { 507 | if(bwSetPos(fp, o->offset[i])) goto error; 508 | 509 | if(sz < o->size[i]) { 510 | compBuf = realloc(compBuf, o->size[i]); 511 | sz = o->size[i]; 512 | } 513 | if(!compBuf) goto error; 514 | 515 | if(bwRead(compBuf, o->size[i], 1, fp) != 1) goto error; 516 | if(compressed) { 517 | tmp = fp->hdr->bufSize; //This gets over-written by uncompress 518 | rv = uncompress(buf, (uLongf *) &tmp, compBuf, o->size[i]); 519 | if(rv != Z_OK) goto error; 520 | } else { 521 | buf = compBuf; 522 | tmp = o->size[i]; //TODO: Is this correct? Do non-gzipped bigBeds exist? 523 | } 524 | 525 | bufEnd = (char*)buf + tmp; 526 | while(buf < bufEnd) { 527 | entryTid = ((uint32_t*)buf)[0]; 528 | start = ((uint32_t*)buf)[1]; 529 | end = ((uint32_t*)buf)[2]; 530 | buf = (char*)buf + 12; 531 | str = (char*)buf; 532 | slen = strlen(str) + 1; 533 | buf = (char*)buf + slen; 534 | 535 | if(entryTid < tid) continue; 536 | if(entryTid > tid) break; 537 | if(end <= ostart) continue; 538 | if(start >= oend) break; 539 | 540 | //Push the overlap 541 | if(!pushBBIntervals(output, start, end, str, withString)) goto error; 542 | } 543 | 544 | buf = (char*)bufEnd - tmp; //reset the buffer pointer 545 | } 546 | 547 | if(compressed && buf) free(buf); 548 | if(compBuf) free(compBuf); 549 | return output; 550 | 551 | error: 552 | fprintf(stderr, "[bbGetOverlappingEntriesCore] Got an error\n"); 553 | buf = (char*)bufEnd - tmp; 554 | if(output) bbDestroyOverlappingEntries(output); 555 | if(compressed && buf) free(buf); 556 | if(compBuf) free(compBuf); 557 | return NULL; 558 | } 559 | 560 | //Returns NULL on error OR no intervals, which is a bad design... 
561 | bwOverlappingIntervals_t *bwGetOverlappingIntervals(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end) { 562 | bwOverlappingIntervals_t *output; 563 | uint32_t tid = bwGetTid(fp, chrom); 564 | if(tid == (uint32_t) -1) return NULL; 565 | bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(fp, chrom, start, end); 566 | if(!blocks) return NULL; 567 | output = bwGetOverlappingIntervalsCore(fp, blocks, tid, start, end); 568 | destroyBWOverlapBlock(blocks); 569 | return output; 570 | } 571 | 572 | //Like above, but for bigBed files 573 | bbOverlappingEntries_t *bbGetOverlappingEntries(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end, int withString) { 574 | bbOverlappingEntries_t *output; 575 | uint32_t tid = bwGetTid(fp, chrom); 576 | if(tid == (uint32_t) -1) return NULL; 577 | bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(fp, chrom, start, end); 578 | if(!blocks) return NULL; 579 | output = bbGetOverlappingEntriesCore(fp, blocks, tid, start, end, withString); 580 | destroyBWOverlapBlock(blocks); 581 | return output; 582 | } 583 | 584 | //Returns NULL on error 585 | bwOverlapIterator_t *bwOverlappingIntervalsIterator(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end, uint32_t blocksPerIteration) { 586 | bwOverlapIterator_t *output = NULL; 587 | uint64_t n; 588 | uint32_t tid = bwGetTid(fp, chrom); 589 | if(tid == (uint32_t) -1) return output; 590 | output = calloc(1, sizeof(bwOverlapIterator_t)); 591 | if(!output) return output; 592 | bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(fp, chrom, start, end); 593 | 594 | output->bw = fp; 595 | output->tid = tid; 596 | output->start = start; 597 | output->end = end; 598 | output->blocks = blocks; 599 | output->blocksPerIteration = blocksPerIteration; 600 | 601 | if(blocks) { 602 | n = blocks->n; 603 | if(n>blocksPerIteration) blocks->n = blocksPerIteration; 604 | output->intervals = bwGetOverlappingIntervalsCore(fp, blocks,tid, start, end); 605 | blocks->n = n; 606 | 
output->offset = blocksPerIteration; 607 | } 608 | output->data = output->intervals; 609 | return output; 610 | } 611 | 612 | //Returns NULL on error 613 | bwOverlapIterator_t *bbOverlappingEntriesIterator(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end, int withString, uint32_t blocksPerIteration) { 614 | bwOverlapIterator_t *output = NULL; 615 | uint64_t n; 616 | uint32_t tid = bwGetTid(fp, chrom); 617 | if(tid == (uint32_t) -1) return output; 618 | output = calloc(1, sizeof(bwOverlapIterator_t)); 619 | if(!output) return output; 620 | bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(fp, chrom, start, end); 621 | 622 | output->bw = fp; 623 | output->tid = tid; 624 | output->start = start; 625 | output->end = end; 626 | output->blocks = blocks; 627 | output->blocksPerIteration = blocksPerIteration; 628 | output->withString = withString; 629 | 630 | if(blocks) { 631 | n = blocks->n; 632 | if(n>blocksPerIteration) blocks->n = blocksPerIteration; 633 | output->entries = bbGetOverlappingEntriesCore(fp, blocks,tid, start, end, withString); 634 | blocks->n = n; 635 | output->offset = blocksPerIteration; 636 | } 637 | output->data = output->entries; 638 | return output; 639 | } 640 | 641 | void bwIteratorDestroy(bwOverlapIterator_t *iter) { 642 | if(!iter) return; 643 | if(iter->blocks) destroyBWOverlapBlock((bwOverlapBlock_t*) iter->blocks); 644 | if(iter->intervals) bwDestroyOverlappingIntervals(iter->intervals); 645 | if(iter->entries) bbDestroyOverlappingEntries(iter->entries); 646 | free(iter); 647 | } 648 | 649 | //On error, points to NULL and destroys the input 650 | bwOverlapIterator_t *bwIteratorNext(bwOverlapIterator_t *iter) { 651 | uint64_t n, *offset, *size; 652 | bwOverlapBlock_t *blocks = iter->blocks; 653 | 654 | if(iter->intervals) { 655 | bwDestroyOverlappingIntervals(iter->intervals); 656 | iter->intervals = NULL; 657 | } 658 | if(iter->entries) { 659 | bbDestroyOverlappingEntries(iter->entries); 660 | iter->entries = NULL; 661 | } 
662 | iter->data = NULL; 663 | 664 | if(iter->offset < blocks->n) { 665 | //store the previous values 666 | n = blocks->n; 667 | offset = blocks->offset; 668 | size = blocks->size; 669 | 670 | //Move the start of the blocks 671 | blocks->offset += iter->offset; 672 | blocks->size += iter->offset; 673 | if(iter->offset + iter->blocksPerIteration > n) { 674 | blocks->n = blocks->n - iter->offset; 675 | } else { 676 | blocks->n = iter->blocksPerIteration; 677 | } 678 | 679 | //Get the intervals or entries, as appropriate 680 | if(iter->bw->type == 0) { 681 | //bigWig 682 | iter->intervals = bwGetOverlappingIntervalsCore(iter->bw, blocks, iter->tid, iter->start, iter->end); 683 | iter->data = iter->intervals; 684 | } else { 685 | //bigBed 686 | iter->entries = bbGetOverlappingEntriesCore(iter->bw, blocks, iter->tid, iter->start, iter->end, iter->withString); 687 | iter->data = iter->entries; 688 | } 689 | iter->offset += iter->blocksPerIteration; 690 | 691 | //reset the values in iter->blocks 692 | blocks->n = n; 693 | blocks->offset = offset; 694 | blocks->size = size; 695 | 696 | //Check for error 697 | if(!iter->intervals && !iter->entries) goto error; 698 | } 699 | 700 | return iter; 701 | 702 | error: 703 | bwIteratorDestroy(iter); 704 | return NULL; 705 | } 706 | 707 | //This is like bwGetOverlappingIntervals, except it returns 1 base windows. If includeNA is not 0, then a value will be returned for every position in the range (defaulting to NAN). 
708 | //The ->end member is NULL 709 | //If includeNA is not 0 then ->start is also NULL, since it's implied 710 | //Note that bwDestroyOverlappingIntervals() will work in either case 711 | bwOverlappingIntervals_t *bwGetValues(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end, int includeNA) { 712 | uint32_t i, j, n; 713 | bwOverlappingIntervals_t *output = NULL; 714 | bwOverlappingIntervals_t *intermediate = bwGetOverlappingIntervals(fp, chrom, start, end); 715 | if(!intermediate) return NULL; 716 | 717 | output = calloc(1, sizeof(bwOverlappingIntervals_t)); 718 | if(!output) goto error; 719 | if(includeNA) { 720 | output->l = end-start; 721 | output->value = malloc(output->l*sizeof(float)); 722 | if(!output->value) goto error; 723 | for(i=0; il; i++) output->value[i] = NAN; 724 | for(i=0; il; i++) { 725 | for(j=intermediate->start[i]; jend[i]; j++) { 726 | if(j < start || j >= end) continue; 727 | output->value[j-start] = intermediate->value[i]; 728 | } 729 | } 730 | } else { 731 | n = 0; 732 | for(i=0; il; i++) { 733 | if(intermediate->start[i] < start) intermediate->start[i] = start; 734 | if(intermediate->end[i] > end) intermediate->end[i] = end; 735 | n += intermediate->end[i]-intermediate->start[i]; 736 | } 737 | output->l = n; 738 | output->start = malloc(sizeof(uint32_t)*n); 739 | if(!output->start) goto error; 740 | output->value = malloc(sizeof(float)*n); 741 | if(!output->value) goto error; 742 | n = 0; //this is now the index 743 | for(i=0; il; i++) { 744 | for(j=intermediate->start[i]; jend[i]; j++) { 745 | if(j < start || j >= end) continue; 746 | output->start[n] = j; 747 | output->value[n++] = intermediate->value[i]; 748 | } 749 | } 750 | } 751 | 752 | bwDestroyOverlappingIntervals(intermediate); 753 | return output; 754 | 755 | error: 756 | if(intermediate) bwDestroyOverlappingIntervals(intermediate); 757 | if(output) bwDestroyOverlappingIntervals(output); 758 | return NULL; 759 | } 760 | 761 | void 
bwDestroyIndexNode(bwRTreeNode_t *node) { 762 | uint16_t i; 763 | 764 | if(!node) return; 765 | 766 | free(node->chrIdxStart); 767 | free(node->baseStart); 768 | free(node->chrIdxEnd); 769 | free(node->baseEnd); 770 | free(node->dataOffset); 771 | if(!node->isLeaf) { 772 | for(i=0; inChildren; i++) { 773 | bwDestroyIndexNode(node->x.child[i]); 774 | } 775 | free(node->x.child); 776 | } else { 777 | free(node->x.size); 778 | } 779 | free(node); 780 | } 781 | 782 | void bwDestroyIndex(bwRTree_t *idx) { 783 | bwDestroyIndexNode(idx->root); 784 | free(idx); 785 | } 786 | 787 | //Returns a pointer to the requested index (@offset, unless it's 0, in which case the index for the values is returned 788 | //Returns NULL on error 789 | bwRTree_t *bwReadIndex(bigWigFile_t *fp, uint64_t offset) { 790 | bwRTree_t *idx = readRTreeIdx(fp, offset); 791 | if(!idx) return NULL; 792 | 793 | //Read in the root node 794 | idx->root = bwGetRTreeNode(fp, idx->rootOffset); 795 | 796 | if(!idx->root) { 797 | bwDestroyIndex(idx); 798 | return NULL; 799 | } 800 | return idx; 801 | } 802 | -------------------------------------------------------------------------------- /libBigWig/bwValues.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBBIGWIG_VALUES_H 2 | #define LIBBIGWIG_VALUES_H 3 | 4 | #include 5 | /*! \file bwValues.h 6 | * 7 | * You should not directly use functions and structures defined here. They're really meant for internal use only. 8 | * 9 | * All of the structures here need to be destroyed or you'll leak memory! There are methods available to destroy anything that you need to take care of yourself. 10 | */ 11 | 12 | //N.B., coordinates are still 0-based half open! 13 | /*! 14 | * @brief A node within an R-tree holding the index for data. 15 | * 16 | * Note that there are two types of nodes: leaf and twig. Leaf nodes point to where data actually is. Twig nodes point to additional index nodes, which may or may not be leaves. 
Each of these nodes has additional children, which may span multiple chromosomes/contigs. 17 | * 18 | * With the start/end position, these positions refer specifically to the chromosomes specified in chrIdxStart/chrIdxEnd. Any chromosomes between these are completely spanned by a given child. 19 | */ 20 | typedef struct bwRTreeNode_t { 21 | uint8_t isLeaf; /** 3 | #endif 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "bigWigIO.h" 9 | #include 10 | #include 11 |
size_t GLOBAL_DEFAULTBUFFERSIZE;

#ifndef NOCURL
//Ask libcurl for the content length of the last transfer on URL.
//Returns 0 if the query fails or libcurl reports a negative (unknown) length.
uint64_t getContentLength(const URL_t *URL) {
    double size;
    if(curl_easy_getinfo(URL->x.curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &size) != CURLE_OK) {
        return 0;
    }
    //Converting a negative double to an unsigned type is undefined, so reject it first
    if(size < 0.0) return 0;
    return (uint64_t) size;
}

//Fill the buffer, note that URL may be left in an unusable state on error!
//The file position is first advanced past whatever the buffer previously held
//(or set to 0 if nothing has been read yet), then bufSize bytes are requested
//from that position.
CURLcode urlFetchData(URL_t *URL, unsigned long bufSize) {
    CURLcode rv;
    char range[1024];

    if(URL->filePos != (size_t) -1) URL->filePos += URL->bufLen;
    else URL->filePos = 0;

    URL->bufPos = URL->bufLen = 0; //Otherwise, we can't copy anything into the buffer!
    //Cast explicitly: size_t is not "unsigned long" on every platform
    snprintf(range, sizeof(range), "%llu-%llu", (unsigned long long) URL->filePos, (unsigned long long) (URL->filePos+bufSize-1));
    rv = curl_easy_setopt(URL->x.curl, CURLOPT_RANGE, range);
    if(rv != CURLE_OK) {
        fprintf(stderr, "[urlFetchData] Couldn't set the range (%s)\n", range);
        return rv;
    }

    rv = curl_easy_perform(URL->x.curl);
    errno = 0; //Sometimes curl_easy_perform leaves a random errno remnant
    return rv;
}

45 | //Read data into a buffer, ideally from a buffer already in memory 46 | //The loop is likely no longer needed.
47 | size_t url_fread(void *obuf, size_t obufSize, URL_t *URL) { 48 | size_t remaining = obufSize, fetchSize; 49 | void *p = obuf; 50 | CURLcode rv; 51 | 52 | while(remaining) { 53 | if(!URL->bufLen) { 54 | rv = urlFetchData(URL, URL->bufSize); 55 | if(rv != CURLE_OK) { 56 | fprintf(stderr, "[url_fread] urlFetchData (A) returned %s\n", curl_easy_strerror(rv)); 57 | return 0; 58 | } 59 | } else if(URL->bufLen < URL->bufPos + remaining) { //Copy the remaining buffer and reload the buffer as needed 60 | p = memcpy(p, URL->memBuf+URL->bufPos, URL->bufLen - URL->bufPos); 61 | if(!p) return 0; 62 | p += URL->bufLen - URL->bufPos; 63 | remaining -= URL->bufLen - URL->bufPos; 64 | if(remaining) { 65 | if(!URL->isCompressed) { 66 | fetchSize = URL->bufSize; 67 | } else { 68 | fetchSize = (remainingbufSize)?remaining:URL->bufSize; 69 | } 70 | rv = urlFetchData(URL, fetchSize); 71 | if(rv != CURLE_OK) { 72 | fprintf(stderr, "[url_fread] urlFetchData (B) returned %s\n", curl_easy_strerror(rv)); 73 | return 0; 74 | } 75 | } 76 | } else { 77 | p = memcpy(p, URL->memBuf+URL->bufPos, remaining); 78 | if(!p) return 0; 79 | URL->bufPos += remaining; 80 | remaining = 0; 81 | } 82 | } 83 | return obufSize; 84 | } 85 | #endif 86 | 87 | //Returns the number of bytes requested or a smaller number on error 88 | //Note that in the case of remote files, the actual amount read may be less than the return value! 
89 | size_t urlRead(URL_t *URL, void *buf, size_t bufSize) { 90 | #ifndef NOCURL 91 | if(URL->type==0) { 92 | return fread(buf, bufSize, 1, URL->x.fp)*bufSize; 93 | } else { 94 | return url_fread(buf, bufSize, URL); 95 | } 96 | #else 97 | return fread(buf, bufSize, 1, URL->x.fp)*bufSize; 98 | #endif 99 | } 100 | 101 | size_t bwFillBuffer(const void *inBuf, size_t l, size_t nmemb, void *pURL) { 102 | URL_t *URL = (URL_t*) pURL; 103 | void *p = URL->memBuf; 104 | size_t copied = l*nmemb; 105 | if(!p) return 0; 106 | 107 | p += URL->bufLen; 108 | if(l*nmemb > URL->bufSize - URL->bufPos) { //We received more than we can store! 109 | copied = URL->bufSize - URL->bufLen; 110 | } 111 | memcpy(p, inBuf, copied); 112 | URL->bufLen += copied; 113 | 114 | if(!URL->memBuf) return 0; //signal error 115 | return copied; 116 | } 117 | 118 | //Seek to an arbitrary location, returning a CURLcode 119 | //Note that a local file returns CURLE_OK on success or CURLE_FAILED_INIT on any error; 120 | CURLcode urlSeek(URL_t *URL, size_t pos) { 121 | #ifndef NOCURL 122 | char range[1024]; 123 | CURLcode rv; 124 | 125 | if(URL->type == BWG_FILE) { 126 | #endif 127 | if(fseek(URL->x.fp, pos, SEEK_SET) == 0) { 128 | errno = 0; 129 | return CURLE_OK; 130 | } else { 131 | return CURLE_FAILED_INIT; //This is arbitrary 132 | } 133 | #ifndef NOCURL 134 | } else { 135 | //If the location is covered by the buffer then don't seek! 136 | if(pos < URL->filePos || pos >= URL->filePos+URL->bufLen) { 137 | URL->filePos = pos; 138 | URL->bufLen = 0; //Otherwise, filePos will get incremented on the next read! 139 | URL->bufPos = 0; 140 | //Maybe this works for FTP? 
141 | sprintf(range,"%lu-%lu", pos, pos+URL->bufSize-1); 142 | rv = curl_easy_setopt(URL->x.curl, CURLOPT_RANGE, range); 143 | if(rv != CURLE_OK) { 144 | fprintf(stderr, "[urlSeek] Couldn't set the range (%s)\n", range); 145 | return rv; 146 | } 147 | rv = curl_easy_perform(URL->x.curl); 148 | if(rv != CURLE_OK) { 149 | fprintf(stderr, "[urlSeek] curl_easy_perform received an error!\n"); 150 | } 151 | errno = 0; //Don't propogate remnant resolved libCurl errors 152 | return rv; 153 | } else { 154 | URL->bufPos = pos-URL->filePos; 155 | return CURLE_OK; 156 | } 157 | } 158 | #endif 159 | } 160 | 161 | URL_t *urlOpen(const char *fname, CURLcode (*callBack)(CURL*), const char *mode) { 162 | URL_t *URL = calloc(1, sizeof(URL_t)); 163 | if(!URL) return NULL; 164 | char *url = NULL, *req = NULL; 165 | #ifndef NOCURL 166 | CURLcode code; 167 | char range[1024]; 168 | #endif 169 | 170 | URL->fname = fname; 171 | 172 | if((!mode) || (strchr(mode, 'w') == 0)) { 173 | //Set the protocol 174 | #ifndef NOCURL 175 | if(strncmp(fname, "http://", 7) == 0) URL->type = BWG_HTTP; 176 | else if(strncmp(fname, "https://", 8) == 0) URL->type = BWG_HTTPS; 177 | else if(strncmp(fname, "ftp://", 6) == 0) URL->type = BWG_FTP; 178 | else URL->type = BWG_FILE; 179 | #else 180 | URL->type = BWG_FILE; 181 | #endif 182 | 183 | //local file? 
184 | if(URL->type == BWG_FILE) { 185 | URL->filePos = -1; //This signals that nothing has been read 186 | URL->x.fp = fopen(fname, "rb"); 187 | if(!(URL->x.fp)) { 188 | free(URL); 189 | fprintf(stderr, "[urlOpen] Couldn't open %s for reading\n", fname); 190 | return NULL; 191 | } 192 | #ifndef NOCURL 193 | } else { 194 | //Remote file, set up the memory buffer and get CURL ready 195 | URL->memBuf = malloc(GLOBAL_DEFAULTBUFFERSIZE); 196 | if(!(URL->memBuf)) { 197 | free(URL); 198 | fprintf(stderr, "[urlOpen] Couldn't allocate enough space for the file buffer!\n"); 199 | return NULL; 200 | } 201 | URL->bufSize = GLOBAL_DEFAULTBUFFERSIZE; 202 | URL->x.curl = curl_easy_init(); 203 | if(!(URL->x.curl)) { 204 | fprintf(stderr, "[urlOpen] curl_easy_init() failed!\n"); 205 | goto error; 206 | } 207 | //Negotiate a reasonable HTTP authentication method 208 | if(curl_easy_setopt(URL->x.curl, CURLOPT_HTTPAUTH, CURLAUTH_ANY) != CURLE_OK) { 209 | fprintf(stderr, "[urlOpen] Failed instructing curl to use any HTTP authentication it finds to be suitable!\n"); 210 | goto error; 211 | } 212 | //Follow redirects 213 | if(curl_easy_setopt(URL->x.curl, CURLOPT_FOLLOWLOCATION, 1L) != CURLE_OK) { 214 | fprintf(stderr, "[urlOpen] Failed instructing curl to follow redirects!\n"); 215 | goto error; 216 | } 217 | //Set the URL 218 | if(curl_easy_setopt(URL->x.curl, CURLOPT_URL, fname) != CURLE_OK) { 219 | fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_URL!\n"); 220 | goto error; 221 | } 222 | //Set the range, which doesn't do anything for HTTP 223 | sprintf(range, "0-%lu", URL->bufSize-1); 224 | if(curl_easy_setopt(URL->x.curl, CURLOPT_RANGE, range) != CURLE_OK) { 225 | fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_RANGE (%s)!\n", range); 226 | goto error; 227 | } 228 | //Set the callback info, which means we no longer need to directly deal with sockets and header! 
229 | if(curl_easy_setopt(URL->x.curl, CURLOPT_WRITEFUNCTION, bwFillBuffer) != CURLE_OK) { 230 | fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_WRITEFUNCTION!\n"); 231 | goto error; 232 | } 233 | if(curl_easy_setopt(URL->x.curl, CURLOPT_WRITEDATA, (void*)URL) != CURLE_OK) { 234 | fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_WRITEDATA!\n"); 235 | goto error; 236 | } 237 | //Ignore certificate errors with https, libcurl just isn't reliable enough with conda 238 | if(curl_easy_setopt(URL->x.curl, CURLOPT_SSL_VERIFYPEER, 0) != CURLE_OK) { 239 | fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_SSL_VERIFYPEER to 0!\n"); 240 | goto error; 241 | } 242 | if(curl_easy_setopt(URL->x.curl, CURLOPT_SSL_VERIFYHOST, 0) != CURLE_OK) { 243 | fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_SSL_VERIFYHOST to 0!\n"); 244 | goto error; 245 | } 246 | if(callBack) { 247 | code = callBack(URL->x.curl); 248 | if(code != CURLE_OK) { 249 | fprintf(stderr, "[urlOpen] The user-supplied call back function returned an error: %s\n", curl_easy_strerror(code)); 250 | goto error; 251 | } 252 | } 253 | code = curl_easy_perform(URL->x.curl); 254 | errno = 0; //Sometimes curl_easy_perform leaves a random errno remnant 255 | if(code != CURLE_OK) { 256 | fprintf(stderr, "[urlOpen] curl_easy_perform received an error: %s\n", curl_easy_strerror(code)); 257 | goto error; 258 | } 259 | #endif 260 | } 261 | } else { 262 | URL->type = BWG_FILE; 263 | URL->x.fp = fopen(fname, mode); 264 | if(!(URL->x.fp)) { 265 | free(URL); 266 | fprintf(stderr, "[urlOpen] Couldn't open %s for writing\n", fname); 267 | return NULL; 268 | } 269 | } 270 | if(url) free(url); 271 | if(req) free(req); 272 | return URL; 273 | 274 | #ifndef NOCURL 275 | error: 276 | if(url) free(url); 277 | if(req) free(req); 278 | free(URL->memBuf); 279 | curl_easy_cleanup(URL->x.curl); 280 | free(URL); 281 | return NULL; 282 | #endif 283 | } 284 | 285 | //Performs the necessary free() operations and handles cleaning up curl 286 | void 
urlClose(URL_t *URL) { 287 | if(URL->type == BWG_FILE) { 288 | fclose(URL->x.fp); 289 | #ifndef NOCURL 290 | } else { 291 | free(URL->memBuf); 292 | curl_easy_cleanup(URL->x.curl); 293 | #endif 294 | } 295 | free(URL); 296 | } 297 | -------------------------------------------------------------------------------- /pyBigWig.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "bigWig.h" 4 | 5 | #define pyBigWigVersion "0.3.24" 6 | 7 | typedef struct { 8 | PyObject_HEAD 9 | bigWigFile_t *bw; 10 | int32_t lastTid; //The TID of the last written entry (or -1) 11 | uint32_t lastSpan; //The span of the last written entry (if applicable) 12 | uint32_t lastStep; //The step of the last written entry (if applicable) 13 | uint32_t lastStart; //The next start position (if applicable) 14 | int lastType; //The type of the last written entry 15 | } pyBigWigFile_t; 16 | 17 | static PyObject *pyBwOpen(PyObject *self, PyObject *pyFname); 18 | static PyObject *pyBwEnter(pyBigWigFile_t *self, PyObject *args); 19 | static PyObject *pyBwClose(pyBigWigFile_t *pybw, PyObject *args); 20 | static PyObject *pyBwGetChroms(pyBigWigFile_t *pybw, PyObject *args); 21 | static PyObject *pyIsBigWig(pyBigWigFile_t *pybw, PyObject *args); 22 | static PyObject *pyIsBigBed(pyBigWigFile_t *pybw, PyObject *args); 23 | static PyObject *pyBwGetStats(pyBigWigFile_t *pybw, PyObject *args, PyObject *kwds); 24 | #ifdef WITHNUMPY 25 | static PyObject *pyBwGetValues(pyBigWigFile_t *pybw, PyObject *args, PyObject *kwds); 26 | #else 27 | static PyObject *pyBwGetValues(pyBigWigFile_t *pybw, PyObject *args); 28 | #endif 29 | static PyObject *pyBwGetIntervals(pyBigWigFile_t *pybw, PyObject *args, PyObject *kwds); 30 | static PyObject *pyBBGetEntries(pyBigWigFile_t *pybw, PyObject *args, PyObject *kwds); 31 | static PyObject *pyBBGetSQL(pyBigWigFile_t *pybw, PyObject *args); 32 | static PyObject *pyBwGetHeader(pyBigWigFile_t *pybw, PyObject *args); 33 
| static PyObject *pyBwAddHeader(pyBigWigFile_t *pybw, PyObject *args, PyObject *kwds); 34 | static PyObject *pyBwAddEntries(pyBigWigFile_t *pybw, PyObject *args, PyObject *kwds); 35 | static void pyBwDealloc(pyBigWigFile_t *pybw); 36 | 37 | //The function types aren't actually correct... 38 | static PyMethodDef bwMethods[] = { 39 | {"open", (PyCFunction)pyBwOpen, METH_VARARGS, 40 | "Open a bigWig or bigBed file. For remote files, give a URL starting with HTTP,\n\ 41 | FTP, or HTTPS.\n\ 42 | \n\ 43 | Optional arguments:\n\ 44 | mode: An optional mode. The default is 'r', which opens a file for reading.\n\ 45 | If you specify a mode containing 'w' then you'll instead open a file\n\ 46 | for writing. Note that you then need to add an appropriate header\n\ 47 | before use. For bigBed files, only reading is supported.\n\ 48 | \n\ 49 | Returns:\n\ 50 | A bigWigFile object on success, otherwise None.\n\ 51 | \n\ 52 | Arguments:\n\ 53 | file: The name of a bigWig file.\n\ 54 | \n\ 55 | >>> import pyBigWig\n\ 56 | >>> bw = pyBigWig.open(\"some_file.bw\")\n"}, 57 | {NULL, NULL, 0, NULL} 58 | }; 59 | 60 | static PyMethodDef bwObjMethods[] = { 61 | {"header", (PyCFunction)pyBwGetHeader, METH_VARARGS, 62 | "Returns the header of a bigWig file. 
This contains information such as: \n\ 63 | * The version number of the file ('version').\n\ 64 | * The number of zoom levels ('nLevels').\n\ 65 | * The number of bases covered ('nBasesCovered').\n\ 66 | * The minimum value ('minVal').\n\ 67 | * The maximum value ('maxVal').\n\ 68 | * The sum of all values ('sumData').\n\ 69 | * The sum of the square of all values ('sumSquared').\n\ 70 | These are returned as a dictionary.\n\ 71 | \n\ 72 | >>> import pyBigWig\n\ 73 | >>> bw = pyBigWig.open(\"some_file.bw\")\n\ 74 | >>> bw.header()\n\ 75 | {'maxVal': 2L, 'sumData': 272L, 'minVal': 0L, 'version': 4L,\n\ 76 | 'sumSquared': 500L, 'nLevels': 1L, 'nBasesCovered': 154L}\n\ 77 | >>> bw.close()\n"}, 78 | {"close", (PyCFunction)pyBwClose, METH_VARARGS, 79 | "Close a bigWig file.\n\ 80 | \n\ 81 | >>> import pyBigWig\n\ 82 | >>> bw = pyBigWig.open(\"some_file.bw\")\n\ 83 | >>> bw.close()\n"}, 84 | {"isBigWig", (PyCFunction)pyIsBigWig, METH_VARARGS, 85 | "Returns True if the object is a bigWig file (otherwise False).\n\ 86 | >>> import pyBigWig\n\ 87 | >>> bw = pyBigWig.open(\"some_file.bigWig\")\n\ 88 | >>> bw.isBigWig()\n\ 89 | True\n\ 90 | >>> bw.isBigBed()\n\ 91 | False\n"}, 92 | {"isBigBed", (PyCFunction)pyIsBigBed, METH_VARARGS, 93 | "Returns true if the object is a bigBed file (otherwise False).\n\ 94 | >>> import pyBigWig\n\ 95 | >>> bw = pyBigWig.open(\"some_file.bigBed\")\n\ 96 | >>> bw.isBigWig()\n\ 97 | False\n\ 98 | >>> bw.isBigBed()\n\ 99 | True\n"}, 100 | {"chroms", (PyCFunction)pyBwGetChroms, METH_VARARGS, 101 | "Return a chromosome: length dictionary. 
The order is typically not\n\ 102 | alphabetical and the lengths are long (thus the 'L' suffix).\n\ 103 | \n\ 104 | Optional arguments:\n\ 105 | chrom: An optional chromosome name\n\ 106 | \n\ 107 | Returns:\n\ 108 | A list of chromosome lengths or a dictionary of them.\n\ 109 | \n\ 110 | >>> import pyBigWig\n\ 111 | >>> bw = pyBigWig.open(\"test/test.bw\")\n\ 112 | >>> bw.chroms()\n\ 113 | {'1': 195471971L, '10': 130694993L}\n\ 114 | \n\ 115 | Note that you may optionally supply a specific chromosome:\n\ 116 | \n\ 117 | >>> bw.chroms(\"chr1\")\n\ 118 | 195471971L\n\ 119 | \n\ 120 | If you specify a non-existant chromosome then no output is produced:\n\ 121 | \n\ 122 | >>> bw.chroms(\"foo\")\n\ 123 | >>>\n"}, 124 | {"stats", (PyCFunction)pyBwGetStats, METH_VARARGS|METH_KEYWORDS, 125 | "Return summary statistics for a given range. On error, this function throws a\n\ 126 | runtime exception.\n\ 127 | \n\ 128 | Positional arguments:\n\ 129 | chr: Chromosome name\n\ 130 | \n\ 131 | Keyword arguments:\n\ 132 | start: Starting position\n\ 133 | end: Ending position\n\ 134 | type: Summary type (mean, min, max, coverage, std, sum), default 'mean'.\n\ 135 | nBins: Number of bins into which the range should be divided before\n\ 136 | computing summary statistics. The default is 1.\n\ 137 | exact: By default, pyBigWig uses the same method as Kent's tools from UCSC\n\ 138 | for computing statistics. This means that 'zoom levels' may be\n\ 139 | used, rather than actual values (please see the pyBigWig repository\n\ 140 | on github for further information on this). To avoid this behaviour,\n\ 141 | simply specify 'exact=True'. Note that values returned will then\n\ 142 | differ from what UCSC, IGV, and similar other tools will report.\n\ 143 | \n\ 144 | >>> import pyBigWig\n\ 145 | >>> bw = pyBigWig.open(\"test/test.bw\")\n\ 146 | >>> bw.stats(\"1\", 0, 3)\n\ 147 | [0.2000000054637591]\n\ 148 | \n\ 149 | This is the mean value over the range 1:1-3 (in 1-based coordinates). 
If\n\ 150 | the start and end positions aren't given the entire chromosome is used.\n\ 151 | There are additional optional parameters 'type' and 'nBins'. 'type'\n\ 152 | specifies the type of summary information to calculate, which is 'mean'\n\ 153 | by default. Other possibilites for 'type' are: 'min' (minimum value),\n\ 154 | 'max' (maximum value), 'coverage' (number of covered bases), and 'std'\n\ 155 | (standard deviation). 'nBins' defines how many bins the region will be\n\ 156 | divided into and defaults to 1.\n\ 157 | \n\ 158 | >>> bw.stats(\"1\", 0, 3, type=\"min\")\n\ 159 | [0.10000000149011612]\n\ 160 | >>> bw.stats(\"1\", 0, 3, type=\"max\")\n\ 161 | [0.30000001192092896]\n\ 162 | >>> bw.stats(\"1\", 0, 10, type=\"coverage\")\n\ 163 | [0.30000000000000004]\n\ 164 | >>> bw.stats(\"1\", 0, 3, type=\"std\")\n\ 165 | [0.10000000521540645]\n\ 166 | >>> bw.stats(\"1\",99,200, type=\"max\", nBins=2)\n\ 167 | [1.399999976158142, 1.5]\n"}, 168 | #ifdef WITHNUMPY 169 | {"values", (PyCFunction)pyBwGetValues, METH_VARARGS|METH_KEYWORDS, 170 | "Retrieve the value stored for each position (or None). On error, a runtime\n\ 171 | exception is thrown.\n\ 172 | \n\ 173 | Positional arguments:\n\ 174 | chr: Chromosome name\n\ 175 | start: Starting position\n\ 176 | end: Ending position\n\ 177 | \n\ 178 | Optional arguments:\n\ 179 | numpy: If True, return a numpy array rather than a list of values. This\n\ 180 | is generally more memory efficient. Note that this option is only\n\ 181 | available if pyBigWig was installed with numpy support (check the\n\ 182 | pyBigWig.numpy() function).\n\ 183 | \n\ 184 | >>> import pyBigWig\n\ 185 | >>> bw = pyBigWig.open(\"test/test.bw\")\n\ 186 | >>> bw.values(\"1\", 0, 3)\n\ 187 | [0.10000000149011612, 0.20000000298023224, 0.30000001192092896]\n\ 188 | \n\ 189 | The length of the returned list will always match the length of the\n\ 190 | range. 
Any uncovered bases will have a value of None.\n\ 191 | \n\ 192 | >>> bw.values(\"1\", 0, 4)\n\ 193 | [0.10000000149011612, 0.20000000298023224, 0.30000001192092896, None]\n\ 194 | \n"}, 195 | #else 196 | {"values", (PyCFunction)pyBwGetValues, METH_VARARGS, 197 | "Retrieve the value stored for each position (or None). On error, a runtime\n\ 198 | exception is thrown.\n\ 199 | \n\ 200 | Positional arguments:\n\ 201 | chr: Chromosome name\n\ 202 | start: Starting position\n\ 203 | end: Ending position\n\ 204 | \n\ 205 | >>> import pyBigWig\n\ 206 | >>> bw = pyBigWig.open(\"test/test.bw\")\n\ 207 | >>> bw.values(\"1\", 0, 3)\n\ 208 | [0.10000000149011612, 0.20000000298023224, 0.30000001192092896]\n\ 209 | \n\ 210 | The length of the returned list will always match the length of the\n\ 211 | range. Any uncovered bases will have a value of None.\n\ 212 | \n\ 213 | >>> bw.values(\"1\", 0, 4)\n\ 214 | [0.10000000149011612, 0.20000000298023224, 0.30000001192092896, None]\n\ 215 | \n"}, 216 | #endif 217 | {"intervals", (PyCFunction)pyBwGetIntervals, METH_VARARGS|METH_KEYWORDS, 218 | "Retrieve each interval covering a part of a chromosome/region. On error, a\n\ 219 | runtime exception is thrown.\n\ 220 | \n\ 221 | Positional arguments:\n\ 222 | chr: Chromosome name\n\ 223 | \n\ 224 | Keyword arguments:\n\ 225 | start: Starting position\n\ 226 | end: Ending position\n\ 227 | \n\ 228 | If start and end aren't specified, the entire chromosome is returned.\n\ 229 | The returned object is a tuple containing the starting position, end\n\ 230 | position, and value of each interval in the file. 
As with all bigWig\n\ 231 | positions, those returned are 0-based half-open (e.g., a start of 0 and\n\ 232 | end of 10 specifies the first 10 positions).\n\ 233 | \n\ 234 | >>> import pyBigWig\n\ 235 | >>> bw = pyBigWig.open(\"test/test.bw\")\n\ 236 | >>> bw.intervals(\"1\", 0, 3)\n\ 237 | ((0, 1, 0.10000000149011612), (1, 2, 0.20000000298023224),\n\ 238 | (2, 3, 0.30000001192092896))\n\ 239 | >>> bw.close()"}, 240 | {"entries", (PyCFunction) pyBBGetEntries, METH_VARARGS|METH_KEYWORDS, 241 | "Retrieves entries from a bigBed file. These can optionally contain the string\n\ 242 | associated with each entry.\n\ 243 | \n\ 244 | Positional arguments:\n\ 245 | chr: Chromosome name\n\ 246 | \n\ 247 | Keyword arguments:\n\ 248 | start: Starting position\n\ 249 | end: Ending position\n\ 250 | withString: If True, return the string associated with each entry.\n\ 251 | Default True.\n\ 252 | \n\ 253 | The output is a list of tuples, with members \"start\", \"end\", and \"string\"\n\ 254 | (assuming \"withString=True\"). If there are no overlapping entries, then None\n\ 255 | is returned.\n\ 256 | \n\ 257 | >>> import pyBigWig\n\ 258 | >>> bb = pyBigWig.open(\"https://www.encodeproject.org/files/ENCFF001JBR/@@download/ENCFF001JBR.bigBed\")\n\ 259 | >>> print(bw.entries('chr1',10000000,10020000))\n\ 260 | [(10009333, 10009640, '61035\t130\t-\t0.026\t0.42\t404'),\n\ 261 | (10014007, 10014289, '61047\t136\t-\t0.029\t0.42\t404'),\n\ 262 | (10014373, 10024307, '61048\t630\t-\t5.420\t0.00\t2672399')]\n\ 263 | >>> print(bb.entries(\"chr1\", 10000000, 10000500, withString=False))\n\ 264 | [(10009333, 10009640), (10014007, 10014289), (10014373, 10024307)]\n\ 265 | \n"}, 266 | {"SQL", (PyCFunction) pyBBGetSQL, METH_VARARGS, 267 | "Returns the SQL string associated with the file. 
This is typically useful for\n\ 268 | bigBed files, where this determines what is held in each column of the text\n\ 269 | string associated with entries.\n\ 270 | \n\ 271 | If there is no SQL string, then None is returned.\n\ 272 | \n\ 273 | >>> import pyBigWig\n\ 274 | >>> bb = pyBigWig.open(\"https://www.encodeproject.org/files/ENCFF001JBR/@@download/ENCFF001JBR.bigBed\")\n\ 275 | >>> print(bb.SQL())\n\ 276 | table RnaElements\n\ 277 | \"BED6 + 3 scores for RNA Elements data \"\n\ 278 | (\n\ 279 | string chrom; \"Reference sequence chromosome or scaffold\"\n\ 280 | uint chromStart; \"Start position in chromosome\"\n\ 281 | uint chromEnd; \"End position in chromosome\"\n\ 282 | string name; \"Name of item\"\n\ 283 | uint score; \"Normalized score from 0-1000\"\n\ 284 | char[1] strand; \"+ or - or . for unknown\"\n\ 285 | float level; \"Expression level such as RPKM or FPKM. Set to -1 for no data.\"\n\ 286 | float signif; \"Statistical significance such as IDR. Set to -1 for no data.\"\n\ 287 | uint score2; \"Additional measurement/count e.g. number of reads. Set to 0 for no data.\"\n\ 288 | )\n\ 289 | \n\ 290 | \n"}, 291 | {"addHeader", (PyCFunction)pyBwAddHeader, METH_VARARGS|METH_KEYWORDS, 292 | "Adds a header to a file opened for writing. This MUST be called before adding\n\ 293 | any entries. On error, a runtime exception is thrown.\n\ 294 | \n\ 295 | Positional arguments:\n\ 296 | cl: A chromosome list, of the form (('chr1', 1000), ('chr2', 2000), ...).\n\ 297 | In other words, each element of the list is a tuple containing a\n\ 298 | chromosome name and its associated length.\n\ 299 | \n\ 300 | Keyword arguments:\n\ 301 | maxZooms: The maximum number of zoom levels. The value must be >=0. 
The\n\ 302 | default is 10.\n\ 303 | \n\ 304 | >>> import pyBigWig\n\ 305 | >>> import tempfile\n\ 306 | >>> import os\n\ 307 | >>> ofile = tempfile.NamedTemporaryFile(delete=False)\n\ 308 | >>> oname = ofile.name\n\ 309 | >>> ofile.close()\n\ 310 | >>> bw = pyBigWig.open(oname, 'w')\n\ 311 | >>> bw.addHeader([(\"1\", 1000000), (\"2\", 1500000)], maxZooms=0)\n\ 312 | >>> bw.close()\n\ 313 | >>> os.remove(oname)"}, 314 | {"addEntries", (PyCFunction)pyBwAddEntries, METH_VARARGS|METH_KEYWORDS, 315 | "Adds one or more entries to a bigWig file. This returns nothing, but throws a\n\ 316 | runtime exception on error.\n\ 317 | \n\ 318 | This function always accepts an optional 'validate' option. If set to 'True',\n\ 319 | which is the default, the input entries are checked to ensure that they come\n\ 320 | after previously entered entries. This comes with significant overhead, so if\n\ 321 | this is instead 'False' then this validation is not performed.\n\ 322 | \n\ 323 | There are three manners in which entries can be stored in bigWig files.\n\ 324 | \n\ 325 | \n\ 326 | bedGraph-like entries (12 bytes each):\n\ 327 | \n\ 328 | Positional arguments:\n\ 329 | chrom: A list of chromosome. These MUST match those added with addHeader().\n\ 330 | starts: A list of start positions. These are 0-based.\n\ 331 | \n\ 332 | Keyword arguments:\n\ 333 | ends: A list of end positions. These are 0-based half open, so a start of\n\ 334 | 0 and end of 10 specifies the first 10 bases.\n\ 335 | values: A list of values.\n\ 336 | \n\ 337 | \n\ 338 | Variable-step entries (8 bytes each):\n\ 339 | \n\ 340 | Positional arguments:\n\ 341 | chrom: A chromosome name. This MUST match one added with addHeader().\n\ 342 | starts: A list of start positions. These are 0-based.\n\ 343 | \n\ 344 | Keyword arguments:\n\ 345 | values: A list of values.\n\ 346 | span: A span width. This is an integer value and specifies how many bases\n\ 347 | each entry describes. 
An entry with a start position of 0 and a span\n\ 348 | of 10 describes the first 10 bases.\n\ 349 | \n\ 350 | \n\ 351 | Fixed-step entries (4 bytes each):\n\ 352 | \n\ 353 | Positional arguments:\n\ 354 | chrom: A chromosome name. This MUST match one added with addHeader().\n\ 355 | starts: A start position. These are 0-based. The start position of each\n\ 356 | entry starts 'step' after the previous and describes 'span' bases.\n\ 357 | \n\ 358 | Keyword arguments:\n\ 359 | values: A list of values.\n\ 360 | span: A span width. This is an integer value and specifies how many bases\n\ 361 | each entry describes. An entry with a start position of 0 and a span\n\ 362 | of 10 describes the first 10 bases.\n\ 363 | step: A step width. Each subsequent entry begins this number of bases\n\ 364 | after the previous. So if the first entry has a start of 0 and step\n\ 365 | or 30, the second entry will start at 30.\n\ 366 | \n\ 367 | >>> import pyBigWig\n\ 368 | >>> import tempfile\n\ 369 | >>> import os\n\ 370 | >>> ofile = tempfile.NamedTemporaryFile(delete=False)\n\ 371 | >>> oname = ofile.name\n\ 372 | >>> ofile.close()\n\ 373 | >>> bw = pyBigWig.open(oname, 'w')\n\ 374 | >>> bw.addHeader([(\"1\", 1000000), (\"2\", 1500000)])\n\ 375 | >>> #Add some bedGraph-like entries\n\ 376 | >>> bw.addEntries([\"1\", \"1\", \"1\"], [0, 100, 125], ends=[5, 120, 126], values=[0.0, 1.0, 200.0])\n\ 377 | >>> #Variable-step entries, the span 500-520, 600-620, and 635-655\n\ 378 | >>> bw.addEntries(\"1\", [500, 600, 635], values=[-2.0, 150.0, 25.0], span=20)\n\ 379 | >>> #Fixed-step entries, the bases described are 900-920, 930-950, and 960-980\n\ 380 | >>> bw.addEntries(\"1\", 900, values=[-5.0, -20.0, 25.0], span=20, step=30)\n\ 381 | >>> #This only works due to using validate=False. 
Obviously the file is then corrupt.\n\ 382 | >>> bw.addEntries([\"1\", \"1\", \"1\"], [0, 100, 125], ends=[5, 120, 126], values=[0.0, 1.0, 200.0], validate=False)\n\ 383 | >>> bw.close()\n\ 384 | >>> os.remove(oname)"}, 385 | {"__enter__", (PyCFunction)pyBwEnter, METH_NOARGS, NULL}, 386 | {"__exit__", (PyCFunction)pyBwClose, METH_VARARGS, NULL}, 387 | {NULL, NULL, 0, NULL} 388 | }; 389 | 390 | #if PY_MAJOR_VERSION >= 3 391 | struct pyBigWigmodule_state { 392 | PyObject *error; 393 | }; 394 | 395 | #define GETSTATE(m) ((struct pyBigWigmodule_state*)PyModule_GetState(m)) 396 | 397 | static PyModuleDef pyBigWigmodule = { 398 | PyModuleDef_HEAD_INIT, 399 | "pyBigWig", 400 | "A python module for bigWig file access", 401 | -1, 402 | bwMethods, 403 | NULL, NULL, NULL, NULL 404 | }; 405 | #endif 406 | 407 | //Should set tp_dealloc, tp_print, tp_repr, tp_str, tp_members 408 | static PyTypeObject bigWigFile = { 409 | #if PY_MAJOR_VERSION >= 3 410 | PyVarObject_HEAD_INIT(NULL, 0) 411 | #else 412 | PyObject_HEAD_INIT(NULL) 413 | 0, /*ob_size*/ 414 | #endif 415 | "pyBigWig.bigWigFile", /*tp_name*/ 416 | sizeof(pyBigWigFile_t), /*tp_basicsize*/ 417 | 0, /*tp_itemsize*/ 418 | (destructor)pyBwDealloc, /*tp_dealloc*/ 419 | 0, /*tp_print*/ 420 | 0, /*tp_getattr*/ 421 | 0, /*tp_setattr*/ 422 | 0, /*tp_compare*/ 423 | 0, /*tp_repr*/ 424 | 0, /*tp_as_number*/ 425 | 0, /*tp_as_sequence*/ 426 | 0, /*tp_as_mapping*/ 427 | 0, /*tp_hash*/ 428 | 0, /*tp_call*/ 429 | 0, /*tp_str*/ 430 | PyObject_GenericGetAttr, /*tp_getattro*/ 431 | PyObject_GenericSetAttr, /*tp_setattro*/ 432 | 0, /*tp_as_buffer*/ 433 | #if PY_MAJOR_VERSION >= 3 434 | Py_TPFLAGS_DEFAULT, /*tp_flags*/ 435 | #else 436 | Py_TPFLAGS_HAVE_CLASS, /*tp_flags*/ 437 | #endif 438 | "bigWig File", /*tp_doc*/ 439 | 0, /*tp_traverse*/ 440 | 0, /*tp_clear*/ 441 | 0, /*tp_richcompare*/ 442 | 0, /*tp_weaklistoffset*/ 443 | 0, /*tp_iter*/ 444 | 0, /*tp_iternext*/ 445 | bwObjMethods, /*tp_methods*/ 446 | 0, /*tp_members*/ 447 | 0, 
/*tp_getset*/ 448 | 0, /*tp_base*/ 449 | 0, /*tp_dict*/ 450 | 0, /*tp_descr_get*/ 451 | 0, /*tp_descr_set*/ 452 | 0, /*tp_dictoffset*/ 453 | 0, /*tp_init*/ 454 | 0, /*tp_alloc*/ 455 | 0, /*tp_new*/ 456 | 0,0,0,0,0,0 457 | }; 458 | -------------------------------------------------------------------------------- /pyBigWigTest/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeptools/pyBigWig/7300b0a4599e7f72085c3c27c19b375e3a2c2cc0/pyBigWigTest/__init__.py -------------------------------------------------------------------------------- /pyBigWigTest/test.bigBed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeptools/pyBigWig/7300b0a4599e7f72085c3c27c19b375e3a2c2cc0/pyBigWigTest/test.bigBed -------------------------------------------------------------------------------- /pyBigWigTest/test.bw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeptools/pyBigWig/7300b0a4599e7f72085c3c27c19b375e3a2c2cc0/pyBigWigTest/test.bw -------------------------------------------------------------------------------- /pyBigWigTest/test.py: -------------------------------------------------------------------------------- 1 | import pyBigWig 2 | import tempfile 3 | import os 4 | import sys 5 | import hashlib 6 | import numpy as np 7 | 8 | class TestRemote(): 9 | fname = "http://raw.githubusercontent.com/dpryan79/pyBigWig/master/pyBigWigTest/test.bw" 10 | 11 | def doOpen(self): 12 | bw = pyBigWig.open(self.fname) 13 | assert(bw is not None) 14 | return bw 15 | 16 | def doOpenWith(self): 17 | with pyBigWig.open(self.fname) as bw: 18 | assert(bw.chroms() == {'1': 195471971, '10': 130694993}) 19 | 20 | def doChroms(self, bw): 21 | assert(bw.chroms() == {'1': 195471971, '10': 130694993}) 22 | assert(bw.chroms("1") == 195471971) 23 | assert(bw.chroms("c") is None) 24 | 25 | def 
doHeader(self, bw): 26 | assert(bw.header() == {'maxVal': 2, 'sumData': 272, 'minVal': 0, 'version': 4, 'sumSquared': 500, 'nLevels': 1, 'nBasesCovered': 154}) 27 | 28 | def doStats(self, bw): 29 | assert(bw.stats("1", 0, 3) == [0.2000000054637591]) 30 | assert(bw.stats("1", 0, 3, type="max") == [0.30000001192092896]) 31 | assert(bw.stats("1",99,200, type="max", nBins=2) == [1.399999976158142, 1.5]) 32 | assert(bw.stats("1",np.int64(99), np.int64(200), type="max", nBins=2) == [1.399999976158142, 1.5]) 33 | assert(bw.stats("1") == [1.3351851569281683]) 34 | 35 | def doValues(self, bw): 36 | assert(bw.values("1", 0, 3) == [0.10000000149011612, 0.20000000298023224, 0.30000001192092896]) 37 | assert(bw.values("1", np.int64(0), np.int64(3)) == [0.10000000149011612, 0.20000000298023224, 0.30000001192092896]) 38 | #assert(bw.values("1", 0, 4) == [0.10000000149011612, 0.20000000298023224, 0.30000001192092896, 'nan']) 39 | 40 | def doIntervals(self, bw): 41 | assert(bw.intervals("1", 0, 3) == ((0, 1, 0.10000000149011612), (1, 2, 0.20000000298023224), (2, 3, 0.30000001192092896))) 42 | assert(bw.intervals("1", np.int64(0), np.int64(3)) == ((0, 1, 0.10000000149011612), (1, 2, 0.20000000298023224), (2, 3, 0.30000001192092896))) 43 | assert(bw.intervals("1") == ((0, 1, 0.10000000149011612), (1, 2, 0.20000000298023224), (2, 3, 0.30000001192092896), (100, 150, 1.399999976158142), (150, 151, 1.5))) 44 | 45 | def doSum(self, bw): 46 | assert(bw.stats("1", 100, 151, type="sum", nBins=2) == [35.0, 36.5]) 47 | 48 | def doWrite(self, bw): 49 | ofile = tempfile.NamedTemporaryFile(delete=False) 50 | oname = ofile.name 51 | ofile.close() 52 | bw2 = pyBigWig.open(oname, "w") 53 | assert(bw2 is not None) 54 | #Since this is an unordered dict(), iterating over the items can swap the order! 
55 | chroms = [("1", bw.chroms("1")), ("10", bw.chroms("10"))] 56 | assert(len(bw.chroms()) == 2) 57 | bw2.addHeader(chroms, maxZooms=1) 58 | #Copy the input file 59 | for c in chroms: 60 | ints = bw.intervals(c[0]) 61 | chroms2 = [] 62 | starts = [] 63 | ends = [] 64 | values = [] 65 | for entry in ints: 66 | chroms2.append(c[0]) 67 | starts.append(entry[0]) 68 | ends.append(entry[1]) 69 | values.append(entry[2]) 70 | bw2.addEntries(chroms2, starts, ends=ends, values=values) 71 | bw2.close() 72 | #Ensure that the copied file has the same entries and max/min/etc. 73 | bw2 = pyBigWig.open(oname) 74 | assert(bw.header() == bw2.header()) 75 | assert(bw.chroms() == bw2.chroms()) 76 | for c in chroms: 77 | ints1 = bw.intervals(c[0]) 78 | ints2 = bw2.intervals(c[0]) 79 | assert(ints1 == ints2) 80 | bw.close() 81 | bw2.close() 82 | #Clean up 83 | os.remove(oname) 84 | 85 | def doWrite2(self): 86 | ''' 87 | Test all three modes of storing entries. Also test to ensure that we get error messages when doing something silly 88 | 89 | This is a modified version of the writing example from libBigWig 90 | ''' 91 | chroms = ["1"]*6 92 | starts = [0, 100, 125, 200, 220, 230, 500, 600, 625, 700, 800, 850] 93 | ends = [5, 120, 126, 205, 226, 231] 94 | values = [0.0, 1.0, 200.0, -2.0, 150.0, 25.0, 0.0, 1.0, 200.0, -2.0, 150.0, 25.0, -5.0, -20.0, 25.0, -5.0, -20.0, 25.0] 95 | ofile = tempfile.NamedTemporaryFile(delete=False) 96 | oname = ofile.name 97 | ofile.close() 98 | bw = pyBigWig.open(oname, "w") 99 | bw.addHeader([("1", 1000000), ("2", 1500000)]) 100 | 101 | #Intervals 102 | bw.addEntries(chroms[0:3], starts[0:3], ends=ends[0:3], values=values[0:3]) 103 | bw.addEntries(chroms[3:6], starts[3:6], ends=ends[3:6], values=values[3:6]) 104 | 105 | #IntervalSpans 106 | bw.addEntries("1", starts[6:9], values=values[6:9], span=20) 107 | bw.addEntries("1", starts[9:12], values=values[9:12], span=20) 108 | 109 | #IntervalSpanSteps, this should instead take an int 110 | bw.addEntries("1", 
900, values=values[12:15], span=20, step=30) 111 | bw.addEntries("1", 990, values=values[15:18], span=20, step=30) 112 | 113 | #Attempt to add incorrect values. These MUST raise an exception 114 | try: 115 | bw.addEntries(chroms[0:3], starts[0:3], ends=ends[0:3], values=values[0:3]) 116 | assert(1==0) 117 | except RuntimeError: 118 | pass 119 | try: 120 | bw.addEntries("1", starts[6:9], values=values[6:9], span=20) 121 | assert(1==0) 122 | except RuntimeError: 123 | pass 124 | try: 125 | bw.addEntries("3", starts[6:9], values=values[6:9], span=20) 126 | assert(1==0) 127 | except RuntimeError: 128 | pass 129 | try: 130 | bw.addEntries("1", 900, values=values[12:15], span=20, step=30) 131 | assert(1==0) 132 | except RuntimeError: 133 | pass 134 | 135 | #Add a few intervals on a new chromosome 136 | bw.addEntries(["2"]*3, starts[0:3], ends=ends[0:3], values=values[0:3]) 137 | bw.close() 138 | #check md5sum, this is the simplest method to check correctness 139 | h = hashlib.md5(open(oname, "rb").read()).hexdigest() 140 | assert(h=="ef104f198c6ce8310acc149d0377fc16") 141 | #Clean up 142 | os.remove(oname) 143 | 144 | def doWriteEmpty(self): 145 | ofile = tempfile.NamedTemporaryFile(delete=False) 146 | oname = ofile.name 147 | ofile.close() 148 | bw = pyBigWig.open(oname, "w") 149 | bw.addHeader([("1", 1000000), ("2", 1500000)]) 150 | bw.close() 151 | 152 | #check md5sum 153 | h = hashlib.md5(open(oname, "rb").read()).hexdigest() 154 | assert(h=="361c600e5badf0b45d819552a7822937") 155 | 156 | #Ensure we can open and get reasonable results 157 | bw = pyBigWig.open(oname) 158 | assert(bw.chroms() == {'1': 1000000, '2': 1500000}) 159 | assert(bw.intervals("1") == None) 160 | assert(bw.values("1", 0, 1000000) == []) 161 | assert(bw.stats("1", 0, 1000000, nBins=2) == [None, None]) 162 | bw.close() 163 | 164 | #Clean up 165 | os.remove(oname) 166 | 167 | def doWriteNumpy(self): 168 | ofile = tempfile.NamedTemporaryFile(delete=False) 169 | oname = ofile.name 170 | ofile.close() 
171 | bw = pyBigWig.open(oname, "w") 172 | bw.addHeader([("chr1", 100), ("chr2", 150), ("chr3", 200), ("chr4", 250)]) 173 | chroms = np.array(["chr1"] * 2 + ["chr2"] * 2 + ["chr3"] * 2 + ["chr4"] * 2) 174 | starts = np.array([0, 10, 40, 50, 60, 70, 80, 90], dtype=np.int64) 175 | ends = np.array([5, 15, 45, 55, 65, 75, 85, 95], dtype=np.int64) 176 | values0 = np.array(np.random.random_sample(8), dtype=np.float64) 177 | bw.addEntries(chroms, starts, ends=ends, values=values0) 178 | bw.close() 179 | 180 | vals = [(x, y, z) for x, y, z in zip(starts, ends, values0)] 181 | bw = pyBigWig.open(oname) 182 | assert(bw.chroms() == {'chr1': 100, 'chr2': 150, 'chr3': 200, 'chr4': 250}) 183 | for idx1, chrom in enumerate(["chr1", "chr2", "chr3", "chr4"]): 184 | for idx2, tup in enumerate(bw.intervals(chrom)): 185 | assert(tup[0] == starts[2 * idx1 + idx2]) 186 | assert(tup[1] == ends[2 * idx1 + idx2]) 187 | assert(np.isclose(tup[2], values0[2 * idx1 + idx2])) 188 | bw.close() 189 | 190 | #Clean up 191 | os.remove(oname) 192 | 193 | def testAll(self): 194 | bw = self.doOpen() 195 | self.doChroms(bw) 196 | if not self.fname.startswith("http"): 197 | self.doHeader(bw) 198 | self.doStats(bw) 199 | self.doSum(bw) 200 | self.doValues(bw) 201 | self.doIntervals(bw) 202 | self.doWrite(bw) 203 | self.doOpenWith() 204 | self.doWrite2() 205 | self.doWriteEmpty() 206 | self.doWriteNumpy() 207 | bw.close() 208 | 209 | class TestLocal(): 210 | def testFoo(self): 211 | blah = TestRemote() 212 | blah.fname = os.path.dirname(pyBigWig.__file__) + "/pyBigWigTest/test.bw" 213 | blah.testAll() 214 | 215 | class TestBigBed(): 216 | def testBigBed(self): 217 | fname = os.path.dirname(pyBigWig.__file__) + "/pyBigWigTest/test.bigBed" 218 | bb = pyBigWig.open(fname) 219 | assert(bb is not None) 220 | assert(bb.isBigWig() == 0) 221 | assert(bb.isBigBed() == 1) 222 | SQL = """table RnaElements 223 | "BED6 + 3 scores for RNA Elements data " 224 | ( 225 | string chrom; "Reference sequence chromosome or 
scaffold" 226 | uint chromStart; "Start position in chromosome" 227 | uint chromEnd; "End position in chromosome" 228 | string name; "Name of item" 229 | uint score; "Normalized score from 0-1000" 230 | char[1] strand; "+ or - or . for unknown" 231 | float level; "Expression level such as RPKM or FPKM. Set to -1 for no data." 232 | float signif; "Statistical significance such as IDR. Set to -1 for no data." 233 | uint score2; "Additional measurement/count e.g. number of reads. Set to 0 for no data." 234 | ) 235 | """ 236 | output = bb.SQL() 237 | if isinstance(output, bytes): 238 | output = output.decode('ASCII') 239 | assert(output == SQL) 240 | o = bb.entries('chr1',10000000,10020000) 241 | expected = [(10009333, 10009640, '61035\t130\t-\t0.026\t0.42\t404'), (10014007, 10014289, '61047\t136\t-\t0.029\t0.42\t404'), (10014373, 10024307, '61048\t630\t-\t5.420\t0.00\t2672399')] 242 | assert(o == expected) 243 | o = bb.entries('chr1',np.int64(10000000),np.int64(10020000)) 244 | assert(o == expected) 245 | bb.close() 246 | 247 | class TestNumpy(): 248 | def testNumpy(self): 249 | import os 250 | if pyBigWig.numpy == 0: 251 | return 0 252 | import numpy as np 253 | 254 | bw = pyBigWig.open("/tmp/delete.bw", "w") 255 | bw.addHeader([("1", 1000)], maxZooms=0) 256 | # Type 0 257 | chroms = np.array(["1"] * 10) 258 | starts = np.array([0, 10, 20, 30, 40, 50, 60, 70, 80, 90], dtype=np.int64) 259 | ends = np.array([5, 15, 25, 35, 45, 55, 65, 75, 85, 95], dtype=np.int64) 260 | values0 = np.array(np.random.random_sample(10), dtype=np.float64) 261 | bw.addEntries(chroms, starts, ends=ends, values=values0) 262 | 263 | starts = np.array([100, 110, 120, 130, 140, 150, 160, 170, 180, 190], dtype=np.int64) 264 | ends = np.array([105, 115, 125, 135, 145, 155, 165, 175, 185, 195], dtype=np.int64) 265 | values1 = np.array(np.random.random_sample(10), dtype=np.float64) 266 | bw.addEntries(chroms, starts, ends=ends, values=values1) 267 | 268 | # Type 1, single chrom, multiple 
starts/values, single span 269 | starts = np.array([200, 210, 220, 230, 240, 250, 260, 270, 280, 290], dtype=np.int64) 270 | values2 = np.array(np.random.random_sample(10), dtype=np.float64) 271 | bw.addEntries("1", starts, span=np.int64(8), values=values2) 272 | 273 | starts = np.array([300, 310, 320, 330, 340, 350, 360, 370, 380, 390], dtype=np.int64) 274 | values3 = np.array(np.random.random_sample(10), dtype=np.float64) 275 | bw.addEntries("1", starts, span=np.int64(8), values=values3) 276 | 277 | # Type 2, single chrom/start/span/step, multiple values 278 | values4 = np.array(np.random.random_sample(10), dtype=np.float64) 279 | bw.addEntries("1", np.int64(400), span=np.int64(8), step=np.int64(2), values=values4) 280 | 281 | values5 = np.array(np.random.random_sample(10), dtype=np.float64) 282 | bw.addEntries("1", np.int64(500), span=np.int64(8), step=np.int64(2), values=values5) 283 | 284 | bw.close() 285 | 286 | bw = pyBigWig.open("/tmp/delete.bw") 287 | assert(bw is not None) 288 | 289 | def compy(start, v2): 290 | v = [] 291 | for t in bw.intervals("1", start, start + 100): 292 | v.append(t[2]) 293 | v = np.array(v) 294 | assert(np.all(np.abs(v - v2) < 1e-5)) 295 | 296 | compy(0, values0) 297 | compy(100, values1) 298 | compy(200, values2) 299 | compy(300, values3) 300 | compy(400, values4) 301 | compy(500, values5) 302 | 303 | # Get values as a numpy array 304 | foo = bw.values("1", 0, 100, numpy=False) 305 | assert(isinstance(foo, list)) 306 | foo = bw.values("1", 0, 100, numpy=True) 307 | assert(isinstance(foo, np.ndarray)) 308 | 309 | bw.close() 310 | os.remove("/tmp/delete.bw") 311 | 312 | def testNumpyValues(self): 313 | if pyBigWig.numpy == 0: 314 | return 0 315 | import numpy as np 316 | 317 | fname = "http://raw.githubusercontent.com/dpryan79/pyBigWig/master/pyBigWigTest/test.bw" 318 | bw = pyBigWig.open(fname, "r") 319 | 320 | assert np.allclose( 321 | bw.values("1", 0, 20, numpy=True), 322 | np.array(bw.values("1", 0, 20), dtype=np.float32), 323 
| equal_nan=True 324 | ) 325 | 326 | assert np.allclose( 327 | bw.stats("1", 0, 20, "mean", 5, numpy=True), 328 | np.array(bw.stats("1", 0, 20, "mean", 5), dtype=np.float64), 329 | equal_nan=True 330 | ) 331 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "setuptools.build_meta" 3 | requires = ["numpy >= 2.0.0", "setuptools", "setuptools-scm"] 4 | 5 | [project] 6 | authors = [{name = "Devon P. Ryan", email = "dryan79@gmail.com"}] 7 | classifiers = [ 8 | "Development Status :: 5 - Production/Stable", 9 | "Intended Audience :: Developers", 10 | "License :: OSI Approved", 11 | "Programming Language :: C", 12 | "Programming Language :: Python", 13 | "Programming Language :: Python :: 3", 14 | "Programming Language :: Python :: 3.9", 15 | "Programming Language :: Python :: Implementation :: CPython", 16 | "Operating System :: POSIX", 17 | "Operating System :: Unix", 18 | "Operating System :: MacOS", 19 | ] 20 | description = "A package for accessing bigWig files using libBigWig" 21 | keywords = ["bioinformatics", "bigWig", "bigBed"] 22 | name = "pyBigWig" 23 | version = "0.3.24" 24 | readme = "README.md" 25 | requires-python = ">=3.9" 26 | 27 | [project.license] 28 | text = "MIT" 29 | 30 | [project.urls] 31 | "Bug Tracker" = "https://github.com/deeptools/pyBigWig/issues" 32 | "Download" = "https://pypi.python.org/pypi/pyBigWig" 33 | "Homepage" = "https://github.com/deeptools/pyBigWig" 34 | 35 | [tool.setuptools] 36 | # Override setuptools autodiscovery algorithm 37 | # Only include package test data/source for wheel distribution 38 | include-package-data = true 39 | packages = ["pyBigWigTest"] 40 | 41 | # Enable version inference from scm 42 | [tool.setuptools_scm] 43 | 44 | # Target only minimum CPython version 3.9 on linux for wheel build 45 | [tool.cibuildwheel] 46 | skip = "pp* cp38-* 
*-manylinux_i686 *_ppc64le *_s390x *-musllinux_x86_64 *-musllinux_i686" 47 | 48 | [tool.cibuildwheel.linux] 49 | manylinux-x86_64-image = "manylinux2014" 50 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # This is required for setuptools to name the wheel with the correct 2 | # minimum python abi version 3 | # Commenting this out, since this ends up breaking wheels on anything except python 3.7 4 | #[bdist_wheel] 5 | #py-limited-api = cp37 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import setup, Extension 3 | from distutils import sysconfig 4 | from pathlib import Path 5 | import subprocess 6 | import glob 7 | import sys 8 | 9 | srcs = [x for x in 10 | glob.glob("libBigWig/*.c")] 11 | srcs.append("pyBigWig.c") 12 | 13 | libs=["m", "z"] 14 | 15 | # do not link to python on mac, see https://github.com/deeptools/pyBigWig/issues/58 16 | if 'dynamic_lookup' not in (sysconfig.get_config_var('LDSHARED') or ''): 17 | if sysconfig.get_config_vars('BLDLIBRARY') is not None: 18 | #Note the "-l" prefix! 
19 | for e in sysconfig.get_config_vars('BLDLIBRARY')[0].split(): 20 | if e[0:2] == "-l": 21 | libs.append(e[2:]) 22 | elif sys.version_info[0] >= 3 and sys.version_info[1] >= 3: 23 | libs.append("python%i.%im" % (sys.version_info[0], sys.version_info[1])) 24 | else: 25 | libs.append("python%i.%i" % (sys.version_info[0], sys.version_info[1])) 26 | 27 | additional_libs = [sysconfig.get_config_var("LIBDIR"), sysconfig.get_config_var("LIBPL")] 28 | 29 | defines = [] 30 | try: 31 | foo, _ = subprocess.Popen(['curl-config', '--libs'], stdout=subprocess.PIPE).communicate() 32 | libs.append("curl") 33 | foo = foo.decode().strip().split() 34 | except: 35 | foo = [] 36 | defines.append(('NOCURL', None)) 37 | sys.stderr.write("Either libcurl isn't installed, it didn't come with curl-config, or curl-config isn't in your $PATH. pyBigWig will be installed without support for remote files.\n") 38 | 39 | for v in foo: 40 | if v[0:2] == '-L': 41 | additional_libs.append(v[2:]) 42 | 43 | include_dirs = ['libBigWig', sysconfig.get_config_var("INCLUDEPY")] 44 | 45 | # Add numpy build information if numpy is installed as a package 46 | try: 47 | import numpy 48 | defines.extend([('WITHNUMPY', None), ('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')]) 49 | 50 | # Ref: https://numpy.org/doc/stable/reference/c-api/coremath.html#linking-against-the-core-math-library-in-an-extension 51 | numpy_include_dir = numpy.get_include() 52 | numpy_library_dir = str(Path(numpy_include_dir) / '..' 
/ 'lib') 53 | 54 | include_dirs.append(numpy_include_dir) 55 | additional_libs.append(numpy_library_dir) 56 | libs.append('npymath') 57 | # Silently ignore a failed import of numpy 58 | except ImportError: 59 | pass 60 | 61 | module1 = Extension('pyBigWig', 62 | sources = srcs, 63 | libraries = libs, 64 | library_dirs = additional_libs, 65 | define_macros = defines, 66 | include_dirs = include_dirs) 67 | 68 | setup( 69 | ext_modules=[module1] 70 | ) 71 | --------------------------------------------------------------------------------