├── .dockerignore ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── TODO.txt ├── docker └── Dockerfile ├── inc └── wiggletools.h ├── python └── wiggletools │ ├── Makefile │ ├── __init__.py │ ├── mergeApplyDirectory.py │ ├── mergeBedLikeDirectory.sh │ ├── mergeBigWigDirectory.py │ ├── mergeProfileDirectory.py │ ├── mergeProfilesDirectory.py │ ├── multiJob.py │ ├── parallelWiggleTools.py │ ├── wigglePlots.py │ └── wiggletoolsIndex.py ├── src ├── Makefile ├── apply.c ├── bamReader.c ├── bcfReader.c ├── bedReader.c ├── bigBedReader.c ├── bigWiggleReader.c ├── bufferedReader.c ├── bufferedReader.h ├── commandParser.c ├── fib.c ├── fib.h ├── fibpriv.h ├── hash.c ├── hash.h ├── hashfib.c ├── hashfib.h ├── mWigWriter.c ├── multiSet.c ├── multiSet.h ├── multiplexer.c ├── multiplexer.h ├── plots.c ├── recycleBin.c ├── recycleBin.h ├── reducers.c ├── samReader.c ├── setComparisons.c ├── statistics.c ├── unaryOps.c ├── vcfReader.c ├── wigReader.c ├── wigWriter.c ├── wiggleIterator.c ├── wiggleIterator.h ├── wiggletools.c └── wiggletools.h ├── technical-supplement.tex └── test ├── bam.bam ├── bam.bam.bai ├── bcf.bcf ├── bcf.bcf.csi ├── chrom_sizes ├── cram.cram ├── cram.cram.crai ├── expected ├── nearest_fixedStep.bg ├── nearest_overlapping.bg ├── pearson.txt ├── profile.txt ├── profiles.txt └── regional_means.txt ├── fixedStep.bw ├── fixedStep.wig ├── overlapping.bb ├── overlapping.bed ├── overlapping_coverage.wig ├── pileup.bg ├── program.txt ├── sam.sam ├── test.py ├── variableStep.bw ├── variableStep.wig └── vcf.vcf /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | Dockerfile 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.a 3 | *.pdf 4 | *.log 5 | *.toc 6 | *.aux 7 | *.pyc 8 | .*.swp 9 | tags 10 | test*.sh 11 | bin 12 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 190 | Copyright [2016-2020] EMBL-European Bioinformatics Institute 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | default: binaries 2 | 3 | bin: 4 | mkdir -p bin 5 | 6 | Wiggletools: bin 7 | cd src; make -e 8 | 9 | Parallel: Wiggletools 10 | cd python/wiggletools; make 11 | 12 | binaries: Parallel 13 | chmod 755 bin/* 14 | 15 | test: tests 16 | 17 | tests: 18 | cd test; python2.7 test.py 19 | 20 | clean: 21 | cd src; make clean 22 | rm bin/* 23 | rm lib/* 24 | -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | Seekable apply_paste 2 | Memory tracking & cleaning of chrom labels for text files 3 | Read score in BigBed files? => Handling overlapping iterators with value in unit and filter 4 | Read data in VCF file? 5 | HMM app? (requires reverse iterators...) 
6 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 AS builder 2 | RUN apt update && apt install -y --no-install-recommends\ 3 | ca-certificates \ 4 | git \ 5 | gcc \ 6 | make \ 7 | python \ 8 | libstdc++-10-dev \ 9 | libcurl4-openssl-dev \ 10 | zlib1g-dev \ 11 | libbigwig-dev \ 12 | libhts-dev \ 13 | libgsl-dev 14 | COPY Makefile Makefile 15 | COPY src src 16 | COPY python python 17 | RUN make LIBS='-lwiggletools -lBigWig -lcurl -lz -lhts -lm -lgsl -lpthread' 18 | 19 | FROM builder as test 20 | COPY test test 21 | RUN make test 22 | 23 | FROM ubuntu:20.04 24 | RUN apt update && apt install -y --no-install-recommends \ 25 | libbigwig0 \ 26 | libhts3 \ 27 | libgsl23 28 | COPY --from=builder /bin/wiggletools /usr/local/bin/ 29 | WORKDIR /mnt 30 | ENTRYPOINT ["wiggletools"] 31 | CMD ["--help"] 32 | 33 | -------------------------------------------------------------------------------- /inc/wiggletools.h: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #ifndef _WIGGLETOOLS_DEF_ 16 | #define _WIGGLETOOLS_DEF_ 17 | 18 | #ifndef bool 19 | #define bool char 20 | #define true 1 21 | #define false 0 22 | #endif 23 | 24 | #include 25 | 26 | typedef struct wiggleIterator_st WiggleIterator; 27 | typedef struct multiplexer_st Multiplexer; 28 | typedef struct multiset_st Multiset; 29 | typedef struct histogram_st Histogram; 30 | 31 | // Creators 32 | WiggleIterator * SmartReader (char *, bool); 33 | WiggleIterator * CatWiggleIterator (char **, int); 34 | // Secondary creators (to force file format recognition if necessary) 35 | WiggleIterator * WiggleReader (char *); 36 | WiggleIterator * BigWiggleReader (char *, bool); 37 | WiggleIterator * BedReader (char *); 38 | WiggleIterator * BigBedReader (char *, bool); 39 | WiggleIterator * BamReader (char *, bool); 40 | WiggleIterator * SamReader (char *); 41 | WiggleIterator * VcfReader (char *); 42 | WiggleIterator * BcfReader (char *, bool); 43 | 44 | // Generic class functions 45 | void seek(WiggleIterator *, const char *, int, int); 46 | 47 | // Algebraic operations on iterators 48 | 49 | // Unary 50 | WiggleIterator * UnitWiggleIterator (WiggleIterator *); 51 | WiggleIterator * CoverageWiggleIterator (WiggleIterator *); 52 | WiggleIterator * UnionWiggleIterator (WiggleIterator *); 53 | WiggleIterator * NonOverlappingWiggleIterator (WiggleIterator *); 54 | WiggleIterator * AbsWiggleIterator (WiggleIterator * ); 55 | WiggleIterator * NaturalLogWiggleIterator (WiggleIterator *); 56 | WiggleIterator * NaturalExpWiggleIterator (WiggleIterator *); 57 | WiggleIterator * TestNonOverlappingWiggleIterator(WiggleIterator * ); 58 | WiggleIterator * TestNonOverlappingWiggleIterator(WiggleIterator *); 59 | WiggleIterator * OverlapWiggleIterator(WiggleIterator *, WiggleIterator *); 60 | WiggleIterator * TrimWiggleIterator(WiggleIterator *, WiggleIterator *); 61 | WiggleIterator * NoverlapWiggleIterator(WiggleIterator *, WiggleIterator *); 62 | WiggleIterator * 
NearestWiggleIterator(WiggleIterator *, WiggleIterator *); 63 | WiggleIterator * IsZero(WiggleIterator *); 64 | WiggleIterator * Floor(WiggleIterator *); 65 | // Scalar operations 66 | WiggleIterator * ScaleWiggleIterator (WiggleIterator *, double); 67 | WiggleIterator * ShiftWiggleIterator(WiggleIterator *, double); 68 | WiggleIterator * PowerWiggleIterator (WiggleIterator *, double); 69 | WiggleIterator * LogWiggleIterator (WiggleIterator * , double); 70 | WiggleIterator * ExpWiggleIterator (WiggleIterator *, double); 71 | WiggleIterator * DefaultValueWiggleIterator(WiggleIterator *, double); 72 | WiggleIterator * HighPassFilterWiggleIterator(WiggleIterator *, double); 73 | WiggleIterator * SmoothWiggleIterator(WiggleIterator * i, int); 74 | WiggleIterator * ExtendWiggleIterator(WiggleIterator * i, int); 75 | 76 | // Sets of iterators 77 | Multiplexer * newMultiplexer(WiggleIterator **, int, bool); 78 | 79 | // Reduction operators on sets 80 | 81 | WiggleIterator * SelectReduction(Multiplexer *, int); 82 | WiggleIterator * MaxReduction ( Multiplexer * ); 83 | WiggleIterator * MinReduction ( Multiplexer * ); 84 | WiggleIterator * SumReduction ( Multiplexer * ); 85 | WiggleIterator * ProductReduction ( Multiplexer * ); 86 | WiggleIterator * MeanReduction ( Multiplexer * ); 87 | WiggleIterator * VarianceReduction ( Multiplexer * ); 88 | WiggleIterator * StdDevReduction ( Multiplexer * ); 89 | WiggleIterator * EntropyReduction ( Multiplexer * ); 90 | WiggleIterator * CVReduction ( Multiplexer * ); 91 | WiggleIterator * MedianReduction ( Multiplexer * ); 92 | WiggleIterator * FillInReduction( Multiplexer * ); 93 | 94 | // Sets of sets iterators 95 | Multiset * newMultiset(Multiplexer **, int); 96 | 97 | // Reduction operators on sets of sets: 98 | WiggleIterator * TTestReduction(Multiset *); 99 | WiggleIterator * FTestReduction(Multiset *); 100 | WiggleIterator * MWUReduction(Multiset *); 101 | 102 | // Output 103 | void toFile (WiggleIterator *, char *, bool, bool); 
104 | void toStdout (WiggleIterator *, bool, bool); 105 | WiggleIterator * TeeWiggleIterator(WiggleIterator *, FILE *, bool, bool); 106 | void runWiggleIterator(WiggleIterator * ); 107 | Multiplexer * TeeMultiplexer(Multiplexer *, FILE *, bool, bool); 108 | void toStdoutMultiplexer (Multiplexer *, bool, bool); 109 | void runMultiplexer(Multiplexer * ); 110 | WiggleIterator * PrintStatisticsWiggleIterator(WiggleIterator * i, FILE * file); 111 | 112 | // Statistics 113 | // Unary 114 | WiggleIterator * AUCIntegrator (WiggleIterator *); 115 | WiggleIterator * MeanIntegrator (WiggleIterator *); 116 | WiggleIterator * MinIntegrator (WiggleIterator *); 117 | WiggleIterator * MaxIntegrator (WiggleIterator *); 118 | WiggleIterator * VarianceIntegrator (WiggleIterator *); 119 | WiggleIterator * StandardDeviationIntegrator (WiggleIterator *); 120 | WiggleIterator * CoefficientOfVariationIntegrator (WiggleIterator *); 121 | WiggleIterator * NDPearsonIntegrator(Multiset *); 122 | void regionProfile(WiggleIterator *, double *, int, int, bool); 123 | void addProfile(double *, double *, int); 124 | // Binary 125 | WiggleIterator * PearsonIntegrator (WiggleIterator * , WiggleIterator * ); 126 | // Histograms 127 | Histogram * histogram(WiggleIterator **, int, int); 128 | void normalize_histogram(Histogram *); 129 | void print_histogram(Histogram *, FILE *); 130 | 131 | // Regional statistics 132 | Multiplexer * ApplyMultiplexer(WiggleIterator *, WiggleIterator * (**statistics)(WiggleIterator *), int count, WiggleIterator *, bool strict); 133 | Multiplexer * ProfileMultiplexer(WiggleIterator *, int, WiggleIterator *); 134 | Multiplexer * PasteMultiplexer(Multiplexer *, FILE *, FILE *, bool); 135 | 136 | // Cleaning up 137 | void destroyWiggleIterator (WiggleIterator *); 138 | 139 | // Big file params 140 | void setMaxBlocks(int); 141 | void setMaxHeadStart(int); 142 | 143 | // Command line parser 144 | void rollYourOwn(int argc, char ** argv); 145 | void printHelp(); 146 | 147 | 
#endif 148 | -------------------------------------------------------------------------------- /python/wiggletools/Makefile: -------------------------------------------------------------------------------- 1 | BINDIR=../../bin 2 | 3 | default: scripts 4 | 5 | scripts: 6 | mkdir -p ${BINDIR} 7 | cp [^_]*.py *.sh ${BINDIR} 8 | -------------------------------------------------------------------------------- /python/wiggletools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/WiggleTools/34c5942f2445ee8a62e0ec24d532e35ab680b5fb/python/wiggletools/__init__.py -------------------------------------------------------------------------------- /python/wiggletools/mergeApplyDirectory.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright [1999-2017] EMBL-European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | import sys 18 | import subprocess 19 | import wigglePlots 20 | import os 21 | import os.path 22 | 23 | file=sys.argv[1] 24 | if (re.call("sort -k1,1 -k2,2n -k3,3n -m %sx/* > %s" % (file, file), shell=True)): 25 | print 'Error processing directory %sx' % file 26 | sys.exit(100) 27 | os.rmdir(file + "x") 28 | 29 | try: 30 | if os.path.getsize(wiggle_file) > 0: 31 | wigglePlots.make_overlaps(file, file + ".png") 32 | else: 33 | # Create empty file with .empty suffix 34 | open(file + ".empty", "w").close() 35 | except: 36 | sys.exit(100) 37 | -------------------------------------------------------------------------------- /python/wiggletools/mergeBedLikeDirectory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -Eeu 2 | 3 | # Copyright [1999-2017] EMBL-European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | file=$1 18 | 19 | sort -k1,1 -k2,2n -k3,3n -m ${file}x/* > $file 20 | rm -Rf ${file}x 21 | -------------------------------------------------------------------------------- /python/wiggletools/mergeBigWigDirectory.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright [1999-2017] EMBL-European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | import sys 19 | import subprocess 20 | import glob 21 | import shutil 22 | import os 23 | 24 | directory = sys.argv[1] 25 | bigwigs = glob.glob(os.path.join(directory + "x", '*.bw')) 26 | 27 | if len(bigwigs) > 0: 28 | command = ['bigWigCat', directory] + bigwigs 29 | p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 30 | return_code = p.wait() 31 | 32 | if return_code != 0: 33 | out, err = p.communicate() 34 | print "Failed: %s" % " ".join(command) 35 | print "Stdout:" 36 | print out 37 | print "Stderr:" 38 | print err 39 | sys.exit(100) 40 | 41 | else: 42 | # Create empty file with .empty suffix 43 | open(directory + ".empty", "w").close() 44 | 45 | shutil.rmtree(directory + "x") 46 | -------------------------------------------------------------------------------- /python/wiggletools/mergeProfileDirectory.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 
# [1999-2017] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import sys
import glob
import shutil
import wigglePlots

# Usage: mergeProfileDirectory.py <target>
#
# Every file of the "<target>x" directory is read as tab-separated
# "<index>\t<value>" lines. Values sharing an index are summed across files,
# the totals are written to <target> in the same format and plotted to
# <target>.png. When there is nothing to sum, an empty "<target>.empty"
# marker is created instead. The directory is removed at the end.
# Exits with status 100 on any failure.
try:
    target = sys.argv[1]
    # Running total per profile position. The list grows on demand, so the
    # result no longer depends on which chunk file is read first, or on that
    # file being non-empty.
    totals = []

    for path in glob.glob(target + "x/*"):
        with open(path) as handle:
            for line in handle:
                items = line.strip().split('\t')
                index = int(items[0])
                value = float(items[1])
                if index >= len(totals):
                    totals.extend([0.0] * (index + 1 - len(totals)))
                totals[index] += value

    if len(totals) == 0:
        # Create empty file with .empty suffix
        open(target + ".empty", "w").close()
    else:
        # Write out results
        with open(target, 'w') as output:
            for index, total in enumerate(totals):
                output.write("%i\t%f\n" % (index, total))

        # Do a little drawing
        wigglePlots.make_profile_curve(target, target + ".png", format='png')

    # Remove unnecessary files
    shutil.rmtree(target + "x")
except Exception as error:
    # Keep the historical exit status, but report what went wrong
    sys.stderr.write("Error merging profile directory: %s\n" % error)
    sys.exit(100)
/python/wiggletools/mergeProfilesDirectory.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright [1999-2017] EMBL-European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | import sys 19 | import subprocess 20 | import wigglePlots 21 | import os 22 | import os.path 23 | import shutil 24 | 25 | try: 26 | file=sys.argv[1] 27 | if (subprocess.call("sort -k1,1 -k2,2n -k3,3n -m %sx/* > %s" % (file, file), shell=True)): 28 | print 'Error processing directory %sx' % file 29 | sys.exit(100) 30 | shutil.rmtree(file + "x") 31 | 32 | if os.path.getsize(file) > 0: 33 | wigglePlots.make_profiles_matrix(file, file + ".png", format='png') 34 | else: 35 | # Create empty file with .empty suffix 36 | open(file + ".empty", "w").close() 37 | except: 38 | sys.exit(100) 39 | -------------------------------------------------------------------------------- /python/wiggletools/multiJob.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright [1999-2017] EMBL-European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os
import os.path
import sys
import tempfile
import subprocess
import re
import glob

################################################
## File hygiene
################################################

def clean_temp_file(file):
    """Delete a job file and the per-task log files stored next to it.

    Removes `file` if it exists, then every sibling matching
    `<file>_[0-9]*.out` or `<file>_[0-9]*.err`.  `None` is accepted and
    ignored so callers can pass unset slots.
    """
    if file is None:
        return

    if os.path.exists(file):
        os.remove(file)

    for file2 in glob.glob("%s_[0-9]*.out" % file) + glob.glob("%s_[0-9]*.err" % file):
        os.remove(file2)

def clean_temp_files(files):
    """Apply clean_temp_file() to every entry of `files`."""
    # Explicit loop: a bare map() is lazy on Python 3 and would delete nothing.
    for file in files:
        clean_temp_file(file)

################################################
## LSF MultiJob
################################################

def makeCommand(filename, count, batch_system='LSF', dependency=None, mem=4, working_directory='.'):
    """Build the shell command that launches the commands listed in `filename`.

    Parameters:
        filename: file holding one shell command per line.
        count: number of commands, i.e. size of the job array.
        batch_system: 'LSF', 'SGE' or 'local'.
        dependency: job the new job must wait for (LSF/SGE only), or None.
        mem: multiplied by 1024 for the LSF memory options.
        working_directory: where the job logs are written.

    Returns the command line as a string.
    Raises NameError for an unknown batch system.
    """
    name = os.path.basename(filename)
    if batch_system == 'LSF':
        bsub_cmd = "bsub -q normal -R'select[mem>%i] rusage[mem=%i]' -M%i -J'%s[1-%s]'" % (1024*mem, 1024*mem, 1024*mem, name, count)
        if dependency is not None:
            bsub_cmd += " -w '%s[*]'" % dependency
        # os.path.join (as in the 'local' branch) keeps an absolute filename,
        # such as the one submit() gets from mkstemp, from being glued behind
        # working_directory; the logs then land where clean_temp_file() looks.
        log_prefix = os.path.join(working_directory, filename)
        output = "-o %s_%%I.out -e %s_%%I.err" % (log_prefix, log_prefix)
        jobCmd = " ".join([bsub_cmd, output, 'LSFwrapper.sh', "' multiJob.py ", filename, batch_system, "'"])
    elif batch_system == 'SGE':
        bsub_cmd = "qsub -terse -cwd -V -b y -t 1-%s -N %s" % (count, name)
        if dependency is not None:
            bsub_cmd += " -hold_jid %s" % dependency
        output = "-o %s -e %s" % (working_directory, working_directory)
        jobCmd = " ".join([bsub_cmd, output, "' multiJob.py ", filename, batch_system, "'"])
    elif batch_system == 'local':
        # POSIX redirection: '>&' is a csh/bash extension that plain sh
        # implementations reject, and Popen(shell=True) runs /bin/sh.
        jobCmd = "sh " + filename + " > " + os.path.join(working_directory, filename + ".oe") + " 2>&1"
    else:
        raise NameError("Unknown batch system: %s" % batch_system)

    return jobCmd

def submit(cmds, batch_system="LSF", dependency=None, mem=4, working_directory='.'):
    """Write `cmds` to a temporary file and launch them through `batch_system`.

    Parameters are forwarded to makeCommand(); the command file is created
    with tempfile.mkstemp() inside `working_directory`.

    Returns (job_id, filename).  For LSF the id is parsed from the
    'Job <id>' line of the submission output; otherwise it is the start of
    the submission output up to the first newline or dot.

    Raises RuntimeError when `cmds` is empty or no LSF job id can be found,
    and AssertionError when the submission command exits non-zero.
    """
    if len(cmds) == 0:
        sys.stderr.write("No commands in list")
        raise RuntimeError("No commands in list")

    descr, filename = tempfile.mkstemp(dir=working_directory)

    # Wrap the descriptor returned by mkstemp instead of re-opening the path,
    # so the descriptor is closed rather than leaked.
    fh = os.fdopen(descr, 'w')
    try:
        fh.write("\n".join(cmds))
    finally:
        fh.close()

    multi_job_cmd = makeCommand(filename, len(cmds), batch_system, dependency, mem, working_directory)
    p = subprocess.Popen(multi_job_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    # communicate() drains both pipes while waiting; calling wait() first can
    # deadlock once the child fills a pipe buffer.
    out, err = p.communicate()
    ret = p.returncode
    if ret != 0:
        print("Could not start job:")
        print(multi_job_cmd)
        print("OUT: " + out)
        print("ERR: " + err)
        print(os.environ)
        if 'SGE_ROOT' in os.environ:
            print("SGE_ROOT: " + os.environ['SGE_ROOT'])
        else:
            print("SGE_ROOT: UNDEF")
        # USERNAME is not guaranteed to be set; do not mask the real failure
        # with a KeyError.
        print('USER: ' + os.environ.get('USERNAME', 'UNDEF'))
        # Explicit raise: a bare `assert False` disappears under `python -O`.
        raise AssertionError("Could not start job: %s" % multi_job_cmd)

    if batch_system == 'LSF':
        for line in out.split('\n'):
            match = re.match(r'Job <([0-9]*)>', line)
            if match is not None:
                return match.group(1), filename
        sys.stderr.write("Could not find job id in lsf output: %s" % out)
        raise RuntimeError("Could not find job id in lsf output: %s" % out)
    else:
        return re.split(r'[\n\.]', out)[0], filename


################################################
## Worker
################################################

def main():
    """Worker entry point: run one line of a command file.

    argv[1] is the command file and argv[2] the batch system ('LSF' or
    'SGE').  The 1-based task index is read from LSB_JOBINDEX or SGE_TASK_ID,
    the corresponding line is executed through the shell, its captured
    stdout/stderr are replayed, and the process exits with the command's
    return code.
    """
    batch_system = sys.argv[2]

    if batch_system == 'LSF':
        index = os.environ['LSB_JOBINDEX']
    elif batch_system == 'SGE':
        index = os.environ['SGE_TASK_ID']
    else:
        raise NameError("Unknown batch system: %s" % batch_system)

    # Read up to the index-th line; the earlier lines belong to other tasks.
    line = ''
    with open(sys.argv[1]) as file:
        for i in range(int(index)):
            line = file.readline()

    if not line:
        # Index 0 or past the end of the file: nothing to run.
        sys.stderr.write("No command at index %s in %s\n" % (index, sys.argv[1]))
        sys.exit(1)

    print(line)

    # Text-mode temporary files so the replay below works on Python 2 and 3.
    output = tempfile.TemporaryFile(mode='w+')
    error = tempfile.TemporaryFile(mode='w+')
    try:
        err = subprocess.call(line.strip(), shell=True, stdout=output, stderr=error)

        output.seek(0)
        for line in output:
            sys.stdout.write(line)
        error.seek(0)
        for line in error:
            sys.stderr.write(line)
    finally:
        output.close()
        error.close()

    sys.exit(err)

if __name__ == "__main__":
    main()
################################################
## Splitting a wiggletools command into regional jobs
################################################

def create_dirs(command):
    """Create the '<target>x' directories that the regional jobs write into.

    A directory is created for every target of a write / write_bg / profile /
    profiles keyword, and for the target of a leading AUC / mean / variance /
    pearson keyword.
    """
    targets = [match.group(1) for match in re.finditer(r'(?:write|write_bg|profile|profiles)\s+(\S+)\s', command)]
    # Anchored at the start, like the rewrite in create_new_command(): further
    # into a command the word 'mean' is followed by an input, and no regional
    # output is ever written next to it.
    statistic = re.match(r'(?:AUC|mean|variance|pearson)\s+(\S+)\s', command)
    if statistic is not None:
        targets.append(statistic.group(1))

    for target in targets:
        path = target + "x"
        if not os.path.exists(path):
            os.makedirs(path)

def create_new_command(command, chr, start, finish, chrom_sizes_file):
    """Rewrite `command` into a wiggletoolsIndex.py call restricted to one region.

    Output targets are redirected into '<target>x/<chr>_<start>_<finish>'
    and a 'seek <chr> <start> <finish>' clause is inserted in front of the
    iterator part of the command.

    Returns the new command line as a string.
    """
    region = '%s_%i_%i' % (chr, start, finish)
    command = re.sub(r'write\s+(\S+\.wig)\s', r'write \1x/' + region + '.wig ', command)
    # Careful: the following line has to be AFTER the one above, else they overwrite each other
    command = re.sub(r'write\s+(\S+\.bw)\s', r'write \1x/' + region + '.wig ', command)
    command = re.sub(r'write_bg\s+(\S+)\s', r'write \1x/' + region + '.wig ', command)
    command = re.sub(r'(profile|profiles)\s+(\S+)\s', r'\1 \2x/' + region + ' ', command)
    command = re.sub(r'^(AUC|mean|variance|pearson)\s+(\S+)\s', r'\1 \2x/' + region + ' ', command)

    m = re.match(r'(profile|profiles)\s+(\S+)\s(\S+)\s+(.*)', command)
    m2 = re.match(r'(AUC|mean|variance|pearson)\s+(\S+)\s+(.*)', command)
    if m is not None:
        plot = m.group(1)
        output = m.group(2)
        width = m.group(3)
        iterator = m.group(4)
        return " ".join(map(str, ['wiggletoolsIndex.py', chrom_sizes_file, plot, output, width, 'seek', chr, start, finish, iterator]))
    elif m2 is not None:
        # Read the groups of m2: in this branch m is None.
        plot = m2.group(1)
        output = m2.group(2)
        iterators = m2.group(3)
        return " ".join(map(str, ['wiggletoolsIndex.py', chrom_sizes_file, plot, output, 'seek', chr, start, finish, iterators]))
    else:
        # Command does not start with extraction function:
        return " ".join(map(str, ['wiggletoolsIndex.py', chrom_sizes_file, 'do', 'seek', chr, start, finish, command]))


def makeMapCommand(command, chrom_sizes_file, chrom_sizes, region_size):
    """Return one regional command per window of `region_size` on every chromosome.

    `chrom_sizes` maps chromosome names to lengths.  Chromosomes are visited
    in sorted order, windows start at 1 and the last one is clipped to the
    chromosome length.  The output directories are created as a side effect.
    """
    create_dirs(command)
    step = int(region_size)
    commands = []
    for chr in sorted(chrom_sizes.keys()):
        length = chrom_sizes[chr]
        for start in range(1, length, step):
            commands.append(create_new_command(command, chr, start, min(length, start + step), chrom_sizes_file))
    return commands

def test_makeMapCommand():
    """Print the regional commands generated for a small two-chromosome example."""
    chrom_sizes = dict([("chr1", 20), ("chr2", 30)])
    chrom_sizes_file = 'chrom_sizes.txt'
    cmd = 'add toto.bw tata.bam'
    print(makeMapCommand(cmd, chrom_sizes_file, chrom_sizes, region_size=int(3e7)))

################################################
## Merging the results of parallel wiggletools runs
################################################

def makeReduceCommands(command):
    """Return the merge commands that reassemble the regional outputs of `command`.

    One command is produced per bigWig, profile, profiles, wiggle and
    bedGraph target found in `command`, in that order.
    """
    mergeBigWigCommands = ['mergeBigWigDirectory.py %s' % match.group(1) for match in re.finditer(r'write\s+(\S+\.bw)\s', command)]
    mergeProfileCommand = ['mergeProfileDirectory.py %s' % match.group(1) for match in re.finditer(r'profile\s+(\S+)\s', command)]
    mergeProfilesCommand = ['mergeProfilesDirectory.py %s' % match.group(1) for match in re.finditer(r'profiles\s+(\S+)\s', command)]
    mergeWigglesCommand = ['mergeBedLikeDirectory.sh %s' % match.group(1) for match in re.finditer(r'write\s+(\S+\.wig)\s', command)]
    mergeBedGraphsCommand = ['mergeBedLikeDirectory.sh %s' % match.group(1) for match in re.finditer(r'write_bg\s+(\S+\.bg)\s', command)]
    return mergeBigWigCommands + mergeProfileCommand + mergeProfilesCommand + mergeWigglesCommand + mergeBedGraphsCommand


################################################
## Main function
################################################

def readChromSizes(file):
    """Read a whitespace-delimited chromosome size file into a {name: length} dict."""
    chrom_sizes = dict()
    with open(file) as fh:
        for line in fh:
            items = line.strip().split()
            if len(items) == 0:
                # Blank lines (e.g. a trailing newline) carry no entry.
                continue
            chrom_sizes[items[0]] = int(items[1])
    return chrom_sizes

def run(cmds, chrom_file, batch_system='local', tmp='.'):
    """Run wiggletools commands region by region, then merge the results.

    Parameters:
        cmds: list of wiggletools command strings.
        chrom_file: chromosome size file, see readChromSizes().
        batch_system: forwarded to multiJob.submit().
        tmp: directory where the job files are written.

    Returns (job_id, filenames): the id of the last submitted job and the
    list of job files created.  Exits with status 1 when a command contains
    histogram or apply_paste.
    """
    for cmd in cmds:
        if re.search('(apply_paste|histogram)', cmd) is not None:
            print("Cannot parallelize the computation of histograms or apply_paste operations")
            sys.exit(1)
    chrom_sizes = readChromSizes(chrom_file)
    mapCommands = []
    for cmd in cmds:
        mapCommands.extend(makeMapCommand(cmd, chrom_file, chrom_sizes, region_size=3e7))
    jobID1, filename1 = multiJob.submit(mapCommands, batch_system=batch_system, working_directory=tmp)
    reduceCommands = []
    for cmd in cmds:
        reduceCommands.extend(makeReduceCommands(cmd))
    if len(reduceCommands) == 0:
        # Nothing to merge: multiJob.submit() refuses an empty command list,
        # which would abort after the map job was already submitted.
        return jobID1, [filename1]
    jobID2, filename2 = multiJob.submit(reduceCommands, dependency = jobID1, batch_system=batch_system, mem=8, working_directory=tmp)
    return jobID2, [filename1, filename2]

def main():
    """Command line entry point: argv[1] is the chromosome size file, the rest are commands."""
    # '>= 3': the usage text promises one or more commands.
    if len(sys.argv) >= 3:
        chrom_file = sys.argv[1]
        cmds = sys.argv[2:]
        run(cmds, chrom_file)
    else:
        print("""
parallelWiggletools.py: wrapper script to run wiggletools in parallel on LSF

Usage: parallelWiggletools.py chrom_sizes.txt 'command1' ['command2' [...]]

Where:
	* chrom_sizes.txt is a tab-delimited text file with the chromosome names and lengths
	* command* is a valid wiggletools command, without histogram or apply_paste keywords.
""")

if __name__ == "__main__":
    main()
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import argparse
import subprocess
import numpy
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as pyplot
import matplotlib.cm as cm

##############################################
## Multihistogram
##############################################

def make_histogram(infile, labels, out, format='pdf', normalised=True):
    """Plot one histogram per label from a tab-delimited table.

    Each line of `infile` holds a bin position followed by one count per
    entry of `labels`.  With `normalised` set, each series is divided by its
    own sum.  The figure is saved to `out` in the given `format`.
    """
    counts = dict((X,[]) for X in labels)
    bin_positions = []

    with open(infile) as file:
        for line in file:
            items = line.strip().split('\t')
            bin_positions.append(float(items[0]))
            for i in range(len(labels)):
                counts[labels[i]].append(float(items[i+1]))

    # Fails when `labels` contains duplicates, which would share one series.
    assert len(labels) == len(counts.keys())

    values = numpy.array([bin_positions for X in labels]).T
    if normalised:
        weights = numpy.array([numpy.array(counts[X]) / sum(counts[X]) for X in labels]).T
    else:
        weights = numpy.array([counts[X] for X in labels]).T
    pyplot.hist(values, bins=len(bin_positions), weights=weights, label=labels)
    pyplot.legend()
    pyplot.savefig(out, format=format)

##############################################
## Profile curve
##############################################

def make_profile_curve(infile, out, format='pdf'):
    """Plot the second column of a tab-delimited file against the first as a line."""
    X = []
    Y = []
    with open(infile) as file:
        for line in file:
            items = line.strip().split('\t')
            X.append(float(items[0]))
            Y.append(float(items[1]))

    pyplot.plot(X, Y, '-')
    pyplot.savefig(out, format=format)

##############################################
## Profile matrix
##############################################

def make_profiles_matrix(infile, out, format='pdf'):
    """Draw a greyscale image of the per-row profiles stored in `infile`.

    Columns from the fourth onwards are read as numbers, transformed with
    log(x + 1), divided by the global maximum, and the rows are ordered by
    increasing mean before plotting.
    """
    with open(infile) as file:
        # Lists rather than map(): on Python 3 map() returns an iterator that
        # numpy cannot turn into a numeric matrix.
        rows = [[float(X) for X in line.strip().split("\t")[3:]] for line in file]
    M = numpy.array(rows)
    M = numpy.log(M+1)
    M = M / numpy.max(M)
    means = numpy.mean(M, axis=1)
    # sorted() instead of range(...).sort(): range objects have no sort() on
    # Python 3.
    sorted_rows = sorted(range(len(means)), key=lambda X: means[X])
    M = M[sorted_rows,:]
    pyplot.imshow(M,aspect='auto',cmap=cm.Greys)
    pyplot.savefig(out, format=format)

##############################################
## Overlap histogram
##############################################

def make_overlaps(infile, out, format='pdf'):
    """Draw a bar chart of the fourth column of a four-column tab-delimited file.

    The fourth column is used both as the bar label and, converted to float,
    as the amount added to that bar.  Blank lines are skipped; any other line
    that does not have exactly four columns abandons the plot.
    """
    values = dict()
    with open(infile) as file:
        for line in file:
            if not line.strip():
                # The original tested len(items) != 0 for this, which can
                # never be false because split() of '' returns [''].
                continue
            items = line.strip().split('\t')
            if len(items) != 4:
                sys.stderr.write("Expected 4 columns in %s, no plot written\n" % infile)
                return
            if items[3] not in values:
                values[items[3]] = 0
            values[items[3]] += float(items[-1])

    # Bars are drawn in sorted label order; the original computed this list
    # but then iterated the dictionary in arbitrary order.
    labels = sorted(values)
    ind = numpy.arange(len(labels))    # the x locations for the groups
    width = 0.35       # the width of the bars: can also be len(x) sequence
    pyplot.bar(ind, [values[X] for X in labels], width)
    pyplot.xticks(ind+width/2., labels)
    pyplot.savefig(out, format=format)

##############################################
## Convenience wrapper
##############################################

def make_plot(plot, infile, out, labels=None, format='pdf'):
    """Dispatch to the plotting function named by `plot`.

    `plot` is one of 'hist', 'profile', 'profiles' or 'overlaps'; any other
    value does nothing.  `labels` is only used by 'hist'.
    """
    if plot == 'hist':
        make_histogram(infile, labels, out, format)
    elif plot == 'profile':
        make_profile_curve(infile, out, format)
    elif plot == 'profiles':
        make_profiles_matrix(infile, out, format)
    elif plot == 'overlaps':
        make_overlaps(infile, out, format)

##############################################
## Command line tool
##############################################

def get_options():
    """Parse the command line and return the argparse namespace."""
    parser = argparse.ArgumentParser(description='Wiggletools wrapper for plot generation.')
    parser.add_argument('--in', '-i', dest='infile', help='WiggleTools output',required=True)
    parser.add_argument('--labels',dest='labels',help='Labels to the different histograms', nargs='*')
    parser.add_argument('--out','-o', dest='out',help='Outfile', required=True)
    parser.add_argument('--plot','-p', dest='plot',help='Type of plot',choices=['hist','profile','profiles','overlaps'],required=True)
    # NOTE(review): '...' is accepted as a literal choice here; it looks like a
    # placeholder for further formats - confirm before removing it.
    parser.add_argument('--format','-f', dest='format',help='File format',choices=['pdf','png','...'],default='png')
    options = parser.parse_args()
    return options

def main():
    """Command line entry point: parse the options and draw the requested plot."""
    options = get_options()
    make_plot(options.plot, options.infile, options.out, options.labels, format=options.format)

if __name__ == '__main__':
    main()
16 | 17 | import sys 18 | import subprocess 19 | import os 20 | import re 21 | import os.path 22 | 23 | chrom_lengths_file = sys.argv[1] 24 | wiggletools_args = sys.argv[2:] 25 | 26 | def run(args): 27 | p = subprocess.Popen(" ".join(args), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 28 | err = p.wait() 29 | stdout, stderr = p.communicate() 30 | sys.stdout.write(stdout) 31 | sys.stderr.write(stderr) 32 | if err != 0: 33 | sys.exit(100) 34 | 35 | def main(): 36 | cmd = ['wiggletools'] + wiggletools_args 37 | run(cmd) 38 | 39 | for i in range(1, len(wiggletools_args)): 40 | if wiggletools_args[i-1] == 'write' and re.search(r'\S*.wig', wiggletools_args[i]) is not None: 41 | wiggle_file = wiggletools_args[i] 42 | if os.path.getsize(wiggle_file) > 0: 43 | bigwig_file = re.sub('.wig$','.bw', wiggle_file) 44 | run(['wigToBigWig -keepAllChromosomes -fixedSummaries',wiggle_file,chrom_lengths_file,bigwig_file]) 45 | os.remove(wiggle_file) 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS=-g -Wall -O3 -std=gnu99 2 | LIBDIR=../lib 3 | BINDIR=../bin 4 | LIBS= -lwiggletools -lBigWig -lcurl -lhts -lgsl -lgslcblas -lz -lpthread -lm -llzma -lbz2 5 | OPTS=-D_PBGZF_USE 6 | 7 | default: lib bin 8 | 9 | bin: ${BINDIR}/wiggletools 10 | 11 | ${BINDIR}/wiggletools: ${LIBDIR}/libwiggletools.a wiggletools.o 12 | mkdir -p ${BINDIR} 13 | ${CC} ${CFLAGS} -L${LIBDIR} ${LDFLAGS} wiggletools.c ${LIBS} -o ${BINDIR}/wiggletools 14 | 15 | lib: ${LIBDIR}/libwiggletools.a 16 | 17 | ${LIBDIR}/libwiggletools.a: wiggleIterator.o wigReader.o bigWiggleReader.o multiplexer.o reducers.o bedReader.o bigBedReader.o bamReader.o apply.o commandParser.o wigWriter.o statistics.o unaryOps.o multiSet.o setComparisons.o bufferedReader.o vcfReader.o bcfReader.o plots.o mWigWriter.o recycleBin.o fib.o samReader.o 
hash.o hashfib.o 18 | mkdir -p ${LIBDIR} 19 | ar rcs ${LIBDIR}/libwiggletools.a *.o 20 | 21 | %.o: %.c; ${CC} ${CFLAGS} ${INC} ${CPPFLAGS} ${OPTS} -c $< -o $@ 22 | 23 | clean: 24 | rm -Rf *.o *.a wiggletools 25 | -------------------------------------------------------------------------------- /src/apply.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // Local header 16 | #include 17 | #include 18 | #include 19 | 20 | #include "multiplexer.h" 21 | 22 | const int MAX_BUFFER = 1e6; 23 | const int MAX_BUFFER_SUM = 1e6; 24 | const int MAX_SEEK = 10; 25 | 26 | ////////////////////////////////////////////////////// 27 | // Buffered wiggleIterator 28 | ////////////////////////////////////////////////////// 29 | 30 | typedef struct bufferedWiggleIteratorData_st { 31 | char * chrom; 32 | int start; 33 | int finish; 34 | int index; 35 | int length; 36 | double * values; 37 | bool * set; 38 | struct bufferedWiggleIteratorData_st * next; 39 | double default_value; 40 | } BufferedWiggleIteratorData; 41 | 42 | static BufferedWiggleIteratorData * createBufferedWiggleIteratorData(char * chrom, int start, int finish, float default_value) { 43 | BufferedWiggleIteratorData * bufferedData = (BufferedWiggleIteratorData *) calloc(1, sizeof(BufferedWiggleIteratorData)); 44 | if (!bufferedData) { 45 | fprintf(stderr, "Could not calloc %li bytes\n", sizeof(BufferedWiggleIteratorData)); 46 | abort(); 47 | } 48 | bufferedData->chrom = chrom; 49 | bufferedData->start = start; 50 | bufferedData->finish = finish; 51 | bufferedData->index = 0; 52 | bufferedData->length = finish - start; 53 | bufferedData->values = (double *) calloc(bufferedData->length, sizeof(double)); 54 | bufferedData->set = (bool *) calloc(bufferedData->length, sizeof(bool)); 55 | bufferedData->default_value = default_value; 56 | return bufferedData; 57 | } 58 | 59 | void destroyBufferedWiggleIteratorData(BufferedWiggleIteratorData * data) { 60 | if (data->values) { 61 | free(data->values); 62 | free(data->set); 63 | } 64 | free(data); 65 | } 66 | 67 | void LooseBufferedWiggleIteratorPop(WiggleIterator * apply) { 68 | BufferedWiggleIteratorData * data = (BufferedWiggleIteratorData *) apply->data; 69 | if (apply->done) 70 | ; 71 | else if (data->index == data->length) 72 | apply->done = true; 73 | else { 74 | apply->start = data->index; 75 | if 
(data->set[data->index]) { 76 | apply->value = data->values[data->index]; 77 | data->index++; 78 | } else { 79 | apply->value = apply->default_value; 80 | while (data->index < data->length && !data->set[data->index]) 81 | data->index++; 82 | } 83 | apply->finish = data->index; 84 | } 85 | } 86 | 87 | void StrictBufferedWiggleIteratorPop(WiggleIterator * apply) { 88 | BufferedWiggleIteratorData * data = (BufferedWiggleIteratorData *) apply->data; 89 | while (data->index < data->length) { 90 | if (data->set[data->index]) { 91 | apply->start = data->index; 92 | apply->finish = apply->start + 1; 93 | apply->value = data->values[data->index]; 94 | data->index++; 95 | return; 96 | } else 97 | data->index++; 98 | } 99 | apply->done = true; 100 | } 101 | 102 | void BufferedWiggleIteratorSeek(WiggleIterator * apply, const char * chrom, int start, int finish) { 103 | fprintf(stderr, "Cannot seek on buffered iterator!"); 104 | exit(1); 105 | } 106 | 107 | WiggleIterator * BufferedWiggleIterator(BufferedWiggleIteratorData * data, bool strict) { 108 | WiggleIterator * apply; 109 | if (strict) 110 | apply = newWiggleIterator(data, &StrictBufferedWiggleIteratorPop, &BufferedWiggleIteratorSeek, data->default_value, false); 111 | else 112 | apply = newWiggleIterator(data, &LooseBufferedWiggleIteratorPop, &BufferedWiggleIteratorSeek, data->default_value, false); 113 | apply->chrom = data->chrom; 114 | return apply; 115 | } 116 | 117 | ////////////////////////////////////////////////////// 118 | // Fill in wiggleIterator 119 | ////////////////////////////////////////////////////// 120 | 121 | typedef struct fillInUnaryData_st { 122 | char * chrom; 123 | int start; 124 | int finish; 125 | bool first; 126 | WiggleIterator * source; 127 | } FillInUnaryData; 128 | 129 | void FillInUnaryPop(WiggleIterator * wi) { 130 | FillInUnaryData * data = (FillInUnaryData*) wi->data; 131 | WiggleIterator * source = data->source; 132 | 133 | if (data->first) { 134 | wi->chrom = data->chrom; 135 | 
wi->finish = data->start; 136 | data->first = false; 137 | } 138 | 139 | if (source->done) { 140 | if (wi->finish == data->finish) 141 | wi->done = true; 142 | else { 143 | wi->start = wi->finish; 144 | wi->finish = data->finish; 145 | wi->value = wi->default_value; 146 | } 147 | } else if (source->start > wi->finish) { 148 | wi->start = wi->finish; 149 | wi->finish = source->start; 150 | wi->value = wi->default_value; 151 | } else { 152 | wi->start = source->start; 153 | wi->finish = source->finish; 154 | wi->value = source->value; 155 | pop(data->source); 156 | } 157 | } 158 | 159 | WiggleIterator * FillInUnaryWiggleIterator(WiggleIterator * source, char * chrom, int start, int finish) { 160 | FillInUnaryData * data = (FillInUnaryData*) calloc(1, sizeof(FillInUnaryData)); 161 | data->chrom = chrom; 162 | data->start = start; 163 | data->finish = finish; 164 | data->source = source; 165 | data->first = true; 166 | seek(source, chrom, start, finish); 167 | return newWiggleIterator(data, &FillInUnaryPop, NULL, source->default_value, false); 168 | } 169 | 170 | ////////////////////////////////////////////////////// 171 | // Apply operator 172 | ////////////////////////////////////////////////////// 173 | 174 | typedef struct applyWiggleIteratorData_st { 175 | WiggleIterator * regions; 176 | WiggleIterator * (**statistics)(WiggleIterator *); 177 | int profile_width; 178 | bool strict; 179 | WiggleIterator * input; 180 | BufferedWiggleIteratorData * head; 181 | BufferedWiggleIteratorData * tail; 182 | } ApplyMultiplexerData; 183 | 184 | static BufferedWiggleIteratorData * createTarget(ApplyMultiplexerData * data) { 185 | return createBufferedWiggleIteratorData(data->regions->chrom, data->regions->start, data->regions->finish, data->input->default_value); 186 | } 187 | 188 | static void addTarget(ApplyMultiplexerData * data, BufferedWiggleIteratorData * bufferedData) { 189 | if (!data->head) 190 | data->head = bufferedData; 191 | else 192 | data->tail->next = 
bufferedData; 193 | data->tail = bufferedData; 194 | } 195 | 196 | static void createTargets(ApplyMultiplexerData * data) { 197 | if (data->regions->finish - data->regions->start >= MAX_BUFFER) { 198 | addTarget(data, createTarget(data)); 199 | pop(data->regions); 200 | } else { 201 | int length; 202 | int last_finish = data->regions->finish; 203 | int total_buffers = 0; 204 | while(!data->regions->done 205 | && (length = data->regions->finish - data->regions->start) < MAX_BUFFER 206 | && (!data->head 207 | || ((total_buffers += length) < MAX_BUFFER_SUM && data->regions->start <= last_finish + MAX_SEEK && !strcmp(data->regions->chrom, data->tail->chrom)) 208 | ) 209 | ) 210 | { 211 | if (data->regions->finish > last_finish) 212 | last_finish = data->regions->finish; 213 | addTarget(data, createTarget(data)); 214 | pop(data->regions); 215 | } 216 | } 217 | } 218 | 219 | static void pushDataOnBuffer(ApplyMultiplexerData * data, BufferedWiggleIteratorData * bufferedData) { 220 | int pos, start, finish; 221 | 222 | if (bufferedData->start > data->input->start) 223 | start = 0; 224 | else 225 | start = data->input->start - bufferedData->start; 226 | 227 | if (bufferedData->finish < data->input->finish) 228 | finish = bufferedData->length; 229 | else 230 | finish = data->input->finish - bufferedData->start; 231 | 232 | for (pos = start; pos < finish; pos++) { 233 | bufferedData->values[pos] = data->input->value; 234 | bufferedData->set[pos] = true; 235 | } 236 | 237 | } 238 | 239 | static void pushData(ApplyMultiplexerData * data) { 240 | BufferedWiggleIteratorData * bufferedData; 241 | 242 | for (bufferedData = data->head; bufferedData; bufferedData = bufferedData->next) { 243 | if (bufferedData->start >= data->input->finish) 244 | break; 245 | else 246 | pushDataOnBuffer(data, bufferedData); 247 | } 248 | } 249 | 250 | BufferedWiggleIteratorData * popApplyMultiplexerData(ApplyMultiplexerData * data) { 251 | BufferedWiggleIteratorData * bufferedData = data->head; 252 | 
if (data->tail == data->head) 253 | data->tail = data->head = NULL; 254 | else 255 | data->head = data->head->next; 256 | return bufferedData; 257 | } 258 | 259 | void computeApplyValues(Multiplexer * apply, ApplyMultiplexerData * data, BufferedWiggleIteratorData * bufferedData) { 260 | WiggleIterator * wi; 261 | if (bufferedData->values) 262 | wi = BufferedWiggleIterator(bufferedData, data->strict); 263 | else if (data->strict) { 264 | wi = data->input; 265 | seek(wi, bufferedData->chrom, bufferedData->start, bufferedData->finish); 266 | } else 267 | wi = FillInUnaryWiggleIterator(data->input, bufferedData->chrom, bufferedData->start, bufferedData->finish); 268 | 269 | if (data->statistics) { 270 | int i; 271 | for (i = apply->count-1; i >= 0; i--) 272 | wi = (data->statistics[i])(wi); 273 | runWiggleIterator(wi); 274 | i=0; 275 | while (wi->append) { 276 | apply->values[i] = *((double*) (wi->data)); 277 | WiggleIterator * tmp = wi; 278 | wi = wi->append; 279 | destroyWiggleIterator(tmp); 280 | i++; 281 | } 282 | } else 283 | regionProfile(wi, apply->values, apply->count, apply->finish - apply->start, false); 284 | 285 | if (wi != data->input) { 286 | // Careful not to destroy buffered data. It requires special function and is destroyed elsewhere. 
287 | if (wi->data != bufferedData) 288 | free(wi->data); 289 | free(wi); 290 | } 291 | } 292 | 293 | void updateApplyMultiplexer(Multiplexer * apply, ApplyMultiplexerData * data, BufferedWiggleIteratorData * bufferedData) { 294 | apply->chrom = bufferedData->chrom; 295 | apply->start = bufferedData->start; 296 | apply->finish = bufferedData->finish; 297 | computeApplyValues(apply, data, bufferedData); 298 | } 299 | 300 | void ApplyMultiplexerPop(Multiplexer * apply) { 301 | ApplyMultiplexerData * data = (ApplyMultiplexerData *) apply->data; 302 | 303 | // If no ongoing jobs, create some 304 | if (data->head == NULL) { 305 | // Note: only exit if no more regions AND no targets waiting 306 | if (data->regions->done) { 307 | apply->done = true; 308 | return; 309 | } 310 | createTargets(data); 311 | seek(data->input, data->head->chrom, data->head->start, data->tail->finish); 312 | } 313 | 314 | // If ongoing targets are reading: 315 | // Push enough data to finish the first job 316 | if (data->head->values) { 317 | while (!data->input->done && data->input->start < data->head->finish && !strcmp(data->input->chrom, data->head->chrom)) { 318 | pushData(data); 319 | pop(data->input); 320 | } 321 | } 322 | 323 | // Return value 324 | BufferedWiggleIteratorData * bufferedData = popApplyMultiplexerData(data); 325 | updateApplyMultiplexer(apply, data, bufferedData); 326 | destroyBufferedWiggleIteratorData(bufferedData); 327 | } 328 | 329 | void ApplyMultiplexerSeek(Multiplexer * apply, const char * chrom, int start, int finish) { 330 | ApplyMultiplexerData * data = (ApplyMultiplexerData *) apply->data; 331 | BufferedWiggleIteratorData * bufferedData; 332 | while (data->head) { 333 | bufferedData = data->head; 334 | data->head = data->head->next; 335 | destroyBufferedWiggleIteratorData(bufferedData); 336 | } 337 | data->tail = NULL; 338 | seek(data->regions, chrom, start, finish); 339 | } 340 | 341 | Multiplexer * ApplyMultiplexer(WiggleIterator * regions, WiggleIterator * 
(**statistics)(WiggleIterator *), int count, WiggleIterator * dataset, bool strict) { 342 | ApplyMultiplexerData * data = (ApplyMultiplexerData *) calloc(1, sizeof(ApplyMultiplexerData)); 343 | data->regions = regions; 344 | data->statistics = statistics; 345 | data->input = dataset; 346 | data->strict = strict; 347 | Multiplexer * res = newCoreMultiplexer(data, count, &ApplyMultiplexerPop, &ApplyMultiplexerSeek); 348 | int i; 349 | for (i=0; i < count; i++) 350 | res->default_values[i] = NAN; 351 | popMultiplexer(res); 352 | return res; 353 | } 354 | 355 | Multiplexer * ProfileMultiplexer(WiggleIterator * regions, int width, WiggleIterator * dataset) { 356 | ApplyMultiplexerData * data = (ApplyMultiplexerData *) calloc(1, sizeof(ApplyMultiplexerData)); 357 | data->regions = regions; 358 | data->input = dataset; 359 | data->strict = false; 360 | Multiplexer * res = newCoreMultiplexer(data, width, &ApplyMultiplexerPop, &ApplyMultiplexerSeek); 361 | int i; 362 | for (i=0; i < width; i++) 363 | res->default_values[i] = NAN; 364 | popMultiplexer(res); 365 | return res; 366 | } 367 | -------------------------------------------------------------------------------- /src/bamReader.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include "htslib/sam.h" 17 | #include "htslib/hts.h" 18 | #include "wiggleIterator.h" 19 | #include "bufferedReader.h" 20 | #include "hashfib.h" 21 | 22 | typedef struct bamFileReaderData_st { 23 | // Arguments to downloader 24 | char * filename; 25 | char * chrom; 26 | int chrom_tid; 27 | int start, stop; 28 | bool read_count; 29 | BufferedReaderData * bufferedReaderData; 30 | 31 | // BAM stuff 32 | samFile * fp; 33 | hts_idx_t * idx; 34 | bam_hdr_t * header; 35 | } BamReaderData; 36 | 37 | static void storeReadComponents(HashFib * starts, HashFib * ends, bam1_t * aln) { 38 | // Sometimes a read has coordinates, but is not mapped (cf BWA) 39 | // We should skip these exceptions 40 | if (aln->core.flag & 0x4) 41 | return; 42 | // Note that BAM coords are 0-based, hence +1 43 | int start = aln->core.pos + 1; 44 | uint32_t *cigar = bam_get_cigar(aln); 45 | int k; 46 | for (k = 0; k < aln->core.n_cigar; ++k) { 47 | int operator = cigar[k]&BAM_CIGAR_MASK; 48 | int length = cigar[k]>>BAM_CIGAR_SHIFT; 49 | switch (operator) { 50 | case BAM_CMATCH: 51 | case BAM_CEQUAL: 52 | case BAM_CDIFF: 53 | case BAM_CDEL: 54 | hashfib_insert(starts, start); 55 | hashfib_insert(ends, start + length); 56 | case BAM_CREF_SKIP: 57 | start += length; 58 | } 59 | } 60 | } 61 | 62 | static bam1_t * nextRead(BamReaderData * data, hts_itr_t * iter, bam1_t * aln) { 63 | if (sam_itr_next(data->fp, iter, aln) < 0) { 64 | bam_destroy1(aln); 65 | return NULL; 66 | } else { 67 | return aln; 68 | } 69 | } 70 | 71 | static bool consumeIteratorWithCigars(BamReaderData * data, hts_itr_t * iter, char * query_chrom, int query_chrom_tid, int query_stop) { 72 | // Iterate through data 73 | bam1_t *aln = bam_init1(); 74 | HashFib * starts = hashfib_construct(); 75 | HashFib * ends = hashfib_construct(); 76 | int start, finish = -1, chrom_tid = -1; 77 | int value = 0; 78 | 79 | aln = nextRead(data, iter, aln); 80 | 81 | while(1) { 82 | // Remove dead weight 83 | if 
(!hashfib_empty(ends) && hashfib_min(ends) == finish) 84 | value -= hashfib_remove_min(ends); 85 | 86 | // Stream more data into heaps 87 | while (aln 88 | && (aln->core.tid == chrom_tid || hashfib_empty(ends)) 89 | && (hashfib_empty(ends) || aln->core.pos <= hashfib_min(ends) || hashfib_empty(starts) || aln->core.pos <= hashfib_min(starts)) 90 | ) { 91 | storeReadComponents(starts, ends, aln); 92 | chrom_tid = aln->core.tid; 93 | aln = nextRead(data, iter, aln); 94 | } 95 | 96 | if (!hashfib_empty(ends)) { 97 | // Choose new start 98 | if (value) 99 | start = finish; 100 | else 101 | start = hashfib_min(starts); 102 | 103 | // If gone overboard 104 | if ((start >= query_stop && chrom_tid == query_chrom_tid) || chrom_tid > query_chrom_tid) { 105 | hashfib_destroy(starts); 106 | hashfib_destroy(ends); 107 | return false; 108 | } 109 | 110 | // Load new weight 111 | if (!hashfib_empty(starts) && hashfib_min(starts) == start) 112 | value += hashfib_remove_min(starts); 113 | 114 | // Choose finish 115 | if (hashfib_empty(starts) || hashfib_min(ends) < hashfib_min(starts)) 116 | finish = hashfib_min(ends); 117 | else 118 | finish = hashfib_min(starts); 119 | 120 | // If gone overboard 121 | if (finish > query_stop) 122 | finish = query_stop; 123 | 124 | // Push out 125 | if (pushValuesToBuffer(data->bufferedReaderData, data->header->target_name[chrom_tid], start, finish, value)) { 126 | hashfib_destroy(starts); 127 | hashfib_destroy(ends); 128 | return true; 129 | } 130 | } else { 131 | // No ends => end of file iterator 132 | hashfib_destroy(starts); 133 | hashfib_destroy(ends); 134 | return false; 135 | } 136 | } 137 | } 138 | 139 | static bool consumeIteratorNoCigars(BamReaderData * data, hts_itr_t * iter, char * query_chrom, int query_chrom_tid, int query_stop) { 140 | // Iterate through data 141 | bam1_t *aln = bam_init1(); 142 | int start, finish, value; 143 | 144 | aln = nextRead(data, iter, aln); 145 | 146 | while(aln && aln->core.tid == query_chrom_tid && 
aln->core.pos < query_stop) { 147 | // Read coordinates 148 | // Note that BAM coords are 0-based, hence +1 149 | start = aln->core.pos + 1; 150 | finish = start + 1; 151 | 152 | // Compute value 153 | value = 0; 154 | while (aln 155 | && aln->core.tid == query_chrom_tid 156 | && aln->core.pos == start - 1) { 157 | value++; 158 | aln = nextRead(data, iter, aln); 159 | } 160 | 161 | // Push on 162 | if (pushValuesToBuffer(data->bufferedReaderData, query_chrom, start, finish, value)) 163 | return true; 164 | } 165 | 166 | return false; 167 | } 168 | 169 | static bool downloadBamFileChromosome(BamReaderData * data, char * query_chrom, uint32_t query_start, uint32_t query_stop) { 170 | // Create iterator 171 | int query_chrom_tid = bam_name2id(data->header, query_chrom); 172 | hts_itr_t * iter = sam_itr_queryi(data->idx, query_chrom_tid, query_start, query_stop); 173 | if(data->header == NULL || iter == NULL) { 174 | fprintf(stderr, "Unable to iterate to region %s:%i-%i within %s BAM file.", query_chrom, query_start, query_stop, data->filename); 175 | exit(1); 176 | } 177 | bool status; 178 | if (data->read_count) 179 | status = consumeIteratorNoCigars(data, iter, query_chrom, query_chrom_tid, query_stop); 180 | else 181 | status = consumeIteratorWithCigars(data, iter, query_chrom, query_chrom_tid, query_stop); 182 | 183 | if (iter) 184 | bam_itr_destroy(iter); 185 | return status; 186 | } 187 | 188 | typedef struct { 189 | char * name; 190 | int tid; 191 | } name_id_st; 192 | 193 | static int comp_name_id_st(const void * A, const void * B) { 194 | char * A_name = ((name_id_st *) A)->name; 195 | char * B_name = ((name_id_st *) B)->name; 196 | return strcmp(A_name, B_name); 197 | } 198 | 199 | static void * downloadBamFile(void * args) { 200 | BamReaderData * data = (BamReaderData *) args; 201 | 202 | if (data->chrom) { 203 | downloadBamFileChromosome(data, data->chrom, data->start, data->stop); 204 | } else { 205 | // Copy chromosome name into array of name_id_st: 206 
| name_id_st * name_ids = calloc(data->header->n_targets, sizeof(name_id_st)); 207 | int index; 208 | for (index = 0; index < data->header->n_targets; index++) { 209 | name_ids[index].name = data->header->target_name[index]; 210 | name_ids[index].tid = index; 211 | } 212 | 213 | // Sort list of chromosomes alphabetically: 214 | qsort(name_ids, data->header->n_targets, sizeof(name_id_st), comp_name_id_st); 215 | 216 | // Iterate through chromsomes in alphabetic order 217 | for (index = 0; index < data->header->n_targets; index++) 218 | if (downloadBamFileChromosome(data, name_ids[index].name, 0, data->header->target_len[name_ids[index].tid])) 219 | break; 220 | } 221 | 222 | endBufferedSignal(data->bufferedReaderData); 223 | return NULL; 224 | } 225 | 226 | void OpenBamFile(BamReaderData * data, char * filename) { 227 | data->filename = filename; 228 | 229 | // read the header and initialize data 230 | data->fp = hts_open(filename, "r"); 231 | 232 | //Get the header 233 | data->header = sam_hdr_read(data->fp); 234 | 235 | // Get the index 236 | data->idx = sam_index_load(data->fp, data->filename); 237 | if(data->idx == NULL) { 238 | fprintf(stderr, "Unable to open BAM/SAM index. 
Make sure alignments are indexed\n"); 239 | exit(1); 240 | } 241 | } 242 | 243 | void closeBamFile(BamReaderData * data) { 244 | if (data->idx) 245 | hts_idx_destroy(data->idx); 246 | bam_hdr_destroy(data->header); 247 | sam_close(data->fp); 248 | } 249 | 250 | void BamReaderPop(WiggleIterator * wi) { 251 | BamReaderData * data = (BamReaderData *) wi->data; 252 | BufferedReaderPop(wi, data->bufferedReaderData); 253 | } 254 | 255 | void BamReaderSeek(WiggleIterator * wi, const char * chrom, int start, int finish) { 256 | BamReaderData * data = (BamReaderData *) wi->data; 257 | 258 | // Kill ongoing jobs 259 | if (data->bufferedReaderData) { 260 | killBufferedReader(data->bufferedReaderData); 261 | free(data->bufferedReaderData); 262 | data->bufferedReaderData = NULL; 263 | } 264 | 265 | // Set boundaries 266 | data->chrom = chrom; 267 | data->start = start; 268 | data->stop = finish; 269 | 270 | // Weeeee 271 | launchBufferedReader(&downloadBamFile, data, &(data->bufferedReaderData)); 272 | wi->done = false; 273 | BamReaderPop(wi); 274 | 275 | while (!wi->done && (strcmp(wi->chrom, chrom) < 0 || (strcmp(wi->chrom, chrom) == 0 && wi->finish <= start))) 276 | BamReaderPop(wi); 277 | 278 | } 279 | 280 | WiggleIterator * BamReader(char * filename, bool holdFire, bool read_count) { 281 | BamReaderData * data = (BamReaderData *) calloc(1, sizeof(BamReaderData)); 282 | data->read_count = read_count; 283 | OpenBamFile(data, filename); 284 | if (!holdFire) 285 | launchBufferedReader(&downloadBamFile, data, &(data->bufferedReaderData)); 286 | return newWiggleIterator(data, &BamReaderPop, &BamReaderSeek, 0, false); 287 | } 288 | -------------------------------------------------------------------------------- /src/bcfReader.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this 
file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include "htslib/vcf.h" 18 | #include "wiggleIterator.h" 19 | #include "bufferedReader.h" 20 | 21 | #define BUFF_LENGTH 1000 22 | 23 | 24 | typedef struct bamFileReaderData_st { 25 | // Arguments to downloader 26 | char * filename; 27 | char * chrom; 28 | int start, stop; 29 | BufferedReaderData * bufferedReaderData; 30 | 31 | // BCF stuff 32 | htsFile * bcf_file; 33 | bcf_hdr_t * bcf_header; 34 | hts_itr_t * bcf_iterator; 35 | hts_idx_t * bcf_index; 36 | 37 | // Gzip file 38 | } BCFReaderData; 39 | 40 | static int nextLine(BCFReaderData * data, bcf1_t * holder) { 41 | if (data->bcf_iterator) 42 | return bcf_itr_next(data->bcf_file, data->bcf_iterator, holder); 43 | else 44 | return bcf_read(data->bcf_file, data->bcf_header, holder); 45 | } 46 | 47 | static void * downloadBCFFile(void * args) { 48 | BCFReaderData * data = (BCFReaderData *) args; 49 | bcf1_t * vcf_line = bcf_init(); 50 | 51 | while (nextLine(data, vcf_line) >= 0) 52 | // Note that BCF encoding is 0-based, hence +1s 53 | if (pushValuesToBuffer(data->bufferedReaderData, bcf_hdr_id2name(data->bcf_header, vcf_line->rid), vcf_line->pos+1, vcf_line->pos+2, 1)) 54 | break; 55 | 56 | endBufferedSignal(data->bufferedReaderData); 57 | bcf_destroy(vcf_line); 58 | if (data->bcf_iterator) 59 | bcf_itr_destroy(data->bcf_iterator); 60 | return NULL; 61 | } 62 | 63 | void OpenBCFFile(BCFReaderData * data, char * filename) { 64 | data->bcf_file = bcf_open(filename, "r"); 65 | 
data->bcf_index = bcf_index_load(filename); 66 | data->bcf_header = bcf_hdr_read(data->bcf_file); 67 | } 68 | 69 | void closeBCFFile(BCFReaderData * data) { 70 | if (data->bcf_iterator) 71 | bcf_itr_destroy(data->bcf_iterator); 72 | bcf_hdr_destroy(data->bcf_header); 73 | hts_idx_destroy(data->bcf_index); 74 | bcf_close(data->bcf_file); 75 | } 76 | 77 | void BCFReaderPop(WiggleIterator * wi) { 78 | BCFReaderData * data = (BCFReaderData *) wi->data; 79 | BufferedReaderPop(wi, data->bufferedReaderData); 80 | } 81 | 82 | void BcfReaderSeek(WiggleIterator * wi, const char * chrom, int start, int finish) { 83 | BCFReaderData * data = (BCFReaderData *) wi->data; 84 | 85 | if (data->bufferedReaderData) { 86 | killBufferedReader(data->bufferedReaderData); 87 | free(data->bufferedReaderData); 88 | data->bufferedReaderData = NULL; 89 | } 90 | // Note: BCF encoding is 0 based, hence -1s 91 | data->bcf_iterator = bcf_itr_queryi(data->bcf_index, bcf_hdr_name2id(data->bcf_header, chrom), start - 1, finish - 1); 92 | if (data->bcf_iterator == NULL) { 93 | fprintf(stderr, "Could not find index file to BCF file %s.\n", data->filename); 94 | exit(1); 95 | } 96 | launchBufferedReader(&downloadBCFFile, data, &(data->bufferedReaderData)); 97 | wi->done = false; 98 | BCFReaderPop(wi); 99 | 100 | while (!wi->done && (strcmp(wi->chrom, chrom) < 0 || (strcmp(chrom, wi->chrom) == 0 && wi->finish <= start))) 101 | BCFReaderPop(wi); 102 | } 103 | 104 | WiggleIterator * BcfReader(char * filename, bool holdFire) { 105 | BCFReaderData * data = (BCFReaderData *) calloc(1, sizeof(BCFReaderData)); 106 | OpenBCFFile(data, filename); 107 | if (!holdFire) 108 | launchBufferedReader(&downloadBCFFile, data, &(data->bufferedReaderData)); 109 | return newWiggleIterator(data, &BCFReaderPop, &BcfReaderSeek, 0, true); 110 | } 111 | -------------------------------------------------------------------------------- /src/bedReader.c: -------------------------------------------------------------------------------- 
1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | 18 | #include "wiggleIterator.h" 19 | 20 | typedef struct bedReaderData_st { 21 | char *filename; 22 | FILE * file; 23 | char * chrom; 24 | int stop; 25 | } BedReaderData; 26 | 27 | void BedReaderPop(WiggleIterator * wi) { 28 | BedReaderData * data = (BedReaderData *) wi->data; 29 | char line[5000]; 30 | char chrom[1000]; 31 | char sign = '.'; 32 | int start, finish; 33 | 34 | if (wi->done) 35 | return; 36 | 37 | while (fgets(line, 5000, data->file)) { 38 | if (line[0] == '#' || line[0] == EOF) 39 | continue; 40 | 41 | sscanf(line, "%s\t%i\t%i", chrom, &start, &finish); 42 | // Conversion from 0 to 1-based... 
43 | start++; 44 | finish++; 45 | 46 | if (strcmp(chrom, wi->chrom) < 0 || (strcmp(chrom, wi->chrom) == 0 && start < wi->start)) { 47 | fprintf(stderr, "Bed file %s is not sorted!\nPosition %s:%i is before %s:%i\n", data->filename, chrom, start, wi->chrom, wi->start); 48 | exit(1); 49 | } 50 | 51 | wi->start = start; 52 | wi->finish = finish; 53 | 54 | if (sign == '+') 55 | wi->strand = 1; 56 | else if (sign == '-') 57 | wi->strand = -1; 58 | else 59 | wi->strand = 0; 60 | 61 | 62 | // The reason for creating a new string instead of simply 63 | // overwriting is that other functions may still be pointing 64 | // at the old label 65 | if (wi->chrom[0] == '\0' || strcmp(wi->chrom, chrom)) { 66 | wi->chrom = (char *) calloc(strlen(chrom) + 1, sizeof(char)); 67 | strcpy(wi->chrom, chrom); 68 | } 69 | 70 | if (data->stop > 0) { 71 | int comparison = strcmp(wi->chrom, data->chrom); 72 | if (comparison == 0) { 73 | if (wi->start >= data->stop) { 74 | wi->done = true; 75 | } else if (wi->finish > data->stop) { 76 | wi->finish = data->stop; 77 | } 78 | } else if (comparison > 0) { 79 | wi->done = true; 80 | } 81 | } 82 | 83 | return; 84 | } 85 | 86 | fclose(data->file); 87 | data->file = NULL; 88 | wi->done = true; 89 | } 90 | 91 | void BedReaderSeek(WiggleIterator * wi, const char * chrom, int start, int finish) { 92 | BedReaderData * data = (BedReaderData*) wi->data; 93 | 94 | data->stop = finish; 95 | data->chrom = chrom; 96 | 97 | if (!data->file || strcmp(chrom, wi->chrom) < 0 || (strcmp(chrom, wi->chrom) == 0 && start < wi->start)) { 98 | if (data->file) 99 | fclose(data->file); 100 | if (!(data->file = fopen(data->filename, "r"))) { 101 | fprintf(stderr, "Could not open input file %s\n", data->filename); 102 | exit(1); 103 | } 104 | // The reason for creating a new string instead of simply 105 | // overwriting is that other functions may still be pointing 106 | // at the old label 107 | wi->chrom = (char *) calloc(strlen(chrom) + 1, sizeof(char)); 108 | wi->done = 
false; 109 | pop(wi); 110 | } 111 | 112 | while (!wi->done && (strcmp(wi->chrom, chrom) < 0 || (strcmp(chrom, wi->chrom) == 0 && wi->finish < start))) 113 | pop(wi); 114 | 115 | if (!wi->done && strcmp(chrom, wi->chrom) == 0 && wi->start < start) 116 | wi->start = start; 117 | } 118 | 119 | WiggleIterator * BedReader(char * filename) { 120 | BedReaderData * data = (BedReaderData *) calloc(1, sizeof(BedReaderData)); 121 | data->filename = filename; 122 | data->stop = -1; 123 | if (!(data->file = fopen(filename, "r"))) { 124 | fprintf(stderr, "Could not open bed file %s\n", filename); 125 | exit(1); 126 | } 127 | return newWiggleIteratorChromName(data, &BedReaderPop, &BedReaderSeek, 0, true); 128 | } 129 | -------------------------------------------------------------------------------- /src/bigBedReader.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include "bigWig.h" 17 | #include "bufferedReader.h" 18 | 19 | static int MAX_BLOCKS = 100; 20 | 21 | typedef struct bigBedReaderData_st { 22 | bigWigFile_t * fp; 23 | char * chrom; 24 | int start; 25 | int stop; 26 | BufferedReaderData * bufferedReaderData; 27 | } BigBedReaderData; 28 | 29 | static int readIteratorEntries(bwOverlapIterator_t *iter, char * chrom, int stretch_start, int stretch_stop, BigBedReaderData * data) { 30 | int index; 31 | for(index = 0; index < iter->entries->l; index++) { 32 | int start = iter->entries->start[index] + 1; 33 | int finish = iter->entries->end[index] + 1; 34 | 35 | // Box into queried stretch 36 | start = start < stretch_start? stretch_start: start; 37 | finish = finish < stretch_stop? finish: stretch_stop; 38 | 39 | if (pushValuesToBuffer(data->bufferedReaderData, chrom, start, finish, 1)) 40 | return 1; 41 | } 42 | return 0; 43 | } 44 | 45 | static int readBigBedRegion(BigBedReaderData * data, char * chrom, int start, int stop) { 46 | // This hack is required because libBigWig does not handle negative numbers 47 | if (start < 1) 48 | start = 1; 49 | if (stop < 1) 50 | stop = 1; 51 | // BigBed format 1 indexed, hence the -1s 52 | bwOverlapIterator_t *iter = bbOverlappingEntriesIterator(data->fp, chrom, start - 1, stop - 1, 0, MAX_BLOCKS); 53 | if (!iter) 54 | return 0; 55 | 56 | while(iter->data) { 57 | if (readIteratorEntries(iter, chrom, start, stop, data)) { 58 | bwIteratorDestroy(iter); 59 | return 1; 60 | } 61 | iter = bwIteratorNext(iter); 62 | } 63 | bwIteratorDestroy(iter); 64 | return 0; 65 | } 66 | 67 | void * readBigBed(void * ptr) { 68 | BigBedReaderData * data = (BigBedReaderData *) ptr; 69 | 70 | if (data->chrom) 71 | readBigBedRegion(data, data->chrom, data->start, data->stop); 72 | else { 73 | int chrom_index; 74 | Chrom_length * chrom_lengths = calloc(data->fp->cl->nKeys, sizeof(Chrom_length)); 75 | for (chrom_index = 0; chrom_index < data->fp->cl->nKeys; chrom_index++) { 76 | 
chrom_lengths[chrom_index].chrom = data->fp->cl->chrom[chrom_index]; 77 | chrom_lengths[chrom_index].length = data->fp->cl->len[chrom_index]; 78 | } 79 | 80 | qsort(chrom_lengths, data->fp->cl->nKeys, sizeof(Chrom_length), compare_chrom_lengths); 81 | 82 | for (chrom_index = 0; chrom_index < data->fp->cl->nKeys; chrom_index++) 83 | if (readBigBedRegion(data, chrom_lengths[chrom_index].chrom, 1, chrom_lengths[chrom_index].length)) 84 | break; 85 | 86 | free(chrom_lengths); 87 | } 88 | 89 | endBufferedSignal(data->bufferedReaderData); 90 | return NULL; 91 | } 92 | 93 | void BigBedReaderPop(WiggleIterator * wi) { 94 | BigBedReaderData * data = (BigBedReaderData *) wi->data; 95 | BufferedReaderPop(wi, data->bufferedReaderData); 96 | } 97 | 98 | void openBigBed(BigBedReaderData * data, char * filename, bool holdFire) { 99 | if(!bbIsBigBed(filename, NULL)) { 100 | printf("File %s is not in BigBed format\n", filename); 101 | exit(1); 102 | } 103 | data->fp = bbOpen(filename, NULL); 104 | if (!holdFire) 105 | launchBufferedReader(&readBigBed, data, &(data->bufferedReaderData)); 106 | } 107 | 108 | void BigBedReaderSeek(WiggleIterator * wi, const char * chrom, int start, int finish) { 109 | BigBedReaderData * data = (BigBedReaderData *) wi->data; 110 | 111 | if (data->bufferedReaderData) { 112 | killBufferedReader(data->bufferedReaderData); 113 | free(data->bufferedReaderData); 114 | data->bufferedReaderData = NULL; 115 | } 116 | data->chrom = chrom; 117 | data->start = start; 118 | data->stop = finish; 119 | launchBufferedReader(&readBigBed, data, &(data->bufferedReaderData)); 120 | wi->done = false; 121 | BigBedReaderPop(wi); 122 | 123 | while (!wi->done && (strcmp(wi->chrom, chrom) < 0 || (strcmp(chrom, wi->chrom) == 0 && wi->finish <= start))) 124 | BigBedReaderPop(wi); 125 | 126 | if (!wi->done && strcmp(chrom, wi->chrom) == 0 && wi->start < start) 127 | wi->start = start; 128 | } 129 | 130 | WiggleIterator * BigBedReader(char * f, bool holdFire) { 131 | 
BigBedReaderData * data = (BigBedReaderData *) calloc(1, sizeof(BigBedReaderData)); 132 | openBigBed(data, f, holdFire); 133 | return newWiggleIterator(data, &BigBedReaderPop, &BigBedReaderSeek, 0, true); 134 | } 135 | -------------------------------------------------------------------------------- /src/bigWiggleReader.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include "bigWig.h" 17 | #include "bufferedReader.h" 18 | 19 | static int MAX_BLOCKS = 100; 20 | 21 | typedef struct bigWiggleReaderData_st { 22 | bigWigFile_t * fp; 23 | char * chrom; 24 | int start; 25 | int stop; 26 | BufferedReaderData * bufferedReaderData; 27 | } BigWiggleReaderData; 28 | 29 | void libBigWigInit(int size) { 30 | if(bwInit(size) != 0) { 31 | fprintf(stderr, "Received an error in bwInit\n"); 32 | exit(1); 33 | } 34 | } 35 | 36 | static int readIteratorIntervals(bwOverlapIterator_t *iter, char * chrom, int stretch_start, int stretch_stop, BigWiggleReaderData * data) { 37 | int index; 38 | for(index = 0; index < iter->intervals->l; index++) { 39 | int start = iter->intervals->start[index] + 1; 40 | int finish = iter->intervals->end[index] + 1; 41 | 42 | // Box into queried stretch 43 | start = start < stretch_start? 
stretch_start: start; 44 | finish = finish < stretch_stop? finish: stretch_stop; 45 | 46 | if (pushValuesToBuffer(data->bufferedReaderData, chrom, start, finish, iter->intervals->value[index])) 47 | return 1; 48 | } 49 | return 0; 50 | } 51 | 52 | static int readBigWiggleRegion(BigWiggleReaderData * data, char * chrom, int start, int stop) { 53 | // This hack is required because libBigWig does not handle negative numbers 54 | if (start < 1) 55 | start = 1; 56 | if (stop < 1) 57 | stop = 1; 58 | bwOverlapIterator_t *iter = bwOverlappingIntervalsIterator(data->fp, chrom, start - 1, stop - 1, MAX_BLOCKS); 59 | if (!iter) 60 | return 0; 61 | 62 | while(iter->data) { 63 | if (readIteratorIntervals(iter, chrom, start, stop, data)) { 64 | bwIteratorDestroy(iter); 65 | return 1; 66 | } 67 | iter = bwIteratorNext(iter); 68 | } 69 | bwIteratorDestroy(iter); 70 | return 0; 71 | } 72 | 73 | static int readBigWiggleChromosome(BigWiggleReaderData * data, char * chrom, int length) { 74 | int start; 75 | int stretch=10000; 76 | 77 | for (start = 1; start < length; start+=stretch) { 78 | if (readBigWiggleRegion(data, chrom, start, start+stretch)) 79 | return 1; 80 | } 81 | 82 | return 0; 83 | } 84 | 85 | void * readBigWiggle(void * ptr) { 86 | BigWiggleReaderData * data = (BigWiggleReaderData *) ptr; 87 | if (data->chrom) 88 | readBigWiggleRegion(data, data->chrom, data->start, data->stop); 89 | else { 90 | int chrom_index; 91 | Chrom_length * chrom_lengths = calloc(data->fp->cl->nKeys, sizeof(Chrom_length)); 92 | for (chrom_index = 0; chrom_index < data->fp->cl->nKeys; chrom_index++) { 93 | chrom_lengths[chrom_index].chrom = data->fp->cl->chrom[chrom_index]; 94 | chrom_lengths[chrom_index].length = data->fp->cl->len[chrom_index]; 95 | } 96 | 97 | qsort(chrom_lengths, data->fp->cl->nKeys, sizeof(Chrom_length), compare_chrom_lengths); 98 | 99 | for (chrom_index = 0; chrom_index < data->fp->cl->nKeys; chrom_index++) 100 | if (readBigWiggleChromosome(data, 
chrom_lengths[chrom_index].chrom, chrom_lengths[chrom_index].length)) 101 | break; 102 | 103 | free(chrom_lengths); 104 | } 105 | 106 | endBufferedSignal(data->bufferedReaderData); 107 | return NULL; 108 | } 109 | 110 | void BigWiggleReaderPop(WiggleIterator * wi) { 111 | BigWiggleReaderData * data = (BigWiggleReaderData *) wi->data; 112 | BufferedReaderPop(wi, data->bufferedReaderData); 113 | } 114 | 115 | void openBigWiggle(BigWiggleReaderData * data, char * filename, bool holdFire) { 116 | if(!bwIsBigWig(filename, NULL)) { 117 | printf("File %s is not in BigWig format\n", filename); 118 | exit(1); 119 | } 120 | data->fp = bwOpen(filename, NULL, "r"); 121 | if (!holdFire) 122 | launchBufferedReader(&readBigWiggle, data, &(data->bufferedReaderData)); 123 | } 124 | 125 | void BigWiggleReaderSeek(WiggleIterator * wi, const char * chrom, int start, int finish) { 126 | BigWiggleReaderData * data = (BigWiggleReaderData *) wi->data; 127 | 128 | if (data->bufferedReaderData) { 129 | killBufferedReader(data->bufferedReaderData); 130 | free(data->bufferedReaderData); 131 | data->bufferedReaderData = NULL; 132 | } 133 | data->chrom = chrom; 134 | data->start = start; 135 | data->stop = finish; 136 | launchBufferedReader(&readBigWiggle, data, &(data->bufferedReaderData)); 137 | wi->done = false; 138 | BigWiggleReaderPop(wi); 139 | 140 | while (!wi->done && (strcmp(wi->chrom, chrom) < 0 || (strcmp(chrom, wi->chrom) == 0 && wi->finish <= start))) 141 | BigWiggleReaderPop(wi); 142 | 143 | if (!wi->done && strcmp(chrom, wi->chrom) == 0 && wi->start < start) 144 | wi->start = start; 145 | } 146 | 147 | WiggleIterator * BigWiggleReader(char * f, bool holdFire) { 148 | BigWiggleReaderData * data = (BigWiggleReaderData *) calloc(1, sizeof(BigWiggleReaderData)); 149 | openBigWiggle(data, f, holdFire); 150 | return newWiggleIterator(data, &BigWiggleReaderPop, &BigWiggleReaderSeek, 0, false); 151 | } 152 | -------------------------------------------------------------------------------- 
/src/bufferedReader.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include "bufferedReader.h" 17 | 18 | static int MAX_HEAD_START = 3; 19 | static int BLOCK_SIZE = 10000; 20 | 21 | typedef struct blockData_st { 22 | char **chrom; 23 | int * start; 24 | int * finish; 25 | double * value; 26 | int count; 27 | struct blockData_st * next; 28 | } BlockData; 29 | 30 | struct bufferedReaderData_st { 31 | pthread_t downloaderThreadID; 32 | BlockData * blockData, *lastBlockData; 33 | pthread_mutex_t count_mutex; 34 | pthread_cond_t count_cond; 35 | int blockCount; 36 | int readIndex; 37 | void * readerData; 38 | bool killed; 39 | }; 40 | 41 | static bool declareNewBlock(BufferedReaderData * data) { 42 | pthread_mutex_lock(&data->count_mutex); 43 | 44 | if (data->blockCount > MAX_HEAD_START) { 45 | pthread_cond_wait(&data->count_cond, &data->count_mutex); 46 | } 47 | if (data->blockCount < 0) { 48 | pthread_mutex_unlock(&data->count_mutex); 49 | return true; 50 | } 51 | data->blockCount++; 52 | pthread_cond_signal(&data->count_cond); 53 | pthread_mutex_unlock(&data->count_mutex); 54 | return false; 55 | } 56 | 57 | static BlockData * createBlockData() { 58 | BlockData * new = (BlockData * ) calloc(1, sizeof(BlockData)); 59 | new->chrom = (char **) 
calloc(BLOCK_SIZE, sizeof(char*)); 60 | new->start = (int *) calloc(BLOCK_SIZE, sizeof(int)); 61 | new->finish = (int *) calloc(BLOCK_SIZE, sizeof(int)); 62 | new->value = (double *) calloc(BLOCK_SIZE, sizeof(double)); 63 | return new; 64 | } 65 | 66 | bool pushValuesToBuffer(BufferedReaderData * data, const char * chrom, int start, int finish, double value) { 67 | 68 | if (data->blockData == NULL) 69 | data->lastBlockData = data->blockData = createBlockData(); 70 | else if (data->lastBlockData->count == BLOCK_SIZE) { 71 | data->lastBlockData->next = createBlockData(); 72 | data->lastBlockData = data->lastBlockData->next; 73 | if (declareNewBlock(data)) 74 | return true; 75 | } 76 | 77 | int index = data->lastBlockData->count; 78 | data->lastBlockData->chrom[index] = chrom; 79 | data->lastBlockData->start[index] = start; 80 | data->lastBlockData->finish[index] = finish; 81 | data->lastBlockData->value[index] = value; 82 | data->lastBlockData->count++; 83 | return false; 84 | } 85 | 86 | void endBufferedSignal(BufferedReaderData * data) { 87 | declareNewBlock(data); 88 | declareNewBlock(data); 89 | } 90 | 91 | static void destroyBlockData(BlockData * data) { 92 | free(data->chrom); 93 | free(data->start); 94 | free(data->finish); 95 | free(data->value); 96 | free(data); 97 | } 98 | 99 | static void waitForNextBlock(BufferedReaderData * data) { 100 | pthread_mutex_lock(&data->count_mutex); 101 | // Check whether allowed to step forward 102 | if (data->blockCount == 0) { 103 | pthread_cond_wait(&data->count_cond, &data->count_mutex); 104 | } 105 | // Signal freed memory 106 | data->blockCount--; 107 | pthread_cond_signal(&data->count_cond); 108 | pthread_mutex_unlock(&data->count_mutex); 109 | } 110 | 111 | static void goToNextBlock(BufferedReaderData * data) { 112 | BlockData * prevBlockData = data->blockData; 113 | data->blockData = data->blockData->next; 114 | data->readIndex = 0; 115 | destroyBlockData(prevBlockData); 116 | } 117 | 118 | void 
launchBufferedReader(void * (* readFileFunction)(void *), void * f_data, BufferedReaderData ** buf_data) { 119 | BufferedReaderData * data = calloc(1, sizeof(BufferedReaderData)); 120 | *buf_data = data; 121 | data->readIndex = 0; 122 | data->readerData = f_data; 123 | 124 | pthread_mutex_init(&data->count_mutex, NULL); 125 | pthread_cond_init(&data->count_cond, NULL); 126 | 127 | int err = pthread_create(&(data->downloaderThreadID), NULL, readFileFunction, f_data); 128 | if (err) { 129 | fprintf(stderr, "Could not create new thread %i\n", err); 130 | abort(); 131 | } 132 | 133 | waitForNextBlock(data); 134 | } 135 | 136 | void killBufferedReader(BufferedReaderData * data) { 137 | if (data->killed) 138 | return; 139 | 140 | pthread_mutex_lock(&data->count_mutex); 141 | data->blockCount = -1; 142 | // Send a signal in case the slave is waiting somewhere 143 | pthread_cond_signal(&data->count_cond); 144 | pthread_mutex_unlock(&data->count_mutex); 145 | pthread_join(data->downloaderThreadID, NULL); 146 | 147 | pthread_mutex_destroy(&data->count_mutex); 148 | pthread_cond_destroy(&data->count_cond); 149 | 150 | while (data->blockData) { 151 | BlockData * prevData = data->blockData; 152 | data->blockData = data->blockData->next; 153 | destroyBlockData(prevData); 154 | } 155 | 156 | data->lastBlockData = NULL; 157 | data->blockCount = 0; 158 | data->killed = true; 159 | } 160 | 161 | void BufferedReaderPop(WiggleIterator * wi, BufferedReaderData * data) { 162 | if (wi->done) 163 | return; 164 | else if (data == NULL || data->blockData == NULL) { 165 | wi->done = true; 166 | return; 167 | } else if (data->readIndex == data->blockData->count) { 168 | waitForNextBlock(data); 169 | goToNextBlock(data); 170 | if (data->blockData == NULL) { 171 | killBufferedReader(data); 172 | wi->done = true; 173 | return; 174 | } 175 | } 176 | 177 | int index = data->readIndex; 178 | wi->chrom = data->blockData->chrom[index]; 179 | wi->start = data->blockData->start[index]; 180 | wi->finish 
= data->blockData->finish[index]; 181 | wi->value = (double) data->blockData->value[index]; 182 | data->readIndex++; 183 | } 184 | 185 | 186 | int compare_chrom_lengths(const void * A, const void * B) { 187 | Chrom_length * cl_A = (Chrom_length *) A; 188 | Chrom_length * cl_B = (Chrom_length *) B; 189 | return strcmp(cl_A->chrom, cl_B->chrom); 190 | } 191 | -------------------------------------------------------------------------------- /src/bufferedReader.h: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #ifndef _BUFFERED_READER_H_ 16 | #define _BUFFERED_READER_H_ 17 | 18 | #include 19 | #include 20 | #include "wiggletools.h" 21 | #include "wiggleIterator.h" 22 | 23 | typedef struct chrom_length_st { 24 | char * chrom; 25 | int length; 26 | } Chrom_length; 27 | typedef struct bufferedReaderData_st BufferedReaderData; 28 | 29 | void launchBufferedReader(void * (* readFileFunction)(void *), void * f_data, BufferedReaderData ** buf_data); 30 | bool pushValuesToBuffer(BufferedReaderData * data, const char * chrom, int start, int finish, double value); 31 | void endBufferedSignal(BufferedReaderData * data); 32 | void killBufferedReader(BufferedReaderData * data); 33 | void BufferedReaderPop(WiggleIterator * wi, BufferedReaderData * data); 34 | 35 | int compare_chrom_lengths(const void * A, const void * B); 36 | #endif 37 | -------------------------------------------------------------------------------- /src/fib.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /*- 16 | * Copyright 1997-2003 John-Mark Gurney. 17 | * All rights reserved. 18 | * 19 | * Redistribution and use in source and binary forms, with or without 20 | * modification, are permitted provided that the following conditions 21 | * are met: 22 | * 1. 
Redistributions of source code must retain the above copyright 23 | * notice, this list of conditions and the following disclaimer. 24 | * 2. Redistributions in binary form must reproduce the above copyright 25 | * notice, this list of conditions and the following disclaimer in the 26 | * documentation and/or other materials provided with the distribution. 27 | * 28 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 29 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 32 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 | * SUCH DAMAGE. 
39 | * 40 | * $Id: fib.c,v 1.10 2007/10/19 13:09:26 zerbino Exp $ 41 | * 42 | */ 43 | #include 44 | #include 45 | 46 | #include "fib.h" 47 | #include "recycleBin.h" 48 | 49 | #include "fibpriv.h" 50 | 51 | #define BLOCKSIZE 10000 52 | 53 | static FibHeapNode *allocateFibHeapEl(FibHeap * heap) 54 | { 55 | return allocatePointer(heap->nodeMemory); 56 | } 57 | 58 | static void deallocateFibHeapEl(FibHeapNode * a, FibHeap * heap) 59 | { 60 | deallocatePointer(heap->nodeMemory, a); 61 | } 62 | 63 | #define INT_BITS (sizeof(int) * 8) 64 | 65 | static inline int ceillog2(int a) 66 | { 67 | int oa; 68 | int i; 69 | int b; 70 | int cons; 71 | 72 | oa = a; 73 | b = INT_BITS / 2; 74 | i = 0; 75 | while (b) { 76 | i = (i << 1); 77 | cons = ((int) 1) << b; 78 | if (a >= cons) { 79 | a /= cons; 80 | i = i | 1; 81 | } else 82 | a &= cons - 1; 83 | b /= 2; 84 | } 85 | if ((((int) 1 << i)) == oa) 86 | return i; 87 | else 88 | return i + 1; 89 | } 90 | 91 | /* 92 | * Public Heap Functions 93 | */ 94 | FibHeap *fh_makeheap() 95 | { 96 | FibHeap *new = malloc(sizeof(FibHeap)); 97 | 98 | if (new) { 99 | new->nodeMemory = newRecycleBin(sizeof(FibHeapNode), BLOCKSIZE); 100 | new->fh_neginf = NULL; 101 | new->fh_n = 0; 102 | new->fh_Dl = -1; 103 | new->fh_cons = NULL; 104 | new->fh_min = NULL; 105 | new->fh_root = NULL; 106 | } 107 | 108 | return new; 109 | } 110 | 111 | void fh_deleteheap(FibHeap * h) 112 | { 113 | destroyRecycleBin(h->nodeMemory); 114 | h->fh_neginf = NULL; 115 | if (h->fh_cons != NULL) 116 | free(h->fh_cons); 117 | h->fh_cons = NULL; 118 | free(h); 119 | } 120 | 121 | /* 122 | * Public Key Heap Functions 123 | */ 124 | FibHeapNode *fh_insert(FibHeap * h, int key, int value) 125 | { 126 | FibHeapNode *x; 127 | 128 | if ((x = fhe_newelem(h)) == NULL) 129 | return NULL; 130 | 131 | /* just insert on root list, and make sure it's not the new min */ 132 | x->fhe_key = key; 133 | x->fhe_value = value; 134 | 135 | fh_insertel(h, x); 136 | 137 | return x; 138 | } 139 | 140 | 
static void fh_insertel(FibHeap * h, FibHeapNode * x) 141 | { 142 | fh_insertrootlist(h, x); 143 | 144 | if (h->fh_min == NULL || x->fhe_key < h->fh_min->fhe_key) 145 | h->fh_min = x; 146 | 147 | h->fh_n++; 148 | } 149 | 150 | static void fh_insertrootlist(FibHeap * h, FibHeapNode * x) 151 | { 152 | if (h->fh_root == NULL) { 153 | h->fh_root = x; 154 | x->fhe_left = x; 155 | x->fhe_right = x; 156 | } else { 157 | fhe_insertafter(h->fh_root, x); 158 | } 159 | } 160 | 161 | static void fhe_insertafter(FibHeapNode * a, FibHeapNode * b) 162 | { 163 | if (a == a->fhe_right) { 164 | a->fhe_right = b; 165 | a->fhe_left = b; 166 | b->fhe_right = a; 167 | b->fhe_left = a; 168 | } else { 169 | b->fhe_right = a->fhe_right; 170 | a->fhe_right->fhe_left = b; 171 | a->fhe_right = b; 172 | b->fhe_left = a; 173 | } 174 | } 175 | 176 | int fh_min(FibHeap * h) 177 | { 178 | if (h->fh_min == NULL) 179 | return (int) INT_MIN; 180 | return h->fh_min->fhe_key; 181 | } 182 | 183 | int fh_notempty(FibHeap * h) 184 | { 185 | return (int) (h->fh_min != NULL); 186 | } 187 | 188 | int fh_empty(FibHeap * h) 189 | { 190 | return (int) (h->fh_min == NULL); 191 | } 192 | 193 | int fh_extractmin(FibHeap * h) 194 | { 195 | if (h->fh_min != NULL) { 196 | FibHeapNode * min = fh_extractminel(h); 197 | int res = min->fhe_value; 198 | deallocateFibHeapEl(min, h); 199 | return res; 200 | } 201 | 202 | return -1; 203 | } 204 | 205 | static FibHeapNode *fh_extractminel(FibHeap * h) 206 | { 207 | FibHeapNode *ret; 208 | FibHeapNode *x, *y, *orig; 209 | 210 | ret = h->fh_min; 211 | 212 | orig = NULL; 213 | /* put all the children on the root list */ 214 | /* for true consistancy, we should use fhe_remove */ 215 | for (x = ret->fhe_child; x != orig && x != NULL;) { 216 | if (orig == NULL) 217 | orig = x; 218 | y = x->fhe_right; 219 | x->fhe_p = NULL; 220 | fh_insertrootlist(h, x); 221 | x = y; 222 | } 223 | /* remove minimum from root list */ 224 | fh_removerootlist(h, ret); 225 | h->fh_n--; 226 | 227 | /* if 
we aren't empty, consolidate the heap */ 228 | if (h->fh_n == 0) 229 | h->fh_min = NULL; 230 | else { 231 | h->fh_min = ret->fhe_right; 232 | fh_consolidate(h); 233 | } 234 | 235 | return ret; 236 | } 237 | 238 | static void fh_removerootlist(FibHeap * h, FibHeapNode * x) 239 | { 240 | if (x->fhe_left == x) 241 | h->fh_root = NULL; 242 | else 243 | h->fh_root = fhe_remove(x); 244 | } 245 | 246 | static void fh_consolidate(FibHeap * h) 247 | { 248 | FibHeapNode **a; 249 | FibHeapNode *w; 250 | FibHeapNode *y; 251 | FibHeapNode *x; 252 | int i; 253 | int d; 254 | int D; 255 | 256 | fh_checkcons(h); 257 | 258 | /* assign a the value of h->fh_cons so I don't have to rewrite code */ 259 | D = h->fh_Dl + 1; 260 | a = h->fh_cons; 261 | 262 | for (i = 0; i < D; i++) 263 | a[i] = NULL; 264 | 265 | while ((w = h->fh_root) != NULL) { 266 | x = w; 267 | fh_removerootlist(h, w); 268 | d = x->fhe_degree; 269 | /* XXX - assert that d < D */ 270 | while (a[d] != NULL) { 271 | y = a[d]; 272 | if (fh_compare(h, x, y) > 0) { 273 | FibHeapNode * temp = x; 274 | x = y; 275 | y = temp; 276 | } 277 | fh_heaplink(h, y, x); 278 | a[d] = NULL; 279 | d++; 280 | } 281 | a[d] = x; 282 | } 283 | h->fh_min = NULL; 284 | for (i = 0; i < D; i++) 285 | if (a[i] != NULL) { 286 | fh_insertrootlist(h, a[i]); 287 | if (h->fh_min == NULL 288 | || fh_compare(h, a[i], h->fh_min) < 0) 289 | h->fh_min = a[i]; 290 | } 291 | } 292 | 293 | static void fh_checkcons(FibHeap * h) 294 | { 295 | int oDl; 296 | 297 | /* make sure we have enough memory allocated to "reorganize" */ 298 | if (h->fh_Dl == -1 || h->fh_n > (1 << h->fh_Dl)) { 299 | oDl = h->fh_Dl; 300 | if ((h->fh_Dl = ceillog2(h->fh_n) + 1) < 8) 301 | h->fh_Dl = 8; 302 | if (oDl != h->fh_Dl) 303 | h->fh_cons = 304 | (FibHeapNode **) realloc(h->fh_cons, 305 | sizeof *h-> 306 | fh_cons * 307 | (h->fh_Dl + 1)); 308 | if (h->fh_cons == NULL) 309 | abort(); 310 | } 311 | } 312 | 313 | static int fh_compare(FibHeap * h, FibHeapNode * a, FibHeapNode * b) 314 | { 
315 | if (a->fhe_key < b->fhe_key) 316 | return -1; 317 | if (a->fhe_key == b->fhe_key) 318 | return 0; 319 | return 1; 320 | } 321 | 322 | static void fh_heaplink(FibHeap * h, FibHeapNode * y, FibHeapNode * x) 323 | { 324 | /* make y a child of x */ 325 | if (x->fhe_child == NULL) 326 | x->fhe_child = y; 327 | else 328 | fhe_insertafter(x->fhe_child->fhe_left, y); 329 | y->fhe_p = x; 330 | x->fhe_degree++; 331 | y->fhe_mark = 0; 332 | } 333 | 334 | /* 335 | * begining of handling elements of fibheap 336 | */ 337 | static FibHeapNode *fhe_newelem(FibHeap * h) 338 | { 339 | FibHeapNode *e; 340 | 341 | if ((e = allocateFibHeapEl(h)) == NULL) 342 | return NULL; 343 | 344 | e->fhe_degree = 0; 345 | e->fhe_mark = 0; 346 | e->fhe_p = NULL; 347 | e->fhe_child = NULL; 348 | e->fhe_left = e; 349 | e->fhe_right = e; 350 | 351 | return e; 352 | } 353 | 354 | static FibHeapNode *fhe_remove(FibHeapNode * x) 355 | { 356 | FibHeapNode *ret; 357 | 358 | if (x == x->fhe_left) 359 | ret = NULL; 360 | else 361 | ret = x->fhe_left; 362 | 363 | /* fix the parent pointer */ 364 | if (x->fhe_p != NULL && x->fhe_p->fhe_child == x) 365 | x->fhe_p->fhe_child = ret; 366 | 367 | x->fhe_right->fhe_left = x->fhe_left; 368 | x->fhe_left->fhe_right = x->fhe_right; 369 | 370 | /* clear out hanging pointers */ 371 | x->fhe_p = NULL; 372 | x->fhe_left = x; 373 | x->fhe_right = x; 374 | 375 | return ret; 376 | } 377 | -------------------------------------------------------------------------------- /src/fib.h: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /*- 16 | * Copyright 1997, 1998-2003 John-Mark Gurney. 17 | * All rights reserved. 18 | * 19 | * Redistribution and use in source and binary forms, with or without 20 | * modification, are permitted provided that the following conditions 21 | * are met: 22 | * 1. Redistributions of source code must retain the above copyright 23 | * notice, this list of conditions and the following disclaimer. 24 | * 2. Redistributions in binary form must reproduce the above copyright 25 | * notice, this list of conditions and the following disclaimer in the 26 | * documentation and/or other materials provided with the distribution. 27 | * 28 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 29 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 32 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 | * SUCH DAMAGE. 
39 | * 40 | * $Id: fib.h,v 1.9 2007/04/24 12:16:41 zerbino Exp $ 41 | * 42 | */ 43 | 44 | #ifndef _FIB_H_ 45 | #define _FIB_H_ 46 | 47 | typedef struct fibheap FibHeap; 48 | typedef struct fibheap_el FibHeapNode; 49 | 50 | FibHeap *fh_makeheap(void); 51 | FibHeapNode *fh_insert(FibHeap *, int, int); 52 | int fh_empty(FibHeap *); 53 | int fh_notempty(FibHeap *); 54 | int fh_min(FibHeap *); 55 | int fh_extractmin(FibHeap *); 56 | void fh_deleteheap(FibHeap *); 57 | 58 | #endif /* _FIB_H_ */ 59 | -------------------------------------------------------------------------------- /src/fibpriv.h: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /*- 16 | * Copyright 1997, 1999-2003 John-Mark Gurney. 17 | * All rights reserved. 18 | * 19 | * Redistribution and use in source and binary forms, with or without 20 | * modification, are permitted provided that the following conditions 21 | * are met: 22 | * 1. Redistributions of source code must retain the above copyright 23 | * notice, this list of conditions and the following disclaimer. 24 | * 2. Redistributions in binary form must reproduce the above copyright 25 | * notice, this list of conditions and the following disclaimer in the 26 | * documentation and/or other materials provided with the distribution.
27 | * 28 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 29 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 32 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 | * SUCH DAMAGE. 39 | * 40 | * $Id: fibpriv.h,v 1.10 2007/10/09 09:56:46 zerbino Exp $ 41 | * 42 | */ 43 | 44 | #ifndef _FIBPRIV_H_ 45 | #define _FIBPRIV_H_ 46 | 47 | #ifndef bool 48 | #define bool char 49 | #define true 1 50 | #define false 0 51 | #endif 52 | 53 | /* 54 | * specific node operations 55 | */ 56 | struct fibheap_el { 57 | FibHeapNode *fhe_p; 58 | FibHeapNode *fhe_child; 59 | FibHeapNode *fhe_left; 60 | FibHeapNode *fhe_right; 61 | int fhe_key; 62 | int fhe_value; 63 | int fhe_degree; 64 | bool fhe_mark; 65 | }; 66 | 67 | static FibHeapNode *fhe_newelem(struct fibheap *); 68 | static void fhe_insertafter(FibHeapNode * a, FibHeapNode * b); 69 | static FibHeapNode *fhe_remove(FibHeapNode * a); 70 | 71 | /* 72 | * global heap operations 73 | */ 74 | struct fibheap { 75 | RecycleBin *nodeMemory; 76 | int fh_n; 77 | int fh_Dl; 78 | FibHeapNode **fh_cons; 79 | FibHeapNode *fh_min; 80 | FibHeapNode *fh_root; 81 | void *fh_neginf; 82 | bool fh_keys; 83 | }; 84 | 85 | static void fh_insertrootlist(FibHeap * h, FibHeapNode * x); 86 | static void fh_removerootlist(FibHeap *, FibHeapNode *); 87 | static void fh_consolidate(FibHeap *); 88 | static void fh_heaplink(FibHeap * 
h, FibHeapNode * y, FibHeapNode * x); 89 | static FibHeapNode *fh_extractminel(FibHeap *); 90 | static void fh_checkcons(FibHeap * h); 91 | static int fh_compare(FibHeap * h, FibHeapNode * a, FibHeapNode * b); 92 | static void fh_insertel(FibHeap * h, FibHeapNode * x); 93 | 94 | /* 95 | * general functions 96 | */ 97 | static inline int ceillog2(int a); 98 | 99 | #endif /* _FIBPRIV_H_ */ 100 | -------------------------------------------------------------------------------- /src/hash.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include "wiggletools.h" 17 | #include "hash.h" 18 | 19 | const size_t BUCKET_COUNT = 1024; 20 | const size_t BUCKET_INIT_SIZE = 10; 21 | 22 | typedef struct hashel_st { 23 | int key; 24 | int value; 25 | } HashElement; 26 | 27 | typedef struct hashlist_st { 28 | size_t size; 29 | size_t count; 30 | HashElement * list; 31 | } HashList; 32 | 33 | struct hash_st { 34 | HashList * lists; 35 | }; 36 | 37 | static void hashlist_init(HashList * hashlist) { 38 | hashlist->size = BUCKET_INIT_SIZE; 39 | hashlist->list = (HashElement *) calloc(sizeof(HashElement), BUCKET_INIT_SIZE); 40 | } 41 | 42 | Hash *hash_construct() { 43 | Hash * res = (Hash *) calloc(sizeof(Hash), 1); 44 | res->lists = calloc(sizeof(HashList), BUCKET_COUNT); 45 | int i; 46 | for (i =0; i < BUCKET_COUNT; i++) 47 | hashlist_init(res->lists + i); 48 | return res; 49 | } 50 | 51 | static void hashlist_destroy(HashList * hl) { 52 | free(hl->list); 53 | } 54 | 55 | void hash_destroy(Hash * hash) { 56 | int i; 57 | for (i =0; i < BUCKET_COUNT; i++) 58 | hashlist_destroy(hash->lists + i); 59 | free(hash->lists); 60 | free(hash); 61 | } 62 | 63 | static bool hashlist_increment(HashList* hl, int key) { 64 | int i; 65 | for (i = 0; i < hl->count; i++) 66 | if (hl->list[i].key == key) { 67 | hl->list[i].value++; 68 | return true; 69 | } 70 | 71 | hl->list[hl->count].key = key; 72 | hl->list[hl->count].value = 1; 73 | hl->count++; 74 | return false; 75 | } 76 | 77 | bool hash_increment(Hash * hash, int key) { 78 | return hashlist_increment(hash->lists + (key % BUCKET_COUNT), key); 79 | } 80 | 81 | static int hashlist_remove(HashList * hl, int key) { 82 | int i; 83 | int value = 0; 84 | bool found = false; 85 | 86 | for (i = 0; i < hl->count; i++) { 87 | if (hl->list[i].key == key) { 88 | value = hl->list[i].value; 89 | found = true; 90 | } 91 | if (found && i < hl->count - 1) { 92 | hl->list[i].key = hl->list[i+1].key; 93 | hl->list[i].value = hl->list[i+1].value; 94 | } 95 | } 96 | 97 | if 
(found) 98 | hl->count--; 99 | return value; 100 | } 101 | 102 | int hash_remove(Hash * hash, int key) { 103 | return hashlist_remove(hash->lists + (key % BUCKET_COUNT), key); 104 | } 105 | 106 | -------------------------------------------------------------------------------- /src/hash.h: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef _HASH_H_ 16 | #define _HASH_H_ 17 | 18 | typedef struct hash_st Hash; 19 | 20 | Hash *hash_construct(); 21 | bool hash_increment(Hash *, int); 22 | int hash_remove(Hash *, int); 23 | void hash_destroy(Hash *); 24 | 25 | #endif /* _FIB_H_ */ 26 | -------------------------------------------------------------------------------- /src/hashfib.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | 17 | #include "wiggletools.h" 18 | #include "hash.h" 19 | #include "fib.h" 20 | #include "hashfib.h" 21 | 22 | struct hashfib_st { 23 | Hash *hash; 24 | FibHeap *fib; 25 | }; 26 | 27 | HashFib *hashfib_construct() { 28 | HashFib * res = (HashFib*) calloc(sizeof(HashFib), 1); 29 | res->hash = hash_construct(); 30 | res->fib = fh_makeheap(); 31 | return res; 32 | } 33 | 34 | void hashfib_insert(HashFib * hf, int key) { 35 | if (!hash_increment(hf->hash, key)) 36 | fh_insert(hf->fib, key, 0); 37 | } 38 | 39 | bool hashfib_empty(HashFib * hf) { 40 | return fh_empty(hf->fib); 41 | } 42 | 43 | int hashfib_min(HashFib * hf) { 44 | return fh_min(hf->fib); 45 | } 46 | 47 | int hashfib_remove_min(HashFib * hf) { 48 | int key = fh_min(hf->fib); 49 | fh_extractmin(hf->fib); 50 | return hash_remove(hf->hash, key); 51 | } 52 | 53 | void hashfib_destroy(HashFib * hf) { 54 | hash_destroy(hf->hash); 55 | fh_deleteheap(hf->fib); 56 | free(hf); 57 | } 58 | -------------------------------------------------------------------------------- /src/hashfib.h: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef _HASHFIB_H_ 16 | #define _HASHFIB_H_ 17 | 18 | typedef struct hashfib_st HashFib; 19 | 20 | HashFib *hashfib_construct(); 21 | bool hashfib_empty(HashFib*); 22 | void hashfib_insert(HashFib *, int); 23 | int hashfib_min(HashFib *); 24 | int hashfib_remove_min(HashFib *); 25 | void hashfib_destroy(HashFib *); 26 | 27 | #endif /* _HASHFIB_H_ */ 28 | -------------------------------------------------------------------------------- /src/mWigWriter.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | // Local header 22 | #include "multiplexer.h" 23 | 24 | ////////////////////////////////////////////////////// 25 | // Tee operator 26 | ////////////////////////////////////////////////////// 27 | 28 | #define BLOCK_LENGTH 10000 29 | #define MAX_OUT_BLOCKS 2 30 | 31 | typedef struct BlockData_st { 32 | char * chroms[BLOCK_LENGTH]; 33 | int starts[BLOCK_LENGTH]; 34 | int finishes[BLOCK_LENGTH]; 35 | double * values; 36 | int count; 37 | int width; 38 | bool bedGraph; 39 | struct BlockData_st * next; 40 | } BlockData; 41 | 42 | typedef struct TeeMultiplexerData_st { 43 | FILE * infile; 44 | FILE * outfile; 45 | Multiplexer * in; 46 | BlockData * dataBlocks; 47 | BlockData * lastBlock; 48 | int count; 49 | pthread_t threadID; 50 | pthread_mutex_t continue_mutex; 51 | pthread_cond_t continue_cond; 52 | bool done; 53 | bool bedGraph; 54 | } TeeMultiplexerData; 55 | 56 | static void printBlock(FILE * infile, FILE * outfile, BlockData * block) { 57 | int i, j; 58 | bool pointByPoint = false; 59 | bool makeHeader=false; 60 | char ** chromPtr = block->chroms; 61 | int * startPtr = block->starts; 62 | int * finishPtr = block->finishes; 63 | double * valuePtr = block->values; 64 | char * lastChrom = NULL; 65 | int lastFinish = -1; 66 | char buffer[5000]; 67 | 68 | for (i = 0; i < block->count; i++) { 69 | // Change mode 70 | if (!block->bedGraph && *finishPtr - *startPtr < 2 && !pointByPoint) { 71 | pointByPoint = true; 72 | makeHeader = true; 73 | } else if (*finishPtr - *startPtr > 5 && pointByPoint) { 74 | pointByPoint = false; 75 | } 76 | 77 | if (pointByPoint) { 78 | if (makeHeader || (pointByPoint && (lastChrom != *chromPtr || *startPtr > lastFinish))) 79 | fprintf(outfile, "fixedStep chrom=%s start=%i step=1\n", *chromPtr, *startPtr); 80 | makeHeader = false; 81 | for (j = 0; j < *finishPtr - *startPtr; j++) { 82 | int k; 83 | double * ptr = valuePtr; 84 | for (k = 0; k < block->width; 
k++) 85 | fprintf(outfile, "\t%lf", *(ptr++)); 86 | fprintf(outfile, "\n"); 87 | } 88 | valuePtr += block->width; 89 | } else if (!infile) { 90 | // Careful bedgraph lines are 0 based 91 | fprintf(outfile, "%s\t%i\t%i", *chromPtr, *startPtr-1, *finishPtr-1); 92 | int k; 93 | for (k = 0; k < block->width; k++) 94 | fprintf(outfile, "\t%lf", *(valuePtr++)); 95 | fprintf(outfile, "\n"); 96 | } else { 97 | // Read next line in infile 98 | if (!fgets(buffer, 5000, infile)) { 99 | fprintf(stderr, "Could not paste data to file lines, inconsistent number of lines.\n"); 100 | exit(1); 101 | } 102 | 103 | // Skip empty lines and metadata lines: 104 | while (! (strlen(buffer) && strncmp(buffer, "track", 5) && strncmp(buffer, "browser", 7))) { 105 | if (!fgets(buffer, 5000, infile)) { 106 | fprintf(stderr, "Could not paste data to file lines, inconsistent number of lines.\n"); 107 | exit(1); 108 | } 109 | } 110 | 111 | // Strip end of line symbols 112 | int k; 113 | for (k = strlen(buffer)-1; k >= 0; k--) { 114 | if (buffer[k] == '\n' || buffer[k] == '\r') 115 | buffer[k] = '\0'; 116 | else 117 | break; 118 | } 119 | 120 | // Print out 121 | fprintf(outfile, "%s", buffer); 122 | for (k = 0; k < block->width; k++) 123 | fprintf(outfile, "\t%lf", *(valuePtr++)); 124 | fprintf(outfile, "\n"); 125 | } 126 | 127 | lastChrom = *chromPtr; 128 | lastFinish = *finishPtr; 129 | chromPtr++; 130 | startPtr++; 131 | finishPtr++; 132 | } 133 | } 134 | 135 | static bool goToNextBlock(TeeMultiplexerData * data) { 136 | BlockData * ptr = data->dataBlocks; 137 | static int i = 0; 138 | i++; 139 | 140 | pthread_mutex_lock(&data->continue_mutex); 141 | // Received kill signal 142 | if (data->count < 0) 143 | return true; 144 | 145 | // Check that there is work left 146 | data->count--; 147 | if (data->count == 0 && !data->done) 148 | pthread_cond_wait(&data->continue_cond, &data->continue_mutex); 149 | pthread_cond_signal(&data->continue_cond); 150 | pthread_mutex_unlock(&data->continue_mutex); 
151 | 152 | // Step forward 153 | data->dataBlocks = data->dataBlocks->next; 154 | free(ptr->values); 155 | free(ptr); 156 | return false; 157 | } 158 | 159 | static void * printToFile(void * args) { 160 | TeeMultiplexerData * data = (TeeMultiplexerData *) args; 161 | 162 | // Wait for first block to arrive 163 | pthread_mutex_lock(&data->continue_mutex); 164 | if (data->count == 0 && !data->done) 165 | pthread_cond_wait(&data->continue_cond, &data->continue_mutex); 166 | pthread_mutex_unlock(&data->continue_mutex); 167 | 168 | if (data->count < 0) 169 | return NULL; 170 | 171 | while(data->dataBlocks) { 172 | printBlock(data->infile, data->outfile, data->dataBlocks); 173 | if (goToNextBlock(data)) 174 | return NULL; 175 | } 176 | return NULL; 177 | } 178 | 179 | static void TeeMultiplexerPop(Multiplexer * multi) { 180 | TeeMultiplexerData * data = (TeeMultiplexerData *) multi->data; 181 | Multiplexer * in = data->in; 182 | if (!data->in->done) { 183 | multi->chrom = in->chrom; 184 | multi->start = in->start; 185 | multi->finish = in->finish; 186 | // No need to copy values, pointer points to the source multipliexers values' array 187 | //multi->value = in->value; 188 | 189 | if (data->threadID) { 190 | int index = data->lastBlock->count; 191 | data->lastBlock->chroms[index] = in->chrom; 192 | data->lastBlock->starts[index] = in->start; 193 | data->lastBlock->finishes[index] = in->finish; 194 | int i; 195 | double * ptr = data->lastBlock->values + (index * multi->count); 196 | for (i = 0; i < multi->count; i++) 197 | *(ptr++) = in->values[i]; 198 | 199 | if (++data->lastBlock->count >= BLOCK_LENGTH) { 200 | // Communications 201 | pthread_mutex_lock(&data->continue_mutex); 202 | data->count++; 203 | pthread_cond_signal(&data->continue_cond); 204 | if (data->count > MAX_OUT_BLOCKS) 205 | pthread_cond_wait(&data->continue_cond, &data->continue_mutex); 206 | pthread_mutex_unlock(&data->continue_mutex); 207 | 208 | data->lastBlock->next = (BlockData*) calloc(1, 
sizeof(BlockData)); 209 | data->lastBlock->next->values = (double*) calloc(BLOCK_LENGTH * multi->count, sizeof(double)); 210 | data->lastBlock->next->width = in->count; 211 | data->lastBlock = data->lastBlock->next; 212 | data->lastBlock->bedGraph = data->bedGraph; 213 | } 214 | } 215 | popMultiplexer(in); 216 | } else if (data->threadID) { 217 | pthread_mutex_lock(&data->continue_mutex); 218 | data->count++; 219 | data->done = true; 220 | pthread_cond_signal(&data->continue_cond); 221 | pthread_mutex_unlock(&data->continue_mutex); 222 | multi->done = true; 223 | pthread_join(data->threadID, NULL); 224 | } 225 | } 226 | 227 | static void launchWriter(TeeMultiplexerData * data, int width) { 228 | // Initialize variables 229 | data->count = 0; 230 | data->done = false; 231 | pthread_cond_init(&data->continue_cond, NULL); 232 | pthread_mutex_init(&data->continue_mutex, NULL); 233 | data->dataBlocks = data->lastBlock = (BlockData*) calloc(1, sizeof(BlockData)); 234 | data->dataBlocks->values = (double*) calloc(BLOCK_LENGTH * width, sizeof(double)); 235 | data->dataBlocks->width = width; 236 | data->lastBlock->bedGraph = data->bedGraph; 237 | 238 | // Launch pthread 239 | int err = pthread_create(&data->threadID, NULL, &printToFile, data); 240 | if (err) { 241 | fprintf(stderr, "Could not create new thread %i\n", err); 242 | exit(1); 243 | } 244 | } 245 | 246 | static void killWriter(TeeMultiplexerData * data) { 247 | BlockData * block; 248 | 249 | if (!data->threadID) 250 | return; 251 | 252 | // Set trap 253 | pthread_mutex_lock(&data->continue_mutex); 254 | data->count = -1; 255 | pthread_cond_signal(&data->continue_cond); 256 | pthread_mutex_unlock(&data->continue_mutex); 257 | 258 | // Wait for the catch 259 | pthread_join(data->threadID, NULL); 260 | 261 | // Clear variables 262 | pthread_cond_destroy(&data->continue_cond); 263 | pthread_mutex_destroy(&data->continue_mutex); 264 | 265 | while (data->dataBlocks) { 266 | block = data->dataBlocks; 267 | 
data->dataBlocks = block->next; 268 | free(block->values); 269 | free(block); 270 | } 271 | 272 | data->dataBlocks = NULL; 273 | data->lastBlock = NULL; 274 | } 275 | 276 | static void TeeMultiplexerSeek(Multiplexer * multi, const char * chrom, int start, int finish) { 277 | TeeMultiplexerData * data = (TeeMultiplexerData *) multi->data; 278 | killWriter(data); 279 | fflush(data->outfile); 280 | seekMultiplexer(data->in, chrom, start, finish); 281 | multi->done = false; 282 | launchWriter(data, multi->count); 283 | popMultiplexer(multi); 284 | } 285 | 286 | Multiplexer * TeeMultiplexer(Multiplexer * in, FILE * outfile, bool bedGraph, bool holdFire) { 287 | TeeMultiplexerData * data = (TeeMultiplexerData *) calloc(1, sizeof(TeeMultiplexerData)); 288 | data->in = in; 289 | data->outfile = outfile; 290 | data->bedGraph = bedGraph; 291 | // Hold fire means that you wait for the first seek before doing any writing 292 | if (!holdFire) 293 | launchWriter(data, in->count); 294 | 295 | Multiplexer * res = newCoreMultiplexer(data, in->count, &TeeMultiplexerPop, &TeeMultiplexerSeek); 296 | res->values = in->values; 297 | res->inplay = in->inplay; 298 | res->default_values = in->default_values; 299 | popMultiplexer(res); 300 | return res; 301 | } 302 | 303 | void toStdoutMultiplexer(Multiplexer * in, bool bedGraph, bool holdFire) { 304 | runMultiplexer(TeeMultiplexer(in, stdout, bedGraph, holdFire)); 305 | } 306 | 307 | ////////////////////////////////////////////////////////// 308 | // Paste Iterator 309 | ////////////////////////////////////////////////////////// 310 | 311 | Multiplexer * PasteMultiplexer(Multiplexer * in, FILE * infile, FILE * outfile, bool holdFire) { 312 | TeeMultiplexerData * data = (TeeMultiplexerData *) calloc(1, sizeof(TeeMultiplexerData)); 313 | data->in = in; 314 | data->infile = infile; 315 | data->bedGraph = true; 316 | data->outfile = outfile; 317 | // Hold fire means that you wait for the first seek before doing any writing 318 | if (!holdFire) 
319 | launchWriter(data, in->count); 320 | 321 | Multiplexer * res = newCoreMultiplexer(data, in->count, &TeeMultiplexerPop, &TeeMultiplexerSeek); 322 | res->values = in->values; 323 | res->default_values = in->default_values; 324 | res->inplay = in->inplay; 325 | popMultiplexer(res); 326 | return res; 327 | } 328 | -------------------------------------------------------------------------------- /src/multiSet.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // 15 | 16 | #include 17 | #include 18 | 19 | #include "multiSet.h" 20 | 21 | static void popClosingMultiplexers(Multiset * multi) { 22 | while (fh_notempty(multi->finishes) && fh_min(multi->finishes) == multi->finish) { 23 | int index = fh_extractmin(multi->finishes); 24 | Multiplexer * multiplexer = multi->multis[index]; 25 | popMultiplexer(multiplexer); 26 | multi->inplay[index] = false; 27 | multi->inplay_count--; 28 | if (!multiplexer->done && !strcmp(multiplexer->chrom, multi->chrom)) 29 | fh_insert(multi->starts, multiplexer->start, index); 30 | } 31 | } 32 | 33 | static void queueUpMultiplexers(Multiset * multi) { 34 | // Find lowest value chromosome 35 | multi->chrom = NULL; 36 | Multiplexer ** muPtr = multi->multis; 37 | int i; 38 | for (i = 0; i < multi->count; i++) { 39 | if ((!(*muPtr)->done) && (!multi->chrom || strcmp((*muPtr)->chrom, multi->chrom) < 0)) 40 | multi->chrom = (*muPtr)->chrom; 41 | muPtr++; 42 | } 43 | 44 | // No chromosome found => All multisets done 45 | if (!multi->chrom) { 46 | multi->done = true; 47 | return; 48 | } 49 | 50 | // Put those multiplexers in heap 51 | muPtr = multi->multis; 52 | for (i = 0; i < multi->count; i++) { 53 | if ((!(*muPtr)->done) && strcmp((*muPtr)->chrom, multi->chrom) == 0) 54 | fh_insert(multi->starts, (*muPtr)->start, i); 55 | muPtr++; 56 | } 57 | 58 | } 59 | 60 | static void admitNewMultiplexersIntoPlay(Multiset * multi) { 61 | while (fh_notempty(multi->starts) && fh_min(multi->starts) == multi->start) { 62 | int index = fh_extractmin(multi->starts); 63 | Multiplexer * multiplexer = multi->multis[index]; 64 | fh_insert(multi->finishes, multiplexer->finish, index); 65 | multi->inplay[index] = true; 66 | multi->inplay_count++; 67 | } 68 | } 69 | 70 | static void defineNewFinish(Multiset * multi) { 71 | multi->finish = fh_min(multi->finishes); 72 | 73 | if (fh_notempty(multi->starts)) { 74 | int min_start = fh_min(multi->starts); 75 | if (multi->finish > min_start) 76 | multi->finish = min_start; 77 
| } 78 | } 79 | 80 | void popMultiset(Multiset * multi) { 81 | popClosingMultiplexers(multi); 82 | 83 | // Check that there are multiplexers queued up 84 | // If no multiplexers are waiting, either waiting on other chromosomes 85 | // or finished. 86 | if (fh_empty(multi->starts) && fh_empty(multi->finishes)) 87 | queueUpMultiplexers(multi); 88 | 89 | // If queues still empty 90 | if (multi->done) 91 | return; 92 | 93 | // If no multiplexer in play jump to next start 94 | if (multi->inplay_count) 95 | multi->start = multi->finish; 96 | else 97 | multi->start = fh_min(multi->starts); 98 | 99 | admitNewMultiplexersIntoPlay(multi); 100 | defineNewFinish(multi); 101 | } 102 | 103 | void seekMultiset(Multiset * multi, const char * chrom, int start, int finish) { 104 | int i; 105 | multi->done = false; 106 | for (i=0; icount; i++) 107 | seekMultiplexer(multi->multis[i], chrom, start, finish); 108 | fh_deleteheap(multi->starts); 109 | fh_deleteheap(multi->finishes); 110 | multi->starts = fh_makeheap(); 111 | multi->finishes = fh_makeheap(); 112 | popMultiset(multi); 113 | } 114 | 115 | Multiset * newMultiset(Multiplexer ** multis, int count) { 116 | Multiset * new = (Multiset *) calloc (1, sizeof(Multiset)); 117 | new->count = count; 118 | new->multis = multis; 119 | new->inplay = (bool *) calloc(count, sizeof(bool)); 120 | new->values = (double **) calloc(count, sizeof(double)); 121 | new->starts = fh_makeheap(); 122 | new->finishes = fh_makeheap(); 123 | int i; 124 | for (i = 0; i < count; i++) 125 | new->values[i] = multis[i]->values; 126 | popMultiset(new); 127 | return new; 128 | } 129 | -------------------------------------------------------------------------------- /src/multiSet.h: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with 
the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef WIGGLE_MULTISET_H_ 16 | #define WIGGLE_MULTISET_H_ 17 | 18 | #include "multiplexer.h" 19 | 20 | struct multiset_st { 21 | char * chrom; 22 | int start; 23 | int finish; 24 | double ** values; 25 | int count, inplay_count; 26 | bool *inplay; 27 | Multiplexer ** multis; 28 | bool done; 29 | FibHeap * starts, * finishes; 30 | }; 31 | 32 | void popMultiset(Multiset* multi); 33 | void seekMultiset(Multiset * multi, const char * chrom, int start, int finish); 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /src/multiplexer.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #include "multiplexer.h" 21 | 22 | void popMultiplexer(Multiplexer * multi) { 23 | if (!multi->done) 24 | multi->pop(multi); 25 | } 26 | 27 | void runMultiplexer(Multiplexer * multi) { 28 | while (!multi->done) 29 | multi->pop(multi); 30 | } 31 | 32 | void seekMultiplexer(Multiplexer * multi, const char * chrom, int start, int finish) { 33 | multi->done = false; 34 | multi->seek(multi, chrom, start, finish); 35 | } 36 | 37 | static void popClosingWiggleIterators(Multiplexer * multi) { 38 | while (fh_notempty(multi->finishes) && fh_min(multi->finishes) == multi->finish) { 39 | int index = fh_extractmin(multi->finishes); 40 | WiggleIterator * wi = multi->iters[index]; 41 | pop(wi); 42 | multi->inplay[index] = false; 43 | multi->inplay_count--; 44 | multi->values[index] = wi->default_value; 45 | if (!wi->done && !strcmp(wi->chrom, multi->chrom)) 46 | fh_insert(multi->starts, wi->start, index); 47 | } 48 | } 49 | 50 | static void queueUpWiggleIterators(Multiplexer * multi) { 51 | // Find lowest value chromosome 52 | multi->chrom = NULL; 53 | WiggleIterator ** muPtr = multi->iters; 54 | int i; 55 | for (i = 0; i < multi->count; i++) { 56 | if ((!(*muPtr)->done) && (!multi->chrom || strcmp((*muPtr)->chrom, multi->chrom) < 0)) 57 | multi->chrom = (*muPtr)->chrom; 58 | muPtr++; 59 | } 60 | 61 | // No chromosome found => All itersets done 62 | if (!multi->chrom) { 63 | multi->done = true; 64 | return; 65 | } 66 | 67 | // Put those wis in heap 68 | muPtr = multi->iters; 69 | for (i = 0; i < multi->count; i++) { 70 | if ((!(*muPtr)->done) && strcmp((*muPtr)->chrom, multi->chrom) == 0) 71 | fh_insert(multi->starts, (*muPtr)->start, i); 72 | muPtr++; 73 | } 74 | } 75 | 76 | static void admitNewWiggleIteratorsIntoPlay(Multiplexer * multi) { 77 | while (fh_notempty(multi->starts) && fh_min(multi->starts) == multi->start) { 78 | int index = fh_extractmin(multi->starts); 79 | WiggleIterator * wi = multi->iters[index]; 
80 | fh_insert(multi->finishes, wi->finish, index); 81 | multi->inplay[index] = true; 82 | multi->values[index] = wi->value; 83 | multi->inplay_count++; 84 | } 85 | } 86 | 87 | static void defineNewFinish(Multiplexer * multi) { 88 | multi->finish = fh_min(multi->finishes); 89 | 90 | if (fh_notempty(multi->starts)) { 91 | int min_start = fh_min(multi->starts); 92 | if (multi->finish > min_start) { 93 | multi->finish = min_start; 94 | } 95 | } 96 | } 97 | 98 | static bool popCoreMultiplexer2(Multiplexer * multi) { 99 | popClosingWiggleIterators(multi); 100 | 101 | // Check that there are wis queued up 102 | // If no wis are waiting, either waiting on other chromosomes 103 | // or finished. 104 | if (fh_empty(multi->starts) && fh_empty(multi->finishes)) 105 | queueUpWiggleIterators(multi); 106 | 107 | // If queues still empty 108 | if (multi->done) 109 | return false; 110 | 111 | // If no wi in play jump to next start 112 | if (multi->inplay_count) 113 | multi->start = multi->finish; 114 | else 115 | multi->start = fh_min(multi->starts); 116 | 117 | admitNewWiggleIteratorsIntoPlay(multi); 118 | defineNewFinish(multi); 119 | 120 | return multi->inplay_count == multi->count; 121 | } 122 | 123 | static void popCoreMultiplexer(Multiplexer * multi) { 124 | while (!multi->done) { 125 | if (popCoreMultiplexer2(multi) || !multi->strict) 126 | break; 127 | } 128 | } 129 | 130 | static void seekCoreMultiplexer(Multiplexer * multi, const char * chrom, int start, int finish) { 131 | int i; 132 | multi->done = false; 133 | for (i=0; icount; i++) 134 | seek(multi->iters[i], chrom, start, finish); 135 | fh_deleteheap(multi->starts); 136 | fh_deleteheap(multi->finishes); 137 | multi->starts = fh_makeheap(); 138 | multi->finishes = fh_makeheap(); 139 | multi->inplay_count = 0; 140 | popMultiplexer(multi); 141 | } 142 | 143 | Multiplexer * newCoreMultiplexer(void * data, int count, void (*pop)(Multiplexer *), void (*seek)(Multiplexer *, const char *, int, int)) { 144 | Multiplexer * 
new = (Multiplexer *) calloc (1, sizeof(Multiplexer)); 145 | new->count = count; 146 | new->values = (double *) calloc(count, sizeof(double)); 147 | new->default_values = (double *) calloc(count, sizeof(double)); 148 | new->inplay = (bool *) calloc(count, sizeof(bool)); 149 | new->pop = pop; 150 | new->seek = seek; 151 | new->data = data; 152 | new->starts = fh_makeheap(); 153 | new->finishes = fh_makeheap(); 154 | return new; 155 | } 156 | 157 | Multiplexer * newMultiplexer(WiggleIterator ** iters, int count, bool strict) { 158 | Multiplexer * new = newCoreMultiplexer(NULL, count, popCoreMultiplexer, seekCoreMultiplexer); 159 | new->strict = strict; 160 | new->iters = calloc(count, sizeof(WiggleIterator *)); 161 | int i; 162 | for (i = 0; i < count; i++) { 163 | new->iters[i] = NonOverlappingWiggleIterator(iters[i]); 164 | new->default_values[i] = new->iters[i]->default_value; 165 | new->values[i] = new->iters[i]->default_value; 166 | } 167 | popMultiplexer(new); 168 | return new; 169 | } 170 | -------------------------------------------------------------------------------- /src/multiplexer.h: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #ifndef WIGGLE_MULTIPLEXER_H_ 16 | #define WIGGLE_MULTIPLEXER_H_ 17 | 18 | #include "wiggleIterator.h" 19 | #include "fib.h" 20 | 21 | struct multiplexer_st { 22 | char * chrom; 23 | int start; 24 | int finish; 25 | double * values; 26 | double * default_values; 27 | int count, inplay_count; 28 | bool *inplay; 29 | WiggleIterator ** iters; 30 | bool done; 31 | bool strict; 32 | void (*pop)(Multiplexer *); 33 | void (*seek)(Multiplexer *, const char *, int, int); 34 | FibHeap * starts, *finishes; 35 | void * data; 36 | }; 37 | 38 | void popMultiplexer(Multiplexer * multi); 39 | void seekMultiplexer(Multiplexer * multi, const char * chrom, int start, int finish); 40 | void runMultiplexer(Multiplexer * multi); 41 | Multiplexer * newCoreMultiplexer(void * data, int count, void (*pop)(Multiplexer *), void (*seek)(Multiplexer *, const char *, int, int)); 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/plots.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include 17 | #include 18 | 19 | // Local header 20 | #include "wiggleIterator.h" 21 | #include "multiplexer.h" 22 | 23 | ////////////////////////////////////////////////////// 24 | // Profile summaries 25 | ////////////////////////////////////////////////////// 26 | 27 | static void updateProfile(WiggleIterator * wig, double compression, double * profile, int profile_width, bool stranded) { 28 | int start, finish, pos; 29 | 30 | if (isnan(wig->value)) 31 | return; 32 | 33 | if (!stranded || wig->strand > 0) { 34 | start = (int) round(wig->start * compression); 35 | finish = (int) round(wig->finish * compression); 36 | } else if (wig->strand < 0) { 37 | start = (int) round(profile_width - 1 - (wig->finish * compression)); 38 | finish = (int) round(profile_width - 1 - (wig->start * compression)); 39 | } else { 40 | fprintf(stderr, "Cannot provide stranded profile on non-stranded regions\n"); 41 | exit(1); 42 | } 43 | 44 | if (start < 0) 45 | start = 0; 46 | if (finish <= start) 47 | finish = start + 1; 48 | if (finish > profile_width) 49 | finish = profile_width; 50 | 51 | for (pos = start; pos < finish; pos++) 52 | profile[pos] += wig->value / (finish - start); 53 | } 54 | 55 | void regionProfile(WiggleIterator * wig, double * profile, int profile_width, int region_width, bool stranded) { 56 | double compression = profile_width / (double) region_width; 57 | int pos; 58 | 59 | for (pos = 0; pos < profile_width; pos++) 60 | profile[pos] = 0; 61 | 62 | for (; !wig->done; pop(wig)) 63 | updateProfile(wig, compression, profile, profile_width, stranded); 64 | } 65 | 66 | void addProfile(double * dest, double * source, int width) { 67 | int i; 68 | 69 | for (i=0; iwidth - (hist->width - column) * ratio; 87 | int int_start = (int) start; 88 | double end = hist->width - (hist->width - column - 1) * ratio; 89 | int int_end = (int) end; 90 | double value = hist->values[row][column]; 91 | if (int_start == int_end) { 92 | hist->values[row][int_start] += 
value; 93 | } else { 94 | double split = (end - int_end) / ratio; 95 | hist->values[row][int_end] += value * split; 96 | hist->values[row][int_start] += value * (1 - split); 97 | } 98 | hist->values[row][column] -= value; 99 | } 100 | 101 | static void lowerMinNRows(Histogram * hist, double value) { 102 | if (hist->max != hist->min) { 103 | double ratio = (hist->max - hist->min) / (hist->max - value); 104 | int row, column; 105 | for (row = 0; row < hist->count; row++) 106 | // Careful to go from high to low to avoid compound effects as 107 | // you push weight to the high bins 108 | for (column = hist->width; column >= 0; column--) 109 | reassignColumnRight(hist, ratio, column, row); 110 | } else { 111 | // Copying the content of the first column to the last 112 | int row; 113 | for (row = 0; row < hist->count; row++) { 114 | hist->values[row][hist->width - 1] = hist->values[row][0]; 115 | hist->values[row][0] = 0; 116 | } 117 | } 118 | hist->min = value; 119 | } 120 | 121 | static void reassignColumnLeft(Histogram * hist, double ratio, int column, int row) { 122 | double start = column * ratio; 123 | int int_start = (int) start; 124 | double end = (column + 1) * ratio; 125 | int int_end = (int) end; 126 | double value = hist->values[row][column]; 127 | if (int_start == int_end) { 128 | hist->values[row][int_start] += value; 129 | } else { 130 | double split = (end - int_end) / ratio; 131 | hist->values[row][int_end] += value * split; 132 | hist->values[row][int_start] += value * (1 - split); 133 | } 134 | hist->values[row][column] -= value; 135 | } 136 | 137 | static void raiseMaxNRows(Histogram * hist, double value) { 138 | if (hist->max != hist->min) { 139 | double ratio = (hist->max - hist->min) / (value - hist->min); 140 | int row, column; 141 | for (row = 0; row < hist->count; row++) 142 | for (column = 1; column < hist->width; column++) 143 | reassignColumnLeft(hist, ratio, column, row); 144 | } 145 | hist->max = value; 146 | } 147 | 148 | static void 
insertIntoHistogram(Histogram * hist, WiggleIterator * wig, int row) { 149 | int column = (int) ((wig->value - hist->min) * hist->width / (hist->max - hist->min)); 150 | if (column == hist->width) 151 | column--; 152 | hist->values[row][column] += wig->finish - wig->start; 153 | } 154 | 155 | static void updateHistogram(Histogram * hist, WiggleIterator * wig, int row) { 156 | if (wig->value > hist->max) 157 | raiseMaxNRows(hist, wig->value); 158 | else if (wig->value < hist->min) 159 | lowerMinNRows(hist, wig->value); 160 | 161 | if (hist->min != hist->max) 162 | insertIntoHistogram(hist, wig, row); 163 | else 164 | hist->values[row][0] += wig->finish - wig->start; 165 | } 166 | 167 | Histogram * histogram(WiggleIterator ** wigs, int count, int width) { 168 | Histogram * hist = calloc(1, sizeof(Histogram)); 169 | hist->count = count; 170 | hist->width = width; 171 | hist->values = calloc(count, sizeof(double*)); 172 | int row; 173 | for (row = 0; row < count; row++) { 174 | hist->values[row] = calloc(width, sizeof(double)); 175 | WiggleIterator * wig = wigs[row]; 176 | if (row == 0) { 177 | while (isnan(wig->value)) { 178 | pop(wig); 179 | } 180 | hist->min = hist->max = wig->value; 181 | hist->values[0][0] = wig->finish - wig->start; 182 | pop(wig); 183 | } 184 | } 185 | 186 | for (row = 0; row < count; row++) { 187 | WiggleIterator * wig = wigs[row]; 188 | for (; !wig->done; pop(wig)) 189 | if (!isnan(wig->value)) 190 | updateHistogram(hist, wig, row); 191 | } 192 | 193 | return hist; 194 | } 195 | 196 | void normalize_histogram(Histogram * hist) { 197 | double sum; 198 | int column, row; 199 | 200 | for (row = 0; row < hist->count; row++) { 201 | sum = 0; 202 | for (column = 0; column < hist->width; column++) 203 | sum += hist->values[row][column]; 204 | 205 | for (column = 0; column < hist->width; column++) 206 | hist->values[row][column] /= sum; 207 | } 208 | } 209 | 210 | void print_histogram(Histogram * hist, FILE * file) { 211 | double step = (hist->max - 
hist->min) / hist->width; 212 | double position = hist->min; 213 | int column, row; 214 | 215 | for (column = 0; column < hist->width; column++) { 216 | fprintf(file, "%f", position + step/2); 217 | for (row = 0; row < hist->count; row++) { 218 | fprintf(file, "\t%f", hist->values[row][column]); 219 | } 220 | fprintf(file, "\n"); 221 | position += step; 222 | } 223 | } 224 | -------------------------------------------------------------------------------- /src/recycleBin.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /****************************************************************\ 16 | * * 17 | * Efficient Memory Allocation Routines * 18 | * * 19 | * Guy St.C. Slater.. mailto:guy@ebi.ac.uk * 20 | * Copyright (C) 2000-2005. All Rights Reserved. * 21 | * * 22 | * This source code is distributed under the terms of the * 23 | * GNU Lesser General Public License. See the file COPYING * 24 | * or http://www.fsf.org/copyleft/lesser.html for details * 25 | * * 26 | * If you use this code, please keep this notice intact. 
* 27 | * * 28 | \****************************************************************/ 29 | 30 | #include 31 | #include 32 | 33 | #include "recycleBin.h" 34 | 35 | typedef struct RecycleBin_Node { 36 | struct RecycleBin_Node *next; 37 | } RecycleBin_Node; 38 | 39 | typedef struct chunk_st { 40 | struct chunk_st *next; 41 | } Chunk; 42 | 43 | struct recycleBin_st { 44 | Chunk *chunk_list; 45 | RecycleBin_Node *recycle; 46 | size_t node_size; 47 | int chunk_pos; 48 | int nodes_per_chunk; 49 | }; 50 | 51 | static void initRecycleBin(RecycleBin *recycleBin, 52 | size_t node_size, int nodes_per_chunk) 53 | { 54 | size_t chunckSize, allocSize; 55 | 56 | chunckSize = sizeof(Chunk) + nodes_per_chunk * node_size; 57 | allocSize = 1; 58 | /* Get nearest power of 2 */ 59 | while (allocSize < chunckSize) 60 | allocSize <<= 1; 61 | nodes_per_chunk = (allocSize - sizeof(Chunk)) / node_size; 62 | recycleBin->chunk_list = NULL; 63 | recycleBin->chunk_pos = nodes_per_chunk; 64 | recycleBin->nodes_per_chunk = nodes_per_chunk; 65 | recycleBin->node_size = node_size; 66 | recycleBin->recycle = NULL; 67 | } 68 | 69 | RecycleBin *newRecycleBin(size_t node_size, int nodes_per_chunk) 70 | { 71 | RecycleBin *recycleBin; 72 | 73 | if (node_size < sizeof(RecycleBin_Node)) { 74 | fprintf(stderr, "Too small elements to create a recycle bin!\n"); 75 | #ifdef DEBUG 76 | abort(); 77 | #endif 78 | exit(-1); 79 | } 80 | recycleBin = malloc(sizeof(RecycleBin)); 81 | if (!recycleBin) { 82 | fprintf(stderr, "Out of memory, exiting.\n"); 83 | #ifdef DEBUG 84 | abort(); 85 | #endif 86 | exit(-1); 87 | } 88 | initRecycleBin (recycleBin, node_size, nodes_per_chunk); 89 | 90 | return recycleBin; 91 | } 92 | 93 | static void destroyRecycleBinChunks(RecycleBin * recycleBin) 94 | { 95 | while (recycleBin->chunk_list != NULL) 96 | { 97 | Chunk *chunk; 98 | 99 | chunk = recycleBin->chunk_list; 100 | recycleBin->chunk_list = recycleBin->chunk_list->next; 101 | free(chunk); 102 | } 103 | } 104 | 105 | void 
destroyRecycleBin(RecycleBin * recycleBin) 106 | { 107 | if (recycleBin == NULL) 108 | return; 109 | 110 | destroyRecycleBinChunks(recycleBin); 111 | free(recycleBin); 112 | } 113 | 114 | void *allocatePointer(RecycleBin * recycle_bin) 115 | { 116 | RecycleBin_Node *node; 117 | Chunk *chunk; 118 | 119 | if (recycle_bin == NULL) { 120 | fprintf(stderr, "Null recycle bin!\n"); 121 | #ifdef DEBUG 122 | abort(); 123 | #endif 124 | exit(-1); 125 | } 126 | 127 | if (recycle_bin->recycle != NULL) { 128 | node = recycle_bin->recycle; 129 | recycle_bin->recycle = node->next; 130 | return node; 131 | } 132 | 133 | if (recycle_bin->chunk_pos == recycle_bin->nodes_per_chunk) { 134 | chunk = malloc(sizeof(Chunk) + recycle_bin->nodes_per_chunk 135 | * recycle_bin->node_size); 136 | if (chunk == NULL) { 137 | fprintf(stderr, "No more memory for memory chunk!\n"); 138 | #ifdef DEBUG 139 | abort(); 140 | #endif 141 | exit(-1); 142 | } 143 | chunk->next = recycle_bin->chunk_list; 144 | recycle_bin->chunk_list = chunk; 145 | recycle_bin->chunk_pos = 1; 146 | return (RecycleBin_Node *) ((size_t) (void *) chunk + 147 | sizeof(Chunk)); 148 | } 149 | 150 | chunk = recycle_bin->chunk_list; 151 | return (RecycleBin_Node *) ((size_t) (void *) chunk + sizeof(Chunk) 152 | + 153 | (recycle_bin-> 154 | node_size * 155 | recycle_bin->chunk_pos++)); 156 | } 157 | 158 | void deallocatePointer(RecycleBin * recycle_bin, void *data) 159 | { 160 | RecycleBin_Node *node = data; 161 | 162 | node->next = recycle_bin->recycle; 163 | recycle_bin->recycle = node; 164 | 165 | return; 166 | } 167 | -------------------------------------------------------------------------------- /src/recycleBin.h: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /****************************************************************\ 16 | * * 17 | * Efficient Memory Allocation Routines * 18 | * * 19 | * Guy St.C. Slater.. mailto:guy@ebi.ac.uk * 20 | * Copyright (C) 2000-2005. All Rights Reserved. * 21 | * * 22 | * This source code is distributed under the terms of the * 23 | * GNU Lesser General Public License. See the file COPYING * 24 | * or http://www.fsf.org/copyleft/lesser.html for details * 25 | * * 26 | * If you use this code, please keep this notice intact. * 27 | * * 28 | \****************************************************************/ 29 | 30 | #ifndef INCLUDED_RECYCLEBIN_H 31 | #define INCLUDED_RECYCLEBIN_H 32 | 33 | typedef struct recycleBin_st RecycleBin; 34 | 35 | // Constructor, Destructor 36 | RecycleBin *newRecycleBin(size_t node_size, int nodes_per_chunk); 37 | void destroyRecycleBin(RecycleBin * recycle_bin); 38 | 39 | // Use 40 | void *allocatePointer(RecycleBin * recycle_bin); 41 | void deallocatePointer(RecycleBin * recycle_bin, void *data); 42 | 43 | #endif /* INCLUDED_RECYCLEBIN_H */ 44 | -------------------------------------------------------------------------------- /src/samReader.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | 18 | #include "wiggleIterator.h" 19 | #include "hashfib.h" 20 | 21 | typedef struct samReaderData_st { 22 | char *filename; 23 | FILE * file; 24 | char * target_chrom; 25 | int stop; 26 | bool read_count; 27 | 28 | HashFib * starts, * ends; 29 | char chrom[1000]; 30 | char cigar[1000]; 31 | int pos; 32 | bool done; 33 | } SamReaderData; 34 | 35 | static int isSeparator(char c) { 36 | char * separators = "MIDNSHPX="; 37 | int i; 38 | 39 | for (i = 0; i < 9; i++) 40 | if (c == separators[i]) 41 | return 1; 42 | return 0; 43 | } 44 | 45 | static char * readNextCigarBlock(char * cigar, char * block) { 46 | int i; 47 | int lgth = strlen(cigar); 48 | for (i = 0; i < lgth && (i == 0 || !isSeparator(cigar[i-1])); i++) 49 | block[i] = cigar[i]; 50 | block[i] = '\0'; 51 | 52 | if (i) 53 | return cigar + i; 54 | else 55 | return NULL; 56 | } 57 | 58 | static int storeReadComponent(HashFib * starts, HashFib * ends, int start, char * block) { 59 | int last = strlen(block) - 1; 60 | char type = block[last]; 61 | block[last] = '\0'; 62 | int count = atoi(block); 63 | 64 | switch (type) { 65 | case 'M': 66 | case 'X': 67 | case '=': 68 | case 'D': 69 | hashfib_insert(starts, start); 70 | hashfib_insert(ends, start + count); 71 | case 'N': 72 | return start + count; 73 | default: 74 | return start; 75 | } 76 | } 77 | 78 | static void readLine(SamReaderData * data) { 79 | char line[5000]; 80 | char chrom[1000]; 81 | int pos; 82 | 83 | while (fgets(line, 5000, data->file)) { 84 | if (line[0] == '#') 
85 | continue; 86 | if (line[0] == EOF) { 87 | data->done = true; 88 | return; 89 | } 90 | 91 | sscanf(line, "%*s\t%*i\t%s\t%i\t%*i\t%s", chrom, &pos, data->cigar); 92 | 93 | if (strcmp(chrom, data->chrom) < 0 || (strcmp(chrom, data->chrom) == 0 && pos < data->pos)) { 94 | fprintf(stderr, "Sam file %s is not sorted!\nPosition %s:%i should be before %s:%i\n", data->filename, chrom, pos, data->chrom, data->pos); 95 | exit(1); 96 | } 97 | 98 | strcpy(data->chrom, chrom); 99 | data->pos = pos; 100 | return; 101 | } 102 | 103 | data->done = true; 104 | } 105 | 106 | static void storeReadComponents(HashFib * starts, HashFib * ends, int start, char * cigar) { 107 | char block[100]; 108 | char * ptr; 109 | 110 | for (ptr = readNextCigarBlock(cigar, block); ptr; ptr = readNextCigarBlock(ptr, block)) 111 | start = storeReadComponent(starts, ends, start, block); 112 | } 113 | 114 | static void loadNextReadsOnChrom(WiggleIterator * wi) { 115 | SamReaderData * data = (SamReaderData *) wi->data; 116 | 117 | while (!data->done && !strcmp(wi->chrom, data->chrom) && (hashfib_empty(data->ends) || hashfib_empty(data->starts) || data->pos <= hashfib_min(data->ends) || data->pos <= hashfib_min(data->starts))) { 118 | storeReadComponents(data->starts, data->ends, data->pos, data->cigar); 119 | readLine(data); 120 | } 121 | } 122 | 123 | static void stepForward(WiggleIterator * wi) { 124 | SamReaderData * data = (SamReaderData *) wi->data; 125 | // Choose start 126 | if (wi->value) 127 | wi->start = wi->finish; 128 | else 129 | wi->start = hashfib_min(data->starts); 130 | 131 | if (wi->start == -1) 132 | abort(); 133 | 134 | // If overshot 135 | if (data->target_chrom && ( 136 | (wi->start >= data->stop && strcmp(wi->chrom, data->target_chrom) == 0) 137 | || strcmp(wi->chrom, data->target_chrom) > 0 138 | ) 139 | ) { 140 | wi->done = true; 141 | return; 142 | } 143 | 144 | // Compute value 145 | if (!hashfib_empty(data->starts) && hashfib_min(data->starts) == wi->start) 146 | wi->value 
+= hashfib_remove_min(data->starts); 147 | 148 | // Compute finish 149 | if (hashfib_empty(data->starts) || hashfib_min(data->ends) < hashfib_min(data->starts)) 150 | wi->finish = hashfib_min(data->ends); 151 | else 152 | wi->finish = hashfib_min(data->starts); 153 | 154 | // If overshot 155 | if (data->target_chrom && wi->finish > data->stop) 156 | wi->finish = data->stop; 157 | 158 | } 159 | 160 | void SamReaderPop(WiggleIterator * wi) { 161 | SamReaderData * data = (SamReaderData *) wi->data; 162 | 163 | if (wi->done) 164 | return; 165 | 166 | if (data->read_count) { 167 | if (data->done) { 168 | fclose(data->file); 169 | data->file = NULL; 170 | wi->done = true; 171 | return; 172 | } 173 | 174 | // Initialise iterator values 175 | if (!wi->chrom || strcmp(wi->chrom, data->chrom)) { 176 | wi->chrom = (char *) calloc(strlen(data->chrom) + 1, sizeof(char)); 177 | strcpy(wi->chrom, data->chrom); 178 | } 179 | wi->start = data->pos; 180 | wi->finish = data->pos + 1; 181 | wi->value = 0; 182 | 183 | // Check for overlapping reads 184 | while (!data->done && !strcmp(wi->chrom, data->chrom) && data->pos == wi->start) { 185 | wi->value++; 186 | readLine(data); 187 | } 188 | } else { 189 | // Plan A is if already on a chromosome and there is remaining business there: 190 | if (wi->chrom) { 191 | while (!hashfib_empty(data->ends) && hashfib_min(data->ends) == wi->finish) { 192 | wi->value -= hashfib_remove_min(data->ends); 193 | if (wi->value < 0) { 194 | fprintf(stderr, "Negative coverage at %s:%i???\n", wi->chrom, wi->finish); 195 | exit(1); 196 | } 197 | } 198 | 199 | loadNextReadsOnChrom(wi); 200 | 201 | if (!hashfib_empty(data->ends)) { 202 | stepForward(wi); 203 | return; 204 | } 205 | } 206 | 207 | // Plan B if nothing to do on chromosome, move to next one: 208 | // This re-initialisation is needed because the default init value is 1 for the WiggleIterator 209 | wi->value = 0; 210 | wi->chrom = (char *) calloc(strlen(data->chrom) + 1, sizeof(char)); 211 | 
strcpy(wi->chrom, data->chrom); 212 | 213 | loadNextReadsOnChrom(wi); 214 | 215 | if (!hashfib_empty(data->ends)) { 216 | stepForward(wi); 217 | return; 218 | } 219 | 220 | // Plan C: just quit it 221 | fclose(data->file); 222 | data->file = NULL; 223 | wi->done = true; 224 | } 225 | } 226 | 227 | void SamReaderSeek(WiggleIterator * wi, const char * chrom, int start, int finish) { 228 | SamReaderData * data = (SamReaderData*) wi->data; 229 | 230 | if (data->file == stdin) { 231 | fprintf(stderr, "Cannot do a seek on stdin stream!\n"); 232 | exit(1); 233 | } 234 | 235 | // Set targets 236 | data->stop = finish; 237 | data->target_chrom = chrom; 238 | 239 | // Possibly start reading file from the top 240 | if (!data->file || strcmp(chrom, wi->chrom) < 0 || (strcmp(chrom, wi->chrom) == 0 && start < wi->start)) { 241 | if (data->file) 242 | fclose(data->file); 243 | if (!(data->file = fopen(data->filename, "r"))) { 244 | fprintf(stderr, "Could not open input file %s\n", data->filename); 245 | exit(1); 246 | } 247 | wi->done = false; 248 | // This is needed to avoid triggering the out of order check in the readLine below 249 | data->chrom[0] = '\0'; 250 | wi->chrom = NULL; 251 | 252 | // Reset coverage data structures 253 | if (!data->read_count) { 254 | hashfib_destroy(data->starts); 255 | hashfib_destroy(data->ends); 256 | data->starts = hashfib_construct(); 257 | data->ends = hashfib_construct(); 258 | } 259 | data->done = false; 260 | readLine(data); 261 | 262 | pop(wi); 263 | } 264 | 265 | 266 | // Move to starting point 267 | while (!wi->done && (strcmp(wi->chrom, chrom) < 0 || (strcmp(chrom, wi->chrom) == 0 && wi->finish < start))) 268 | pop(wi); 269 | 270 | // Trim very first position 271 | if (!wi->done && strcmp(chrom, wi->chrom) == 0 && wi->start < start) 272 | wi->start = start; 273 | } 274 | 275 | WiggleIterator * SamReader(char * filename, bool read_count) { 276 | SamReaderData * data = (SamReaderData *) calloc(1, sizeof(SamReaderData)); 277 | 
data->filename = filename; 278 | data->stop = -1; 279 | data->read_count = read_count; 280 | if (!read_count) { 281 | data->starts = hashfib_construct(); 282 | data->ends = hashfib_construct(); 283 | } 284 | if (strcmp(filename, "-")) { 285 | if (!(data->file = fopen(filename, "r"))) { 286 | fprintf(stderr, "Could not open input file %s\n", filename); 287 | exit(1); 288 | } 289 | } else 290 | data->file = stdin; 291 | readLine(data); 292 | 293 | return newWiggleIterator(data, &SamReaderPop, &SamReaderSeek, 0, false); 294 | } 295 | -------------------------------------------------------------------------------- /src/setComparisons.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include "multiSet.h" 20 | 21 | typedef struct setComparisonData_st { 22 | Multiset * multi; 23 | } SetComparisonData; 24 | 25 | void SetComparisonSeek(WiggleIterator * iter, const char * chrom, int start, int finish) { 26 | SetComparisonData * data = (SetComparisonData* ) iter->data; 27 | seekMultiset(data->multi, chrom, start, finish); 28 | pop(iter); 29 | } 30 | 31 | //////////////////////////////////////////////////////// 32 | // T-test 33 | //////////////////////////////////////////////////////// 34 | 35 | void TTestReductionPop(WiggleIterator * wi) { 36 | if (wi->done) 37 | return; 38 | 39 | SetComparisonData * data = (SetComparisonData *) wi->data; 40 | Multiset * multi = data->multi; 41 | 42 | if (multi->done) { 43 | wi->done = true; 44 | return; 45 | } 46 | 47 | // Go to first position where both of the sets have at least one value 48 | while (!multi->inplay[0] || !multi->inplay[1]) { 49 | popMultiset(multi); 50 | if (multi->done) { 51 | wi->done = true; 52 | return; 53 | } 54 | } 55 | wi->chrom = multi->chrom; 56 | wi->start = multi->start; 57 | wi->finish = multi->finish; 58 | 59 | // Compute measurements 60 | double sum1, sum2, sumSq1, sumSq2; 61 | int count1, count2; 62 | int index; 63 | 64 | sum1 = sum2 = 0; 65 | sumSq1 = sumSq2 = 0; 66 | count1 = multi->multis[0]->count; 67 | count2 = multi->multis[1]->count; 68 | 69 | for (index = 0; index < multi->multis[0]->count; index++) { 70 | if (multi->multis[0]->inplay[index]) { 71 | sum1 += multi->values[0][index]; 72 | sumSq1 += multi->values[0][index] * multi->values[0][index]; 73 | } 74 | } 75 | 76 | for (index = 0; index < multi->multis[1]->count; index++) { 77 | if (multi->multis[1]->inplay[index]) { 78 | sum2 += multi->values[1][index]; 79 | sumSq2 += multi->values[1][index] * multi->values[1][index]; 80 | } 81 | } 82 | 83 | // To avoid divisions by 0: 84 | if (count1 == 0 || count2 == 0) { 85 | wi->value = NAN; 86 | popMultiset(multi); 87 | 
return; 88 | } 89 | 90 | double mean1 = sum1 / count1; 91 | double mean2 = sum2 / count2; 92 | double meanSq1 = sumSq1 / count1; 93 | double meanSq2 = sumSq2 / count2; 94 | double var1 = meanSq1 - mean1 * mean1; 95 | double var2 = meanSq2 - mean2 * mean2; 96 | 97 | // To avoid divisions by 0: 98 | if (var1 + var2 == 0) { 99 | wi->value = NAN; 100 | popMultiset(multi); 101 | return; 102 | } 103 | 104 | // T-statistic 105 | 106 | double t = (mean1 - mean2) / sqrt(var1 / count1 + var2 / count2); 107 | 108 | if (t < 0) 109 | t = -t; 110 | 111 | // Degrees of freedom 112 | 113 | double nu = (var1 / count1 + var2 / count2) * (var1 / count1 + var2 / count2) / ((var1 * var1) / (count1 * count1 * (count1 - 1)) + (var2 * var2) / (count2 * count2 * (count2 - 1))); 114 | 115 | // P-value 116 | 117 | wi->value = 2 * gsl_cdf_tdist_Q(t, nu); 118 | 119 | // Update inputs 120 | popMultiset(multi); 121 | } 122 | 123 | WiggleIterator * TTestReduction(Multiset * multi) { 124 | SetComparisonData * data = (SetComparisonData *) calloc(1, sizeof(SetComparisonData)); 125 | if (multi->count != 2 || multi->multis[0]->count < 3 || multi->multis[1]->count < 3) { 126 | puts("The t-test function only works for two sets with enough elements to compute variance"); 127 | exit(1); 128 | } 129 | data->multi = multi; 130 | return newWiggleIterator(data, &TTestReductionPop, &SetComparisonSeek, NAN, false); 131 | } 132 | 133 | 134 | //////////////////////////////////////////////////////// 135 | // F-test 136 | //////////////////////////////////////////////////////// 137 | 138 | typedef struct ftestData_st { 139 | Multiset * multi; 140 | int * counts; 141 | double * means; 142 | int total_count; 143 | } FTestData; 144 | 145 | void FTestSeek(WiggleIterator * iter, const char * chrom, int start, int finish) { 146 | FTestData * data = (FTestData* ) iter->data; 147 | seekMultiset(data->multi, chrom, start, finish); 148 | pop(iter); 149 | } 150 | 151 | 152 | void FTestReductionPop(WiggleIterator * wi) { 153 | 
if (wi->done) 154 | return; 155 | 156 | FTestData * data = (FTestData *) wi->data; 157 | Multiset * multi = data->multi; 158 | 159 | if (multi->done) { 160 | wi->done = true; 161 | return; 162 | } 163 | 164 | // Go to first position where both of the sets have at least one value 165 | while (!multi->inplay[0] || !multi->inplay[1]) { 166 | popMultiset(multi); 167 | if (multi->done) { 168 | wi->done = true; 169 | return; 170 | } 171 | } 172 | wi->chrom = multi->chrom; 173 | wi->start = multi->start; 174 | wi->finish = multi->finish; 175 | 176 | // Compute means 177 | double mean = 0; 178 | int groups = multi->count; 179 | int index, index2; 180 | for (index = 0; index < groups; index++) { 181 | Multiplexer * mplx = multi->multis[index]; 182 | data->means[index] = 0; 183 | for (index2 = 0; index < mplx->count; index++) { 184 | if (mplx->inplay[index2]) 185 | data->means[index] += mplx->values[index2]; 186 | else 187 | data->means[index] += mplx->iters[index2]->default_value; 188 | } 189 | mean += data->means[index]; 190 | data->means[index] /= data->counts[index]; 191 | } 192 | mean /= data->total_count; 193 | 194 | double inter = 0; 195 | double intra = 0; 196 | for (index = 0; index < groups; index++) { 197 | Multiplexer * mplx = multi->multis[index]; 198 | inter += mplx->count * (data->means[index] - mean) * (data->means[index] - mean); 199 | for (index2 = 0; index < mplx->count; index++) { 200 | if (mplx->inplay[index2]) 201 | intra += (mplx->values[index2] - data->means[index]) * (mplx->values[index2] - data->means[index]); 202 | else 203 | intra += (mplx->iters[index2]->default_value - data->means[index]) * (mplx->iters[index2]->default_value - data->means[index]); 204 | } 205 | } 206 | 207 | // F-statistic 208 | inter /= groups - 1; 209 | intra /= data->total_count - groups; 210 | double f = inter / intra; 211 | 212 | // P-value 213 | wi->value = 2 * gsl_cdf_fdist_Q(f, multi->count - 1, data->total_count - multi->count); 214 | 215 | // Update inputs 216 | 
popMultiset(multi); 217 | } 218 | 219 | WiggleIterator * FTestReduction(Multiset * multi) { 220 | FTestData * data = (FTestData *) calloc(1, sizeof(FTestData)); 221 | data->multi = multi; 222 | data->means = calloc(multi->count, sizeof(double)); 223 | data->counts = calloc(multi->count, sizeof(int)); 224 | int index; 225 | for (index = 0; index < multi->count; index++) { 226 | data->counts[index] = multi->multis[index]->count; 227 | data->total_count += data->counts[index]; 228 | } 229 | return newWiggleIterator(data, &FTestReductionPop, &FTestSeek, NAN, false); 230 | } 231 | 232 | //////////////////////////////////////////////////////// 233 | // Mann-Whitney U (Wilcoxon rank-sum test) 234 | //////////////////////////////////////////////////////// 235 | 236 | typedef struct valueSetPair_st { 237 | double value; 238 | bool set; 239 | } ValueSetPair; 240 | 241 | typedef struct mwuData_st { 242 | Multiset * multi; 243 | int n1; 244 | int n2; 245 | int N; 246 | // Pre-allocated table for sorting 247 | ValueSetPair * rankingTable; 248 | // For normal approximation 249 | bool normalApproximation; 250 | double mu_U, sigma_U; 251 | } MWUData; 252 | 253 | void MWUSeek(WiggleIterator * iter, const char * chrom, int start, int finish) { 254 | MWUData * data = (MWUData* ) iter->data; 255 | seekMultiset(data->multi, chrom, start, finish); 256 | pop(iter); 257 | } 258 | 259 | static int compareValueSetPairs(const void * A, const void * B) { 260 | ValueSetPair * vspA = (ValueSetPair *) A; 261 | ValueSetPair * vspB = (ValueSetPair *) B; 262 | if (vspA->value < vspB->value) 263 | return -1; 264 | if (vspA->value > vspB->value) 265 | return 1; 266 | return 0; 267 | } 268 | 269 | void MWUReductionPop(WiggleIterator * wi) { 270 | if (wi->done) 271 | return; 272 | 273 | MWUData * data = (MWUData *) wi->data; 274 | Multiset * multi = data->multi; 275 | 276 | if (multi->done) { 277 | wi->done = true; 278 | return; 279 | } 280 | 281 | // Go to first position where both of the sets have at 
least one value 282 | while (!multi->inplay[0] || !multi->inplay[1]) { 283 | popMultiset(multi); 284 | if (multi->done) { 285 | wi->done = true; 286 | return; 287 | } 288 | } 289 | wi->chrom = multi->chrom; 290 | wi->start = multi->start; 291 | wi->finish = multi->finish; 292 | 293 | // Compute measurements 294 | int index; 295 | ValueSetPair * vspPtr = data->rankingTable; 296 | 297 | for (index = 0; index < data->n1; index++) { 298 | if (multi->multis[0]->inplay[index]) 299 | vspPtr->value = multi->values[0][index]; 300 | else 301 | vspPtr->value = multi->multis[0]->iters[index]->default_value; 302 | if (isnan(vspPtr->value)) { 303 | wi->value = NAN; 304 | popMultiset(multi); 305 | return; 306 | } 307 | vspPtr->set = false; 308 | vspPtr++; 309 | } 310 | 311 | for (index = 0; index < data->n2; index++) { 312 | if (multi->multis[1]->inplay[index]) 313 | vspPtr->value = multi->values[1][index]; 314 | else 315 | vspPtr->value = multi->multis[1]->iters[index]->default_value; 316 | if (isnan(vspPtr->value)) { 317 | wi->value = NAN; 318 | popMultiset(multi); 319 | return; 320 | } 321 | vspPtr->set = true; 322 | vspPtr++; 323 | } 324 | 325 | qsort(data->rankingTable, data->N, sizeof(ValueSetPair), compareValueSetPairs); 326 | 327 | // Sum of ranks of elements of set 1 328 | double U1 = 0; 329 | // Rolling count of elements of set 1 seen prior on the list 330 | int prev = 0; 331 | // Warns you when you have a tie with the previously visited Value-Set pairs 332 | int ties = 0; 333 | int previousTies = 0; 334 | vspPtr = data->rankingTable; 335 | for (index = 0; index < data->N && prev < data->n1; index++) { 336 | if (!vspPtr->set) { 337 | U1 += index - prev; 338 | if (ties) { 339 | // Look for ties on the table prior to the current position and after the last occurence of an element of set 1. 
340 | int index2; 341 | for (index2 = index + 1; index2 < data->N && data->rankingTable[index2].value == vspPtr->value && data->rankingTable[index2].set; index2++) 342 | previousTies++; 343 | U1 -= previousTies / 2.0; 344 | U1 += (ties - previousTies) / 2.0; 345 | if (previousTies == ties) 346 | previousTies = ties = 0; 347 | } else { 348 | int index2; 349 | // Look for ties with next values 350 | for (index2 = index + 1; index2 < data->N && data->rankingTable[index2].value == vspPtr->value; index2++) 351 | if (data->rankingTable[index2].set) 352 | ties++; 353 | if (ties) 354 | U1 += ties / 2.0; 355 | } 356 | prev++; 357 | } 358 | vspPtr++; 359 | } 360 | 361 | if (data->normalApproximation) { 362 | if (U1 > data->mu_U) 363 | wi->value = 2 * erf((data->mu_U - U1) / data->sigma_U); 364 | else 365 | wi->value = 2 * erf((U1 - data->mu_U) / data->sigma_U); 366 | } 367 | 368 | // Update inputs 369 | popMultiset(multi); 370 | } 371 | 372 | WiggleIterator * MWUReduction(Multiset * multi) { 373 | MWUData * data = (MWUData *) calloc(1, sizeof(MWUData)); 374 | if (multi->count != 2 || multi->multis[0]->count == 0 || multi->multis[1]->count == 0) { 375 | puts("The Mann-Whitney U function only works for two non-empty sets"); 376 | exit(1); 377 | } 378 | data->multi = multi; 379 | data->n1 = multi->multis[0]->count; 380 | data->n2 = multi->multis[1]->count; 381 | data->N = data->n1 + data->n2; 382 | data->rankingTable = calloc(data->N, sizeof(ValueSetPair)); 383 | if (true) { 384 | // Ideally, tables could be used for small values of n1 and n2 385 | data->normalApproximation = true; 386 | data->mu_U = data->n1 * data->n2 / 2; 387 | data->sigma_U = sqrt(data->n1 * data->n2 * (data->n1 + data->n2 + 1) / 12); 388 | } 389 | return newWiggleIterator(data, &MWUReductionPop, &MWUSeek, NAN, false); 390 | } 391 | -------------------------------------------------------------------------------- /src/vcfReader.c: 
-------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | 18 | #include "wiggleIterator.h" 19 | 20 | typedef struct bedReaderData_st { 21 | char *filename; 22 | FILE * file; 23 | char * chrom; 24 | int stop; 25 | } VcfReaderData; 26 | 27 | void VcfReaderPop(WiggleIterator * wi) { 28 | VcfReaderData * data = (VcfReaderData *) wi->data; 29 | char line[5000]; 30 | char chrom[1000]; 31 | 32 | if (wi->done) 33 | return; 34 | 35 | while (fgets(line, 5000, data->file)) { 36 | if (line[0] != '#') { 37 | sscanf(line, "%s\t%i", chrom, &wi->start); 38 | 39 | wi->finish = wi->start + 1; 40 | 41 | // The reason for creating a new string instead of simply 42 | // overwriting is that other functions may still be pointin 43 | // at the old label 44 | if (wi->chrom[0] == '\0' || strcmp(wi->chrom, chrom)) { 45 | wi->chrom = (char *) calloc(strlen(chrom), sizeof(char)); 46 | strcpy(wi->chrom, chrom); 47 | } 48 | 49 | if (data->stop > 0) { 50 | if ((wi->start >= data->stop && strcmp(wi->chrom, data->chrom) == 0) || strcmp(wi->chrom, data->chrom) > 0) { 51 | wi->done = true; 52 | return; 53 | } else if (wi->finish > data->stop) { 54 | wi->finish = data->stop; 55 | } 56 | } 57 | return; 58 | } 59 | } 60 | 61 | fclose(data->file); 62 | data->file = NULL; 63 | 
wi->done = true; 64 | } 65 | 66 | void VcfReaderSeek(WiggleIterator * wi, const char * chrom, int start, int finish) { 67 | VcfReaderData * data = (VcfReaderData*) wi->data; 68 | 69 | data->stop = finish; 70 | data->chrom = chrom; 71 | 72 | if (wi->done || strcmp(chrom, wi->chrom) < 0 || (strcmp(chrom, wi->chrom) == 0 && start < wi->start)) { 73 | if (data->file) 74 | fclose(data->file); 75 | if (!(data->file = fopen(data->filename, "r"))) { 76 | fprintf(stderr, "Could not open input file %s\n", data->filename); 77 | exit(1); 78 | } 79 | wi->done = false; 80 | pop(wi); 81 | } 82 | 83 | while (!wi->done && (strcmp(wi->chrom, chrom) < 0 || (strcmp(chrom, wi->chrom) == 0 && wi->finish < start))) 84 | pop(wi); 85 | 86 | if (!wi->done && strcmp(chrom, wi->chrom) == 0 && wi->start < start) 87 | wi->start = start; 88 | } 89 | 90 | WiggleIterator * VcfReader(char * filename) { 91 | VcfReaderData * data = (VcfReaderData *) calloc(1, sizeof(VcfReaderData)); 92 | data->filename = filename; 93 | data->stop = -1; 94 | if (!(data->file = fopen(filename, "r"))) { 95 | fprintf(stderr, "Could not open bed file %s\n", filename); 96 | exit(1); 97 | } 98 | return newWiggleIteratorChromName(data, &VcfReaderPop, &VcfReaderSeek, 0, true); 99 | } 100 | -------------------------------------------------------------------------------- /src/wigReader.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include "wiggleIterator.h" 20 | 21 | ////////////////////////////////////////////////////// 22 | // File Reader 23 | ////////////////////////////////////////////////////// 24 | 25 | enum readingMode {FIXED_STEP, VARIABLE_STEP, BED_GRAPH}; 26 | 27 | typedef struct wiggleReaderData_st { 28 | char * filename; 29 | FILE * file; 30 | enum readingMode readingMode; 31 | int step; 32 | int span; 33 | char words[5]; 34 | char * chrom; 35 | int stop; 36 | } WiggleReaderData; 37 | 38 | 39 | static void WiggleReaderReadHeader(WiggleIterator * wi, WiggleReaderData * data, char * line) { 40 | bool chrom_b = true; 41 | bool start_b = true; 42 | bool step_b = true; 43 | const char * seps = " \t="; 44 | char * token = strtok(line, seps); 45 | 46 | // Default 47 | data->span = 1; 48 | 49 | // Reading following parameters 50 | token = strtok(NULL, seps); 51 | while(token) { 52 | if (!strcmp(token, "chrom")) { 53 | chrom_b = false; 54 | token = strtok(NULL, seps); 55 | if (!token) { 56 | fprintf(stderr, "Empty wi->chromosome name!\n"); 57 | exit(1); 58 | } 59 | if (strcmp(wi->chrom, token)) { 60 | wi->chrom = malloc(strlen(token) + 1); 61 | strcpy(wi->chrom, token); 62 | } 63 | } 64 | if (!strcmp(token, "start")) { 65 | start_b = false; 66 | token = strtok(NULL, seps); 67 | if (!token) { 68 | fprintf(stderr, "Empty wi->start position!\n"); 69 | exit(1); 70 | } 71 | sscanf(token, "%i", &(wi->start)); 72 | } 73 | if (!strcmp(token, "span")) { 74 | token = strtok(NULL, seps); 75 | if (!token) { 76 | fprintf(stderr, "Empty span length!\n"); 77 | exit(1); 78 | } 79 | sscanf(token, "%i", &(data->span)); 80 | } 81 | if (!strcmp(token, "step")) { 82 | step_b = false; 83 | if (data->readingMode == VARIABLE_STEP) { 84 | fprintf(stderr, "Cannot specify step length on a variable length track\n"); 85 | exit(1); 86 | } 87 | 
token = strtok(NULL, seps); 88 | if (!token) { 89 | fprintf(stderr, "Empty step length!\n"); 90 | exit(1); 91 | } 92 | sscanf(token, "%i", &(data->step)); 93 | } 94 | token = strtok(NULL, seps); 95 | } 96 | 97 | // Checking that all compulsory fields were filled: 98 | if ((data->readingMode == FIXED_STEP && (chrom_b || start_b || step_b)) || (data->readingMode == VARIABLE_STEP && chrom_b)) { 99 | fprintf(stderr, "Invalid header, missing data: %s\n", line); 100 | exit(1); 101 | } 102 | 103 | // Backing off so as not to offset the first line 104 | if (data->readingMode == FIXED_STEP) 105 | wi->start -= data->step; 106 | } 107 | 108 | static void WiggleReaderReadFixedStepLine(WiggleIterator * wi, char * line, int step, int span) { 109 | sscanf(line, "%lf", &(wi->value)); 110 | wi->start += step; 111 | wi->finish = wi->start + span; 112 | } 113 | 114 | static void WiggleReaderReadVariableStepLine(WiggleIterator * wi, char * line, int span) { 115 | sscanf(line, "%i\t%lf", &(wi->start), &(wi->value)); 116 | wi->finish = wi->start + span; 117 | } 118 | 119 | static void WiggleReaderReadBedGraphLine(WiggleIterator * wi, char * line) { 120 | char * buffer = calloc(500, sizeof(char)); 121 | sscanf(line, "%s\t%i\t%i\t%lf", buffer, &(wi->start), &(wi->finish), &(wi->value)); 122 | // BedGraphs are 0 based, half open 123 | wi->start++; 124 | wi->finish++; 125 | if (strcmp(buffer, wi->chrom)) 126 | wi->chrom = buffer; 127 | else 128 | free(buffer); 129 | } 130 | 131 | static int countWords(char * line) { 132 | int count = 0; 133 | char * ptr; 134 | 135 | if (line[0] != ' ' && line[0] != '\t') 136 | count++; 137 | 138 | for (ptr = line; *ptr; ptr++) 139 | if (*ptr == ' ' || *ptr == '\t') 140 | count++; 141 | 142 | return count; 143 | } 144 | 145 | static void WiggleReaderPop(WiggleIterator * wi) { 146 | WiggleReaderData * data = (WiggleReaderData*) wi->data; 147 | char line[5000]; 148 | 149 | if (wi->done) 150 | return; 151 | 152 | while (fgets(line, 5000, data->file)) { 153 | if 
(line[0] == '#' || line[0] == EOF) 154 | continue; 155 | else if ( !strncmp("variableStep", line, 12)) { 156 | data->readingMode = VARIABLE_STEP; 157 | WiggleReaderReadHeader(wi, data, line); 158 | continue; 159 | } else if (!strncmp("fixedStep", line, 9)) { 160 | data->readingMode = FIXED_STEP; 161 | WiggleReaderReadHeader(wi, data, line); 162 | continue; 163 | } else if (!strncmp("track", line, 5)) { 164 | continue; 165 | } 166 | 167 | switch (countWords(line)) { 168 | case 4: 169 | data->readingMode = BED_GRAPH; 170 | WiggleReaderReadBedGraphLine(wi, line); 171 | break; 172 | case 2: 173 | if (data->readingMode != VARIABLE_STEP) { 174 | fprintf(stderr, "Badly formatted fixed step line:\n%s", line); 175 | exit(1); 176 | } 177 | WiggleReaderReadVariableStepLine(wi, line,data->span); 178 | break; 179 | case 1: 180 | if (data->readingMode != FIXED_STEP) { 181 | fprintf(stderr, "Badly formatted variable step line:\n%s", line); 182 | exit(1); 183 | } 184 | WiggleReaderReadFixedStepLine(wi, line, data->step, data->span); 185 | break; 186 | default: 187 | fprintf(stderr, "Badly formatted wiggle or bed graph line :\n%s", line); 188 | exit(1); 189 | 190 | } 191 | 192 | if (data->stop > 0) { 193 | int comparison = strcmp(wi->chrom, data->chrom); 194 | if (comparison == 0) { 195 | if (wi->start >= data->stop) { 196 | wi->done = true; 197 | } else if (wi->finish > data->stop) { 198 | wi->finish = data->stop; 199 | } 200 | } else if (comparison > 0) { 201 | wi->done = true; 202 | } 203 | } 204 | 205 | return; 206 | } 207 | fclose(data->file); 208 | data->file = NULL; 209 | wi->done = true; 210 | } 211 | 212 | void WiggleReaderSeek(WiggleIterator * wi, const char * chrom, int start, int finish) { 213 | WiggleReaderData * data = (WiggleReaderData*) wi->data; 214 | 215 | data->stop = finish; 216 | data->chrom = chrom; 217 | 218 | if (!data->file || strcmp(chrom, wi->chrom) < 0 || (strcmp(chrom, wi->chrom) == 0 && start < wi->start)) { 219 | if (data->file) 220 | 
fclose(data->file); 221 | if (!(data->file = fopen(data->filename, "r"))) { 222 | fprintf(stderr, "Cannot open input file %s\n", data->filename); 223 | exit(1); 224 | } 225 | wi->done = false; 226 | pop(wi); 227 | } 228 | 229 | while (!wi->done && (strcmp(wi->chrom, chrom) < 0 || (strcmp(wi->chrom, chrom) == 0 && wi->finish <= start))) 230 | pop(wi); 231 | 232 | if (!wi->done && strcmp(chrom, wi->chrom) == 0 && wi->start < start) 233 | wi->start = start; 234 | } 235 | 236 | WiggleIterator * WiggleReader(char * f) { 237 | WiggleReaderData * data = (WiggleReaderData *) calloc(1, sizeof(WiggleReaderData)); 238 | data->filename = f; 239 | if (strcmp(f, "-")) { 240 | if (!(data->file = fopen(f, "r"))) { 241 | fprintf(stderr, "Could not open input file %s\n", f); 242 | exit(1); 243 | } 244 | } else 245 | data->file = stdin; 246 | data->readingMode = BED_GRAPH; 247 | data->stop = -1; 248 | return CompressionWiggleIterator(newWiggleIteratorChromName(data, &WiggleReaderPop, &WiggleReaderSeek, 0, false)); 249 | } 250 | -------------------------------------------------------------------------------- /src/wigWriter.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | // Local header 22 | #include "wiggleIterator.h" 23 | 24 | ////////////////////////////////////////////////////// 25 | // Tee operator 26 | ////////////////////////////////////////////////////// 27 | 28 | #define BLOCK_LENGTH 10000 29 | #define MAX_OUT_BLOCKS 2 30 | 31 | typedef struct BlockData_st { 32 | char * chroms[BLOCK_LENGTH]; 33 | int starts[BLOCK_LENGTH]; 34 | int finishes[BLOCK_LENGTH]; 35 | double values[BLOCK_LENGTH]; 36 | int count; 37 | bool bedGraph; 38 | struct BlockData_st * next; 39 | } BlockData; 40 | 41 | typedef struct TeeWiggleIteratorData_st { 42 | FILE * infile; 43 | FILE * outfile; 44 | WiggleIterator * iter; 45 | BlockData * dataBlocks; 46 | BlockData * lastBlock; 47 | int count; 48 | pthread_t threadID; 49 | pthread_mutex_t continue_mutex; 50 | pthread_cond_t continue_cond; 51 | bool done; 52 | bool bedGraph; 53 | } TeeWiggleIteratorData; 54 | 55 | static void printBlock(FILE * infile, FILE * outfile, BlockData * block) { 56 | int i, j; 57 | bool pointByPoint = false; 58 | bool makeHeader=false; 59 | char ** chromPtr = block->chroms; 60 | int * startPtr = block->starts; 61 | int * finishPtr = block->finishes; 62 | double * valuePtr = block->values; 63 | char * lastChrom = NULL; 64 | int lastFinish = -1; 65 | char buffer[5000]; 66 | 67 | for (i = 0; i < block->count; i++) { 68 | // Change mode 69 | if (!block->bedGraph && *finishPtr - *startPtr < 2 && !pointByPoint) { 70 | pointByPoint = true; 71 | makeHeader = true; 72 | } else if (*finishPtr - *startPtr > 5 && pointByPoint) { 73 | pointByPoint = false; 74 | } 75 | 76 | if (pointByPoint) { 77 | if (makeHeader || (pointByPoint && (lastChrom != *chromPtr || *startPtr > lastFinish))) 78 | fprintf(outfile, "fixedStep chrom=%s start=%i step=1\n", *chromPtr, *startPtr); 79 | makeHeader = false; 80 | for (j = 0; j < *finishPtr - *startPtr; j++) 81 | fprintf(outfile, "%lf\n", *valuePtr); 82 | } else if (!infile) 
83 | // Careful bedgraph lines are 0 based 84 | fprintf(outfile, "%s\t%i\t%i\t%lf\n", *chromPtr, *startPtr-1, *finishPtr-1, *valuePtr); 85 | else { 86 | // Read next line in infile 87 | if (!fgets(buffer, 5000, infile)) { 88 | fprintf(stderr, "Could not paste data to file lines, inconsistent number of lines.\n"); 89 | exit(1); 90 | } 91 | 92 | // Skip empty lines and metadata lines: 93 | while (! (strlen(buffer) && strncmp(buffer, "track", 5) && strncmp(buffer, "browser", 7))) { 94 | if (!fgets(buffer, 5000, infile)) { 95 | fprintf(stderr, "Could not paste data to file lines, inconsistent number of lines.\n"); 96 | exit(1); 97 | } 98 | } 99 | 100 | // Strip end of line symbols 101 | int i; 102 | for (i = strlen(buffer)-1; i >= 0; i--) { 103 | if (buffer[i] == '\n' || buffer[i] == '\r') 104 | buffer[i] = '\0'; 105 | else 106 | break; 107 | } 108 | // Print out 109 | fprintf(outfile, "%s\t%lf\n", buffer, *valuePtr); 110 | } 111 | 112 | lastChrom = *chromPtr; 113 | lastFinish = *finishPtr; 114 | chromPtr++; 115 | startPtr++; 116 | finishPtr++; 117 | valuePtr++; 118 | } 119 | } 120 | 121 | static bool goToNextBlock(TeeWiggleIteratorData * data) { 122 | BlockData * ptr = data->dataBlocks; 123 | static int i = 0; 124 | i++; 125 | 126 | pthread_mutex_lock(&data->continue_mutex); 127 | // Received kill signal 128 | if (data->count < 0) 129 | return true; 130 | 131 | // Check that there is work left 132 | data->count--; 133 | if (data->count == 0 && !data->done) 134 | pthread_cond_wait(&data->continue_cond, &data->continue_mutex); 135 | pthread_cond_signal(&data->continue_cond); 136 | pthread_mutex_unlock(&data->continue_mutex); 137 | 138 | // Step forward 139 | data->dataBlocks = data->dataBlocks->next; 140 | free(ptr); 141 | return false; 142 | } 143 | 144 | static void * printToFile(void * args) { 145 | TeeWiggleIteratorData * data = (TeeWiggleIteratorData *) args; 146 | 147 | // Wait for first block to arrive 148 | pthread_mutex_lock(&data->continue_mutex); 149 | if 
(data->count == 0 && !data->done) 150 | pthread_cond_wait(&data->continue_cond, &data->continue_mutex); 151 | pthread_mutex_unlock(&data->continue_mutex); 152 | 153 | if (data->count < 0) 154 | return NULL; 155 | 156 | while(data->dataBlocks) { 157 | printBlock(data->infile, data->outfile, data->dataBlocks); 158 | if (goToNextBlock(data)) 159 | return NULL; 160 | } 161 | return NULL; 162 | } 163 | 164 | void TeeWiggleIteratorPop(WiggleIterator * wi) { 165 | TeeWiggleIteratorData * data = (TeeWiggleIteratorData *) wi->data; 166 | WiggleIterator * iter = data->iter; 167 | if (!data->iter->done) { 168 | wi->chrom = iter->chrom; 169 | wi->start = iter->start; 170 | wi->finish = iter->finish; 171 | wi->value = iter->value; 172 | 173 | if (data->threadID) { 174 | int index = data->lastBlock->count; 175 | data->lastBlock->chroms[index] = iter->chrom; 176 | data->lastBlock->starts[index] = iter->start; 177 | data->lastBlock->finishes[index] = iter->finish; 178 | data->lastBlock->values[index] = iter->value; 179 | if (++data->lastBlock->count >= BLOCK_LENGTH) { 180 | // Communications 181 | pthread_mutex_lock(&data->continue_mutex); 182 | data->count++; 183 | pthread_cond_signal(&data->continue_cond); 184 | if (data->count > MAX_OUT_BLOCKS) 185 | pthread_cond_wait(&data->continue_cond, &data->continue_mutex); 186 | pthread_mutex_unlock(&data->continue_mutex); 187 | 188 | data->lastBlock->next = (BlockData*) calloc(1, sizeof(BlockData)); 189 | data->lastBlock = data->lastBlock->next; 190 | data->lastBlock->bedGraph = data->bedGraph; 191 | } 192 | } 193 | pop(iter); 194 | } else if (data->threadID) { 195 | pthread_mutex_lock(&data->continue_mutex); 196 | data->count++; 197 | data->done = true; 198 | pthread_cond_signal(&data->continue_cond); 199 | pthread_mutex_unlock(&data->continue_mutex); 200 | wi->done = true; 201 | pthread_join(data->threadID, NULL); 202 | } 203 | } 204 | 205 | static void launchWriter(TeeWiggleIteratorData * data) { 206 | // Initialize variables 207 | 
data->count = 0; 208 | data->done = false; 209 | pthread_cond_init(&data->continue_cond, NULL); 210 | pthread_mutex_init(&data->continue_mutex, NULL); 211 | data->dataBlocks = data->lastBlock = (BlockData*) calloc(1, sizeof(BlockData)); 212 | data->lastBlock->bedGraph = data->bedGraph; 213 | 214 | // Launch pthread 215 | int err = pthread_create(&data->threadID, NULL, &printToFile, data); 216 | if (err) { 217 | fprintf(stderr, "Could not create new thread %i\n", err); 218 | exit(1); 219 | } 220 | } 221 | 222 | static void killWriter(TeeWiggleIteratorData * data) { 223 | BlockData * block; 224 | 225 | if (!data->threadID) 226 | return; 227 | 228 | // Set trap 229 | pthread_mutex_lock(&data->continue_mutex); 230 | data->count = -1; 231 | pthread_cond_signal(&data->continue_cond); 232 | pthread_mutex_unlock(&data->continue_mutex); 233 | 234 | // Wait for the catch 235 | pthread_join(data->threadID, NULL); 236 | 237 | // Clear variables 238 | pthread_cond_destroy(&data->continue_cond); 239 | pthread_mutex_destroy(&data->continue_mutex); 240 | 241 | while (data->dataBlocks) { 242 | block = data->dataBlocks; 243 | data->dataBlocks = block->next; 244 | free(block); 245 | } 246 | 247 | data->dataBlocks = NULL; 248 | data->lastBlock = NULL; 249 | } 250 | 251 | void TeeWiggleIteratorSeek(WiggleIterator * wi, const char * chrom, int start, int finish) { 252 | TeeWiggleIteratorData * data = (TeeWiggleIteratorData *) wi->data; 253 | killWriter(data); 254 | fflush(data->outfile); 255 | seek(data->iter, chrom, start, finish); 256 | wi->done = false; 257 | launchWriter(data); 258 | pop(wi); 259 | } 260 | 261 | WiggleIterator * TeeWiggleIterator(WiggleIterator * i, FILE * outfile, bool bedGraph, bool holdFire) { 262 | TeeWiggleIteratorData * data = (TeeWiggleIteratorData *) calloc(1, sizeof(TeeWiggleIteratorData)); 263 | if (bedGraph) { 264 | data->iter = i; 265 | } else { 266 | data->iter = CompressionWiggleIterator(i); 267 | } 268 | data->outfile = outfile; 269 | if (bedGraph || 
i->overlaps) 270 | data->bedGraph = true; 271 | // Hold fire means that you wait for the first seek before doing any writing 272 | if (!holdFire) 273 | launchWriter(data); 274 | 275 | return newWiggleIterator(data, &TeeWiggleIteratorPop, &TeeWiggleIteratorSeek, i->default_value, i->overlaps); 276 | } 277 | 278 | void toFile(WiggleIterator * wi, char * filename, bool bedGraph, bool holdFire) { 279 | FILE * file = fopen(filename, "w"); 280 | if (!file) { 281 | fprintf(stderr, "Could not open file %s\n", filename); 282 | exit(1); 283 | } 284 | runWiggleIterator(TeeWiggleIterator(wi, file, bedGraph, holdFire)); 285 | } 286 | 287 | void toStdout(WiggleIterator * wi, bool bedGraph, bool holdFire) { 288 | runWiggleIterator(TeeWiggleIterator(wi, stdout, bedGraph, holdFire)); 289 | } 290 | 291 | ////////////////////////////////////////////////////////// 292 | // Paste Iterator 293 | ////////////////////////////////////////////////////////// 294 | 295 | WiggleIterator * PasteWiggleIterator(WiggleIterator * i, FILE * infile, FILE * outfile, bool holdFire) { 296 | TeeWiggleIteratorData * data = (TeeWiggleIteratorData *) calloc(1, sizeof(TeeWiggleIteratorData)); 297 | data->iter = i; 298 | data->infile = infile; 299 | data->bedGraph = true; 300 | data->outfile = outfile; 301 | // Hold fire means that you wait for the first seek before doing any writing 302 | if (!holdFire) 303 | launchWriter(data); 304 | 305 | return newWiggleIterator(data, &TeeWiggleIteratorPop, NULL, i->default_value, i->overlaps); 306 | } 307 | -------------------------------------------------------------------------------- /src/wiggleIterator.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | 18 | #include "wiggleIterator.h" 19 | 20 | WiggleIterator * newWiggleIterator(void * data, void (*popFunction)(WiggleIterator *), void (*seek)(WiggleIterator *, const char *, int, int), double default_value, bool overlapping) { 21 | WiggleIterator * new = (WiggleIterator *) calloc(1, sizeof(WiggleIterator)); 22 | new->data = data; 23 | new->pop = popFunction; 24 | new->seek = seek; 25 | new->chrom = NULL; 26 | new->value = 1; // Default value for non-valued bed tracks; 27 | new->strand = 0; // Default value for non-stranded data; 28 | new->valuePtr = NULL; 29 | new->overlaps = overlapping; 30 | new->append = NULL; 31 | new->default_value = default_value; 32 | pop(new); 33 | return new; 34 | } 35 | 36 | WiggleIterator * newWiggleIteratorChromName(void * data, void (*popFunction)(WiggleIterator *), void (*seek)(WiggleIterator *, const char *, int, int), double default_value, bool overlapping) { 37 | WiggleIterator * new = (WiggleIterator *) calloc(1, sizeof(WiggleIterator)); 38 | new->data = data; 39 | new->pop = popFunction; 40 | new->seek = seek; 41 | new->chrom = calloc(1000,1); 42 | new->value = 1; // Default value for non-valued bed tracks; 43 | new->strand = 0; // Default value for non-stranded data; 44 | new->valuePtr = NULL; 45 | new->overlaps = overlapping; 46 | new->append = NULL; 47 | new->default_value = default_value; 48 | pop(new); 49 | return new; 50 | } 51 | 52 | void destroyWiggleIterator(WiggleIterator * wi) { 53 | free(wi->data); 54 | free(wi); 55 | } 56 | 57 | 
void pop(WiggleIterator * wi) { 58 | if (!wi->done) 59 | wi->pop(wi); 60 | } 61 | 62 | void runWiggleIterator(WiggleIterator * wi) { 63 | while (!wi->done) 64 | wi->pop(wi); 65 | } 66 | 67 | void seek(WiggleIterator * wi, const char * chrom, int start, int finish) { 68 | wi->done = false; 69 | (*(wi->seek))(wi, chrom, start, finish); 70 | } 71 | -------------------------------------------------------------------------------- /src/wiggleIterator.h: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
// Private header: layout of the iterator structure and the constructors
// shared by the iterator implementations.

#ifndef _WIGGLETOOLS_PRIV_
#define _WIGGLETOOLS_PRIV_

// NOTE(review): the header name on the next line was lost when this listing
// was extracted - restore it from the repository.
#include
#include "wiggletools.h"

struct wiggleIterator_st {
	// Current region: chromosome, start, finish and value
	char * chrom;
	int start;
	int finish;
	double value;
	// Initialised to NULL by the constructors; purpose not visible here
	void * valuePtr;
	// Set once the iterator is exhausted; pop() is a no-op from then on
	bool done;
	// 0 for non-stranded data
	int strand;
	// Implementation-specific state, released by destroyWiggleIterator
	void * data;
	// Callback which advances the iterator to its next region
	void (*pop)(WiggleIterator *);
	// Callback which repositions the iterator on a region (chrom, start, finish)
	void (*seek)(WiggleIterator *, const char *, int, int);
	// Whether the iterator may produce overlapping regions
	bool overlaps;
	double default_value;
	// Initialised to NULL by the constructors; purpose not visible here
	WiggleIterator * append;
};

// Constructors: both call the pop callback once before returning; the
// ChromName variant also allocates a buffer for the chromosome name.
WiggleIterator * newWiggleIterator(void * data, void (*pop)(WiggleIterator *), void (*seek)(WiggleIterator *, const char *, int, int), double default_value, bool overlapping);
WiggleIterator * newWiggleIteratorChromName(void * data, void (*popFunction)(WiggleIterator *), void (*seek)(WiggleIterator *, const char *, int, int), double default_value, bool overlapping);
// Advances an iterator unless it is already done
void pop(WiggleIterator *);
WiggleIterator * CompressionWiggleIterator(WiggleIterator *);

#endif
-------------------------------------------------------------------------------- /src/wiggletools.c: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #include 16 | #include 17 | #include 18 | 19 | // Local header 20 | #include "wiggletools.h" 21 | 22 | int main(int argc, char ** argv) { 23 | if (argc < 2 || strcmp(argv[1], "--help") == 0) { 24 | printHelp(); 25 | return 0; 26 | } 27 | 28 | libBigWigInit(128000); 29 | 30 | rollYourOwn(argc-1, argv+1); 31 | 32 | return 0; 33 | } 34 | 35 | -------------------------------------------------------------------------------- /src/wiggletools.h: -------------------------------------------------------------------------------- 1 | // Copyright [1999-2017] EMBL-European Bioinformatics Institute 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #ifndef _WIGGLETOOLS_DEF_ 16 | #define _WIGGLETOOLS_DEF_ 17 | 18 | #ifndef bool 19 | #define bool char 20 | #define true 1 21 | #define false 0 22 | #endif 23 | 24 | #include 25 | 26 | typedef struct wiggleIterator_st WiggleIterator; 27 | typedef struct multiplexer_st Multiplexer; 28 | typedef struct multiset_st Multiset; 29 | typedef struct histogram_st Histogram; 30 | 31 | // Creators 32 | WiggleIterator * SmartReader (char *, bool); 33 | WiggleIterator * CatWiggleIterator (char **, int); 34 | // Secondary creators (to force file format recognition if necessary) 35 | WiggleIterator * WiggleReader (char *); 36 | WiggleIterator * BigWiggleReader (char *, bool); 37 | WiggleIterator * BedReader (char *); 38 | WiggleIterator * BigBedReader (char *, bool); 39 | WiggleIterator * BamReader (char *, bool, bool); 40 | WiggleIterator * SamReader (char *, bool); 41 | WiggleIterator * VcfReader (char *); 42 | WiggleIterator * BcfReader (char *, bool); 43 | 44 | // Generic class functions 45 | void seek(WiggleIterator *, const char *, int, int); 46 | 47 | // Algebraic operations on iterators 48 | 49 | // Unary 50 | WiggleIterator * UnitWiggleIterator (WiggleIterator *); 51 | WiggleIterator * CoverageWiggleIterator (WiggleIterator *); 52 | WiggleIterator * UnionWiggleIterator (WiggleIterator *); 53 | WiggleIterator * NonOverlappingWiggleIterator (WiggleIterator *); 54 | WiggleIterator * AbsWiggleIterator (WiggleIterator * ); 55 | WiggleIterator * NaturalLogWiggleIterator (WiggleIterator *); 56 | WiggleIterator * NaturalExpWiggleIterator (WiggleIterator *); 57 | WiggleIterator * TestNonOverlappingWiggleIterator(WiggleIterator *); 58 | WiggleIterator * OverlapWiggleIterator(WiggleIterator *, WiggleIterator *); 59 | WiggleIterator * TrimWiggleIterator(WiggleIterator *, WiggleIterator *); 60 | WiggleIterator * NoverlapWiggleIterator(WiggleIterator *, WiggleIterator *); 61 | WiggleIterator * NearestWiggleIterator(WiggleIterator *, WiggleIterator *); 62 | WiggleIterator * 
IsZero(WiggleIterator *); 63 | WiggleIterator * Floor(WiggleIterator *); 64 | WiggleIterator * ToInt(WiggleIterator *); 65 | WiggleIterator * CompressionWiggleIterator(WiggleIterator *); 66 | // Scalar operations 67 | WiggleIterator * ScaleWiggleIterator (WiggleIterator *, double); 68 | WiggleIterator * ShiftWiggleIterator(WiggleIterator *, double); 69 | WiggleIterator * ShiftPosIterator(WiggleIterator *, double); 70 | WiggleIterator * PowerWiggleIterator (WiggleIterator *, double); 71 | WiggleIterator * LogWiggleIterator (WiggleIterator * , double); 72 | WiggleIterator * ExpWiggleIterator (WiggleIterator *, double); 73 | WiggleIterator * DefaultValueWiggleIterator(WiggleIterator *, double); 74 | WiggleIterator * HighPassFilterWiggleIterator(WiggleIterator *, double, bool); 75 | WiggleIterator * SmoothWiggleIterator(WiggleIterator * i, int); 76 | WiggleIterator * BinningWiggleIterator(WiggleIterator * i, int); 77 | WiggleIterator * ExtendWiggleIterator(WiggleIterator * i, int); 78 | 79 | // Sets of iterators 80 | Multiplexer * newMultiplexer(WiggleIterator **, int, bool); 81 | 82 | // Reduction operators on sets 83 | 84 | WiggleIterator * SelectReduction(Multiplexer *, int); 85 | WiggleIterator * MaxReduction ( Multiplexer * ); 86 | WiggleIterator * MinReduction ( Multiplexer * ); 87 | WiggleIterator * SumReduction ( Multiplexer * ); 88 | WiggleIterator * ProductReduction ( Multiplexer * ); 89 | WiggleIterator * MeanReduction ( Multiplexer * ); 90 | WiggleIterator * VarianceReduction ( Multiplexer * ); 91 | WiggleIterator * StdDevReduction ( Multiplexer * ); 92 | WiggleIterator * EntropyReduction ( Multiplexer * ); 93 | WiggleIterator * CVReduction ( Multiplexer * ); 94 | WiggleIterator * MedianReduction ( Multiplexer * ); 95 | WiggleIterator * FillInReduction( Multiplexer * , bool); 96 | 97 | // Sets of sets iterators 98 | Multiset * newMultiset(Multiplexer **, int); 99 | 100 | // Reduction operators on sets of sets: 101 | WiggleIterator * TTestReduction(Multiset 
*); 102 | WiggleIterator * FTestReduction(Multiset *); 103 | WiggleIterator * MWUReduction(Multiset *); 104 | 105 | // Output 106 | void toFile (WiggleIterator *, char *, bool, bool); 107 | void toStdout (WiggleIterator *, bool, bool); 108 | WiggleIterator * TeeWiggleIterator(WiggleIterator *, FILE *, bool, bool); 109 | void runWiggleIterator(WiggleIterator * ); 110 | Multiplexer * TeeMultiplexer(Multiplexer *, FILE *, bool, bool); 111 | void toStdoutMultiplexer (Multiplexer *, bool, bool); 112 | void runMultiplexer(Multiplexer * ); 113 | WiggleIterator * PrintStatisticsWiggleIterator(WiggleIterator * i, FILE * file); 114 | 115 | // Statistics 116 | // Unary 117 | WiggleIterator * AUCIntegrator (WiggleIterator *); 118 | WiggleIterator * MeanIntegrator (WiggleIterator *); 119 | WiggleIterator * MinIntegrator (WiggleIterator *); 120 | WiggleIterator * MaxIntegrator (WiggleIterator *); 121 | WiggleIterator * VarianceIntegrator (WiggleIterator *); 122 | WiggleIterator * StandardDeviationIntegrator (WiggleIterator *); 123 | WiggleIterator * CoefficientOfVariationIntegrator (WiggleIterator *); 124 | WiggleIterator * NDPearsonIntegrator(Multiset *); 125 | WiggleIterator * EnergyIntegrator(WiggleIterator *, int); 126 | void regionProfile(WiggleIterator *, double *, int, int, bool); 127 | void addProfile(double *, double *, int); 128 | // Binary 129 | WiggleIterator * PearsonIntegrator (Multiplexer * multi); 130 | // Histograms 131 | Histogram * histogram(WiggleIterator **, int, int); 132 | void normalize_histogram(Histogram *); 133 | void print_histogram(Histogram *, FILE *); 134 | 135 | // Regional statistics 136 | Multiplexer * ApplyMultiplexer(WiggleIterator *, WiggleIterator * (**statistics)(WiggleIterator *), int count, WiggleIterator *, bool strict); 137 | Multiplexer * ProfileMultiplexer(WiggleIterator *, int, WiggleIterator *); 138 | Multiplexer * PasteMultiplexer(Multiplexer *, FILE *, FILE *, bool); 139 | 140 | // Cleaning up 141 | void destroyWiggleIterator 
(WiggleIterator *); 142 | 143 | // Big file params 144 | void libBigWigInit(int); 145 | 146 | // Command line parser 147 | void rollYourOwn(int argc, char ** argv); 148 | void printHelp(); 149 | 150 | #endif 151 | -------------------------------------------------------------------------------- /technical-supplement.tex: -------------------------------------------------------------------------------- 1 | \documentclass[12pt]{article} 2 | \usepackage{amsmath} 3 | 4 | \begin{document} 5 | 6 | \title{WiggleTools - technical supplement} 7 | \author{Daniel Zerbino} 8 | \maketitle 9 | 10 | \section{Online covariance calculation} 11 | 12 | The following is a minor adaptation of a formula presented by David Will\'e, who came up with the idea. 13 | 14 | The covariance between two series $X = \{x_n\}_{n=1..N}$ and $Y = \{y_n\}_{n=1..N}$ is defined as: 15 | \begin{align} 16 | cov(X,Y) &= E[(X-E[X])(Y-E[Y])] \\ 17 | &= E[XY] - E[X]E[Y] 18 | \end{align} 19 | 20 | When handling very large datasets, WiggleTools processes the data as soon as it is read then discards it, thus obviating the need for large memory machines. Equation (1) is a problem, because you can only know $E[X]$ and $E[Y]$ after going through the entire dataset, so all the data would have to be stored in memory to compute the overall expectation. 21 | 22 | Equation (2) can be computed online. However, computing the sum of $(x_ny_n)_{n=1..N}$, then dividing by $N$ at the end of the run produces a very large number, which is prone to overflow the precision of the CPU given enough datapoints.
23 | 24 | However, if we define $R_n = \sum_{i=1}^n x_iy_i$, $S_n^x = \sum_{i=1}^n x_i$ and $S_n^y = \sum_{i=1}^n y_i$, then we can rewrite equation (2) as: 25 | 26 | $$cov(X,Y) = \frac{R_N-S_N^xS_N^y / N}{N}$$ 27 | 28 | We can therefore compute the covariance online, using the series $\{T^{xy}_n\}_{n=1..N} = \{R_n-S_n^xS_n^y / n\}_{n=1..N}$ 29 | 30 | $$cov(X,Y) = \frac{T^{xy}_N}{N}$$ 31 | 32 | Because it is a difference, $T^{xy}_n$ is less likely to overflow CPU precision than an online calculation of $R_n$. 33 | 34 | We assume that we computed $T^{xy}_n$. Because of the nature of the data processed by WiggleTools, it is common to encounter series of points with identical values. We therefore assume that there exists $k \ge 1$ such that $(x_{n+1}=...=x_{n+k})$ and $(y_{n+1}=...=y_{n+k})$. This gives us: 35 | 36 | \begin{align*} 37 | T^{xy}_{n+k} &= R_{n+k} - \frac{S_{n+k}^xS_{n+k}^y}{n+k} \\ 38 | &= (R_n + kx_{n+1}y_{n+1}) - \frac{(S_n^x + kx_{n+1})(S_n^y+ky_{n+1})}{n+k} \\ 39 | &= R_n + kx_{n+1}y_{n+1} - \frac{S_n^xS_n^y + S_n^ykx_{n+1} + S_n^xky_{n+1} + k^2x_{n+1}y_{n+1}}{n+k} \\ 40 | &= R_n - \frac{S_n^xS_n^y}{n+k} + \left(1 - \frac{k}{n+k}\right)kx_{n+1}y_{n+1} - \frac{S_n^ykx_{n+1} + S_n^xky_{n+1}}{n+k}\\ 41 | &= R_n - S_n^xS_n^y\left(\frac{1}{n}-\frac{k}{n(n+k)}\right) +\frac{n}{n+k}kx_{n+1}y_{n+1} - \frac{S_n^ykx_{n+1} + S_n^xky_{n+1}}{n+k}\\ 42 | &= \left(R_n - S_n^xS_n^y/n\right) + \frac{k}{n(n+k)}S_n^xS_n^y +\frac{nk}{n+k}x_{n+1}y_{n+1} - \frac{k}{n+k}\left(S_n^yx_{n+1} + S_n^xy_{n+1}\right)\\ 43 | &= T^{xy}_n + \frac{k}{n+k}\left(\frac{S_n^xS_n^y}{n} +nx_{n+1}y_{n+1} - S_n^yx_{n+1} - S_n^xy_{n+1}\right)\\ 44 | \end{align*} 45 | 46 | It is thus possible to compute $cov(X,Y)$ online, keeping only $S_n^x$, $S_n^y$, $T^{xy}_n$ and $n$ in memory.
Note that if $n \gg k$, $S_n^x \gg x_{n+1}$ and $S_n^y \gg y_{n+1}$, then writing $\bar{x} = S_n^x/n$ and $\bar{y} = S_n^y/n$ for the running means we obtain: 47 | 48 | $$T^{xy}_{n+k} - T^{xy}_n \approx k\left(\bar{x}\bar{y} +x_{n+1}y_{n+1} - \bar{y}x_{n+1} - \bar{x}y_{n+1}\right) $$ 49 | $$T^{xy}_{n+k} - T^{xy}_n \approx k(x_{n+1} - \bar{x})(y_{n+1} - \bar{y}) $$ 50 | This implies that although $T^{xy}_{n}$ grows in $\mathcal{O}(n)$, the increments per data point are commensurate with the product of the standard deviations of $X$ and $Y$. 51 | 52 | \section{Online variance computation} 53 | 54 | The above formulas can easily be adjusted to computing variance of a series $X = \{x_n\}_{n=1..N}$, since: 55 | $$\sigma^2(X)=cov(X,X)=\frac{T^{xx}_N}{N}$$ 56 | 57 | Where: 58 | $$T^{xx}_0 = 0$$ 59 | and (taking the term in $1/n$ to be zero when $n = 0$): 60 | $$T^{xx}_{n+k} = T^{xx}_n + \frac{k}{n+k}\left(\frac{\left(S_n^x\right)^2}{n} +nx_{n+1}^2 - 2S_n^xx_{n+1}\right)$$ 61 | 62 | It is thus possible to compute $\sigma^2(X)$ online, keeping only $S_n^x$, $T^{xx}_n$ and $n$ in memory. 63 | 64 | \section{Online Pearson correlation computation} 65 | 66 | We apply the same idea to the Pearson correlation: 67 | 68 | \begin{align*} 69 | \rho &= \frac{cov(X,Y)}{\sqrt{\sigma^2(X)\sigma^2(Y)}} \\ 70 | \rho &= \frac{\frac{T^{xy}_N}{N}}{\sqrt{\frac{T^{xx}_N}{N}\frac{T^{yy}_N}{N}}} \\ 71 | \rho &= \frac{T^{xy}_N}{\sqrt{T^{xx}_NT^{yy}_N}} 72 | \end{align*} 73 | 74 | 75 | \end{document} 76 | -------------------------------------------------------------------------------- /test/bam.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/WiggleTools/34c5942f2445ee8a62e0ec24d532e35ab680b5fb/test/bam.bam -------------------------------------------------------------------------------- /test/bam.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/WiggleTools/34c5942f2445ee8a62e0ec24d532e35ab680b5fb/test/bam.bam.bai -------------------------------------------------------------------------------- /test/bcf.bcf:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/WiggleTools/34c5942f2445ee8a62e0ec24d532e35ab680b5fb/test/bcf.bcf -------------------------------------------------------------------------------- /test/bcf.bcf.csi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/WiggleTools/34c5942f2445ee8a62e0ec24d532e35ab680b5fb/test/bcf.bcf.csi -------------------------------------------------------------------------------- /test/chrom_sizes: -------------------------------------------------------------------------------- 1 | chr1 15 2 | chr2 30 3 | -------------------------------------------------------------------------------- /test/cram.cram: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/WiggleTools/34c5942f2445ee8a62e0ec24d532e35ab680b5fb/test/cram.cram -------------------------------------------------------------------------------- /test/cram.cram.crai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/WiggleTools/34c5942f2445ee8a62e0ec24d532e35ab680b5fb/test/cram.cram.crai -------------------------------------------------------------------------------- /test/expected/nearest_fixedStep.bg: -------------------------------------------------------------------------------- 1 | chr1 0 1 0.000000 2 | chr1 1 2 0.000000 3 | chr1 2 3 1.000000 4 | chr1 3 4 0.000000 5 | chr1 4 5 1.000000 6 | chr1 5 6 0.000000 7 | chr1 6 7 1.000000 8 | chr1 7 8 0.000000 9 | chr1 8 9 1.000000 10 | chr1 9 10 2.000000 11 | -------------------------------------------------------------------------------- /test/expected/nearest_overlapping.bg: -------------------------------------------------------------------------------- 1 | chr1 2 6 0.000000 2 | chr1 3 8 0.000000 3 | chr2 1 4 nan 4 | 
-------------------------------------------------------------------------------- /test/expected/pearson.txt: -------------------------------------------------------------------------------- 1 | -0.028968 2 | -------------------------------------------------------------------------------- /test/expected/profile.txt: -------------------------------------------------------------------------------- 1 | 0 5.000000 2 | 1 12.000000 3 | 2 22.000000 4 | -------------------------------------------------------------------------------- /test/expected/profiles.txt: -------------------------------------------------------------------------------- 1 | chr1 3 7 2.000000 3.000000 9.000000 2 | chr1 4 9 3.000000 9.000000 13.000000 3 | chr2 2 5 0.000000 0.000000 0.000000 4 | -------------------------------------------------------------------------------- /test/expected/regional_means.txt: -------------------------------------------------------------------------------- 1 | chr1 2 6 . 1000 3.500000 2 | chr1 3 8 . 1000 5.000000 3 | chr2 1 4 . 
1000 nan 4 | -------------------------------------------------------------------------------- /test/fixedStep.bw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/WiggleTools/34c5942f2445ee8a62e0ec24d532e35ab680b5fb/test/fixedStep.bw -------------------------------------------------------------------------------- /test/fixedStep.wig: -------------------------------------------------------------------------------- 1 | fixedStep chrom=chr1 start=1 step=1 span=1 2 | 0 3 | 1 4 | 2 5 | 3 6 | 4 7 | 5 8 | 6 9 | 7 10 | 8 11 | 9 12 | -------------------------------------------------------------------------------- /test/overlapping.bb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/WiggleTools/34c5942f2445ee8a62e0ec24d532e35ab680b5fb/test/overlapping.bb -------------------------------------------------------------------------------- /test/overlapping.bed: -------------------------------------------------------------------------------- 1 | chr1 2 6 . 1000 2 | chr1 3 8 . 1000 3 | chr2 1 4 . 
1000 4 | -------------------------------------------------------------------------------- /test/overlapping_coverage.wig: -------------------------------------------------------------------------------- 1 | fixedStep chrom=chr1 start=3 step=1 2 | 1.000000 3 | 2.000000 4 | 2.000000 5 | 2.000000 6 | 1.000000 7 | 1.000000 8 | fixedStep chrom=chr2 start=2 step=1 9 | 1.000000 10 | 1.000000 11 | 1.000000 12 | -------------------------------------------------------------------------------- /test/program.txt: -------------------------------------------------------------------------------- 1 | do isZero diff fixedStep.bw fixedStep.wig 2 | -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import shutil 4 | import subprocess 5 | 6 | def test(cmd): 7 | print('Testing: %s' % cmd) 8 | return subprocess.call(cmd, shell = True) 9 | 10 | def testOutput(cmd): 11 | print('Testing: %s' % cmd) 12 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell = True) 13 | assert p.wait() == 0 14 | out, err = p.communicate() 15 | return out 16 | 17 | if os.path.exists('tmp'): 18 | shutil.rmtree('tmp') 19 | os.mkdir('tmp') 20 | 21 | # Negative control 22 | assert test('../bin/wiggletools do isZero diff fixedStep.bw variableStep.wig') == 1 23 | 24 | # Test trailing token check: 25 | assert test('../bin/wiggletools diff fixedStep.bw variableStep.wig fixedStep.wig') == 1 26 | 27 | # Positive control 28 | assert test('../bin/wiggletools do isZero diff fixedStep.bw fixedStep.wig') == 0 29 | 30 | # Testing ratios and offset 31 | assert test('../bin/wiggletools do isZero offset -1 ratio variableStep.bw variableStep.wig') == 0 32 | 33 | # Testing BAM & BedGraph 34 | assert test('../bin/wiggletools do isZero diff bam.bam pileup.bg') == 0 35 | 36 | # Testing BAM & CRAM 37 | assert test('../bin/wiggletools do isZero diff bam.bam cram.cram') == 0 
38 | 39 | # Testing BAM & SAM 40 | assert test('../bin/wiggletools do isZero diff bam.bam sam.sam') == 0 41 | 42 | # Testing fast BAM and SAM 43 | assert test('../bin/wiggletools do isZero diff read_count bam.bam read_count sam.sam') == 0 44 | 45 | # Testing BAM & SAM 46 | assert test('cat sam.sam | ../bin/wiggletools do isZero diff bam.bam sam -') == 0 47 | 48 | # Testing Bed and BigBed 49 | assert test('../bin/wiggletools do isZero diff overlapping.bed overlapping.bb') == 0 50 | 51 | # Testing Wig and BigWig 52 | assert test('../bin/wiggletools do isZero diff variableStep.bw variableStep.wig') == 0 53 | 54 | # Testing VCF and BCF 55 | assert test('../bin/wiggletools do isZero diff vcf.vcf bcf.bcf') == 0 56 | 57 | # Testing BAM & BedGraph 58 | assert test('../bin/wiggletools do isZero seek GL000200.1 1 1000 diff bam.bam pileup.bg') == 0 59 | 60 | # Testing BAM & CRAM 61 | assert test('../bin/wiggletools do isZero seek GL000200.1 1 1000 diff bam.bam cram.cram') == 0 62 | 63 | # Testing SAM & BedGraph 64 | assert test('../bin/wiggletools do isZero seek GL000200.1 1 1000 diff sam.sam pileup.bg') == 0 65 | 66 | # Testing Bed and BigBed 67 | assert test('../bin/wiggletools do isZero seek chr1 2 6 diff overlapping.bed overlapping.bb') == 0 68 | 69 | # Testing Wig and BigWig 70 | assert test('../bin/wiggletools do isZero seek chr1 2 6 diff variableStep.bw variableStep.wig') == 0 71 | 72 | # Testing VCF and BCF 73 | assert test('../bin/wiggletools do isZero seek chr1 2 6 diff vcf.vcf bcf.bcf') == 0 74 | 75 | # Testing sum, scale and multiplexers 76 | assert test('../bin/wiggletools do isZero diff sum fixedStep.bw fixedStep.bw : scale 2 fixedStep.bw') == 0 77 | 78 | # Testing open-ended lists 79 | assert test('../bin/wiggletools do isZero diff sum fixedStep.bw fixedStep.bw : sum fixedStep.bw fixedStep.bw ') == 0 80 | 81 | # Testing map 82 | assert test('../bin/wiggletools do isZero diff ln fixedStep.bw sum map ln fixedStep.bw ') == 0 83 | 84 | # Testing log and exponential 
85 | assert test('../bin/wiggletools do isZero diff ln exp fixedStep.bw fixedStep.wig') == 0 86 | 87 | # Testing power and multiplication 88 | assert test('../bin/wiggletools do isZero diff pow 2 fixedStep.bw mult fixedStep.wig fixedStep.wig') == 0 89 | 90 | # Testing smoothing 91 | # TODO : Find better test 92 | # assert test('../bin/wiggletools do isZero diff smooth 2 fixedStep.wig fixedStep.wig') == 0 93 | 94 | # Testing filters 95 | assert float(testOutput('../bin/wiggletools AUC lt 4 fixedStep.wig')) == 4 96 | assert float(testOutput('../bin/wiggletools AUC lte 4 fixedStep.wig')) == 5 97 | assert float(testOutput('../bin/wiggletools AUC gte 4 fixedStep.wig')) == 6 98 | assert float(testOutput('../bin/wiggletools AUC gt 4 fixedStep.wig')) == 5 99 | 100 | # Testing apply 101 | assert test('../bin/wiggletools apply_paste tmp/regional_means.txt meanI overlapping.bed fixedStep.wig') == 0 102 | 103 | # Testing pearson 104 | assert test('../bin/wiggletools print tmp/pearson.txt pearson fixedStep.wig variableStep.wig') == 0 105 | 106 | # Testing profiles 107 | assert test('../bin/wiggletools profiles tmp/profiles.txt 3 overlapping.bed fixedStep.wig') == 0 108 | 109 | # Testing profile 110 | assert test('../bin/wiggletools profile tmp/profile.txt 3 overlapping.bed fixedStep.wig') == 0 111 | 112 | # Test overlap 113 | assert test('../bin/wiggletools do isZero diff fixedStep.wig overlaps fixedStep.wig fixedStep.wig') == 0 114 | 115 | # Test nearest #1 116 | assert test('../bin/wiggletools write_bg tmp/nearest_overlapping.bg nearest variableStep.wig overlapping.bed') == 0 117 | 118 | # Test nearest #2 119 | assert test('../bin/wiggletools write_bg tmp/nearest_fixedStep.bg nearest variableStep.wig fixedStep.bw') == 0 120 | 121 | # Test min 122 | assert float(testOutput('../bin/wiggletools print - minI fixedStep.wig')) == 0 123 | 124 | # Test max 125 | assert float(testOutput('../bin/wiggletools print - maxI fixedStep.wig')) == 9 126 | 127 | # Test coverage 128 | assert 
test('../bin/wiggletools do isZero diff overlapping_coverage.wig coverage overlapping.bed') == 0 129 | 130 | #Test trim 131 | assert test('../bin/wiggletools do isZero diff trim overlapping.bed variableStep.wig mult overlapping.bed variableStep.wig') == 0 132 | 133 | #Test floor 134 | assert test('../bin/wiggletools do isZero diff floor fixedStep.wig floor fixedStep.wig') == 0 135 | 136 | #Test toInt 137 | assert test('../bin/wiggletools do isZero diff toInt fixedStep.wig toInt variableStep.wig') == 1 138 | 139 | # Test program file 140 | assert test('../bin/wiggletools run program.txt') == 0 141 | 142 | assert test('diff tmp expected') == 0 143 | 144 | 145 | shutil.rmtree('tmp') 146 | 147 | print('All tests OK') 148 | -------------------------------------------------------------------------------- /test/variableStep.bw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/WiggleTools/34c5942f2445ee8a62e0ec24d532e35ab680b5fb/test/variableStep.bw -------------------------------------------------------------------------------- /test/variableStep.wig: -------------------------------------------------------------------------------- 1 | variableStep chrom=chr1 span=1 2 | 1 1 3 | 2 2 4 | 4 3 5 | 6 4 6 | 8 5 7 | -------------------------------------------------------------------------------- /test/vcf.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.0 2 | ##fileDate=20090805 3 | ##source=myImputationProgramV3.1 4 | ##reference=1000GenomesPilot-NCBI36 5 | ##phasing=partial 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##FILTER= 13 | ##FILTER= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 19 | chr1 3 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 20 | chr1 5 . 
T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 21 | chr1 7 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 22 | --------------------------------------------------------------------------------