├── .gitmodules ├── src ├── test │ ├── resources │ │ ├── auxf.fa.fai │ │ ├── auxf.fa │ │ ├── test.bam │ │ ├── test.cram │ │ ├── test.vcf.bgz │ │ ├── test.vcf.gz │ │ ├── test.bgzf.bcf │ │ ├── misnamedBam.sam │ │ ├── test.vcf.bgzf.gz │ │ ├── HiSeq.10000.vcf.bgz │ │ ├── HiSeq.10000.vcf.gz │ │ ├── test.uncompressed.bcf │ │ ├── HiSeq.10000.vcf.bgz.tbi │ │ ├── HiSeq.10000.vcf.bgzf.gz │ │ ├── HiSeq.10000.vcf.bgzf.gz.tbi │ │ ├── log4j.properties │ │ ├── test_headerless.sam │ │ ├── README │ │ ├── test.sam │ │ ├── mini-chr1-chr2.fasta │ │ └── test.vcf │ └── java │ │ └── org │ │ └── seqdoop │ │ └── hadoop_bam │ │ ├── TestBAMSplitGuesser.java │ │ ├── util │ │ ├── TestVCFHeaderReader.java │ │ └── TestVCFFileMerger.java │ │ ├── TestSAMFormat.java │ │ ├── TestAnySAMInputFormat.java │ │ ├── TestVCFFormat.java │ │ ├── TestSplittingBAMIndexer.java │ │ ├── TestSAMHeaderReader.java │ │ ├── TestBGZFSplitGuesser.java │ │ ├── TestLineReader.java │ │ ├── TestConfHelper.java │ │ ├── IntervalUtilTest.java │ │ ├── BAMTestUtil.java │ │ ├── TestFastaInputFormat.java │ │ ├── TestSAMInputFormat.java │ │ ├── TestVCFInputFormatStringency.java │ │ └── TestCRAMInputFormatOnHDFS.java └── main │ └── java │ ├── htsjdk │ └── samtools │ │ ├── SAMRecordHelper.java │ │ └── LinearBAMIndex.java │ └── org │ └── seqdoop │ └── hadoop_bam │ ├── CRAMOutputFormat.java │ ├── KeyIgnoringCRAMRecordWriter.java │ ├── FormatException.java │ ├── VariantContextWithHeader.java │ ├── util │ ├── BGZFCompressionOutputStream.java │ ├── DataOutputWrapper.java │ ├── DataInputWrapper.java │ ├── GetSortedBAMHeader.java │ ├── ConfHelper.java │ ├── IntervalUtil.java │ ├── BGZFCodec.java │ ├── VCFHeaderReader.java │ ├── WrapSeekable.java │ ├── BGZFEnhancedGzipCodec.java │ ├── BGZFSplitCompressionInputStream.java │ ├── SAMHeaderReader.java │ ├── NIOFileUtil.java │ ├── BGZFBlockIndex.java │ └── SAMOutputPreparer.java │ ├── IntelGKLAccessor.java │ ├── KeyIgnoringBAMRecordWriter.java │ ├── FormatConstants.java │ ├── KeyIgnoringSAMRecordWriter.java │ ├── SAMInputFormat.java │ ├── SAMFormat.java │ ├── VCFOutputFormat.java │ ├── AnySAMOutputFormat.java │ ├── BAMOutputFormat.java │ ├── LazyParsingGenotypesContext.java │ ├── KeyIgnoringCRAMOutputFormat.java │ ├── KeyIgnoringBCFRecordWriter.java │ ├── VariantContextWritable.java │ ├── KeyIgnoringVCFRecordWriter.java │ ├── CRAMRecordReader.java │ ├── VCFFormat.java │ ├── SAMRecordWritable.java │ ├── BaseSplitGuesser.java │ ├── SAMRecordWriter.java │ ├── KeyIgnoringBAMOutputFormat.java │ ├── CRAMInputFormat.java │ ├── LazyBAMRecordFactory.java │ ├── KeyIgnoringAnySAMOutputFormat.java │ ├── FileVirtualSplit.java │ ├── CRAMRecordWriter.java │ ├── LazyVCFGenotypesContext.java │ ├── ReferenceFragment.java │ └── SplittingBAMIndex.java ├── bgzf-terminator.bin ├── NOTICE.txt ├── .gitignore ├── findbugs-exclude.xml ├── examples └── README.txt ├── .travis.yml ├── scripts ├── release │ ├── settings.xml │ ├── release.sh │ └── README.md └── deploy │ └── addServerToM2Settings.py └── LICENSE.txt /.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/test/resources/auxf.fa.fai: -------------------------------------------------------------------------------- 1 | Sheila 20 8 20 21 2 | -------------------------------------------------------------------------------- /src/test/resources/auxf.fa: -------------------------------------------------------------------------------- 1 | >Sheila 2 | GCTAGCTCAGAAAAAAAAAA 
3 | -------------------------------------------------------------------------------- /bgzf-terminator.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HadoopGenomics/Hadoop-BAM/HEAD/bgzf-terminator.bin -------------------------------------------------------------------------------- /src/test/resources/test.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HadoopGenomics/Hadoop-BAM/HEAD/src/test/resources/test.bam -------------------------------------------------------------------------------- /src/test/resources/test.cram: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HadoopGenomics/Hadoop-BAM/HEAD/src/test/resources/test.cram -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | This product includes software developed by The Apache Software 2 | Foundation (http://www.apache.org/). 3 | -------------------------------------------------------------------------------- /src/test/resources/test.vcf.bgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HadoopGenomics/Hadoop-BAM/HEAD/src/test/resources/test.vcf.bgz -------------------------------------------------------------------------------- /src/test/resources/test.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HadoopGenomics/Hadoop-BAM/HEAD/src/test/resources/test.vcf.gz -------------------------------------------------------------------------------- /src/test/resources/test.bgzf.bcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HadoopGenomics/Hadoop-BAM/HEAD/src/test/resources/test.bgzf.bcf -------------------------------------------------------------------------------- /src/test/resources/misnamedBam.sam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HadoopGenomics/Hadoop-BAM/HEAD/src/test/resources/misnamedBam.sam -------------------------------------------------------------------------------- /src/test/resources/test.vcf.bgzf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HadoopGenomics/Hadoop-BAM/HEAD/src/test/resources/test.vcf.bgzf.gz -------------------------------------------------------------------------------- /src/test/resources/HiSeq.10000.vcf.bgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HadoopGenomics/Hadoop-BAM/HEAD/src/test/resources/HiSeq.10000.vcf.bgz -------------------------------------------------------------------------------- /src/test/resources/HiSeq.10000.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HadoopGenomics/Hadoop-BAM/HEAD/src/test/resources/HiSeq.10000.vcf.gz -------------------------------------------------------------------------------- /src/test/resources/test.uncompressed.bcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HadoopGenomics/Hadoop-BAM/HEAD/src/test/resources/test.uncompressed.bcf 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | /test-classes 3 | /hadoop-bam.jar 4 | /libs 5 | .idea 6 | *.iml 7 | maven-metadata-local.xml 8 | *~ 9 | target 10 | -------------------------------------------------------------------------------- /src/test/resources/HiSeq.10000.vcf.bgz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HadoopGenomics/Hadoop-BAM/HEAD/src/test/resources/HiSeq.10000.vcf.bgz.tbi -------------------------------------------------------------------------------- /src/test/resources/HiSeq.10000.vcf.bgzf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HadoopGenomics/Hadoop-BAM/HEAD/src/test/resources/HiSeq.10000.vcf.bgzf.gz -------------------------------------------------------------------------------- /src/test/resources/HiSeq.10000.vcf.bgzf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HadoopGenomics/Hadoop-BAM/HEAD/src/test/resources/HiSeq.10000.vcf.bgzf.gz.tbi -------------------------------------------------------------------------------- /findbugs-exclude.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger = WARN, out 2 | 3 | log4j.appender.out = org.apache.log4j.ConsoleAppender 4 | log4j.appender.out.layout = org.apache.log4j.PatternLayout 5 | log4j.appender.out.layout.ConversionPattern = %d (%t) [%p - %l] %m%n 6 | 7 | log4j.logger.org.seqdoop.hadoop_bam=DEBUG 8 | -------------------------------------------------------------------------------- /src/main/java/htsjdk/samtools/SAMRecordHelper.java: -------------------------------------------------------------------------------- 1 | package htsjdk.samtools; 2 | 3 | /** 4 | * This class is required in order to access the protected 5 | * {@link SAMRecord#eagerDecode()} method in HTSJDK. 6 | */ 7 | public class SAMRecordHelper { 8 | public static void eagerDecode(SAMRecord record) { 9 | record.eagerDecode(); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/test/resources/test_headerless.sam: -------------------------------------------------------------------------------- 1 | read_28833_29006_6945 99 chr21 28833 20 10M1D25M = 28993 195 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< H0:i:0 H1:i:0 MF:i:130 RG:Z:L1 Nm:i:1 2 | read_28701_28881_323b 147 chr21 28834 30 35M = 28701 -168 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< H0:i:1 H1:i:0 MF:i:18 RG:Z:L2 Nm:i:0 3 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/CRAMOutputFormat.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 4 | 5 | /** Currently this only locks down the value type of the {@link 6 | * org.apache.hadoop.mapreduce.OutputFormat}: contains no functionality. 
7 | */ 8 | public abstract class CRAMOutputFormat<K> 9 | extends FileOutputFormat<K,SAMRecordWritable> 10 | {} 11 | -------------------------------------------------------------------------------- /src/test/resources/README: -------------------------------------------------------------------------------- 1 | Commands to generate file indexes and compressed versions. 2 | 3 | bgzip -c src/test/resources/HiSeq.10000.vcf > src/test/resources/HiSeq.10000.vcf.bgz 4 | bcftools index -t src/test/resources/HiSeq.10000.vcf.bgz 5 | 6 | cp src/test/resources/HiSeq.10000.vcf.bgz src/test/resources/HiSeq.10000.vcf.bgzf.gz 7 | cp src/test/resources/HiSeq.10000.vcf.bgz.tbi src/test/resources/HiSeq.10000.vcf.bgzf.gz.tbi 8 | 9 | gzip -k src/test/resources/HiSeq.10000.vcf -------------------------------------------------------------------------------- /examples/README.txt: -------------------------------------------------------------------------------- 1 | This directory contains examples of how to use Hadoop-BAM as a 2 | library. To build the examples, do: 3 | 4 | $ mvn clean package 5 | 6 | To run the examples: 7 | 8 | $ hadoop jar target/*-jar-with-dependencies.jar \ 9 | org.seqdoop.hadoop_bam.examples.TestBAM 10 | 11 | and: 12 | 13 | $ hadoop jar target/*-jar-with-dependencies.jar \ 14 | org.seqdoop.hadoop_bam.examples.TestVCF 15 | -------------------------------------------------------------------------------- /src/test/resources/test.sam: -------------------------------------------------------------------------------- 1 | @HD VN:1.5 SO:coordinate 2 | @SQ SN:chr21 LN:62435964 AS:HG18 3 | @RG ID:L1 PU:SC_1_10 LB:SC_1 SM:NA12891 PL:ILLUMINA 4 | @RG ID:L2 PU:SC_2_12 LB:SC_2 SM:NA12891 PL:ILLUMINA 5 | read_28833_29006_6945 99 chr21 28833 20 10M1D25M = 28993 195 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< H0:i:0 H1:i:0 MF:i:130 RG:Z:L1 Nm:i:1 6 | read_28701_28881_323b 147 chr21 28834 30 35M = 28701 -168 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< H0:i:1 H1:i:0 MF:i:18 RG:Z:L2 Nm:i:0 7 | -------------------------------------------------------------------------------- /src/test/resources/mini-chr1-chr2.fasta: -------------------------------------------------------------------------------- 1 | >chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1 2 | TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTA 3 | ACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAAC 4 | CCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCC 5 | TAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCC 6 | CAACCCTAACCCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCC 7 | >chr2 dna:chromosome chromosome:GRCh37:2:1:243199373:1 8 | TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTC 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - openjdk8 4 | 5 | env: 6 | global: 7 | - SONATYPE_USERNAME=schumach 8 | - secure: "WXYDXyMHy+pXDlSFypYtLJPQZIl8oyl8dqEKpYdoiOGVaI6VGEwvhQzcuHpIl2lLLsoB8uBrK+rGGlaBovr0MfkXZExfpyCd3lCpKENuEkh2iZboIvc037H2HJFz0oiPTGG4BZu6upGq7xACqIUiZpSlrnT0tzqczU//Di7EmYM=" 9 | 10 | before_install: 11 | - cat /etc/hosts # optionally check the content *before* 12 | - sudo hostname "$(hostname | cut -c1-63)" 13 | - sed -e "s/^\\(127\\.0\\.0\\.1.*\\)/\\1 $(hostname | cut -c1-63)/" 
/etc/hosts | sudo tee /etc/hosts 14 | - cat /etc/hosts # optionally check the content *after* 15 | 16 | script: mvn clean test jacoco:report 17 | 18 | after_success: 19 | - python scripts/deploy/addServerToM2Settings.py 20 | - mvn coveralls:report 21 | - mvn deploy -DskipTests=true --settings ~/.m2/mySettings.xml 22 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/TestBAMSplitGuesser.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import htsjdk.samtools.SAMUtils; 4 | import htsjdk.samtools.seekablestream.SeekableStream; 5 | import java.io.File; 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.Path; 8 | import org.junit.Test; 9 | import org.seqdoop.hadoop_bam.util.WrapSeekable; 10 | 11 | import static org.junit.Assert.assertEquals; 12 | 13 | public class TestBAMSplitGuesser { 14 | 15 | @Test 16 | public void test() throws Exception { 17 | Configuration conf = new Configuration(); 18 | String bam = getClass().getClassLoader().getResource("test.bam").getFile(); 19 | SeekableStream ss = WrapSeekable.openPath(conf, new Path(bam)); 20 | BAMSplitGuesser bamSplitGuesser = new BAMSplitGuesser(ss, conf); 21 | long startGuess = bamSplitGuesser.guessNextBAMRecordStart(0, 3 * 0xffff + 0xfffe); 22 | assertEquals(SAMUtils.findVirtualOffsetOfFirstRecordInBam(new File(bam)), startGuess); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /scripts/release/settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | sonatype-nexus-snapshots 7 | @SONATYPE_USERNAME@ 8 | @SONATYPE_PASSWORD@ 9 | 10 | 11 | sonatype-nexus-staging 12 | @SONATYPE_USERNAME@ 13 | @SONATYPE_PASSWORD@ 14 | 15 | 16 | 17 | 18 | gpg 19 | 20 | @GPG_EXECUTABLE_PATH@ 21 | @PGP_KEY_PASSPHRASE@ 22 | @GPG_KEY_NAME 23 | 24 | 25 | 26 | 27 | gpg 28 | 29 | 30 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 Aalto University 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to 5 | deal in the Software without restriction, including without limitation the 6 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | sell copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | IN THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringCRAMRecordWriter.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import htsjdk.samtools.SAMFileHeader; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 6 | 7 | import java.io.IOException; 8 | import java.io.OutputStream; 9 | 10 | /** A convenience class that you can use as a RecordWriter for CRAM files. 11 | * 12 | *
<p>The write function ignores the key, just outputting the SAMRecord.
13 | */ 14 | public class KeyIgnoringCRAMRecordWriter<K> extends CRAMRecordWriter<K> { 15 | public KeyIgnoringCRAMRecordWriter( 16 | Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) 17 | throws IOException 18 | { 19 | super(output, input, writeHeader, ctx); 20 | } 21 | 22 | public KeyIgnoringCRAMRecordWriter( 23 | Path output, SAMFileHeader header, boolean writeHeader, 24 | TaskAttemptContext ctx) 25 | throws IOException 26 | { 27 | super(output, header, writeHeader, ctx); 28 | } 29 | 30 | @Override public void write(K ignored, SAMRecordWritable rec) { 31 | writeAlignment(rec.get()); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/util/TestVCFHeaderReader.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam.util; 2 | 3 | import java.io.IOException; 4 | 5 | import com.google.common.io.Resources; 6 | 7 | import htsjdk.samtools.seekablestream.ByteArraySeekableStream; 8 | import htsjdk.samtools.seekablestream.SeekableStream; 9 | 10 | import org.junit.Test; 11 | 12 | import static org.junit.Assert.assertNotNull; 13 | 14 | public class TestVCFHeaderReader { 15 | 16 | @Test 17 | public void testReadHeaderFromVCF() throws IOException { 18 | assertNotNull(VCFHeaderReader.readHeaderFrom(seekableStream("test.vcf"))); 19 | } 20 | 21 | @Test 22 | public void testReadHeaderFromGzippedVCF() throws IOException { 23 | assertNotNull(VCFHeaderReader.readHeaderFrom(seekableStream("test.vcf.gz"))); 24 | } 25 | 26 | @Test 27 | public void testReadHeaderFromBGZFVCF() throws IOException { 28 | assertNotNull(VCFHeaderReader.readHeaderFrom(seekableStream("test.vcf.bgzf.gz"))); 29 | } 30 | 31 | static SeekableStream seekableStream(final String resource) throws IOException { 32 | return new ByteArraySeekableStream(Resources.toByteArray(ClassLoader.getSystemClassLoader().getResource(resource))); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /scripts/release/release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # do we have enough arguments? 4 | if [ $# -lt 2 ]; then 5 | echo "Usage:" 6 | echo 7 | echo "./release.sh <release version> <development version>" 8 | exit 1 9 | fi 10 | 11 | # pick arguments 12 | release=$1 13 | devel=$2 14 | 15 | # get current branch 16 | branch=$(git rev-parse --abbrev-ref HEAD) 17 | 18 | commit=$(git log --pretty=format:"%H" | head -n 1) 19 | echo "releasing from ${commit} on branch ${branch}" 20 | 21 | git push origin ${branch} 22 | 23 | # do release 24 | mvn --batch-mode \ 25 | -Dresume=false \ 26 | -Dtag=${release} \ 27 | -DreleaseVersion=${release} \ 28 | -DdevelopmentVersion=${devel} \ 29 | -DbranchName=${release} \ 30 | release:clean \ 31 | release:prepare \ 32 | release:perform 33 | 34 | if [ $? != 0 ]; then 35 | echo "Releasing failed." 36 | exit 1 37 | fi 38 | 39 | if [ $branch = "master" ]; then 40 | # if original branch was master, update versions on original branch 41 | git checkout ${branch} 42 | mvn versions:set -DnewVersion=${devel} \ 43 | -DgenerateBackupPoms=false 44 | git commit -a -m "Modifying pom.xml files for new development after ${release} release." 
45 | git push origin ${branch} 46 | fi 47 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/TestSAMFormat.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import org.junit.Test; 6 | 7 | import static org.junit.Assert.assertEquals; 8 | import static org.junit.Assert.assertNull; 9 | 10 | public class TestSAMFormat { 11 | 12 | @Test 13 | public void testInferFromFilePath() throws IOException { 14 | assertEquals(SAMFormat.SAM, SAMFormat.inferFromFilePath("test.sam")); 15 | assertEquals(SAMFormat.BAM, SAMFormat.inferFromFilePath("test.bam")); 16 | assertEquals(SAMFormat.CRAM, SAMFormat.inferFromFilePath("test.cram")); 17 | assertNull(SAMFormat.inferFromFilePath("test.vcf")); 18 | } 19 | 20 | @Test 21 | public void testInferFromData() throws IOException { 22 | assertEquals(SAMFormat.SAM, SAMFormat.inferFromData(stream("test.sam"))); 23 | assertEquals(SAMFormat.BAM, SAMFormat.inferFromData(stream("test.bam"))); 24 | assertEquals(SAMFormat.CRAM, SAMFormat.inferFromData(stream("test.cram"))); 25 | assertNull(SAMFormat.inferFromData(stream("test.vcf"))); 26 | } 27 | 28 | private InputStream stream(String resource) throws IOException { 29 | return ClassLoader.getSystemClassLoader().getResource(resource).openStream(); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/TestAnySAMInputFormat.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.fs.PathNotFoundException; 6 | import org.junit.Test; 7 | 8 | import static org.junit.Assert.assertEquals; 9 | 10 | public class TestAnySAMInputFormat { 11 | 12 | @Test 13 | public void testHeaderlessSamFormat() throws PathNotFoundException { 14 | final SAMFormat result = getSamFormat(new Configuration(), "test_headerless.sam"); 15 | assertEquals(SAMFormat.SAM, result); 16 | } 17 | 18 | @Test 19 | public void testTrustExtensionsIsHonored() throws PathNotFoundException { 20 | final Configuration conf = new Configuration(); 21 | // default to trusting extensions 22 | assertEquals(SAMFormat.SAM, getSamFormat(conf, "misnamedBam.sam")); 23 | 24 | conf.set(AnySAMInputFormat.TRUST_EXTS_PROPERTY, "false"); 25 | final SAMFormat result = getSamFormat(conf, "misnamedBam.sam"); 26 | assertEquals(SAMFormat.BAM, result); 27 | } 28 | 29 | private SAMFormat getSamFormat(final Configuration conf, final String file) throws PathNotFoundException { 30 | final String filePath = getClass().getClassLoader().getResource(file).getFile(); 31 | return new AnySAMInputFormat(conf).getFormat(new Path(filePath)); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/FormatException.java: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2011-2012 CRS4. 2 | // 3 | // This file is part of Hadoop-BAM. 
4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to 7 | // deal in the Software without restriction, including without limitation the 8 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 9 | // sell copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in 13 | // all copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 | // IN THE SOFTWARE. 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | public class FormatException extends RuntimeException 26 | { 27 | private static final long serialVersionUID = 1L; 28 | public FormatException(String msg) 29 | { 30 | super(msg); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /scripts/release/README.md: -------------------------------------------------------------------------------- 1 | Notes for release managers 2 | --- 3 | 4 | This document describes how to make a Hadoop-BAM release. 5 | 6 | Set up your environment 7 | 1. Copy (or incorporate) the settings.xml file to ```~/.m2/settings.xml``` 8 | 2. Edit the username, password, etc. in ```~/.m2/settings.xml``` 9 | 10 | First, update the CHANGELOG.txt file with the list of closed issues and closed 11 | and merged pull requests. Additionally, you will need to update the version in 12 | README.md. These changes will need to be committed to the branch you are 13 | releasing from before you do the release. 14 | 15 | Then from the project root directory, run `./scripts/release/release.sh`. 16 | When you run this script, it takes the release version and the new development 17 | version as arguments. For example: 18 | 19 | ```bash 20 | ./scripts/release/release.sh 7.9.2 7.9.3-SNAPSHOT 21 | ``` 22 | 23 | This script can be run off a branch other than 24 | master, which makes it possible to cut maintenance releases. 25 | 26 | Once you've successfully published the release, you will need to "close" and 27 | "release" it following the instructions at 28 | http://central.sonatype.org/pages/releasing-the-deployment.html#close-and-drop-or-release-your-staging-repository. 29 | 30 | After the release is rsynced to the Maven Central repository, confirm checksums match 31 | and verify signatures. You should be able to verify this before closing the release 32 | in Sonatype, as the checksums and signatures will be available in the staging repository.
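
As a sketch of what that verification might look like (the artifact name below is a
placeholder for whichever files the release actually produced):

```bash
# Fetch the artifact plus its .sha1 checksum and .asc signature from the
# staging repository, then compare the digest and check the signature.
sha1sum hadoop-bam-7.9.2.jar   # compare the output against hadoop-bam-7.9.2.jar.sha1
gpg --verify hadoop-bam-7.9.2.jar.asc hadoop-bam-7.9.2.jar
```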
33 | -------------------------------------------------------------------------------- /scripts/deploy/addServerToM2Settings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Based on https://gist.github.com/neothemachine/4060735 4 | 5 | import sys 6 | import os 7 | import os.path 8 | import xml.dom.minidom 9 | 10 | if os.environ["TRAVIS_SECURE_ENV_VARS"] == "false": 11 | print "no secure env vars available, skipping deployment" 12 | sys.exit() 13 | 14 | homedir = os.path.expanduser("~") 15 | 16 | m2 = xml.dom.minidom.parse(homedir + '/.m2/settings.xml') 17 | settings = m2.getElementsByTagName("settings")[0] 18 | 19 | serversNodes = settings.getElementsByTagName("servers") 20 | if not serversNodes: 21 | serversNode = m2.createElement("servers") 22 | settings.appendChild(serversNode) 23 | else: 24 | serversNode = serversNodes[0] 25 | 26 | sonatypeServerNode = m2.createElement("server") 27 | sonatypeServerId = m2.createElement("id") 28 | sonatypeServerUser = m2.createElement("username") 29 | sonatypeServerPass = m2.createElement("password") 30 | 31 | idNode = m2.createTextNode("sonatype-nexus-snapshots") 32 | userNode = m2.createTextNode(os.environ["SONATYPE_USERNAME"]) 33 | passNode = m2.createTextNode(os.environ["SONATYPE_PASSWORD"]) 34 | 35 | sonatypeServerId.appendChild(idNode) 36 | sonatypeServerUser.appendChild(userNode) 37 | sonatypeServerPass.appendChild(passNode) 38 | 39 | sonatypeServerNode.appendChild(sonatypeServerId) 40 | sonatypeServerNode.appendChild(sonatypeServerUser) 41 | sonatypeServerNode.appendChild(sonatypeServerPass) 42 | 43 | serversNode.appendChild(sonatypeServerNode) 44 | 45 | m2Str = m2.toxml() 46 | f = open(homedir + '/.m2/mySettings.xml', 'w') 47 | f.write(m2Str) 48 | f.close() 49 | -------------------------------------------------------------------------------- /src/test/resources/test.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##fileDate=20090805 3 | ##source=myImputationProgramV3.1 4 | ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta 5 | ##contig= 6 | ##phasing=partial 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##FILTER= 14 | ##FILTER= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##FORMAT= 19 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 20 | 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 21 | 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 22 | 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 23 | 20 1230237 . T . 
47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 24 | 20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 25 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/VariantContextWithHeader.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | package org.seqdoop.hadoop_bam; 22 | 23 | import htsjdk.variant.variantcontext.VariantContext; 24 | import htsjdk.variant.vcf.VCFHeader; 25 | 26 | public class VariantContextWithHeader extends VariantContext { 27 | private final VCFHeader header; 28 | 29 | public VariantContextWithHeader(VariantContext context, VCFHeader header) { 30 | super(context); 31 | this.header = header; 32 | } 33 | 34 | public VCFHeader getHeader() { 35 | return header; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/util/BGZFCompressionOutputStream.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam.util; 2 | 3 | import htsjdk.samtools.util.BlockCompressedOutputStream; 4 | import java.io.File; 5 | import java.io.IOException; 6 | import java.io.OutputStream; 7 | import org.apache.hadoop.io.compress.CompressionOutputStream; 8 | 9 | /** 10 | * An implementation of {@code CompressionOutputStream} for BGZF, using 11 | * {@link BlockCompressedOutputStream} from htsjdk. Note that unlike 12 | * {@link BlockCompressedOutputStream}, an empty gzip block file terminator is 13 | * not written at the end of the stream. This is because in Hadoop, multiple 14 | * headerless files are often written in parallel, and merged afterwards into a single 15 | * file, and it's during the merge process the header and terminator are added. 
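 * For example, a merger can write the header, concatenate the part files in
 * order, and then append the 28-byte BGZF end-of-file marker (available in
 * htsjdk as {@code BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK}) to
 * terminate the merged file.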
16 | */ 17 | class BGZFCompressionOutputStream extends CompressionOutputStream { 18 | 19 | private BlockCompressedOutputStream output; 20 | 21 | public BGZFCompressionOutputStream(OutputStream out) 22 | throws IOException { 23 | super(out); 24 | this.output = new BlockCompressedOutputStream(out, (File) null); 25 | } 26 | 27 | public void write(int b) throws IOException { 28 | output.write(b); 29 | } 30 | 31 | public void write(byte[] b, int off, int len) throws IOException { 32 | output.write(b, off, len); 33 | } 34 | 35 | public void finish() throws IOException { 36 | output.flush(); 37 | } 38 | 39 | public void resetState() throws IOException { 40 | output.flush(); 41 | output = new BlockCompressedOutputStream(out, (File) null); 42 | } 43 | 44 | public void close() throws IOException { 45 | output.flush(); // don't close as we don't want to write terminator (empty gzip block) 46 | out.close(); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/util/DataOutputWrapper.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 
20 | 21 | // File created: 2011-06-27 09:25:13 22 | 23 | package org.seqdoop.hadoop_bam.util; 24 | 25 | import java.io.DataOutput; 26 | import java.io.IOException; 27 | import java.io.OutputStream; 28 | 29 | public class DataOutputWrapper extends OutputStream { 30 | private final DataOutput out; 31 | 32 | public DataOutputWrapper(DataOutput o) { out = o; } 33 | 34 | @Override public void write(int b) throws IOException { 35 | out.writeByte(b); 36 | } 37 | @Override public void write(byte[] b, int off, int len) throws IOException { 38 | out.write(b, off, len); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/util/TestVCFFileMerger.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam.util; 2 | 3 | import htsjdk.variant.vcf.VCFHeader; 4 | import java.io.File; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.net.URI; 8 | import java.nio.file.Files; 9 | import java.nio.file.Path; 10 | import java.nio.file.Paths; 11 | import org.junit.Before; 12 | import org.junit.Test; 13 | 14 | public class TestVCFFileMerger { 15 | 16 | private String partsDirectory; 17 | private VCFHeader header; 18 | 19 | @Before 20 | public void setup() throws Exception { 21 | File partsDir = File.createTempFile("parts", ""); 22 | partsDir.delete(); 23 | partsDir.mkdir(); 24 | Files.createFile(new File(partsDir, "_SUCCESS").toPath()); 25 | partsDirectory = partsDir.toURI().toString(); 26 | header = VCFHeaderReader.readHeaderFrom(TestVCFHeaderReader.seekableStream("test.vcf")); 27 | } 28 | 29 | @Test(expected = IllegalArgumentException.class) 30 | public void testEmpty() throws IOException { 31 | File out = File.createTempFile("out", ".vcf"); 32 | out.deleteOnExit(); 33 | VCFFileMerger.mergeParts(partsDirectory, out.toURI().toString(), header); 34 | } 35 | 36 | @Test(expected = IllegalArgumentException.class) 37 | public void testBCFNotSupported() throws IOException { 38 | File out = File.createTempFile("out", ".bcf"); 39 | out.deleteOnExit(); 40 | Path target = Paths.get(URI.create(partsDirectory)).resolve("part-m-00000"); 41 | Files.copy(stream("test.uncompressed.bcf"), target); 42 | VCFFileMerger.mergeParts(partsDirectory, out.toURI().toString(), header); 43 | } 44 | 45 | private InputStream stream(String resource) throws IOException { 46 | return ClassLoader.getSystemClassLoader().getResource(resource).openStream(); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/htsjdk/samtools/LinearBAMIndex.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 
12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | package htsjdk.samtools; 21 | 22 | import htsjdk.samtools.CachingBAMFileIndex; 23 | import htsjdk.samtools.LinearIndex; 24 | import htsjdk.samtools.SAMSequenceDictionary; 25 | import htsjdk.samtools.seekablestream.SeekableStream; 26 | 27 | /** 28 | * The htsjdk APIs for accessing the linear BAM index are private... 29 | */ 30 | public class LinearBAMIndex extends CachingBAMFileIndex { 31 | 32 | public LinearBAMIndex(SeekableStream stream, SAMSequenceDictionary dict) { 33 | super(stream, dict); 34 | } 35 | 36 | public LinearIndex getLinearIndex(int idx) { 37 | return getQueryResults(idx).getLinearIndex(); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/TestVCFFormat.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import org.junit.Test; 6 | 7 | import static org.junit.Assert.assertEquals; 8 | import static org.junit.Assert.assertFalse; 9 | import static org.junit.Assert.assertNull; 10 | import static org.junit.Assert.assertTrue; 11 | 12 | public class TestVCFFormat { 13 | 14 | @Test 15 | public void testInferFromFilePath() throws IOException { 16 | assertEquals(VCFFormat.VCF, VCFFormat.inferFromFilePath("test.vcf")); 17 | assertEquals(VCFFormat.VCF, VCFFormat.inferFromFilePath("test.vcf.gz")); 18 | assertEquals(VCFFormat.VCF, VCFFormat.inferFromFilePath("test.vcf.bgzf.gz")); 19 | assertEquals(VCFFormat.VCF, VCFFormat.inferFromFilePath("test.vcf.bgz")); 20 | assertEquals(VCFFormat.BCF, VCFFormat.inferFromFilePath("test.uncompressed.bcf")); 21 | assertEquals(VCFFormat.BCF, VCFFormat.inferFromFilePath("test.bgzf.bcf")); 22 | assertNull(VCFFormat.inferFromFilePath("test.sam")); 23 | } 24 | 25 | @Test 26 | public void testInferFromData() throws IOException { 27 | assertEquals(VCFFormat.VCF, VCFFormat.inferFromData(stream("test.vcf"))); 28 | assertEquals(VCFFormat.VCF, VCFFormat.inferFromData(stream("test.vcf.gz"))); 29 | assertEquals(VCFFormat.VCF, VCFFormat.inferFromData(stream("test.vcf.bgzf.gz"))); 30 | assertEquals(VCFFormat.VCF, VCFFormat.inferFromData(stream("test.vcf.bgz"))); 31 | assertEquals(VCFFormat.BCF, VCFFormat.inferFromData(stream("test.uncompressed.bcf"))); 32 | assertEquals(VCFFormat.BCF, VCFFormat.inferFromData(stream("test.bgzf.bcf"))); 33 | assertNull(VCFFormat.inferFromData(stream("test.sam"))); 34 | } 35 | 36 | private InputStream stream(String resource) throws IOException { 37 | return ClassLoader.getSystemClassLoader().getResource(resource).openStream(); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/IntelGKLAccessor.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a 
copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | package org.seqdoop.hadoop_bam; 22 | 23 | import com.intel.gkl.compression.IntelDeflaterFactory; 24 | import com.intel.gkl.compression.IntelInflaterFactory; 25 | import htsjdk.samtools.util.zip.DeflaterFactory; 26 | import htsjdk.samtools.util.zip.InflaterFactory; 27 | 28 | /** 29 | * GKL code is kept here so this class is only loaded when the Intel inflator or 30 | * deflator is used. This means that users that aren't using these features don't 31 | * have to put the GKL JAR on their classpath (since it's a provided dependency). 32 | */ 33 | class IntelGKLAccessor { 34 | static InflaterFactory newInflatorFactor() { 35 | return new IntelInflaterFactory(); 36 | } 37 | static DeflaterFactory newDeflaterFactory() { 38 | return new IntelDeflaterFactory(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/util/DataInputWrapper.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 
20 | 21 | // File created: 2011-06-27 09:25:59 22 | 23 | package org.seqdoop.hadoop_bam.util; 24 | 25 | import java.io.DataInput; 26 | import java.io.InputStream; 27 | import java.io.IOException; 28 | 29 | public class DataInputWrapper extends InputStream { 30 | private final DataInput in; 31 | 32 | public DataInputWrapper(DataInput i) { in = i; } 33 | 34 | @Override public long skip(long n) throws IOException { 35 | for (; n > Integer.MAX_VALUE; n -= Integer.MAX_VALUE) { 36 | final int skipped = in.skipBytes(Integer.MAX_VALUE); 37 | if (skipped < Integer.MAX_VALUE) 38 | return skipped; 39 | } 40 | return in.skipBytes((int)n); 41 | } 42 | @Override public int read(byte[] b, int off, int len) throws IOException { 43 | in.readFully(b, off, len); 44 | return len; 45 | } 46 | @Override public int read() throws IOException { 47 | return in.readByte(); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBAMRecordWriter.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2010-08-11 10:36:08 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import java.io.IOException; 26 | import java.io.OutputStream; 27 | 28 | import htsjdk.samtools.SAMFileHeader; 29 | 30 | import org.apache.hadoop.fs.Path; 31 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 32 | 33 | /** A convenience class that you can use as a RecordWriter for BAM files. 34 | * 35 | *
<p>The write function ignores the key, just outputting the SAMRecord.
36 | */ 37 | public class KeyIgnoringBAMRecordWriter<K> extends BAMRecordWriter<K> { 38 | public KeyIgnoringBAMRecordWriter( 39 | Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) 40 | throws IOException 41 | { 42 | super(output, input, writeHeader, ctx); 43 | } 44 | public KeyIgnoringBAMRecordWriter( 45 | Path output, SAMFileHeader header, boolean writeHeader, 46 | TaskAttemptContext ctx) 47 | throws IOException 48 | { 49 | super(output, header, writeHeader, ctx); 50 | } 51 | 52 | @Override public void write(K ignored, SAMRecordWritable rec) throws IOException { 53 | writeAlignment(rec.get()); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/TestSplittingBAMIndexer.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import htsjdk.samtools.SAMRecord; 4 | import htsjdk.samtools.SamReader; 5 | import htsjdk.samtools.SamReaderFactory; 6 | import java.io.File; 7 | import java.io.FileOutputStream; 8 | import java.io.IOException; 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.junit.Before; 11 | import org.junit.Test; 12 | 13 | import static org.junit.Assert.assertEquals; 14 | import static org.junit.Assert.assertTrue; 15 | 16 | public class TestSplittingBAMIndexer { 17 | private String input; 18 | 19 | @Before 20 | public void setup() throws Exception { 21 | input = ClassLoader.getSystemClassLoader().getResource("test.bam").getFile(); 22 | } 23 | 24 | @Test 25 | public void testIndexersProduceSameIndexes() throws Exception { 26 | long bamFileSize = new File(input).length(); 27 | for (int g : new int[] { 2, 10, SplittingBAMIndexer.DEFAULT_GRANULARITY}) { 28 | SplittingBAMIndex index1 = fromBAMFile(g); 29 | SplittingBAMIndex index2 = fromSAMRecords(g); 30 | assertEquals(index1, index2); 31 | assertEquals(bamFileSize, index1.bamSize()); 32 | assertEquals(bamFileSize, index2.bamSize()); 33 | } 34 | } 35 | 36 | private SplittingBAMIndex fromBAMFile(int granularity) throws 37 | IOException { 38 | Configuration conf = new Configuration(); 39 | conf.set("input", new File(input).toURI().toString()); 40 | conf.setInt("granularity", granularity); 41 | 42 | SplittingBAMIndexer.run(conf); 43 | 44 | File indexFile = new File(input + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION); 45 | assertTrue(indexFile.exists()); 46 | 47 | return new SplittingBAMIndex(indexFile); 48 | } 49 | 50 | private SplittingBAMIndex fromSAMRecords(int granularity) throws IOException { 51 | File indexFile = new File(input + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION); 52 | FileOutputStream out = new FileOutputStream(indexFile); 53 | SplittingBAMIndexer indexer = new SplittingBAMIndexer(out, granularity); 54 | SamReader samReader = SamReaderFactory.makeDefault() 55 | .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS).open(new File(input)); 56 | for (SAMRecord r : samReader) { 57 | indexer.processAlignment(r); 58 | } 59 | indexer.finish(new File(input).length()); 60 | out.close(); 61 | 62 | assertTrue(indexFile.exists()); 63 | 64 | return new SplittingBAMIndex(indexFile); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/FormatConstants.java: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2011-2012 CRS4. 2 | // 3 | // This file is part of Hadoop-BAM. 
4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to 7 | // deal in the Software without restriction, including without limitation the 8 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 9 | // sell copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in 13 | // all copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 | // IN THE SOFTWARE. 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | public class FormatConstants 26 | { 27 | /** 28 | * Offset by which Sanger-style ASCII-encoded quality scores are shifted. 29 | */ 30 | public static final int SANGER_OFFSET = 33; 31 | 32 | /** 33 | * Maximum encodable quality score for Sanger Phred+33 encoded base qualities. 34 | * 35 | * Range of legal values is [0,93], according to wikipedia on 10/9/2013: 36 | * http://en.wikipedia.org/wiki/FASTQ_format#Quality 37 | */ 38 | public static final int SANGER_MAX = 93; 39 | 40 | /** 41 | * Offset by which Illumina-style ASCII-encoded quality scores are shifted. 42 | */ 43 | public static final int ILLUMINA_OFFSET = 64; 44 | 45 | /** 46 | * Maximum encodable quality score for Illumina Phred+64 encoded base qualities. 47 | */ 48 | public static final int ILLUMINA_MAX = 62; 49 | 50 | /** 51 | * Encodings for base quality formats. 52 | */ 53 | public enum BaseQualityEncoding { Illumina, Sanger }; 54 | 55 | private FormatConstants() {} // no instantiation 56 | 57 | public static final String CONF_INPUT_BASE_QUALITY_ENCODING = "hbam.input.base-quality-encoding"; 58 | public static final String CONF_INPUT_FILTER_FAILED_QC = "hbam.input.filter-failed-qc"; 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringSAMRecordWriter.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2012-02-23 12:53:50 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import java.io.IOException; 26 | import java.io.OutputStream; 27 | 28 | import htsjdk.samtools.SAMFileHeader; 29 | 30 | import org.apache.hadoop.fs.Path; 31 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 32 | 33 | /** A convenience class that you can use as a RecordWriter for SAM files. 34 | * 35 | *
<p>The write function ignores the key, just outputting the SAMRecord.</p>
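 *
 * <p>A minimal usage sketch (hypothetical driver code, not from this file;
 * the input path, {@code conf} and the key type are assumptions):</p>
 * <pre>{@code
 * SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(new Path("in.bam"), conf);
 * KeyIgnoringSAMRecordWriter<NullWritable> writer =
 *     new KeyIgnoringSAMRecordWriter<>(System.out, header, true);
 * writer.write(NullWritable.get(), recordWritable);  // the key is ignored
 * }</pre>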
36 | */ 37 | public class KeyIgnoringSAMRecordWriter extends SAMRecordWriter { 38 | public KeyIgnoringSAMRecordWriter( 39 | Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) 40 | throws IOException 41 | { 42 | super(output, input, writeHeader, ctx); 43 | } 44 | public KeyIgnoringSAMRecordWriter( 45 | Path output, SAMFileHeader header, boolean writeHeader, 46 | TaskAttemptContext ctx) 47 | throws IOException 48 | { 49 | super(output, header, writeHeader, ctx); 50 | } 51 | public KeyIgnoringSAMRecordWriter( 52 | OutputStream output, SAMFileHeader header, boolean writeHeader) 53 | throws IOException 54 | { 55 | super(output, header, writeHeader); 56 | } 57 | 58 | @Override public void write(K ignored, SAMRecordWritable rec) { 59 | writeAlignment(rec.get()); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/SAMInputFormat.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2012-02-02 11:32:58 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import java.io.IOException; 26 | 27 | import org.apache.hadoop.fs.Path; 28 | import org.apache.hadoop.io.LongWritable; 29 | import org.apache.hadoop.mapreduce.JobContext; 30 | import org.apache.hadoop.mapreduce.InputSplit; 31 | import org.apache.hadoop.mapreduce.RecordReader; 32 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 33 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 34 | 35 | /** An {@link org.apache.hadoop.mapreduce.InputFormat} for SAM files. Values 36 | * are the individual records; see {@link BAMRecordReader} for the meaning of 37 | * the key. 38 | */ 39 | public class SAMInputFormat 40 | extends FileInputFormat 41 | { 42 | /** Returns a {@link SAMRecordReader} initialized with the parameters. 
*/ 43 | @Override public RecordReader 44 | createRecordReader(InputSplit split, TaskAttemptContext ctx) 45 | throws InterruptedException, IOException 46 | { 47 | final RecordReader rr = 48 | new SAMRecordReader(); 49 | rr.initialize(split, ctx); 50 | return rr; 51 | } 52 | 53 | @Override public boolean isSplitable(JobContext job, Path path) { 54 | return super.isSplitable(job, path); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/SAMFormat.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2012-02-23 14:06:35 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import java.io.InputStream; 26 | import java.io.IOException; 27 | 28 | import org.apache.hadoop.fs.Path; 29 | 30 | /** Describes a SAM format. */ 31 | public enum SAMFormat { 32 | SAM, BAM, CRAM; 33 | 34 | /** Infers the SAM format by looking at the filename of the given path. 35 | * 36 | * @see #inferFromFilePath(String) 37 | */ 38 | public static SAMFormat inferFromFilePath(final Path path) { 39 | return inferFromFilePath(path.getName()); 40 | } 41 | 42 | /** Infers the SAM format by looking at the extension of the given file 43 | * name. *.sam is recognized as {@link #SAM}, 44 | * *.bam as {@link #BAM}, and *.bam as {@link #CRAM}. 
45 | */ 46 | public static SAMFormat inferFromFilePath(final String name) { 47 | if (name.endsWith(".bam")) return BAM; 48 | if (name.endsWith(".cram")) return CRAM; 49 | if (name.endsWith(".sam")) return SAM; 50 | return null; 51 | } 52 | 53 | public static SAMFormat inferFromData(final InputStream in) throws IOException { 54 | final byte b = (byte)in.read(); 55 | in.close(); 56 | switch (b) { 57 | case 0x1f: return SAMFormat.BAM; 58 | case 0x43: return SAMFormat.CRAM; 59 | case '@': return SAMFormat.SAM; 60 | } 61 | return null; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/VCFOutputFormat.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2013-06-26 56:09:25 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import org.apache.hadoop.conf.Configuration; 26 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 27 | 28 | /** An abstract {@link org.apache.hadoop.mapreduce.OutputFormat} for VCF and 29 | * BCF files. Only locks down the value type and stores the output format 30 | * requested. 31 | */ 32 | public abstract class VCFOutputFormat 33 | extends FileOutputFormat 34 | { 35 | /** A string property defining the output format to use. The value is read 36 | * directly by {@link VCFFormat#valueOf}. 37 | */ 38 | public static final String OUTPUT_VCF_FORMAT_PROPERTY = 39 | "hadoopbam.vcf.output-format"; 40 | 41 | protected VCFFormat format; 42 | 43 | /** Creates a new output format, reading {@link #OUTPUT_VCF_FORMAT_PROPERTY} 44 | * from the given Configuration. 45 | */ 46 | protected VCFOutputFormat(Configuration conf) { 47 | final String fmtStr = conf.get(OUTPUT_VCF_FORMAT_PROPERTY); 48 | 49 | format = fmtStr == null ? null : VCFFormat.valueOf(fmtStr); 50 | } 51 | 52 | /** Creates a new output format for the given VCF format. 
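 *
 * <p>The same choice can be made via configuration instead (a hedged
 * one-liner; the property and enum are the ones declared above):</p>
 * <pre>{@code
 * conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, VCFFormat.BCF.name());
 * }</pre>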
*/ 53 | protected VCFOutputFormat(VCFFormat fmt) { 54 | if (fmt == null) 55 | throw new IllegalArgumentException("null VCFFormat"); 56 | format = fmt; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/util/GetSortedBAMHeader.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2010-08-20 13:54:10 22 | 23 | package org.seqdoop.hadoop_bam.util; 24 | 25 | import java.io.File; 26 | import java.io.FileOutputStream; 27 | import java.io.IOException; 28 | 29 | import htsjdk.samtools.SAMFileHeader; 30 | import htsjdk.samtools.SamReaderFactory; 31 | import htsjdk.samtools.ValidationStringency; 32 | 33 | import org.seqdoop.hadoop_bam.SAMFormat; 34 | 35 | public final class GetSortedBAMHeader { 36 | public static void main(String[] args) throws IOException { 37 | if (args.length < 2) { 38 | System.err.println( 39 | "Usage: GetSortedBAMHeader input output\n\n"+ 40 | 41 | "Reads the BAM header from input (a standard BGZF-compressed BAM "+ 42 | "file), and\nwrites it (BGZF-compressed, no terminator block) to "+ 43 | "output. 
Sets the sort order\nindicated in the SAM header to "+ 44 | "'coordinate'."); 45 | System.exit(1); 46 | } 47 | 48 | final SAMFileHeader h = 49 | SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT) 50 | .setUseAsyncIo(false) 51 | .open(new File(args[0])).getFileHeader(); 52 | h.setSortOrder(SAMFileHeader.SortOrder.coordinate); 53 | 54 | try (FileOutputStream stream = new FileOutputStream(args[1])) { 55 | new SAMOutputPreparer().prepareForRecords(stream, SAMFormat.BAM, h); 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/AnySAMOutputFormat.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2012-02-23 13:00:24 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import org.apache.hadoop.conf.Configuration; 26 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 27 | 28 | /** An abstract {@link org.apache.hadoop.mapreduce.OutputFormat} for SAM and 29 | * BAM files. Only locks down the value type and stores the output format 30 | * requested. 31 | */ 32 | public abstract class AnySAMOutputFormat 33 | extends FileOutputFormat 34 | { 35 | /** A string property defining the output format to use. The value is read 36 | * directly by {@link SAMFormat#valueOf}. 37 | */ 38 | public static final String OUTPUT_SAM_FORMAT_PROPERTY = 39 | "hadoopbam.anysam.output-format"; 40 | 41 | protected SAMFormat format; 42 | 43 | /** Creates a new output format, reading {@link #OUTPUT_SAM_FORMAT_PROPERTY} 44 | * from the given Configuration. 45 | */ 46 | protected AnySAMOutputFormat(Configuration conf) { 47 | final String fmtStr = conf.get(OUTPUT_SAM_FORMAT_PROPERTY); 48 | 49 | format = fmtStr == null ? null : SAMFormat.valueOf(fmtStr); 50 | } 51 | 52 | /** Creates a new output format for the given SAM format. 
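 *
 * <p>Equivalently via configuration (a hedged one-liner; the property is the
 * one declared above): {@code conf.set(AnySAMOutputFormat.OUTPUT_SAM_FORMAT_PROPERTY, "BAM")}.</p>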
*/ 53 | protected AnySAMOutputFormat(SAMFormat fmt) { 54 | if (fmt == null) 55 | throw new IllegalArgumentException("null SAMFormat"); 56 | format = fmt; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/BAMOutputFormat.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2010-08-11 12:17:33 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import org.apache.hadoop.conf.Configuration; 26 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 27 | 28 | /** Currently this only locks down the value type of the {@link 29 | * org.apache.hadoop.mapreduce.OutputFormat}: contains no functionality. 30 | */ 31 | public abstract class BAMOutputFormat 32 | extends FileOutputFormat { 33 | /** 34 | * If set to true, write .splitting-bai files for every BAM file 35 | * (defaults to false). 36 | * A splitting BAI file (not to be confused with a regular BAI file) contains an 37 | * index of offsets that the BAM file can be read from; they are used by 38 | * {@link BAMInputFormat} to construct splits. 39 | */ 40 | public static final String WRITE_SPLITTING_BAI = 41 | "hadoopbam.bam.write-splitting-bai"; 42 | 43 | /** 44 | * If set to true, use the Intel deflater for compressing DEFLATE compressed streams. 45 | * If set, the GKL library must be 46 | * provided on the classpath. 
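 *
 * <p>A hedged configuration sketch ({@code conf} is assumed to be the job's
 * {@code Configuration}):</p>
 * <pre>{@code
 * conf.setBoolean(BAMOutputFormat.USE_INTEL_DEFLATER_PROPERTY, true);
 * }</pre>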
47 | */ 48 | public static final String USE_INTEL_DEFLATER_PROPERTY = "hadoopbam.bam.use-intel-deflater"; 49 | 50 | static boolean useIntelDeflater(Configuration conf) { 51 | return conf.getBoolean(USE_INTEL_DEFLATER_PROPERTY, false); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/TestSAMHeaderReader.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import htsjdk.samtools.*; 4 | import htsjdk.samtools.cram.CRAMException; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.junit.Rule; 7 | import org.junit.Test; 8 | import org.junit.rules.ExpectedException; 9 | import org.seqdoop.hadoop_bam.util.SAMHeaderReader; 10 | 11 | import java.io.InputStream; 12 | import java.net.URI; 13 | 14 | import static org.junit.Assert.assertEquals; 15 | 16 | public class TestSAMHeaderReader { 17 | @Rule 18 | public ExpectedException thrown= ExpectedException.none(); 19 | 20 | @Test 21 | public void testBAMHeaderReaderNoReference() throws Exception { 22 | 23 | final Configuration conf = new Configuration(); 24 | 25 | InputStream inputStream = ClassLoader.getSystemClassLoader().getResourceAsStream("test.bam"); 26 | final SamReader samReader = SamReaderFactory.makeDefault().open(SamInputResource.of(inputStream)); 27 | int sequenceCount = samReader.getFileHeader().getSequenceDictionary().size(); 28 | samReader.close(); 29 | 30 | inputStream = ClassLoader.getSystemClassLoader().getResourceAsStream("test.bam"); 31 | SAMFileHeader samHeader = SAMHeaderReader.readSAMHeaderFrom(inputStream, conf); 32 | inputStream.close(); 33 | 34 | assertEquals(samHeader.getSequenceDictionary().size(), sequenceCount); 35 | } 36 | 37 | @Test 38 | public void testCRAMHeaderReaderWithReference() throws Exception { 39 | final Configuration conf = new Configuration(); 40 | 41 | final InputStream inputStream = ClassLoader.getSystemClassLoader().getResourceAsStream("test.cram"); 42 | final URI reference = ClassLoader.getSystemClassLoader().getResource("auxf.fa").toURI(); 43 | conf.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY, reference.toString()); 44 | 45 | SAMFileHeader samHeader = SAMHeaderReader.readSAMHeaderFrom(inputStream, conf); 46 | inputStream.close(); 47 | 48 | assertEquals(samHeader.getSequenceDictionary().size(), 1); 49 | } 50 | 51 | @Test 52 | public void testCRAMHeaderReaderNoReference() throws Exception { 53 | 54 | thrown.expect(IllegalStateException.class); // htsjdk throws on CRAM file with no reference provided 55 | 56 | final Configuration conf = new Configuration(); 57 | final InputStream inputStream = ClassLoader.getSystemClassLoader().getResourceAsStream("test.cram"); 58 | SAMFileHeader samHeader = SAMHeaderReader.readSAMHeaderFrom(inputStream, conf); 59 | inputStream.close(); 60 | 61 | assertEquals(samHeader.getSequenceDictionary().size(), 1); 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/util/ConfHelper.java: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2011-2012 CRS4. 2 | // 3 | // This file is part of Hadoop-BAM. 
4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to 7 | // deal in the Software without restriction, including without limitation the 8 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 9 | // sell copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in 13 | // all copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 | // IN THE SOFTWARE. 22 | 23 | package org.seqdoop.hadoop_bam.util; 24 | 25 | import org.apache.hadoop.conf.Configuration; 26 | 27 | public class ConfHelper 28 | { 29 | /** 30 | * Convert a string to a boolean. 31 | * 32 | * Accepted values: "yes", "true", "t", "y", "1" 33 | * "no", "false", "f", "n", "0" 34 | * All comparisons are case insensitive. 35 | * 36 | * If the value provided is null, defaultValue is returned. 37 | * 38 | * @exception IllegalArgumentException Thrown if value is not 39 | * null and doesn't match any of the accepted strings. 40 | */ 41 | public static boolean parseBoolean(String value, boolean defaultValue) 42 | { 43 | if (value == null) 44 | return defaultValue; 45 | 46 | value = value.trim(); 47 | 48 | // any of the following will 49 | final String[] acceptedTrue = new String[]{ "yes", "true", "t", "y", "1" }; 50 | final String[] acceptedFalse = new String[]{ "no", "false", "f", "n", "0" }; 51 | 52 | for (String possible: acceptedTrue) 53 | { 54 | if (possible.equalsIgnoreCase(value)) 55 | return true; 56 | } 57 | for (String possible: acceptedFalse) 58 | { 59 | if (possible.equalsIgnoreCase(value)) 60 | return false; 61 | } 62 | 63 | throw new IllegalArgumentException("Unrecognized boolean value '" + value + "'"); 64 | } 65 | 66 | public static boolean parseBoolean(Configuration conf, String propertyName, boolean defaultValue) 67 | { 68 | return parseBoolean(conf.get(propertyName), defaultValue); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/util/IntervalUtil.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam.util; 2 | 3 | import com.google.common.collect.ImmutableList; 4 | import htsjdk.samtools.util.Interval; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.seqdoop.hadoop_bam.FormatException; 7 | 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | import java.util.function.Supplier; 11 | 12 | /** 13 | * Common utilities across different file formats. 14 | */ 15 | public final class IntervalUtil { 16 | 17 | // declared to prevent instantiation. 18 | private IntervalUtil() {} 19 | 20 | /** 21 | * Returns the list of intervals found in a string configuration property separated by colons. 22 | * @param conf the source configuration. 
23 | * @param intervalPropertyName the property name holding the intervals. 24 | * @return {@code null} if there is no such a property in the configuration. 25 | * @throws NullPointerException if either input is null. 26 | */ 27 | public static List getIntervals(final Configuration conf, final String intervalPropertyName) { 28 | final String intervalsProperty = conf.get(intervalPropertyName); 29 | if (intervalsProperty == null) { 30 | return null; 31 | } 32 | if (intervalsProperty.isEmpty()) { 33 | return ImmutableList.of(); 34 | } 35 | final List intervals = new ArrayList<>(); 36 | for (final String s : intervalsProperty.split(",")) { 37 | final int lastColonIdx = s.lastIndexOf(':'); 38 | if (lastColonIdx < 0) { 39 | throw new FormatException("no colon found in interval string: " + s); 40 | } 41 | final int hyphenIdx = s.indexOf('-', lastColonIdx + 1); 42 | if (hyphenIdx < 0) { 43 | throw new FormatException("no hyphen found after colon interval string: " + s); 44 | } 45 | final String sequence = s.substring(0, lastColonIdx); 46 | final int start = parseIntOrThrowFormatException(s.substring(lastColonIdx + 1, hyphenIdx), 47 | "invalid start position", s); 48 | final int stop = parseIntOrThrowFormatException(s.substring(hyphenIdx + 1), 49 | "invalid stop position", s); 50 | intervals.add(new Interval(sequence, start, stop)); 51 | } 52 | return intervals; 53 | } 54 | 55 | private static int parseIntOrThrowFormatException(final String str, final String error, final String input) { 56 | try { 57 | return Integer.parseInt(str); 58 | } catch (final NumberFormatException ex) { 59 | throw new FormatException(error + " in interval '" + input + "': '" + str + "'"); 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/TestBGZFSplitGuesser.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import htsjdk.samtools.util.BlockCompressedInputStream; 4 | import htsjdk.samtools.util.BlockCompressedStreamConstants; 5 | import java.io.File; 6 | import java.io.IOException; 7 | import java.util.Arrays; 8 | import java.util.Collection; 9 | import java.util.LinkedList; 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.fs.FSDataInputStream; 12 | import org.apache.hadoop.fs.Path; 13 | import org.junit.Test; 14 | import org.junit.runner.RunWith; 15 | import org.junit.runners.Parameterized; 16 | import org.seqdoop.hadoop_bam.util.BGZFSplitGuesser; 17 | 18 | import static org.junit.Assert.assertEquals; 19 | 20 | @RunWith(Parameterized.class) 21 | public class TestBGZFSplitGuesser { 22 | 23 | private final File file; 24 | private final long firstSplit; 25 | private final long lastSplit; 26 | 27 | public TestBGZFSplitGuesser(String filename, long firstSplit, long lastSplit) { 28 | this.file = new File("src/test/resources/" + filename); 29 | this.firstSplit = firstSplit; 30 | this.lastSplit = lastSplit; 31 | } 32 | 33 | @Parameterized.Parameters 34 | public static Collection data() { 35 | return Arrays.asList(new Object[][] { 36 | {"test.vcf.bgzf.gz", 821, 821}, {"HiSeq.10000.vcf.bgzf.gz", 16688, 509222} 37 | }); 38 | } 39 | 40 | @Test 41 | public void test() throws IOException { 42 | Configuration conf = new Configuration(); 43 | Path path = new Path(file.toURI()); 44 | FSDataInputStream fsDataInputStream = path.getFileSystem(conf).open(path); 45 | BGZFSplitGuesser bgzfSplitGuesser = new 
BGZFSplitGuesser(fsDataInputStream); 46 | LinkedList boundaries = new LinkedList<>(); 47 | long start = 1; 48 | while (true) { 49 | long end = file.length(); 50 | long nextStart = bgzfSplitGuesser.guessNextBGZFBlockStart(start, end); 51 | if (nextStart == end) { 52 | break; 53 | } 54 | boundaries.add(nextStart); 55 | canReadFromBlockStart(nextStart); 56 | start = nextStart + 1; 57 | } 58 | assertEquals(firstSplit, (long) boundaries.getFirst()); 59 | assertEquals(lastSplit, (long) boundaries.getLast()); 60 | 61 | assertEquals("Last block start is terminator gzip block", 62 | file.length() - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length, 63 | (long) boundaries.get(boundaries.size() - 1)); 64 | } 65 | 66 | private void canReadFromBlockStart(long blockStart) throws IOException { 67 | BlockCompressedInputStream blockCompressedInputStream = new 68 | BlockCompressedInputStream(file); 69 | blockCompressedInputStream.setCheckCrcs(true); 70 | blockCompressedInputStream.seek(blockStart << 16); 71 | byte[] b = new byte[100]; 72 | blockCompressedInputStream.read(b); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/LazyParsingGenotypesContext.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2013-07-05 16:18:57 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import htsjdk.variant.variantcontext.LazyGenotypesContext; 26 | import htsjdk.variant.vcf.VCFHeader; 27 | 28 | /** You need to call getParser().setHeader() here before trying to decode() a 29 | * GenotypesContext in any VariantContext that came about via 30 | * VariantContextWritable.readFields(). That includes calling 31 | * VariantContext.fullyDecode() or almost any of the GenotypesContext methods. 32 | * The RecordReader provided by VCFInputFormat does this for you. 33 | */ 34 | // There's no public LazyGenotypesContext.LazyParser in Picard so we need to 35 | // provide our own. Since we need to have the header in the parser set 36 | // externally, we also need to provide a LazyGenotypesContext which gives 37 | // access to the parser. 38 | // 39 | // And since VCF and BCF have different kinds of lazy data, we have separate 40 | // classes implementing the actual parsing for each. 
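//
// A hedged sketch of the decode flow (this is what the RecordReader provided
// by VCFInputFormat does for you; the cast and the headerDataCache variable
// are assumptions for illustration, not code from this file):
//
//   LazyParsingGenotypesContext gc =
//       (LazyParsingGenotypesContext) vc.getGenotypes();  // vc via readFields()
//   gc.getParser().setHeaderDataCache(headerDataCache);   // header must be set
//   // only after this is it safe to decode the genotype data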
41 | public abstract class LazyParsingGenotypesContext 42 | extends LazyGenotypesContext 43 | { 44 | // super.parser is inaccessible to us so we keep a copy that we can access. 45 | private final Parser parserCopy; 46 | 47 | protected LazyParsingGenotypesContext(Parser p, byte[] data, int count) { 48 | super(p, data, count); 49 | parserCopy = p; 50 | } 51 | 52 | public Parser getParser() { return parserCopy; } 53 | 54 | public static interface HeaderDataCache { 55 | public void setHeader(VCFHeader header); 56 | } 57 | 58 | public static abstract class Parser implements LazyParser { 59 | public abstract void setHeaderDataCache(HeaderDataCache data); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringCRAMOutputFormat.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | import htsjdk.samtools.SAMFileHeader; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.mapreduce.RecordWriter; 11 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 12 | 13 | import org.seqdoop.hadoop_bam.util.SAMHeaderReader; 14 | 15 | /** Writes only the BAM records, not the key. 16 | * 17 | *
<p>A {@link SAMFileHeader} must be provided via {@link #setSAMHeader} or 18 | * {@link #readSAMHeaderFrom} before {@link #getRecordWriter} is called.</p>
19 | * 20 | *
<p>By default, writes the SAM header to the output file(s). This 21 | * can be disabled, because in distributed usage one often ends up with (and, 22 | * for decent performance, wants to end up with) the output split into multiple 23 | * parts, which are easier to concatenate if the header is not present in each 24 | * file.</p>
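 *
 * <p>A hedged driver-side sketch (the input path, {@code conf} and
 * {@code ctx} are assumptions; the methods are the ones declared below;
 * note that reading a CRAM header may also require
 * {@code CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY} to be set):</p>
 * <pre>{@code
 * KeyIgnoringCRAMOutputFormat<NullWritable> out = new KeyIgnoringCRAMOutputFormat<>();
 * out.readSAMHeaderFrom(new Path("in.cram"), conf);  // or setSAMHeader(header)
 * out.setWriteHeader(false);  // headerless parts are easier to concatenate
 * RecordWriter<NullWritable, SAMRecordWritable> rw = out.getRecordWriter(ctx);
 * }</pre>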
25 | */ 26 | public class KeyIgnoringCRAMOutputFormat extends CRAMOutputFormat { 27 | protected SAMFileHeader header; 28 | private boolean writeHeader = true; 29 | 30 | public KeyIgnoringCRAMOutputFormat() {} 31 | 32 | /** Whether the header will be written or not. */ 33 | public boolean getWriteHeader() { return writeHeader; } 34 | 35 | /** Set whether the header will be written or not. */ 36 | public void setWriteHeader(boolean b) { writeHeader = b; } 37 | 38 | public SAMFileHeader getSAMHeader() { return header; } 39 | public void setSAMHeader(SAMFileHeader header) { this.header = header; } 40 | 41 | public void readSAMHeaderFrom(Path path, Configuration conf) 42 | throws IOException 43 | { 44 | this.header = SAMHeaderReader.readSAMHeaderFrom(path, conf); 45 | } 46 | public void readSAMHeaderFrom(InputStream in, Configuration conf) { 47 | this.header = SAMHeaderReader.readSAMHeaderFrom(in, conf); 48 | } 49 | 50 | /** setSAMHeader or readSAMHeaderFrom must have 51 | * been called first. 52 | */ 53 | @Override public RecordWriter getRecordWriter( 54 | TaskAttemptContext ctx) 55 | throws IOException 56 | { 57 | return getRecordWriter(ctx, getDefaultWorkFile(ctx, "")); 58 | } 59 | 60 | // Allows wrappers to provide their own work file. 61 | public RecordWriter getRecordWriter( 62 | TaskAttemptContext ctx, Path out) 63 | throws IOException 64 | { 65 | if (this.header == null) 66 | throw new IOException( 67 | "Can't create a RecordWriter without the SAM header"); 68 | 69 | return new KeyIgnoringCRAMRecordWriter(out, header, writeHeader, ctx); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/util/BGZFCodec.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam.util; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.OutputStream; 6 | import org.apache.hadoop.fs.Seekable; 7 | import org.apache.hadoop.io.compress.CompressionCodec; 8 | import org.apache.hadoop.io.compress.CompressionOutputStream; 9 | import org.apache.hadoop.io.compress.Compressor; 10 | import org.apache.hadoop.io.compress.Decompressor; 11 | import org.apache.hadoop.io.compress.GzipCodec; 12 | import org.apache.hadoop.io.compress.SplitCompressionInputStream; 13 | import org.apache.hadoop.io.compress.SplittableCompressionCodec; 14 | 15 | /** 16 | * A Hadoop {@link CompressionCodec} for the 17 | * BGZF compression format, 18 | * which reads and writes files with a .bgz suffix. There is no standard 19 | * suffix for BGZF-compressed files, and in fact .gz is commonly used, in 20 | * which case {@link BGZFEnhancedGzipCodec} should be used instead of this class. 21 | *
<p>
22 | * To use BGZFCodec, set it on the configuration object as follows. 23 | *
<p>
24 | * {@code 25 | * conf.set("io.compression.codecs", BGZFCodec.class.getCanonicalName()) 26 | * } 27 | * @see BGZFEnhancedGzipCodec 28 | */ 29 | public class BGZFCodec extends GzipCodec implements SplittableCompressionCodec { 30 | 31 | public static final String DEFAULT_EXTENSION = ".bgz"; 32 | 33 | @Override 34 | public CompressionOutputStream createOutputStream(OutputStream out) throws IOException { 35 | return new BGZFCompressionOutputStream(out); 36 | } 37 | 38 | // compressors are not used, so ignore/return null 39 | 40 | @Override 41 | public CompressionOutputStream createOutputStream(OutputStream out, 42 | Compressor compressor) throws IOException { 43 | return createOutputStream(out); // compressors are not used, so ignore 44 | } 45 | 46 | @Override 47 | public Class getCompressorType() { 48 | return null; // compressors are not used, so return null 49 | } 50 | 51 | @Override 52 | public Compressor createCompressor() { 53 | return null; // compressors are not used, so return null 54 | } 55 | 56 | @Override 57 | public SplitCompressionInputStream createInputStream(InputStream seekableIn, 58 | Decompressor decompressor, long start, long end, READ_MODE readMode) throws IOException { 59 | BGZFSplitGuesser splitGuesser = new BGZFSplitGuesser(seekableIn); 60 | long adjustedStart = splitGuesser.guessNextBGZFBlockStart(start, end); 61 | ((Seekable)seekableIn).seek(adjustedStart); 62 | return new BGZFSplitCompressionInputStream(seekableIn, adjustedStart, end); 63 | } 64 | 65 | // fall back to GzipCodec for input streams without a start position 66 | 67 | @Override 68 | public String getDefaultExtension() { 69 | return DEFAULT_EXTENSION; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBCFRecordWriter.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2013-06-28 16:36:22 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import java.io.IOException; 26 | import java.io.OutputStream; 27 | 28 | import org.apache.hadoop.fs.Path; 29 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 30 | 31 | import htsjdk.variant.vcf.VCFHeader; 32 | 33 | /** A convenience class that you can use as a RecordWriter for BCF files. 34 | * 35 | *
<p>The write function ignores the key, just outputting the 36 | * VariantContext.</p>
37 | */ 38 | public class KeyIgnoringBCFRecordWriter extends BCFRecordWriter { 39 | public KeyIgnoringBCFRecordWriter( 40 | Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) 41 | throws IOException 42 | { 43 | super(output, input, writeHeader, ctx); 44 | } 45 | public KeyIgnoringBCFRecordWriter( 46 | Path output, VCFHeader header, boolean writeHeader, 47 | TaskAttemptContext ctx) 48 | throws IOException 49 | { 50 | super(output, header, writeHeader, ctx); 51 | } 52 | /** 53 | * @deprecated This constructor has no {@link TaskAttemptContext} so it is not 54 | * possible to pass configuration properties to the writer. 55 | */ 56 | @Deprecated 57 | public KeyIgnoringBCFRecordWriter( 58 | OutputStream output, VCFHeader header, boolean writeHeader) 59 | throws IOException 60 | { 61 | super(output, header, writeHeader); 62 | } 63 | public KeyIgnoringBCFRecordWriter( 64 | OutputStream output, VCFHeader header, boolean writeHeader, 65 | TaskAttemptContext ctx) 66 | throws IOException 67 | { 68 | super(output, header, writeHeader, ctx); 69 | } 70 | 71 | @Override public void write(K ignored, VariantContextWritable vc) { 72 | writeRecord(vc.get()); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/VariantContextWritable.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2013-06-26 10:27:20 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import java.io.DataInput; 26 | import java.io.DataOutput; 27 | import java.io.IOException; 28 | 29 | import org.apache.hadoop.io.Writable; 30 | import htsjdk.variant.variantcontext.VariantContext; 31 | import htsjdk.variant.vcf.VCFHeader; 32 | 33 | /** VariantContexts read here have LazyGenotypesContexts, which need to have a 34 | * header set before the genotype data in the VariantContexts can be decoded. 35 | * See the LazyGenotypesContext class. 
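 *
 * <p>A hedged sketch (the {@code vc} and {@code header} variables are
 * assumptions):</p>
 * <pre>{@code
 * VariantContextWritable writable = new VariantContextWritable();
 * writable.set(vc, header);  // retains the header so genotypes stay decodable
 * }</pre>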
36 | */ 37 | public class VariantContextWritable implements Writable { 38 | private VariantContext vc; 39 | 40 | public VariantContext get() { return vc; } 41 | public void set(VariantContext vc) { this.vc = vc; } 42 | public void set(VariantContext vc, VCFHeader header) { this.vc = new VariantContextWithHeader(vc, header); } 43 | 44 | // XXX: Unfortunately there's no simple way to just pass a BCF record 45 | // through. Contrasting to BAM, there's no equivalent of the BAMRecord 46 | // subclass of SAMRecord that saves the original BAM fields --- a 47 | // VariantContext only saves the decoded info, so it's impossible to encode 48 | // one to BCF without the header. 49 | // 50 | // VCF is also unusable because VCFWriter defensively refuses to write 51 | // anything without a header, throwing IllegalStateException if attempted. 52 | // 53 | // Thus, we have a custom encoding. 54 | @Override public void write(final DataOutput out) throws IOException { 55 | VariantContextCodec.write(out, vc); 56 | } 57 | @Override public void readFields(final DataInput in) throws IOException { 58 | vc = VariantContextCodec.read(in); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringVCFRecordWriter.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2013-06-27 09:42:56 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import java.io.IOException; 26 | import java.io.OutputStream; 27 | 28 | import org.apache.hadoop.fs.Path; 29 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 30 | 31 | import htsjdk.variant.vcf.VCFHeader; 32 | 33 | /** A convenience class that you can use as a RecordWriter for VCF files. 34 | * 35 | *
<p>The write function ignores the key, just outputting the 36 | * VariantContext.</p>
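 *
 * <p>A minimal sketch using the non-deprecated stream constructor (the
 * stream, header and task context are assumptions):</p>
 * <pre>{@code
 * KeyIgnoringVCFRecordWriter<NullWritable> writer =
 *     new KeyIgnoringVCFRecordWriter<>(stream, vcfHeader, true, ctx);
 * writer.write(NullWritable.get(), variantContextWritable);  // the key is ignored
 * }</pre>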
37 | */ 38 | public class KeyIgnoringVCFRecordWriter extends VCFRecordWriter { 39 | public KeyIgnoringVCFRecordWriter( 40 | Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) 41 | throws IOException 42 | { 43 | super(output, input, writeHeader, ctx); 44 | } 45 | public KeyIgnoringVCFRecordWriter( 46 | Path output, VCFHeader header, boolean writeHeader, 47 | TaskAttemptContext ctx) 48 | throws IOException 49 | { 50 | super(output, header, writeHeader, ctx); 51 | } 52 | /** 53 | * @deprecated This constructor has no {@link TaskAttemptContext} so it is not 54 | * possible to pass configuration properties to the writer. 55 | */ 56 | @Deprecated 57 | public KeyIgnoringVCFRecordWriter( 58 | OutputStream output, VCFHeader header, boolean writeHeader) 59 | throws IOException 60 | { 61 | super(output, header, writeHeader); 62 | } 63 | public KeyIgnoringVCFRecordWriter( 64 | OutputStream output, VCFHeader header, boolean writeHeader, 65 | TaskAttemptContext ctx) 66 | throws IOException 67 | { 68 | super(output, header, writeHeader, ctx); 69 | } 70 | 71 | 72 | @Override public void write(K ignored, VariantContextWritable vc) { 73 | writeRecord(vc.get()); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/TestLineReader.java: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2011-2012 CRS4. 2 | // 3 | // This file is part of Hadoop-BAM. 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to 7 | // deal in the Software without restriction, including without limitation the 8 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 9 | // sell copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in 13 | // all copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 | // IN THE SOFTWARE. 
22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import org.junit.*; 26 | import static org.junit.Assert.*; 27 | 28 | import org.seqdoop.hadoop_bam.LineReader; 29 | 30 | import org.apache.hadoop.io.Text; 31 | 32 | import java.io.ByteArrayInputStream; 33 | import java.io.IOException; 34 | 35 | public class TestLineReader 36 | { 37 | public static final String input10 = "0123456789"; 38 | public static final String input22 = "0123456789\n0987654321\n"; 39 | 40 | private LineReader reader; 41 | private Text dest = new Text(); 42 | 43 | @Test 44 | public void testReadBufferedLine() throws IOException 45 | { 46 | reader = new LineReader(new ByteArrayInputStream(input22.getBytes()), 22); 47 | reader.readLine(dest); 48 | assertEquals("0123456789", dest.toString()); 49 | } 50 | 51 | @Test 52 | public void testSkipOnBufferedLine() throws IOException 53 | { 54 | reader = new LineReader(new ByteArrayInputStream(input22.getBytes()), 22); 55 | long skipped = reader.skip(1); 56 | assertEquals(1, skipped); 57 | reader.readLine(dest); 58 | assertEquals("123456789", dest.toString()); 59 | } 60 | 61 | @Test 62 | public void testReadBeyondBuffer() throws IOException 63 | { 64 | reader = new LineReader(new ByteArrayInputStream(input22.getBytes()), 5); 65 | reader.readLine(dest); 66 | assertEquals("0123456789", dest.toString()); 67 | } 68 | 69 | @Test 70 | public void testSkipBeyondBuffer() throws IOException 71 | { 72 | reader = new LineReader(new ByteArrayInputStream(input22.getBytes()), 5); 73 | long skipped = reader.skip(11); 74 | assertEquals(11, skipped); 75 | reader.readLine(dest); 76 | assertEquals("0987654321", dest.toString()); 77 | } 78 | 79 | @Test 80 | public void testSkipBeyondInput() throws IOException 81 | { 82 | reader = new LineReader(new ByteArrayInputStream(input10.getBytes()), 5); 83 | long skipped = reader.skip(11); 84 | assertEquals(10, skipped); 85 | 86 | skipped = reader.skip(11); 87 | assertEquals(0, skipped); 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/CRAMRecordReader.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import htsjdk.samtools.CRAMIterator; 4 | import htsjdk.samtools.SAMRecord; 5 | import htsjdk.samtools.ValidationStringency; 6 | import htsjdk.samtools.cram.ref.ReferenceSource; 7 | import htsjdk.samtools.seekablestream.SeekableStream; 8 | import java.io.IOException; 9 | import java.net.URI; 10 | import java.nio.file.Paths; 11 | import org.apache.hadoop.conf.Configuration; 12 | import org.apache.hadoop.fs.Path; 13 | import org.apache.hadoop.io.LongWritable; 14 | import org.apache.hadoop.mapreduce.InputSplit; 15 | import org.apache.hadoop.mapreduce.RecordReader; 16 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 17 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 18 | import org.seqdoop.hadoop_bam.util.NIOFileUtil; 19 | import org.seqdoop.hadoop_bam.util.SAMHeaderReader; 20 | import org.seqdoop.hadoop_bam.util.WrapSeekable; 21 | 22 | public class CRAMRecordReader extends RecordReader { 23 | 24 | private final LongWritable key = new LongWritable(); 25 | private final SAMRecordWritable record = new SAMRecordWritable(); 26 | private boolean isInitialized = false; 27 | private SeekableStream seekableStream; 28 | private long start; 29 | private long length; 30 | private CRAMIterator cramIterator; 31 | 32 | @Override 33 | public void initialize(InputSplit split, 
TaskAttemptContext context) throws IOException { 34 | if(isInitialized) { 35 | close(); 36 | } 37 | isInitialized = true; 38 | 39 | final Configuration conf = context.getConfiguration(); 40 | final FileSplit fileSplit = (FileSplit) split; 41 | final Path file = fileSplit.getPath(); 42 | 43 | String refSourcePath = conf.get(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY); 44 | ReferenceSource refSource = new ReferenceSource(refSourcePath == null ? null : 45 | NIOFileUtil.asPath(refSourcePath)); 46 | 47 | seekableStream = WrapSeekable.openPath(conf, file); 48 | start = fileSplit.getStart(); 49 | length = fileSplit.getLength(); 50 | long end = start + length; 51 | // CRAMIterator right shifts boundaries by 16 so we do the reverse here 52 | // also subtract one from end since CRAMIterator's boundaries are inclusive 53 | long[] boundaries = new long[] {start << 16, (end - 1) << 16}; 54 | ValidationStringency stringency = SAMHeaderReader.getValidationStringency(conf); 55 | cramIterator = new CRAMIterator(seekableStream, refSource, boundaries, stringency); 56 | } 57 | 58 | @Override 59 | public boolean nextKeyValue() { 60 | if (!cramIterator.hasNext()) { 61 | return false; 62 | } 63 | SAMRecord r = cramIterator.next(); 64 | key.set(BAMRecordReader.getKey(r)); 65 | record.set(r); 66 | return true; 67 | } 68 | 69 | @Override 70 | public LongWritable getCurrentKey() { 71 | return key; 72 | } 73 | 74 | @Override 75 | public SAMRecordWritable getCurrentValue() { 76 | return record; 77 | } 78 | 79 | @Override 80 | public float getProgress() throws IOException { 81 | return (float)(seekableStream.position() - start) / length; 82 | } 83 | 84 | @Override 85 | public void close() { 86 | cramIterator.close(); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/VCFFormat.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2013-06-27 13:21:07 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import htsjdk.samtools.util.BlockCompressedInputStream; 26 | import java.io.BufferedInputStream; 27 | import java.io.InputStream; 28 | import java.io.IOException; 29 | 30 | import java.util.zip.GZIPInputStream; 31 | import org.apache.hadoop.fs.Path; 32 | 33 | /** Describes a VCF format. 
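 *
 * <p>For example (a hedged note based on the rules below):
 * {@code inferFromFilePath("calls.vcf.gz")} yields {@link #VCF}, while
 * {@code inferFromFilePath("calls.bcf")} yields {@link #BCF}.</p>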
*/ 34 | public enum VCFFormat { 35 | VCF, BCF; 36 | 37 | /** Infers the VCF format by looking at the filename of the given path. 38 | * 39 | * @see #inferFromFilePath(String) 40 | */ 41 | public static VCFFormat inferFromFilePath(final Path path) { 42 | return inferFromFilePath(path.getName()); 43 | } 44 | 45 | /** Infers the VCF format by looking at the extension of the given file 46 | * name. *.vcf is recognized as {@link #VCF} and 47 | * *.bcf as {@link #BCF}. 48 | */ 49 | public static VCFFormat inferFromFilePath(final String name) { 50 | if (name.endsWith(".bcf")) return BCF; 51 | if (name.endsWith(".vcf")) return VCF; 52 | if (name.endsWith(".gz")) return VCF; 53 | if (name.endsWith(".bgz")) return VCF; 54 | return null; 55 | } 56 | 57 | /** Infers the VCF format by looking at the first few bytes of the input. 58 | */ 59 | public static VCFFormat inferFromData(final InputStream in) throws IOException { 60 | BufferedInputStream bis = new BufferedInputStream(in); // so mark/reset is supported 61 | return inferFromUncompressedData(isGzip(bis) ? new GZIPInputStream(bis) : bis); 62 | } 63 | 64 | private static VCFFormat inferFromUncompressedData(final InputStream in) throws IOException { 65 | final byte b = (byte)in.read(); 66 | in.close(); 67 | switch (b) { 68 | case 'B': return BCF; 69 | case '#': return VCF; 70 | } 71 | return null; 72 | } 73 | 74 | /** 75 | * @return true if the stream is compressed with gzip (or BGZF) 76 | */ 77 | public static boolean isGzip(final InputStream in) throws IOException { 78 | in.mark(1); 79 | final byte b = (byte)in.read(); 80 | in.reset(); 81 | return b == 0x1f; 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/SAMRecordWritable.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 
20 | 21 | // File created: 2010-08-12 09:57:45 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import java.io.DataOutput; 26 | import java.io.DataInput; 27 | import java.io.IOException; 28 | 29 | import org.apache.hadoop.io.Writable; 30 | 31 | import htsjdk.samtools.BAMRecordCodec; 32 | import htsjdk.samtools.SAMRecord; 33 | 34 | import org.seqdoop.hadoop_bam.util.DataInputWrapper; 35 | import org.seqdoop.hadoop_bam.util.DataOutputWrapper; 36 | 37 | /** A {@link Writable} {@link SAMRecord}. 38 | * 39 | *

<p>In every mapper, the record will have a header, since BAMInputFormat 40 | * provides one. It is lost when transferring the SAMRecord to a reducer, 41 | * however. The current implementation of {@link BAMRecordCodec} does not 42 | * require a header for encoding or decoding of a SAMRecord, so 43 | * this fortunately doesn't matter for either {@link #write} or {@link 44 | * #readFields}.</p>
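*
* <p>A minimal mapper sketch (hypothetical job wiring; assumes the records
* come from an input format that supplies a header, such as BAMInputFormat):
* <pre>{@code
* public void map(LongWritable key, SAMRecordWritable value, Context ctx)
*     throws IOException, InterruptedException {
*   SAMRecord rec = value.get();
*   // ... inspect or modify rec here ...
*   value.set(rec);
*   ctx.write(key, value);
* }
* }</pre>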

45 | */ 46 | public class SAMRecordWritable implements Writable { 47 | private static final BAMRecordCodec lazyCodec = 48 | new BAMRecordCodec(null, new LazyBAMRecordFactory()); 49 | 50 | private SAMRecord record; 51 | 52 | public SAMRecord get() { return record; } 53 | public void set(SAMRecord r) { record = r; } 54 | 55 | @Override public void write(DataOutput out) throws IOException { 56 | // In theory, it shouldn't matter whether we give a header to 57 | // BAMRecordCodec or not, since the representation of an alignment in BAM 58 | // doesn't depend on the header data at all. Only its interpretation 59 | // does, and a simple read/write codec shouldn't really have anything to 60 | // say about that. (But in practice, it already does matter for decode(), 61 | // which is why LazyBAMRecordFactory exists.) 62 | final BAMRecordCodec codec = new BAMRecordCodec(record.getHeader()); 63 | codec.setOutputStream(new DataOutputWrapper(out)); 64 | codec.encode(record); 65 | } 66 | @Override public void readFields(DataInput in) throws IOException { 67 | lazyCodec.setInputStream(new DataInputWrapper(in)); 68 | record = lazyCodec.decode(); 69 | } 70 | 71 | @Override 72 | public String toString() { 73 | return record.getSAMString().trim(); // remove trailing newline 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/util/VCFHeaderReader.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 
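//
// A minimal usage sketch (hypothetical path; WrapSeekable supplies the
// SeekableStream that readHeaderFrom requires):
//
//   SeekableStream in = WrapSeekable.openPath(conf, new Path("test.vcf.gz"));
//   VCFHeader header = VCFHeaderReader.readHeaderFrom(in);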
20 | 21 | // File created: 2013-07-04 10:49:20 22 | 23 | package org.seqdoop.hadoop_bam.util; 24 | 25 | import java.io.BufferedInputStream; 26 | import java.io.InputStream; 27 | import java.io.IOException; 28 | 29 | import htsjdk.samtools.seekablestream.SeekableStream; 30 | import htsjdk.samtools.util.BlockCompressedInputStream; 31 | 32 | import htsjdk.tribble.FeatureCodecHeader; 33 | import htsjdk.tribble.TribbleException; 34 | import htsjdk.tribble.readers.AsciiLineReader; 35 | import htsjdk.tribble.readers.AsciiLineReaderIterator; 36 | import htsjdk.tribble.readers.PositionalBufferedStream; 37 | import htsjdk.variant.bcf2.BCF2Codec; 38 | import htsjdk.variant.vcf.VCFCodec; 39 | import htsjdk.variant.vcf.VCFHeader; 40 | import java.util.zip.GZIPInputStream; 41 | import org.seqdoop.hadoop_bam.VCFFormat; 42 | import org.slf4j.Logger; 43 | import org.slf4j.LoggerFactory; 44 | 45 | /** Can read a VCF header without being told beforehand whether the input is 46 | * VCF or BCF. 47 | */ 48 | public final class VCFHeaderReader { 49 | private static final Logger logger = LoggerFactory.getLogger(VCFHeaderReader.class); 50 | 51 | public static VCFHeader readHeaderFrom(final SeekableStream in) 52 | throws IOException 53 | { 54 | Object headerCodec = null; 55 | Object header = null; 56 | final long initialPos = in.position(); 57 | try { 58 | BufferedInputStream bis = new BufferedInputStream(in); 59 | InputStream is = VCFFormat.isGzip(bis) ? new GZIPInputStream(bis) : bis; 60 | headerCodec = new VCFCodec().readHeader(new AsciiLineReaderIterator(new AsciiLineReader(is))); 61 | } catch (TribbleException e) { 62 | logger.warn("Exception while trying to read VCF header from file:", e); 63 | 64 | in.seek(initialPos); 65 | 66 | InputStream bin = new BufferedInputStream(in); 67 | if (BlockCompressedInputStream.isValidFile(bin)) 68 | bin = new BlockCompressedInputStream(bin); 69 | 70 | headerCodec = 71 | new BCF2Codec().readHeader( 72 | new PositionalBufferedStream(bin)); 73 | } 74 | if (!(headerCodec instanceof FeatureCodecHeader)) 75 | throw new IOException("No VCF header found"); 76 | header = ((FeatureCodecHeader)headerCodec).getHeaderValue(); 77 | return (VCFHeader)header; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/util/WrapSeekable.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2010-08-25 11:24:30 22 | 23 | package org.seqdoop.hadoop_bam.util; 24 | 25 | import java.io.InputStream; 26 | import java.io.IOException; 27 | 28 | import org.apache.hadoop.conf.Configuration; 29 | import org.apache.hadoop.fs.FileSystem; 30 | import org.apache.hadoop.fs.FSDataInputStream; 31 | import org.apache.hadoop.fs.Path; 32 | import org.apache.hadoop.fs.Seekable; 33 | 34 | import htsjdk.samtools.seekablestream.SeekableStream; 35 | 36 | /** Wraps Hadoop's "seekable stream" abstraction so that we can give such a one 37 | * to BlockCompressedInputStream and retain seekability. 38 | * 39 | *

<p>This is necessary because Hadoop and the SAM tools each have their own 40 | * "seekable stream" abstraction.</p>
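*
* <p>A minimal sketch (hypothetical HDFS path; htsjdk's
* BlockCompressedInputStream accepts the returned SeekableStream directly):
* <pre>{@code
* SeekableStream in = WrapSeekable.openPath(conf, new Path("hdfs:///data/test.vcf.bgz"));
* BlockCompressedInputStream bgzf = new BlockCompressedInputStream(in);
* }</pre>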

41 | */ 42 | public class WrapSeekable 43 | extends SeekableStream 44 | { 45 | private final S stm; 46 | private final long len; 47 | private final Path path; 48 | 49 | public WrapSeekable(final S s, long length, Path p) { 50 | stm = s; 51 | len = length; 52 | path = p; 53 | } 54 | 55 | /** A helper for the common use case. */ 56 | public static WrapSeekable openPath( 57 | FileSystem fs, Path p) throws IOException 58 | { 59 | return new WrapSeekable( 60 | fs.open(p), fs.getFileStatus(p).getLen(), p); 61 | } 62 | public static WrapSeekable openPath( 63 | Configuration conf, Path path) throws IOException 64 | { 65 | return openPath(path.getFileSystem(conf), path); 66 | } 67 | 68 | @Override public String getSource() { return path.toString(); } 69 | @Override public long length () { return len; } 70 | 71 | @Override public long position() throws IOException { return stm.getPos(); } 72 | @Override public void close() throws IOException { stm.close(); } 73 | @Override public boolean eof () throws IOException { 74 | return stm.getPos() == length(); 75 | } 76 | @Override public void seek(long pos) throws IOException { 77 | stm.seek(pos); 78 | } 79 | @Override public int read() throws IOException { 80 | return stm.read(); 81 | } 82 | @Override public int read(byte[] buf, int offset, int len) 83 | throws IOException 84 | { 85 | return stm.read(buf, offset, len); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/TestConfHelper.java: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2011-2012 CRS4. 2 | // 3 | // This file is part of Hadoop-BAM. 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to 7 | // deal in the Software without restriction, including without limitation the 8 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 9 | // sell copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in 13 | // all copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 | // IN THE SOFTWARE. 
22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import org.seqdoop.hadoop_bam.util.ConfHelper; 26 | 27 | import org.junit.*; 28 | import static org.junit.Assert.*; 29 | 30 | import org.apache.hadoop.conf.Configuration; 31 | 32 | public class TestConfHelper 33 | { 34 | @Test 35 | public void testParseBooleanValidValues() 36 | { 37 | assertTrue(ConfHelper.parseBoolean("true", false)); 38 | assertTrue(ConfHelper.parseBoolean("tRuE", false)); 39 | assertTrue(ConfHelper.parseBoolean("TRUE", false)); 40 | assertTrue(ConfHelper.parseBoolean("t", false)); 41 | assertTrue(ConfHelper.parseBoolean("yes", false)); 42 | assertTrue(ConfHelper.parseBoolean("y", false)); 43 | assertTrue(ConfHelper.parseBoolean("Y", false)); 44 | assertTrue(ConfHelper.parseBoolean("1", false)); 45 | 46 | assertFalse(ConfHelper.parseBoolean("false", true)); 47 | assertFalse(ConfHelper.parseBoolean("faLse", true)); 48 | assertFalse(ConfHelper.parseBoolean("FALSE", true)); 49 | assertFalse(ConfHelper.parseBoolean("f", true)); 50 | assertFalse(ConfHelper.parseBoolean("no", true)); 51 | assertFalse(ConfHelper.parseBoolean("n", true)); 52 | assertFalse(ConfHelper.parseBoolean("N", true)); 53 | assertFalse(ConfHelper.parseBoolean("0", true)); 54 | } 55 | 56 | @Test 57 | public void testParseBooleanNull() 58 | { 59 | assertTrue(ConfHelper.parseBoolean(null, true)); 60 | assertFalse(ConfHelper.parseBoolean(null, false)); 61 | } 62 | 63 | @Test(expected=IllegalArgumentException.class) 64 | public void testParseBooleanInvalidValue() 65 | { 66 | ConfHelper.parseBoolean("dodo", true); 67 | } 68 | 69 | @Test 70 | public void testParseBooleanFromConfValue() 71 | { 72 | final String propName = "my.property"; 73 | Configuration conf = new Configuration(); 74 | conf.set(propName, "t"); 75 | assertTrue(ConfHelper.parseBoolean(conf, propName, false)); 76 | } 77 | 78 | @Test 79 | public void testParseBooleanFromConfNull() 80 | { 81 | Configuration conf = new Configuration(); 82 | assertTrue(ConfHelper.parseBoolean(conf, "my.property", true)); 83 | assertFalse(ConfHelper.parseBoolean(conf, "my.property", false)); 84 | } 85 | 86 | 87 | public static void main(String args[]) { 88 | org.junit.runner.JUnitCore.main(TestConfHelper.class.getName()); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/IntervalUtilTest.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import htsjdk.samtools.util.Interval; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.junit.Assert; 6 | import org.junit.Test; 7 | import org.seqdoop.hadoop_bam.util.IntervalUtil; 8 | 9 | import java.util.List; 10 | import java.util.stream.Collectors; 11 | import java.util.stream.Stream; 12 | 13 | /** 14 | * Unit tests for {@link IntervalUtil}. 15 | */ 16 | public class IntervalUtilTest { 17 | 18 | @Test 19 | public void testInvalidIntervals() { 20 | final String[] invalidIntervals = { 21 | "chr1", // full sequence interval are not allowed. 22 | "chr1:12", // single position omitting stop is not allowed. 23 | "chr1,chr2:121-123", // , are not allowed anywhere 24 | "chr20:1,100-3,400", // , " " 25 | "MT:35+", // , until end of contig + is not allowed. 26 | "MT:13-31-1112", // too many positions. 27 | "MT:-2112", // forgot the start position! 28 | " MT : 113 - 1245" // blanks are not allowed either. 
29 | }; 30 | for (final String interval : invalidIntervals) { 31 | final Configuration conf = new Configuration(); 32 | conf.set("prop-name", interval); 33 | try { 34 | IntervalUtil.getIntervals(conf, "prop-name"); 35 | Assert.fail("expected an exception when dealing with '" + interval + "'"); 36 | } catch (final FormatException ex) { 37 | // fine. 38 | } 39 | } 40 | } 41 | 42 | @Test 43 | public void testValidIntervals() { 44 | final Object[][] validIntervals = { 45 | {"chr1:1-343", "chr1", 1, 343}, // standard 'chr' starting contig interval. 46 | {"chr20_Un:31-145", "chr20_Un", 31, 145}, // standard chromosome name containing underscore. 47 | {"X:31-145", "X", 31, 145}, // standard 'X' chromosome interval. 48 | {"10:45000012-678901123", "10", 45000012, 678901123}, // standard number starting chromosome name interval. 49 | {"HLA-DQA1*01:01:02:134-14151", "HLA-DQA1*01:01:02", 134, 14151}}; // example of a Hg38 assembly 50 | // HLA contigs including - and : in their names. 51 | 52 | final Configuration conf = new Configuration(); 53 | 54 | Assert.assertNull(IntervalUtil.getIntervals(conf, "prop-name")); 55 | 56 | conf.set("prop-name", ""); 57 | 58 | Assert.assertNotNull(IntervalUtil.getIntervals(conf, "prop-name")); 59 | Assert.assertTrue(IntervalUtil.getIntervals(conf, "prop-name").isEmpty()); 60 | 61 | conf.set("prop-name", Stream.of(validIntervals) 62 | .map(o -> (String) o[0]).collect(Collectors.joining(","))); 63 | 64 | final List allIntervals = IntervalUtil.getIntervals(conf, "prop-name"); 65 | Assert.assertNotNull(allIntervals); 66 | Assert.assertEquals(allIntervals.size(), validIntervals.length); 67 | for (int i = 0; i < validIntervals.length; i++) { 68 | Assert.assertNotNull(allIntervals.get(i)); 69 | Assert.assertEquals(allIntervals.get(i).getContig(), validIntervals[i][1]); 70 | Assert.assertEquals(allIntervals.get(i).getStart(), validIntervals[i][2]); 71 | Assert.assertEquals(allIntervals.get(i).getEnd(), validIntervals[i][3]); 72 | } 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/util/BGZFEnhancedGzipCodec.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam.util; 2 | 3 | import htsjdk.samtools.util.BlockCompressedInputStream; 4 | import java.io.BufferedInputStream; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import org.apache.hadoop.fs.Seekable; 8 | import org.apache.hadoop.io.compress.CompressionCodec; 9 | import org.apache.hadoop.io.compress.CompressionInputStream; 10 | import org.apache.hadoop.io.compress.Decompressor; 11 | import org.apache.hadoop.io.compress.GzipCodec; 12 | import org.apache.hadoop.io.compress.SplitCompressionInputStream; 13 | import org.apache.hadoop.io.compress.SplittableCompressionCodec; 14 | 15 | /** 16 | * A Hadoop {@link CompressionCodec} for the 17 | * BGZF compression format, 18 | * which reads and writes files with a .gz suffix. 19 | *

<p> 20 | * BGZF is a splittable extension of gzip, which means that all BGZF files are standard 21 | * gzip files; however, the reverse is not necessarily the case. BGZF files often have the 22 | * standard .gz suffix (such as those produced by the 23 | * bcftools command), 24 | * which causes difficulty, since it is not immediately apparent from the filename alone 25 | * whether a file is a BGZF file or merely a regular gzip file. BGZFEnhancedGzipCodec 26 | * will read the start of the file and look for BGZF headers to detect the type of 27 | * compression. 28 | * </p> 29 | * <p> 30 | * BGZFEnhancedGzipCodec will read BGZF or gzip files, but currently always writes regular gzip files. 31 | * </p> 32 | * <p> 33 | * To use BGZFEnhancedGzipCodec, set it on the configuration object as follows. This will 34 | * override the built-in GzipCodec that is mapped to the .gz suffix. 35 | * </p>

36 | * {@code 37 | * conf.set("io.compression.codecs", BGZFEnhancedGzipCodec.class.getCanonicalName()) 38 | * } 39 | * @see BGZFCodec 40 | */ 41 | public class BGZFEnhancedGzipCodec extends GzipCodec implements SplittableCompressionCodec { 42 | 43 | @Override 44 | public SplitCompressionInputStream createInputStream(InputStream seekableIn, Decompressor decompressor, long start, long end, READ_MODE readMode) throws IOException { 45 | if (!(seekableIn instanceof Seekable)) { 46 | throw new IOException("seekableIn must be an instance of " + 47 | Seekable.class.getName()); 48 | } 49 | if (!BlockCompressedInputStream.isValidFile(new BufferedInputStream(seekableIn))) { 50 | // data is regular gzip, not BGZF 51 | ((Seekable)seekableIn).seek(0); 52 | final CompressionInputStream compressionInputStream = createInputStream(seekableIn, 53 | decompressor); 54 | return new SplitCompressionInputStream(compressionInputStream, start, end) { 55 | @Override 56 | public int read(byte[] b, int off, int len) throws IOException { 57 | return compressionInputStream.read(b, off, len); 58 | } 59 | @Override 60 | public void resetState() throws IOException { 61 | compressionInputStream.resetState(); 62 | } 63 | @Override 64 | public int read() throws IOException { 65 | return compressionInputStream.read(); 66 | } 67 | }; 68 | } 69 | BGZFSplitGuesser splitGuesser = new BGZFSplitGuesser(seekableIn); 70 | long adjustedStart = splitGuesser.guessNextBGZFBlockStart(start, end); 71 | ((Seekable)seekableIn).seek(adjustedStart); 72 | return new BGZFSplitCompressionInputStream(seekableIn, adjustedStart, end); 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/BaseSplitGuesser.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import htsjdk.samtools.seekablestream.SeekableStream; 4 | import java.io.IOException; 5 | import java.nio.ByteBuffer; 6 | import java.nio.ByteOrder; 7 | import org.apache.hadoop.io.IOUtils; 8 | 9 | class BaseSplitGuesser { 10 | 11 | protected final static int BGZF_MAGIC = 0x04088b1f; 12 | protected final static int BGZF_MAGIC_SUB = 0x00024342; 13 | protected final static int BGZF_SUB_SIZE = 4 + 2; 14 | 15 | protected SeekableStream in; 16 | protected final ByteBuffer buf; 17 | 18 | public BaseSplitGuesser() { 19 | buf = ByteBuffer.allocate(8); 20 | buf.order(ByteOrder.LITTLE_ENDIAN); 21 | } 22 | 23 | protected static class PosSize { 24 | public int pos; 25 | public int size; 26 | public PosSize(int p, int s) { pos = p; size = s; } 27 | } 28 | 29 | // Gives the compressed size on the side. Returns null if it doesn't find 30 | // anything. 31 | protected PosSize guessNextBGZFPos(int p, int end) { 32 | try { for (;;) { 33 | for (;;) { 34 | in.seek(p); 35 | IOUtils.readFully(in, buf.array(), 0, 4); 36 | int n = buf.getInt(0); 37 | 38 | if (n == BGZF_MAGIC) 39 | break; 40 | 41 | // Skip ahead a bit more than 1 byte if you can. 42 | if (n >>> 8 == BGZF_MAGIC << 8 >>> 8) 43 | ++p; 44 | else if (n >>> 16 == BGZF_MAGIC << 16 >>> 16) 45 | p += 2; 46 | else 47 | p += 3; 48 | 49 | if (p >= end) 50 | return null; 51 | } 52 | // Found what looks like a gzip block header: now get XLEN and 53 | // search for the BGZF subfield. 
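// (Gzip member layout recap: ID1 ID2 CM FLG occupy the four magic bytes at
// p0, MTIME(4) + XFL(1) + OS(1) bring us to p0 + 10, and the two-byte XLEN
// field there gives the total length of the extra subfields searched below.)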
54 | final int p0 = p; 55 | p += 10; 56 | in.seek(p); 57 | IOUtils.readFully(in, buf.array(), 0, 2); 58 | p += 2; 59 | final int xlen = getUShort(0); 60 | final int subEnd = p + xlen; 61 | 62 | while (p < subEnd) { 63 | IOUtils.readFully(in, buf.array(), 0, 4); 64 | 65 | if (buf.getInt(0) != BGZF_MAGIC_SUB) { 66 | p += 4 + getUShort(2); 67 | in.seek(p); 68 | continue; 69 | } 70 | 71 | // Found it: this is close enough to a BGZF block, make it 72 | // our guess. 73 | 74 | // But find out the size before returning. First, grab bsize: 75 | // we'll need it later. 76 | IOUtils.readFully(in, buf.array(), 0, 2); 77 | int bsize = getUShort(0); 78 | 79 | // Then skip the rest of the subfields. 80 | p += BGZF_SUB_SIZE; 81 | while (p < subEnd) { 82 | in.seek(p); 83 | IOUtils.readFully(in, buf.array(), 0, 4); 84 | p += 4 + getUShort(2); 85 | } 86 | if (p != subEnd) { 87 | // Cancel our guess because the xlen field didn't match the 88 | // data. 89 | break; 90 | } 91 | 92 | // Now skip past the compressed data and the CRC-32. 93 | p += bsize - xlen - 19 + 4; 94 | in.seek(p); 95 | IOUtils.readFully(in, buf.array(), 0, 4); 96 | return new PosSize(p0, buf.getInt(0)); 97 | } 98 | // No luck: look for the next gzip block header. Start right after 99 | // where we last saw the identifiers, although we could probably 100 | // safely skip further ahead. (If we find the correct one right 101 | // now, the previous block contained 0x1f8b0804 bytes of data: that 102 | // seems... unlikely.) 103 | p = p0 + 4; 104 | 105 | }} catch (IOException e) { 106 | return null; 107 | } 108 | } 109 | 110 | protected int getUShort(final int idx) { 111 | return (int)buf.getShort(idx) & 0xffff; 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/SAMRecordWriter.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 
20 | 21 | // File created: 2012-02-23 12:42:49 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import java.io.IOException; 26 | import java.io.OutputStream; 27 | 28 | import htsjdk.samtools.SAMFileHeader; 29 | import htsjdk.samtools.SAMRecord; 30 | import htsjdk.samtools.SAMTextWriter; 31 | 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.mapreduce.RecordWriter; 34 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 35 | 36 | import org.seqdoop.hadoop_bam.util.SAMHeaderReader; 37 | 38 | /** A base {@link RecordWriter} for SAM records. 39 | * 40 | *

<p>Handles the output stream, writing the header if requested, and provides 41 | * the {@link #writeAlignment} function for subclasses.</p>
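*
* <p>A minimal subclass sketch (hypothetical; the KeyIgnoring* writers in
* this package follow the same shape):
* <pre>{@code
* class PlainSAMWriter extends SAMRecordWriter<NullWritable> {
*   PlainSAMWriter(OutputStream out, SAMFileHeader h) throws IOException {
*     super(out, h, true);
*   }
*   @Override public void write(NullWritable ignored, SAMRecordWritable rec) {
*     writeAlignment(rec.get());
*   }
* }
* }</pre>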

42 | */ 43 | public abstract class SAMRecordWriter 44 | extends RecordWriter 45 | { 46 | private SAMTextWriter writer; 47 | private SAMFileHeader header; 48 | 49 | /** A SAMFileHeader is read from the input Path. */ 50 | public SAMRecordWriter( 51 | Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) 52 | throws IOException 53 | { 54 | init( 55 | output, 56 | SAMHeaderReader.readSAMHeaderFrom(input, ctx.getConfiguration()), 57 | writeHeader, ctx); 58 | } 59 | public SAMRecordWriter( 60 | Path output, SAMFileHeader header, boolean writeHeader, 61 | TaskAttemptContext ctx) 62 | throws IOException 63 | { 64 | init( 65 | output.getFileSystem(ctx.getConfiguration()).create(output), 66 | header, writeHeader); 67 | } 68 | public SAMRecordWriter( 69 | OutputStream output, SAMFileHeader header, boolean writeHeader) 70 | throws IOException 71 | { 72 | init(output, header, writeHeader); 73 | } 74 | 75 | private void init( 76 | Path output, SAMFileHeader header, boolean writeHeader, 77 | TaskAttemptContext ctx) 78 | throws IOException 79 | { 80 | init( 81 | output.getFileSystem(ctx.getConfiguration()).create(output), 82 | header, writeHeader); 83 | } 84 | private void init( 85 | OutputStream output, SAMFileHeader header, boolean writeHeader) 86 | throws IOException 87 | { 88 | this.header = header; 89 | writer = new SAMTextWriter(output); 90 | 91 | writer.setSortOrder(header.getSortOrder(), false); 92 | if (writeHeader) 93 | writer.setHeader(header); 94 | } 95 | 96 | @Override public void close(TaskAttemptContext ctx) { 97 | writer.close(); 98 | } 99 | 100 | protected void writeAlignment(final SAMRecord rec) { 101 | rec.setHeader(header); 102 | writer.writeAlignment(rec); 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBAMOutputFormat.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 
20 | 21 | // File created: 2010-08-11 12:19:23 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import java.io.IOException; 26 | import java.io.InputStream; 27 | 28 | import htsjdk.samtools.SAMFileHeader; 29 | 30 | import org.apache.hadoop.conf.Configuration; 31 | import org.apache.hadoop.fs.Path; 32 | import org.apache.hadoop.mapreduce.RecordWriter; 33 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 34 | 35 | import org.seqdoop.hadoop_bam.util.SAMHeaderReader; 36 | 37 | /** Writes only the BAM records, not the key. 38 | * 39 | *

<p>A {@link SAMFileHeader} must be provided via {@link #setSAMHeader} or 40 | * {@link #readSAMHeaderFrom} before {@link #getRecordWriter} is called.</p>

41 | * 42 | *

<p>By default, writes the SAM header to the output file(s). This 43 | * can be disabled: in distributed usage the output typically ends up (and, 44 | * for decent performance, should end up) split into multiple 45 | * parts, which are easier to concatenate if the header is not repeated in each 46 | * file.</p>
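*
* <p>A minimal setup sketch (hypothetical driver code; the header is read
* from an existing BAM file before any RecordWriter is requested):
* <pre>{@code
* KeyIgnoringBAMOutputFormat<NullWritable> out = new KeyIgnoringBAMOutputFormat<>();
* out.readSAMHeaderFrom(new Path("hdfs:///data/input.bam"), conf);
* out.setWriteHeader(false); // parts are concatenated afterwards
* }</pre>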

47 | */ 48 | public class KeyIgnoringBAMOutputFormat extends BAMOutputFormat { 49 | protected SAMFileHeader header; 50 | private boolean writeHeader = true; 51 | 52 | public KeyIgnoringBAMOutputFormat() {} 53 | 54 | /** Whether the header will be written or not. */ 55 | public boolean getWriteHeader() { return writeHeader; } 56 | 57 | /** Set whether the header will be written or not. */ 58 | public void setWriteHeader(boolean b) { writeHeader = b; } 59 | 60 | public SAMFileHeader getSAMHeader() { return header; } 61 | public void setSAMHeader(SAMFileHeader header) { this.header = header; } 62 | 63 | public void readSAMHeaderFrom(Path path, Configuration conf) 64 | throws IOException 65 | { 66 | this.header = SAMHeaderReader.readSAMHeaderFrom(path, conf); 67 | } 68 | public void readSAMHeaderFrom(InputStream in, Configuration conf) { 69 | this.header = SAMHeaderReader.readSAMHeaderFrom(in, conf); 70 | } 71 | 72 | /** setSAMHeader or readSAMHeaderFrom must have 73 | * been called first. 74 | */ 75 | @Override public RecordWriter getRecordWriter( 76 | TaskAttemptContext ctx) 77 | throws IOException 78 | { 79 | return getRecordWriter(ctx, getDefaultWorkFile(ctx, "")); 80 | } 81 | 82 | // Allows wrappers to provide their own work file. 83 | public RecordWriter getRecordWriter( 84 | TaskAttemptContext ctx, Path out) 85 | throws IOException 86 | { 87 | if (this.header == null) 88 | throw new IOException( 89 | "Can't create a RecordWriter without the SAM header"); 90 | 91 | return new KeyIgnoringBAMRecordWriter(out, header, writeHeader, ctx); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/CRAMInputFormat.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import htsjdk.samtools.cram.build.CramContainerIterator; 4 | import htsjdk.samtools.seekablestream.SeekableStream; 5 | import java.io.IOException; 6 | import java.util.ArrayList; 7 | import java.util.HashMap; 8 | import java.util.List; 9 | import java.util.Map; 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.LongWritable; 13 | import org.apache.hadoop.mapreduce.InputSplit; 14 | import org.apache.hadoop.mapreduce.JobContext; 15 | import org.apache.hadoop.mapreduce.RecordReader; 16 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 17 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 19 | import org.seqdoop.hadoop_bam.util.WrapSeekable; 20 | 21 | public class CRAMInputFormat extends FileInputFormat { 22 | 23 | public static final String REFERENCE_SOURCE_PATH_PROPERTY = 24 | "hadoopbam.cram.reference-source-path"; 25 | 26 | @Override 27 | public List getSplits(JobContext job) throws IOException { 28 | return getSplits(super.getSplits(job), job.getConfiguration()); 29 | } 30 | 31 | public List getSplits(List splits, Configuration conf) 32 | throws IOException { 33 | // update splits to align with CRAM container boundaries 34 | List newSplits = new ArrayList(); 35 | Map> fileToOffsets = new HashMap>(); 36 | for (InputSplit split : splits) { 37 | FileSplit fileSplit = (FileSplit) split; 38 | Path path = fileSplit.getPath(); 39 | List containerOffsets = fileToOffsets.get(path); 40 | if (containerOffsets == null) { 41 | containerOffsets = getContainerOffsets(conf, path); 42 | fileToOffsets.put(path, containerOffsets); 43 | } 44 | long 
newStart = nextContainerOffset(containerOffsets, fileSplit.getStart()); 45 | long newEnd = nextContainerOffset(containerOffsets, fileSplit.getStart() + 46 | fileSplit.getLength()); 47 | long newLength = newEnd - newStart; 48 | if (newLength == 0) { // split is wholly within a container 49 | continue; 50 | } 51 | FileSplit newSplit = new FileSplit(fileSplit.getPath(), newStart, newLength, 52 | fileSplit.getLocations()); 53 | newSplits.add(newSplit); 54 | } 55 | return newSplits; 56 | } 57 | 58 | private static List getContainerOffsets(Configuration conf, Path cramFile) 59 | throws IOException { 60 | SeekableStream seekableStream = WrapSeekable.openPath(conf, cramFile); 61 | CramContainerIterator cci = new CramContainerIterator(seekableStream); 62 | List containerOffsets = new ArrayList(); 63 | containerOffsets.add(seekableStream.position()); 64 | while (cci.hasNext()) { 65 | cci.next(); 66 | containerOffsets.add(seekableStream.position()); 67 | } 68 | containerOffsets.add(seekableStream.length()); 69 | return containerOffsets; 70 | } 71 | 72 | private static long nextContainerOffset(List containerOffsets, long position) { 73 | for (long offset : containerOffsets) { 74 | if (offset >= position) { 75 | return offset; 76 | } 77 | } 78 | throw new IllegalStateException("Could not find position " + position + " in " + 79 | "container offsets: " + containerOffsets); 80 | } 81 | 82 | @Override 83 | public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { 84 | RecordReader rr = new CRAMRecordReader(); 85 | rr.initialize(split, context); 86 | return rr; 87 | } 88 | 89 | @Override 90 | public boolean isSplitable(JobContext job, Path path) { 91 | return true; 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/BAMTestUtil.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import htsjdk.samtools.BAMIndex; 4 | import htsjdk.samtools.BAMIndexer; 5 | import htsjdk.samtools.SAMFileHeader; 6 | import htsjdk.samtools.SAMFileWriter; 7 | import htsjdk.samtools.SAMFileWriterFactory; 8 | import htsjdk.samtools.SAMRecord; 9 | import htsjdk.samtools.SAMRecordSetBuilder; 10 | import htsjdk.samtools.SamReader; 11 | import htsjdk.samtools.SamReaderFactory; 12 | import java.io.File; 13 | import java.io.IOException; 14 | 15 | class BAMTestUtil { 16 | public static File writeBamFile(int numPairs, SAMFileHeader.SortOrder sortOrder) 17 | throws IOException { 18 | // file will be both queryname and coordinate sorted, so use one or the other 19 | SAMRecordSetBuilder samRecordSetBuilder = new SAMRecordSetBuilder(true, sortOrder); 20 | for (int i = 0; i < numPairs; i++) { 21 | int chr = 20; 22 | int start1 = (i + 1) * 1000; 23 | int start2 = start1 + 100; 24 | if (i == 5) { // add two unmapped fragments instead of a mapped pair 25 | samRecordSetBuilder.addFrag(String.format("test-read-%03d-1", i), chr, start1, 26 | false, true, null, 27 | null, 28 | -1, false); 29 | samRecordSetBuilder.addFrag(String.format("test-read-%03d-2", i), chr, start2, 30 | false, true, null, 31 | null, 32 | -1, false); 33 | } else { 34 | samRecordSetBuilder.addPair(String.format("test-read-%03d", i), chr, start1, 35 | start2); 36 | } 37 | } 38 | if (numPairs > 0) { // add two unplaced unmapped fragments if non-empty 39 | samRecordSetBuilder.addUnmappedFragment(String.format 40 | ("test-read-%03d-unplaced-unmapped", 
numPairs++)); 41 | samRecordSetBuilder.addUnmappedFragment(String.format 42 | ("test-read-%03d-unplaced-unmapped", numPairs++)); 43 | } 44 | 45 | final File bamFile = File.createTempFile("test", ".bam"); 46 | bamFile.deleteOnExit(); 47 | SAMFileHeader samHeader = samRecordSetBuilder.getHeader(); 48 | final SAMFileWriter bamWriter = new SAMFileWriterFactory() 49 | .makeSAMOrBAMWriter(samHeader, true, bamFile); 50 | for (final SAMRecord rec : samRecordSetBuilder.getRecords()) { 51 | bamWriter.addAlignment(rec); 52 | } 53 | bamWriter.close(); 54 | 55 | // create BAM index 56 | if (sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) { 57 | SamReader samReader = SamReaderFactory.makeDefault() 58 | .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS) 59 | .open(bamFile); 60 | BAMIndexer.createIndex(samReader, new File(bamFile.getAbsolutePath() 61 | .replaceFirst("\\.bam$", BAMIndex.BAMIndexSuffix))); 62 | } 63 | 64 | return bamFile; 65 | } 66 | 67 | public static File writeBamFileWithLargeHeader() throws IOException { 68 | SAMRecordSetBuilder samRecordSetBuilder = 69 | new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.queryname); 70 | for (int i = 0; i < 1000; i++) { 71 | int chr = 20; 72 | int start1 = (i + 1) * 1000; 73 | int start2 = start1 + 100; 74 | samRecordSetBuilder.addPair(String.format("test-read-%03d", i), chr, start1, 75 | start2); 76 | } 77 | 78 | final File bamFile = File.createTempFile("test", ".bam"); 79 | bamFile.deleteOnExit(); 80 | SAMFileHeader samHeader = samRecordSetBuilder.getHeader(); 81 | StringBuffer sb = new StringBuffer(); 82 | for (int i = 0; i < 1000000; i++) { 83 | sb.append("0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789"); 84 | } 85 | samHeader.addComment(sb.toString()); 86 | final SAMFileWriter bamWriter = new SAMFileWriterFactory() 87 | .makeSAMOrBAMWriter(samHeader, true, bamFile); 88 | for (final SAMRecord rec : samRecordSetBuilder.getRecords()) { 89 | bamWriter.addAlignment(rec); 90 | } 91 | bamWriter.close(); 92 | 93 | return bamFile; 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/util/BGZFSplitCompressionInputStream.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam.util; 2 | 3 | import htsjdk.samtools.util.BlockCompressedInputStream; 4 | import java.io.BufferedInputStream; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import org.apache.hadoop.io.compress.SplitCompressionInputStream; 8 | 9 | /** 10 | * An implementation of {@code SplitCompressionInputStream} for BGZF, based on 11 | * {@code BZip2CompressionInputStream} and {@code CBZip2InputStream} from Hadoop. 12 | * (BZip2 is the only splittable compression codec in Hadoop.) 
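* <p>As with the BZip2 stream, reads stop at each BGZF block boundary and the
* advertised compressed-stream position is only updated there, so callers can
* hand off splits cleanly at block starts.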
13 | */ 14 | class BGZFSplitCompressionInputStream extends SplitCompressionInputStream { 15 | private static final int END_OF_BLOCK = -2; 16 | private final BlockCompressedInputStream input; 17 | private BufferedInputStream bufferedIn; 18 | private long startingPos = 0L; 19 | private long processedPosition; 20 | 21 | private enum POS_ADVERTISEMENT_STATE_MACHINE { 22 | HOLD, ADVERTISE 23 | }; 24 | 25 | POS_ADVERTISEMENT_STATE_MACHINE posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD; 26 | long compressedStreamPosition = 0; 27 | 28 | public BGZFSplitCompressionInputStream(InputStream in, long start, long end) 29 | throws IOException { 30 | super(in, start, end); 31 | bufferedIn = new BufferedInputStream(super.in); 32 | this.startingPos = super.getPos(); 33 | input = new BlockCompressedInputStream(bufferedIn); 34 | this.updatePos(false); 35 | } 36 | 37 | @Override 38 | public int read() throws IOException { 39 | byte b[] = new byte[1]; 40 | int result = this.read(b, 0, 1); 41 | return (result < 0) ? result : (b[0] & 0xff); 42 | } 43 | 44 | @Override 45 | public int read(byte[] b, int off, int len) throws IOException { 46 | // See BZip2CompressionInputStream#read for implementation notes. 47 | int result; 48 | result = readWithinBlock(b, off, len); 49 | if (result == END_OF_BLOCK) { 50 | this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE; 51 | } 52 | if (this.posSM == POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE) { 53 | result = readWithinBlock(b, off, off + 1); 54 | // This is the precise time to update compressed stream position 55 | // to the client of this code. 56 | this.updatePos(true); 57 | this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD; 58 | } 59 | return result; 60 | } 61 | 62 | /** 63 | * Read up to len bytes from the stream, but no further than the end of the 64 | * compressed block. If at the end of the block then no bytes will be read and a return 65 | * value of -2 will be returned; on the next call to read, bytes from the next block 66 | * will be returned. This is the same contract as CBZip2InputStream in Hadoop. 67 | * @return int The return value greater than 0 are the bytes read. A value 68 | * of -1 means end of stream while -2 represents end of block. 69 | */ 70 | private int readWithinBlock(byte[] b, int off, int len) throws IOException { 71 | if (input.endOfBlock()) { 72 | final int available = input.available(); // this will read the next block, if there is one 73 | processedPosition = input.getPosition() >> 16; 74 | if (available == 0) { // end of stream 75 | return -1; 76 | } 77 | return END_OF_BLOCK; 78 | } 79 | 80 | // return up to end of block (at most) 81 | int available = input.available(); 82 | return input.read(b, off, Math.min(available, len)); 83 | } 84 | 85 | @Override 86 | public void resetState() throws IOException { 87 | // not implemented (only used in sequence files) 88 | } 89 | 90 | @Override 91 | public long getPos() throws IOException { 92 | return this.compressedStreamPosition; 93 | } 94 | 95 | // See comment in BZip2CompressionInputStream#updatePos 96 | private void updatePos(boolean shouldAddOn) { 97 | int addOn = shouldAddOn ? 
1 : 0; 98 | this.compressedStreamPosition = this.startingPos + processedPosition + addOn; 99 | } 100 | 101 | @Override 102 | public void close() throws IOException { 103 | input.close(); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/TestFastaInputFormat.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import java.util.List; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.InputSplit; 7 | import org.apache.hadoop.mapreduce.JobContext; 8 | import org.apache.hadoop.mapreduce.RecordReader; 9 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 10 | import org.apache.hadoop.mapreduce.TaskAttemptID; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.task.JobContextImpl; 13 | import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; 14 | import org.junit.Before; 15 | import org.junit.Test; 16 | 17 | import static org.junit.Assert.assertEquals; 18 | import static org.junit.Assert.assertFalse; 19 | import static org.junit.Assert.assertTrue; 20 | import static org.mockito.Mockito.mock; 21 | 22 | public class TestFastaInputFormat { 23 | private String input; 24 | private TaskAttemptContext taskAttemptContext; 25 | private JobContext jobContext; 26 | 27 | @Before 28 | public void setup() throws Exception { 29 | Configuration conf = new Configuration(); 30 | input = ClassLoader.getSystemClassLoader().getResource("mini-chr1-chr2.fasta").getFile(); 31 | conf.set("mapred.input.dir", "file://" + input); 32 | 33 | // Input fasta is 600 bytes, so this gets us 3 FileInputFormat splits. 
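// (FastaInputFormat then snaps split boundaries to FASTA sequence starts,
// which is why testReader below sees 2 splits rather than 3.)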
34 | conf.set(FileInputFormat.SPLIT_MAXSIZE, "200"); 35 | 36 | taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class)); 37 | jobContext = new JobContextImpl(conf, taskAttemptContext.getJobID()); 38 | } 39 | 40 | @Test 41 | public void testReader() throws Exception { 42 | FastaInputFormat inputFormat = new FastaInputFormat(); 43 | List splits = inputFormat.getSplits(jobContext); 44 | assertEquals(2, splits.size()); 45 | RecordReader reader = inputFormat 46 | .createRecordReader(splits.get(0), taskAttemptContext); 47 | reader.initialize(splits.get(0), taskAttemptContext); 48 | 49 | assertTrue(reader.nextKeyValue()); 50 | assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:11"), reader.getCurrentKey()); 51 | assertEquals(new Text("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTA"), reader.getCurrentValue().getSequence()); 52 | 53 | assertTrue(reader.nextKeyValue()); 54 | assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:182"), reader.getCurrentKey()); 55 | assertEquals(new Text("ACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAAC"), reader.getCurrentValue().getSequence()); 56 | 57 | assertTrue(reader.nextKeyValue()); 58 | assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1163"), reader.getCurrentKey()); 59 | assertEquals(new Text("CCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCC"), reader.getCurrentValue().getSequence()); 60 | 61 | assertTrue(reader.nextKeyValue()); 62 | assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1244"), reader.getCurrentKey()); 63 | assertEquals(new Text("TAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCC"), reader.getCurrentValue().getSequence()); 64 | 65 | assertTrue(reader.nextKeyValue()); 66 | assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1325"), reader.getCurrentKey()); 67 | assertEquals(new Text("CAACCCTAACCCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCC"), reader.getCurrentValue().getSequence()); 68 | 69 | assertFalse(reader.nextKeyValue()); 70 | 71 | reader = inputFormat.createRecordReader(splits.get(1), taskAttemptContext); 72 | reader.initialize(splits.get(1), taskAttemptContext); 73 | 74 | assertTrue(reader.nextKeyValue()); 75 | assertEquals(new Text("chr2 dna:chromosome chromosome:GRCh37:2:1:243199373:11"), reader.getCurrentKey()); 76 | assertEquals(new Text("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTC"), reader.getCurrentValue().getSequence()); 77 | 78 | assertFalse(reader.nextKeyValue()); 79 | 80 | reader.close(); 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/util/SAMHeaderReader.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this 
permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2013-07-26 13:54:32 22 | 23 | package org.seqdoop.hadoop_bam.util; 24 | 25 | import java.io.IOException; 26 | import java.io.InputStream; 27 | import java.net.URI; 28 | import java.nio.file.Paths; 29 | 30 | import htsjdk.samtools.cram.ref.ReferenceSource; 31 | import org.apache.hadoop.conf.Configuration; 32 | import org.apache.hadoop.fs.Path; 33 | 34 | import htsjdk.samtools.SAMFileHeader; 35 | import htsjdk.samtools.SamInputResource; 36 | import htsjdk.samtools.SamReaderFactory; 37 | import htsjdk.samtools.ValidationStringency; 38 | import org.seqdoop.hadoop_bam.CRAMInputFormat; 39 | 40 | public final class SAMHeaderReader { 41 | /** A String property corresponding to a ValidationStringency 42 | * value. If set, the given stringency is used when any part of the 43 | * Hadoop-BAM library reads SAM or BAM. 44 | */ 45 | public static final String VALIDATION_STRINGENCY_PROPERTY = 46 | "hadoopbam.samheaderreader.validation-stringency"; 47 | 48 | public static SAMFileHeader readSAMHeaderFrom(Path path, Configuration conf) 49 | throws IOException 50 | { 51 | InputStream i = path.getFileSystem(conf).open(path); 52 | final SAMFileHeader h = readSAMHeaderFrom(i, conf); 53 | i.close(); 54 | return h; 55 | } 56 | 57 | /** Does not close the stream. */ 58 | public static SAMFileHeader readSAMHeaderFrom( 59 | final InputStream in, final Configuration conf) 60 | { 61 | final ValidationStringency 62 | stringency = getValidationStringency(conf); 63 | SamReaderFactory readerFactory = SamReaderFactory.makeDefault() 64 | .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) 65 | .setUseAsyncIo(false); 66 | if (stringency != null) { 67 | readerFactory.validationStringency(stringency); 68 | } 69 | 70 | final ReferenceSource refSource = getReferenceSource(conf); 71 | if (null != refSource) { 72 | readerFactory.referenceSource(refSource); 73 | } 74 | return readerFactory.open(SamInputResource.of(in)).getFileHeader(); 75 | } 76 | 77 | public static ValidationStringency getValidationStringency( 78 | final Configuration conf) 79 | { 80 | final String p = conf.get(VALIDATION_STRINGENCY_PROPERTY); 81 | return p == null ? null : ValidationStringency.valueOf(p); 82 | } 83 | 84 | public static ReferenceSource getReferenceSource( 85 | final Configuration conf) 86 | { 87 | //TODO: There isn't anything particularly CRAM-specific about reference source or validation 88 | // stringency other than that a reference source is required for CRAM files. We should move 89 | // the reference source and validation stringency property names and utility methods out of 90 | // CRAMInputFormat and SAMHeaderReader and combine them together into a single class for extracting 91 | // configuration params, but it would break backward compatibility with existing code that 92 | // is dependent on the CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY. 
93 | final String refSourcePath = conf.get(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY); 94 | return refSourcePath == null ? null : new ReferenceSource(NIOFileUtil.asPath(refSourcePath)); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/util/NIOFileUtil.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam.util; 2 | 3 | import java.io.IOException; 4 | import java.io.OutputStream; 5 | import java.net.URI; 6 | import java.nio.file.FileSystemNotFoundException; 7 | import java.nio.file.FileSystems; 8 | import java.nio.file.FileVisitResult; 9 | import java.nio.file.Files; 10 | import java.nio.file.Path; 11 | import java.nio.file.PathMatcher; 12 | import java.nio.file.Paths; 13 | import java.nio.file.SimpleFileVisitor; 14 | import java.nio.file.attribute.BasicFileAttributes; 15 | import java.util.Collections; 16 | import java.util.HashMap; 17 | import java.util.List; 18 | import java.util.stream.Collectors; 19 | 20 | public class NIOFileUtil { 21 | private NIOFileUtil() { 22 | } 23 | 24 | static final String PARTS_GLOB = "glob:**/part-[mr]-[0-9][0-9][0-9][0-9][0-9]*"; 25 | 26 | /** 27 | * Convert the given path {@link URI} to a {@link Path} object. 28 | * @param uri the path to convert 29 | * @return a {@link Path} object 30 | */ 31 | public static Path asPath(URI uri) { 32 | try { 33 | return Paths.get(uri); 34 | } catch (FileSystemNotFoundException e) { 35 | ClassLoader cl = Thread.currentThread().getContextClassLoader(); 36 | if (cl == null) { 37 | throw e; 38 | } 39 | try { 40 | return FileSystems.newFileSystem(uri, new HashMap<>(), cl).provider().getPath(uri); 41 | } catch (IOException ex) { 42 | throw new RuntimeException("Cannot create filesystem for " + uri, ex); 43 | } 44 | } 45 | } 46 | 47 | /** 48 | * Convert the given path string to a {@link Path} object. 49 | * @param path the path to convert 50 | * @return a {@link Path} object 51 | */ 52 | public static Path asPath(String path) { 53 | URI uri = URI.create(path); 54 | return uri.getScheme() == null ? Paths.get(path) : asPath(uri); 55 | } 56 | 57 | /** 58 | * Delete the given directory and all of its contents if non-empty. 59 | * @param directory the directory to delete 60 | * @throws IOException 61 | */ 62 | static void deleteRecursive(Path directory) throws IOException { 63 | Files.walkFileTree(directory, new SimpleFileVisitor() { 64 | @Override 65 | public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { 66 | Files.delete(file); 67 | return FileVisitResult.CONTINUE; 68 | } 69 | @Override 70 | public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { 71 | Files.deleteIfExists(dir); 72 | return FileVisitResult.CONTINUE; 73 | } 74 | }); 75 | } 76 | 77 | /** 78 | * Returns all the files in a directory that match the given pattern, and that don't 79 | * have the given extension. 
80 | * @param directory the directory to look for files in, subdirectories are not 81 | * considered 82 | * @param syntaxAndPattern the syntax and pattern to use for matching (see 83 | * {@link java.nio.file.FileSystem#getPathMatcher} 84 | * @param excludesExt the extension to exclude, or null to exclude nothing 85 | * @return a list of files, sorted by name 86 | * @throws IOException 87 | */ 88 | static List getFilesMatching(Path directory, 89 | String syntaxAndPattern, String excludesExt) throws IOException { 90 | PathMatcher matcher = directory.getFileSystem().getPathMatcher(syntaxAndPattern); 91 | List parts = Files.walk(directory) 92 | .filter(matcher::matches) 93 | .filter(path -> excludesExt == null || !path.toString().endsWith(excludesExt)) 94 | .collect(Collectors.toList()); 95 | Collections.sort(parts); 96 | return parts; 97 | } 98 | 99 | /** 100 | * Merge the given part files in order into an output stream. 101 | * This deletes the parts. 102 | * @param parts the part files to merge 103 | * @param out the stream to write each file into, in order 104 | * @throws IOException 105 | */ 106 | static void mergeInto(List parts, OutputStream out) 107 | throws IOException { 108 | for (final Path part : parts) { 109 | Files.copy(part, out); 110 | Files.delete(part); 111 | } 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/TestSAMInputFormat.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import htsjdk.samtools.SAMRecord; 4 | import htsjdk.samtools.SamReader; 5 | import htsjdk.samtools.SamReaderFactory; 6 | import java.io.BufferedReader; 7 | import java.io.File; 8 | import java.io.FileReader; 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | import org.apache.hadoop.conf.Configuration; 12 | import org.apache.hadoop.fs.FileSystem; 13 | import org.apache.hadoop.fs.Path; 14 | import org.apache.hadoop.io.LongWritable; 15 | import org.apache.hadoop.mapreduce.InputSplit; 16 | import org.apache.hadoop.mapreduce.Job; 17 | import org.apache.hadoop.mapreduce.JobContext; 18 | import org.apache.hadoop.mapreduce.RecordReader; 19 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 20 | import org.apache.hadoop.mapreduce.TaskAttemptID; 21 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 22 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 23 | import org.apache.hadoop.mapreduce.task.JobContextImpl; 24 | import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; 25 | import org.junit.Before; 26 | import org.junit.Test; 27 | 28 | import static org.junit.Assert.assertEquals; 29 | import static org.junit.Assert.assertTrue; 30 | import static org.mockito.Mockito.mock; 31 | 32 | public class TestSAMInputFormat { 33 | private String input; 34 | private TaskAttemptContext taskAttemptContext; 35 | private JobContext jobContext; 36 | 37 | @Before 38 | public void setup() throws Exception { 39 | Configuration conf = new Configuration(); 40 | input = ClassLoader.getSystemClassLoader().getResource("test.sam").getFile(); 41 | conf.set("mapred.input.dir", "file://" + input); 42 | 43 | taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class)); 44 | jobContext = new JobContextImpl(conf, taskAttemptContext.getJobID()); 45 | } 46 | 47 | @Test 48 | public void testReader() throws Exception { 49 | int expectedCount = 0; 50 | SamReader samReader = 
SamReaderFactory.makeDefault().open(new File(input)); 51 | for (SAMRecord r : samReader) { 52 | expectedCount++; 53 | } 54 | samReader.close(); 55 | 56 | AnySAMInputFormat inputFormat = new AnySAMInputFormat(); 57 | List<InputSplit> splits = inputFormat.getSplits(jobContext); 58 | assertEquals(1, splits.size()); 59 | RecordReader<LongWritable, SAMRecordWritable> reader = inputFormat 60 | .createRecordReader(splits.get(0), taskAttemptContext); 61 | reader.initialize(splits.get(0), taskAttemptContext); 62 | 63 | int actualCount = 0; 64 | while (reader.nextKeyValue()) { 65 | actualCount++; 66 | } 67 | reader.close(); 68 | 69 | assertEquals(expectedCount, actualCount); 70 | } 71 | 72 | @Test 73 | public void testMapReduceJob() throws Exception { 74 | Configuration conf = new Configuration(); 75 | 76 | FileSystem fileSystem = FileSystem.get(conf); 77 | Path inputPath = new Path(input); 78 | Path outputPath = fileSystem.makeQualified(new Path("target/out")); 79 | fileSystem.delete(outputPath, true); 80 | 81 | Job job = Job.getInstance(conf); 82 | FileInputFormat.setInputPaths(job, inputPath); 83 | job.setInputFormatClass(SAMInputFormat.class); 84 | job.setOutputKeyClass(LongWritable.class); 85 | job.setOutputValueClass(SAMRecordWritable.class); 86 | job.setNumReduceTasks(0); 87 | FileOutputFormat.setOutputPath(job, outputPath); 88 | 89 | boolean success = job.waitForCompletion(true); 90 | assertTrue(success); 91 | 92 | List<String> samStrings = new ArrayList<String>(); 93 | SamReader samReader = SamReaderFactory.makeDefault().open(new File(input)); 94 | for (SAMRecord r : samReader) { 95 | samStrings.add(r.getSAMString().trim()); 96 | } 97 | samReader.close(); 98 | 99 | File outputFile = new File(new File(outputPath.toUri()), "part-m-00000"); 100 | BufferedReader br = new BufferedReader(new FileReader(outputFile)); 101 | String line; 102 | int index = 0; 103 | while ((line = br.readLine()) != null) { 104 | String value = line.substring(line.indexOf("\t") + 1); // ignore key 105 | assertEquals(samStrings.get(index++), value); 106 | } 107 | br.close(); 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/LazyBAMRecordFactory.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE.
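As a quick illustration of how the NIOFileUtil helpers above fit together when collecting MapReduce output, here is a minimal sketch. The output directory, the merged file name, and the inlined glob matching (getFilesMatching and mergeInto are package-private, so an outside caller would re-create their logic) are assumptions for illustration, not Hadoop-BAM API.

    import java.io.OutputStream;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.PathMatcher;
    import java.util.List;
    import java.util.stream.Collectors;
    import org.seqdoop.hadoop_bam.util.NIOFileUtil;

    public class MergePartsSketch {
        public static void main(String[] args) throws Exception {
            // asPath resolves scheme-qualified strings through NIO filesystem providers,
            // falling back to the thread context class loader for unloaded filesystems.
            Path dir = NIOFileUtil.asPath("file:///tmp/job-output");  // hypothetical job output dir
            PathMatcher matcher = dir.getFileSystem()
                .getPathMatcher("glob:**/part-[mr]-[0-9][0-9][0-9][0-9][0-9]*"); // same glob as PARTS_GLOB
            List<Path> parts = Files.walk(dir)
                .filter(matcher::matches)
                .sorted()                                             // parts merge in name order
                .collect(Collectors.toList());
            try (OutputStream out = Files.newOutputStream(dir.resolve("merged"))) {
                for (Path part : parts) {
                    Files.copy(part, out);                            // append each part verbatim
                }
            }
        }
    }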
20 | 21 | // File created: 2011-11-15 11:58:23 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import htsjdk.samtools.BAMRecord; 26 | import htsjdk.samtools.SAMFileHeader; 27 | import htsjdk.samtools.SAMRecord; 28 | import htsjdk.samtools.SAMRecordFactory; 29 | 30 | /** A factory for the kind of lazy {@link BAMRecord} used internally. */ 31 | public class LazyBAMRecordFactory implements SAMRecordFactory { 32 | @Override public SAMRecord createSAMRecord(SAMFileHeader hdr) { 33 | throw new UnsupportedOperationException( 34 | "LazyBAMRecordFactory can only create BAM records"); 35 | } 36 | 37 | @Override public BAMRecord createBAMRecord( 38 | SAMFileHeader hdr, 39 | int referenceSequenceIndex, int alignmentStart, 40 | short readNameLength, short mappingQuality, 41 | int indexingBin, int cigarLen, int flags, int readLen, 42 | int mateReferenceSequenceIndex, int mateAlignmentStart, 43 | int insertSize, byte[] variableLengthBlock) 44 | { 45 | return new LazyBAMRecord( 46 | hdr, referenceSequenceIndex, alignmentStart, readNameLength, 47 | mappingQuality, indexingBin, cigarLen, flags, readLen, 48 | mateReferenceSequenceIndex, mateAlignmentStart, insertSize, 49 | variableLengthBlock); 50 | } 51 | } 52 | 53 | class LazyBAMRecord extends BAMRecord { 54 | private boolean decodedRefIdx = false; 55 | private boolean decodedMateRefIdx = false; 56 | 57 | public LazyBAMRecord( 58 | SAMFileHeader hdr, int referenceID, int coordinate, short readNameLength, 59 | short mappingQuality, int indexingBin, int cigarLen, int flags, 60 | int readLen, int mateReferenceID, int mateCoordinate, int insertSize, 61 | byte[] restOfData) 62 | { 63 | super( 64 | hdr, referenceID, coordinate, readNameLength, mappingQuality, 65 | indexingBin, cigarLen, flags, readLen, mateReferenceID, 66 | mateCoordinate, insertSize, restOfData); 67 | } 68 | 69 | @Override public void setReferenceIndex(final int referenceIndex) { 70 | mReferenceIndex = referenceIndex; 71 | decodedRefIdx = false; 72 | } 73 | @Override public void setMateReferenceIndex(final int referenceIndex) { 74 | mMateReferenceIndex = referenceIndex; 75 | decodedMateRefIdx = false; 76 | } 77 | 78 | @Override public String getReferenceName() { 79 | if (mReferenceIndex != null && !decodedRefIdx) { 80 | decodedRefIdx = true; 81 | super.setReferenceIndex(mReferenceIndex); 82 | } 83 | return super.getReferenceName(); 84 | } 85 | 86 | @Override public String getMateReferenceName() { 87 | if (mMateReferenceIndex != null && !decodedMateRefIdx) { 88 | decodedMateRefIdx = true; 89 | super.setMateReferenceIndex(mMateReferenceIndex); 90 | } 91 | return super.getMateReferenceName(); 92 | } 93 | 94 | @Override protected void eagerDecode() { 95 | getReferenceName(); 96 | getMateReferenceName(); 97 | super.eagerDecode(); 98 | } 99 | 100 | @Override 101 | public boolean equals(Object o) { 102 | // don't use decoded flags for equality check 103 | return super.equals(o); 104 | } 105 | 106 | @Override 107 | public int hashCode() { 108 | // don't use decoded flags for hash code 109 | return super.hashCode(); 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/TestVCFInputFormatStringency.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the 
Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | package org.seqdoop.hadoop_bam; 22 | 23 | import htsjdk.samtools.ValidationStringency; 24 | import htsjdk.tribble.TribbleException; 25 | import htsjdk.variant.variantcontext.VariantContext; 26 | import java.util.List; 27 | import org.apache.hadoop.conf.Configuration; 28 | import org.apache.hadoop.io.LongWritable; 29 | import org.apache.hadoop.mapreduce.InputSplit; 30 | import org.apache.hadoop.mapreduce.JobContext; 31 | import org.apache.hadoop.mapreduce.RecordReader; 32 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 33 | import org.apache.hadoop.mapreduce.TaskAttemptID; 34 | import org.apache.hadoop.mapreduce.task.JobContextImpl; 35 | import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; 36 | import org.junit.Test; 37 | 38 | import static org.junit.Assert.assertEquals; 39 | import static org.junit.Assert.assertNotNull; 40 | import static org.mockito.Mockito.mock; 41 | 42 | public class TestVCFInputFormatStringency { 43 | 44 | public void checkReading(ValidationStringency validationStringency) throws Exception { 45 | String filename = "invalid_info_field.vcf"; 46 | Configuration conf = new Configuration(); 47 | String input_file = ClassLoader.getSystemClassLoader().getResource(filename).getFile(); 48 | conf.set("mapred.input.dir", "file://" + input_file); 49 | 50 | if (validationStringency != null) { 51 | VCFRecordReader.setValidationStringency(conf, validationStringency); 52 | } 53 | 54 | TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class)); 55 | JobContext ctx = new JobContextImpl(conf, taskAttemptContext.getJobID()); 56 | 57 | VCFInputFormat inputFormat = new VCFInputFormat(conf); 58 | List<InputSplit> splits = inputFormat.getSplits(ctx); 59 | assertEquals(1, splits.size()); 60 | RecordReader<LongWritable, VariantContextWritable> reader = 61 | inputFormat.createRecordReader(splits.get(0), taskAttemptContext); 62 | int counter = 0; 63 | while (reader.nextKeyValue()) { 64 | VariantContextWritable writable = reader.getCurrentValue(); 65 | assertNotNull(writable); 66 | VariantContext vc = writable.get(); 67 | assertNotNull(vc); 68 | String value = vc.toString(); 69 | assertNotNull(value); 70 | counter++; 71 | } 72 | assertEquals(4, counter); 73 | } 74 | 75 | @Test(expected = TribbleException.class) 76 | public void testUnset() throws Exception { 77 | checkReading(null); // defaults to strict 78 | } 79 | 80 | @Test(expected = TribbleException.class) 81 | public void testDefault() throws Exception { 82 | checkReading(ValidationStringency.DEFAULT_STRINGENCY); // defaults to strict 83 | } 84 | 85 | @Test 86 | public
void testSilent() throws Exception { 87 | checkReading(ValidationStringency.SILENT); 88 | } 89 | 90 | @Test 91 | public void testLenient() throws Exception { 92 | checkReading(ValidationStringency.LENIENT); 93 | } 94 | 95 | @Test(expected = TribbleException.class) 96 | public void testStrict() throws Exception { 97 | checkReading(ValidationStringency.STRICT); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/util/BGZFBlockIndex.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2010-08-25 12:20:03 22 | 23 | package org.seqdoop.hadoop_bam.util; 24 | 25 | import java.io.BufferedInputStream; 26 | import java.io.File; 27 | import java.io.FileInputStream; 28 | import java.io.InputStream; 29 | import java.io.IOException; 30 | import java.nio.ByteBuffer; 31 | import java.util.NavigableSet; 32 | import java.util.TreeSet; 33 | 34 | /** An index into BGZF-compressed files, for {@link BGZFSplitFileInputFormat}. 35 | * Reads files that are created by {@link BGZFBlockIndexer}. 36 | * 37 | *
Indexes the positions of individual gzip blocks in the file.
38 | */ 39 | public final class BGZFBlockIndex { 40 | private final NavigableSet<Long> offsets = new TreeSet<Long>(); 41 | 42 | public BGZFBlockIndex() {} 43 | public BGZFBlockIndex(final File path) throws IOException { 44 | this(new BufferedInputStream(new FileInputStream(path))); 45 | } 46 | public BGZFBlockIndex(final InputStream in) throws IOException { 47 | readIndex(in); 48 | } 49 | 50 | public void readIndex(final InputStream in) throws IOException { 51 | offsets.clear(); 52 | 53 | final ByteBuffer bb = ByteBuffer.allocate(8); 54 | 55 | for (long prev = -1; in.read(bb.array(), 2, 6) == 6;) { 56 | final long cur = bb.getLong(0); 57 | if (prev > cur) 58 | throw new IOException(String.format( 59 | "Invalid BGZF block index; offsets not in order: %#x > %#x", 60 | prev, cur)); 61 | 62 | offsets.add(prev = cur); 63 | } 64 | in.close(); 65 | 66 | if (offsets.size() < 1) 67 | throw new IOException( 68 | "Invalid BGZF block index: should contain at least the file size"); 69 | 70 | offsets.add(0L); 71 | } 72 | 73 | public Long prevBlock(final long filePos) { 74 | return offsets.floor(filePos); 75 | } 76 | public Long nextBlock(final long filePos) { 77 | return offsets.higher(filePos); 78 | } 79 | 80 | public int size() { return offsets.size(); } 81 | 82 | private long secondBlock() { return nextBlock(0); } 83 | private long lastBlock() { return prevBlock(fileSize() - 1); } 84 | private long fileSize() { return offsets.last(); } 85 | 86 | /** Writes some statistics about each BGZF block index file given as an 87 | * argument. 88 | */ 89 | public static void main(String[] args) { 90 | if (args.length == 0) { 91 | System.out.println( 92 | "Usage: BGZFBlockIndex [BGZF block indices...]\n\n"+ 93 | 94 | "Writes a few statistics about each BGZF block index."); 95 | return; 96 | } 97 | 98 | for (String arg : args) { 99 | final File f = new File(arg); 100 | if (f.isFile() && f.canRead()) { 101 | try { 102 | System.err.printf("%s:\n", f); 103 | final BGZFBlockIndex bi = new BGZFBlockIndex(f); 104 | final long second = bi.secondBlock(); 105 | final long last = bi.lastBlock(); 106 | System.err.printf( 107 | "\t%d blocks\n" + 108 | "\tfirst after 0 is at %#014x\n" + 109 | "\tlast is at %#014x\n" + 110 | "\tassociated BGZF file size %d\n", 111 | bi.size()-1, 112 | second, last, bi.fileSize()); 113 | } catch (IOException e) { 114 | System.err.printf("Failed to read %s!\n", f); 115 | e.printStackTrace(); 116 | } 117 | } else 118 | System.err.printf("%s does not look like a readable file!\n", f); 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringAnySAMOutputFormat.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software.
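A short sketch of querying the BGZFBlockIndex above to snap a raw byte offset onto BGZF block boundaries; the index file name and the offset are hypothetical.

    import java.io.File;
    import java.io.IOException;
    import org.seqdoop.hadoop_bam.util.BGZFBlockIndex;

    public class BlockBoundarySketch {
        public static void main(String[] args) throws IOException {
            // Hypothetical index file written beforehand by the block indexer.
            BGZFBlockIndex index = new BGZFBlockIndex(new File("reads.vcf.gz.blockidx"));
            long splitStart = 64L * 1024 * 1024;            // raw (compressed) offset of a split
            Long containing = index.prevBlock(splitStart);  // boundary at or before the offset
            Long following = index.nextBlock(splitStart);   // first boundary strictly after it
            System.out.printf("offset %#x lies in block at %#x; next block at %#x%n",
                splitStart, containing, following);
        }
    }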
12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2010-08-11 12:19:23 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import java.io.IOException; 26 | import java.io.InputStream; 27 | 28 | import htsjdk.samtools.SAMFileHeader; 29 | 30 | import org.apache.hadoop.conf.Configuration; 31 | import org.apache.hadoop.fs.Path; 32 | import org.apache.hadoop.mapreduce.RecordWriter; 33 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 34 | 35 | import org.seqdoop.hadoop_bam.util.SAMHeaderReader; 36 | 37 | /** Writes only the SAM records, not the key. 38 | * 39 | *
A {@link SAMFileHeader} must be provided via {@link #setSAMHeader} or 40 | * {@link #readSAMHeaderFrom} before {@link #getRecordWriter} is called.
41 | * 42 | *
By default, writes the SAM header to the output file(s). This 43 | * can be disabled, because in distributed usage one often ends up with (and, 44 | * for decent performance, wants to end up with) the output split into multiple 45 | * parts, which are easier to concatenate if the header is not present in each 46 | * file.
47 | */ 48 | public class KeyIgnoringAnySAMOutputFormat<K> extends AnySAMOutputFormat<K> { 49 | 50 | protected SAMFileHeader header; 51 | 52 | /** Whether the header will be written, defaults to true. 53 | */ 54 | public static final String WRITE_HEADER_PROPERTY = 55 | "hadoopbam.anysam.write-header"; 56 | 57 | public KeyIgnoringAnySAMOutputFormat(SAMFormat fmt) { 58 | super(fmt); 59 | } 60 | public KeyIgnoringAnySAMOutputFormat(Configuration conf) { 61 | super(conf); 62 | 63 | if (format == null) 64 | throw new IllegalArgumentException( 65 | "unknown SAM format: OUTPUT_SAM_FORMAT_PROPERTY not set"); 66 | } 67 | public KeyIgnoringAnySAMOutputFormat(Configuration conf, Path path) { 68 | super(conf); 69 | 70 | if (format == null) { 71 | format = SAMFormat.inferFromFilePath(path); 72 | 73 | if (format == null) 74 | throw new IllegalArgumentException("unknown SAM format: " + path); 75 | } 76 | } 77 | 78 | public SAMFileHeader getSAMHeader() { return header; } 79 | public void setSAMHeader(SAMFileHeader header) { this.header = header; } 80 | 81 | public void readSAMHeaderFrom(Path path, Configuration conf) 82 | throws IOException 83 | { 84 | this.header = SAMHeaderReader.readSAMHeaderFrom(path, conf); 85 | } 86 | public void readSAMHeaderFrom(InputStream in, Configuration conf) { 87 | this.header = SAMHeaderReader.readSAMHeaderFrom(in, conf); 88 | } 89 | 90 | /** setSAMHeader or readSAMHeaderFrom must have 91 | * been called first. 92 | */ 93 | @Override public RecordWriter<K,SAMRecordWritable> getRecordWriter( 94 | TaskAttemptContext ctx) 95 | throws IOException 96 | { 97 | return getRecordWriter(ctx, getDefaultWorkFile(ctx, "")); 98 | } 99 | 100 | // Allows wrappers to provide their own work file. 101 | public RecordWriter<K,SAMRecordWritable> getRecordWriter( 102 | TaskAttemptContext ctx, Path out) 103 | throws IOException 104 | { 105 | if (this.header == null) 106 | throw new IOException( 107 | "Can't create a RecordWriter without the SAM header"); 108 | 109 | final boolean writeHeader = ctx.getConfiguration().getBoolean( 110 | WRITE_HEADER_PROPERTY, true); 111 | 112 | switch (format) { 113 | case BAM: 114 | return new KeyIgnoringBAMRecordWriter<K>( 115 | out, header, writeHeader, ctx); 116 | 117 | case SAM: 118 | return new KeyIgnoringSAMRecordWriter<K>( 119 | out, header, writeHeader, ctx); 120 | 121 | case CRAM: 122 | return new KeyIgnoringCRAMRecordWriter<K>( 123 | out, header, writeHeader, ctx); 124 | 125 | default: assert false; return null; 126 | } 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/FileVirtualSplit.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software.
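Driver-side wiring for the output format above might look like the following sketch; the input path and the NullWritable key type are illustrative assumptions.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.seqdoop.hadoop_bam.KeyIgnoringAnySAMOutputFormat;
    import org.seqdoop.hadoop_bam.SAMFormat;

    public class OutputFormatSketch {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            // Suppress the per-part header so the parts concatenate cleanly afterwards.
            conf.setBoolean(KeyIgnoringAnySAMOutputFormat.WRITE_HEADER_PROPERTY, false);

            KeyIgnoringAnySAMOutputFormat<NullWritable> fmt =
                new KeyIgnoringAnySAMOutputFormat<>(SAMFormat.BAM);
            // A header must be in place before getRecordWriter is ever called.
            fmt.readSAMHeaderFrom(new Path("file:///data/input.bam"), conf); // hypothetical input
            System.out.println("header sort order: " + fmt.getSAMHeader().getSortOrder());
        }
    }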
12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | // File created: 2010-08-09 13:06:32 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import java.io.DataOutput; 26 | import java.io.DataInput; 27 | import java.io.IOException; 28 | 29 | import org.apache.hadoop.fs.Path; 30 | import org.apache.hadoop.io.Text; 31 | import org.apache.hadoop.io.Writable; 32 | import org.apache.hadoop.mapreduce.InputSplit; 33 | 34 | /** Like a {@link org.apache.hadoop.mapreduce.lib.input.FileSplit}, but uses 35 | * BGZF virtual offsets to fit with {@link 36 | * htsjdk.samtools.util.BlockCompressedInputStream}. 37 | */ 38 | public class FileVirtualSplit extends InputSplit implements Writable { 39 | private Path file; 40 | private long vStart; 41 | private long vEnd; 42 | private final String[] locations; 43 | private long[] intervalFilePointers; 44 | 45 | private static final String[] NO_LOCATIONS = {}; 46 | 47 | public FileVirtualSplit() { locations = NO_LOCATIONS; } 48 | 49 | public FileVirtualSplit(Path f, long vs, long ve, String[] locs) { 50 | file = f; 51 | vStart = vs; 52 | vEnd = ve; 53 | locations = locs; 54 | } 55 | 56 | public FileVirtualSplit(Path f, long vs, long ve, String[] locs, long[] intervalFilePointers) { 57 | file = f; 58 | vStart = vs; 59 | vEnd = ve; 60 | locations = locs; 61 | this.intervalFilePointers = intervalFilePointers; 62 | } 63 | 64 | @Override public String[] getLocations() { return locations; } 65 | 66 | /** Inexact due to the nature of virtual offsets. 67 | * 68 | * We can't know how many blocks there are in between two file offsets, nor 69 | * how large those blocks are. So this uses only the difference between the 70 | * file offsets—unless that difference is zero, in which case the split is 71 | * wholly contained in one block and thus we can give an exact result. 72 | */ 73 | @Override public long getLength() { 74 | final long vsHi = vStart & ~0xffff; 75 | final long veHi = vEnd & ~0xffff; 76 | final long hiDiff = veHi - vsHi; 77 | return hiDiff == 0 ? ((vEnd & 0xffff) - (vStart & 0xffff)) : hiDiff; 78 | } 79 | 80 | public Path getPath() { return file; } 81 | 82 | /** Inclusive. */ 83 | public long getStartVirtualOffset() { return vStart; } 84 | 85 | /** Exclusive. */ 86 | public long getEndVirtualOffset() { return vEnd; } 87 | 88 | public void setStartVirtualOffset(long vo) { vStart = vo; } 89 | public void setEndVirtualOffset(long vo) { vEnd = vo; } 90 | 91 | /** 92 | * @return pairs of virtual file pointers for all intervals that should be used for 93 | * filtering the split, or null if there are none. These correspond to 94 | * BAMFileSpan chunk start/stop pointers in htsjdk. 
95 | */ 96 | public long[] getIntervalFilePointers() { 97 | return intervalFilePointers; 98 | } 99 | 100 | @Override public void write(DataOutput out) throws IOException { 101 | Text.writeString(out, file.toString()); 102 | out.writeLong(vStart); 103 | out.writeLong(vEnd); 104 | out.writeBoolean(intervalFilePointers != null); 105 | if (intervalFilePointers != null) { 106 | out.writeInt(intervalFilePointers.length); 107 | for (int i = 0; i < intervalFilePointers.length; i++) { 108 | out.writeLong(intervalFilePointers[i]); 109 | } 110 | } 111 | } 112 | @Override public void readFields(DataInput in) throws IOException { 113 | file = new Path(Text.readString(in)); 114 | vStart = in.readLong(); 115 | vEnd = in.readLong(); 116 | if (in.readBoolean()) { 117 | intervalFilePointers = new long[in.readInt()]; 118 | for (int i = 0; i < intervalFilePointers.length; i++) { 119 | intervalFilePointers[i] = in.readLong(); 120 | } 121 | } 122 | } 123 | 124 | @Override 125 | public String toString() { return file + ":" + vStart + "-" + vEnd; } 126 | } 127 | -------------------------------------------------------------------------------- /src/test/java/org/seqdoop/hadoop_bam/TestCRAMInputFormatOnHDFS.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import htsjdk.samtools.SAMRecord; 4 | import htsjdk.samtools.SamReader; 5 | import htsjdk.samtools.SamReaderFactory; 6 | import java.io.File; 7 | import java.io.IOException; 8 | import java.net.URI; 9 | import java.net.URISyntaxException; 10 | import java.nio.file.Files; 11 | import java.nio.file.Paths; 12 | import java.util.List; 13 | import org.apache.hadoop.conf.Configuration; 14 | import org.apache.hadoop.fs.FileUtil; 15 | import org.apache.hadoop.hdfs.MiniDFSCluster; 16 | import org.apache.hadoop.io.LongWritable; 17 | import org.apache.hadoop.mapreduce.InputSplit; 18 | import org.apache.hadoop.mapreduce.JobContext; 19 | import org.apache.hadoop.mapreduce.RecordReader; 20 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 21 | import org.apache.hadoop.mapreduce.TaskAttemptID; 22 | import org.apache.hadoop.mapreduce.task.JobContextImpl; 23 | import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; 24 | import org.junit.AfterClass; 25 | import org.junit.Before; 26 | import org.junit.BeforeClass; 27 | import org.junit.Test; 28 | 29 | import static org.junit.Assert.assertEquals; 30 | import static org.junit.Assert.assertTrue; 31 | import static org.mockito.Mockito.mock; 32 | 33 | public class TestCRAMInputFormatOnHDFS { 34 | private String input; 35 | private String reference; 36 | private TaskAttemptContext taskAttemptContext; 37 | private JobContext jobContext; 38 | 39 | 40 | private static MiniDFSCluster cluster; 41 | private static URI clusterUri; 42 | 43 | @BeforeClass 44 | public static void setUpBeforeClass() throws Exception { 45 | cluster = startMini(TestCRAMInputFormatOnHDFS.class.getName()); 46 | clusterUri = formalizeClusterURI(cluster.getFileSystem().getUri()); 47 | } 48 | 49 | @AfterClass 50 | public static void teardownClass() throws Exception { 51 | if (cluster != null) 52 | { 53 | cluster.shutdown(); 54 | } 55 | } 56 | 57 | 58 | @Before 59 | public void setup() throws Exception { 60 | Configuration conf = new Configuration(); 61 | input = ClassLoader.getSystemClassLoader().getResource("test.cram").getFile(); 62 | reference = ClassLoader.getSystemClassLoader().getResource("auxf.fa").toURI().toString(); 63 | String referenceIndex = 
ClassLoader.getSystemClassLoader().getResource("auxf.fa.fai") 64 | .toURI().toString(); 65 | conf.set("mapred.input.dir", "file://" + input); 66 | 67 | URI hdfsRef = clusterUri.resolve("/tmp/auxf.fa"); 68 | URI hdfsRefIndex = clusterUri.resolve("/tmp/auxf.fa.fai"); 69 | Files.copy(Paths.get(URI.create(reference)), Paths.get(hdfsRef)); 70 | Files.copy(Paths.get(URI.create(referenceIndex)), Paths.get(hdfsRefIndex)); 71 | 72 | conf.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY, hdfsRef.toString()); 73 | 74 | 75 | taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class)); 76 | jobContext = new JobContextImpl(conf, taskAttemptContext.getJobID()); 77 | 78 | } 79 | 80 | private static MiniDFSCluster startMini(String testName) throws IOException { 81 | File baseDir = new File("./target/hdfs/" + testName).getAbsoluteFile(); 82 | FileUtil.fullyDelete(baseDir); 83 | Configuration conf = new Configuration(); 84 | conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath()); 85 | MiniDFSCluster.Builder builder = new MiniDFSCluster.Builder(conf); 86 | MiniDFSCluster hdfsCluster = builder.clusterId(testName).build(); 87 | hdfsCluster.waitActive(); 88 | return hdfsCluster; 89 | } 90 | 91 | protected static URI formalizeClusterURI(URI clusterUri) throws URISyntaxException { 92 | if (clusterUri.getPath()==null) { 93 | return new URI(clusterUri.getScheme(), null, 94 | clusterUri.getHost(), clusterUri.getPort(), 95 | "/", null, null); 96 | } else if (clusterUri.getPath().trim().isEmpty()) { 97 | return new URI(clusterUri.getScheme(), null, 98 | clusterUri.getHost(), clusterUri.getPort(), 99 | "/", null, null); 100 | } 101 | return clusterUri; 102 | } 103 | 104 | @Test 105 | public void testReader() throws Exception { 106 | int expectedCount = 0; 107 | SamReader samReader = SamReaderFactory.makeDefault() 108 | .referenceSequence(new File(URI.create(reference))).open(new File(input)); 109 | for (SAMRecord r : samReader) { 110 | expectedCount++; 111 | } 112 | 113 | CRAMInputFormat inputFormat = new CRAMInputFormat(); 114 | List<InputSplit> splits = inputFormat.getSplits(jobContext); 115 | assertEquals(1, splits.size()); 116 | RecordReader<LongWritable, SAMRecordWritable> reader = inputFormat 117 | .createRecordReader(splits.get(0), taskAttemptContext); 118 | reader.initialize(splits.get(0), taskAttemptContext); 119 | 120 | int actualCount = 0; 121 | while (reader.nextKeyValue()) { 122 | actualCount++; 123 | } 124 | 125 | assertEquals(expectedCount, actualCount); 126 | } 127 | 128 | } 129 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/CRAMRecordWriter.java: -------------------------------------------------------------------------------- 1 | package org.seqdoop.hadoop_bam; 2 | 3 | import java.io.*; 4 | import java.net.URI; 5 | import java.nio.file.Paths; 6 | 7 | import htsjdk.samtools.CRAMContainerStreamWriter; 8 | import htsjdk.samtools.SAMTextHeaderCodec; 9 | import htsjdk.samtools.cram.ref.ReferenceSource; 10 | import htsjdk.samtools.SAMFileHeader; 11 | import htsjdk.samtools.SAMRecord; 12 | import htsjdk.samtools.reference.ReferenceSequenceFileFactory; 13 | import htsjdk.samtools.util.StringLineReader; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.mapreduce.RecordWriter; 16 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 17 | 18 | import org.seqdoop.hadoop_bam.util.NIOFileUtil; 19 | import org.seqdoop.hadoop_bam.util.SAMHeaderReader; 20 | 21 | /** A base {@link RecordWriter} for CRAM records. 22 | * 23 | *
Handles the output stream, writing the header if requested, and provides 24 | * the {@link #writeAlignment} function for subclasses.
25 | *
Note that each file created by this class consists of a fragment of a 26 | * complete CRAM file containing only one or more CRAM containers that do not 27 | * include a CRAM file header, a SAMFileHeader, or a CRAM EOF container.
28 | */ 29 | public abstract class CRAMRecordWriter 30 | extends RecordWriter 31 | { 32 | // generic ID passed to CRAM code for internal error reporting 33 | private static final String HADOOP_BAM_PART_ID= "Hadoop-BAM-Part"; 34 | private OutputStream origOutput; 35 | private CRAMContainerStreamWriter cramContainerStream = null; 36 | private ReferenceSource refSource = null; 37 | private boolean writeHeader = true; 38 | 39 | /** A SAMFileHeader is read from the input Path. */ 40 | public CRAMRecordWriter( 41 | final Path output, 42 | final Path input, 43 | final boolean writeHeader, 44 | final TaskAttemptContext ctx) throws IOException 45 | { 46 | init( 47 | output, 48 | SAMHeaderReader.readSAMHeaderFrom(input, ctx.getConfiguration()), 49 | writeHeader, ctx); 50 | } 51 | 52 | public CRAMRecordWriter( 53 | final Path output, final SAMFileHeader header, final boolean writeHeader, 54 | final TaskAttemptContext ctx) 55 | throws IOException 56 | { 57 | init( 58 | output.getFileSystem(ctx.getConfiguration()).create(output), 59 | header, writeHeader, ctx); 60 | } 61 | 62 | // Working around not being able to call a constructor other than as the 63 | // first statement... 64 | private void init( 65 | final Path output, final SAMFileHeader header, final boolean writeHeader, 66 | final TaskAttemptContext ctx) 67 | throws IOException 68 | { 69 | init( 70 | output.getFileSystem(ctx.getConfiguration()).create(output), 71 | header, writeHeader, ctx); 72 | } 73 | 74 | private void init( 75 | final OutputStream output, final SAMFileHeader header, final boolean writeHeader, 76 | final TaskAttemptContext ctx) 77 | throws IOException 78 | { 79 | origOutput = output; 80 | this.writeHeader = writeHeader; 81 | 82 | final String referenceURI = 83 | ctx.getConfiguration().get(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY); 84 | refSource = new ReferenceSource(referenceURI == null ? null : 85 | NIOFileUtil.asPath(referenceURI)); 86 | 87 | // A SAMFileHeader must be supplied at CRAMContainerStreamWriter creation time; if 88 | // we don't have one then delay creation until we do 89 | if (header != null) { 90 | cramContainerStream = new CRAMContainerStreamWriter( 91 | origOutput, null, refSource, header, HADOOP_BAM_PART_ID); 92 | if (writeHeader) { 93 | this.writeHeader(header); 94 | } 95 | } 96 | } 97 | 98 | @Override public void close(TaskAttemptContext ctx) throws IOException { 99 | cramContainerStream.finish(false); // Close, but suppress CRAM EOF container 100 | origOutput.close(); // And close the original output. 
101 | } 102 | 103 | protected void writeAlignment(final SAMRecord rec) { 104 | if (null == cramContainerStream) { 105 | final SAMFileHeader header = rec.getHeader(); 106 | if (header == null) { 107 | throw new RuntimeException("Cannot write record to CRAM: null header in SAM record"); 108 | } 109 | if (writeHeader) { 110 | this.writeHeader(header); 111 | } 112 | cramContainerStream = new CRAMContainerStreamWriter( 113 | origOutput, null, refSource, header, HADOOP_BAM_PART_ID); 114 | } 115 | cramContainerStream.writeAlignment(rec); 116 | } 117 | 118 | private void writeHeader(final SAMFileHeader header) { 119 | cramContainerStream.writeHeader(header); 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/LazyVCFGenotypesContext.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 20 | 21 | package org.seqdoop.hadoop_bam; 22 | 23 | import java.io.UnsupportedEncodingException; 24 | import java.util.List; 25 | 26 | import htsjdk.tribble.readers.LineIterator; 27 | import htsjdk.variant.variantcontext.Allele; 28 | import htsjdk.variant.variantcontext.LazyGenotypesContext; 29 | import htsjdk.variant.vcf.AbstractVCFCodec; 30 | import htsjdk.variant.vcf.VCFHeader; 31 | import htsjdk.variant.vcf.VCFHeaderLine; 32 | import htsjdk.variant.vcf.VCFHeaderVersion; 33 | 34 | // File created: 2013-07-03 15:41:21 35 | 36 | // The actual parsing is delegated to AbstractVCFCodec. 37 | public class LazyVCFGenotypesContext extends LazyParsingGenotypesContext { 38 | 39 | /** Takes ownership of the given byte[]: don't modify its contents. */ 40 | public LazyVCFGenotypesContext( 41 | List alleles, String chrom, int start, 42 | byte[] utf8Unparsed, int count) 43 | { 44 | super(new Parser(alleles, chrom, start), utf8Unparsed, count); 45 | } 46 | 47 | public static class HeaderDataCache 48 | implements LazyParsingGenotypesContext.HeaderDataCache 49 | { 50 | private HeaderSettableVCFCodec codec = new HeaderSettableVCFCodec(); 51 | 52 | @Override public void setHeader(VCFHeader header) { 53 | VCFHeaderVersion version = null; 54 | 55 | // Normally AbstractVCFCodec parses the header and thereby sets the 56 | // version field. It gets used later on so we need to set it. 
57 | for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) { 58 | if (VCFHeaderVersion.isFormatString(line.getKey())) { 59 | version = VCFHeaderVersion.toHeaderVersion(line.getValue()); 60 | break; 61 | } 62 | } 63 | 64 | codec.setHeaderAndVersion(header, version); 65 | } 66 | 67 | public AbstractVCFCodec getCodec() { return codec; } 68 | } 69 | 70 | public static class Parser extends LazyParsingGenotypesContext.Parser { 71 | private HeaderSettableVCFCodec codec = null; 72 | private final List alleles; 73 | private final String chrom; 74 | private final int start; 75 | 76 | public Parser(List alleles, String chrom, int start) { 77 | this.alleles = alleles; 78 | this.chrom = chrom; 79 | this.start = start; 80 | } 81 | 82 | @Override public void setHeaderDataCache( 83 | LazyParsingGenotypesContext.HeaderDataCache data) 84 | { 85 | codec = (HeaderSettableVCFCodec)((HeaderDataCache)data).getCodec(); 86 | } 87 | 88 | @Override public LazyGenotypesContext.LazyData parse(final Object data) { 89 | if (codec == null || !codec.hasHeader()) 90 | throw new IllegalStateException( 91 | "Cannot decode genotypes without a codec with a VCFHeader"); 92 | 93 | final String str; 94 | try { 95 | str = new String((byte[])data, "UTF-8"); 96 | } catch (UnsupportedEncodingException absurd) { 97 | throw new RuntimeException( 98 | "Can never happen on a compliant Java implementation because "+ 99 | "UTF-8 is guaranteed to be supported"); 100 | } 101 | return codec.createGenotypeMap(str, alleles, chrom, start); 102 | } 103 | } 104 | } 105 | 106 | // This is a HACK. But, the functionality is only in AbstractVCFCodec so it 107 | // can't be helped. This is preferable to copying the functionality into 108 | // parse() above. 109 | class HeaderSettableVCFCodec extends AbstractVCFCodec { 110 | public boolean hasHeader() { return header != null; } 111 | 112 | public void setHeaderAndVersion(VCFHeader header, VCFHeaderVersion ver) { 113 | this.header = header; 114 | this.version = ver; 115 | } 116 | 117 | @Override public Object readActualHeader(LineIterator reader) { 118 | throw new UnsupportedOperationException( 119 | "Internal error: this shouldn't be called"); 120 | } 121 | @Override public List parseFilters(String filterString) { 122 | throw new UnsupportedOperationException( 123 | "Internal error: this shouldn't be called"); 124 | } 125 | @Override public boolean canDecode(String s) { 126 | return true; 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/ReferenceFragment.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012 Aalto University 2 | // 3 | // This file is part of Hadoop-BAM. 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to 7 | // deal in the Software without restriction, including without limitation the 8 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 9 | // sell copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in 13 | // all copies or substantial portions of the Software. 
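FileVirtualSplit above and SplittingBAMIndex later in this tree both manipulate BGZF virtual offsets, so a small worked sketch of the packing arithmetic may help; the offset values themselves are arbitrary.

    public class VirtualOffsetSketch {
        public static void main(String[] args) {
            long blockStart = 0x123456L; // raw (compressed) file offset of a BGZF block
            int withinBlock = 0x0042;    // uncompressed offset inside that block, < 64 KiB

            // A virtual offset packs both into one long: upper 48 bits hold the block's
            // file offset, lower 16 bits the position within the decompressed block.
            long voffset = (blockStart << 16) | withinBlock;

            // Unpacking mirrors the ~0xffff / & 0xffff masks in FileVirtualSplit.getLength.
            System.out.printf("block %#x, intra-block %#x%n",
                voffset >>> 16, voffset & 0xffff);
        }
    }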
14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 | // IN THE SOFTWARE. 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import org.apache.hadoop.io.Text; 26 | import org.apache.hadoop.io.Writable; 27 | import org.apache.hadoop.io.WritableUtils; 28 | 29 | import java.io.IOException; 30 | import java.io.DataInput; 31 | import java.io.DataOutput; 32 | 33 | // partly based on SequencedFragment 34 | // note: this class is supposed to represent a single line of a fasta input file, augmented by chromosome/contig name and start position 35 | 36 | public class ReferenceFragment implements Writable 37 | { 38 | protected Text sequence = new Text(); 39 | 40 | protected Integer position; 41 | protected String indexSequence; 42 | 43 | public void clear() 44 | { 45 | sequence.clear(); 46 | indexSequence = null; 47 | position = null; 48 | } 49 | 50 | /** 51 | * Get sequence Text object. 52 | * Trade encapsulation for efficiency. Here we expose the internal Text 53 | * object so that data may be read and written directly from/to it. 54 | * 55 | * Sequence should always be written using CAPITAL letters and 'N' for unknown bases. 56 | */ 57 | public Text getSequence() { return sequence; } 58 | 59 | /** 60 | * Set the position of the first base in this fragment within its 61 | * chromosome/contig. 62 | * The position must not be null; passing null raises an exception. 63 | * 64 | */ 65 | public void setPosition(Integer pos) { 66 | if (pos == null) 67 | throw new IllegalArgumentException("can't have null reference position"); 68 | position = pos; 69 | } 70 | 71 | public void setIndexSequence(String v) { 72 | if (v == null) 73 | throw new IllegalArgumentException("can't have null index sequence"); 74 | indexSequence = v; 75 | } 76 | 77 | public void setSequence(Text seq) 78 | { 79 | if (seq == null) 80 | throw new IllegalArgumentException("can't have a null sequence"); 81 | sequence = seq; 82 | } 83 | 84 | public Integer getPosition() { return position; } 85 | public String getIndexSequence() { return indexSequence; } 86 | 87 | /** 88 | * Recreates a pseudo fasta record with the fields available.
89 | */ 90 | public String toString() 91 | { 92 | String delim = "\t"; 93 | StringBuilder builder = new StringBuilder(800); 94 | builder.append(indexSequence).append(delim); 95 | builder.append(position).append(delim); 96 | builder.append(sequence); 97 | return builder.toString(); 98 | } 99 | 100 | public boolean equals(Object other) 101 | { 102 | if (other != null && other instanceof ReferenceFragment) 103 | { 104 | ReferenceFragment otherFrag = (ReferenceFragment)other; 105 | 106 | if (position == null && otherFrag.position != null || position != null && !position.equals(otherFrag.position)) 107 | return false; 108 | if (indexSequence == null && otherFrag.indexSequence != null || indexSequence != null && !indexSequence.equals(otherFrag.indexSequence)) 109 | return false; 110 | // sequence can't be null 111 | if (!sequence.equals(otherFrag.sequence)) 112 | return false; 113 | 114 | return true; 115 | } 116 | else 117 | return false; 118 | } 119 | 120 | @Override 121 | public int hashCode() { 122 | int result = sequence.hashCode(); 123 | result = 31 * result + (position != null ? position.hashCode() : 0); 124 | result = 31 * result + (indexSequence != null ? indexSequence.hashCode() : 0); 125 | return result; 126 | } 127 | 128 | public void readFields(DataInput in) throws IOException 129 | { 130 | // serialization order: 131 | // 1) sequence 132 | // 2) indexSequence (chromosome/contig name) 133 | // 3) position of first base in this line of the fasta file 134 | 135 | this.clear(); 136 | 137 | sequence.readFields(in); 138 | 139 | indexSequence = WritableUtils.readString(in); 140 | position = WritableUtils.readVInt(in); 141 | } 142 | 143 | public void write(DataOutput out) throws IOException 144 | { 145 | sequence.write(out); 146 | 147 | WritableUtils.writeString(out, indexSequence); 148 | WritableUtils.writeVInt(out, position); 149 | 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/util/SAMOutputPreparer.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE. 
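A minimal round-trip sketch for the ReferenceFragment Writable above; the field values are arbitrary.

    import java.io.*;
    import org.apache.hadoop.io.Text;
    import org.seqdoop.hadoop_bam.ReferenceFragment;

    public class FragmentRoundTripSketch {
        public static void main(String[] args) throws IOException {
            ReferenceFragment frag = new ReferenceFragment();
            frag.setIndexSequence("chr1");        // chromosome/contig name
            frag.setPosition(101);                // position of the line's first base
            frag.setSequence(new Text("ACGTN"));  // capital bases, 'N' for unknown

            // Serialization order is sequence, then contig name, then position.
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            frag.write(new DataOutputStream(bytes));

            ReferenceFragment copy = new ReferenceFragment();
            copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
            System.out.println(copy.equals(frag)); // true: all three fields are compared
        }
    }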
20 | 21 | // File created: 2012-07-26 14:36:03 22 | 23 | package org.seqdoop.hadoop_bam.util; 24 | 25 | import java.io.File; 26 | import java.io.FilterOutputStream; 27 | import java.io.IOException; 28 | import java.io.OutputStream; 29 | import java.io.OutputStreamWriter; 30 | import java.io.StringWriter; 31 | import java.nio.ByteBuffer; 32 | import java.nio.ByteOrder; 33 | import java.util.List; 34 | 35 | import htsjdk.samtools.SAMFileHeader; 36 | import htsjdk.samtools.SAMSequenceRecord; 37 | import htsjdk.samtools.SAMTextHeaderCodec; 38 | import htsjdk.samtools.cram.build.CramIO; 39 | import htsjdk.samtools.cram.common.CramVersions; 40 | import htsjdk.samtools.util.BlockCompressedOutputStream; 41 | 42 | import org.seqdoop.hadoop_bam.SAMFormat; 43 | 44 | public class SAMOutputPreparer { 45 | private ByteBuffer buf; 46 | 47 | public SAMOutputPreparer() { 48 | // Enough room for a 32-bit integer. 49 | buf = ByteBuffer.wrap(new byte[4]); 50 | buf.order(ByteOrder.LITTLE_ENDIAN); 51 | } 52 | 53 | public static final byte[] BAM_MAGIC = {'B','A','M', 1}; 54 | 55 | /** Prepares the given output stream for writing of SAMRecords in the given 56 | * format. This includes writing the given SAM header and, in the case of 57 | * BAM or CRAM, writing some further metadata as well as compressing everything 58 | * written. Returns a new stream to replace the original: it will do the 59 | * appropriate compression for BAM/CRAM files. 60 | */ 61 | public OutputStream prepareForRecords( 62 | OutputStream out, final SAMFormat format, 63 | final SAMFileHeader header) 64 | throws IOException { 65 | 66 | switch (format) { 67 | case SAM: 68 | out = prepareSAMOrBAMStream(out, format, header); 69 | break; 70 | case BAM: 71 | out = prepareSAMOrBAMStream(out, format, header); 72 | break; 73 | case CRAM: 74 | out = prepareCRAMStream(out, format, header); 75 | break; 76 | default: 77 | throw new IllegalArgumentException 78 | ("Unsupported SAM file format, must be one of SAM, BAM or CRAM"); 79 | } 80 | 81 | // Important for BAM: if the caller doesn't want to use the new stream 82 | // for some reason, the BlockCompressedOutputStream's buffer would never 83 | // be flushed. 
84 | out.flush(); 85 | return out; 86 | } 87 | 88 | private OutputStream prepareCRAMStream( 89 | OutputStream out, final SAMFormat format, 90 | final SAMFileHeader header) throws IOException 91 | { 92 | CramIO.writeHeader(CramVersions.DEFAULT_CRAM_VERSION, out, header, null); 93 | return out; 94 | } 95 | 96 | private OutputStream prepareSAMOrBAMStream( 97 | OutputStream out, final SAMFormat format, 98 | final SAMFileHeader header) throws IOException 99 | { 100 | final StringWriter sw = new StringWriter(); 101 | new SAMTextHeaderCodec().encode(sw, header); 102 | final String text = sw.toString(); 103 | 104 | if (format == SAMFormat.BAM) { 105 | out = new BlockCompressedOutputStream(out, (File) null); 106 | out.write(BAM_MAGIC); 107 | writeInt32(out, text.length()); 108 | } 109 | 110 | writeString(out, text); 111 | 112 | if (format == SAMFormat.BAM) { 113 | final List<SAMSequenceRecord> refs = 114 | header.getSequenceDictionary().getSequences(); 115 | 116 | writeInt32(out, refs.size()); 117 | 118 | for (final SAMSequenceRecord ref : refs) { 119 | final String name = ref.getSequenceName(); 120 | writeInt32(out, name.length() + 1); 121 | writeString(out, name); 122 | out.write(0); 123 | writeInt32(out, ref.getSequenceLength()); 124 | } 125 | } 126 | 127 | return out; 128 | } 129 | 130 | private static void writeString(final OutputStream out, final String s) 131 | throws IOException 132 | { 133 | // Don't flush the underlying stream yet, only the writer: in the case of 134 | // BAM, we might be able to cram more things into the gzip block still. 135 | final OutputStreamWriter w = new OutputStreamWriter( 136 | new FilterOutputStream(out) { @Override public void flush() {} } ); 137 | w.write(s); 138 | w.flush(); 139 | } 140 | 141 | private void writeInt32(final OutputStream out, int n) throws IOException { 142 | buf.putInt(0, n); 143 | out.write(buf.array()); 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /src/main/java/org/seqdoop/hadoop_bam/SplittingBAMIndex.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010 Aalto University 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to 5 | // deal in the Software without restriction, including without limitation the 6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | // sell copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | // IN THE SOFTWARE.
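The preparer above might be used to start a merged output file to which headerless record parts are then appended; this sketch shows the BAM case with a hypothetical destination file and a toy header.

    import java.io.FileOutputStream;
    import java.io.OutputStream;
    import java.util.Collections;
    import htsjdk.samtools.SAMFileHeader;
    import htsjdk.samtools.SAMSequenceDictionary;
    import htsjdk.samtools.SAMSequenceRecord;
    import org.seqdoop.hadoop_bam.SAMFormat;
    import org.seqdoop.hadoop_bam.util.SAMOutputPreparer;

    public class PreparerSketch {
        public static void main(String[] args) throws Exception {
            SAMFileHeader header = new SAMFileHeader();
            header.setSequenceDictionary(new SAMSequenceDictionary(
                Collections.singletonList(new SAMSequenceRecord("chr1", 1000))));

            // prepareForRecords writes the BAM magic, text header and reference
            // dictionary, and hands back a block-compressed stream for record data.
            OutputStream raw = new FileOutputStream("merged.bam"); // hypothetical destination
            OutputStream prepared = new SAMOutputPreparer()
                .prepareForRecords(raw, SAMFormat.BAM, header);
            // ...headerless BAM parts would be copied into 'prepared' here...
            prepared.close();
        }
    }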
20 | 21 | // File created: 2010-08-04 13:11:10 22 | 23 | package org.seqdoop.hadoop_bam; 24 | 25 | import java.io.BufferedInputStream; 26 | import java.io.File; 27 | import java.io.FileInputStream; 28 | import java.io.InputStream; 29 | import java.io.IOException; 30 | import java.nio.ByteBuffer; 31 | import java.util.ArrayList; 32 | import java.util.List; 33 | import java.util.NavigableSet; 34 | import java.util.TreeSet; 35 | 36 | /** An index into BAM files, for {@link BAMInputFormat}. Reads files that are 37 | * created by {@link SplittingBAMIndexer}. 38 | * 39 | *
Indexes the positions of individual BAM records in the file.
40 | */ 41 | public final class SplittingBAMIndex { 42 | private final NavigableSet<Long> virtualOffsets = new TreeSet<Long>(); 43 | 44 | public SplittingBAMIndex() {} 45 | public SplittingBAMIndex(final File path) throws IOException { 46 | this(new BufferedInputStream(new FileInputStream(path))); 47 | } 48 | public SplittingBAMIndex(final InputStream in) throws IOException { 49 | readIndex(in); 50 | } 51 | 52 | public void readIndex(final InputStream in) throws IOException { 53 | virtualOffsets.clear(); 54 | 55 | final ByteBuffer bb = ByteBuffer.allocate(8); 56 | 57 | for (long prev = -1; in.read(bb.array()) == 8;) { 58 | final long cur = bb.getLong(0); 59 | if (prev > cur) 60 | throw new IOException(String.format( 61 | "Invalid splitting BAM index; offsets not in order: %#x > %#x", 62 | prev, cur)); 63 | 64 | virtualOffsets.add(prev = cur); 65 | } 66 | in.close(); 67 | 68 | if (virtualOffsets.size() < 1) 69 | throw new IOException( 70 | "Invalid splitting BAM index: "+ 71 | "should contain at least the file size"); 72 | } 73 | 74 | public List<Long> getVirtualOffsets() { 75 | return new ArrayList<>(virtualOffsets); 76 | } 77 | 78 | public Long prevAlignment(final long filePos) { 79 | return virtualOffsets.floor(filePos << 16); 80 | } 81 | public Long nextAlignment(final long filePos) { 82 | return virtualOffsets.higher(filePos << 16); 83 | } 84 | 85 | public int size() { return virtualOffsets.size(); } 86 | 87 | private long first() { return virtualOffsets.first(); } 88 | private long last() { return prevAlignment(bamSize() - 1); } 89 | long bamSize() { return virtualOffsets.last() >>> 16; } 90 | 91 | @Override 92 | public boolean equals(Object o) { 93 | if (this == o) return true; 94 | if (o == null || getClass() != o.getClass()) return false; 95 | 96 | SplittingBAMIndex that = (SplittingBAMIndex) o; 97 | 98 | return virtualOffsets != null ? virtualOffsets.equals(that.virtualOffsets) : that 99 | .virtualOffsets == null; 100 | 101 | } 102 | 103 | @Override 104 | public int hashCode() { 105 | return virtualOffsets != null ? virtualOffsets.hashCode() : 0; 106 | } 107 | 108 | @Override 109 | public String toString() { 110 | return virtualOffsets.toString(); 111 | } 112 | 113 | /** Writes some statistics about each splitting BAM index file given as an 114 | * argument.
115 | */ 116 | public static void main(String[] args) { 117 | if (args.length == 0) { 118 | System.out.println( 119 | "Usage: SplittingBAMIndex [splitting BAM indices...]\n\n"+ 120 | 121 | "Writes a few statistics about each splitting BAM index."); 122 | return; 123 | } 124 | 125 | for (String arg : args) { 126 | final File f = new File(arg); 127 | if (f.isFile() && f.canRead()) { 128 | try { 129 | System.err.printf("%s:\n", f); 130 | final SplittingBAMIndex bi = new SplittingBAMIndex(f); 131 | if (bi.size() == 1) { 132 | System.err.printf("\t0 alignments\n" + 133 | "\tassociated BAM file size %d\n", bi.bamSize()); 134 | } else { 135 | final long first = bi.first(); 136 | final long last = bi.last(); 137 | System.err.printf( 138 | "\t%d alignments\n" + 139 | "\tfirst is at %#06x in BGZF block at %#014x\n" + 140 | "\tlast is at %#06x in BGZF block at %#014x\n" + 141 | "\tassociated BAM file size %d\n", 142 | bi.size(), 143 | first & 0xffff, first >>> 16, 144 | last & 0xffff, last >>> 16, 145 | bi.bamSize()); 146 | } 147 | } catch (IOException e) { 148 | System.err.printf("Failed to read %s!\n", f); 149 | e.printStackTrace(); 150 | } 151 | } else 152 | System.err.printf("%s does not look like a readable file!\n", f); 153 | } 154 | } 155 | } 156 | --------------------------------------------------------------------------------
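Finally, a sketch of reading one of these splitting indices and snapping a split boundary to a record start, in the spirit of what BAMInputFormat does with them; the index file name and the offset are hypothetical.

    import java.io.File;
    import java.io.IOException;
    import org.seqdoop.hadoop_bam.SplittingBAMIndex;

    public class SplitBoundarySketch {
        public static void main(String[] args) throws IOException {
            // Hypothetical index file produced by SplittingBAMIndexer for reads.bam.
            SplittingBAMIndex index = new SplittingBAMIndex(new File("reads.bam.splitting-bai"));
            long splitBoundary = 128L * 1024 * 1024;           // raw (compressed) byte offset
            Long vOffset = index.nextAlignment(splitBoundary); // first record past the boundary
            if (vOffset != null) {
                // Virtual offsets: upper 48 bits BGZF block offset, lower 16 bits within block.
                System.out.printf("first record at %#x within block at %#x%n",
                    vOffset & 0xffff, vOffset >>> 16);
            }
        }
    }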