allIntervals = IntervalUtil.getIntervals(conf, "prop-name");
65 | Assert.assertNotNull(allIntervals);
66 | Assert.assertEquals(allIntervals.size(), validIntervals.length);
67 | for (int i = 0; i < validIntervals.length; i++) {
68 | Assert.assertNotNull(allIntervals.get(i));
69 | Assert.assertEquals(allIntervals.get(i).getContig(), validIntervals[i][1]);
70 | Assert.assertEquals(allIntervals.get(i).getStart(), validIntervals[i][2]);
71 | Assert.assertEquals(allIntervals.get(i).getEnd(), validIntervals[i][3]);
72 | }
73 | }
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/util/BGZFEnhancedGzipCodec.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam.util;
2 |
3 | import htsjdk.samtools.util.BlockCompressedInputStream;
4 | import java.io.BufferedInputStream;
5 | import java.io.IOException;
6 | import java.io.InputStream;
7 | import org.apache.hadoop.fs.Seekable;
8 | import org.apache.hadoop.io.compress.CompressionCodec;
9 | import org.apache.hadoop.io.compress.CompressionInputStream;
10 | import org.apache.hadoop.io.compress.Decompressor;
11 | import org.apache.hadoop.io.compress.GzipCodec;
12 | import org.apache.hadoop.io.compress.SplitCompressionInputStream;
13 | import org.apache.hadoop.io.compress.SplittableCompressionCodec;
14 |
15 | /**
16 | * A Hadoop {@link CompressionCodec} for the
17 | * BGZF compression format,
18 | * which reads and writes files with a .gz suffix.
19 | *
20 | * BGZF is a splittable extension of gzip, which means that all BGZF files are standard
21 | * gzip files, however the reverse is not necessarily the case. BGZF files often have the
22 | * standard .gz suffix (such as those produced by the
23 | * bcftools command),
24 | * which causes a difficulty since it is not immediately apparent from the filename alone
25 | * whether a file is a BGZF file, or merely a regular gzip file. BGZFEnhancedGzipCodec
26 | * will read the start of the file to look for BGZF headers to detect the type of
27 | * compression.
28 | *
29 | *
30 | * BGZFEnhancedGzipCodec will read BGZF or gzip files, but currently always writes regular gzip files.
31 | *
32 | *
33 | * To use BGZFEnhancedGzipCodec, set it on the configuration object as follows. This will
34 | * override the built-in GzipCodec that is mapped to the .gz suffix.
35 | *
36 | * {@code
37 | * conf.set("io.compression.codecs", BGZFEnhancedGzipCodec.class.getCanonicalName())
38 | * }
39 | * @see BGZFCodec
40 | */
41 | public class BGZFEnhancedGzipCodec extends GzipCodec implements SplittableCompressionCodec {
42 |
43 | @Override
44 | public SplitCompressionInputStream createInputStream(InputStream seekableIn, Decompressor decompressor, long start, long end, READ_MODE readMode) throws IOException {
45 | if (!(seekableIn instanceof Seekable)) {
46 | throw new IOException("seekableIn must be an instance of " +
47 | Seekable.class.getName());
48 | }
49 | if (!BlockCompressedInputStream.isValidFile(new BufferedInputStream(seekableIn))) {
50 | // data is regular gzip, not BGZF
51 | ((Seekable)seekableIn).seek(0);
52 | final CompressionInputStream compressionInputStream = createInputStream(seekableIn,
53 | decompressor);
54 | return new SplitCompressionInputStream(compressionInputStream, start, end) {
55 | @Override
56 | public int read(byte[] b, int off, int len) throws IOException {
57 | return compressionInputStream.read(b, off, len);
58 | }
59 | @Override
60 | public void resetState() throws IOException {
61 | compressionInputStream.resetState();
62 | }
63 | @Override
64 | public int read() throws IOException {
65 | return compressionInputStream.read();
66 | }
67 | };
68 | }
69 | BGZFSplitGuesser splitGuesser = new BGZFSplitGuesser(seekableIn);
70 | long adjustedStart = splitGuesser.guessNextBGZFBlockStart(start, end);
71 | ((Seekable)seekableIn).seek(adjustedStart);
72 | return new BGZFSplitCompressionInputStream(seekableIn, adjustedStart, end);
73 | }
74 |
75 | }
76 |
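A minimal usage sketch for the codec above, assuming org.apache.hadoop.conf.Configuration is imported and that no other compression codecs need to stay registered (otherwise append this class to the existing io.compression.codecs value instead of overwriting it):

    Configuration conf = new Configuration();
    // Map the .gz suffix to BGZFEnhancedGzipCodec so BGZF files are detected and split;
    // plain gzip files are still readable through the inherited GzipCodec path, just not split.
    conf.set("io.compression.codecs", BGZFEnhancedGzipCodec.class.getCanonicalName());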
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/BaseSplitGuesser.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam;
2 |
3 | import htsjdk.samtools.seekablestream.SeekableStream;
4 | import java.io.IOException;
5 | import java.nio.ByteBuffer;
6 | import java.nio.ByteOrder;
7 | import org.apache.hadoop.io.IOUtils;
8 |
9 | class BaseSplitGuesser {
10 |
11 | protected final static int BGZF_MAGIC = 0x04088b1f;
12 | protected final static int BGZF_MAGIC_SUB = 0x00024342;
13 | protected final static int BGZF_SUB_SIZE = 4 + 2;
14 |
15 | protected SeekableStream in;
16 | protected final ByteBuffer buf;
17 |
18 | public BaseSplitGuesser() {
19 | buf = ByteBuffer.allocate(8);
20 | buf.order(ByteOrder.LITTLE_ENDIAN);
21 | }
22 |
23 | protected static class PosSize {
24 | public int pos;
25 | public int size;
26 | public PosSize(int p, int s) { pos = p; size = s; }
27 | }
28 |
29 | // Gives the compressed size on the side. Returns null if it doesn't find
30 | // anything.
31 | protected PosSize guessNextBGZFPos(int p, int end) {
32 | try { for (;;) {
33 | for (;;) {
34 | in.seek(p);
35 | IOUtils.readFully(in, buf.array(), 0, 4);
36 | int n = buf.getInt(0);
37 |
38 | if (n == BGZF_MAGIC)
39 | break;
40 |
41 | // Skip ahead a bit more than 1 byte if you can.
42 | if (n >>> 8 == BGZF_MAGIC << 8 >>> 8)
43 | ++p;
44 | else if (n >>> 16 == BGZF_MAGIC << 16 >>> 16)
45 | p += 2;
46 | else
47 | p += 3;
48 |
49 | if (p >= end)
50 | return null;
51 | }
52 | // Found what looks like a gzip block header: now get XLEN and
53 | // search for the BGZF subfield.
54 | final int p0 = p;
55 | p += 10;
56 | in.seek(p);
57 | IOUtils.readFully(in, buf.array(), 0, 2);
58 | p += 2;
59 | final int xlen = getUShort(0);
60 | final int subEnd = p + xlen;
61 |
62 | while (p < subEnd) {
63 | IOUtils.readFully(in, buf.array(), 0, 4);
64 |
65 | if (buf.getInt(0) != BGZF_MAGIC_SUB) {
66 | p += 4 + getUShort(2);
67 | in.seek(p);
68 | continue;
69 | }
70 |
71 | // Found it: this is close enough to a BGZF block, make it
72 | // our guess.
73 |
74 | // But find out the size before returning. First, grab bsize:
75 | // we'll need it later.
76 | IOUtils.readFully(in, buf.array(), 0, 2);
77 | int bsize = getUShort(0);
78 |
79 | // Then skip the rest of the subfields.
80 | p += BGZF_SUB_SIZE;
81 | while (p < subEnd) {
82 | in.seek(p);
83 | IOUtils.readFully(in, buf.array(), 0, 4);
84 | p += 4 + getUShort(2);
85 | }
86 | if (p != subEnd) {
87 | // Cancel our guess because the xlen field didn't match the
88 | // data.
89 | break;
90 | }
91 |
92 | // Now skip past the compressed data and the CRC-32.
93 | p += bsize - xlen - 19 + 4;
94 | in.seek(p);
95 | IOUtils.readFully(in, buf.array(), 0, 4);
96 | return new PosSize(p0, buf.getInt(0));
97 | }
98 | // No luck: look for the next gzip block header. Start right after
99 | // where we last saw the identifiers, although we could probably
100 | // safely skip further ahead. (If we find the correct one right
101 | // now, the previous block contained 0x1f8b0804 bytes of data: that
102 | // seems... unlikely.)
103 | p = p0 + 4;
104 |
105 | }} catch (IOException e) {
106 | return null;
107 | }
108 | }
109 |
110 | protected int getUShort(final int idx) {
111 | return (int)buf.getShort(idx) & 0xffff;
112 | }
113 | }
114 |
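A self-contained sketch of why BGZF_MAGIC above equals 0x04088b1f: the first four gzip header bytes 1f 8b 08 04 (ID1, ID2, CM=deflate, FLG=FEXTRA) read as a little-endian int, which is how guessNextBGZFPos compares buf.getInt(0) against the constant. The class name here is hypothetical and not part of the library:

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;

    class BgzfMagicCheck {
      public static void main(String[] args) {
        // First four bytes of any BGZF member: gzip magic, deflate method, FEXTRA flag set.
        ByteBuffer buf = ByteBuffer.wrap(new byte[] { 0x1f, (byte) 0x8b, 0x08, 0x04 });
        buf.order(ByteOrder.LITTLE_ENDIAN);
        System.out.println(Integer.toHexString(buf.getInt(0))); // prints 4088b1f
      }
    }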
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/SAMRecordWriter.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2010 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2012-02-23 12:42:49
22 |
23 | package org.seqdoop.hadoop_bam;
24 |
25 | import java.io.IOException;
26 | import java.io.OutputStream;
27 |
28 | import htsjdk.samtools.SAMFileHeader;
29 | import htsjdk.samtools.SAMRecord;
30 | import htsjdk.samtools.SAMTextWriter;
31 |
32 | import org.apache.hadoop.fs.Path;
33 | import org.apache.hadoop.mapreduce.RecordWriter;
34 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
35 |
36 | import org.seqdoop.hadoop_bam.util.SAMHeaderReader;
37 |
38 | /** A base {@link RecordWriter} for SAM records.
39 | *
40 | * Handles the output stream, writing the header if requested, and provides
41 | * the {@link #writeAlignment} function for subclasses.
42 | */
43 | public abstract class SAMRecordWriter<K>
44 | extends RecordWriter<K,SAMRecordWritable>
45 | {
46 | private SAMTextWriter writer;
47 | private SAMFileHeader header;
48 |
49 | /** A SAMFileHeader is read from the input Path. */
50 | public SAMRecordWriter(
51 | Path output, Path input, boolean writeHeader, TaskAttemptContext ctx)
52 | throws IOException
53 | {
54 | init(
55 | output,
56 | SAMHeaderReader.readSAMHeaderFrom(input, ctx.getConfiguration()),
57 | writeHeader, ctx);
58 | }
59 | public SAMRecordWriter(
60 | Path output, SAMFileHeader header, boolean writeHeader,
61 | TaskAttemptContext ctx)
62 | throws IOException
63 | {
64 | init(
65 | output.getFileSystem(ctx.getConfiguration()).create(output),
66 | header, writeHeader);
67 | }
68 | public SAMRecordWriter(
69 | OutputStream output, SAMFileHeader header, boolean writeHeader)
70 | throws IOException
71 | {
72 | init(output, header, writeHeader);
73 | }
74 |
75 | private void init(
76 | Path output, SAMFileHeader header, boolean writeHeader,
77 | TaskAttemptContext ctx)
78 | throws IOException
79 | {
80 | init(
81 | output.getFileSystem(ctx.getConfiguration()).create(output),
82 | header, writeHeader);
83 | }
84 | private void init(
85 | OutputStream output, SAMFileHeader header, boolean writeHeader)
86 | throws IOException
87 | {
88 | this.header = header;
89 | writer = new SAMTextWriter(output);
90 |
91 | writer.setSortOrder(header.getSortOrder(), false);
92 | if (writeHeader)
93 | writer.setHeader(header);
94 | }
95 |
96 | @Override public void close(TaskAttemptContext ctx) {
97 | writer.close();
98 | }
99 |
100 | protected void writeAlignment(final SAMRecord rec) {
101 | rec.setHeader(header);
102 | writer.writeAlignment(rec);
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBAMOutputFormat.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2010 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2010-08-11 12:19:23
22 |
23 | package org.seqdoop.hadoop_bam;
24 |
25 | import java.io.IOException;
26 | import java.io.InputStream;
27 |
28 | import htsjdk.samtools.SAMFileHeader;
29 |
30 | import org.apache.hadoop.conf.Configuration;
31 | import org.apache.hadoop.fs.Path;
32 | import org.apache.hadoop.mapreduce.RecordWriter;
33 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
34 |
35 | import org.seqdoop.hadoop_bam.util.SAMHeaderReader;
36 |
37 | /** Writes only the BAM records, not the key.
38 | *
39 | * A {@link SAMFileHeader} must be provided via {@link #setSAMHeader} or
40 | * {@link #readSAMHeaderFrom} before {@link #getRecordWriter} is called.
41 | *
42 | * By default, writes the SAM header to the output file(s). This
43 | * can be disabled, because in distributed usage one often ends up with (and,
44 | * for decent performance, wants to end up with) the output split into multiple
45 | * parts, which are easier to concatenate if the header is not present in each
46 | * file.
47 | */
48 | public class KeyIgnoringBAMOutputFormat<K> extends BAMOutputFormat<K> {
49 | protected SAMFileHeader header;
50 | private boolean writeHeader = true;
51 |
52 | public KeyIgnoringBAMOutputFormat() {}
53 |
54 | /** Whether the header will be written or not. */
55 | public boolean getWriteHeader() { return writeHeader; }
56 |
57 | /** Set whether the header will be written or not. */
58 | public void setWriteHeader(boolean b) { writeHeader = b; }
59 |
60 | public SAMFileHeader getSAMHeader() { return header; }
61 | public void setSAMHeader(SAMFileHeader header) { this.header = header; }
62 |
63 | public void readSAMHeaderFrom(Path path, Configuration conf)
64 | throws IOException
65 | {
66 | this.header = SAMHeaderReader.readSAMHeaderFrom(path, conf);
67 | }
68 | public void readSAMHeaderFrom(InputStream in, Configuration conf) {
69 | this.header = SAMHeaderReader.readSAMHeaderFrom(in, conf);
70 | }
71 |
72 | /** setSAMHeader or readSAMHeaderFrom must have
73 | * been called first.
74 | */
75 | @Override public RecordWriter<K,SAMRecordWritable> getRecordWriter(
76 | TaskAttemptContext ctx)
77 | throws IOException
78 | {
79 | return getRecordWriter(ctx, getDefaultWorkFile(ctx, ""));
80 | }
81 |
82 | // Allows wrappers to provide their own work file.
83 | public RecordWriter<K,SAMRecordWritable> getRecordWriter(
84 | TaskAttemptContext ctx, Path out)
85 | throws IOException
86 | {
87 | if (this.header == null)
88 | throw new IOException(
89 | "Can't create a RecordWriter without the SAM header");
90 |
91 | return new KeyIgnoringBAMRecordWriter<K>(out, header, writeHeader, ctx);
92 | }
93 | }
94 |
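An illustrative driver-side sketch, assuming the format's key type parameter is LongWritable, that a Configuration conf is in scope, and using a hypothetical input path: the header must be supplied before any RecordWriter is requested, and per-part headers can be turned off so the parts concatenate cleanly:

    KeyIgnoringBAMOutputFormat<LongWritable> format = new KeyIgnoringBAMOutputFormat<LongWritable>();
    // Read the header once from an existing BAM (hypothetical path) and reuse it for all writers.
    format.readSAMHeaderFrom(new Path("hdfs:///data/sample.bam"), conf);
    // Skip the header in each part file to make downstream concatenation easier.
    format.setWriteHeader(false);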
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/CRAMInputFormat.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam;
2 |
3 | import htsjdk.samtools.cram.build.CramContainerIterator;
4 | import htsjdk.samtools.seekablestream.SeekableStream;
5 | import java.io.IOException;
6 | import java.util.ArrayList;
7 | import java.util.HashMap;
8 | import java.util.List;
9 | import java.util.Map;
10 | import org.apache.hadoop.conf.Configuration;
11 | import org.apache.hadoop.fs.Path;
12 | import org.apache.hadoop.io.LongWritable;
13 | import org.apache.hadoop.mapreduce.InputSplit;
14 | import org.apache.hadoop.mapreduce.JobContext;
15 | import org.apache.hadoop.mapreduce.RecordReader;
16 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
17 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
18 | import org.apache.hadoop.mapreduce.lib.input.FileSplit;
19 | import org.seqdoop.hadoop_bam.util.WrapSeekable;
20 |
21 | public class CRAMInputFormat extends FileInputFormat<LongWritable, SAMRecordWritable> {
22 |
23 | public static final String REFERENCE_SOURCE_PATH_PROPERTY =
24 | "hadoopbam.cram.reference-source-path";
25 |
26 | @Override
27 | public List<InputSplit> getSplits(JobContext job) throws IOException {
28 | return getSplits(super.getSplits(job), job.getConfiguration());
29 | }
30 |
31 | public List<InputSplit> getSplits(List<InputSplit> splits, Configuration conf)
32 | throws IOException {
33 | // update splits to align with CRAM container boundaries
34 | List<InputSplit> newSplits = new ArrayList<InputSplit>();
35 | Map<Path, List<Long>> fileToOffsets = new HashMap<Path, List<Long>>();
36 | for (InputSplit split : splits) {
37 | FileSplit fileSplit = (FileSplit) split;
38 | Path path = fileSplit.getPath();
39 | List<Long> containerOffsets = fileToOffsets.get(path);
40 | if (containerOffsets == null) {
41 | containerOffsets = getContainerOffsets(conf, path);
42 | fileToOffsets.put(path, containerOffsets);
43 | }
44 | long newStart = nextContainerOffset(containerOffsets, fileSplit.getStart());
45 | long newEnd = nextContainerOffset(containerOffsets, fileSplit.getStart() +
46 | fileSplit.getLength());
47 | long newLength = newEnd - newStart;
48 | if (newLength == 0) { // split is wholly within a container
49 | continue;
50 | }
51 | FileSplit newSplit = new FileSplit(fileSplit.getPath(), newStart, newLength,
52 | fileSplit.getLocations());
53 | newSplits.add(newSplit);
54 | }
55 | return newSplits;
56 | }
57 |
58 | private static List<Long> getContainerOffsets(Configuration conf, Path cramFile)
59 | throws IOException {
60 | SeekableStream seekableStream = WrapSeekable.openPath(conf, cramFile);
61 | CramContainerIterator cci = new CramContainerIterator(seekableStream);
62 | List<Long> containerOffsets = new ArrayList<Long>();
63 | containerOffsets.add(seekableStream.position());
64 | while (cci.hasNext()) {
65 | cci.next();
66 | containerOffsets.add(seekableStream.position());
67 | }
68 | containerOffsets.add(seekableStream.length());
69 | return containerOffsets;
70 | }
71 |
72 | private static long nextContainerOffset(List<Long> containerOffsets, long position) {
73 | for (long offset : containerOffsets) {
74 | if (offset >= position) {
75 | return offset;
76 | }
77 | }
78 | throw new IllegalStateException("Could not find position " + position + " in " +
79 | "container offsets: " + containerOffsets);
80 | }
81 |
82 | @Override
83 | public RecordReader<LongWritable, SAMRecordWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
84 | RecordReader<LongWritable, SAMRecordWritable> rr = new CRAMRecordReader();
85 | rr.initialize(split, context);
86 | return rr;
87 | }
88 |
89 | @Override
90 | public boolean isSplitable(JobContext job, Path path) {
91 | return true;
92 | }
93 | }
94 |
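CRAM decoding requires a reference, so a job using this input format points Hadoop-BAM at one through the property defined above (the same property SAMHeaderReader.getReferenceSource consults). A small sketch, with a hypothetical reference path:

    Configuration conf = new Configuration();
    // Reference FASTA used to decode CRAM records; path is illustrative only.
    conf.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY, "file:///refs/human_g1k_v37.fasta");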
--------------------------------------------------------------------------------
/src/test/java/org/seqdoop/hadoop_bam/BAMTestUtil.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam;
2 |
3 | import htsjdk.samtools.BAMIndex;
4 | import htsjdk.samtools.BAMIndexer;
5 | import htsjdk.samtools.SAMFileHeader;
6 | import htsjdk.samtools.SAMFileWriter;
7 | import htsjdk.samtools.SAMFileWriterFactory;
8 | import htsjdk.samtools.SAMRecord;
9 | import htsjdk.samtools.SAMRecordSetBuilder;
10 | import htsjdk.samtools.SamReader;
11 | import htsjdk.samtools.SamReaderFactory;
12 | import java.io.File;
13 | import java.io.IOException;
14 |
15 | class BAMTestUtil {
16 | public static File writeBamFile(int numPairs, SAMFileHeader.SortOrder sortOrder)
17 | throws IOException {
18 | // file will be both queryname and coordinate sorted, so use one or the other
19 | SAMRecordSetBuilder samRecordSetBuilder = new SAMRecordSetBuilder(true, sortOrder);
20 | for (int i = 0; i < numPairs; i++) {
21 | int chr = 20;
22 | int start1 = (i + 1) * 1000;
23 | int start2 = start1 + 100;
24 | if (i == 5) { // add two unmapped fragments instead of a mapped pair
25 | samRecordSetBuilder.addFrag(String.format("test-read-%03d-1", i), chr, start1,
26 | false, true, null,
27 | null,
28 | -1, false);
29 | samRecordSetBuilder.addFrag(String.format("test-read-%03d-2", i), chr, start2,
30 | false, true, null,
31 | null,
32 | -1, false);
33 | } else {
34 | samRecordSetBuilder.addPair(String.format("test-read-%03d", i), chr, start1,
35 | start2);
36 | }
37 | }
38 | if (numPairs > 0) { // add two unplaced unmapped fragments if non-empty
39 | samRecordSetBuilder.addUnmappedFragment(String.format
40 | ("test-read-%03d-unplaced-unmapped", numPairs++));
41 | samRecordSetBuilder.addUnmappedFragment(String.format
42 | ("test-read-%03d-unplaced-unmapped", numPairs++));
43 | }
44 |
45 | final File bamFile = File.createTempFile("test", ".bam");
46 | bamFile.deleteOnExit();
47 | SAMFileHeader samHeader = samRecordSetBuilder.getHeader();
48 | final SAMFileWriter bamWriter = new SAMFileWriterFactory()
49 | .makeSAMOrBAMWriter(samHeader, true, bamFile);
50 | for (final SAMRecord rec : samRecordSetBuilder.getRecords()) {
51 | bamWriter.addAlignment(rec);
52 | }
53 | bamWriter.close();
54 |
55 | // create BAM index
56 | if (sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) {
57 | SamReader samReader = SamReaderFactory.makeDefault()
58 | .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS)
59 | .open(bamFile);
60 | BAMIndexer.createIndex(samReader, new File(bamFile.getAbsolutePath()
61 | .replaceFirst("\\.bam$", BAMIndex.BAMIndexSuffix)));
62 | }
63 |
64 | return bamFile;
65 | }
66 |
67 | public static File writeBamFileWithLargeHeader() throws IOException {
68 | SAMRecordSetBuilder samRecordSetBuilder =
69 | new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.queryname);
70 | for (int i = 0; i < 1000; i++) {
71 | int chr = 20;
72 | int start1 = (i + 1) * 1000;
73 | int start2 = start1 + 100;
74 | samRecordSetBuilder.addPair(String.format("test-read-%03d", i), chr, start1,
75 | start2);
76 | }
77 |
78 | final File bamFile = File.createTempFile("test", ".bam");
79 | bamFile.deleteOnExit();
80 | SAMFileHeader samHeader = samRecordSetBuilder.getHeader();
81 | StringBuffer sb = new StringBuffer();
82 | for (int i = 0; i < 1000000; i++) {
83 | sb.append("0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789");
84 | }
85 | samHeader.addComment(sb.toString());
86 | final SAMFileWriter bamWriter = new SAMFileWriterFactory()
87 | .makeSAMOrBAMWriter(samHeader, true, bamFile);
88 | for (final SAMRecord rec : samRecordSetBuilder.getRecords()) {
89 | bamWriter.addAlignment(rec);
90 | }
91 | bamWriter.close();
92 |
93 | return bamFile;
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/util/BGZFSplitCompressionInputStream.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam.util;
2 |
3 | import htsjdk.samtools.util.BlockCompressedInputStream;
4 | import java.io.BufferedInputStream;
5 | import java.io.IOException;
6 | import java.io.InputStream;
7 | import org.apache.hadoop.io.compress.SplitCompressionInputStream;
8 |
9 | /**
10 | * An implementation of {@code SplitCompressionInputStream} for BGZF, based on
11 | * {@code BZip2CompressionInputStream} and {@code CBZip2InputStream} from Hadoop.
12 | * (BZip2 is the only splittable compression codec in Hadoop.)
13 | */
14 | class BGZFSplitCompressionInputStream extends SplitCompressionInputStream {
15 | private static final int END_OF_BLOCK = -2;
16 | private final BlockCompressedInputStream input;
17 | private BufferedInputStream bufferedIn;
18 | private long startingPos = 0L;
19 | private long processedPosition;
20 |
21 | private enum POS_ADVERTISEMENT_STATE_MACHINE {
22 | HOLD, ADVERTISE
23 | };
24 |
25 | POS_ADVERTISEMENT_STATE_MACHINE posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
26 | long compressedStreamPosition = 0;
27 |
28 | public BGZFSplitCompressionInputStream(InputStream in, long start, long end)
29 | throws IOException {
30 | super(in, start, end);
31 | bufferedIn = new BufferedInputStream(super.in);
32 | this.startingPos = super.getPos();
33 | input = new BlockCompressedInputStream(bufferedIn);
34 | this.updatePos(false);
35 | }
36 |
37 | @Override
38 | public int read() throws IOException {
39 | byte b[] = new byte[1];
40 | int result = this.read(b, 0, 1);
41 | return (result < 0) ? result : (b[0] & 0xff);
42 | }
43 |
44 | @Override
45 | public int read(byte[] b, int off, int len) throws IOException {
46 | // See BZip2CompressionInputStream#read for implementation notes.
47 | int result;
48 | result = readWithinBlock(b, off, len);
49 | if (result == END_OF_BLOCK) {
50 | this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE;
51 | }
52 | if (this.posSM == POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE) {
53 | result = readWithinBlock(b, off, off + 1);
54 | // This is the precise time to update compressed stream position
55 | // to the client of this code.
56 | this.updatePos(true);
57 | this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
58 | }
59 | return result;
60 | }
61 |
62 | /**
63 | * Read up to len bytes from the stream, but no further than the end of the
64 | * compressed block. If at the end of the block then no bytes will be read and a return
65 | * value of -2 will be returned; on the next call to read, bytes from the next block
66 | * will be returned. This is the same contract as CBZip2InputStream in Hadoop.
67 | * @return int The return value greater than 0 are the bytes read. A value
68 | * of -1 means end of stream while -2 represents end of block.
69 | */
70 | private int readWithinBlock(byte[] b, int off, int len) throws IOException {
71 | if (input.endOfBlock()) {
72 | final int available = input.available(); // this will read the next block, if there is one
73 | processedPosition = input.getPosition() >> 16;
74 | if (available == 0) { // end of stream
75 | return -1;
76 | }
77 | return END_OF_BLOCK;
78 | }
79 |
80 | // return up to end of block (at most)
81 | int available = input.available();
82 | return input.read(b, off, Math.min(available, len));
83 | }
84 |
85 | @Override
86 | public void resetState() throws IOException {
87 | // not implemented (only used in sequence files)
88 | }
89 |
90 | @Override
91 | public long getPos() throws IOException {
92 | return this.compressedStreamPosition;
93 | }
94 |
95 | // See comment in BZip2CompressionInputStream#updatePos
96 | private void updatePos(boolean shouldAddOn) {
97 | int addOn = shouldAddOn ? 1 : 0;
98 | this.compressedStreamPosition = this.startingPos + processedPosition + addOn;
99 | }
100 |
101 | @Override
102 | public void close() throws IOException {
103 | input.close();
104 | }
105 | }
106 |
--------------------------------------------------------------------------------
/src/test/java/org/seqdoop/hadoop_bam/TestFastaInputFormat.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam;
2 |
3 | import java.util.List;
4 | import org.apache.hadoop.conf.Configuration;
5 | import org.apache.hadoop.io.Text;
6 | import org.apache.hadoop.mapreduce.InputSplit;
7 | import org.apache.hadoop.mapreduce.JobContext;
8 | import org.apache.hadoop.mapreduce.RecordReader;
9 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
10 | import org.apache.hadoop.mapreduce.TaskAttemptID;
11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
12 | import org.apache.hadoop.mapreduce.task.JobContextImpl;
13 | import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
14 | import org.junit.Before;
15 | import org.junit.Test;
16 |
17 | import static org.junit.Assert.assertEquals;
18 | import static org.junit.Assert.assertFalse;
19 | import static org.junit.Assert.assertTrue;
20 | import static org.mockito.Mockito.mock;
21 |
22 | public class TestFastaInputFormat {
23 | private String input;
24 | private TaskAttemptContext taskAttemptContext;
25 | private JobContext jobContext;
26 |
27 | @Before
28 | public void setup() throws Exception {
29 | Configuration conf = new Configuration();
30 | input = ClassLoader.getSystemClassLoader().getResource("mini-chr1-chr2.fasta").getFile();
31 | conf.set("mapred.input.dir", "file://" + input);
32 |
33 | // Input fasta is 600 bytes, so this gets us 3 FileInputFormat splits.
34 | conf.set(FileInputFormat.SPLIT_MAXSIZE, "200");
35 |
36 | taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class));
37 | jobContext = new JobContextImpl(conf, taskAttemptContext.getJobID());
38 | }
39 |
40 | @Test
41 | public void testReader() throws Exception {
42 | FastaInputFormat inputFormat = new FastaInputFormat();
43 | List<InputSplit> splits = inputFormat.getSplits(jobContext);
44 | assertEquals(2, splits.size());
45 | RecordReader<Text, ReferenceFragment> reader = inputFormat
46 | .createRecordReader(splits.get(0), taskAttemptContext);
47 | reader.initialize(splits.get(0), taskAttemptContext);
48 |
49 | assertTrue(reader.nextKeyValue());
50 | assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:11"), reader.getCurrentKey());
51 | assertEquals(new Text("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTA"), reader.getCurrentValue().getSequence());
52 |
53 | assertTrue(reader.nextKeyValue());
54 | assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:182"), reader.getCurrentKey());
55 | assertEquals(new Text("ACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAAC"), reader.getCurrentValue().getSequence());
56 |
57 | assertTrue(reader.nextKeyValue());
58 | assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1163"), reader.getCurrentKey());
59 | assertEquals(new Text("CCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCC"), reader.getCurrentValue().getSequence());
60 |
61 | assertTrue(reader.nextKeyValue());
62 | assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1244"), reader.getCurrentKey());
63 | assertEquals(new Text("TAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCC"), reader.getCurrentValue().getSequence());
64 |
65 | assertTrue(reader.nextKeyValue());
66 | assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1325"), reader.getCurrentKey());
67 | assertEquals(new Text("CAACCCTAACCCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCC"), reader.getCurrentValue().getSequence());
68 |
69 | assertFalse(reader.nextKeyValue());
70 |
71 | reader = inputFormat.createRecordReader(splits.get(1), taskAttemptContext);
72 | reader.initialize(splits.get(1), taskAttemptContext);
73 |
74 | assertTrue(reader.nextKeyValue());
75 | assertEquals(new Text("chr2 dna:chromosome chromosome:GRCh37:2:1:243199373:11"), reader.getCurrentKey());
76 | assertEquals(new Text("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTC"), reader.getCurrentValue().getSequence());
77 |
78 | assertFalse(reader.nextKeyValue());
79 |
80 | reader.close();
81 | }
82 |
83 | }
84 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/util/SAMHeaderReader.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2013 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2013-07-26 13:54:32
22 |
23 | package org.seqdoop.hadoop_bam.util;
24 |
25 | import java.io.IOException;
26 | import java.io.InputStream;
27 | import java.net.URI;
28 | import java.nio.file.Paths;
29 |
30 | import htsjdk.samtools.cram.ref.ReferenceSource;
31 | import org.apache.hadoop.conf.Configuration;
32 | import org.apache.hadoop.fs.Path;
33 |
34 | import htsjdk.samtools.SAMFileHeader;
35 | import htsjdk.samtools.SamInputResource;
36 | import htsjdk.samtools.SamReaderFactory;
37 | import htsjdk.samtools.ValidationStringency;
38 | import org.seqdoop.hadoop_bam.CRAMInputFormat;
39 |
40 | public final class SAMHeaderReader {
41 | /** A String property corresponding to a ValidationStringency
42 | * value. If set, the given stringency is used when any part of the
43 | * Hadoop-BAM library reads SAM or BAM.
44 | */
45 | public static final String VALIDATION_STRINGENCY_PROPERTY =
46 | "hadoopbam.samheaderreader.validation-stringency";
47 |
48 | public static SAMFileHeader readSAMHeaderFrom(Path path, Configuration conf)
49 | throws IOException
50 | {
51 | InputStream i = path.getFileSystem(conf).open(path);
52 | final SAMFileHeader h = readSAMHeaderFrom(i, conf);
53 | i.close();
54 | return h;
55 | }
56 |
57 | /** Does not close the stream. */
58 | public static SAMFileHeader readSAMHeaderFrom(
59 | final InputStream in, final Configuration conf)
60 | {
61 | final ValidationStringency
62 | stringency = getValidationStringency(conf);
63 | SamReaderFactory readerFactory = SamReaderFactory.makeDefault()
64 | .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false)
65 | .setUseAsyncIo(false);
66 | if (stringency != null) {
67 | readerFactory.validationStringency(stringency);
68 | }
69 |
70 | final ReferenceSource refSource = getReferenceSource(conf);
71 | if (null != refSource) {
72 | readerFactory.referenceSource(refSource);
73 | }
74 | return readerFactory.open(SamInputResource.of(in)).getFileHeader();
75 | }
76 |
77 | public static ValidationStringency getValidationStringency(
78 | final Configuration conf)
79 | {
80 | final String p = conf.get(VALIDATION_STRINGENCY_PROPERTY);
81 | return p == null ? null : ValidationStringency.valueOf(p);
82 | }
83 |
84 | public static ReferenceSource getReferenceSource(
85 | final Configuration conf)
86 | {
87 | //TODO: There isn't anything particularly CRAM-specific about reference source or validation
88 | // stringency other than that a reference source is required for CRAM files. We should move
89 | // the reference source and validation stringency property names and utility methods out of
90 | // CRAMInputFormat and SAMHeaderReader and combine them together into a single class for extracting
91 | // configuration params, but it would break backward compatibility with existing code that
92 | // is dependent on the CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY.
93 | final String refSourcePath = conf.get(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY);
94 | return refSourcePath == null ? null : new ReferenceSource(NIOFileUtil.asPath(refSourcePath));
95 | }
96 | }
97 |
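A short sketch of relaxing validation through the property above; since getValidationStringency parses the value with ValidationStringency.valueOf, any enum constant name works. Assumes the usual imports (Configuration, Path, SAMFileHeader, ValidationStringency) and a hypothetical input path:

    Configuration conf = new Configuration();
    // Silence htsjdk validation errors for all SAM/BAM reads performed through Hadoop-BAM.
    conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY,
        ValidationStringency.SILENT.toString());
    SAMFileHeader header =
        SAMHeaderReader.readSAMHeaderFrom(new Path("file:///data/sample.bam"), conf); // hypothetical path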
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/util/NIOFileUtil.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam.util;
2 |
3 | import java.io.IOException;
4 | import java.io.OutputStream;
5 | import java.net.URI;
6 | import java.nio.file.FileSystemNotFoundException;
7 | import java.nio.file.FileSystems;
8 | import java.nio.file.FileVisitResult;
9 | import java.nio.file.Files;
10 | import java.nio.file.Path;
11 | import java.nio.file.PathMatcher;
12 | import java.nio.file.Paths;
13 | import java.nio.file.SimpleFileVisitor;
14 | import java.nio.file.attribute.BasicFileAttributes;
15 | import java.util.Collections;
16 | import java.util.HashMap;
17 | import java.util.List;
18 | import java.util.stream.Collectors;
19 |
20 | public class NIOFileUtil {
21 | private NIOFileUtil() {
22 | }
23 |
24 | static final String PARTS_GLOB = "glob:**/part-[mr]-[0-9][0-9][0-9][0-9][0-9]*";
25 |
26 | /**
27 | * Convert the given path {@link URI} to a {@link Path} object.
28 | * @param uri the path to convert
29 | * @return a {@link Path} object
30 | */
31 | public static Path asPath(URI uri) {
32 | try {
33 | return Paths.get(uri);
34 | } catch (FileSystemNotFoundException e) {
35 | ClassLoader cl = Thread.currentThread().getContextClassLoader();
36 | if (cl == null) {
37 | throw e;
38 | }
39 | try {
40 | return FileSystems.newFileSystem(uri, new HashMap<>(), cl).provider().getPath(uri);
41 | } catch (IOException ex) {
42 | throw new RuntimeException("Cannot create filesystem for " + uri, ex);
43 | }
44 | }
45 | }
46 |
47 | /**
48 | * Convert the given path string to a {@link Path} object.
49 | * @param path the path to convert
50 | * @return a {@link Path} object
51 | */
52 | public static Path asPath(String path) {
53 | URI uri = URI.create(path);
54 | return uri.getScheme() == null ? Paths.get(path) : asPath(uri);
55 | }
56 |
57 | /**
58 | * Delete the given directory and all of its contents if non-empty.
59 | * @param directory the directory to delete
60 | * @throws IOException
61 | */
62 | static void deleteRecursive(Path directory) throws IOException {
63 | Files.walkFileTree(directory, new SimpleFileVisitor<Path>() {
64 | @Override
65 | public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
66 | Files.delete(file);
67 | return FileVisitResult.CONTINUE;
68 | }
69 | @Override
70 | public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
71 | Files.deleteIfExists(dir);
72 | return FileVisitResult.CONTINUE;
73 | }
74 | });
75 | }
76 |
77 | /**
78 | * Returns all the files in a directory that match the given pattern, and that don't
79 | * have the given extension.
80 | * @param directory the directory to look for files in, subdirectories are not
81 | * considered
82 | * @param syntaxAndPattern the syntax and pattern to use for matching (see
83 | * {@link java.nio.file.FileSystem#getPathMatcher}
84 | * @param excludesExt the extension to exclude, or null to exclude nothing
85 | * @return a list of files, sorted by name
86 | * @throws IOException
87 | */
88 | static List<Path> getFilesMatching(Path directory,
89 | String syntaxAndPattern, String excludesExt) throws IOException {
90 | PathMatcher matcher = directory.getFileSystem().getPathMatcher(syntaxAndPattern);
91 | List<Path> parts = Files.walk(directory)
92 | .filter(matcher::matches)
93 | .filter(path -> excludesExt == null || !path.toString().endsWith(excludesExt))
94 | .collect(Collectors.toList());
95 | Collections.sort(parts);
96 | return parts;
97 | }
98 |
99 | /**
100 | * Merge the given part files in order into an output stream.
101 | * This deletes the parts.
102 | * @param parts the part files to merge
103 | * @param out the stream to write each file into, in order
104 | * @throws IOException
105 | */
106 | static void mergeInto(List<Path> parts, OutputStream out)
107 | throws IOException {
108 | for (final Path part : parts) {
109 | Files.copy(part, out);
110 | Files.delete(part);
111 | }
112 | }
113 | }
114 |
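A minimal sketch of asPath from above, where Path is java.nio.file.Path: a bare string resolves on the default filesystem, while a string carrying a URI scheme is routed through the matching NIO provider. The example paths are hypothetical:

    Path local = NIOFileUtil.asPath("/tmp/example.bam");          // no scheme: default filesystem
    Path fromUri = NIOFileUtil.asPath("file:///tmp/example.bam"); // scheme present: resolved via its URI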
--------------------------------------------------------------------------------
/src/test/java/org/seqdoop/hadoop_bam/TestSAMInputFormat.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam;
2 |
3 | import htsjdk.samtools.SAMRecord;
4 | import htsjdk.samtools.SamReader;
5 | import htsjdk.samtools.SamReaderFactory;
6 | import java.io.BufferedReader;
7 | import java.io.File;
8 | import java.io.FileReader;
9 | import java.util.ArrayList;
10 | import java.util.List;
11 | import org.apache.hadoop.conf.Configuration;
12 | import org.apache.hadoop.fs.FileSystem;
13 | import org.apache.hadoop.fs.Path;
14 | import org.apache.hadoop.io.LongWritable;
15 | import org.apache.hadoop.mapreduce.InputSplit;
16 | import org.apache.hadoop.mapreduce.Job;
17 | import org.apache.hadoop.mapreduce.JobContext;
18 | import org.apache.hadoop.mapreduce.RecordReader;
19 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
20 | import org.apache.hadoop.mapreduce.TaskAttemptID;
21 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
22 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
23 | import org.apache.hadoop.mapreduce.task.JobContextImpl;
24 | import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
25 | import org.junit.Before;
26 | import org.junit.Test;
27 |
28 | import static org.junit.Assert.assertEquals;
29 | import static org.junit.Assert.assertTrue;
30 | import static org.mockito.Mockito.mock;
31 |
32 | public class TestSAMInputFormat {
33 | private String input;
34 | private TaskAttemptContext taskAttemptContext;
35 | private JobContext jobContext;
36 |
37 | @Before
38 | public void setup() throws Exception {
39 | Configuration conf = new Configuration();
40 | input = ClassLoader.getSystemClassLoader().getResource("test.sam").getFile();
41 | conf.set("mapred.input.dir", "file://" + input);
42 |
43 | taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class));
44 | jobContext = new JobContextImpl(conf, taskAttemptContext.getJobID());
45 | }
46 |
47 | @Test
48 | public void testReader() throws Exception {
49 | int expectedCount = 0;
50 | SamReader samReader = SamReaderFactory.makeDefault().open(new File(input));
51 | for (SAMRecord r : samReader) {
52 | expectedCount++;
53 | }
54 | samReader.close();
55 |
56 | AnySAMInputFormat inputFormat = new AnySAMInputFormat();
57 | List<InputSplit> splits = inputFormat.getSplits(jobContext);
58 | assertEquals(1, splits.size());
59 | RecordReader<LongWritable, SAMRecordWritable> reader = inputFormat
60 | .createRecordReader(splits.get(0), taskAttemptContext);
61 | reader.initialize(splits.get(0), taskAttemptContext);
62 |
63 | int actualCount = 0;
64 | while (reader.nextKeyValue()) {
65 | actualCount++;
66 | }
67 | reader.close();
68 |
69 | assertEquals(expectedCount, actualCount);
70 | }
71 |
72 | @Test
73 | public void testMapReduceJob() throws Exception {
74 | Configuration conf = new Configuration();
75 |
76 | FileSystem fileSystem = FileSystem.get(conf);
77 | Path inputPath = new Path(input);
78 | Path outputPath = fileSystem.makeQualified(new Path("target/out"));
79 | fileSystem.delete(outputPath, true);
80 |
81 | Job job = Job.getInstance(conf);
82 | FileInputFormat.setInputPaths(job, inputPath);
83 | job.setInputFormatClass(SAMInputFormat.class);
84 | job.setOutputKeyClass(LongWritable.class);
85 | job.setOutputValueClass(SAMRecordWritable.class);
86 | job.setNumReduceTasks(0);
87 | FileOutputFormat.setOutputPath(job, outputPath);
88 |
89 | boolean success = job.waitForCompletion(true);
90 | assertTrue(success);
91 |
92 | List<String> samStrings = new ArrayList<String>();
93 | SamReader samReader = SamReaderFactory.makeDefault().open(new File(input));
94 | for (SAMRecord r : samReader) {
95 | samStrings.add(r.getSAMString().trim());
96 | }
97 | samReader.close();
98 |
99 | File outputFile = new File(new File(outputPath.toUri()), "part-m-00000");
100 | BufferedReader br = new BufferedReader(new FileReader(outputFile));
101 | String line;
102 | int index = 0;
103 | while ((line = br.readLine()) != null) {
104 | String value = line.substring(line.indexOf("\t") + 1); // ignore key
105 | assertEquals(samStrings.get(index++), value);
106 | }
107 | br.close();
108 | }
109 | }
110 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/LazyBAMRecordFactory.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2011 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2011-11-15 11:58:23
22 |
23 | package org.seqdoop.hadoop_bam;
24 |
25 | import htsjdk.samtools.BAMRecord;
26 | import htsjdk.samtools.SAMFileHeader;
27 | import htsjdk.samtools.SAMRecord;
28 | import htsjdk.samtools.SAMRecordFactory;
29 |
30 | /** A factory for the kind of lazy {@link BAMRecord} used internally. */
31 | public class LazyBAMRecordFactory implements SAMRecordFactory {
32 | @Override public SAMRecord createSAMRecord(SAMFileHeader hdr) {
33 | throw new UnsupportedOperationException(
34 | "LazyBAMRecordFactory can only create BAM records");
35 | }
36 |
37 | @Override public BAMRecord createBAMRecord(
38 | SAMFileHeader hdr,
39 | int referenceSequenceIndex, int alignmentStart,
40 | short readNameLength, short mappingQuality,
41 | int indexingBin, int cigarLen, int flags, int readLen,
42 | int mateReferenceSequenceIndex, int mateAlignmentStart,
43 | int insertSize, byte[] variableLengthBlock)
44 | {
45 | return new LazyBAMRecord(
46 | hdr, referenceSequenceIndex, alignmentStart, readNameLength,
47 | mappingQuality, indexingBin, cigarLen, flags, readLen,
48 | mateReferenceSequenceIndex, mateAlignmentStart, insertSize,
49 | variableLengthBlock);
50 | }
51 | }
52 |
53 | class LazyBAMRecord extends BAMRecord {
54 | private boolean decodedRefIdx = false;
55 | private boolean decodedMateRefIdx = false;
56 |
57 | public LazyBAMRecord(
58 | SAMFileHeader hdr, int referenceID, int coordinate, short readNameLength,
59 | short mappingQuality, int indexingBin, int cigarLen, int flags,
60 | int readLen, int mateReferenceID, int mateCoordinate, int insertSize,
61 | byte[] restOfData)
62 | {
63 | super(
64 | hdr, referenceID, coordinate, readNameLength, mappingQuality,
65 | indexingBin, cigarLen, flags, readLen, mateReferenceID,
66 | mateCoordinate, insertSize, restOfData);
67 | }
68 |
69 | @Override public void setReferenceIndex(final int referenceIndex) {
70 | mReferenceIndex = referenceIndex;
71 | decodedRefIdx = false;
72 | }
73 | @Override public void setMateReferenceIndex(final int referenceIndex) {
74 | mMateReferenceIndex = referenceIndex;
75 | decodedMateRefIdx = false;
76 | }
77 |
78 | @Override public String getReferenceName() {
79 | if (mReferenceIndex != null && !decodedRefIdx) {
80 | decodedRefIdx = true;
81 | super.setReferenceIndex(mReferenceIndex);
82 | }
83 | return super.getReferenceName();
84 | }
85 |
86 | @Override public String getMateReferenceName() {
87 | if (mMateReferenceIndex != null && !decodedMateRefIdx) {
88 | decodedMateRefIdx = true;
89 | super.setMateReferenceIndex(mMateReferenceIndex);
90 | }
91 | return super.getMateReferenceName();
92 | }
93 |
94 | @Override protected void eagerDecode() {
95 | getReferenceName();
96 | getMateReferenceName();
97 | super.eagerDecode();
98 | }
99 |
100 | @Override
101 | public boolean equals(Object o) {
102 | // don't use decoded flags for equality check
103 | return super.equals(o);
104 | }
105 |
106 | @Override
107 | public int hashCode() {
108 | // don't use decoded flags for hash code
109 | return super.hashCode();
110 | }
111 | }
112 |
--------------------------------------------------------------------------------
/src/test/java/org/seqdoop/hadoop_bam/TestVCFInputFormatStringency.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2017 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | package org.seqdoop.hadoop_bam;
22 |
23 | import htsjdk.samtools.ValidationStringency;
24 | import htsjdk.tribble.TribbleException;
25 | import htsjdk.variant.variantcontext.VariantContext;
26 | import java.util.List;
27 | import org.apache.hadoop.conf.Configuration;
28 | import org.apache.hadoop.io.LongWritable;
29 | import org.apache.hadoop.mapreduce.InputSplit;
30 | import org.apache.hadoop.mapreduce.JobContext;
31 | import org.apache.hadoop.mapreduce.RecordReader;
32 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
33 | import org.apache.hadoop.mapreduce.TaskAttemptID;
34 | import org.apache.hadoop.mapreduce.task.JobContextImpl;
35 | import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
36 | import org.junit.Test;
37 |
38 | import static org.junit.Assert.assertEquals;
39 | import static org.junit.Assert.assertNotNull;
40 | import static org.mockito.Mockito.mock;
41 |
42 | public class TestVCFInputFormatStringency {
43 |
44 | public void checkReading(ValidationStringency validationStringency) throws Exception {
45 | String filename = "invalid_info_field.vcf";
46 | Configuration conf = new Configuration();
47 | String input_file = ClassLoader.getSystemClassLoader().getResource(filename).getFile();
48 | conf.set("mapred.input.dir", "file://" + input_file);
49 |
50 | if (validationStringency != null) {
51 | VCFRecordReader.setValidationStringency(conf, validationStringency);
52 | }
53 |
54 | TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class));
55 | JobContext ctx = new JobContextImpl(conf, taskAttemptContext.getJobID());
56 |
57 | VCFInputFormat inputFormat = new VCFInputFormat(conf);
58 | List<InputSplit> splits = inputFormat.getSplits(ctx);
59 | assertEquals(1, splits.size());
60 | RecordReader<LongWritable, VariantContextWritable> reader =
61 | inputFormat.createRecordReader(splits.get(0), taskAttemptContext);
62 | int counter = 0;
63 | while (reader.nextKeyValue()) {
64 | VariantContextWritable writable = reader.getCurrentValue();
65 | assertNotNull(writable);
66 | VariantContext vc = writable.get();
67 | assertNotNull(vc);
68 | String value = vc.toString();
69 | assertNotNull(value);
70 | counter++;
71 | }
72 | assertEquals(4, counter);
73 | }
74 |
75 | @Test(expected = TribbleException.class)
76 | public void testUnset() throws Exception {
77 | checkReading(null); // defaults to strict
78 | }
79 |
80 | @Test(expected = TribbleException.class)
81 | public void testDefault() throws Exception {
82 | checkReading(ValidationStringency.DEFAULT_STRINGENCY); // defaults to strict
83 | }
84 |
85 | @Test
86 | public void testSilent() throws Exception {
87 | checkReading(ValidationStringency.SILENT);
88 | }
89 |
90 | @Test
91 | public void testLenient() throws Exception {
92 | checkReading(ValidationStringency.LENIENT);
93 | }
94 |
95 | @Test(expected = TribbleException.class)
96 | public void testStrict() throws Exception {
97 | checkReading(ValidationStringency.STRICT);
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/util/BGZFBlockIndex.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2010 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2010-08-25 12:20:03
22 |
23 | package org.seqdoop.hadoop_bam.util;
24 |
25 | import java.io.BufferedInputStream;
26 | import java.io.File;
27 | import java.io.FileInputStream;
28 | import java.io.InputStream;
29 | import java.io.IOException;
30 | import java.nio.ByteBuffer;
31 | import java.util.NavigableSet;
32 | import java.util.TreeSet;
33 |
34 | /** An index into BGZF-compressed files, for {@link BGZFSplitFileInputFormat}.
35 | * Reads files that are created by {@link BGZFBlockIndexer}.
36 | *
37 | * Indexes the positions of individual gzip blocks in the file.
38 | */
39 | public final class BGZFBlockIndex {
40 | private final NavigableSet<Long> offsets = new TreeSet<Long>();
41 |
42 | public BGZFBlockIndex() {}
43 | public BGZFBlockIndex(final File path) throws IOException {
44 | this(new BufferedInputStream(new FileInputStream(path)));
45 | }
46 | public BGZFBlockIndex(final InputStream in) throws IOException {
47 | readIndex(in);
48 | }
49 |
50 | public void readIndex(final InputStream in) throws IOException {
51 | offsets.clear();
52 |
53 | final ByteBuffer bb = ByteBuffer.allocate(8);
54 |
55 | for (long prev = -1; in.read(bb.array(), 2, 6) == 6;) {
56 | final long cur = bb.getLong(0);
57 | if (prev > cur)
58 | throw new IOException(String.format(
59 | "Invalid BGZF block index; offsets not in order: %#x > %#x",
60 | prev, cur));
61 |
62 | offsets.add(prev = cur);
63 | }
64 | in.close();
65 |
66 | if (offsets.size() < 1)
67 | throw new IOException(
68 | "Invalid BGZF block index: should contain at least the file size");
69 |
70 | offsets.add(0L);
71 | }
72 |
73 | public Long prevBlock(final long filePos) {
74 | return offsets.floor(filePos);
75 | }
76 | public Long nextBlock(final long filePos) {
77 | return offsets.higher(filePos);
78 | }
79 |
80 | public int size() { return offsets.size(); }
81 |
82 | private long secondBlock() { return nextBlock(0); }
83 | private long lastBlock() { return prevBlock(fileSize() - 1); }
84 | private long fileSize() { return offsets.last(); }
85 |
86 | /** Writes some statistics about each BGZF block index file given as an
87 | * argument.
88 | */
89 | public static void main(String[] args) {
90 | if (args.length == 0) {
91 | System.out.println(
92 | "Usage: BGZFBlockIndex [BGZF block indices...]\n\n"+
93 |
94 | "Writes a few statistics about each BGZF block index.");
95 | return;
96 | }
97 |
98 | for (String arg : args) {
99 | final File f = new File(arg);
100 | if (f.isFile() && f.canRead()) {
101 | try {
102 | System.err.printf("%s:\n", f);
103 | final BGZFBlockIndex bi = new BGZFBlockIndex(f);
104 | final long second = bi.secondBlock();
105 | final long last = bi.lastBlock();
106 | System.err.printf(
107 | "\t%d blocks\n" +
108 | "\tfirst after 0 is at %#014x\n" +
109 | "\tlast is at %#014x\n" +
110 | "\tassociated BGZF file size %d\n",
111 | bi.size()-1,
112 | bi.secondBlock(), bi.lastBlock(), bi.fileSize());
113 | } catch (IOException e) {
114 | System.err.printf("Failed to read %s!\n", f);
115 | e.printStackTrace();
116 | }
117 | } else
118 | System.err.printf("%s does not look like a readable file!\n", f);
119 | }
120 | }
121 | }
122 |
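A minimal usage sketch for this index, assuming an index file produced by BGZFBlockIndexer; the path and offset are hypothetical:

    BGZFBlockIndex index = new BGZFBlockIndex(new File("reads.fastq.gz.bgzfi")); // hypothetical path
    long splitStart = 4L * 1024 * 1024;              // a raw byte offset in the compressed file
    Long blockStart = index.prevBlock(splitStart);   // greatest BGZF block offset <= splitStart
    Long blockEnd   = index.nextBlock(splitStart);   // smallest BGZF block offset >  splitStart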
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringAnySAMOutputFormat.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2010 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2010-08-11 12:19:23
22 |
23 | package org.seqdoop.hadoop_bam;
24 |
25 | import java.io.IOException;
26 | import java.io.InputStream;
27 |
28 | import htsjdk.samtools.SAMFileHeader;
29 |
30 | import org.apache.hadoop.conf.Configuration;
31 | import org.apache.hadoop.fs.Path;
32 | import org.apache.hadoop.mapreduce.RecordWriter;
33 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
34 |
35 | import org.seqdoop.hadoop_bam.util.SAMHeaderReader;
36 |
37 | /** Writes only the SAM records, not the key.
38 | *
39 | * A {@link SAMFileHeader} must be provided via {@link #setSAMHeader} or
40 | * {@link #readSAMHeaderFrom} before {@link #getRecordWriter} is called.
41 | *
42 | * By default, writes the SAM header to the output file(s). This
43 | * can be disabled, because in distributed usage one often ends up with (and,
44 | * for decent performance, wants to end up with) the output split into multiple
45 | * parts, which are easier to concatenate if the header is not present in each
46 | * file.
47 | */
48 | public class KeyIgnoringAnySAMOutputFormat<K> extends AnySAMOutputFormat<K> {
49 |
50 | protected SAMFileHeader header;
51 |
52 | /** Whether the header will be written; defaults to true.
53 | */
54 | public static final String WRITE_HEADER_PROPERTY =
55 | "hadoopbam.anysam.write-header";
56 |
57 | public KeyIgnoringAnySAMOutputFormat(SAMFormat fmt) {
58 | super(fmt);
59 | }
60 | public KeyIgnoringAnySAMOutputFormat(Configuration conf) {
61 | super(conf);
62 |
63 | if (format == null)
64 | throw new IllegalArgumentException(
65 | "unknown SAM format: OUTPUT_SAM_FORMAT_PROPERTY not set");
66 | }
67 | public KeyIgnoringAnySAMOutputFormat(Configuration conf, Path path) {
68 | super(conf);
69 |
70 | if (format == null) {
71 | format = SAMFormat.inferFromFilePath(path);
72 |
73 | if (format == null)
74 | throw new IllegalArgumentException("unknown SAM format: " + path);
75 | }
76 | }
77 |
78 | public SAMFileHeader getSAMHeader() { return header; }
79 | public void setSAMHeader(SAMFileHeader header) { this.header = header; }
80 |
81 | public void readSAMHeaderFrom(Path path, Configuration conf)
82 | throws IOException
83 | {
84 | this.header = SAMHeaderReader.readSAMHeaderFrom(path, conf);
85 | }
86 | public void readSAMHeaderFrom(InputStream in, Configuration conf) {
87 | this.header = SAMHeaderReader.readSAMHeaderFrom(in, conf);
88 | }
89 |
90 | /** setSAMHeader or readSAMHeaderFrom must have
91 | * been called first.
92 | */
93 | @Override public RecordWriter<K,SAMRecordWritable> getRecordWriter(
94 | TaskAttemptContext ctx)
95 | throws IOException
96 | {
97 | return getRecordWriter(ctx, getDefaultWorkFile(ctx, ""));
98 | }
99 |
100 | // Allows wrappers to provide their own work file.
101 | public RecordWriter<K,SAMRecordWritable> getRecordWriter(
102 | TaskAttemptContext ctx, Path out)
103 | throws IOException
104 | {
105 | if (this.header == null)
106 | throw new IOException(
107 | "Can't create a RecordWriter without the SAM header");
108 |
109 | final boolean writeHeader = ctx.getConfiguration().getBoolean(
110 | WRITE_HEADER_PROPERTY, true);
111 |
112 | switch (format) {
113 | case BAM:
114 | return new KeyIgnoringBAMRecordWriter<K>(
115 | out, header, writeHeader, ctx);
116 |
117 | case SAM:
118 | return new KeyIgnoringSAMRecordWriter<K>(
119 | out, header, writeHeader, ctx);
120 |
121 | case CRAM:
122 | return new KeyIgnoringCRAMRecordWriter<K>(
123 | out, header, writeHeader, ctx);
124 |
125 | default: assert false; return null;
126 | }
127 | }
128 | }
129 |
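A hedged sketch of typical use of this output format, assuming a Configuration conf and a TaskAttemptContext ctx are in scope; the input path is illustrative:

    KeyIgnoringAnySAMOutputFormat<NullWritable> fmt =
        new KeyIgnoringAnySAMOutputFormat<>(SAMFormat.BAM);
    fmt.readSAMHeaderFrom(new Path("hdfs:///data/input.bam"), conf);             // or fmt.setSAMHeader(header)
    conf.setBoolean(KeyIgnoringAnySAMOutputFormat.WRITE_HEADER_PROPERTY, false); // headerless part files
    RecordWriter<NullWritable, SAMRecordWritable> writer = fmt.getRecordWriter(ctx);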
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/FileVirtualSplit.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2010 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2010-08-09 13:06:32
22 |
23 | package org.seqdoop.hadoop_bam;
24 |
25 | import java.io.DataOutput;
26 | import java.io.DataInput;
27 | import java.io.IOException;
28 |
29 | import org.apache.hadoop.fs.Path;
30 | import org.apache.hadoop.io.Text;
31 | import org.apache.hadoop.io.Writable;
32 | import org.apache.hadoop.mapreduce.InputSplit;
33 |
34 | /** Like a {@link org.apache.hadoop.mapreduce.lib.input.FileSplit}, but uses
35 | * BGZF virtual offsets to fit with {@link
36 | * htsjdk.samtools.util.BlockCompressedInputStream}.
37 | */
38 | public class FileVirtualSplit extends InputSplit implements Writable {
39 | private Path file;
40 | private long vStart;
41 | private long vEnd;
42 | private final String[] locations;
43 | private long[] intervalFilePointers;
44 |
45 | private static final String[] NO_LOCATIONS = {};
46 |
47 | public FileVirtualSplit() { locations = NO_LOCATIONS; }
48 |
49 | public FileVirtualSplit(Path f, long vs, long ve, String[] locs) {
50 | file = f;
51 | vStart = vs;
52 | vEnd = ve;
53 | locations = locs;
54 | }
55 |
56 | public FileVirtualSplit(Path f, long vs, long ve, String[] locs, long[] intervalFilePointers) {
57 | file = f;
58 | vStart = vs;
59 | vEnd = ve;
60 | locations = locs;
61 | this.intervalFilePointers = intervalFilePointers;
62 | }
63 |
64 | @Override public String[] getLocations() { return locations; }
65 |
66 | /** Inexact due to the nature of virtual offsets.
67 | *
68 | * We can't know how many blocks there are in between two file offsets, nor
69 | * how large those blocks are. So this uses only the difference between the
70 | * file offsets—unless that difference is zero, in which case the split is
71 | * wholly contained in one block and thus we can give an exact result.
72 | */
73 | @Override public long getLength() {
74 | final long vsHi = vStart & ~0xffff;
75 | final long veHi = vEnd & ~0xffff;
76 | final long hiDiff = veHi - vsHi;
77 | return hiDiff == 0 ? ((vEnd & 0xffff) - (vStart & 0xffff)) : hiDiff;
78 | }
79 |
80 | public Path getPath() { return file; }
81 |
82 | /** Inclusive. */
83 | public long getStartVirtualOffset() { return vStart; }
84 |
85 | /** Exclusive. */
86 | public long getEndVirtualOffset() { return vEnd; }
87 |
88 | public void setStartVirtualOffset(long vo) { vStart = vo; }
89 | public void setEndVirtualOffset(long vo) { vEnd = vo; }
90 |
91 | /**
92 | * @return pairs of virtual file pointers for all intervals that should be used for
93 | * filtering the split, or null if there are none. These correspond to
94 | * BAMFileSpan chunk start/stop pointers in htsjdk.
95 | */
96 | public long[] getIntervalFilePointers() {
97 | return intervalFilePointers;
98 | }
99 |
100 | @Override public void write(DataOutput out) throws IOException {
101 | Text.writeString(out, file.toString());
102 | out.writeLong(vStart);
103 | out.writeLong(vEnd);
104 | out.writeBoolean(intervalFilePointers != null);
105 | if (intervalFilePointers != null) {
106 | out.writeInt(intervalFilePointers.length);
107 | for (int i = 0; i < intervalFilePointers.length; i++) {
108 | out.writeLong(intervalFilePointers[i]);
109 | }
110 | }
111 | }
112 | @Override public void readFields(DataInput in) throws IOException {
113 | file = new Path(Text.readString(in));
114 | vStart = in.readLong();
115 | vEnd = in.readLong();
116 | if (in.readBoolean()) {
117 | intervalFilePointers = new long[in.readInt()];
118 | for (int i = 0; i < intervalFilePointers.length; i++) {
119 | intervalFilePointers[i] = in.readLong();
120 | }
121 | }
122 | }
123 |
124 | @Override
125 | public String toString() { return file + ":" + vStart + "-" + vEnd; }
126 | }
127 |
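A sketch of constructing a split from BGZF virtual offsets, which pack a compressed-block file offset in the upper 48 bits and an intra-block offset in the lower 16 bits; the values, path, and host are made up:

    long vStart = (123456L << 16) | 0x0010;   // block at byte 123456, 0x10 into the uncompressed block
    long vEnd   = (789012L << 16) | 0x0230;
    FileVirtualSplit split = new FileVirtualSplit(
        new Path("hdfs:///data/input.bam"), vStart, vEnd, new String[] { "host1" });
    long approxLength = split.getLength();    // difference of the high (block-offset) parts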
--------------------------------------------------------------------------------
/src/test/java/org/seqdoop/hadoop_bam/TestCRAMInputFormatOnHDFS.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam;
2 |
3 | import htsjdk.samtools.SAMRecord;
4 | import htsjdk.samtools.SamReader;
5 | import htsjdk.samtools.SamReaderFactory;
6 | import java.io.File;
7 | import java.io.IOException;
8 | import java.net.URI;
9 | import java.net.URISyntaxException;
10 | import java.nio.file.Files;
11 | import java.nio.file.Paths;
12 | import java.util.List;
13 | import org.apache.hadoop.conf.Configuration;
14 | import org.apache.hadoop.fs.FileUtil;
15 | import org.apache.hadoop.hdfs.MiniDFSCluster;
16 | import org.apache.hadoop.io.LongWritable;
17 | import org.apache.hadoop.mapreduce.InputSplit;
18 | import org.apache.hadoop.mapreduce.JobContext;
19 | import org.apache.hadoop.mapreduce.RecordReader;
20 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
21 | import org.apache.hadoop.mapreduce.TaskAttemptID;
22 | import org.apache.hadoop.mapreduce.task.JobContextImpl;
23 | import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
24 | import org.junit.AfterClass;
25 | import org.junit.Before;
26 | import org.junit.BeforeClass;
27 | import org.junit.Test;
28 |
29 | import static org.junit.Assert.assertEquals;
30 | import static org.junit.Assert.assertTrue;
31 | import static org.mockito.Mockito.mock;
32 |
33 | public class TestCRAMInputFormatOnHDFS {
34 | private String input;
35 | private String reference;
36 | private TaskAttemptContext taskAttemptContext;
37 | private JobContext jobContext;
38 |
39 |
40 | private static MiniDFSCluster cluster;
41 | private static URI clusterUri;
42 |
43 | @BeforeClass
44 | public static void setUpBeforeClass() throws Exception {
45 | cluster = startMini(TestCRAMInputFormatOnHDFS.class.getName());
46 | clusterUri = formalizeClusterURI(cluster.getFileSystem().getUri());
47 | }
48 |
49 | @AfterClass
50 | public static void teardownClass() throws Exception {
51 | if (cluster != null)
52 | {
53 | cluster.shutdown();
54 | }
55 | }
56 |
57 |
58 | @Before
59 | public void setup() throws Exception {
60 | Configuration conf = new Configuration();
61 | input = ClassLoader.getSystemClassLoader().getResource("test.cram").getFile();
62 | reference = ClassLoader.getSystemClassLoader().getResource("auxf.fa").toURI().toString();
63 | String referenceIndex = ClassLoader.getSystemClassLoader().getResource("auxf.fa.fai")
64 | .toURI().toString();
65 | conf.set("mapred.input.dir", "file://" + input);
66 |
67 | URI hdfsRef = clusterUri.resolve("/tmp/auxf.fa");
68 | URI hdfsRefIndex = clusterUri.resolve("/tmp/auxf.fa.fai");
69 | Files.copy(Paths.get(URI.create(reference)), Paths.get(hdfsRef));
70 | Files.copy(Paths.get(URI.create(referenceIndex)), Paths.get(hdfsRefIndex));
71 |
72 | conf.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY, hdfsRef.toString());
73 |
74 |
75 | taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class));
76 | jobContext = new JobContextImpl(conf, taskAttemptContext.getJobID());
77 |
78 | }
79 |
80 | private static MiniDFSCluster startMini(String testName) throws IOException {
81 | File baseDir = new File("./target/hdfs/" + testName).getAbsoluteFile();
82 | FileUtil.fullyDelete(baseDir);
83 | Configuration conf = new Configuration();
84 | conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath());
85 | MiniDFSCluster.Builder builder = new MiniDFSCluster.Builder(conf);
86 | MiniDFSCluster hdfsCluster = builder.clusterId(testName).build();
87 | hdfsCluster.waitActive();
88 | return hdfsCluster;
89 | }
90 |
91 | protected static URI formalizeClusterURI(URI clusterUri) throws URISyntaxException {
92 | if (clusterUri.getPath()==null) {
93 | return new URI(clusterUri.getScheme(), null,
94 | clusterUri.getHost(), clusterUri.getPort(),
95 | "/", null, null);
96 | } else if (clusterUri.getPath().trim().isEmpty()) {
97 | return new URI(clusterUri.getScheme(), null,
98 | clusterUri.getHost(), clusterUri.getPort(),
99 | "/", null, null);
100 | }
101 | return clusterUri;
102 | }
103 |
104 | @Test
105 | public void testReader() throws Exception {
106 | int expectedCount = 0;
107 | SamReader samReader = SamReaderFactory.makeDefault()
108 | .referenceSequence(new File(URI.create(reference))).open(new File(input));
109 | for (SAMRecord r : samReader) {
110 | expectedCount++;
111 | }
112 |
113 | CRAMInputFormat inputFormat = new CRAMInputFormat();
114 | List<InputSplit> splits = inputFormat.getSplits(jobContext);
115 | assertEquals(1, splits.size());
116 | RecordReader<LongWritable, SAMRecordWritable> reader = inputFormat
117 | .createRecordReader(splits.get(0), taskAttemptContext);
118 | reader.initialize(splits.get(0), taskAttemptContext);
119 |
120 | int actualCount = 0;
121 | while (reader.nextKeyValue()) {
122 | actualCount++;
123 | }
124 |
125 | assertEquals(expectedCount, actualCount);
126 | }
127 |
128 | }
129 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/CRAMRecordWriter.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam;
2 |
3 | import java.io.*;
4 | import java.net.URI;
5 | import java.nio.file.Paths;
6 |
7 | import htsjdk.samtools.CRAMContainerStreamWriter;
8 | import htsjdk.samtools.SAMTextHeaderCodec;
9 | import htsjdk.samtools.cram.ref.ReferenceSource;
10 | import htsjdk.samtools.SAMFileHeader;
11 | import htsjdk.samtools.SAMRecord;
12 | import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
13 | import htsjdk.samtools.util.StringLineReader;
14 | import org.apache.hadoop.fs.Path;
15 | import org.apache.hadoop.mapreduce.RecordWriter;
16 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
17 |
18 | import org.seqdoop.hadoop_bam.util.NIOFileUtil;
19 | import org.seqdoop.hadoop_bam.util.SAMHeaderReader;
20 |
21 | /** A base {@link RecordWriter} for CRAM records.
22 | *
23 | * Handles the output stream, writing the header if requested, and provides
24 | * the {@link #writeAlignment} function for subclasses.
25 | * Note that each file created by this class consists of a fragment of a
26 | * complete CRAM file containing only one or more CRAM containers that do not
27 | * include a CRAM file header, a SAMFileHeader, or a CRAM EOF container.
28 | */
29 | public abstract class CRAMRecordWriter<K>
30 | extends RecordWriter<K,SAMRecordWritable>
31 | {
32 | // generic ID passed to CRAM code for internal error reporting
33 | private static final String HADOOP_BAM_PART_ID = "Hadoop-BAM-Part";
34 | private OutputStream origOutput;
35 | private CRAMContainerStreamWriter cramContainerStream = null;
36 | private ReferenceSource refSource = null;
37 | private boolean writeHeader = true;
38 |
39 | /** A SAMFileHeader is read from the input Path. */
40 | public CRAMRecordWriter(
41 | final Path output,
42 | final Path input,
43 | final boolean writeHeader,
44 | final TaskAttemptContext ctx) throws IOException
45 | {
46 | init(
47 | output,
48 | SAMHeaderReader.readSAMHeaderFrom(input, ctx.getConfiguration()),
49 | writeHeader, ctx);
50 | }
51 |
52 | public CRAMRecordWriter(
53 | final Path output, final SAMFileHeader header, final boolean writeHeader,
54 | final TaskAttemptContext ctx)
55 | throws IOException
56 | {
57 | init(
58 | output.getFileSystem(ctx.getConfiguration()).create(output),
59 | header, writeHeader, ctx);
60 | }
61 |
62 | // Working around not being able to call a constructor other than as the
63 | // first statement...
64 | private void init(
65 | final Path output, final SAMFileHeader header, final boolean writeHeader,
66 | final TaskAttemptContext ctx)
67 | throws IOException
68 | {
69 | init(
70 | output.getFileSystem(ctx.getConfiguration()).create(output),
71 | header, writeHeader, ctx);
72 | }
73 |
74 | private void init(
75 | final OutputStream output, final SAMFileHeader header, final boolean writeHeader,
76 | final TaskAttemptContext ctx)
77 | throws IOException
78 | {
79 | origOutput = output;
80 | this.writeHeader = writeHeader;
81 |
82 | final String referenceURI =
83 | ctx.getConfiguration().get(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY);
84 | refSource = new ReferenceSource(referenceURI == null ? null :
85 | NIOFileUtil.asPath(referenceURI));
86 |
87 | // A SAMFileHeader must be supplied at CRAMContainerStreamWriter creation time; if
88 | // we don't have one then delay creation until we do
89 | if (header != null) {
90 | cramContainerStream = new CRAMContainerStreamWriter(
91 | origOutput, null, refSource, header, HADOOP_BAM_PART_ID);
92 | if (writeHeader) {
93 | this.writeHeader(header);
94 | }
95 | }
96 | }
97 |
98 | @Override public void close(TaskAttemptContext ctx) throws IOException {
99 | cramContainerStream.finish(false); // Close, but suppress CRAM EOF container
100 | origOutput.close(); // And close the original output.
101 | }
102 |
103 | protected void writeAlignment(final SAMRecord rec) {
104 | if (null == cramContainerStream) {
105 | final SAMFileHeader header = rec.getHeader();
106 | if (header == null) {
107 | throw new RuntimeException("Cannot write record to CRAM: null header in SAM record");
108 | }
109 | if (writeHeader) {
110 | this.writeHeader(header);
111 | }
112 | cramContainerStream = new CRAMContainerStreamWriter(
113 | origOutput, null, refSource, header, HADOOP_BAM_PART_ID);
114 | }
115 | cramContainerStream.writeAlignment(rec);
116 | }
117 |
118 | private void writeHeader(final SAMFileHeader header) {
119 | cramContainerStream.writeHeader(header);
120 | }
121 | }
122 |
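The reference used for CRAM compression is taken from the job configuration; a minimal sketch, with an illustrative path (the HDFS test above sets the same property to an HDFS URI):

    // Point CRAM writing at a reference FASTA; the URI here is a placeholder.
    conf.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY, "file:///refs/ref.fa");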
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/LazyVCFGenotypesContext.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2013 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | package org.seqdoop.hadoop_bam;
22 |
23 | import java.io.UnsupportedEncodingException;
24 | import java.util.List;
25 |
26 | import htsjdk.tribble.readers.LineIterator;
27 | import htsjdk.variant.variantcontext.Allele;
28 | import htsjdk.variant.variantcontext.LazyGenotypesContext;
29 | import htsjdk.variant.vcf.AbstractVCFCodec;
30 | import htsjdk.variant.vcf.VCFHeader;
31 | import htsjdk.variant.vcf.VCFHeaderLine;
32 | import htsjdk.variant.vcf.VCFHeaderVersion;
33 |
34 | // File created: 2013-07-03 15:41:21
35 |
36 | // The actual parsing is delegated to AbstractVCFCodec.
37 | public class LazyVCFGenotypesContext extends LazyParsingGenotypesContext {
38 |
39 | /** Takes ownership of the given byte[]: don't modify its contents. */
40 | public LazyVCFGenotypesContext(
41 | List<Allele> alleles, String chrom, int start,
42 | byte[] utf8Unparsed, int count)
43 | {
44 | super(new Parser(alleles, chrom, start), utf8Unparsed, count);
45 | }
46 |
47 | public static class HeaderDataCache
48 | implements LazyParsingGenotypesContext.HeaderDataCache
49 | {
50 | private HeaderSettableVCFCodec codec = new HeaderSettableVCFCodec();
51 |
52 | @Override public void setHeader(VCFHeader header) {
53 | VCFHeaderVersion version = null;
54 |
55 | // Normally AbstractVCFCodec parses the header and thereby sets the
56 | // version field. It gets used later on so we need to set it.
57 | for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) {
58 | if (VCFHeaderVersion.isFormatString(line.getKey())) {
59 | version = VCFHeaderVersion.toHeaderVersion(line.getValue());
60 | break;
61 | }
62 | }
63 |
64 | codec.setHeaderAndVersion(header, version);
65 | }
66 |
67 | public AbstractVCFCodec getCodec() { return codec; }
68 | }
69 |
70 | public static class Parser extends LazyParsingGenotypesContext.Parser {
71 | private HeaderSettableVCFCodec codec = null;
72 | private final List<Allele> alleles;
73 | private final String chrom;
74 | private final int start;
75 |
76 | public Parser(List<Allele> alleles, String chrom, int start) {
77 | this.alleles = alleles;
78 | this.chrom = chrom;
79 | this.start = start;
80 | }
81 |
82 | @Override public void setHeaderDataCache(
83 | LazyParsingGenotypesContext.HeaderDataCache data)
84 | {
85 | codec = (HeaderSettableVCFCodec)((HeaderDataCache)data).getCodec();
86 | }
87 |
88 | @Override public LazyGenotypesContext.LazyData parse(final Object data) {
89 | if (codec == null || !codec.hasHeader())
90 | throw new IllegalStateException(
91 | "Cannot decode genotypes without a codec with a VCFHeader");
92 |
93 | final String str;
94 | try {
95 | str = new String((byte[])data, "UTF-8");
96 | } catch (UnsupportedEncodingException absurd) {
97 | throw new RuntimeException(
98 | "Can never happen on a compliant Java implementation because "+
99 | "UTF-8 is guaranteed to be supported");
100 | }
101 | return codec.createGenotypeMap(str, alleles, chrom, start);
102 | }
103 | }
104 | }
105 |
106 | // This is a HACK. But, the functionality is only in AbstractVCFCodec so it
107 | // can't be helped. This is preferable to copying the functionality into
108 | // parse() above.
109 | class HeaderSettableVCFCodec extends AbstractVCFCodec {
110 | public boolean hasHeader() { return header != null; }
111 |
112 | public void setHeaderAndVersion(VCFHeader header, VCFHeaderVersion ver) {
113 | this.header = header;
114 | this.version = ver;
115 | }
116 |
117 | @Override public Object readActualHeader(LineIterator reader) {
118 | throw new UnsupportedOperationException(
119 | "Internal error: this shouldn't be called");
120 | }
121 | @Override public List<String> parseFilters(String filterString) {
122 | throw new UnsupportedOperationException(
123 | "Internal error: this shouldn't be called");
124 | }
125 | @Override public boolean canDecode(String s) {
126 | return true;
127 | }
128 | }
129 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/ReferenceFragment.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2012 Aalto University
2 | //
3 | // This file is part of Hadoop-BAM.
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to
7 | // deal in the Software without restriction, including without limitation the
8 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
9 | // sell copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in
13 | // all copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 | // IN THE SOFTWARE.
22 |
23 | package org.seqdoop.hadoop_bam;
24 |
25 | import org.apache.hadoop.io.Text;
26 | import org.apache.hadoop.io.Writable;
27 | import org.apache.hadoop.io.WritableUtils;
28 |
29 | import java.io.IOException;
30 | import java.io.DataInput;
31 | import java.io.DataOutput;
32 |
33 | // partly based on SequencedFragment
34 | // note: this class is supposed to represent a single line of a fasta input file, augmented by chromosome/contig name and start position
35 |
36 | public class ReferenceFragment implements Writable
37 | {
38 | protected Text sequence = new Text();
39 |
40 | protected Integer position;
41 | protected String indexSequence;
42 |
43 | public void clear()
44 | {
45 | sequence.clear();
46 | indexSequence = null;
47 | position = null;
48 | }
49 |
50 | /**
51 | * Get sequence Text object.
52 | * Trade encapsulation for efficiency. Here we expose the internal Text
53 | * object so that data may be read and written directly from/to it.
54 | *
55 | * Sequence should always be written using CAPITAL letters and 'N' for unknown bases.
56 | */
57 | public Text getSequence() { return sequence; }
58 |
59 | /**
60 | * Set the position of the first base of this line within its
61 | * chromosome/contig, as taken from the FASTA input.
62 | *
63 | * A null position is not allowed.
64 | */
65 | public void setPosition(Integer pos) {
66 | if (pos == null)
67 | throw new IllegalArgumentException("can't have null reference position");
68 | position = pos;
69 | }
70 |
71 | public void setIndexSequence(String v) {
72 | if (v == null)
73 | throw new IllegalArgumentException("can't have null index sequence");
74 | indexSequence = v;
75 | }
76 |
77 | public void setSequence(Text seq)
78 | {
79 | if (seq == null)
80 | throw new IllegalArgumentException("can't have a null sequence");
81 | sequence = seq;
82 | }
83 |
84 | public Integer getPosition() { return position; }
85 | public String getIndexSequence() { return indexSequence; }
86 |
87 | /**
88 | * Recreates a pseudo fasta record with the fields available.
89 | */
90 | public String toString()
91 | {
92 | String delim = "\t";
93 | StringBuilder builder = new StringBuilder(800);
94 | builder.append(indexSequence).append(delim);
95 | builder.append(position).append(delim);
96 | builder.append(sequence);
97 | return builder.toString();
98 | }
99 |
100 | public boolean equals(Object other)
101 | {
102 | if (other != null && other instanceof ReferenceFragment)
103 | {
104 | ReferenceFragment otherFrag = (ReferenceFragment)other;
105 |
106 | if (position == null && otherFrag.position != null || position != null && !position.equals(otherFrag.position))
107 | return false;
108 | if (indexSequence == null && otherFrag.indexSequence != null || indexSequence != null && !indexSequence.equals(otherFrag.indexSequence))
109 | return false;
110 | // sequence can't be null
111 | if (!sequence.equals(otherFrag.sequence))
112 | return false;
113 |
114 | return true;
115 | }
116 | else
117 | return false;
118 | }
119 |
120 | @Override
121 | public int hashCode() {
122 | int result = sequence.hashCode();
123 | result = 31 * result + (position != null ? position.hashCode() : 0);
124 | result = 31 * result + (indexSequence != null ? indexSequence.hashCode() : 0);
125 | return result;
126 | }
127 |
128 | public void readFields(DataInput in) throws IOException
129 | {
130 | // serialization order:
131 | // 1) sequence
132 | // 2) indexSequence (chromosome/contig name)
133 | // 3) position of first base in this line of the fasta file
134 |
135 | this.clear();
136 |
137 | sequence.readFields(in);
138 |
139 | indexSequence = WritableUtils.readString(in);
140 | position = WritableUtils.readVInt(in);
141 | }
142 |
143 | public void write(DataOutput out) throws IOException
144 | {
145 | sequence.write(out);
146 |
147 | WritableUtils.writeString(out, indexSequence);
148 | WritableUtils.writeVInt(out, position);
149 |
150 | }
151 | }
152 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/util/SAMOutputPreparer.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2012 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2012-07-26 14:36:03
22 |
23 | package org.seqdoop.hadoop_bam.util;
24 |
25 | import java.io.File;
26 | import java.io.FilterOutputStream;
27 | import java.io.IOException;
28 | import java.io.OutputStream;
29 | import java.io.OutputStreamWriter;
30 | import java.io.StringWriter;
31 | import java.nio.ByteBuffer;
32 | import java.nio.ByteOrder;
33 | import java.util.List;
34 |
35 | import htsjdk.samtools.SAMFileHeader;
36 | import htsjdk.samtools.SAMSequenceRecord;
37 | import htsjdk.samtools.SAMTextHeaderCodec;
38 | import htsjdk.samtools.cram.build.CramIO;
39 | import htsjdk.samtools.cram.common.CramVersions;
40 | import htsjdk.samtools.util.BlockCompressedOutputStream;
41 |
42 | import org.seqdoop.hadoop_bam.SAMFormat;
43 |
44 | public class SAMOutputPreparer {
45 | private ByteBuffer buf;
46 |
47 | public SAMOutputPreparer() {
48 | // Enough room for a 32-bit integer.
49 | buf = ByteBuffer.wrap(new byte[4]);
50 | buf.order(ByteOrder.LITTLE_ENDIAN);
51 | }
52 |
53 | public static final byte[] BAM_MAGIC = {'B','A','M', 1};
54 |
55 | /** Prepares the given output stream for writing of SAMRecords in the given
56 | * format. This includes writing the given SAM header and, in the case of
57 | * BAM or CRAM, writing some further metadata as well as compressing everything
58 | * written. Returns a new stream to replace the original: it will do the
59 | * appropriate compression for BAM/CRAM files.
60 | */
61 | public OutputStream prepareForRecords(
62 | OutputStream out, final SAMFormat format,
63 | final SAMFileHeader header)
64 | throws IOException {
65 |
66 | switch (format) {
67 | case SAM:
68 | out = prepareSAMOrBAMStream(out, format, header);
69 | break;
70 | case BAM:
71 | out = prepareSAMOrBAMStream(out, format, header);
72 | break;
73 | case CRAM:
74 | out = prepareCRAMStream(out, format, header);
75 | break;
76 | default:
77 | throw new IllegalArgumentException
78 | ("Unsupported SAM file format, must be one of SAM, BAM or CRAM");
79 | }
80 |
81 | // Important for BAM: if the caller doesn't want to use the new stream
82 | // for some reason, the BlockCompressedOutputStream's buffer would never
83 | // be flushed.
84 | out.flush();
85 | return out;
86 | }
87 |
88 | private OutputStream prepareCRAMStream(
89 | OutputStream out, final SAMFormat format,
90 | final SAMFileHeader header) throws IOException
91 | {
92 | CramIO.writeHeader(CramVersions.DEFAULT_CRAM_VERSION, out, header, null);
93 | return out;
94 | }
95 |
96 | private OutputStream prepareSAMOrBAMStream(
97 | OutputStream out, final SAMFormat format,
98 | final SAMFileHeader header) throws IOException
99 | {
100 | final StringWriter sw = new StringWriter();
101 | new SAMTextHeaderCodec().encode(sw, header);
102 | final String text = sw.toString();
103 |
104 | if (format == SAMFormat.BAM) {
105 | out = new BlockCompressedOutputStream(out, (File) null);
106 | out.write(BAM_MAGIC);
107 | writeInt32(out, text.length());
108 | }
109 |
110 | writeString(out, text);
111 |
112 | if (format == SAMFormat.BAM) {
113 | final List<SAMSequenceRecord> refs =
114 | header.getSequenceDictionary().getSequences();
115 |
116 | writeInt32(out, refs.size());
117 |
118 | for (final SAMSequenceRecord ref : refs) {
119 | final String name = ref.getSequenceName();
120 | writeInt32(out, name.length() + 1);
121 | writeString(out, name);
122 | out.write(0);
123 | writeInt32(out, ref.getSequenceLength());
124 | }
125 | }
126 |
127 | return out;
128 | }
129 |
130 | private static void writeString(final OutputStream out, final String s)
131 | throws IOException
132 | {
133 | // Don't flush the underlying stream yet, only the writer: in the case of
134 | // BAM, we might be able to cram more things into the gzip block still.
135 | final OutputStreamWriter w = new OutputStreamWriter(
136 | new FilterOutputStream(out) { @Override public void flush() {} } );
137 | w.write(s);
138 | w.flush();
139 | }
140 |
141 | private void writeInt32(final OutputStream out, int n) throws IOException {
142 | buf.putInt(0, n);
143 | out.write(buf.array());
144 | }
145 | }
146 |
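A minimal sketch of how the prepared stream replaces the original, assuming rawOut and header already exist:

    OutputStream out = new SAMOutputPreparer().prepareForRecords(rawOut, SAMFormat.BAM, header);
    // All subsequent record bytes must go to 'out': for BAM it is a
    // BlockCompressedOutputStream wrapper, so writing to rawOut directly
    // afterwards would bypass the compression.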
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/SplittingBAMIndex.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2010 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2010-08-04 13:11:10
22 |
23 | package org.seqdoop.hadoop_bam;
24 |
25 | import java.io.BufferedInputStream;
26 | import java.io.File;
27 | import java.io.FileInputStream;
28 | import java.io.InputStream;
29 | import java.io.IOException;
30 | import java.nio.ByteBuffer;
31 | import java.util.ArrayList;
32 | import java.util.List;
33 | import java.util.NavigableSet;
34 | import java.util.TreeSet;
35 |
36 | /** An index into BAM files, for {@link BAMInputFormat}. Reads files that are
37 | * created by {@link SplittingBAMIndexer}.
38 | *
39 | * Indexes the positions of individual BAM records in the file.
40 | */
41 | public final class SplittingBAMIndex {
42 | private final NavigableSet<Long> virtualOffsets = new TreeSet<Long>();
43 |
44 | public SplittingBAMIndex() {}
45 | public SplittingBAMIndex(final File path) throws IOException {
46 | this(new BufferedInputStream(new FileInputStream(path)));
47 | }
48 | public SplittingBAMIndex(final InputStream in) throws IOException {
49 | readIndex(in);
50 | }
51 |
52 | public void readIndex(final InputStream in) throws IOException {
53 | virtualOffsets.clear();
54 |
55 | final ByteBuffer bb = ByteBuffer.allocate(8);
56 |
57 | for (long prev = -1; in.read(bb.array()) == 8;) {
58 | final long cur = bb.getLong(0);
59 | if (prev > cur)
60 | throw new IOException(String.format(
61 | "Invalid splitting BAM index; offsets not in order: %#x > %#x",
62 | prev, cur));
63 |
64 | virtualOffsets.add(prev = cur);
65 | }
66 | in.close();
67 |
68 | if (virtualOffsets.size() < 1)
69 | throw new IOException(
70 | "Invalid splitting BAM index: "+
71 | "should contain at least the file size");
72 | }
73 |
74 | public List<Long> getVirtualOffsets() {
75 | return new ArrayList<>(virtualOffsets);
76 | }
77 |
78 | public Long prevAlignment(final long filePos) {
79 | return virtualOffsets.floor(filePos << 16);
80 | }
81 | public Long nextAlignment(final long filePos) {
82 | return virtualOffsets.higher(filePos << 16);
83 | }
84 |
85 | public int size() { return virtualOffsets.size(); }
86 |
87 | private long first() { return virtualOffsets.first(); }
88 | private long last() { return prevAlignment(bamSize() - 1); }
89 | long bamSize() { return virtualOffsets.last() >>> 16; }
90 |
91 | @Override
92 | public boolean equals(Object o) {
93 | if (this == o) return true;
94 | if (o == null || getClass() != o.getClass()) return false;
95 |
96 | SplittingBAMIndex that = (SplittingBAMIndex) o;
97 |
98 | return virtualOffsets != null ? virtualOffsets.equals(that.virtualOffsets) : that
99 | .virtualOffsets == null;
100 |
101 | }
102 |
103 | @Override
104 | public int hashCode() {
105 | return virtualOffsets != null ? virtualOffsets.hashCode() : 0;
106 | }
107 |
108 | @Override
109 | public String toString() {
110 | return virtualOffsets.toString();
111 | }
112 |
113 | /** Writes some statistics about each splitting BAM index file given as an
114 | * argument.
115 | */
116 | public static void main(String[] args) {
117 | if (args.length == 0) {
118 | System.out.println(
119 | "Usage: SplittingBAMIndex [splitting BAM indices...]\n\n"+
120 |
121 | "Writes a few statistics about each splitting BAM index.");
122 | return;
123 | }
124 |
125 | for (String arg : args) {
126 | final File f = new File(arg);
127 | if (f.isFile() && f.canRead()) {
128 | try {
129 | System.err.printf("%s:\n", f);
130 | final SplittingBAMIndex bi = new SplittingBAMIndex(f);
131 | if (bi.size() == 1) {
132 | System.err.printf("\t0 alignments\n" +
133 | "\tassociated BAM file size %d\n", bi.bamSize());
134 | } else {
135 | final long first = bi.first();
136 | final long last = bi.last();
137 | System.err.printf(
138 | "\t%d alignments\n" +
139 | "\tfirst is at %#06x in BGZF block at %#014x\n" +
140 | "\tlast is at %#06x in BGZF block at %#014x\n" +
141 | "\tassociated BAM file size %d\n",
142 | bi.size(),
143 | first & 0xffff, first >>> 16,
144 | last & 0xffff, last >>> 16,
145 | bi.bamSize());
146 | }
147 | } catch (IOException e) {
148 | System.err.printf("Failed to read %s!\n", f);
149 | e.printStackTrace();
150 | }
151 | } else
152 | System.err.printf("%s does not look like a readable file!\n", f);
153 | }
154 | }
155 | }
156 |
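A hedged sketch of clamping a raw byte range to record boundaries with this index; the index path and offsets are illustrative:

    SplittingBAMIndex idx = new SplittingBAMIndex(new File("input.bam.splitting-bai")); // hypothetical path
    long rangeStart = 0L, rangeEnd = 64L * 1024 * 1024;  // raw byte offsets into the BAM file
    Long vStart = idx.nextAlignment(rangeStart);          // first recorded virtual offset past rangeStart
    Long vEnd   = idx.nextAlignment(rangeEnd);            // first recorded virtual offset past rangeEnd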
--------------------------------------------------------------------------------