allIntervals = IntervalUtil.getIntervals(conf, "prop-name");
65 | Assert.assertNotNull(allIntervals);
66 | Assert.assertEquals(allIntervals.size(), validIntervals.length);
67 | for (int i = 0; i < validIntervals.length; i++) {
68 | Assert.assertNotNull(allIntervals.get(i));
69 | Assert.assertEquals(allIntervals.get(i).getContig(), validIntervals[i][1]);
70 | Assert.assertEquals(allIntervals.get(i).getStart(), validIntervals[i][2]);
71 | Assert.assertEquals(allIntervals.get(i).getEnd(), validIntervals[i][3]);
72 | }
73 | }
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/util/BGZFEnhancedGzipCodec.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam.util;
2 |
3 | import htsjdk.samtools.util.BlockCompressedInputStream;
4 | import java.io.BufferedInputStream;
5 | import java.io.IOException;
6 | import java.io.InputStream;
7 | import org.apache.hadoop.fs.Seekable;
8 | import org.apache.hadoop.io.compress.CompressionCodec;
9 | import org.apache.hadoop.io.compress.CompressionInputStream;
10 | import org.apache.hadoop.io.compress.Decompressor;
11 | import org.apache.hadoop.io.compress.GzipCodec;
12 | import org.apache.hadoop.io.compress.SplitCompressionInputStream;
13 | import org.apache.hadoop.io.compress.SplittableCompressionCodec;
14 |
15 | /**
16 | * A Hadoop {@link CompressionCodec} for the
17 | * BGZF compression format,
18 | * which reads and writes files with a .gz suffix.
19 | *
20 | * BGZF is a splittable extension of gzip, which means that all BGZF files are standard
21 | * gzip files, however the reverse is not necessarily the case. BGZF files often have the
22 | * standard .gz suffix (such as those produced by the
23 | * bcftools command),
24 | * which causes a difficulty since it is not immediately apparent from the filename alone
25 | * whether a file is a BGZF file, or merely a regular gzip file. BGZFEnhancedGzipCodec
26 | * will read the start of the file to look for BGZF headers to detect the type of
27 | * compression.
28 | *
29 | *
30 | * BGZFEnhancedGzipCodec will read BGZF or gzip files, but currently always writes regular gzip files.
31 | *
32 | *
33 | * To use BGZFEnhancedGzipCodec, set it on the configuration object as follows. This will
34 | * override the built-in GzipCodec that is mapped to the .gz suffix.
35 | *
36 | * {@code
37 | * conf.set("io.compression.codecs", BGZFEnhancedGzipCodec.class.getCanonicalName())
38 | * }
39 | * @see BGZFCodec
40 | */
41 | public class BGZFEnhancedGzipCodec extends GzipCodec implements SplittableCompressionCodec {
42 |
43 | @Override
44 | public SplitCompressionInputStream createInputStream(InputStream seekableIn, Decompressor decompressor, long start, long end, READ_MODE readMode) throws IOException {
45 | if (!(seekableIn instanceof Seekable)) {
46 | throw new IOException("seekableIn must be an instance of " +
47 | Seekable.class.getName());
48 | }
49 | if (!BlockCompressedInputStream.isValidFile(new BufferedInputStream(seekableIn))) {
50 | // data is regular gzip, not BGZF
51 | ((Seekable)seekableIn).seek(0);
52 | final CompressionInputStream compressionInputStream = createInputStream(seekableIn,
53 | decompressor);
54 | return new SplitCompressionInputStream(compressionInputStream, start, end) {
55 | @Override
56 | public int read(byte[] b, int off, int len) throws IOException {
57 | return compressionInputStream.read(b, off, len);
58 | }
59 | @Override
60 | public void resetState() throws IOException {
61 | compressionInputStream.resetState();
62 | }
63 | @Override
64 | public int read() throws IOException {
65 | return compressionInputStream.read();
66 | }
67 | };
68 | }
69 | BGZFSplitGuesser splitGuesser = new BGZFSplitGuesser(seekableIn);
70 | long adjustedStart = splitGuesser.guessNextBGZFBlockStart(start, end);
71 | ((Seekable)seekableIn).seek(adjustedStart);
72 | return new BGZFSplitCompressionInputStream(seekableIn, adjustedStart, end);
73 | }
74 |
75 | }
76 |
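A minimal usage sketch for the codec above, assuming org.apache.hadoop.conf.Configuration is imported and that no other compression codecs need to stay registered (otherwise append this class to the existing io.compression.codecs value instead of overwriting it):

    Configuration conf = new Configuration();
    // Map the .gz suffix to BGZFEnhancedGzipCodec so BGZF files are detected and split;
    // plain gzip files are still readable through the inherited GzipCodec path, just not split.
    conf.set("io.compression.codecs", BGZFEnhancedGzipCodec.class.getCanonicalName());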
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/BaseSplitGuesser.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam;
2 |
3 | import htsjdk.samtools.seekablestream.SeekableStream;
4 | import java.io.IOException;
5 | import java.nio.ByteBuffer;
6 | import java.nio.ByteOrder;
7 | import org.apache.hadoop.io.IOUtils;
8 |
9 | class BaseSplitGuesser {
10 |
11 | protected final static int BGZF_MAGIC = 0x04088b1f;
12 | protected final static int BGZF_MAGIC_SUB = 0x00024342;
13 | protected final static int BGZF_SUB_SIZE = 4 + 2;
14 |
15 | protected SeekableStream in;
16 | protected final ByteBuffer buf;
17 |
18 | public BaseSplitGuesser() {
19 | buf = ByteBuffer.allocate(8);
20 | buf.order(ByteOrder.LITTLE_ENDIAN);
21 | }
22 |
23 | protected static class PosSize {
24 | public int pos;
25 | public int size;
26 | public PosSize(int p, int s) { pos = p; size = s; }
27 | }
28 |
29 | // Gives the compressed size on the side. Returns null if it doesn't find
30 | // anything.
31 | protected PosSize guessNextBGZFPos(int p, int end) {
32 | try { for (;;) {
33 | for (;;) {
34 | in.seek(p);
35 | IOUtils.readFully(in, buf.array(), 0, 4);
36 | int n = buf.getInt(0);
37 |
38 | if (n == BGZF_MAGIC)
39 | break;
40 |
41 | // Skip ahead a bit more than 1 byte if you can.
42 | if (n >>> 8 == BGZF_MAGIC << 8 >>> 8)
43 | ++p;
44 | else if (n >>> 16 == BGZF_MAGIC << 16 >>> 16)
45 | p += 2;
46 | else
47 | p += 3;
48 |
49 | if (p >= end)
50 | return null;
51 | }
52 | // Found what looks like a gzip block header: now get XLEN and
53 | // search for the BGZF subfield.
54 | final int p0 = p;
55 | p += 10;
56 | in.seek(p);
57 | IOUtils.readFully(in, buf.array(), 0, 2);
58 | p += 2;
59 | final int xlen = getUShort(0);
60 | final int subEnd = p + xlen;
61 |
62 | while (p < subEnd) {
63 | IOUtils.readFully(in, buf.array(), 0, 4);
64 |
65 | if (buf.getInt(0) != BGZF_MAGIC_SUB) {
66 | p += 4 + getUShort(2);
67 | in.seek(p);
68 | continue;
69 | }
70 |
71 | // Found it: this is close enough to a BGZF block, make it
72 | // our guess.
73 |
74 | // But find out the size before returning. First, grab bsize:
75 | // we'll need it later.
76 | IOUtils.readFully(in, buf.array(), 0, 2);
77 | int bsize = getUShort(0);
78 |
79 | // Then skip the rest of the subfields.
80 | p += BGZF_SUB_SIZE;
81 | while (p < subEnd) {
82 | in.seek(p);
83 | IOUtils.readFully(in, buf.array(), 0, 4);
84 | p += 4 + getUShort(2);
85 | }
86 | if (p != subEnd) {
87 | // Cancel our guess because the xlen field didn't match the
88 | // data.
89 | break;
90 | }
91 |
92 | // Now skip past the compressed data and the CRC-32.
93 | p += bsize - xlen - 19 + 4;
94 | in.seek(p);
95 | IOUtils.readFully(in, buf.array(), 0, 4);
96 | return new PosSize(p0, buf.getInt(0));
97 | }
98 | // No luck: look for the next gzip block header. Start right after
99 | // where we last saw the identifiers, although we could probably
100 | // safely skip further ahead. (If we find the correct one right
101 | // now, the previous block contained 0x1f8b0804 bytes of data: that
102 | // seems... unlikely.)
103 | p = p0 + 4;
104 |
105 | }} catch (IOException e) {
106 | return null;
107 | }
108 | }
109 |
110 | protected int getUShort(final int idx) {
111 | return (int)buf.getShort(idx) & 0xffff;
112 | }
113 | }
114 |
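A self-contained sketch of why BGZF_MAGIC above equals 0x04088b1f: the first four gzip header bytes 1f 8b 08 04 (ID1, ID2, CM=deflate, FLG=FEXTRA) read as a little-endian int, which is how guessNextBGZFPos compares buf.getInt(0) against the constant. The class name here is hypothetical and not part of the library:

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;

    class BgzfMagicCheck {
      public static void main(String[] args) {
        // First four bytes of any BGZF member: gzip magic, deflate method, FEXTRA flag set.
        ByteBuffer buf = ByteBuffer.wrap(new byte[] { 0x1f, (byte) 0x8b, 0x08, 0x04 });
        buf.order(ByteOrder.LITTLE_ENDIAN);
        System.out.println(Integer.toHexString(buf.getInt(0))); // prints 4088b1f
      }
    }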
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/SAMRecordWriter.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2010 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2012-02-23 12:42:49
22 |
23 | package org.seqdoop.hadoop_bam;
24 |
25 | import java.io.IOException;
26 | import java.io.OutputStream;
27 |
28 | import htsjdk.samtools.SAMFileHeader;
29 | import htsjdk.samtools.SAMRecord;
30 | import htsjdk.samtools.SAMTextWriter;
31 |
32 | import org.apache.hadoop.fs.Path;
33 | import org.apache.hadoop.mapreduce.RecordWriter;
34 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
35 |
36 | import org.seqdoop.hadoop_bam.util.SAMHeaderReader;
37 |
38 | /** A base {@link RecordWriter} for SAM records.
39 | *
40 | * Handles the output stream, writing the header if requested, and provides
41 | * the {@link #writeAlignment} function for subclasses.
42 | */
43 | public abstract class SAMRecordWriter<K>
44 | extends RecordWriter<K,SAMRecordWritable>
45 | {
46 | private SAMTextWriter writer;
47 | private SAMFileHeader header;
48 |
49 | /** A SAMFileHeader is read from the input Path. */
50 | public SAMRecordWriter(
51 | Path output, Path input, boolean writeHeader, TaskAttemptContext ctx)
52 | throws IOException
53 | {
54 | init(
55 | output,
56 | SAMHeaderReader.readSAMHeaderFrom(input, ctx.getConfiguration()),
57 | writeHeader, ctx);
58 | }
59 | public SAMRecordWriter(
60 | Path output, SAMFileHeader header, boolean writeHeader,
61 | TaskAttemptContext ctx)
62 | throws IOException
63 | {
64 | init(
65 | output.getFileSystem(ctx.getConfiguration()).create(output),
66 | header, writeHeader);
67 | }
68 | public SAMRecordWriter(
69 | OutputStream output, SAMFileHeader header, boolean writeHeader)
70 | throws IOException
71 | {
72 | init(output, header, writeHeader);
73 | }
74 |
75 | private void init(
76 | Path output, SAMFileHeader header, boolean writeHeader,
77 | TaskAttemptContext ctx)
78 | throws IOException
79 | {
80 | init(
81 | output.getFileSystem(ctx.getConfiguration()).create(output),
82 | header, writeHeader);
83 | }
84 | private void init(
85 | OutputStream output, SAMFileHeader header, boolean writeHeader)
86 | throws IOException
87 | {
88 | this.header = header;
89 | writer = new SAMTextWriter(output);
90 |
91 | writer.setSortOrder(header.getSortOrder(), false);
92 | if (writeHeader)
93 | writer.setHeader(header);
94 | }
95 |
96 | @Override public void close(TaskAttemptContext ctx) {
97 | writer.close();
98 | }
99 |
100 | protected void writeAlignment(final SAMRecord rec) {
101 | rec.setHeader(header);
102 | writer.writeAlignment(rec);
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBAMOutputFormat.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2010 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2010-08-11 12:19:23
22 |
23 | package org.seqdoop.hadoop_bam;
24 |
25 | import java.io.IOException;
26 | import java.io.InputStream;
27 |
28 | import htsjdk.samtools.SAMFileHeader;
29 |
30 | import org.apache.hadoop.conf.Configuration;
31 | import org.apache.hadoop.fs.Path;
32 | import org.apache.hadoop.mapreduce.RecordWriter;
33 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
34 |
35 | import org.seqdoop.hadoop_bam.util.SAMHeaderReader;
36 |
37 | /** Writes only the BAM records, not the key.
38 | *
39 | * A {@link SAMFileHeader} must be provided via {@link #setSAMHeader} or
40 | * {@link #readSAMHeaderFrom} before {@link #getRecordWriter} is called.
41 | *
42 | * By default, writes the SAM header to the output file(s). This
43 | * can be disabled, because in distributed usage one often ends up with (and,
44 | * for decent performance, wants to end up with) the output split into multiple
45 | * parts, which are easier to concatenate if the header is not present in each
46 | * file.
47 | */
48 | public class KeyIgnoringBAMOutputFormat<K> extends BAMOutputFormat<K> {
49 | protected SAMFileHeader header;
50 | private boolean writeHeader = true;
51 |
52 | public KeyIgnoringBAMOutputFormat() {}
53 |
54 | /** Whether the header will be written or not. */
55 | public boolean getWriteHeader() { return writeHeader; }
56 |
57 | /** Set whether the header will be written or not. */
58 | public void setWriteHeader(boolean b) { writeHeader = b; }
59 |
60 | public SAMFileHeader getSAMHeader() { return header; }
61 | public void setSAMHeader(SAMFileHeader header) { this.header = header; }
62 |
63 | public void readSAMHeaderFrom(Path path, Configuration conf)
64 | throws IOException
65 | {
66 | this.header = SAMHeaderReader.readSAMHeaderFrom(path, conf);
67 | }
68 | public void readSAMHeaderFrom(InputStream in, Configuration conf) {
69 | this.header = SAMHeaderReader.readSAMHeaderFrom(in, conf);
70 | }
71 |
72 | /** setSAMHeader or readSAMHeaderFrom must have
73 | * been called first.
74 | */
75 | @Override public RecordWriter<K,SAMRecordWritable> getRecordWriter(
76 | TaskAttemptContext ctx)
77 | throws IOException
78 | {
79 | return getRecordWriter(ctx, getDefaultWorkFile(ctx, ""));
80 | }
81 |
82 | // Allows wrappers to provide their own work file.
83 | public RecordWriter<K,SAMRecordWritable> getRecordWriter(
84 | TaskAttemptContext ctx, Path out)
85 | throws IOException
86 | {
87 | if (this.header == null)
88 | throw new IOException(
89 | "Can't create a RecordWriter without the SAM header");
90 |
91 | return new KeyIgnoringBAMRecordWriter<K>(out, header, writeHeader, ctx);
92 | }
93 | }
94 |
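An illustrative driver-side sketch, assuming the format's key type parameter is LongWritable, that a Configuration conf is in scope, and using a hypothetical input path: the header must be supplied before any RecordWriter is requested, and per-part headers can be turned off so the parts concatenate cleanly:

    KeyIgnoringBAMOutputFormat<LongWritable> format = new KeyIgnoringBAMOutputFormat<LongWritable>();
    // Read the header once from an existing BAM (hypothetical path) and reuse it for all writers.
    format.readSAMHeaderFrom(new Path("hdfs:///data/sample.bam"), conf);
    // Skip the header in each part file to make downstream concatenation easier.
    format.setWriteHeader(false);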
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/CRAMInputFormat.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam;
2 |
3 | import htsjdk.samtools.cram.build.CramContainerIterator;
4 | import htsjdk.samtools.seekablestream.SeekableStream;
5 | import java.io.IOException;
6 | import java.util.ArrayList;
7 | import java.util.HashMap;
8 | import java.util.List;
9 | import java.util.Map;
10 | import org.apache.hadoop.conf.Configuration;
11 | import org.apache.hadoop.fs.Path;
12 | import org.apache.hadoop.io.LongWritable;
13 | import org.apache.hadoop.mapreduce.InputSplit;
14 | import org.apache.hadoop.mapreduce.JobContext;
15 | import org.apache.hadoop.mapreduce.RecordReader;
16 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
17 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
18 | import org.apache.hadoop.mapreduce.lib.input.FileSplit;
19 | import org.seqdoop.hadoop_bam.util.WrapSeekable;
20 |
21 | public class CRAMInputFormat extends FileInputFormat<LongWritable, SAMRecordWritable> {
22 |
23 | public static final String REFERENCE_SOURCE_PATH_PROPERTY =
24 | "hadoopbam.cram.reference-source-path";
25 |
26 | @Override
27 | public List<InputSplit> getSplits(JobContext job) throws IOException {
28 | return getSplits(super.getSplits(job), job.getConfiguration());
29 | }
30 |
31 | public List<InputSplit> getSplits(List<InputSplit> splits, Configuration conf)
32 | throws IOException {
33 | // update splits to align with CRAM container boundaries
34 | List<InputSplit> newSplits = new ArrayList<InputSplit>();
35 | Map<Path, List<Long>> fileToOffsets = new HashMap<Path, List<Long>>();
36 | for (InputSplit split : splits) {
37 | FileSplit fileSplit = (FileSplit) split;
38 | Path path = fileSplit.getPath();
39 | List<Long> containerOffsets = fileToOffsets.get(path);
40 | if (containerOffsets == null) {
41 | containerOffsets = getContainerOffsets(conf, path);
42 | fileToOffsets.put(path, containerOffsets);
43 | }
44 | long newStart = nextContainerOffset(containerOffsets, fileSplit.getStart());
45 | long newEnd = nextContainerOffset(containerOffsets, fileSplit.getStart() +
46 | fileSplit.getLength());
47 | long newLength = newEnd - newStart;
48 | if (newLength == 0) { // split is wholly within a container
49 | continue;
50 | }
51 | FileSplit newSplit = new FileSplit(fileSplit.getPath(), newStart, newLength,
52 | fileSplit.getLocations());
53 | newSplits.add(newSplit);
54 | }
55 | return newSplits;
56 | }
57 |
58 | private static List<Long> getContainerOffsets(Configuration conf, Path cramFile)
59 | throws IOException {
60 | SeekableStream seekableStream = WrapSeekable.openPath(conf, cramFile);
61 | CramContainerIterator cci = new CramContainerIterator(seekableStream);
62 | List<Long> containerOffsets = new ArrayList<Long>();
63 | containerOffsets.add(seekableStream.position());
64 | while (cci.hasNext()) {
65 | cci.next();
66 | containerOffsets.add(seekableStream.position());
67 | }
68 | containerOffsets.add(seekableStream.length());
69 | return containerOffsets;
70 | }
71 |
72 | private static long nextContainerOffset(List<Long> containerOffsets, long position) {
73 | for (long offset : containerOffsets) {
74 | if (offset >= position) {
75 | return offset;
76 | }
77 | }
78 | throw new IllegalStateException("Could not find position " + position + " in " +
79 | "container offsets: " + containerOffsets);
80 | }
81 |
82 | @Override
83 | public RecordReader<LongWritable, SAMRecordWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
84 | RecordReader<LongWritable, SAMRecordWritable> rr = new CRAMRecordReader();
85 | rr.initialize(split, context);
86 | return rr;
87 | }
88 |
89 | @Override
90 | public boolean isSplitable(JobContext job, Path path) {
91 | return true;
92 | }
93 | }
94 |
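CRAM decoding requires a reference, so a job using this input format points Hadoop-BAM at one through the property defined above (the same property SAMHeaderReader.getReferenceSource consults). A small sketch, with a hypothetical reference path:

    Configuration conf = new Configuration();
    // Reference FASTA used to decode CRAM records; path is illustrative only.
    conf.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY, "file:///refs/human_g1k_v37.fasta");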
--------------------------------------------------------------------------------
/src/test/java/org/seqdoop/hadoop_bam/BAMTestUtil.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam;
2 |
3 | import htsjdk.samtools.BAMIndex;
4 | import htsjdk.samtools.BAMIndexer;
5 | import htsjdk.samtools.SAMFileHeader;
6 | import htsjdk.samtools.SAMFileWriter;
7 | import htsjdk.samtools.SAMFileWriterFactory;
8 | import htsjdk.samtools.SAMRecord;
9 | import htsjdk.samtools.SAMRecordSetBuilder;
10 | import htsjdk.samtools.SamReader;
11 | import htsjdk.samtools.SamReaderFactory;
12 | import java.io.File;
13 | import java.io.IOException;
14 |
15 | class BAMTestUtil {
16 | public static File writeBamFile(int numPairs, SAMFileHeader.SortOrder sortOrder)
17 | throws IOException {
18 | // file will be both queryname and coordinate sorted, so use one or the other
19 | SAMRecordSetBuilder samRecordSetBuilder = new SAMRecordSetBuilder(true, sortOrder);
20 | for (int i = 0; i < numPairs; i++) {
21 | int chr = 20;
22 | int start1 = (i + 1) * 1000;
23 | int start2 = start1 + 100;
24 | if (i == 5) { // add two unmapped fragments instead of a mapped pair
25 | samRecordSetBuilder.addFrag(String.format("test-read-%03d-1", i), chr, start1,
26 | false, true, null,
27 | null,
28 | -1, false);
29 | samRecordSetBuilder.addFrag(String.format("test-read-%03d-2", i), chr, start2,
30 | false, true, null,
31 | null,
32 | -1, false);
33 | } else {
34 | samRecordSetBuilder.addPair(String.format("test-read-%03d", i), chr, start1,
35 | start2);
36 | }
37 | }
38 | if (numPairs > 0) { // add two unplaced unmapped fragments if non-empty
39 | samRecordSetBuilder.addUnmappedFragment(String.format
40 | ("test-read-%03d-unplaced-unmapped", numPairs++));
41 | samRecordSetBuilder.addUnmappedFragment(String.format
42 | ("test-read-%03d-unplaced-unmapped", numPairs++));
43 | }
44 |
45 | final File bamFile = File.createTempFile("test", ".bam");
46 | bamFile.deleteOnExit();
47 | SAMFileHeader samHeader = samRecordSetBuilder.getHeader();
48 | final SAMFileWriter bamWriter = new SAMFileWriterFactory()
49 | .makeSAMOrBAMWriter(samHeader, true, bamFile);
50 | for (final SAMRecord rec : samRecordSetBuilder.getRecords()) {
51 | bamWriter.addAlignment(rec);
52 | }
53 | bamWriter.close();
54 |
55 | // create BAM index
56 | if (sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) {
57 | SamReader samReader = SamReaderFactory.makeDefault()
58 | .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS)
59 | .open(bamFile);
60 | BAMIndexer.createIndex(samReader, new File(bamFile.getAbsolutePath()
61 | .replaceFirst("\\.bam$", BAMIndex.BAMIndexSuffix)));
62 | }
63 |
64 | return bamFile;
65 | }
66 |
67 | public static File writeBamFileWithLargeHeader() throws IOException {
68 | SAMRecordSetBuilder samRecordSetBuilder =
69 | new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.queryname);
70 | for (int i = 0; i < 1000; i++) {
71 | int chr = 20;
72 | int start1 = (i + 1) * 1000;
73 | int start2 = start1 + 100;
74 | samRecordSetBuilder.addPair(String.format("test-read-%03d", i), chr, start1,
75 | start2);
76 | }
77 |
78 | final File bamFile = File.createTempFile("test", ".bam");
79 | bamFile.deleteOnExit();
80 | SAMFileHeader samHeader = samRecordSetBuilder.getHeader();
81 | StringBuffer sb = new StringBuffer();
82 | for (int i = 0; i < 1000000; i++) {
83 | sb.append("0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789");
84 | }
85 | samHeader.addComment(sb.toString());
86 | final SAMFileWriter bamWriter = new SAMFileWriterFactory()
87 | .makeSAMOrBAMWriter(samHeader, true, bamFile);
88 | for (final SAMRecord rec : samRecordSetBuilder.getRecords()) {
89 | bamWriter.addAlignment(rec);
90 | }
91 | bamWriter.close();
92 |
93 | return bamFile;
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/util/BGZFSplitCompressionInputStream.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam.util;
2 |
3 | import htsjdk.samtools.util.BlockCompressedInputStream;
4 | import java.io.BufferedInputStream;
5 | import java.io.IOException;
6 | import java.io.InputStream;
7 | import org.apache.hadoop.io.compress.SplitCompressionInputStream;
8 |
9 | /**
10 | * An implementation of {@code SplitCompressionInputStream} for BGZF, based on
11 | * {@code BZip2CompressionInputStream} and {@code CBZip2InputStream} from Hadoop.
12 | * (BZip2 is the only splittable compression codec in Hadoop.)
13 | */
14 | class BGZFSplitCompressionInputStream extends SplitCompressionInputStream {
15 | private static final int END_OF_BLOCK = -2;
16 | private final BlockCompressedInputStream input;
17 | private BufferedInputStream bufferedIn;
18 | private long startingPos = 0L;
19 | private long processedPosition;
20 |
21 | private enum POS_ADVERTISEMENT_STATE_MACHINE {
22 | HOLD, ADVERTISE
23 | };
24 |
25 | POS_ADVERTISEMENT_STATE_MACHINE posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
26 | long compressedStreamPosition = 0;
27 |
28 | public BGZFSplitCompressionInputStream(InputStream in, long start, long end)
29 | throws IOException {
30 | super(in, start, end);
31 | bufferedIn = new BufferedInputStream(super.in);
32 | this.startingPos = super.getPos();
33 | input = new BlockCompressedInputStream(bufferedIn);
34 | this.updatePos(false);
35 | }
36 |
37 | @Override
38 | public int read() throws IOException {
39 | byte b[] = new byte[1];
40 | int result = this.read(b, 0, 1);
41 | return (result < 0) ? result : (b[0] & 0xff);
42 | }
43 |
44 | @Override
45 | public int read(byte[] b, int off, int len) throws IOException {
46 | // See BZip2CompressionInputStream#read for implementation notes.
47 | int result;
48 | result = readWithinBlock(b, off, len);
49 | if (result == END_OF_BLOCK) {
50 | this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE;
51 | }
52 | if (this.posSM == POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE) {
53 | result = readWithinBlock(b, off, off + 1);
54 | // This is the precise time to update compressed stream position
55 | // to the client of this code.
56 | this.updatePos(true);
57 | this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
58 | }
59 | return result;
60 | }
61 |
62 | /**
63 | * Read up to len bytes from the stream, but no further than the end of the
64 | * compressed block. If at the end of the block then no bytes will be read and a return
65 | * value of -2 will be returned; on the next call to read, bytes from the next block
66 | * will be returned. This is the same contract as CBZip2InputStream in Hadoop.
67 | * @return int The return value greater than 0 are the bytes read. A value
68 | * of -1 means end of stream while -2 represents end of block.
69 | */
70 | private int readWithinBlock(byte[] b, int off, int len) throws IOException {
71 | if (input.endOfBlock()) {
72 | final int available = input.available(); // this will read the next block, if there is one
73 | processedPosition = input.getPosition() >> 16;
74 | if (available == 0) { // end of stream
75 | return -1;
76 | }
77 | return END_OF_BLOCK;
78 | }
79 |
80 | // return up to end of block (at most)
81 | int available = input.available();
82 | return input.read(b, off, Math.min(available, len));
83 | }
84 |
85 | @Override
86 | public void resetState() throws IOException {
87 | // not implemented (only used in sequence files)
88 | }
89 |
90 | @Override
91 | public long getPos() throws IOException {
92 | return this.compressedStreamPosition;
93 | }
94 |
95 | // See comment in BZip2CompressionInputStream#updatePos
96 | private void updatePos(boolean shouldAddOn) {
97 | int addOn = shouldAddOn ? 1 : 0;
98 | this.compressedStreamPosition = this.startingPos + processedPosition + addOn;
99 | }
100 |
101 | @Override
102 | public void close() throws IOException {
103 | input.close();
104 | }
105 | }
106 |
--------------------------------------------------------------------------------
/src/test/java/org/seqdoop/hadoop_bam/TestFastaInputFormat.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam;
2 |
3 | import java.util.List;
4 | import org.apache.hadoop.conf.Configuration;
5 | import org.apache.hadoop.io.Text;
6 | import org.apache.hadoop.mapreduce.InputSplit;
7 | import org.apache.hadoop.mapreduce.JobContext;
8 | import org.apache.hadoop.mapreduce.RecordReader;
9 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
10 | import org.apache.hadoop.mapreduce.TaskAttemptID;
11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
12 | import org.apache.hadoop.mapreduce.task.JobContextImpl;
13 | import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
14 | import org.junit.Before;
15 | import org.junit.Test;
16 |
17 | import static org.junit.Assert.assertEquals;
18 | import static org.junit.Assert.assertFalse;
19 | import static org.junit.Assert.assertTrue;
20 | import static org.mockito.Mockito.mock;
21 |
22 | public class TestFastaInputFormat {
23 | private String input;
24 | private TaskAttemptContext taskAttemptContext;
25 | private JobContext jobContext;
26 |
27 | @Before
28 | public void setup() throws Exception {
29 | Configuration conf = new Configuration();
30 | input = ClassLoader.getSystemClassLoader().getResource("mini-chr1-chr2.fasta").getFile();
31 | conf.set("mapred.input.dir", "file://" + input);
32 |
33 | // Input fasta is 600 bytes, so this gets us 3 FileInputFormat splits.
34 | conf.set(FileInputFormat.SPLIT_MAXSIZE, "200");
35 |
36 | taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class));
37 | jobContext = new JobContextImpl(conf, taskAttemptContext.getJobID());
38 | }
39 |
40 | @Test
41 | public void testReader() throws Exception {
42 | FastaInputFormat inputFormat = new FastaInputFormat();
43 | List<InputSplit> splits = inputFormat.getSplits(jobContext);
44 | assertEquals(2, splits.size());
45 | RecordReader<Text, ReferenceFragment> reader = inputFormat
46 | .createRecordReader(splits.get(0), taskAttemptContext);
47 | reader.initialize(splits.get(0), taskAttemptContext);
48 |
49 | assertTrue(reader.nextKeyValue());
50 | assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:11"), reader.getCurrentKey());
51 | assertEquals(new Text("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTA"), reader.getCurrentValue().getSequence());
52 |
53 | assertTrue(reader.nextKeyValue());
54 | assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:182"), reader.getCurrentKey());
55 | assertEquals(new Text("ACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAAC"), reader.getCurrentValue().getSequence());
56 |
57 | assertTrue(reader.nextKeyValue());
58 | assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1163"), reader.getCurrentKey());
59 | assertEquals(new Text("CCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCC"), reader.getCurrentValue().getSequence());
60 |
61 | assertTrue(reader.nextKeyValue());
62 | assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1244"), reader.getCurrentKey());
63 | assertEquals(new Text("TAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCC"), reader.getCurrentValue().getSequence());
64 |
65 | assertTrue(reader.nextKeyValue());
66 | assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1325"), reader.getCurrentKey());
67 | assertEquals(new Text("CAACCCTAACCCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCC"), reader.getCurrentValue().getSequence());
68 |
69 | assertFalse(reader.nextKeyValue());
70 |
71 | reader = inputFormat.createRecordReader(splits.get(1), taskAttemptContext);
72 | reader.initialize(splits.get(1), taskAttemptContext);
73 |
74 | assertTrue(reader.nextKeyValue());
75 | assertEquals(new Text("chr2 dna:chromosome chromosome:GRCh37:2:1:243199373:11"), reader.getCurrentKey());
76 | assertEquals(new Text("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTC"), reader.getCurrentValue().getSequence());
77 |
78 | assertFalse(reader.nextKeyValue());
79 |
80 | reader.close();
81 | }
82 |
83 | }
84 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/util/SAMHeaderReader.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2013 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2013-07-26 13:54:32
22 |
23 | package org.seqdoop.hadoop_bam.util;
24 |
25 | import java.io.IOException;
26 | import java.io.InputStream;
27 | import java.net.URI;
28 | import java.nio.file.Paths;
29 |
30 | import htsjdk.samtools.cram.ref.ReferenceSource;
31 | import org.apache.hadoop.conf.Configuration;
32 | import org.apache.hadoop.fs.Path;
33 |
34 | import htsjdk.samtools.SAMFileHeader;
35 | import htsjdk.samtools.SamInputResource;
36 | import htsjdk.samtools.SamReaderFactory;
37 | import htsjdk.samtools.ValidationStringency;
38 | import org.seqdoop.hadoop_bam.CRAMInputFormat;
39 |
40 | public final class SAMHeaderReader {
41 | /** A String property corresponding to a ValidationStringency
42 | * value. If set, the given stringency is used when any part of the
43 | * Hadoop-BAM library reads SAM or BAM.
44 | */
45 | public static final String VALIDATION_STRINGENCY_PROPERTY =
46 | "hadoopbam.samheaderreader.validation-stringency";
47 |
48 | public static SAMFileHeader readSAMHeaderFrom(Path path, Configuration conf)
49 | throws IOException
50 | {
51 | InputStream i = path.getFileSystem(conf).open(path);
52 | final SAMFileHeader h = readSAMHeaderFrom(i, conf);
53 | i.close();
54 | return h;
55 | }
56 |
57 | /** Does not close the stream. */
58 | public static SAMFileHeader readSAMHeaderFrom(
59 | final InputStream in, final Configuration conf)
60 | {
61 | final ValidationStringency
62 | stringency = getValidationStringency(conf);
63 | SamReaderFactory readerFactory = SamReaderFactory.makeDefault()
64 | .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false)
65 | .setUseAsyncIo(false);
66 | if (stringency != null) {
67 | readerFactory.validationStringency(stringency);
68 | }
69 |
70 | final ReferenceSource refSource = getReferenceSource(conf);
71 | if (null != refSource) {
72 | readerFactory.referenceSource(refSource);
73 | }
74 | return readerFactory.open(SamInputResource.of(in)).getFileHeader();
75 | }
76 |
77 | public static ValidationStringency getValidationStringency(
78 | final Configuration conf)
79 | {
80 | final String p = conf.get(VALIDATION_STRINGENCY_PROPERTY);
81 | return p == null ? null : ValidationStringency.valueOf(p);
82 | }
83 |
84 | public static ReferenceSource getReferenceSource(
85 | final Configuration conf)
86 | {
87 | //TODO: There isn't anything particularly CRAM-specific about reference source or validation
88 | // stringency other than that a reference source is required for CRAM files. We should move
89 | // the reference source and validation stringency property names and utility methods out of
90 | // CRAMInputFormat and SAMHeaderReader and combine them together into a single class for extracting
91 | // configuration params, but it would break backward compatibility with existing code that
92 | // is dependent on the CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY.
93 | final String refSourcePath = conf.get(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY);
94 | return refSourcePath == null ? null : new ReferenceSource(NIOFileUtil.asPath(refSourcePath));
95 | }
96 | }
97 |
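A short sketch of relaxing validation through the property above; since getValidationStringency parses the value with ValidationStringency.valueOf, any enum constant name works. Assumes the usual imports (Configuration, Path, SAMFileHeader, ValidationStringency) and a hypothetical input path:

    Configuration conf = new Configuration();
    // Silence htsjdk validation errors for all SAM/BAM reads performed through Hadoop-BAM.
    conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY,
        ValidationStringency.SILENT.toString());
    SAMFileHeader header =
        SAMHeaderReader.readSAMHeaderFrom(new Path("file:///data/sample.bam"), conf); // hypothetical path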
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/util/NIOFileUtil.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam.util;
2 |
3 | import java.io.IOException;
4 | import java.io.OutputStream;
5 | import java.net.URI;
6 | import java.nio.file.FileSystemNotFoundException;
7 | import java.nio.file.FileSystems;
8 | import java.nio.file.FileVisitResult;
9 | import java.nio.file.Files;
10 | import java.nio.file.Path;
11 | import java.nio.file.PathMatcher;
12 | import java.nio.file.Paths;
13 | import java.nio.file.SimpleFileVisitor;
14 | import java.nio.file.attribute.BasicFileAttributes;
15 | import java.util.Collections;
16 | import java.util.HashMap;
17 | import java.util.List;
18 | import java.util.stream.Collectors;
19 |
20 | public class NIOFileUtil {
21 | private NIOFileUtil() {
22 | }
23 |
24 | static final String PARTS_GLOB = "glob:**/part-[mr]-[0-9][0-9][0-9][0-9][0-9]*";
25 |
26 | /**
27 | * Convert the given path {@link URI} to a {@link Path} object.
28 | * @param uri the path to convert
29 | * @return a {@link Path} object
30 | */
31 | public static Path asPath(URI uri) {
32 | try {
33 | return Paths.get(uri);
34 | } catch (FileSystemNotFoundException e) {
35 | ClassLoader cl = Thread.currentThread().getContextClassLoader();
36 | if (cl == null) {
37 | throw e;
38 | }
39 | try {
40 | return FileSystems.newFileSystem(uri, new HashMap<>(), cl).provider().getPath(uri);
41 | } catch (IOException ex) {
42 | throw new RuntimeException("Cannot create filesystem for " + uri, ex);
43 | }
44 | }
45 | }
46 |
47 | /**
48 | * Convert the given path string to a {@link Path} object.
49 | * @param path the path to convert
50 | * @return a {@link Path} object
51 | */
52 | public static Path asPath(String path) {
53 | URI uri = URI.create(path);
54 | return uri.getScheme() == null ? Paths.get(path) : asPath(uri);
55 | }
56 |
57 | /**
58 | * Delete the given directory and all of its contents if non-empty.
59 | * @param directory the directory to delete
60 | * @throws IOException
61 | */
62 | static void deleteRecursive(Path directory) throws IOException {
63 | Files.walkFileTree(directory, new SimpleFileVisitor<Path>() {
64 | @Override
65 | public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
66 | Files.delete(file);
67 | return FileVisitResult.CONTINUE;
68 | }
69 | @Override
70 | public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
71 | Files.deleteIfExists(dir);
72 | return FileVisitResult.CONTINUE;
73 | }
74 | });
75 | }
76 |
77 | /**
78 | * Returns all the files in a directory that match the given pattern, and that don't
79 | * have the given extension.
80 | * @param directory the directory to look for files in, subdirectories are not
81 | * considered
82 | * @param syntaxAndPattern the syntax and pattern to use for matching (see
83 | * {@link java.nio.file.FileSystem#getPathMatcher}
84 | * @param excludesExt the extension to exclude, or null to exclude nothing
85 | * @return a list of files, sorted by name
86 | * @throws IOException
87 | */
88 | static List<Path> getFilesMatching(Path directory,
89 | String syntaxAndPattern, String excludesExt) throws IOException {
90 | PathMatcher matcher = directory.getFileSystem().getPathMatcher(syntaxAndPattern);
91 | List<Path> parts = Files.walk(directory)
92 | .filter(matcher::matches)
93 | .filter(path -> excludesExt == null || !path.toString().endsWith(excludesExt))
94 | .collect(Collectors.toList());
95 | Collections.sort(parts);
96 | return parts;
97 | }
98 |
99 | /**
100 | * Merge the given part files in order into an output stream.
101 | * This deletes the parts.
102 | * @param parts the part files to merge
103 | * @param out the stream to write each file into, in order
104 | * @throws IOException
105 | */
106 | static void mergeInto(List<Path> parts, OutputStream out)
107 | throws IOException {
108 | for (final Path part : parts) {
109 | Files.copy(part, out);
110 | Files.delete(part);
111 | }
112 | }
113 | }
114 |
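A minimal sketch of asPath from above, where Path is java.nio.file.Path: a bare string resolves on the default filesystem, while a string carrying a URI scheme is routed through the matching NIO provider. The example paths are hypothetical:

    Path local = NIOFileUtil.asPath("/tmp/example.bam");          // no scheme: default filesystem
    Path fromUri = NIOFileUtil.asPath("file:///tmp/example.bam"); // scheme present: resolved via its URI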
--------------------------------------------------------------------------------
/src/test/java/org/seqdoop/hadoop_bam/TestSAMInputFormat.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam;
2 |
3 | import htsjdk.samtools.SAMRecord;
4 | import htsjdk.samtools.SamReader;
5 | import htsjdk.samtools.SamReaderFactory;
6 | import java.io.BufferedReader;
7 | import java.io.File;
8 | import java.io.FileReader;
9 | import java.util.ArrayList;
10 | import java.util.List;
11 | import org.apache.hadoop.conf.Configuration;
12 | import org.apache.hadoop.fs.FileSystem;
13 | import org.apache.hadoop.fs.Path;
14 | import org.apache.hadoop.io.LongWritable;
15 | import org.apache.hadoop.mapreduce.InputSplit;
16 | import org.apache.hadoop.mapreduce.Job;
17 | import org.apache.hadoop.mapreduce.JobContext;
18 | import org.apache.hadoop.mapreduce.RecordReader;
19 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
20 | import org.apache.hadoop.mapreduce.TaskAttemptID;
21 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
22 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
23 | import org.apache.hadoop.mapreduce.task.JobContextImpl;
24 | import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
25 | import org.junit.Before;
26 | import org.junit.Test;
27 |
28 | import static org.junit.Assert.assertEquals;
29 | import static org.junit.Assert.assertTrue;
30 | import static org.mockito.Mockito.mock;
31 |
32 | public class TestSAMInputFormat {
33 | private String input;
34 | private TaskAttemptContext taskAttemptContext;
35 | private JobContext jobContext;
36 |
37 | @Before
38 | public void setup() throws Exception {
39 | Configuration conf = new Configuration();
40 | input = ClassLoader.getSystemClassLoader().getResource("test.sam").getFile();
41 | conf.set("mapred.input.dir", "file://" + input);
42 |
43 | taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class));
44 | jobContext = new JobContextImpl(conf, taskAttemptContext.getJobID());
45 | }
46 |
47 | @Test
48 | public void testReader() throws Exception {
49 | int expectedCount = 0;
50 | SamReader samReader = SamReaderFactory.makeDefault().open(new File(input));
51 | for (SAMRecord r : samReader) {
52 | expectedCount++;
53 | }
54 | samReader.close();
55 |
56 | AnySAMInputFormat inputFormat = new AnySAMInputFormat();
57 | List<InputSplit> splits = inputFormat.getSplits(jobContext);
58 | assertEquals(1, splits.size());
59 | RecordReader<LongWritable, SAMRecordWritable> reader = inputFormat
60 | .createRecordReader(splits.get(0), taskAttemptContext);
61 | reader.initialize(splits.get(0), taskAttemptContext);
62 |
63 | int actualCount = 0;
64 | while (reader.nextKeyValue()) {
65 | actualCount++;
66 | }
67 | reader.close();
68 |
69 | assertEquals(expectedCount, actualCount);
70 | }
71 |
72 | @Test
73 | public void testMapReduceJob() throws Exception {
74 | Configuration conf = new Configuration();
75 |
76 | FileSystem fileSystem = FileSystem.get(conf);
77 | Path inputPath = new Path(input);
78 | Path outputPath = fileSystem.makeQualified(new Path("target/out"));
79 | fileSystem.delete(outputPath, true);
80 |
81 | Job job = Job.getInstance(conf);
82 | FileInputFormat.setInputPaths(job, inputPath);
83 | job.setInputFormatClass(SAMInputFormat.class);
84 | job.setOutputKeyClass(LongWritable.class);
85 | job.setOutputValueClass(SAMRecordWritable.class);
86 | job.setNumReduceTasks(0);
87 | FileOutputFormat.setOutputPath(job, outputPath);
88 |
89 | boolean success = job.waitForCompletion(true);
90 | assertTrue(success);
91 |
92 | List<String> samStrings = new ArrayList<String>();
93 | SamReader samReader = SamReaderFactory.makeDefault().open(new File(input));
94 | for (SAMRecord r : samReader) {
95 | samStrings.add(r.getSAMString().trim());
96 | }
97 | samReader.close();
98 |
99 | File outputFile = new File(new File(outputPath.toUri()), "part-m-00000");
100 | BufferedReader br = new BufferedReader(new FileReader(outputFile));
101 | String line;
102 | int index = 0;
103 | while ((line = br.readLine()) != null) {
104 | String value = line.substring(line.indexOf("\t") + 1); // ignore key
105 | assertEquals(samStrings.get(index++), value);
106 | }
107 | br.close();
108 | }
109 | }
110 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/LazyBAMRecordFactory.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2011 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2011-11-15 11:58:23
22 |
23 | package org.seqdoop.hadoop_bam;
24 |
25 | import htsjdk.samtools.BAMRecord;
26 | import htsjdk.samtools.SAMFileHeader;
27 | import htsjdk.samtools.SAMRecord;
28 | import htsjdk.samtools.SAMRecordFactory;
29 |
30 | /** A factory for the kind of lazy {@link BAMRecord} used internally. */
31 | public class LazyBAMRecordFactory implements SAMRecordFactory {
32 | @Override public SAMRecord createSAMRecord(SAMFileHeader hdr) {
33 | throw new UnsupportedOperationException(
34 | "LazyBAMRecordFactory can only create BAM records");
35 | }
36 |
37 | @Override public BAMRecord createBAMRecord(
38 | SAMFileHeader hdr,
39 | int referenceSequenceIndex, int alignmentStart,
40 | short readNameLength, short mappingQuality,
41 | int indexingBin, int cigarLen, int flags, int readLen,
42 | int mateReferenceSequenceIndex, int mateAlignmentStart,
43 | int insertSize, byte[] variableLengthBlock)
44 | {
45 | return new LazyBAMRecord(
46 | hdr, referenceSequenceIndex, alignmentStart, readNameLength,
47 | mappingQuality, indexingBin, cigarLen, flags, readLen,
48 | mateReferenceSequenceIndex, mateAlignmentStart, insertSize,
49 | variableLengthBlock);
50 | }
51 | }
52 |
53 | class LazyBAMRecord extends BAMRecord {
54 | private boolean decodedRefIdx = false;
55 | private boolean decodedMateRefIdx = false;
56 |
57 | public LazyBAMRecord(
58 | SAMFileHeader hdr, int referenceID, int coordinate, short readNameLength,
59 | short mappingQuality, int indexingBin, int cigarLen, int flags,
60 | int readLen, int mateReferenceID, int mateCoordinate, int insertSize,
61 | byte[] restOfData)
62 | {
63 | super(
64 | hdr, referenceID, coordinate, readNameLength, mappingQuality,
65 | indexingBin, cigarLen, flags, readLen, mateReferenceID,
66 | mateCoordinate, insertSize, restOfData);
67 | }
68 |
69 | @Override public void setReferenceIndex(final int referenceIndex) {
70 | mReferenceIndex = referenceIndex;
71 | decodedRefIdx = false;
72 | }
73 | @Override public void setMateReferenceIndex(final int referenceIndex) {
74 | mMateReferenceIndex = referenceIndex;
75 | decodedMateRefIdx = false;
76 | }
77 |
78 | @Override public String getReferenceName() {
79 | if (mReferenceIndex != null && !decodedRefIdx) {
80 | decodedRefIdx = true;
81 | super.setReferenceIndex(mReferenceIndex);
82 | }
83 | return super.getReferenceName();
84 | }
85 |
86 | @Override public String getMateReferenceName() {
87 | if (mMateReferenceIndex != null && !decodedMateRefIdx) {
88 | decodedMateRefIdx = true;
89 | super.setMateReferenceIndex(mMateReferenceIndex);
90 | }
91 | return super.getMateReferenceName();
92 | }
93 |
94 | @Override protected void eagerDecode() {
95 | getReferenceName();
96 | getMateReferenceName();
97 | super.eagerDecode();
98 | }
99 |
100 | @Override
101 | public boolean equals(Object o) {
102 | // don't use decoded flags for equality check
103 | return super.equals(o);
104 | }
105 |
106 | @Override
107 | public int hashCode() {
108 | // don't use decoded flags for hash code
109 | return super.hashCode();
110 | }
111 | }
112 |
--------------------------------------------------------------------------------
/src/test/java/org/seqdoop/hadoop_bam/TestVCFInputFormatStringency.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2017 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | package org.seqdoop.hadoop_bam;
22 |
23 | import htsjdk.samtools.ValidationStringency;
24 | import htsjdk.tribble.TribbleException;
25 | import htsjdk.variant.variantcontext.VariantContext;
26 | import java.util.List;
27 | import org.apache.hadoop.conf.Configuration;
28 | import org.apache.hadoop.io.LongWritable;
29 | import org.apache.hadoop.mapreduce.InputSplit;
30 | import org.apache.hadoop.mapreduce.JobContext;
31 | import org.apache.hadoop.mapreduce.RecordReader;
32 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
33 | import org.apache.hadoop.mapreduce.TaskAttemptID;
34 | import org.apache.hadoop.mapreduce.task.JobContextImpl;
35 | import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
36 | import org.junit.Test;
37 |
38 | import static org.junit.Assert.assertEquals;
39 | import static org.junit.Assert.assertNotNull;
40 | import static org.mockito.Mockito.mock;
41 |
42 | public class TestVCFInputFormatStringency {
43 |
44 | public void checkReading(ValidationStringency validationStringency) throws Exception {
45 | String filename = "invalid_info_field.vcf";
46 | Configuration conf = new Configuration();
47 | String input_file = ClassLoader.getSystemClassLoader().getResource(filename).getFile();
48 | conf.set("mapred.input.dir", "file://" + input_file);
49 |
50 | if (validationStringency != null) {
51 | VCFRecordReader.setValidationStringency(conf, validationStringency);
52 | }
53 |
54 | TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class));
55 | JobContext ctx = new JobContextImpl(conf, taskAttemptContext.getJobID());
56 |
57 | VCFInputFormat inputFormat = new VCFInputFormat(conf);
58 | List<InputSplit> splits = inputFormat.getSplits(ctx);
59 | assertEquals(1, splits.size());
60 | RecordReader<LongWritable, VariantContextWritable> reader =
61 | inputFormat.createRecordReader(splits.get(0), taskAttemptContext);
62 | int counter = 0;
63 | while (reader.nextKeyValue()) {
64 | VariantContextWritable writable = reader.getCurrentValue();
65 | assertNotNull(writable);
66 | VariantContext vc = writable.get();
67 | assertNotNull(vc);
68 | String value = vc.toString();
69 | assertNotNull(value);
70 | counter++;
71 | }
72 | assertEquals(4, counter);
73 | }
74 |
75 | @Test(expected = TribbleException.class)
76 | public void testUnset() throws Exception {
77 | checkReading(null); // defaults to strict
78 | }
79 |
80 | @Test(expected = TribbleException.class)
81 | public void testDefault() throws Exception {
82 | checkReading(ValidationStringency.DEFAULT_STRINGENCY); // defaults to strict
83 | }
84 |
85 | @Test
86 | public void testSilent() throws Exception {
87 | checkReading(ValidationStringency.SILENT);
88 | }
89 |
90 | @Test
91 | public void testLenient() throws Exception {
92 | checkReading(ValidationStringency.LENIENT);
93 | }
94 |
95 | @Test(expected = TribbleException.class)
96 | public void testStrict() throws Exception {
97 | checkReading(ValidationStringency.STRICT);
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/util/BGZFBlockIndex.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2010 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2010-08-25 12:20:03
22 |
23 | package org.seqdoop.hadoop_bam.util;
24 |
25 | import java.io.BufferedInputStream;
26 | import java.io.File;
27 | import java.io.FileInputStream;
28 | import java.io.InputStream;
29 | import java.io.IOException;
30 | import java.nio.ByteBuffer;
31 | import java.util.NavigableSet;
32 | import java.util.TreeSet;
33 |
34 | /** An index into BGZF-compressed files, for {@link BGZFSplitFileInputFormat}.
35 | * Reads files that are created by {@link BGZFBlockIndexer}.
36 | *
37 | * Indexes the positions of individual gzip blocks in the file.
38 | */
39 | public final class BGZFBlockIndex {
40 | private final NavigableSet<Long> offsets = new TreeSet<Long>();
41 |
42 | public BGZFBlockIndex() {}
43 | public BGZFBlockIndex(final File path) throws IOException {
44 | this(new BufferedInputStream(new FileInputStream(path)));
45 | }
46 | public BGZFBlockIndex(final InputStream in) throws IOException {
47 | readIndex(in);
48 | }
49 |
50 | public void readIndex(final InputStream in) throws IOException {
51 | offsets.clear();
52 |
53 | final ByteBuffer bb = ByteBuffer.allocate(8);
54 |
55 | for (long prev = -1; in.read(bb.array(), 2, 6) == 6;) {
56 | final long cur = bb.getLong(0);
57 | if (prev > cur)
58 | throw new IOException(String.format(
59 | "Invalid BGZF block index; offsets not in order: %#x > %#x",
60 | prev, cur));
61 |
62 | offsets.add(prev = cur);
63 | }
64 | in.close();
65 |
66 | if (offsets.size() < 1)
67 | throw new IOException(
68 | "Invalid BGZF block index: should contain at least the file size");
69 |
70 | offsets.add(0L);
71 | }
72 |
73 | public Long prevBlock(final long filePos) {
74 | return offsets.floor(filePos);
75 | }
76 | public Long nextBlock(final long filePos) {
77 | return offsets.higher(filePos);
78 | }
79 |
80 | public int size() { return offsets.size(); }
81 |
82 | private long secondBlock() { return nextBlock(0); }
83 | private long lastBlock() { return prevBlock(fileSize() - 1); }
84 | private long fileSize() { return offsets.last(); }
85 |
86 | /** Writes some statistics about each BGZF block index file given as an
87 | * argument.
88 | */
89 | public static void main(String[] args) {
90 | if (args.length == 0) {
91 | System.out.println(
92 | "Usage: BGZFBlockIndex [BGZF block indices...]\n\n"+
93 |
94 | "Writes a few statistics about each BGZF block index.");
95 | return;
96 | }
97 |
98 | for (String arg : args) {
99 | final File f = new File(arg);
100 | if (f.isFile() && f.canRead()) {
101 | try {
102 | System.err.printf("%s:\n", f);
103 | final BGZFBlockIndex bi = new BGZFBlockIndex(f);
104 | final long second = bi.secondBlock();
105 | final long last = bi.lastBlock();
106 | System.err.printf(
107 | "\t%d blocks\n" +
108 | "\tfirst after 0 is at %#014x\n" +
109 | "\tlast is at %#014x\n" +
110 | "\tassociated BGZF file size %d\n",
111 | bi.size()-1,
112 | bi.secondBlock(), bi.lastBlock(), bi.fileSize());
113 | } catch (IOException e) {
114 | System.err.printf("Failed to read %s!\n", f);
115 | e.printStackTrace();
116 | }
117 | } else
118 | System.err.printf("%s does not look like a readable file!\n", f);
119 | }
120 | }
121 | }
122 |
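A minimal usage sketch for this index, assuming an index file produced by BGZFBlockIndexer; the path and offset are hypothetical:

    BGZFBlockIndex index = new BGZFBlockIndex(new File("reads.fastq.gz.bgzfi")); // hypothetical path
    long splitStart = 4L * 1024 * 1024;              // a raw byte offset in the compressed file
    Long blockStart = index.prevBlock(splitStart);   // greatest BGZF block offset <= splitStart
    Long blockEnd   = index.nextBlock(splitStart);   // smallest BGZF block offset >  splitStart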
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringAnySAMOutputFormat.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2010 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2010-08-11 12:19:23
22 |
23 | package org.seqdoop.hadoop_bam;
24 |
25 | import java.io.IOException;
26 | import java.io.InputStream;
27 |
28 | import htsjdk.samtools.SAMFileHeader;
29 |
30 | import org.apache.hadoop.conf.Configuration;
31 | import org.apache.hadoop.fs.Path;
32 | import org.apache.hadoop.mapreduce.RecordWriter;
33 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
34 |
35 | import org.seqdoop.hadoop_bam.util.SAMHeaderReader;
36 |
37 | /** Writes only the SAM records, not the key.
38 | *
39 | * A {@link SAMFileHeader} must be provided via {@link #setSAMHeader} or
40 | * {@link #readSAMHeaderFrom} before {@link #getRecordWriter} is called.
41 | *
42 | * By default, writes the SAM header to the output file(s). This
43 | * can be disabled, because in distributed usage one often ends up with (and,
44 | * for decent performance, wants to end up with) the output split into multiple
45 | * parts, which are easier to concatenate if the header is not present in each
46 | * file.
47 | */
48 | public class KeyIgnoringAnySAMOutputFormat<K> extends AnySAMOutputFormat<K> {
49 |
50 | protected SAMFileHeader header;
51 |
52 | /** Whether the header will be written; defaults to true.
53 | */
54 | public static final String WRITE_HEADER_PROPERTY =
55 | "hadoopbam.anysam.write-header";
56 |
57 | public KeyIgnoringAnySAMOutputFormat(SAMFormat fmt) {
58 | super(fmt);
59 | }
60 | public KeyIgnoringAnySAMOutputFormat(Configuration conf) {
61 | super(conf);
62 |
63 | if (format == null)
64 | throw new IllegalArgumentException(
65 | "unknown SAM format: OUTPUT_SAM_FORMAT_PROPERTY not set");
66 | }
67 | public KeyIgnoringAnySAMOutputFormat(Configuration conf, Path path) {
68 | super(conf);
69 |
70 | if (format == null) {
71 | format = SAMFormat.inferFromFilePath(path);
72 |
73 | if (format == null)
74 | throw new IllegalArgumentException("unknown SAM format: " + path);
75 | }
76 | }
77 |
78 | public SAMFileHeader getSAMHeader() { return header; }
79 | public void setSAMHeader(SAMFileHeader header) { this.header = header; }
80 |
81 | public void readSAMHeaderFrom(Path path, Configuration conf)
82 | throws IOException
83 | {
84 | this.header = SAMHeaderReader.readSAMHeaderFrom(path, conf);
85 | }
86 | public void readSAMHeaderFrom(InputStream in, Configuration conf) {
87 | this.header = SAMHeaderReader.readSAMHeaderFrom(in, conf);
88 | }
89 |
90 | /** setSAMHeader or readSAMHeaderFrom must have
91 | * been called first.
92 | */
93 | @Override public RecordWriter<K,SAMRecordWritable> getRecordWriter(
94 | TaskAttemptContext ctx)
95 | throws IOException
96 | {
97 | return getRecordWriter(ctx, getDefaultWorkFile(ctx, ""));
98 | }
99 |
100 | // Allows wrappers to provide their own work file.
101 | public RecordWriter<K,SAMRecordWritable> getRecordWriter(
102 | TaskAttemptContext ctx, Path out)
103 | throws IOException
104 | {
105 | if (this.header == null)
106 | throw new IOException(
107 | "Can't create a RecordWriter without the SAM header");
108 |
109 | final boolean writeHeader = ctx.getConfiguration().getBoolean(
110 | WRITE_HEADER_PROPERTY, true);
111 |
112 | switch (format) {
113 | case BAM:
114 | return new KeyIgnoringBAMRecordWriter<K>(
115 | out, header, writeHeader, ctx);
116 |
117 | case SAM:
118 | return new KeyIgnoringSAMRecordWriter<K>(
119 | out, header, writeHeader, ctx);
120 |
121 | case CRAM:
122 | return new KeyIgnoringCRAMRecordWriter<K>(
123 | out, header, writeHeader, ctx);
124 |
125 | default: assert false; return null;
126 | }
127 | }
128 | }
129 |
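A hedged sketch of typical use of this output format, assuming a Configuration conf and a TaskAttemptContext ctx are in scope; the input path is illustrative:

    KeyIgnoringAnySAMOutputFormat<NullWritable> fmt =
        new KeyIgnoringAnySAMOutputFormat<>(SAMFormat.BAM);
    fmt.readSAMHeaderFrom(new Path("hdfs:///data/input.bam"), conf);             // or fmt.setSAMHeader(header)
    conf.setBoolean(KeyIgnoringAnySAMOutputFormat.WRITE_HEADER_PROPERTY, false); // headerless part files
    RecordWriter<NullWritable, SAMRecordWritable> writer = fmt.getRecordWriter(ctx);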
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/FileVirtualSplit.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2010 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2010-08-09 13:06:32
22 |
23 | package org.seqdoop.hadoop_bam;
24 |
25 | import java.io.DataOutput;
26 | import java.io.DataInput;
27 | import java.io.IOException;
28 |
29 | import org.apache.hadoop.fs.Path;
30 | import org.apache.hadoop.io.Text;
31 | import org.apache.hadoop.io.Writable;
32 | import org.apache.hadoop.mapreduce.InputSplit;
33 |
34 | /** Like a {@link org.apache.hadoop.mapreduce.lib.input.FileSplit}, but uses
35 | * BGZF virtual offsets to fit with {@link
36 | * htsjdk.samtools.util.BlockCompressedInputStream}.
37 | */
38 | public class FileVirtualSplit extends InputSplit implements Writable {
39 | private Path file;
40 | private long vStart;
41 | private long vEnd;
42 | private final String[] locations;
43 | private long[] intervalFilePointers;
44 |
45 | private static final String[] NO_LOCATIONS = {};
46 |
47 | public FileVirtualSplit() { locations = NO_LOCATIONS; }
48 |
49 | public FileVirtualSplit(Path f, long vs, long ve, String[] locs) {
50 | file = f;
51 | vStart = vs;
52 | vEnd = ve;
53 | locations = locs;
54 | }
55 |
56 | public FileVirtualSplit(Path f, long vs, long ve, String[] locs, long[] intervalFilePointers) {
57 | file = f;
58 | vStart = vs;
59 | vEnd = ve;
60 | locations = locs;
61 | this.intervalFilePointers = intervalFilePointers;
62 | }
63 |
64 | @Override public String[] getLocations() { return locations; }
65 |
66 | /** Inexact due to the nature of virtual offsets.
67 | *
68 | * We can't know how many blocks there are in between two file offsets, nor
69 | * how large those blocks are. So this uses only the difference between the
70 | * file offsets—unless that difference is zero, in which case the split is
71 | * wholly contained in one block and thus we can give an exact result.
72 | */
73 | @Override public long getLength() {
74 | final long vsHi = vStart & ~0xffff;
75 | final long veHi = vEnd & ~0xffff;
76 | final long hiDiff = veHi - vsHi;
77 | return hiDiff == 0 ? ((vEnd & 0xffff) - (vStart & 0xffff)) : hiDiff;
78 | }
79 |
80 | public Path getPath() { return file; }
81 |
82 | /** Inclusive. */
83 | public long getStartVirtualOffset() { return vStart; }
84 |
85 | /** Exclusive. */
86 | public long getEndVirtualOffset() { return vEnd; }
87 |
88 | public void setStartVirtualOffset(long vo) { vStart = vo; }
89 | public void setEndVirtualOffset(long vo) { vEnd = vo; }
90 |
91 | /**
92 | * @return pairs of virtual file pointers for all intervals that should be used for
93 | * filtering the split, or null if there are none. These correspond to
94 | * BAMFileSpan chunk start/stop pointers in htsjdk.
95 | */
96 | public long[] getIntervalFilePointers() {
97 | return intervalFilePointers;
98 | }
99 |
100 | @Override public void write(DataOutput out) throws IOException {
101 | Text.writeString(out, file.toString());
102 | out.writeLong(vStart);
103 | out.writeLong(vEnd);
104 | out.writeBoolean(intervalFilePointers != null);
105 | if (intervalFilePointers != null) {
106 | out.writeInt(intervalFilePointers.length);
107 | for (int i = 0; i < intervalFilePointers.length; i++) {
108 | out.writeLong(intervalFilePointers[i]);
109 | }
110 | }
111 | }
112 | @Override public void readFields(DataInput in) throws IOException {
113 | file = new Path(Text.readString(in));
114 | vStart = in.readLong();
115 | vEnd = in.readLong();
116 | if (in.readBoolean()) {
117 | intervalFilePointers = new long[in.readInt()];
118 | for (int i = 0; i < intervalFilePointers.length; i++) {
119 | intervalFilePointers[i] = in.readLong();
120 | }
121 | }
122 | }
123 |
124 | @Override
125 | public String toString() { return file + ":" + vStart + "-" + vEnd; }
126 | }
127 |
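A sketch of constructing a split from BGZF virtual offsets, which pack a compressed-block file offset in the upper 48 bits and an intra-block offset in the lower 16 bits; the values, path, and host are made up:

    long vStart = (123456L << 16) | 0x0010;   // block at byte 123456, 0x10 into the uncompressed block
    long vEnd   = (789012L << 16) | 0x0230;
    FileVirtualSplit split = new FileVirtualSplit(
        new Path("hdfs:///data/input.bam"), vStart, vEnd, new String[] { "host1" });
    long approxLength = split.getLength();    // difference of the high (block-offset) parts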
--------------------------------------------------------------------------------
/src/test/java/org/seqdoop/hadoop_bam/TestCRAMInputFormatOnHDFS.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam;
2 |
3 | import htsjdk.samtools.SAMRecord;
4 | import htsjdk.samtools.SamReader;
5 | import htsjdk.samtools.SamReaderFactory;
6 | import java.io.File;
7 | import java.io.IOException;
8 | import java.net.URI;
9 | import java.net.URISyntaxException;
10 | import java.nio.file.Files;
11 | import java.nio.file.Paths;
12 | import java.util.List;
13 | import org.apache.hadoop.conf.Configuration;
14 | import org.apache.hadoop.fs.FileUtil;
15 | import org.apache.hadoop.hdfs.MiniDFSCluster;
16 | import org.apache.hadoop.io.LongWritable;
17 | import org.apache.hadoop.mapreduce.InputSplit;
18 | import org.apache.hadoop.mapreduce.JobContext;
19 | import org.apache.hadoop.mapreduce.RecordReader;
20 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
21 | import org.apache.hadoop.mapreduce.TaskAttemptID;
22 | import org.apache.hadoop.mapreduce.task.JobContextImpl;
23 | import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
24 | import org.junit.AfterClass;
25 | import org.junit.Before;
26 | import org.junit.BeforeClass;
27 | import org.junit.Test;
28 |
29 | import static org.junit.Assert.assertEquals;
30 | import static org.junit.Assert.assertTrue;
31 | import static org.mockito.Mockito.mock;
32 |
33 | public class TestCRAMInputFormatOnHDFS {
34 | private String input;
35 | private String reference;
36 | private TaskAttemptContext taskAttemptContext;
37 | private JobContext jobContext;
38 |
39 |
40 | private static MiniDFSCluster cluster;
41 | private static URI clusterUri;
42 |
43 | @BeforeClass
44 | public static void setUpBeforeClass() throws Exception {
45 | cluster = startMini(TestCRAMInputFormatOnHDFS.class.getName());
46 | clusterUri = formalizeClusterURI(cluster.getFileSystem().getUri());
47 | }
48 |
49 | @AfterClass
50 | public static void teardownClass() throws Exception {
51 | if (cluster != null)
52 | {
53 | cluster.shutdown();
54 | }
55 | }
56 |
57 |
58 | @Before
59 | public void setup() throws Exception {
60 | Configuration conf = new Configuration();
61 | input = ClassLoader.getSystemClassLoader().getResource("test.cram").getFile();
62 | reference = ClassLoader.getSystemClassLoader().getResource("auxf.fa").toURI().toString();
63 | String referenceIndex = ClassLoader.getSystemClassLoader().getResource("auxf.fa.fai")
64 | .toURI().toString();
65 | conf.set("mapred.input.dir", "file://" + input);
66 |
67 | URI hdfsRef = clusterUri.resolve("/tmp/auxf.fa");
68 | URI hdfsRefIndex = clusterUri.resolve("/tmp/auxf.fa.fai");
69 | Files.copy(Paths.get(URI.create(reference)), Paths.get(hdfsRef));
70 | Files.copy(Paths.get(URI.create(referenceIndex)), Paths.get(hdfsRefIndex));
71 |
72 | conf.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY, hdfsRef.toString());
73 |
74 |
75 | taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class));
76 | jobContext = new JobContextImpl(conf, taskAttemptContext.getJobID());
77 |
78 | }
79 |
80 | private static MiniDFSCluster startMini(String testName) throws IOException {
81 | File baseDir = new File("./target/hdfs/" + testName).getAbsoluteFile();
82 | FileUtil.fullyDelete(baseDir);
83 | Configuration conf = new Configuration();
84 | conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath());
85 | MiniDFSCluster.Builder builder = new MiniDFSCluster.Builder(conf);
86 | MiniDFSCluster hdfsCluster = builder.clusterId(testName).build();
87 | hdfsCluster.waitActive();
88 | return hdfsCluster;
89 | }
90 |
91 | protected static URI formalizeClusterURI(URI clusterUri) throws URISyntaxException {
92 | if (clusterUri.getPath()==null) {
93 | return new URI(clusterUri.getScheme(), null,
94 | clusterUri.getHost(), clusterUri.getPort(),
95 | "/", null, null);
96 | } else if (clusterUri.getPath().trim().isEmpty()) {
97 | return new URI(clusterUri.getScheme(), null,
98 | clusterUri.getHost(), clusterUri.getPort(),
99 | "/", null, null);
100 | }
101 | return clusterUri;
102 | }
103 |
104 | @Test
105 | public void testReader() throws Exception {
106 | int expectedCount = 0;
107 | SamReader samReader = SamReaderFactory.makeDefault()
108 | .referenceSequence(new File(URI.create(reference))).open(new File(input));
109 | for (SAMRecord r : samReader) {
110 | expectedCount++;
111 | }
112 |
113 | CRAMInputFormat inputFormat = new CRAMInputFormat();
114 | List<InputSplit> splits = inputFormat.getSplits(jobContext);
115 | assertEquals(1, splits.size());
116 | RecordReader<LongWritable, SAMRecordWritable> reader = inputFormat
117 | .createRecordReader(splits.get(0), taskAttemptContext);
118 | reader.initialize(splits.get(0), taskAttemptContext);
119 |
120 | int actualCount = 0;
121 | while (reader.nextKeyValue()) {
122 | actualCount++;
123 | }
124 |
125 | assertEquals(expectedCount, actualCount);
126 | }
127 |
128 | }
129 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/CRAMRecordWriter.java:
--------------------------------------------------------------------------------
1 | package org.seqdoop.hadoop_bam;
2 |
3 | import java.io.*;
4 | import java.net.URI;
5 | import java.nio.file.Paths;
6 |
7 | import htsjdk.samtools.CRAMContainerStreamWriter;
8 | import htsjdk.samtools.SAMTextHeaderCodec;
9 | import htsjdk.samtools.cram.ref.ReferenceSource;
10 | import htsjdk.samtools.SAMFileHeader;
11 | import htsjdk.samtools.SAMRecord;
12 | import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
13 | import htsjdk.samtools.util.StringLineReader;
14 | import org.apache.hadoop.fs.Path;
15 | import org.apache.hadoop.mapreduce.RecordWriter;
16 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
17 |
18 | import org.seqdoop.hadoop_bam.util.NIOFileUtil;
19 | import org.seqdoop.hadoop_bam.util.SAMHeaderReader;
20 |
21 | /** A base {@link RecordWriter} for CRAM records.
22 | *
23 | * Handles the output stream, writing the header if requested, and provides
24 | * the {@link #writeAlignment} function for subclasses.
25 | * Note that each file created by this class consists of a fragment of a
26 | * complete CRAM file containing only one or more CRAM containers that do not
27 | * include a CRAM file header, a SAMFileHeader, or a CRAM EOF container.
28 | */
29 | public abstract class CRAMRecordWriter<K>
30 | extends RecordWriter<K,SAMRecordWritable>
31 | {
32 | // generic ID passed to CRAM code for internal error reporting
33 | private static final String HADOOP_BAM_PART_ID = "Hadoop-BAM-Part";
34 | private OutputStream origOutput;
35 | private CRAMContainerStreamWriter cramContainerStream = null;
36 | private ReferenceSource refSource = null;
37 | private boolean writeHeader = true;
38 |
39 | /** A SAMFileHeader is read from the input Path. */
40 | public CRAMRecordWriter(
41 | final Path output,
42 | final Path input,
43 | final boolean writeHeader,
44 | final TaskAttemptContext ctx) throws IOException
45 | {
46 | init(
47 | output,
48 | SAMHeaderReader.readSAMHeaderFrom(input, ctx.getConfiguration()),
49 | writeHeader, ctx);
50 | }
51 |
52 | public CRAMRecordWriter(
53 | final Path output, final SAMFileHeader header, final boolean writeHeader,
54 | final TaskAttemptContext ctx)
55 | throws IOException
56 | {
57 | init(
58 | output.getFileSystem(ctx.getConfiguration()).create(output),
59 | header, writeHeader, ctx);
60 | }
61 |
62 | // Working around not being able to call a constructor other than as the
63 | // first statement...
64 | private void init(
65 | final Path output, final SAMFileHeader header, final boolean writeHeader,
66 | final TaskAttemptContext ctx)
67 | throws IOException
68 | {
69 | init(
70 | output.getFileSystem(ctx.getConfiguration()).create(output),
71 | header, writeHeader, ctx);
72 | }
73 |
74 | private void init(
75 | final OutputStream output, final SAMFileHeader header, final boolean writeHeader,
76 | final TaskAttemptContext ctx)
77 | throws IOException
78 | {
79 | origOutput = output;
80 | this.writeHeader = writeHeader;
81 |
82 | final String referenceURI =
83 | ctx.getConfiguration().get(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY);
84 | refSource = new ReferenceSource(referenceURI == null ? null :
85 | NIOFileUtil.asPath(referenceURI));
86 |
87 | // A SAMFileHeader must be supplied at CRAMContainerStreamWriter creation time; if
88 | // we don't have one then delay creation until we do
89 | if (header != null) {
90 | cramContainerStream = new CRAMContainerStreamWriter(
91 | origOutput, null, refSource, header, HADOOP_BAM_PART_ID);
92 | if (writeHeader) {
93 | this.writeHeader(header);
94 | }
95 | }
96 | }
97 |
98 | @Override public void close(TaskAttemptContext ctx) throws IOException {
99 | cramContainerStream.finish(false); // Close, but suppress CRAM EOF container
100 | origOutput.close(); // And close the original output.
101 | }
102 |
103 | protected void writeAlignment(final SAMRecord rec) {
104 | if (null == cramContainerStream) {
105 | final SAMFileHeader header = rec.getHeader();
106 | if (header == null) {
107 | throw new RuntimeException("Cannot write record to CRAM: null header in SAM record");
108 | }
109 | if (writeHeader) {
110 | this.writeHeader(header);
111 | }
112 | cramContainerStream = new CRAMContainerStreamWriter(
113 | origOutput, null, refSource, header, HADOOP_BAM_PART_ID);
114 | }
115 | cramContainerStream.writeAlignment(rec);
116 | }
117 |
118 | private void writeHeader(final SAMFileHeader header) {
119 | cramContainerStream.writeHeader(header);
120 | }
121 | }
122 |
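The reference used for CRAM compression is taken from the job configuration; a minimal sketch, with an illustrative path (the HDFS test above sets the same property to an HDFS URI):

    // Point CRAM writing at a reference FASTA; the URI here is a placeholder.
    conf.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY, "file:///refs/ref.fa");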
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/LazyVCFGenotypesContext.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2013 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | package org.seqdoop.hadoop_bam;
22 |
23 | import java.io.UnsupportedEncodingException;
24 | import java.util.List;
25 |
26 | import htsjdk.tribble.readers.LineIterator;
27 | import htsjdk.variant.variantcontext.Allele;
28 | import htsjdk.variant.variantcontext.LazyGenotypesContext;
29 | import htsjdk.variant.vcf.AbstractVCFCodec;
30 | import htsjdk.variant.vcf.VCFHeader;
31 | import htsjdk.variant.vcf.VCFHeaderLine;
32 | import htsjdk.variant.vcf.VCFHeaderVersion;
33 |
34 | // File created: 2013-07-03 15:41:21
35 |
36 | // The actual parsing is delegated to AbstractVCFCodec.
37 | public class LazyVCFGenotypesContext extends LazyParsingGenotypesContext {
38 |
39 | /** Takes ownership of the given byte[]: don't modify its contents. */
40 | public LazyVCFGenotypesContext(
41 | List<Allele> alleles, String chrom, int start,
42 | byte[] utf8Unparsed, int count)
43 | {
44 | super(new Parser(alleles, chrom, start), utf8Unparsed, count);
45 | }
46 |
47 | public static class HeaderDataCache
48 | implements LazyParsingGenotypesContext.HeaderDataCache
49 | {
50 | private HeaderSettableVCFCodec codec = new HeaderSettableVCFCodec();
51 |
52 | @Override public void setHeader(VCFHeader header) {
53 | VCFHeaderVersion version = null;
54 |
55 | // Normally AbstractVCFCodec parses the header and thereby sets the
56 | // version field. It gets used later on so we need to set it.
57 | for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) {
58 | if (VCFHeaderVersion.isFormatString(line.getKey())) {
59 | version = VCFHeaderVersion.toHeaderVersion(line.getValue());
60 | break;
61 | }
62 | }
63 |
64 | codec.setHeaderAndVersion(header, version);
65 | }
66 |
67 | public AbstractVCFCodec getCodec() { return codec; }
68 | }
69 |
70 | public static class Parser extends LazyParsingGenotypesContext.Parser {
71 | private HeaderSettableVCFCodec codec = null;
72 | private final List<Allele> alleles;
73 | private final String chrom;
74 | private final int start;
75 |
76 | public Parser(List<Allele> alleles, String chrom, int start) {
77 | this.alleles = alleles;
78 | this.chrom = chrom;
79 | this.start = start;
80 | }
81 |
82 | @Override public void setHeaderDataCache(
83 | LazyParsingGenotypesContext.HeaderDataCache data)
84 | {
85 | codec = (HeaderSettableVCFCodec)((HeaderDataCache)data).getCodec();
86 | }
87 |
88 | @Override public LazyGenotypesContext.LazyData parse(final Object data) {
89 | if (codec == null || !codec.hasHeader())
90 | throw new IllegalStateException(
91 | "Cannot decode genotypes without a codec with a VCFHeader");
92 |
93 | final String str;
94 | try {
95 | str = new String((byte[])data, "UTF-8");
96 | } catch (UnsupportedEncodingException absurd) {
97 | throw new RuntimeException(
98 | "Can never happen on a compliant Java implementation because "+
99 | "UTF-8 is guaranteed to be supported");
100 | }
101 | return codec.createGenotypeMap(str, alleles, chrom, start);
102 | }
103 | }
104 | }
105 |
106 | // This is a HACK. But, the functionality is only in AbstractVCFCodec so it
107 | // can't be helped. This is preferable to copying the functionality into
108 | // parse() above.
109 | class HeaderSettableVCFCodec extends AbstractVCFCodec {
110 | public boolean hasHeader() { return header != null; }
111 |
112 | public void setHeaderAndVersion(VCFHeader header, VCFHeaderVersion ver) {
113 | this.header = header;
114 | this.version = ver;
115 | }
116 |
117 | @Override public Object readActualHeader(LineIterator reader) {
118 | throw new UnsupportedOperationException(
119 | "Internal error: this shouldn't be called");
120 | }
121 | @Override public List<String> parseFilters(String filterString) {
122 | throw new UnsupportedOperationException(
123 | "Internal error: this shouldn't be called");
124 | }
125 | @Override public boolean canDecode(String s) {
126 | return true;
127 | }
128 | }
129 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/ReferenceFragment.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2012 Aalto University
2 | //
3 | // This file is part of Hadoop-BAM.
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to
7 | // deal in the Software without restriction, including without limitation the
8 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
9 | // sell copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in
13 | // all copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 | // IN THE SOFTWARE.
22 |
23 | package org.seqdoop.hadoop_bam;
24 |
25 | import org.apache.hadoop.io.Text;
26 | import org.apache.hadoop.io.Writable;
27 | import org.apache.hadoop.io.WritableUtils;
28 |
29 | import java.io.IOException;
30 | import java.io.DataInput;
31 | import java.io.DataOutput;
32 |
33 | // partly based on SequencedFragment
34 | // note: this class is supposed to represent a single line of a fasta input file, augmented by chromosome/contig name and start position
35 |
36 | public class ReferenceFragment implements Writable
37 | {
38 | protected Text sequence = new Text();
39 |
40 | protected Integer position;
41 | protected String indexSequence;
42 |
43 | public void clear()
44 | {
45 | sequence.clear();
46 | indexSequence = null;
47 | position = null;
48 | }
49 |
50 | /**
51 | * Get sequence Text object.
52 | * Trade encapsulation for efficiency. Here we expose the internal Text
53 | * object so that data may be read and written directly from/to it.
54 | *
55 | * Sequence should always be written using CAPITAL letters and 'N' for unknown bases.
56 | */
57 | public Text getSequence() { return sequence; }
58 |
59 | /**
60 | * Set the position of the first base of this line within its
61 | * chromosome/contig, as taken from the FASTA input.
62 | *
63 | * A null position is not allowed.
64 | */
65 | public void setPosition(Integer pos) {
66 | if (pos == null)
67 | throw new IllegalArgumentException("can't have null reference position");
68 | position = pos;
69 | }
70 |
71 | public void setIndexSequence(String v) {
72 | if (v == null)
73 | throw new IllegalArgumentException("can't have null index sequence");
74 | indexSequence = v;
75 | }
76 |
77 | public void setSequence(Text seq)
78 | {
79 | if (seq == null)
80 | throw new IllegalArgumentException("can't have a null sequence");
81 | sequence = seq;
82 | }
83 |
84 | public Integer getPosition() { return position; }
85 | public String getIndexSequence() { return indexSequence; }
86 |
87 | /**
88 | * Recreates a pseudo fasta record with the fields available.
89 | */
90 | public String toString()
91 | {
92 | String delim = "\t";
93 | StringBuilder builder = new StringBuilder(800);
94 | builder.append(indexSequence).append(delim);
95 | builder.append(position).append(delim);
96 | builder.append(sequence);
97 | return builder.toString();
98 | }
99 |
100 | public boolean equals(Object other)
101 | {
102 | if (other != null && other instanceof ReferenceFragment)
103 | {
104 | ReferenceFragment otherFrag = (ReferenceFragment)other;
105 |
106 | if (position == null && otherFrag.position != null || position != null && !position.equals(otherFrag.position))
107 | return false;
108 | if (indexSequence == null && otherFrag.indexSequence != null || indexSequence != null && !indexSequence.equals(otherFrag.indexSequence))
109 | return false;
110 | // sequence can't be null
111 | if (!sequence.equals(otherFrag.sequence))
112 | return false;
113 |
114 | return true;
115 | }
116 | else
117 | return false;
118 | }
119 |
120 | @Override
121 | public int hashCode() {
122 | int result = sequence.hashCode();
123 | result = 31 * result + (position != null ? position.hashCode() : 0);
124 | result = 31 * result + (indexSequence != null ? indexSequence.hashCode() : 0);
125 | return result;
126 | }
127 |
128 | public void readFields(DataInput in) throws IOException
129 | {
130 | // serialization order:
131 | // 1) sequence
132 | // 2) indexSequence (chromosome/contig name)
133 | // 3) position of first base in this line of the fasta file
134 |
135 | this.clear();
136 |
137 | sequence.readFields(in);
138 |
139 | indexSequence = WritableUtils.readString(in);
140 | position = WritableUtils.readVInt(in);
141 | }
142 |
143 | public void write(DataOutput out) throws IOException
144 | {
145 | sequence.write(out);
146 |
147 | WritableUtils.writeString(out, indexSequence);
148 | WritableUtils.writeVInt(out, position);
149 |
150 | }
151 | }
152 |
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/util/SAMOutputPreparer.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2012 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2012-07-26 14:36:03
22 |
23 | package org.seqdoop.hadoop_bam.util;
24 |
25 | import java.io.File;
26 | import java.io.FilterOutputStream;
27 | import java.io.IOException;
28 | import java.io.OutputStream;
29 | import java.io.OutputStreamWriter;
30 | import java.io.StringWriter;
31 | import java.nio.ByteBuffer;
32 | import java.nio.ByteOrder;
33 | import java.util.List;
34 |
35 | import htsjdk.samtools.SAMFileHeader;
36 | import htsjdk.samtools.SAMSequenceRecord;
37 | import htsjdk.samtools.SAMTextHeaderCodec;
38 | import htsjdk.samtools.cram.build.CramIO;
39 | import htsjdk.samtools.cram.common.CramVersions;
40 | import htsjdk.samtools.util.BlockCompressedOutputStream;
41 |
42 | import org.seqdoop.hadoop_bam.SAMFormat;
43 |
44 | public class SAMOutputPreparer {
45 | private ByteBuffer buf;
46 |
47 | public SAMOutputPreparer() {
48 | // Enough room for a 32-bit integer.
49 | buf = ByteBuffer.wrap(new byte[4]);
50 | buf.order(ByteOrder.LITTLE_ENDIAN);
51 | }
52 |
53 | public static final byte[] BAM_MAGIC = {'B','A','M', 1};
54 |
55 | /** Prepares the given output stream for writing of SAMRecords in the given
56 | * format. This includes writing the given SAM header and, in the case of
57 | * BAM or CRAM, writing some further metadata as well as compressing everything
58 | * written. Returns a new stream to replace the original: it will do the
59 | * appropriate compression for BAM/CRAM files.
60 | */
61 | public OutputStream prepareForRecords(
62 | OutputStream out, final SAMFormat format,
63 | final SAMFileHeader header)
64 | throws IOException {
65 |
66 | switch (format) {
67 | case SAM:
68 | out = prepareSAMOrBAMStream(out, format, header);
69 | break;
70 | case BAM:
71 | out = prepareSAMOrBAMStream(out, format, header);
72 | break;
73 | case CRAM:
74 | out = prepareCRAMStream(out, format, header);
75 | break;
76 | default:
77 | throw new IllegalArgumentException
78 | ("Unsupported SAM file format, must be one of SAM, BAM or CRAM");
79 | }
80 |
81 | // Important for BAM: if the caller doesn't want to use the new stream
82 | // for some reason, the BlockCompressedOutputStream's buffer would never
83 | // be flushed.
84 | out.flush();
85 | return out;
86 | }
87 |
88 | private OutputStream prepareCRAMStream(
89 | OutputStream out, final SAMFormat format,
90 | final SAMFileHeader header) throws IOException
91 | {
92 | CramIO.writeHeader(CramVersions.DEFAULT_CRAM_VERSION, out, header, null);
93 | return out;
94 | }
95 |
96 | private OutputStream prepareSAMOrBAMStream(
97 | OutputStream out, final SAMFormat format,
98 | final SAMFileHeader header) throws IOException
99 | {
100 | final StringWriter sw = new StringWriter();
101 | new SAMTextHeaderCodec().encode(sw, header);
102 | final String text = sw.toString();
103 |
104 | if (format == SAMFormat.BAM) {
105 | out = new BlockCompressedOutputStream(out, (File) null);
106 | out.write(BAM_MAGIC);
107 | writeInt32(out, text.length());
108 | }
109 |
110 | writeString(out, text);
111 |
112 | if (format == SAMFormat.BAM) {
113 | final List<SAMSequenceRecord> refs =
114 | header.getSequenceDictionary().getSequences();
115 |
116 | writeInt32(out, refs.size());
117 |
118 | for (final SAMSequenceRecord ref : refs) {
119 | final String name = ref.getSequenceName();
120 | writeInt32(out, name.length() + 1);
121 | writeString(out, name);
122 | out.write(0);
123 | writeInt32(out, ref.getSequenceLength());
124 | }
125 | }
126 |
127 | return out;
128 | }
129 |
130 | private static void writeString(final OutputStream out, final String s)
131 | throws IOException
132 | {
133 | // Don't flush the underlying stream yet, only the writer: in the case of
134 | // BAM, we might be able to cram more things into the gzip block still.
135 | final OutputStreamWriter w = new OutputStreamWriter(
136 | new FilterOutputStream(out) { @Override public void flush() {} } );
137 | w.write(s);
138 | w.flush();
139 | }
140 |
141 | private void writeInt32(final OutputStream out, int n) throws IOException {
142 | buf.putInt(0, n);
143 | out.write(buf.array());
144 | }
145 | }
146 |
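A minimal sketch of how the prepared stream replaces the original, assuming rawOut and header already exist:

    OutputStream out = new SAMOutputPreparer().prepareForRecords(rawOut, SAMFormat.BAM, header);
    // All subsequent record bytes must go to 'out': for BAM it is a
    // BlockCompressedOutputStream wrapper, so writing to rawOut directly
    // afterwards would bypass the compression.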
--------------------------------------------------------------------------------
/src/main/java/org/seqdoop/hadoop_bam/SplittingBAMIndex.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2010 Aalto University
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to
5 | // deal in the Software without restriction, including without limitation the
6 | // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | // sell copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | // IN THE SOFTWARE.
20 |
21 | // File created: 2010-08-04 13:11:10
22 |
23 | package org.seqdoop.hadoop_bam;
24 |
25 | import java.io.BufferedInputStream;
26 | import java.io.File;
27 | import java.io.FileInputStream;
28 | import java.io.InputStream;
29 | import java.io.IOException;
30 | import java.nio.ByteBuffer;
31 | import java.util.ArrayList;
32 | import java.util.List;
33 | import java.util.NavigableSet;
34 | import java.util.TreeSet;
35 |
36 | /** An index into BAM files, for {@link BAMInputFormat}. Reads files that are
37 | * created by {@link SplittingBAMIndexer}.
38 | *
39 | * Indexes the positions of individual BAM records in the file.
40 | */
41 | public final class SplittingBAMIndex {
42 | private final NavigableSet<Long> virtualOffsets = new TreeSet<Long>();
43 |
44 | public SplittingBAMIndex() {}
45 | public SplittingBAMIndex(final File path) throws IOException {
46 | this(new BufferedInputStream(new FileInputStream(path)));
47 | }
48 | public SplittingBAMIndex(final InputStream in) throws IOException {
49 | readIndex(in);
50 | }
51 |
52 | public void readIndex(final InputStream in) throws IOException {
53 | virtualOffsets.clear();
54 |
55 | final ByteBuffer bb = ByteBuffer.allocate(8);
56 |
57 | for (long prev = -1; in.read(bb.array()) == 8;) {
58 | final long cur = bb.getLong(0);
59 | if (prev > cur)
60 | throw new IOException(String.format(
61 | "Invalid splitting BAM index; offsets not in order: %#x > %#x",
62 | prev, cur));
63 |
64 | virtualOffsets.add(prev = cur);
65 | }
66 | in.close();
67 |
68 | if (virtualOffsets.size() < 1)
69 | throw new IOException(
70 | "Invalid splitting BAM index: "+
71 | "should contain at least the file size");
72 | }
73 |
74 | public List<Long> getVirtualOffsets() {
75 | return new ArrayList<>(virtualOffsets);
76 | }
77 |
78 | public Long prevAlignment(final long filePos) {
79 | return virtualOffsets.floor(filePos << 16);
80 | }
81 | public Long nextAlignment(final long filePos) {
82 | return virtualOffsets.higher(filePos << 16);
83 | }
84 |
85 | public int size() { return virtualOffsets.size(); }
86 |
87 | private long first() { return virtualOffsets.first(); }
88 | private long last() { return prevAlignment(bamSize() - 1); }
89 | long bamSize() { return virtualOffsets.last() >>> 16; }
90 |
91 | @Override
92 | public boolean equals(Object o) {
93 | if (this == o) return true;
94 | if (o == null || getClass() != o.getClass()) return false;
95 |
96 | SplittingBAMIndex that = (SplittingBAMIndex) o;
97 |
98 | return virtualOffsets != null ? virtualOffsets.equals(that.virtualOffsets) : that
99 | .virtualOffsets == null;
100 |
101 | }
102 |
103 | @Override
104 | public int hashCode() {
105 | return virtualOffsets != null ? virtualOffsets.hashCode() : 0;
106 | }
107 |
108 | @Override
109 | public String toString() {
110 | return virtualOffsets.toString();
111 | }
112 |
113 | /** Writes some statistics about each splitting BAM index file given as an
114 | * argument.
115 | */
116 | public static void main(String[] args) {
117 | if (args.length == 0) {
118 | System.out.println(
119 | "Usage: SplittingBAMIndex [splitting BAM indices...]\n\n"+
120 |
121 | "Writes a few statistics about each splitting BAM index.");
122 | return;
123 | }
124 |
125 | for (String arg : args) {
126 | final File f = new File(arg);
127 | if (f.isFile() && f.canRead()) {
128 | try {
129 | System.err.printf("%s:\n", f);
130 | final SplittingBAMIndex bi = new SplittingBAMIndex(f);
131 | if (bi.size() == 1) {
132 | System.err.printf("\t0 alignments\n" +
133 | "\tassociated BAM file size %d\n", bi.bamSize());
134 | } else {
135 | final long first = bi.first();
136 | final long last = bi.last();
137 | System.err.printf(
138 | "\t%d alignments\n" +
139 | "\tfirst is at %#06x in BGZF block at %#014x\n" +
140 | "\tlast is at %#06x in BGZF block at %#014x\n" +
141 | "\tassociated BAM file size %d\n",
142 | bi.size(),
143 | first & 0xffff, first >>> 16,
144 | last & 0xffff, last >>> 16,
145 | bi.bamSize());
146 | }
147 | } catch (IOException e) {
148 | System.err.printf("Failed to read %s!\n", f);
149 | e.printStackTrace();
150 | }
151 | } else
152 | System.err.printf("%s does not look like a readable file!\n", f);
153 | }
154 | }
155 | }
156 |
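A hedged sketch of clamping a raw byte range to record boundaries with this index; the index path and offsets are illustrative:

    SplittingBAMIndex idx = new SplittingBAMIndex(new File("input.bam.splitting-bai")); // hypothetical path
    long rangeStart = 0L, rangeEnd = 64L * 1024 * 1024;  // raw byte offsets into the BAM file
    Long vStart = idx.nextAlignment(rangeStart);          // first recorded virtual offset past rangeStart
    Long vEnd   = idx.nextAlignment(rangeEnd);            // first recorded virtual offset past rangeEnd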
--------------------------------------------------------------------------------