├── .github └── workflows │ ├── ci.yml │ └── deploy.yml ├── .gitignore ├── CHANGES.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── KEYS ├── LICENSE.txt ├── LICENSE_header.txt ├── README.md ├── SUPPORT.md ├── adam-apis ├── pom.xml └── src │ ├── main │ └── scala │ │ └── org │ │ └── bdgenomics │ │ └── adam │ │ └── api │ │ ├── java │ │ ├── GenomicDatasetConverters.scala │ │ ├── GenomicRDDConverters.scala │ │ └── JavaADAMContext.scala │ │ └── python │ │ └── DataFrameConversionWrapper.scala │ └── test │ ├── java │ └── org │ │ └── bdgenomics │ │ └── adam │ │ └── api │ │ └── java │ │ ├── JavaADAMCoverageConduit.java │ │ ├── JavaADAMFeatureConduit.java │ │ ├── JavaADAMFragmentConduit.java │ │ ├── JavaADAMGenotypeConduit.java │ │ ├── JavaADAMReadConduit.java │ │ ├── JavaADAMSequenceConduit.java │ │ ├── JavaADAMSliceConduit.java │ │ └── JavaADAMVariantConduit.java │ ├── resources │ └── indexed_bams │ │ ├── sorted.bam │ │ └── sorted.bam.bai │ └── scala │ └── org │ └── bdgenomics │ └── adam │ └── api │ └── java │ └── JavaADAMContextSuite.scala ├── adam-assembly ├── pom.xml └── src │ └── main │ └── scala │ └── org │ └── bdgenomics │ └── adam │ └── assembly │ └── Assembly.scala ├── adam-cli ├── .gitignore ├── pom.xml └── src │ ├── main │ ├── java-templates │ │ └── org │ │ │ └── bdgenomics │ │ │ └── adam │ │ │ └── cli │ │ │ └── About.java │ └── scala │ │ └── org │ │ └── bdgenomics │ │ └── adam │ │ └── cli │ │ ├── ADAM2Fastq.scala │ │ ├── ADAMMain.scala │ │ ├── CountReadKmers.scala │ │ ├── CountSliceKmers.scala │ │ ├── Coverage.scala │ │ ├── CramArgs.scala │ │ ├── FileSystemUtils.scala │ │ ├── FlagStat.scala │ │ ├── MergeShards.scala │ │ ├── PrintADAM.scala │ │ ├── TransformAlignments.scala │ │ ├── TransformFeatures.scala │ │ ├── TransformFragments.scala │ │ ├── TransformGenotypes.scala │ │ ├── TransformSequences.scala │ │ ├── TransformSlices.scala │ │ ├── TransformVariants.scala │ │ └── View.scala │ └── test │ ├── resources │ ├── artificial.counts.txt │ ├── artificial.fa │ ├── 
artificial.fa.fai │ ├── bqsr1-r1.fq │ ├── bqsr1-r2.fq │ ├── bqsr1.sam │ ├── chr5.phyloP46way.trunc.wigFix │ ├── contigs.fa │ ├── flag-values.sam │ ├── gencode.v7.annotation.trunc10.bed │ ├── log4j.properties │ ├── small.vcf │ ├── sorted.bam │ ├── sorted.bam.bai │ ├── sorted.counts.txt │ ├── sorted.lex.vcf │ └── sorted.vcf │ └── scala │ └── org │ └── bdgenomics │ └── adam │ └── cli │ ├── ADAM2FastqSuite.scala │ ├── ADAMMainSuite.scala │ ├── AboutSuite.scala │ ├── CountReadKmersSuite.scala │ ├── CountSliceKmersSuite.scala │ ├── CoverageSuite.scala │ ├── MergeShardsSuite.scala │ ├── ParquetLister.scala │ ├── TransformAlignmentsSuite.scala │ ├── TransformFeaturesSuite.scala │ ├── TransformFragmentsSuite.scala │ ├── TransformGenotypesSuite.scala │ ├── TransformVariantsSuite.scala │ └── ViewSuite.scala ├── adam-codegen ├── pom.xml └── src │ └── main │ └── scala │ └── org │ └── bdgenomics │ └── adam │ └── codegen │ ├── DumpSchemasToProduct.scala │ ├── DumpSchemasToProjectionEnums.scala │ ├── Generator.scala │ └── ReflectSchema.scala ├── adam-core ├── .gitignore ├── pom.xml └── src │ ├── main │ ├── java │ │ └── org │ │ │ └── bdgenomics │ │ │ └── adam │ │ │ └── io │ │ │ ├── FastqInputFormat.java │ │ │ ├── FastqRecordReader.java │ │ │ ├── InterleavedFastqInputFormat.java │ │ │ ├── ResettableCompressedSplitLineReader.java │ │ │ └── SingleFastqInputFormat.java │ └── scala │ │ └── org │ │ └── bdgenomics │ │ └── adam │ │ ├── algorithms │ │ ├── consensus │ │ │ ├── Consensus.scala │ │ │ ├── ConsensusGenerator.scala │ │ │ ├── ConsensusGeneratorFromKnowns.scala │ │ │ ├── ConsensusGeneratorFromReads.scala │ │ │ ├── ConsensusGeneratorFromSmithWaterman.scala │ │ │ ├── NormalizationUtils.scala │ │ │ └── UnionConsensusGenerator.scala │ │ └── smithwaterman │ │ │ ├── SmithWaterman.scala │ │ │ ├── SmithWatermanConstantGapScoring.scala │ │ │ └── SmithWatermanGapScoringFromFn.scala │ │ ├── converters │ │ ├── AlignmentConverter.scala │ │ ├── DefaultHeaderLines.scala │ │ ├── 
FastaConverters.scala │ │ ├── FastqRecordConverter.scala │ │ ├── FragmentConverter.scala │ │ ├── TranscriptEffectConverter.scala │ │ └── VariantContextConverter.scala │ │ ├── ds │ │ ├── ADAMContext.scala │ │ ├── ADAMParquetInputFormat.scala │ │ ├── ADAMSaveAnyArgs.scala │ │ ├── GenomeBins.scala │ │ ├── GenomicBroadcast.scala │ │ ├── GenomicDataset.scala │ │ ├── GenomicDatasetConversion.scala │ │ ├── GenomicPartitioners.scala │ │ ├── InFormatter.scala │ │ ├── OutFormatter.scala │ │ ├── ReferencePartitioner.scala │ │ ├── RegionJoin.scala │ │ ├── SAMHeaderWriter.scala │ │ ├── ShuffleRegionJoin.scala │ │ ├── TreeRegionJoin.scala │ │ ├── VCFHeaderUtils.scala │ │ ├── feature │ │ │ ├── BEDInFormatter.scala │ │ │ ├── BEDOutFormatter.scala │ │ │ ├── CoverageDataset.scala │ │ │ ├── FeatureDataset.scala │ │ │ ├── FeatureParser.scala │ │ │ ├── Features.scala │ │ │ ├── GFF3HeaderWriter.scala │ │ │ ├── GFF3InFormatter.scala │ │ │ ├── GFF3OutFormatter.scala │ │ │ ├── GTFInFormatter.scala │ │ │ ├── GTFOutFormatter.scala │ │ │ ├── NarrowPeakInFormatter.scala │ │ │ └── NarrowPeakOutFormatter.scala │ │ ├── fragment │ │ │ ├── FragmentDataset.scala │ │ │ ├── InterleavedFASTQInFormatter.scala │ │ │ ├── Tab5InFormatter.scala │ │ │ └── Tab6InFormatter.scala │ │ ├── read │ │ │ ├── ADAMBAMOutputFormat.scala │ │ │ ├── ADAMCRAMOutputFormat.scala │ │ │ ├── ADAMSAMOutputFormat.scala │ │ │ ├── AlignmentDataset.scala │ │ │ ├── AnySAMInFormatter.scala │ │ │ ├── AnySAMOutFormatter.scala │ │ │ ├── BAMInFormatter.scala │ │ │ ├── BinQualities.scala │ │ │ ├── FASTQInFormatter.scala │ │ │ ├── FlagStat.scala │ │ │ ├── MDTagging.scala │ │ │ ├── MarkDuplicates.scala │ │ │ ├── ReadDataset.scala │ │ │ ├── ReferencePositionPair.scala │ │ │ ├── RepairPartitions.scala │ │ │ ├── SAMInFormatter.scala │ │ │ ├── SingleReadBucket.scala │ │ │ ├── realignment │ │ │ │ ├── IndelRealignmentTarget.scala │ │ │ │ ├── ModPartitioner.scala │ │ │ │ ├── RealignIndels.scala │ │ │ │ └── RealignmentTargetFinder.scala │ │ │ └── 
recalibration │ │ │ │ ├── Aggregate.scala │ │ │ │ ├── BaseQualityRecalibration.scala │ │ │ │ ├── Covariate.scala │ │ │ │ ├── CovariateKey.scala │ │ │ │ ├── CovariateSpace.scala │ │ │ │ ├── CycleCovariate.scala │ │ │ │ ├── DinucCovariate.scala │ │ │ │ ├── Observation.scala │ │ │ │ ├── ObservationTable.scala │ │ │ │ ├── RecalibrationTable.scala │ │ │ │ └── Recalibrator.scala │ │ ├── sequence │ │ │ ├── FASTAInFormatter.scala │ │ │ ├── FlankSlices.scala │ │ │ ├── SequenceDataset.scala │ │ │ └── SliceDataset.scala │ │ └── variant │ │ │ ├── ADAMVCFOutputFormat.scala │ │ │ ├── GenotypeDataset.scala │ │ │ ├── VCFInFormatter.scala │ │ │ ├── VCFOutFormatter.scala │ │ │ ├── VariantContextDataset.scala │ │ │ └── VariantDataset.scala │ │ ├── models │ │ ├── Alphabet.scala │ │ ├── Attribute.scala │ │ ├── Coverage.scala │ │ ├── IndelTable.scala │ │ ├── MdTag.scala │ │ ├── NonoverlappingRegions.scala │ │ ├── ReadGroupDictionary.scala │ │ ├── ReferencePosition.scala │ │ ├── ReferenceRegion.scala │ │ ├── SAMFileHeaderWritable.scala │ │ ├── SequenceDictionary.scala │ │ ├── SnpTable.scala │ │ ├── VCFHeaderWritable.scala │ │ └── VariantContext.scala │ │ ├── projections │ │ ├── FieldEnumeration.scala │ │ └── Projection.scala │ │ ├── rich │ │ ├── RichAlignment.scala │ │ ├── RichCigar.scala │ │ └── RichVariant.scala │ │ ├── serialization │ │ └── ADAMKryoRegistrator.scala │ │ ├── sql │ │ └── VariantContext.scala │ │ └── util │ │ ├── ADAMShell.scala │ │ ├── ASCIITable.scala │ │ ├── AttributeUtils.scala │ │ ├── FileExtensions.scala │ │ ├── FileMerger.scala │ │ ├── GenomeFileReader.scala │ │ ├── IndexedFastaFile.scala │ │ ├── ManualRegionPartitioner.scala │ │ ├── ParallelFileMerger.scala │ │ ├── ParquetFileTraversable.scala │ │ ├── ParquetLogger.scala │ │ ├── PhredUtils.scala │ │ ├── ReferenceFile.scala │ │ ├── ReferenceMap.scala │ │ ├── SequenceDictionaryReader.scala │ │ ├── TextAlignment.scala │ │ ├── TextRddWriter.scala │ │ └── TwoBitFile.scala │ └── test │ ├── resources │ ├── 
HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_all.fixed-phase-set.excerpt.vcf │ ├── HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_all.fixed-phase-set.excerpt.vcf.README │ ├── HLA_DQB1_05_01_01_02.dict │ ├── HLA_DQB1_05_01_01_02.fa │ ├── HLA_DQB1_05_01_01_02.fa.fai │ ├── Homo_sapiens.GRCh37.75.trun100.gtf │ ├── Homo_sapiens.GRCh37.75.trun20.gtf │ ├── Hs_Ensembl_example_genes.gtf │ ├── NA12878.1_854950_855150.sam │ ├── NA12878.1_922305.G_GC_hom.sam │ ├── NA12878.chr22.tiny.freebayes.vcf │ ├── NA12878.sam │ ├── SeqCap_EZ_Exome_v3.hg19.interval_list │ ├── artificial.README.txt │ ├── artificial.cram │ ├── artificial.fa │ ├── artificial.fa.fai │ ├── artificial.realigned.sam │ ├── artificial.sam │ ├── badheader.sam │ ├── bams │ │ └── small.bam │ ├── bqsr1-r1.fq │ ├── bqsr1-r2.fq │ ├── bqsr1-ref.observed │ ├── bqsr1.sam │ ├── bqsr1.snps │ ├── bqsr1.vcf │ ├── bqsr1.vcf.tbi │ ├── chr20.250k.fa.gz │ ├── chromInfo.txt │ ├── combined_2018-05-18.9900-10050.fastq │ ├── ctg123.fasta.gff3 │ ├── dict_with_accession.dict │ ├── dvl1.200.bed │ ├── dvl1.200.gff3 │ ├── dvl1.200.gtf │ ├── env_test_command.sh │ ├── example_intervals.list │ ├── fastq_nobases.fq │ ├── fastq_noqual.fq │ ├── fastq_sample1.fq │ ├── fastq_sample1.fq.bgz │ ├── fastq_sample1.fq.bz2 │ ├── fastq_sample1.fq.gz │ ├── fastq_sample2.fq │ ├── fastq_sample3.fq │ ├── fastq_sample4.fq │ ├── fastq_to_usam.py │ ├── gencode.chr20.transcript_names.head10.txt │ ├── gencode.v19.annotation.chr20.250k.gtf │ ├── gencode.v19.pc_transcripts.250k.fa.gz │ ├── gencode.v7.annotation.trunc10.bed │ ├── gvcf_dir │ │ ├── gvcf_multiallelic.g.vcf │ │ └── gvcf_multiallelic_noPLs.g.vcf │ ├── gvcf_multiallelic │ │ └── multiallelic.vcf │ ├── hg19.chrM.2bit │ ├── hg19.genome │ ├── hg19.genome.txt │ ├── hs37d5.dict │ ├── hs38DH_chr1_10.fa │ ├── human_g1k_v37_chr1_59kb.2bit │ ├── human_g1k_v37_chr1_59kb.fasta │ ├── improper_pairs_1.fq │ ├── improper_pairs_2.fq │ ├── indexed_bams │ │ ├── 
sorted.2.bai │ │ ├── sorted.2.bam │ │ ├── sorted.bam │ │ └── sorted.bam.bai │ ├── inf_float_values.vcf │ ├── interleaved_fastq_sample1.ifq │ ├── interleaved_fastq_sample1.ifq.bgz │ ├── interleaved_fastq_sample1.ifq.bz2 │ ├── interleaved_fastq_sample1.ifq.gz │ ├── interleaved_fastq_sample1.ifq.output │ ├── interleaved_fastq_sample2.ifq │ ├── interleaved_fastq_sample2.ifq.output │ ├── interleaved_fastq_sample3.ifq │ ├── interleaved_fastq_sample3.ifq.output │ ├── interleaved_fastq_sample4.ifq │ ├── interleaved_fastq_sample4.ifq.output │ ├── interleaved_fastq_sample5.ifq │ ├── interleaved_fastq_sample5.ifq.output │ ├── invalid │ │ ├── small.INFO_flag.vcf │ │ └── truth_small_variants.vcf │ ├── legacy.fa │ ├── log4j.properties │ ├── multi_chr.sam │ ├── multiline_fastq.fq │ ├── nan_float_values.vcf │ ├── ordered.sam │ ├── proper_pairs_1.fq │ ├── proper_pairs_2.fq │ ├── queryname.sam │ ├── random.vcf │ ├── read_names_with_index_sequences_interleaved.fq │ ├── read_names_with_index_sequences_pair1.fq │ ├── read_names_with_index_sequences_pair2.fq │ ├── readname_sorted.sam │ ├── reads-0-2-0 │ ├── reads12.sam │ ├── reads12_diff1.sam │ ├── reads13.sam │ ├── reads21.sam │ ├── sample1.query.sam │ ├── sample1.queryname.sam │ ├── sample_coverage.bed │ ├── single_fastq_sample1.fq.output │ ├── single_fastq_sample2.fq.output │ ├── single_fastq_sample3.fq.output │ ├── single_fastq_sample4.fq.output │ ├── small.1.bed │ ├── small.1.narrowPeak │ ├── small.1.sam │ ├── small.1_12.bed │ ├── small.addctg.vcf │ ├── small.badheader.sam │ ├── small.sam │ ├── small.vcf │ ├── small_missing.vcf │ ├── small_realignment_targets.intervals │ ├── small_realignment_targets.pileup │ ├── small_realignment_targets.sam │ ├── small_realignment_targets_README.txt │ ├── small_snpeff.vcf │ ├── sorted-variants.lex.vcf │ ├── sorted-variants.vcf │ ├── sorted.lex.vcf │ ├── sorted.sam │ ├── sorted.vcf │ ├── tab5_to_usam.py │ ├── tab6_to_usam.py │ ├── tag.sam │ ├── tags.sam │ ├── test.compressed.bcf │ ├── test.conf │ 
├── test.uncompressed.bcf │ ├── test.vcf │ ├── test.vcf.bgz │ ├── test.vcf.bgzf.gz │ ├── test.vcf.gz │ ├── test_command.sh │ ├── test_rowgroup_rangeindex.1.txt │ ├── timeout.py │ ├── trinity.fa │ ├── unmapped.sam │ ├── unordered.sam │ ├── unsorted.sam │ ├── vcf_dir │ │ ├── 1.vcf │ │ ├── 2.vcf │ │ ├── 3.vcf │ │ └── zero.vcf │ ├── wgEncodeOpenChromDnaseGm19238Pk.trunc10.narrowPeak │ └── wgs_calling_regions.hg38.interval_list │ └── scala │ └── org │ └── bdgenomics │ └── adam │ ├── algorithms │ ├── consensus │ │ ├── ConsensusGeneratorFromKnownsSuite.scala │ │ ├── ConsensusGeneratorFromReadsSuite.scala │ │ ├── ConsensusSuite.scala │ │ └── NormalizationUtilsSuite.scala │ └── smithwaterman │ │ └── SmithWatermanSuite.scala │ ├── converters │ ├── AlignmentConverterSuite.scala │ ├── FastqRecordConverterSuite.scala │ ├── FragmentConverterSuite.scala │ ├── TranscriptEffectConverterSuite.scala │ └── VariantContextConverterSuite.scala │ ├── ds │ ├── ADAMContextSuite.scala │ ├── GenomicDatasetSuite.scala │ ├── GenomicPositionPartitionerSuite.scala │ ├── InnerShuffleRegionJoinSuite.scala │ ├── InnerTreeRegionJoinSuite.scala │ ├── LeftOuterShuffleRegionJoinAndGroupByLeftSuite.scala │ ├── LeftOuterShuffleRegionJoinSuite.scala │ ├── OuterRegionJoinSuite.scala │ ├── RightOuterTreeRegionJoinSuite.scala │ ├── SortedGenomicDatasetSuite.scala │ ├── TreeRegionJoinSuite.scala │ ├── feature │ │ ├── CoverageDatasetSuite.scala │ │ ├── FeatureDatasetSuite.scala │ │ └── GFF3HeaderWriterSuite.scala │ ├── fragment │ │ └── FragmentDatasetSuite.scala │ ├── read │ │ ├── AlignmentDatasetSuite.scala │ │ ├── BinQualitiesSuite.scala │ │ ├── FlagStatSuite.scala │ │ ├── MDTaggingSuite.scala │ │ ├── MarkDuplicatesSuite.scala │ │ ├── ReadDatasetSuite.scala │ │ ├── RepairPartitionsSuite.scala │ │ ├── SingleReadBucketSuite.scala │ │ ├── realignment │ │ │ ├── IndelRealignmentTargetSuite.scala │ │ │ ├── ModPartitionerSuite.scala │ │ │ └── RealignIndelsSuite.scala │ │ └── recalibration │ │ │ ├── 
BaseQualityRecalibrationSuite.scala │ │ │ ├── CycleCovariateSuite.scala │ │ │ ├── DinucCovariateSuite.scala │ │ │ ├── RecalibrationTableSuite.scala │ │ │ └── RecalibratorSuite.scala │ ├── sequence │ │ ├── FlankSlicesSuite.scala │ │ ├── SequenceDatasetSuite.scala │ │ └── SliceDatasetSuite.scala │ └── variant │ │ ├── GenotypeDatasetSuite.scala │ │ ├── VariantContextDatasetSuite.scala │ │ └── VariantDatasetSuite.scala │ ├── io │ ├── InterleavedFastqInputFormatSuite.scala │ └── SingleFastqInputFormatSuite.scala │ ├── models │ ├── AlphabetSuite.scala │ ├── CoverageSuite.scala │ ├── IndelTableSuite.scala │ ├── MdTagSuite.scala │ ├── NonoverlappingRegionsSuite.scala │ ├── ReadGroupDictionarySuite.scala │ ├── ReferencePositionSuite.scala │ ├── ReferenceRegionSuite.scala │ ├── SequenceDictionarySuite.scala │ └── SnpTableSuite.scala │ ├── rich │ ├── RichAlignmentSuite.scala │ └── RichCigarSuite.scala │ └── util │ ├── ADAMFunSuite.scala │ ├── AttributeUtilsSuite.scala │ ├── FileExtensionsSuite.scala │ ├── FileMergerSuite.scala │ ├── IndexedFastaFileSuite.scala │ ├── ParallelFileMergerSuite.scala │ ├── PhredUtilsSuite.scala │ └── TwoBitFileSuite.scala ├── adam-distribution ├── pom.xml └── src │ └── main │ └── assembly │ └── assembly.xml ├── adam-python ├── .gitignore ├── MANIFEST.in ├── Makefile ├── README.md ├── bdgenomics │ ├── __init__.py │ └── adam │ │ ├── .gitignore │ │ ├── __init__.py │ │ ├── adamContext.py │ │ ├── ds.py │ │ ├── find_adam_home.py │ │ ├── models.py │ │ ├── stringency.py │ │ └── test │ │ ├── __init__.py │ │ ├── adamContext_test.py │ │ ├── alignmentDataset_test.py │ │ ├── coverageDataset_test.py │ │ ├── featureDataset_test.py │ │ ├── genotypeDataset_test.py │ │ └── variantDataset_test.py ├── pom.xml ├── setup.py └── version.py ├── adam-r ├── .gitignore ├── bdgenomics.adam │ ├── DESCRIPTION │ ├── NAMESPACE │ ├── R │ │ ├── adam-context.R │ │ ├── ds.R │ │ └── generics.R │ └── tests │ │ ├── testthat.R │ │ └── testthat │ │ ├── helpers.R │ │ ├── 
test_adamContext.R │ │ ├── test_alignmentDataset.R │ │ ├── test_featureDataset.R │ │ ├── test_genotypeDataset.R │ │ └── test_variantDataset.R └── pom.xml ├── bin ├── adam-shell ├── adam-submit ├── adamR ├── find-adam-assembly.sh ├── find-adam-egg.sh ├── find-adam-home ├── find-spark.sh └── pyadam ├── docs ├── .gitignore ├── Makefile ├── _static │ ├── favicon.ico │ └── logo.png ├── algorithms │ ├── bqsr.rst │ ├── dm.rst │ ├── joins.rst │ ├── reads.rst │ └── ri.rst ├── api │ ├── adamContext.rst │ ├── genomicDataset.rst │ ├── img │ │ ├── join_examples.png │ │ └── join_rdds.png │ ├── joins.rst │ ├── overview.rst │ ├── pipes.rst │ └── python.rst ├── architecture │ ├── evidence.rst │ ├── img │ │ ├── grdd.pdf │ │ ├── grdd.png │ │ ├── stack-model.pdf │ │ └── stack-model.png │ ├── overview.rst │ ├── schemas.rst │ └── stackModel.rst ├── benchmarks │ ├── algorithms.rst │ ├── img │ │ ├── bam.pdf │ │ ├── bam.png │ │ ├── bed.pdf │ │ ├── bed.png │ │ ├── gff.pdf │ │ ├── gff.png │ │ ├── plot-speedup.py │ │ ├── speedup-bqsr.pdf │ │ ├── speedup-bqsr.png │ │ ├── speedup-ir.pdf │ │ ├── speedup-ir.png │ │ ├── speedup-md.pdf │ │ ├── speedup-md.png │ │ ├── vcf.pdf │ │ └── vcf.png │ └── storage.rst ├── citing.rst ├── cli │ ├── actions.rst │ ├── conversions.rst │ ├── overview.rst │ └── printers.rst ├── conf.py ├── deploying │ ├── aws.rst │ ├── gcp.rst │ ├── slurm.rst │ ├── toil.rst │ └── yarn.rst ├── downstream │ ├── cli.rst │ ├── library.rst │ └── overview.rst ├── img │ ├── bdgenomics-stack.key │ ├── bdgenomics-stack.png │ ├── source │ │ └── file_benchmarks.py │ └── stack-model.ai ├── index.rst ├── installation │ ├── example.rst │ ├── pip.rst │ └── source.rst ├── references.rst ├── requirements.txt ├── source │ └── bibliography.bib └── template.tex ├── pom.xml └── scripts ├── fastq-interleaver.py ├── make-flag-values-sam.py ├── move_to_scala_2.11.sh ├── move_to_scala_2.12.sh ├── move_to_spark_2.sh ├── move_to_spark_3.sh └── release.sh /.github/workflows/ci.yml: 
-------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: pull_request 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | env: 10 | SPARK_LOCAL_IP: localhost 11 | steps: 12 | - uses: actions/checkout@v2 13 | - uses: actions/setup-java@v2 14 | with: 15 | java-version: '11' 16 | distribution: 'temurin' 17 | - uses: actions/cache@v4 18 | with: 19 | path: ~/.m2 20 | key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} 21 | restore-keys: ${{ runner.os }}-m2 22 | - run: mvn --batch-mode --update-snapshots clean package 23 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy Snapshot 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | if: ${{ github.repository == 'bigdatagenomics/adam' }} 12 | env: 13 | SPARK_LOCAL_IP: localhost 14 | steps: 15 | - uses: actions/checkout@v2 16 | - uses: actions/setup-java@v2 17 | with: 18 | java-version: '11' 19 | distribution: 'temurin' 20 | server-id: sonatype-nexus-snapshots 21 | server-username: MAVEN_USERNAME 22 | server-password: MAVEN_PASSWORD 23 | - uses: actions/cache@v4 24 | with: 25 | path: ~/.m2 26 | key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} 27 | restore-keys: ${{ runner.os }}-m2 28 | - env: 29 | MAVEN_USERNAME: ${{ secrets.OSS_SONATYPE_USERNAME }} 30 | MAVEN_PASSWORD: ${{ secrets.OSS_SONATYPE_PASSWORD }} 31 | run: mvn --batch-mode -DskipTests=true deploy 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.iml 3 | target 4 | adam*.jar 5 | build 6 | *~ 7 | #* 8 | *.bak 9 | *.bam* 10 | *.adam* 11 | *.log 12 | .*.swp 13 | .DS_Store 14 | *#* 15 | .Rproj.user 16 | .Rhistory 17 | .RData 18 | 
-------------------------------------------------------------------------------- /LICENSE_header.txt: -------------------------------------------------------------------------------- 1 | Licensed to Big Data Genomics (BDG) under one 2 | or more contributor license agreements. See the NOTICE file 3 | distributed with this work for additional information 4 | regarding copyright ownership. The BDG licenses this file 5 | to you under the Apache License, Version 2.0 (the 6 | "License"); you may not use this file except in compliance 7 | with the License. You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | How to get support for ADAM 2 | =========================== 3 | 4 | ### Join the ADAM Gitter channel 5 | 6 | The primary mechanism for communication between ADAM developers and users is the [ADAM Gitter channel](https://gitter.im/bigdatagenomics/adam). 7 | 8 | 9 | ### Join the ADAM IRC channel 10 | 11 | If you prefer IRC, you can often find ADAM developers and users on [Libera.Chat](https://libera.chat/) in the #adamdev room. 12 | 13 | 14 | ### Join the ADAM developers mailing list 15 | 16 | The ADAM project also hosts a developers mailing list, see http://bdgenomics.org/mail/ for details. 
17 | 18 | 19 | ### Search the GitHub issue tracker and pull requests 20 | 21 | Before creating a new issue, please search the [ADAM issue tracker](https://github.com/bigdatagenomics/adam/issues) 22 | and [ADAM open pull requests](https://github.com/bigdatagenomics/adam/pulls) on GitHub. 23 | 24 | 25 | ### Create a new issue on GitHub 26 | 27 | If you have identified a new issue, please [create a new issue](https://github.com/bigdatagenomics/adam/issues/new) 28 | on the ADAM issue tracker on GitHub and prepare supporting material, such as scripts, test cases, and data that 29 | provide context for your issue. The [How to submit a contribution](https://opensource.guide/how-to-contribute/) 30 | Open Source Guide is very helpful in this regard. 31 | 32 | Thank you for contributing to ADAM! 33 | -------------------------------------------------------------------------------- /adam-apis/src/main/scala/org/bdgenomics/adam/api/python/DataFrameConversionWrapper.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Big Data Genomics (BDG) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The BDG licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | */ 18 | package org.bdgenomics.adam.api.python 19 | 20 | import org.apache.spark.api.java.function.{ Function => JFunction } 21 | import org.apache.spark.sql.DataFrame 22 | 23 | class DataFrameConversionWrapper( 24 | newDf: DataFrame) extends JFunction[DataFrame, DataFrame] { 25 | 26 | def call(v1: DataFrame): DataFrame = { 27 | newDf 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMCoverageConduit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Big Data Genomics (BDG) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The BDG licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.bdgenomics.adam.api.java; 19 | 20 | import java.io.IOException; 21 | import java.nio.file.Files; 22 | import java.nio.file.Path; 23 | import org.bdgenomics.adam.ds.ADAMContext; 24 | import org.bdgenomics.adam.ds.feature.CoverageDataset; 25 | 26 | /** 27 | * A simple test class for the JavaADAMRDD/Context. Writes an RDD of coverage to 28 | * disk and reads it back. 
29 | */ 30 | final class JavaADAMCoverageConduit { 31 | public static CoverageDataset conduit(final CoverageDataset recordRdd, 32 | final ADAMContext ac) throws IOException { 33 | 34 | // make temp directory and save file 35 | Path tempDir = Files.createTempDirectory("javaAC"); 36 | String fileName = tempDir.toString() + "/testRdd.coverage.adam"; 37 | recordRdd.save(fileName, false, false); 38 | 39 | // create a new adam context and load the file 40 | JavaADAMContext jac = new JavaADAMContext(ac); 41 | return jac.loadCoverage(fileName); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMFeatureConduit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Big Data Genomics (BDG) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The BDG licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.bdgenomics.adam.api.java; 19 | 20 | import java.io.IOException; 21 | import java.nio.file.Files; 22 | import java.nio.file.Path; 23 | import org.bdgenomics.adam.ds.ADAMContext; 24 | import org.bdgenomics.adam.ds.feature.FeatureDataset; 25 | 26 | /** 27 | * A simple test class for the JavaADAMRDD/Context. 
Writes an RDD of features to 28 | * disk and reads it back. 29 | */ 30 | final class JavaADAMFeatureConduit { 31 | public static FeatureDataset conduit(final FeatureDataset recordRdd, 32 | final ADAMContext ac) throws IOException { 33 | 34 | // make temp directory and save file 35 | Path tempDir = Files.createTempDirectory("javaAC"); 36 | String fileName = tempDir.toString() + "/testRdd.feature.adam"; 37 | recordRdd.save(fileName, false, false); 38 | 39 | // create a new adam context and load the file 40 | JavaADAMContext jac = new JavaADAMContext(ac); 41 | return jac.loadFeatures(fileName); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMFragmentConduit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Big Data Genomics (BDG) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The BDG licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.bdgenomics.adam.api.java; 19 | 20 | import java.io.IOException; 21 | import java.nio.file.Files; 22 | import java.nio.file.Path; 23 | import org.bdgenomics.adam.ds.ADAMContext; 24 | import org.bdgenomics.adam.ds.fragment.FragmentDataset; 25 | 26 | /** 27 | * A simple test class for the JavaADAMRDD/Context. Writes an RDD of fragments to 28 | * disk and reads it back. 29 | */ 30 | final class JavaADAMFragmentConduit { 31 | public static FragmentDataset conduit(final FragmentDataset recordRdd, 32 | final ADAMContext ac) throws IOException { 33 | 34 | // make temp directory and save file 35 | Path tempDir = Files.createTempDirectory("javaAC"); 36 | String fileName = tempDir.toString() + "/testRdd.fragment.adam"; 37 | recordRdd.save(fileName); 38 | 39 | // create a new adam context and load the file 40 | JavaADAMContext jac = new JavaADAMContext(ac); 41 | return jac.loadFragments(fileName); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMGenotypeConduit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Big Data Genomics (BDG) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The BDG licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.bdgenomics.adam.api.java; 19 | 20 | import java.io.IOException; 21 | import java.nio.file.Files; 22 | import java.nio.file.Path; 23 | import org.bdgenomics.adam.ds.ADAMContext; 24 | import org.bdgenomics.adam.ds.variant.GenotypeDataset; 25 | 26 | /** 27 | * A simple test class for the JavaADAMRDD/Context. Writes an RDD of genotypes to 28 | * disk and reads it back. 29 | */ 30 | final class JavaADAMGenotypeConduit { 31 | public static GenotypeDataset conduit(final GenotypeDataset recordRdd, 32 | final ADAMContext ac) throws IOException { 33 | 34 | // make temp directory and save file 35 | Path tempDir = Files.createTempDirectory("javaAC"); 36 | String fileName = tempDir.toString() + "/testRdd.genotype.adam"; 37 | recordRdd.saveAsParquet(fileName); 38 | 39 | // create a new adam context and load the file 40 | JavaADAMContext jac = new JavaADAMContext(ac); 41 | return jac.loadGenotypes(fileName); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMReadConduit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Big Data Genomics (BDG) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The BDG licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License.
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.bdgenomics.adam.api.java; 19 | 20 | import java.io.IOException; 21 | import java.nio.file.Files; 22 | import java.nio.file.Path; 23 | import org.bdgenomics.adam.ds.ADAMContext; 24 | import org.bdgenomics.adam.ds.read.AlignmentDataset; 25 | 26 | /** 27 | * A simple test class for the JavaADAMRDD/Context. Writes an RDD of reads to 28 | * disk and reads it back. 29 | */ 30 | class JavaADAMReadConduit { 31 | public static AlignmentDataset conduit(final AlignmentDataset recordRdd, 32 | final ADAMContext ac) throws IOException { 33 | 34 | // make temp directory and save file 35 | Path tempDir = Files.createTempDirectory("javaAC"); 36 | String fileName = tempDir.toString() + "/testRdd.read.adam"; 37 | recordRdd.save(fileName, false); 38 | 39 | // create a new adam context and load the file 40 | JavaADAMContext jac = new JavaADAMContext(ac); 41 | return jac.loadAlignments(fileName); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSequenceConduit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Big Data Genomics (BDG) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
The BDG licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.bdgenomics.adam.api.java; 19 | 20 | import java.io.IOException; 21 | import java.nio.file.Files; 22 | import java.nio.file.Path; 23 | 24 | import org.bdgenomics.adam.ds.ADAMContext; 25 | import org.bdgenomics.adam.ds.sequence.SequenceDataset; 26 | 27 | /** 28 | * A simple test class for the JavaADAMRDD/Context. Writes an RDD of sequences 29 | * to disk and reads it back. 30 | */ 31 | final class JavaADAMSequenceConduit { 32 | public static SequenceDataset conduit(final SequenceDataset sequenceDataset, 33 | final ADAMContext ac) throws IOException { 34 | 35 | // make temp directory and save file 36 | Path tempDir = Files.createTempDirectory("javaAC"); 37 | String fileName = tempDir.toString() + "/testRdd.sequences.adam"; 38 | sequenceDataset.save(fileName, true, true); 39 | 40 | // create a new adam context and load the file 41 | JavaADAMContext jac = new JavaADAMContext(ac); 42 | return jac.loadDnaSequences(fileName); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSliceConduit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Big Data Genomics (BDG) under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The BDG licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.bdgenomics.adam.api.java; 19 | 20 | import java.io.IOException; 21 | import java.nio.file.Files; 22 | import java.nio.file.Path; 23 | 24 | import org.bdgenomics.adam.ds.ADAMContext; 25 | import org.bdgenomics.adam.ds.sequence.SliceDataset; 26 | 27 | /** 28 | * A simple test class for the JavaADAMRDD/Context. Writes an RDD of slices 29 | * to disk and reads it back. 
30 | */ 31 | final class JavaADAMSliceConduit { 32 | public static SliceDataset conduit(final SliceDataset sliceDataset, 33 | final ADAMContext ac) throws IOException { 34 | 35 | // make temp directory and save file 36 | Path tempDir = Files.createTempDirectory("javaAC"); 37 | String fileName = tempDir.toString() + "/testRdd.slices.adam"; 38 | sliceDataset.save(fileName, true, true); 39 | 40 | // create a new adam context and load the file 41 | JavaADAMContext jac = new JavaADAMContext(ac); 42 | return jac.loadSlices(fileName, 10000); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMVariantConduit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Big Data Genomics (BDG) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The BDG licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.bdgenomics.adam.api.java; 19 | 20 | import java.io.IOException; 21 | import java.nio.file.Files; 22 | import java.nio.file.Path; 23 | import org.bdgenomics.adam.ds.ADAMContext; 24 | import org.bdgenomics.adam.ds.variant.VariantDataset; 25 | 26 | /** 27 | * A simple test class for the JavaADAMRDD/Context. 
Writes an RDD of annotations to 28 | * disk and reads it back. 29 | */ 30 | final class JavaADAMVariantConduit { 31 | public static VariantDataset conduit(final VariantDataset recordRdd, 32 | final ADAMContext ac) throws IOException { 33 | 34 | // make temp directory and save file 35 | Path tempDir = Files.createTempDirectory("javaAC"); 36 | String fileName = tempDir.toString() + "/testRdd.variant.adam"; 37 | recordRdd.saveAsParquet(fileName); 38 | 39 | // create a new adam context and load the file 40 | JavaADAMContext jac = new JavaADAMContext(ac); 41 | return jac.loadVariants(fileName); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /adam-apis/src/test/resources/indexed_bams/sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-apis/src/test/resources/indexed_bams/sorted.bam -------------------------------------------------------------------------------- /adam-apis/src/test/resources/indexed_bams/sorted.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-apis/src/test/resources/indexed_bams/sorted.bam.bai -------------------------------------------------------------------------------- /adam-assembly/src/main/scala/org/bdgenomics/adam/assembly/Assembly.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Big Data Genomics (BDG) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The BDG licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
/**
 * Placeholder object with no members. It exists only so that the Maven build
 * has a source file from which to create sources and javadoc artifacts.
 */
object Assembly {
  // intentionally left without members
}
/**
 * Mix-in for command line argument classes that need to carry CRAM format
 * configuration.
 */
private[cli] trait CramArgs {

  // stays null unless -cram_reference is supplied on the command line
  @Args4jOption(required = false, name = "-cram_reference", usage = "CRAM format reference, if necessary")
  var cramReference: String = null

  /**
   * Copies the CRAM reference, when one was given, into the Hadoop
   * configuration of the supplied Spark context.
   *
   * @param sc Spark context to configure
   * @return the Option obtained by applying the configuration update to the
   *   optional reference; empty when no reference was given
   */
  def configureCramFormat(sc: SparkContext) = {
    // Option(...) turns a null reference into None, so nothing is set in that case
    for (referencePath <- Option(cramReference)) yield {
      sc.hadoopConfiguration.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY, referencePath)
    }
  }
}
/**
 * Helpers for checking paths on Hadoop file systems.
 */
private[cli] object FileSystemUtils {

  /**
   * @param pathName path to look up
   * @param conf Hadoop configuration used to resolve the path's file system
   * @return true if the path exists on its file system
   */
  private def exists(pathName: String, conf: Configuration): Boolean = {
    val target = new Path(pathName)
    target.getFileSystem(conf).exists(target)
  }

  // move to BDGSparkCommand in bdg-utils?
  /**
   * Fails fast when the given output path is already present.
   *
   * @param pathName path that is about to be written
   * @param conf Hadoop configuration used to resolve the path's file system
   * @throws FileAlreadyExistsException if the path already exists
   */
  def checkWriteablePath(pathName: String, conf: Configuration): Unit = {
    val alreadyPresent = exists(pathName, conf)
    if (alreadyPresent) {
      val message = "Cannot write to path name, %s already exists".format(pathName)
      throw new FileAlreadyExistsException(message)
    }
  }
}
GGGGGGAAAAAAAAAAAAAAA 1 37 | AAAAAAAAGGGGGGGGGGAAA 2 38 | GGGGAAAAAAAAAAAAAAAAA 1 39 | AAAAAAAGGGGGGGGGGAAAA 2 40 | AAAAAAAAAAGGGGGGGGGGA 2 41 | AAAAAAAAAAAAGGGGGGGGG 1 42 | -------------------------------------------------------------------------------- /adam-cli/src/test/resources/artificial.fa: -------------------------------------------------------------------------------- 1 | ../../../../adam-core/src/test/resources/artificial.fa -------------------------------------------------------------------------------- /adam-cli/src/test/resources/artificial.fa.fai: -------------------------------------------------------------------------------- 1 | ../../../../adam-core/src/test/resources/artificial.fa.fai -------------------------------------------------------------------------------- /adam-cli/src/test/resources/chr5.phyloP46way.trunc.wigFix: -------------------------------------------------------------------------------- 1 | fixedStep chrom=chr5 start=13940 step=1 2 | 0.067 3 | 0.075 4 | 0.075 5 | -2.162 6 | -2.294 7 | 0.075 8 | fixedStep chrom=chr5 start=15296 step=1 9 | 0.139 10 | 0.155 11 | 0.155 12 | 0.139 13 | -------------------------------------------------------------------------------- /adam-cli/src/test/resources/gencode.v7.annotation.trunc10.bed: -------------------------------------------------------------------------------- 1 | chr1 11869 14409 gene . + "pseudogene" 2 | chr1 11869 14409 transcript . + "processed_transcript" 3 | chr1 11869 12227 exon . + "processed_transcript" 4 | chr1 12613 12721 exon . + "processed_transcript" 5 | chr1 13221 14409 exon . + "processed_transcript" 6 | chr1 12010 13670 transcript . + "transcribed_unprocessed_pseudogene" 7 | chr1 12010 12057 exon . + "transcribed_unprocessed_pseudogene" 8 | chr1 12179 12227 exon . + "transcribed_unprocessed_pseudogene" 9 | chr1 12613 12697 exon . + "transcribed_unprocessed_pseudogene" 10 | chr1 12975 13052 exon . 
+ "transcribed_unprocessed_pseudogene" 11 | -------------------------------------------------------------------------------- /adam-cli/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stderr, logfile 3 | 4 | # Direct log messages to stderr 5 | log4j.appender.stderr=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stderr.Target=System.err 7 | log4j.appender.stderr.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stderr.threshold=WARN 9 | log4j.appender.stderr.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 10 | 11 | # Log at INFO level to file 12 | log4j.appender.logfile=org.apache.log4j.FileAppender 13 | log4j.appender.logfile.append=true 14 | log4j.appender.logfile.file=adam.log 15 | log4j.appender.logfile.threshold=INFO 16 | log4j.appender.logfile.layout=org.apache.log4j.PatternLayout 17 | log4j.appender.logfile.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 18 | log4j.appender.logfile.encoding=UTF-8 19 | 20 | # Tell Parquet to shut up 21 | log4j.logger.org.apache.parquet=ERROR 22 | -------------------------------------------------------------------------------- /adam-cli/src/test/resources/sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-cli/src/test/resources/sorted.bam -------------------------------------------------------------------------------- /adam-cli/src/test/resources/sorted.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-cli/src/test/resources/sorted.bam.bai -------------------------------------------------------------------------------- /adam-cli/src/test/resources/sorted.counts.txt: 
-------------------------------------------------------------------------------- 1 | ACACACAC 1 2 | ACACACACACAC 1 3 | ACACACACAC 3 4 | -------------------------------------------------------------------------------- /adam-cli/src/test/scala/org/bdgenomics/adam/cli/AboutSuite.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Big Data Genomics (BDG) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The BDG licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * Sanity checks on the values exposed by About.
 *
 * NOTE(review): About presumably comes from the java-templates source whose
 * placeholders are filled in at build time -- confirm against the build.
 */
class AboutSuite extends FunSuite {
  val about = new About()

  // The right-hand sides are plain (non-interpolated) string literals, so each
  // value is compared against the literal placeholder text itself.
  test("template variables have been replaced") {
    assert(about.artifactId !== "${project.artifactId}")
    assert(about.buildTimestamp !== "${maven.build.timestamp}")
    assert(about.scalaVersion !== "${scala.version}")
    assert(about.sparkVersion !== "${spark.version}")
    assert(about.version !== "${version}")
  }

  // Every templated value must also be a non-empty string.
  test("templated values are not empty") {
    assert(about.artifactId.nonEmpty)
    assert(about.buildTimestamp.nonEmpty)
    assert(about.scalaVersion.nonEmpty)
    assert(about.sparkVersion.nonEmpty)
    assert(about.version.nonEmpty)
  }
}
/**
 * Runs the CountReadKmers command on a SAM resource and compares its
 * single-file output with a checked-in expected counts file.
 */
class CountReadKmersSuite extends ADAMFunSuite {
  sparkTest("count kmers to single file") {
    val samPath = copyResource("sorted.sam")
    val observedCounts = tmpFile("sorted.counts.txt")
    val expectedCounts = copyResource("sorted.counts.txt")

    // -single, input, output, k-mer length
    val cliArgs: Array[String] = Array("-single", samPath, observedCounts, "21")
    CountReadKmers(cliArgs).run(sc)

    checkFiles(expectedCounts, observedCounts)
  }
}
/**
 * Runs the CountSliceKmers command on a FASTA resource and compares its
 * single-file output with a checked-in expected counts file.
 */
class CountSliceKmersSuite extends ADAMFunSuite {
  sparkTest("count slice kmers to single file") {
    val fastaPath = copyResource("artificial.fa")
    val observedCounts = tmpFile("artificial.counts.txt")
    val expectedCounts = copyResource("artificial.counts.txt")

    // -single, input, output, k-mer length
    val cliArgs: Array[String] = Array("-single", fastaPath, observedCounts, "21")
    CountSliceKmers(cliArgs).run(sc)

    checkFiles(expectedCounts, observedCounts)
  }
}
/**
 * Runs the Coverage command on a small SAM resource and spot-checks the
 * coverage value recorded at one position.
 */
class CoverageSuite extends ADAMFunSuite {

  sparkTest("correctly calculates coverage from small sam file") {
    val samPath = copyResource("artificial.sam")
    val coveragePath = tmpFile("coverage.adam")

    val parsedArgs = Args4j[CoverageArgs](Array(samPath, coveragePath))
    new Coverage(parsedArgs).run(sc)

    // reload what the command wrote and pick the record starting at 30
    val atStart30 = sc.loadCoverage(coveragePath)
      .flatten
      .rdd
      .filter(record => record.start == 30)
      .first
    assert(atStart30.count == 5)
  }
}
/**
 * Runs the TransformFeatures command on a truncated BED resource and checks
 * the features that were written.
 */
class TransformFeaturesSuite extends ADAMFunSuite {

  sparkTest("can convert a simple BED file") {

    val loader = Thread.currentThread().getContextClassLoader
    val inputPath = loader.getResource("gencode.v7.annotation.trunc10.bed").getPath
    val outputFile = File.createTempFile("adam-cli.TransformFeaturesSuite", ".adam")
    val outputPath = outputFile.getAbsolutePath

    // Pass each path as its own argument. Joining them into one string and
    // splitting on whitespace would break any path that contains whitespace
    // (for example a temp directory under a user name with a space in it).
    val argLine = Array(inputPath, outputPath)

    // We have to do this, since the command won't work if the output already exists,
    // but the "createTempFile" method actually creates the file (on some systems?)
    assert(outputFile.delete(), "Couldn't delete (empty) temp file")

    val args: TransformFeaturesArgs = Args4j.apply[TransformFeaturesArgs](argLine)

    val features2Adam = new TransformFeatures(args)
    features2Adam.run(sc)

    val converted = sc.loadFeatures(outputPath).rdd.collect

    // the BED resource holds ten records, all of them on chr1
    assert(converted.size === 10)
    assert(converted.find(_.getReferenceName != "chr1").isEmpty)
  }
}
/**
 * Runs the TransformVariants command twice per test (VCF to an intermediate
 * ADAM path, then back to a single sorted VCF) and compares the result with
 * a checked-in expected file.
 */
class TransformVariantsSuite extends ADAMFunSuite {

  sparkTest("save a file sorted by contig index") {
    val vcfPath = copyResource("random.vcf")
    val adamPath = tmpFile("variants.adam")
    val observedVcf = tmpFile("sorted-variants.vcf")
    val expectedVcf = copyResource("sorted-variants.vcf")

    // first pass: convert the VCF into the intermediate path
    val convertArgs: Array[String] = Array(vcfPath, adamPath)
    TransformVariants(convertArgs).run(sc)

    // second pass: write a single sorted VCF from the intermediate path
    val sortArgs: Array[String] = Array(adamPath, observedVcf, "-sort_on_save", "-single")
    TransformVariants(sortArgs).run(sc)

    checkFiles(expectedVcf, observedVcf)
  }

  sparkTest("save a lexicographically sorted file") {
    val vcfPath = copyResource("random.vcf")
    val adamPath = tmpFile("variants.lex.adam")
    val observedVcf = tmpFile("sorted-variants.lex.vcf")
    val expectedVcf = copyResource("sorted-variants.lex.vcf")

    // first pass: convert the VCF into the intermediate path
    val convertArgs: Array[String] = Array(vcfPath, adamPath)
    TransformVariants(convertArgs).run(sc)

    // second pass: write a single lexicographically sorted VCF
    val sortArgs: Array[String] = Array(adamPath, observedVcf, "-sort_lexicographically_on_save", "-single")
    TransformVariants(sortArgs).run(sc)

    checkFiles(expectedVcf, observedVcf)
  }
}
/**
 * Shared behaviour for the code generators: writing the common file header.
 */
trait Generator {

  /**
   * Writes the license comment, the package declaration and the Scala
   * collection conversion imports to the given writer.
   *
   * The lines are joined with "\n" and no newline is written after the last
   * import line. The writer is neither flushed nor closed here.
   *
   * @param fw writer that receives the header text
   * @param packageName package name placed in the generated package declaration
   */
  protected def writeHeader(fw: FileWriter, packageName: String): Unit = {
    val hdr = Seq(
      "/**",
      "* Licensed to Big Data Genomics (BDG) under one",
      "* or more contributor license agreements. See the NOTICE file",
      "* distributed with this work for additional information",
      "* regarding copyright ownership. The BDG licenses this file",
      "* to you under the Apache License, Version 2.0 (the",
      "* \"License\"); you may not use this file except in compliance",
      "* with the License. You may obtain a copy of the License at",
      "*",
      "* http://www.apache.org/licenses/LICENSE-2.0",
      "*",
      "* Unless required by applicable law or agreed to in writing, software",
      "* distributed under the License is distributed on an \"AS IS\" BASIS,",
      "* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.",
      "* See the License for the specific language governing permissions and",
      "* limitations under the License.",
      "*/",
      "package %s".format(packageName),
      "",
      "import scala.collection.JavaConversions._",
      "import scala.collection.JavaConverters._").mkString("\n")

    fw.write(hdr)
  }
}
17 |  */
18 | package org.bdgenomics.adam.codegen
19 |
20 | import org.apache.avro.reflect.ReflectData
21 | import org.apache.avro.Schema
22 |
23 | /**
24 |  * Helper for obtaining an Avro schema from a class name.
25 |  */
26 | object ReflectSchema {
27 |
28 |   /**
29 |    * Loads the named class through the current thread's context class loader
30 |    * and asks Avro's ReflectData for the schema of that class.
31 |    *
32 |    * @param className The name of the class to load.
33 |    * @return The schema ReflectData reports for the loaded class.
34 |    */
35 |   private[codegen] def getSchemaByReflection(className: String): Schema = {
36 |     ReflectData.get().getSchema(
37 |       Thread.currentThread().getContextClassLoader().loadClass(className))
38 |   }
39 | }
40 |
--------------------------------------------------------------------------------
/adam-core/.gitignore:
--------------------------------------------------------------------------------
1 | dependency-reduced-pom.xml
2 | src/main/resources/git.properties
3 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/ADAMSaveAnyArgs.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The BDG licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License. You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds
19 |
20 | import org.bdgenomics.utils.cli.SaveArgs
21 |
22 | /**
23 |  * Argument configuration for saving any output format.
24 |  *
25 |  * Adds four abstract, mutable Boolean flags on top of SaveArgs; an implementing
26 |  * argument class must provide each of them.
27 |  */
28 | trait ADAMSaveAnyArgs extends SaveArgs {
29 |
30 |   /**
31 |    * If true and saving as FASTQ, we will sort by read name.
32 |    */
33 |   var sortFastqOutput: Boolean
34 |
35 |   /**
36 |    * If true and saving as a legacy format, we will write shards so that they
37 |    * can be merged into a single file.
38 |    *
39 |    * @see deferMerging
40 |    */
41 |   var asSingleFile: Boolean
42 |
43 |   /**
44 |    * If true and asSingleFile is true, we will not merge the shards once we
45 |    * write them, and will leave them for the user to merge later. If false and
46 |    * asSingleFile is true, then we will merge the shards on write. If
47 |    * asSingleFile is false, this is ignored.
48 |    *
49 |    * @see asSingleFile
50 |    */
51 |   var deferMerging: Boolean
52 |
53 |   /**
54 |    * If asSingleFile is true and deferMerging is false, disables the use of the
55 |    * fast file concatenation engine.
56 |    *
57 |    * @see asSingleFile
58 |    * @see deferMerging
59 |    */
60 |   var disableFastConcat: Boolean
61 | }
62 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/GenomicBroadcast.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The BDG licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.
You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds
19 |
20 | import org.apache.spark.broadcast.Broadcast
21 | import org.bdgenomics.adam.models.ReferenceRegion
22 | import org.bdgenomics.utils.interval.array.IntervalArray
23 |
24 | /**
25 |  * Pairs a genomic dataset with a Spark broadcast of an IntervalArray whose
26 |  * keys are ReferenceRegions and whose values are of type T.
27 |  *
28 |  * The constructor is private to the ds package, so instances can only be
29 |  * constructed from code inside that package.
30 |  *
31 |  * @param backingDataset The genomic dataset held alongside the broadcast.
32 |  * @param broadcastTree The broadcast interval array.
33 |  * @tparam T The value type stored in the interval array, and the first type
34 |  *   parameter of the backing GenomicDataset.
35 |  * @tparam U The Product type parameter of the backing GenomicDataset.
36 |  * @tparam V The concrete GenomicDataset type of the backing dataset.
37 |  */
38 | case class GenomicBroadcast[T, U <: Product, V <: GenomicDataset[T, U, V]] private[ds] (
39 |     backingDataset: V,
40 |     broadcastTree: Broadcast[IntervalArray[ReferenceRegion, T]]) {
41 | }
42 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/GenomicDatasetConversion.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The BDG licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License. You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds
19 |
20 | import org.apache.spark.api.java.function.Function2
21 | import org.apache.spark.sql.Dataset
22 | import scala.reflect.runtime.universe.TypeTag
23 |
24 | /**
25 |  * A two-argument Spark Java function that takes a genomic dataset of type V
26 |  * and a Dataset[Y], and returns a genomic dataset of type Z.
27 |  *
28 |  * NOTE(review): presumably implementations wrap the Dataset[Y] in a Z while
29 |  * reusing metadata from the V argument -- confirm against the implementers.
30 |  *
31 |  * @tparam T First type parameter of the source GenomicDataset V.
32 |  * @tparam U Product type parameter of the source GenomicDataset V.
33 |  * @tparam V The source GenomicDataset type.
34 |  * @tparam X First type parameter of the target GenomicDataset Z.
35 |  * @tparam Y Product type parameter of the target GenomicDataset Z, and the
36 |  *   element type of the Dataset passed to call.
37 |  * @tparam Z The target GenomicDataset type.
38 |  */
39 | trait GenomicDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V], X, Y <: Product, Z <: GenomicDataset[X, Y, Z]] extends Function2[V, Dataset[Y], Z] {
40 |
41 |   /**
42 |    * Runtime type tag for Y; must be supplied by the implementation.
43 |    */
44 |   val yTag: TypeTag[Y]
45 |
46 |   /**
47 |    * @param v1 The source genomic dataset.
48 |    * @param v2 A Dataset whose elements are of the target Product type Y.
49 |    * @return A genomic dataset of the target type Z.
50 |    */
51 |   def call(v1: V, v2: Dataset[Y]): Z
52 | }
53 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/ReferencePartitioner.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The BDG licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License. You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds
19 |
20 | import org.apache.spark.Partitioner
21 | import org.bdgenomics.adam.models.{
22 |   ReferencePosition,
23 |   ReferenceRegion,
24 |   SequenceDictionary
25 | }
26 |
27 | /**
28 |  * Repartitions objects that are keyed by a ReferencePosition or ReferenceRegion
29 |  * into a single partition per contig.
30 |  */
31 | case class ReferencePartitioner(sd: SequenceDictionary) extends Partitioner {
32 |
33 |   // extract just the reference names, in dictionary order
34 |   private val referenceNames = sd.records.map(_.name)
35 |
36 |   // Name -> partition index lookup, built once so that getPartition does not
37 |   // rescan the name list for every key. Reversing before toMap makes the
38 |   // first occurrence of a name win, which is what indexOf would return.
39 |   private val partitionByName = referenceNames.zipWithIndex.reverse.toMap
40 |
41 |   /**
42 |    * @return One partition per record in the sequence dictionary.
43 |    */
44 |   override def numPartitions: Int = referenceNames.length
45 |
46 |   /**
47 |    * @param name The reference name to look up.
48 |    * @return The position of the name in the sequence dictionary.
49 |    * @throws AssertionError if the name is not in the sequence dictionary.
50 |    */
51 |   private def partitionFromName(name: String): Int = {
52 |     // which reference is this in?
53 |     val pIdx = partitionByName.getOrElse(name, -1)
54 |
55 |     // provide debug info to user if key is bad
56 |     assert(pIdx != -1, "Reference not found in " + sd + " for key " + name)
57 |
58 |     pIdx
59 |   }
60 |
61 |   /**
62 |    * @param key A ReferencePosition or ReferenceRegion.
63 |    * @return The partition assigned to the key's reference name.
64 |    * @throws IllegalArgumentException if the key is of any other type.
65 |    */
66 |   override def getPartition(key: Any): Int = key match {
67 |     case rp: ReferencePosition => {
68 |       partitionFromName(rp.referenceName)
69 |     }
70 |     case rr: ReferenceRegion => {
71 |       partitionFromName(rr.referenceName)
72 |     }
73 |     case _ => throw new IllegalArgumentException("Only ReferencePositions or ReferenceRegions can be used as a key.")
74 |   }
75 | }
76 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/RegionJoin.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The BDG licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License. You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds
19 |
20 | import org.bdgenomics.adam.models.ReferenceRegion
21 | import org.apache.spark.rdd.RDD
22 | import scala.reflect.ClassTag
23 |
24 | /**
25 |  * An abstract class describing a join in the genomic coordinate space between
26 |  * two RDDs where the values are keyed by a ReferenceRegion.
27 |  *
28 |  * Concrete joins implement partitionAndJoin. The class is Serializable.
29 |  *
30 |  * @tparam T The type of the values in the left RDD.
31 |  * @tparam U The type of the values in the right RDD.
32 |  * @tparam RT The type of data yielded by the left RDD at the output of the
33 |  *   join. This may not match T if the join is an outer join, etc.
34 |  * @tparam RU The type of data yielded by the right RDD at the output of the
35 |  *   join.
36 |  */
37 | abstract class RegionJoin[T: ClassTag, U: ClassTag, RT, RU] extends Serializable {
38 |
39 |   /**
40 |    * Performs a region join between two RDDs.
41 |    *
42 |    * @param baseRDD The 'left' side of the join
43 |    * @param joinedRDD The 'right' side of the join
44 |    * @return An RDD of pairs (x, y), where x (of type RT) comes from baseRDD, y
45 |    *   (of type RU) comes from joinedRDD, and the region corresponding to x
46 |    *   overlaps the region corresponding to y.
47 |    */
48 |   def partitionAndJoin(
49 |     baseRDD: RDD[(ReferenceRegion, T)],
50 |     joinedRDD: RDD[(ReferenceRegion, U)]): RDD[(RT, RU)]
51 | }
52 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/feature/GFF3HeaderWriter.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The BDG licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.
You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.feature
19 |
20 | import org.apache.hadoop.fs.Path
21 | import org.apache.spark.SparkContext
22 |
23 | /**
24 |  * Writes the header for a GFF3 file to an otherwise empty file.
25 |  */
26 | private[feature] object GFF3HeaderWriter {
27 |
28 |   /** The GFF3 version pragma written as the only line of the header file. */
29 |   val HEADER_STRING = "##gff-version 3.2.1"
30 |
31 |   /**
32 |    * Writes a GFF3 Header pragma, followed by a newline, to a file.
33 |    *
34 |    * The file system is resolved from the path itself, and the output stream
35 |    * is closed even if the write fails.
36 |    *
37 |    * @param filePath The path to write the file to.
38 |    * @param sc The SparkContext, to access the Hadoop FS Configuration.
39 |    */
40 |   def apply(filePath: String,
41 |             sc: SparkContext): Unit = {
42 |     val path = new Path(filePath)
43 |     val fs = path.getFileSystem(sc.hadoopConfiguration)
44 |     val os = fs.create(path)
45 |     try {
46 |       // encode explicitly so the bytes do not depend on the JVM default charset
47 |       os.write((HEADER_STRING + "\n").getBytes(java.nio.charset.StandardCharsets.UTF_8))
48 |     } finally {
49 |       os.close()
50 |     }
51 |   }
52 | }
53 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/read/BAMInFormatter.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The BDG licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.
You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.read
19 |
20 | import htsjdk.samtools.{
21 |   SAMFileHeader,
22 |   SAMFileWriter,
23 |   SAMFileWriterFactory
24 | }
25 | import java.io.OutputStream
26 | import org.bdgenomics.adam.converters.AlignmentConverter
27 | import org.bdgenomics.adam.models.ReadGroupDictionary
28 |
29 | /**
30 |  * Companion used to construct BAMInFormatter instances, which stream BAM.
31 |  */
32 | object BAMInFormatter extends AnySAMInFormatterCompanion[BAMInFormatter] {
33 |
34 |   /**
35 |    * @param header The SAM file header the formatter writes with.
36 |    * @param readGroups The read group dictionary handed to the formatter.
37 |    * @param converter The alignment converter handed to the formatter.
38 |    * @return A new BAMInFormatter built from the three arguments.
39 |    */
40 |   protected def makeFormatter(header: SAMFileHeader,
41 |                               readGroups: ReadGroupDictionary,
42 |                               converter: AlignmentConverter): BAMInFormatter =
43 |     BAMInFormatter(header, readGroups, converter)
44 | }
45 |
46 | /**
47 |  * InFormatter whose writer emits BAM. The constructor is private.
48 |  *
49 |  * @param header The header passed to the BAM writer.
50 |  * @param readGroups The read group dictionary for this formatter.
51 |  * @param converter The alignment converter for this formatter.
52 |  */
53 | case class BAMInFormatter private (
54 |     header: SAMFileHeader,
55 |     readGroups: ReadGroupDictionary,
56 |     converter: AlignmentConverter) extends AnySAMInFormatter[BAMInFormatter] {
57 |
58 |   protected val companion = BAMInFormatter
59 |
60 |   /**
61 |    * @param os The stream the writer emits to.
62 |    * @return A BAM writer over the stream, created from this formatter's header.
63 |    */
64 |   protected def makeWriter(os: OutputStream): SAMFileWriter = {
65 |     val writerFactory = new SAMFileWriterFactory()
66 |     writerFactory.makeBAMWriter(header, true, os)
67 |   }
68 | }
69 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/read/SAMInFormatter.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements.
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The BDG licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.bdgenomics.adam.ds.read 19 | 20 | import htsjdk.samtools.{ 21 | SAMFileHeader, 22 | SAMFileWriter, 23 | SAMFileWriterFactory 24 | } 25 | import java.io.OutputStream 26 | import org.bdgenomics.adam.converters.AlignmentConverter 27 | import org.bdgenomics.adam.models.ReadGroupDictionary 28 | 29 | /** 30 | * InFormatter companion for building an InFormatter that streams SAM. 
31 |  */
32 | object SAMInFormatter extends AnySAMInFormatterCompanion[SAMInFormatter] {
33 |
34 |   /**
35 |    * @param header The SAM file header the formatter writes with.
36 |    * @param readGroups The read group dictionary handed to the formatter.
37 |    * @param converter The alignment converter handed to the formatter.
38 |    * @return A new SAMInFormatter built from the three arguments.
39 |    */
40 |   protected def makeFormatter(header: SAMFileHeader,
41 |                               readGroups: ReadGroupDictionary,
42 |                               converter: AlignmentConverter): SAMInFormatter =
43 |     SAMInFormatter(header, readGroups, converter)
44 | }
45 |
46 | /**
47 |  * InFormatter whose writer emits SAM. The primary constructor is private.
48 |  *
49 |  * @param header The header passed to the SAM writer.
50 |  * @param readGroups The read group dictionary for this formatter.
51 |  * @param converter The alignment converter for this formatter.
52 |  */
53 | case class SAMInFormatter private (
54 |     header: SAMFileHeader,
55 |     readGroups: ReadGroupDictionary,
56 |     converter: AlignmentConverter) extends AnySAMInFormatter[SAMInFormatter] {
57 |
58 |   /**
59 |    * No-argument constructor; leaves all three fields null.
60 |    */
61 |   def this() = this(null, null, null)
62 |
63 |   protected val companion = SAMInFormatter
64 |
65 |   /**
66 |    * @param os The stream the writer emits to.
67 |    * @return A SAM writer over the stream, created from this formatter's header.
68 |    */
69 |   protected def makeWriter(os: OutputStream): SAMFileWriter = {
70 |     val writerFactory = new SAMFileWriterFactory()
71 |     writerFactory.makeSAMWriter(header, true, os)
72 |   }
73 | }
74 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/read/realignment/ModPartitioner.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The BDG licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License. You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.read.realignment
19 |
20 | import org.apache.spark.Partitioner
21 |
22 | /**
23 |  * Partitioner that places an Int key in partition abs(abs(key) % numPartitions).
24 |  *
25 |  * @param numPartitions The number of partitions keys are spread over.
26 |  */
27 | private[realignment] case class ModPartitioner(numPartitions: Int) extends Partitioner {
28 |
29 |   /**
30 |    * @param key The key to place; only Int keys are accepted.
31 |    * @return The partition index computed from the key.
32 |    * @throws IllegalArgumentException if the key is not an Int.
33 |    */
34 |   def getPartition(key: Any): Int = key match {
35 |     case intKey: Int =>
36 |       // abs is applied twice: Int.MinValue.abs is still negative, so the
37 |       // remainder can be negative and needs its own abs
38 |       val remainder = intKey.abs % numPartitions
39 |       remainder.abs
40 |     case _ =>
41 |       throw new IllegalArgumentException("Key %s is not an Int.".format(key))
42 |   }
43 | }
44 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/read/recalibration/Covariate.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The BDG licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License. You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.read.recalibration
19 |
20 | import org.bdgenomics.formats.avro.Alignment
21 |
22 | /**
23 |  * A predictor (also called a "feature" or "independent variable") whose
24 |  * value is computed per residue of a read.
25 |  *
26 |  * @tparam T The type of the values this covariate takes.
27 |  */
28 | private[recalibration] abstract class Covariate[T] {
29 |
30 |   /**
31 |    * Computes this covariate's value for every residue of the given read.
32 |    *
33 |    * @param read The read to observe.
34 |    * @return The covariates corresponding to each base in this read.
35 |    */
36 |   def compute(read: Alignment): Array[T]
37 |
38 |   /**
39 |    * Renders a covariate value as a single CSV cell, for compatibility with
40 |    * GATK's CSV output. The default rendering is the value's toString.
41 |    *
42 |    * @param cov A covariate value to render.
43 |    * @return The covariate value rendered as a single CSV cell.
44 |    */
45 |   def toCSV(cov: T): String = cov.toString
46 |
47 |   /**
48 |    * Short name of this covariate, used in the CSV output header.
49 |    */
50 |   val csvFieldName: String
51 | }
52 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/read/recalibration/ObservationTable.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The BDG licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License. You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.read.recalibration
19 |
20 | import org.bdgenomics.adam.models.ReadGroupDictionary
21 |
22 | /**
23 |  * Table containing the empirical frequency of mismatches for each set of
24 |  * covariate values.
25 |  *
26 |  * @param entries The error covariate → observed error frequency mapping.
27 |  */
28 | private[adam] class ObservationTable(
29 |     val entries: scala.collection.Map[CovariateKey, Observation]) extends Serializable {
30 |
31 |   /**
32 |    * @return One line per entry: the key and the observation, tab-separated.
33 |    */
34 |   override def toString = entries
35 |     .map(entry => "%s\t%s".format(entry._1, entry._2))
36 |     .mkString("\n")
37 |
38 |   /**
39 |    * Renders the table as CSV: a header row followed by one row per entry.
40 |    *
41 |    * @param readGroups The read groups that generated the reads in this table.
42 |    * @return This table as CSV.
43 |    */
44 |   def toCSV(readGroups: ReadGroupDictionary): String = {
45 |     val dataRows = entries.map {
46 |       case (covariateKey, observation) =>
47 |         // keys for which containsNone is true get a trailing "**" cell
48 |         val skippedCell = if (covariateKey.containsNone) Seq("**") else Seq()
49 |         CovariateSpace.toCSV(covariateKey, readGroups) ++ observation.toCSV ++ skippedCell
50 |     }
51 |     val allRows = Seq(csvHeader) ++ dataRows
52 |     allRows.map(_.mkString(",")).mkString("\n")
53 |   }
54 |
55 |   /**
56 |    * @return The covariate columns followed by the four observation columns.
57 |    */
58 |   private def csvHeader: Seq[String] = {
59 |     val observationColumns = Seq("TotalCount", "MismatchCount", "EmpiricalQ", "IsSkipped")
60 |     CovariateSpace.csvHeader ++ observationColumns
61 |   }
62 | }
63 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/variant/ADAMVCFOutputFormat.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The BDG licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License. You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.variant
19 |
20 | import htsjdk.variant.vcf.{ VCFHeaderLine, VCFHeader }
21 | import org.apache.hadoop.fs.{ FileSystem, Path }
22 | import org.apache.hadoop.mapreduce.{ RecordWriter, TaskAttemptContext }
23 | import org.bdgenomics.adam.models.SequenceDictionary
24 | import org.seqdoop.hadoop_bam.{
25 |   KeyIgnoringVCFOutputFormat,
26 |   KeyIgnoringVCFRecordWriter,
27 |   VariantContextWritable,
28 |   VCFFormat
29 | }
30 |
31 | /**
32 |  * Wrapper for Hadoop-BAM to work around requirement for no-args constructor.
33 |  *
34 |  * @tparam K The key type. Keys are not written.
35 |  */
36 | class ADAMVCFOutputFormat[K] extends KeyIgnoringVCFOutputFormat[K](VCFFormat.VCF) with Serializable {
37 |
38 |   /**
39 |    * Reads the VCF header from the path stored in the job configuration and
40 |    * then delegates to the parent class to build the record writer.
41 |    *
42 |    * @param context The task attempt context, used for its configuration.
43 |    * @return The record writer created by the parent output format.
44 |    * @throws IllegalArgumentException if the header path is not configured.
45 |    */
46 |   override def getRecordWriter(context: TaskAttemptContext): RecordWriter[K, VariantContextWritable] = {
47 |     val conf = context.getConfiguration()
48 |
49 |     // where is our header file?
50 |     val headerPathKey = "org.bdgenomics.adam.rdd.variant.vcf_header_path"
51 |     val headerPath = conf.get(headerPathKey)
52 |     require(headerPath != null,
53 |       "VCF header path is not set in the job configuration under " + headerPathKey)
54 |     val path = new Path(headerPath)
55 |
56 |     // read the header file; resolve the file system from the path itself, as
57 |     // the header may not live on the configuration's default file system
58 |     readHeaderFrom(path, path.getFileSystem(conf))
59 |
60 |     // return record writer
61 |     super.getRecordWriter(context)
62 |   }
63 | }
64 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/models/VCFHeaderWritable.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The BDG licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.
You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.models
19 |
20 | import htsjdk.variant.vcf.VCFHeader
21 |
22 | /**
23 |  * Serializable wrapper for the VCF header.
24 |  *
25 |  * The class has a single field and an empty body.
26 |  *
27 |  * NOTE(review): no serialization logic is defined here; presumably a
28 |  * serializer is registered for this type elsewhere -- confirm.
29 |  *
30 |  * @param header A VCF header to serialize.
31 |  */
32 | private[adam] case class VCFHeaderWritable(header: VCFHeader) {
33 | }
34 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/sql/VariantContext.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The BDG licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License. You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.sql
19 |
20 | import org.bdgenomics.adam.models.{
21 |   ReferencePosition,
22 |   VariantContext => VariantContextModel
23 | }
24 | import org.bdgenomics.adam.rich.RichVariant
25 |
26 | /**
27 |  * Conversion from the model-layer VariantContext into the case class below.
28 |  */
29 | object VariantContext {
30 |
31 |   /**
32 |    * @param vc The model VariantContext to convert.
33 |    * @return A VariantContext carrying the position's reference name and
34 |    *   start, the variant's end, and the variant and genotypes converted
35 |    *   with fromAvro.
36 |    */
37 |   def fromModel(vc: VariantContextModel): VariantContext = {
38 |     val position = vc.position
39 |     val avroVariant = vc.variant.variant
40 |
41 |     VariantContext(position.referenceName,
42 |       position.start,
43 |       avroVariant.getEnd,
44 |       Variant.fromAvro(avroVariant),
45 |       vc.genotypes.map(Genotype.fromAvro(_)).toSeq)
46 |   }
47 | }
48 |
49 | /**
50 |  * A variant together with its genotypes and the coordinates it covers.
51 |  *
52 |  * @param referenceName The name of the reference the variant is on.
53 |  * @param start The start coordinate.
54 |  * @param end The end coordinate.
55 |  * @param variant The variant.
56 |  * @param genotypes The genotypes attached to the variant.
57 |  */
58 | case class VariantContext(referenceName: String,
59 |                           start: Long,
60 |                           end: Long,
61 |                           variant: Variant,
62 |                           genotypes: Seq[Genotype]) {
63 |
64 |   /**
65 |    * @return The model-layer VariantContext built from the reference name and
66 |    *   start, the variant converted with toAvro and wrapped in a RichVariant,
67 |    *   and the genotypes converted with toAvro.
68 |    */
69 |   def toModel(): VariantContextModel = {
70 |     val position = new ReferencePosition(referenceName, start)
71 |     val richVariant = RichVariant(variant.toAvro)
72 |     new VariantContextModel(position, richVariant, genotypes.map(_.toAvro))
73 |   }
74 | }
75 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/util/GenomeFileReader.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The BDG licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License. You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.util
19 |
20 | import org.apache.spark.SparkContext
21 | import org.bdgenomics.adam.models.{ SequenceDictionary, SequenceRecord }
22 |
23 | /**
24 |  * Object for reading Bedtools genome files from disk. Also supports
25 |  * UCSC Genome Browser chromInfo files.
26 |  */
27 | object GenomeFileReader {
28 |
29 |   /**
30 |    * Populates a SequenceDictionary from a .genome file on disk.
31 |    *
32 |    * Each non-blank line is split on tabs. The first token is the sequence
33 |    * name and the second its length; when a third token is present it is
34 |    * stored as the record's url. Blank lines are skipped.
35 |    *
36 |    * @param filePath The path to read the dictionary from.
37 |    * @param sc The SparkContext to use for configuration.
38 |    * @return Returns a populated sequence dictionary.
39 |    * @throws IllegalArgumentException if a non-blank line has fewer than two
40 |    *   tab-separated tokens. NOTE(review): this is raised inside the Spark
41 |    *   job, so callers will presumably see it wrapped by Spark -- confirm.
42 |    */
43 |   def apply(filePath: String,
44 |             sc: SparkContext): SequenceDictionary = {
45 |
46 |     val records = sc
47 |       .textFile(filePath)
48 |       // blank lines carry no record; skip them rather than fail on tokens(1)
49 |       .filter(line => line.trim.nonEmpty)
50 |       .map(line => {
51 |         val tokens = line.split("\t")
52 |         require(tokens.length >= 2,
53 |           "Expected a tab-separated name and length but found '%s'.".format(line))
54 |
55 |         if (tokens.length > 2) {
56 |           SequenceRecord(
57 |             tokens(0),
58 |             tokens(1).toLong,
59 |             url = Some(tokens(2)),
60 |             md5 = None,
61 |             refseq = None,
62 |             genbank = None,
63 |             assembly = None,
64 |             species = None,
65 |             index = None
66 |           )
67 |         } else {
68 |           SequenceRecord(tokens(0), tokens(1).toLong)
69 |         }
70 |       })
71 |       .collect
72 |
73 |     new SequenceDictionary(records.toVector)
74 |   }
75 | }
76 |
--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/util/ManualRegionPartitioner.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Big Data Genomics (BDG) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The BDG licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.
/**
 * Partitioner for keys that already carry their destination partition.
 *
 * A key is either a bare Int, which is used directly as the partition index,
 * or a pair whose second element is an Int partition index. Any other key is
 * rejected with an exception.
 *
 * @param partitions The number of partitions reported by this partitioner.
 */
private[adam] case class ManualRegionPartitioner[V](partitions: Int) extends Partitioner {

  override def numPartitions: Int = partitions

  /**
   * @param key Either an Int or a pair whose second element is an Int.
   * @return The partition index carried by the key.
   */
  def getPartition(key: Any): Int = key match {
    case destination: Int      => destination
    case (_, destination: Int) => destination
    case unassigned =>
      throw new Exception("Unable to partition key %s without destination assignment.".format(unassigned))
  }
}
/**
 * Helper object for setting the logging level for Parquet.
 */
object ParquetLogger {

  // java.util.logging's LogManager keeps only a weak reference to loggers
  // created through Logger.getLogger. Without a strong reference held here
  // the logger could be garbage collected after setLevel returns, and the
  // next lookup by name would create a fresh logger with no level set.
  private val parquetHadoopLogger: Logger = Logger.getLogger("org.apache.parquet.hadoop")

  /**
   * Sets the logger level for Parquet.
   *
   * Applies the given level to the "org.apache.parquet.hadoop"
   * java.util.logging logger.
   */
  val hadoopLoggerLevel = (level: Level) => {
    parquetHadoopLogger.setLevel(level)
  }
}
28 | * 29 | * @param region The desired ReferenceRegion to extract. 30 | * @return The reference sequence at the desired locus. 31 | */ 32 | def extract(region: ReferenceRegion): String 33 | 34 | /* 35 | * Stores SequenceDictionary for ReferenceFile 36 | */ 37 | def references: SequenceDictionary 38 | } 39 | -------------------------------------------------------------------------------- /adam-core/src/main/scala/org/bdgenomics/adam/util/TextAlignment.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Big Data Genomics (BDG) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The BDG licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * Enumeration of text alignments: Left, Right and Center, declared in that
 * order.
 */
private[util] object TextAlignment extends Enumeration {
  type TextAlignment = Value

  val Left = Value
  val Right = Value
  val Center = Value
}
hand crafted. The idea is that 5 | there are two candidate indel consensus locations (deletions) but 6 | the reads support one strictly more than the other one. 7 | 8 | Relevant commands: 9 | 10 | After changing the sam do (to fix the MD tags): 11 | 12 | samtools view -bS artificial.sam | samtools calmd - /home/andre/biotools/artificial.fa | samtools view -bS - > artificial.bam 13 | 14 | Observe pileup via: 15 | 16 | samtools mpileup -BIf artificial.fa artificial.bam | less 17 | 18 | For comparison with GATK: 19 | 20 | a) (only if new reads were added use Picard to add missing readgroup data): 21 | java -jar AddOrReplaceReadGroups.jar I= artificial.bam O= artificial.fixed.bam SORT_ORDER=coordinate RGID="read_group_id" RGLB="library" RGPL="illumina" RGPU="platform_unit" RGSM="sequencing_center" CREATE_INDEX=True; 22 | 23 | b) java -jar GenomeAnalysisTK.jar -T RealignerTargetCreator -R artificial.fa -I artificial.fixed.bam -o target.intervals 24 | 25 | c) java -jar /home/andre/biotools/gatk/GenomeAnalysisTK.jar -T IndelRealigner -R /home/andre/biotools/artificial.fa -I artificial.fixed.bam -o artificial.realigned.bam -targetIntervals target.intervals 26 | 27 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/artificial.cram: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/artificial.cram -------------------------------------------------------------------------------- /adam-core/src/test/resources/artificial.fa: -------------------------------------------------------------------------------- 1 | >artificial fasta 2 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGGGGGGGGAAAAAAAAAAGGGGGGGGGGAAAAAA 3 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 4 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 5 | 
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 6 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 7 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 8 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 9 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 10 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 11 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 12 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 13 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 14 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 15 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 16 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 17 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 18 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/artificial.fa.fai: -------------------------------------------------------------------------------- 1 | artificial 1120 18 70 71 2 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/artificial.sam: -------------------------------------------------------------------------------- 1 | @HD VN:1.3 SO:coordinate 2 | @SQ SN:artificial LN:1120 3 | read1 67 artificial 6 90 29M10D31M = 1 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:70 XS:i:70 NM:i:20 MD:Z:29^GGGGGGGGGG10G0G0G0G0G0G0G0G0G0G11 4 | read2 67 artificial 11 90 44M10D16M = 1 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGGGGGGGGAAAAAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:70 XS:i:70 NM:i:30 MD:Z:24G0G0G0G0G0G0G0G0G0G10^GGGGGGGGGG0A0A0A0A0A0A0A0A0A0A6 5 | read3 67 
artificial 16 90 19M10D41M = 1 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:70 XS:i:70 NM:i:20 MD:Z:19^GGGGGGGGGG10G0G0G0G0G0G0G0G0G0G21 6 | read4 67 artificial 21 90 34M10D26M = 1 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGGGGGGGGAAAAAAAAAAAAAAAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:70 XS:i:70 NM:i:30 MD:Z:14G0G0G0G0G0G0G0G0G0G10^GGGGGGGGGG0A0A0A0A0A0A0A0A0A0A16 7 | read5 67 artificial 26 90 9M10D51M = 1 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:70 XS:i:70 NM:i:20 MD:Z:9^GGGGGGGGGG10G0G0G0G0G0G0G0G0G0G31 8 | read1 131 artificial 106 90 60M = 1 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:0 AS:i:70 XS:i:70 MD:Z:60 9 | read2 131 artificial 111 90 60M = 1 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:0 AS:i:70 XS:i:70 MD:Z:60 10 | read3 131 artificial 116 90 60M = 1 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:0 AS:i:70 XS:i:70 MD:Z:60 11 | read4 131 artificial 121 90 60M = 1 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:0 AS:i:70 XS:i:70 MD:Z:60 12 | read5 131 artificial 126 90 60M = 1 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:0 AS:i:70 XS:i:70 MD:Z:60 13 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/badheader.sam: -------------------------------------------------------------------------------- 1 | @SQ SNN:1 LN:249250621 2 | @SQ SN:2 LN:243199373 3 | simread:1:26472783:false 16 1 26472784 60 75M 
* 0 0 GTATAAGAGCAGCCTTATTCCTATTTATAATCAGGGTGAAACACCTGTGCCAATGCCAAGACAGGGGTGCCAAGA * NM:i:0 AS:i:75 XS:i:0 4 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/bams/small.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/bams/small.bam -------------------------------------------------------------------------------- /adam-core/src/test/resources/bqsr1.vcf.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/bqsr1.vcf.tbi -------------------------------------------------------------------------------- /adam-core/src/test/resources/chr20.250k.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/chr20.250k.fa.gz -------------------------------------------------------------------------------- /adam-core/src/test/resources/ctg123.fasta.gff3: -------------------------------------------------------------------------------- 1 | ##gff-version 3.2.1 2 | ##sequence-region ctg123 1 1497228 3 | ctg123 . gene 1000 9000 . + . ID=gene00001;Name=EDEN 4 | ctg123 . TF_binding_site 1000 1012 . + . ID=tfbs00001;Parent=gene00001 5 | ctg123 . mRNA 1050 9000 . + . ID=mRNA00001;Parent=gene00001;Name=EDEN.1 6 | ctg123 . five_prime_UTR 1050 1200 . + . Parent=mRNA00001 7 | ctg123 . CDS 1201 1500 . + 0 ID=cds00001;Parent=mRNA00001 8 | ctg123 . CDS 3000 3902 . + 0 ID=cds00001;Parent=mRNA00001 9 | ctg123 . CDS 5000 5500 . + 0 ID=cds00001;Parent=mRNA00001 10 | ctg123 . CDS 7000 7600 . + 0 ID=cds00001;Parent=mRNA00001 11 | ctg123 . three_prime_UTR 7601 9000 . + . 
Parent=mRNA00001 12 | ctg123 . cDNA_match 1050 1500 5.8e-42 + . ID=match00001;Target=cdna0123+12+462 13 | ctg123 . cDNA_match 5000 5500 8.1e-43 + . ID=match00001;Target=cdna0123+463+963 14 | ctg123 . cDNA_match 7000 9000 1.4e-40 + . ID=match00001;Target=cdna0123+964+2964 15 | ##FASTA 16 | >ctg123 17 | cttctgggcgtacccgattctcggagaacttgccgcaccattccgccttg 18 | tgttcattgctgcctgcatgttcattgtctacctcggctacgtgtggcta 19 | tctttcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaa 20 | aagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttat 21 | aatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaat 22 | cttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattc 23 | gtatttgatttgggtttactatcgaataatgagaattttcaggcttaggc 24 | ttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggctt 25 | aggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttag 26 | aatctagctagctatccgaaattcgaggcctgaaaagtgtgacgccattc 27 | >cnda0123 28 | ttcaagtgctcagtcaatgtgattcacagtatgtcaccaaatattttggc 29 | agctttctcaagggatcaaaattatggatcattatggaatacctcggtgg 30 | aggctcagcgctcgatttaactaaaagtggaaagctggacgaaagtcata 31 | tcgctgtgattcttcgcgaaattttgaaaggtctcgagtatctgcatagt 32 | gaaagaaaaatccacagagatattaaaggagccaacgttttgttggaccg 33 | tcaaacagcggctgtaaaaatttgtgattatggttaaagg 34 | >aa0123 35 | mapgsvtsdispsststagssrspesekpgpshggvppggpshsslpvgr 36 | rhppvlrmvlealqageqrrgtsvaaiklyilhkyptvdvlrfkyllkqa 37 | latgmrrgllarplnskarg 38 | >prot0123 39 | MAPGSVTSDISPSSTSTAGSSRSPESEKPGPSHGGVPPGGPSHSSLPVGR 40 | RHPPVLRMVLEALQAGEQRRGTSVAAIKLYILHKYPTVDVLRFKYLLKQA 41 | LATGMRRGLLARPLNSKARGATGSFKLVPKHKKKIQPRKMAPATAPRRAG 42 | EAKGKGPKKPSEAKEDPPNVGKVKKAAKRPAKVQKPPPKPGAATEKARKQ 43 | GGAAKDTRAQSGEARKVPPKPDKAMRAPSSAGGLSRKAKAKGSRSSQGDA 44 | EAYRKTKAESKSSKPTASKVKNGAASPTKKKVVAKAKAPKAGQGPNTKAA 45 | APAKGSGSKVVPAHLSRKTEAPKGPRKAGLPIKASSSKVSSQRAEA 46 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/dict_with_accession.dict: -------------------------------------------------------------------------------- 1 | @HD VN:1.4 
#!/bin/bash
#
# Test helper that exercises environment-variable driven paths.
#
# Environment:
#   OUTPUT_PATH - file that receives everything read from stdin; the file
#                 "${OUTPUT_PATH}_2" receives a copy of INPUT_PATH.
#   INPUT_PATH  - file whose contents are echoed to stdout.
#
# All expansions are double-quoted so that paths containing spaces or glob
# characters are passed through as a single argument.

# pipe input to a file
tee "${OUTPUT_PATH}" > /dev/null

# print out another file
# (kept as a cat | tee pipeline so the exit status is still tee's)
cat "${INPUT_PATH}" | tee "${OUTPUT_PATH}_2"
#!/usr/bin/env python
"""
Converts paired FASTQ records read from stdin into unaligned SAM on stdout.

The input must contain a multiple of 8 lines (4 lines per read, 2 reads per
pair). The last character of each title line is taken as the read number
(1 or 2) and the last two characters are dropped to form the read name.
"""

from __future__ import print_function
import sys

# SAM flag bits set by this script
FLAG_PAIRED = 1  # we assume paired reads in this script
FLAG_UNMAPPED = 4
FLAG_MATE_UNMAPPED = 8
FLAG_FIRST_OF_PAIR = 64
FLAG_SECOND_OF_PAIR = 128


def sam_lines(lines):
    """
    Lazily yields the SAM header followed by one SAM line per FASTQ record.

    :param lines: FASTQ text as a list of lines (trailing newlines allowed).
    :return: Generator of SAM lines, without trailing newlines.
    :raises AssertionError: If the line count is not a multiple of 8, a title
      or separator line is malformed, or the read number is not 1 or 2.
    """
    # must have multiple of 8
    assert len(lines) % 8 == 0, "Expected multiple of 8 lines (got %d -> %s)" % (len(lines), lines)
    fastq_records = len(lines) // 4

    # sam header
    yield "@HD\tVN:1.5\tSO:unsorted"

    for i in range(fastq_records):

        # fastq is:
        #
        # @readname
        # sequence
        # +
        # qualities
        rn1 = lines[4 * i].strip()
        # startswith also turns an empty title line into an assertion failure
        assert rn1.startswith('@'), "Title line must start with '@': %s" % rn1
        readName = rn1[1:-2]
        readNum = rn1[-1:]
        sequence = lines[4 * i + 1].strip()

        # Strip the separator line before comparing; the raw line carries a
        # trailing newline and so could never equal the read name. The
        # optional text after '+' may be the full identifier or the read name.
        rn2 = lines[4 * i + 2].strip()
        assert rn2.startswith('+'), "Separator line must start with '+': %s" % rn2
        assert len(rn2) == 1 or rn2[1:] in (rn1[1:], readName), \
            "Separator line %s does not match title line %s" % (rn2, rn1)
        qualities = lines[4 * i + 3].strip()

        flags = FLAG_MATE_UNMAPPED | FLAG_UNMAPPED | FLAG_PAIRED
        if readNum == '1':
            flags |= FLAG_FIRST_OF_PAIR
        elif readNum == '2':
            flags |= FLAG_SECOND_OF_PAIR
        else:
            # readNum is a string, so comparing it to ints would raise
            # TypeError on Python 3; raise the intended error explicitly.
            raise AssertionError("Read num must be 1 or 2: %s from %s" % (readNum, rn1))

        # sam is the following tab-delimited columns:
        #
        # 1. read name
        # 2. flags
        # 3. ref (* = unaligned)
        # 4. pos (0 = unaligned)
        # 5. map qual (0 if unmapped)
        # 6. cigar (* = unavailable)
        # 7. mate ref (* = unaligned)
        # 8. mate pos (0 = unaligned)
        # 9. tlen (0 = unknown)
        # 10. sequence
        # 11. qualities
        yield "%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (readName,
                                                      flags,
                                                      sequence,
                                                      qualities)


def main():
    """Reads FASTQ from stdin and prints the SAM header and records to stdout."""
    for line in sam_lines(sys.stdin.readlines()):
        print(line)


if __name__ == '__main__':
    main()
1 | chr1 11869 14409 gene . + "pseudogene" 2 | chr1 11869 14409 transcript . + "processed_transcript" 3 | chr1 11869 12227 exon . + "processed_transcript" 4 | chr1 12613 12721 exon . + "processed_transcript" 5 | chr1 13221 14409 exon . + "processed_transcript" 6 | chr1 12010 13670 transcript . + "transcribed_unprocessed_pseudogene" 7 | chr1 12010 12057 exon . + "transcribed_unprocessed_pseudogene" 8 | chr1 12179 12227 exon . + "transcribed_unprocessed_pseudogene" 9 | chr1 12613 12697 exon . + "transcribed_unprocessed_pseudogene" 10 | chr1 12975 13052 exon . + "transcribed_unprocessed_pseudogene" 11 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/hg19.chrM.2bit: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/hg19.chrM.2bit -------------------------------------------------------------------------------- /adam-core/src/test/resources/hg19.genome: -------------------------------------------------------------------------------- 1 | chr1 249250621 2 | chr2 243199373 3 | chr3 198022430 4 | chr4 191154276 5 | chr5 180915260 6 | chr6 171115067 7 | chr7 159138663 8 | chrX 155270560 9 | chr8 146364022 10 | chr9 141213431 11 | chr10 135534747 12 | chr11 135006516 13 | chr12 133851895 14 | chr13 115169878 15 | chr14 107349540 16 | chr15 102531392 17 | chr16 90354753 18 | chr17 81195210 19 | chr18 78077248 20 | chr20 63025520 21 | chrY 59373566 22 | chr19 59128983 23 | chr22 51304566 24 | chr21 48129895 25 | chr6_ssto_hap7 4928567 26 | chr6_mcf_hap5 4833398 27 | chr6_cox_hap2 4795371 28 | chr6_mann_hap4 4683263 29 | chr6_apd_hap1 4622290 30 | chr6_qbl_hap6 4611984 31 | chr6_dbb_hap3 4610396 32 | chr17_ctg5_hap1 1680828 33 | chr4_ctg9_hap1 590426 34 | chr1_gl000192_random 547496 35 | chrUn_gl000225 211173 36 | chr4_gl000194_random 191469 37 | 
chr4_gl000193_random 189789 38 | chr9_gl000200_random 187035 39 | chrUn_gl000222 186861 40 | chrUn_gl000212 186858 41 | chr7_gl000195_random 182896 42 | chrUn_gl000223 180455 43 | chrUn_gl000224 179693 44 | chrUn_gl000219 179198 45 | chr17_gl000205_random 174588 46 | chrUn_gl000215 172545 47 | chrUn_gl000216 172294 48 | chrUn_gl000217 172149 49 | chr9_gl000199_random 169874 50 | chrUn_gl000211 166566 51 | chrUn_gl000213 164239 52 | chrUn_gl000220 161802 53 | chrUn_gl000218 161147 54 | chr19_gl000209_random 159169 55 | chrUn_gl000221 155397 56 | chrUn_gl000214 137718 57 | chrUn_gl000228 129120 58 | chrUn_gl000227 128374 59 | chr1_gl000191_random 106433 60 | chr19_gl000208_random 92689 61 | chr9_gl000198_random 90085 62 | chr17_gl000204_random 81310 63 | chrUn_gl000233 45941 64 | chrUn_gl000237 45867 65 | chrUn_gl000230 43691 66 | chrUn_gl000242 43523 67 | chrUn_gl000243 43341 68 | chrUn_gl000241 42152 69 | chrUn_gl000236 41934 70 | chrUn_gl000240 41933 71 | chr17_gl000206_random 41001 72 | chrUn_gl000232 40652 73 | chrUn_gl000234 40531 74 | chr11_gl000202_random 40103 75 | chrUn_gl000238 39939 76 | chrUn_gl000244 39929 77 | chrUn_gl000248 39786 78 | chr8_gl000196_random 38914 79 | chrUn_gl000249 38502 80 | chrUn_gl000246 38154 81 | chr17_gl000203_random 37498 82 | chr8_gl000197_random 37175 83 | chrUn_gl000245 36651 84 | chrUn_gl000247 36422 85 | chr9_gl000201_random 36148 86 | chrUn_gl000235 34474 87 | chrUn_gl000239 33824 88 | chr21_gl000210_random 27682 89 | chrUn_gl000231 27386 90 | chrUn_gl000229 19913 91 | chrM 16571 92 | chrUn_gl000226 15008 93 | chr18_gl000207_random 4262 94 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/hs38DH_chr1_10.fa: -------------------------------------------------------------------------------- 1 | >chr1 AC:CM000663.2 gi:568336023 LN:248956422 rl:Chromosome M5:6aef897c3d6ff0c78aff06ac189178dd AS:GRCh38 2 | 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 3 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 4 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 5 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 6 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 7 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 8 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 9 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 10 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 11 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/human_g1k_v37_chr1_59kb.2bit: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/human_g1k_v37_chr1_59kb.2bit -------------------------------------------------------------------------------- /adam-core/src/test/resources/improper_pairs_1.fq: -------------------------------------------------------------------------------- 1 | @H06HDADXX130110:2:2116:3345:91806/1 2 | GTTAGGGTTAGGGTTGGGTTAGGGTTAGGGTTAGGGTTAGGGGTAGGGTTAGGGTTAGGGGTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTAGGGCTAGGGTTAAGGGTAGGGTTAGCGAAAGGGCTGGGGTTAGGGGTGCGGGTACGCGTAGCATTAGGGCTAGAAGTAGGATCTGCAGTGCCTGACCGCGTCTGCGCGGCGACTGCCCAAAGCCTGGGGCCGACTCCAGGCTGAAGCTCAT 3 | + 4 | >=<=???>?>???=??>>8<=2=<===1194>?#3==>########################################################################################################################################################################################################### 5 | @H06HDADXX130110:1:2103:11970:57672/1 6 | 
GGATAGGGTTAGGGTTAGGGTTAGGGCTAGGGATAGGGGTAGGGTTGGGGTTGGTCATCGGGTGTTTCTTTGTGTTTGAGGTTGATTATTGTGATGGTTAAGGTATCTAGGTATTGTAAAAGTTGGCTTTTAACTTAGAAAATTATGTCATTCTGTTCACAAGTGTTTAGATTGGTAGATAGGTACTATGCGATCACTTCCATTGGCTGAGAGTTCGATTGATTATGAGCCACGCTAGTGGTTGAGATCT 7 | + 8 | 69+26933-:7;;135,53<>7<692(?2=9:**;<=##################################################################################################################################################################################################################### 9 | @H06JUADXX130110:1:1108:6424:55322/1 10 | AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACTCTAACCCTAACCCTAACCCTAACGGTAACCCTTACCCTTACTGTAACGCTTATCCTAAATCAAATTCTTCCTCTTAAGATCGCTGTTAAAATTAATCCTATTAGAACAGGTCTTCTGGCACCAAGTTATGTCAATATCCCTTACTCTAAACATGCCTTGATCTCTCATGCATCACTTCAGCACAGCTCTTATGGATCTAGGATCCTCAGT 11 | + 12 | =>;=?=@@=?@?@@9>7@=?=;=?@>29?=?;=>@;4@*0878;40'=@;(3399@9>7@:A############################################################################################################################################################################################ 13 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/improper_pairs_2.fq: -------------------------------------------------------------------------------- 1 | @H06HDADXX130110:2:2116:3345:91806/2 2 | TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTACCCCTAACCCTAACCCTAACCCTAACCCGTACCCTAAACCCAACCCTAACCACAAAGCAAATCCCAACCTTAACCGGAACCCGAAATCTCGCAGCAAATCTGCAGTAGAGACGCAGACTCAACCATGCGTCTATTAGTACGCATTATCATTGCCTCATGCTTCTTAAGTACAGAGAGATGAC 3 | + 4 | ==;@@@<>>@??<>>???<=>>?>:><@?4=:>7=5=>:<=@;'@A?######################################################################################################################################################################################################## 5 | @H06HDADXX130110:1:2103:11970:57672/2 6 | 
AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTATCGTCAAACCTTACCTCCTCCCTAGCCTCCACCCTGACCATGACACCAACCATCAGCCTTATAGAAAACCCCAGAGATGCTCTTATCCTATACCACAATTACCCCATAACGAAAGAAAGGACTGAAAACAAATAAGTAAAATTCGTACAAATTATATCTATGAGTATGTCCCTGAGTGTAGGTGTAGGTGCATCC 7 | + 8 | =>:=>@=?<>>??>;:=;:8;=(5)0-6;1:>?<>############################################################################################################################################################################################################## 9 | 10 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/indexed_bams/sorted.2.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/indexed_bams/sorted.2.bai -------------------------------------------------------------------------------- /adam-core/src/test/resources/indexed_bams/sorted.2.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/indexed_bams/sorted.2.bam -------------------------------------------------------------------------------- /adam-core/src/test/resources/indexed_bams/sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/indexed_bams/sorted.bam -------------------------------------------------------------------------------- /adam-core/src/test/resources/indexed_bams/sorted.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/indexed_bams/sorted.bam.bai 
-------------------------------------------------------------------------------- /adam-core/src/test/resources/interleaved_fastq_sample1.ifq.bgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/interleaved_fastq_sample1.ifq.bgz -------------------------------------------------------------------------------- /adam-core/src/test/resources/interleaved_fastq_sample1.ifq.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/interleaved_fastq_sample1.ifq.bz2 -------------------------------------------------------------------------------- /adam-core/src/test/resources/interleaved_fastq_sample1.ifq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/interleaved_fastq_sample1.ifq.gz -------------------------------------------------------------------------------- /adam-core/src/test/resources/interleaved_fastq_sample2.ifq.output: -------------------------------------------------------------------------------- 1 | >>>interleaved fastq record start>>> 2 | @H06HDADXX130110:1:2103:11970:57672_1 3 | GGATAGGGTTAGGGTTAGGGTTAGGGCTAGGGATAGGGGTAGGGTTGGGGTTGGTCATCGGGTGTTTCTTTGTGTTTGAGGTTGATTATTGTGATGGTTAAGGTATCTAGGTATTGTAAAAGTTGGCTTTTAACTTAGAAAATTATGTCATTCTGTTCACAAGTGTTTAGATTGGTAGATAGGTACTATGCGATCACTTCCATTGGCTGAGAGTTCGATTGATTATGAGCCACGCTAGTGGTTGAGATCT 4 | + 5 | 69+26933-:7;;135,53<>7<692(?2=9:**;<=##################################################################################################################################################################################################################### 6 | 
@H06HDADXX130110:1:2103:11970:57672_2 7 | AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTATCGTCAAACCTTACCTCCTCCCTAGCCTCCACCCTGACCATGACACCAACCATCAGCCTTATAGAAAACCCCAGAGATGCTCTTATCCTATACCACAATTACCCCATAACGAAAGAAAGGACTGAAAACAAATAAGTAAAATTCGTACAAATTATATCTATGAGTATGTCCCTGAGTGTAGGTGTAGGTGCATCC 8 | + 9 | =>:=>@=?<>>??>;:=;:8;=(5)0-6;1:>?<>############################################################################################################################################################################################################## 10 | <<>>interleaved fastq record start>>> 12 | @H06JUADXX130110:1:1108:6424:55322_1 13 | AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACTCTAACCCTAACCCTAACCCTAACGGTAACCCTTACCCTTACTGTAACGCTTATCCTAAATCAAATTCTTCCTCTTAAGATCGCTGTTAAAATTAATCCTATTAGAACAGGTCTTCTGGCACCAAGTTATGTCAATATCCCTTACTCTAAACATGCCTTGATCTCTCATGCATCACTTCAGCACAGCTCTTATGGATCTAGGATCCTCAGT 14 | + 15 | =>;=?=@@=?@?@@9>7@=?=;=?@>29?=?;=>@;4@*0878;40'=@;(3399@9>7@:A############################################################################################################################################################################################ 16 | @H06JUADXX130110:1:1108:6424:55322_2 17 | AGGGATAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGATAGGGCTAGGGTTAGGGATAGGGATAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTATCGATAGGGATAGGGATAGGGATAGAGTTAGGGCTATGGGTAGGGTTAGAGTCAGGGAAAGAGATAGGGATGGAGATGGGGTTAAAAAGAAGTCAAGGAATTAAGGTAGGGAAACGGTTCGAGATCTGTAAAGGGCAACGA 18 | + 19 | >>;>*9?:@??@@????@????>@?>>@>@?>?????@@???????=?;+A?@?>89?@###################################################################################################################################################################################### 20 | << 3 | ##contig= 4 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 5 | 1 14397 . 
CTGT C 139.12 IndelQD AC=2;AF=0.333;AN=6;BaseQRankSum=1.800;ClippingRankSum=0.138;DP=69;FS=7.786;MLEAC=2;MLEAF=0.333;MQ=26.84;MQ0=0;MQRankSum=-1.906;QD=1.55;ReadPosRankSum=0.384 GT:AD:DP:FT:GQ:PL 0/1:16,4:20:rd:99:120,0,827 0/1:8,2:10:dp;rd:60:60,0,414 0/0:39,0:39:PASS:99:0,116,2114 -------------------------------------------------------------------------------- /adam-core/src/test/resources/legacy.fa: -------------------------------------------------------------------------------- 1 | ;LCBO - Prolactin precursor - Bovine 2 | ; a sample sequence in FASTA format 3 | MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSS 4 | EMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL 5 | VTEVRGMKGAPDAILSRAIEIEEENKRLLEGMEMIFGQVIPGAKETEPYPVWSGLPSLQTKDED 6 | ARYSAFYNLLHCLRRDSSKIDTYLKLLNCRIIYNNNC* 7 | 8 | >MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken 9 | ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID 10 | FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA 11 | DIDGDGQVNYEEFVQMMTAK* 12 | 13 | >gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] 14 | LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV 15 | EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG 16 | LLILILL---LALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL 17 | GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX 18 | IENY 19 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stderr, logfile 3 | 4 | # Direct log messages to stderr 5 | log4j.appender.stderr=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stderr.Target=System.err 7 | log4j.appender.stderr.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stderr.threshold=WARN 9 | 
log4j.appender.stderr.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 10 | 11 | # Log at INFO level to file 12 | log4j.appender.logfile=org.apache.log4j.FileAppender 13 | log4j.appender.logfile.append=true 14 | log4j.appender.logfile.file=adam.log 15 | log4j.appender.logfile.threshold=INFO 16 | log4j.appender.logfile.layout=org.apache.log4j.PatternLayout 17 | log4j.appender.logfile.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 18 | log4j.appender.logfile.encoding=UTF-8 19 | 20 | # Tell Parquet to shut up 21 | log4j.logger.org.apache.parquet=ERROR 22 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/multi_chr.sam: -------------------------------------------------------------------------------- 1 | @SQ SN:1 LN:249250621 2 | @SQ SN:2 LN:243199373 3 | @PG ID:p1 PN:myProg CL:"myProg 123" VN:1.0.0 4 | @PG ID:p2 PN:myProg CL:"myProg 456" VN:1.0.0 PP:p1 5 | simread:1:26472783:false 16 1 26472784 60 75M * 0 0 GTATAAGAGCAGCCTTATTCCTATTTATAATCAGGGTGAAACACCTGTGCCAATGCCAAGACAGGGGTGCCAAGA * NM:i:0 AS:i:75 XS:i:0 6 | simread:1:240997787:true 0 1 240997788 60 75M * 0 0 CTTTATTTTTATTTTTAAGGTTTTTTTTGTTTGTTTGTTTTGAGATGGAGTCTCGCTCCACCGCCCAGACTGGAG * NM:i:0 AS:i:75 XS:i:39 7 | simread:1:189606653:true 0 2 189606654 60 75M * 0 0 TGTATCTTCCTCCCCTGCTGTATGTTTCCTGCCCTCAAACATCACACTCCACGTTCTTCAGCTTTAGGACTTGGA * NM:i:0 AS:i:75 XS:i:0 8 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/proper_pairs_1.fq: -------------------------------------------------------------------------------- 1 | @H06HDADXX130110:2:2116:3345:91806/1 2 | GTTAGGGTTAGGGTTGGGTTAGGGTTAGGGTTAGGGTTAGGGGTAGGGTTAGGGTTAGGGGTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTAGGGCTAGGGTTAAGGGTAGGGTTAGCGAAAGGGCTGGGGTTAGGGGTGCGGGTACGCGTAGCATTAGGGCTAGAAGTAGGATCTGCAGTGCCTGACCGCGTCTGCGCGGCGACTGCCCAAAGCCTGGGGCCGACTCCAGGCTGAAGCTCAT 3 | + 4 | 
>=<=???>?>???=??>>8<=2=<===1194>?#3==>########################################################################################################################################################################################################### 5 | @H06HDADXX130110:1:2103:11970:57672/1 6 | GGATAGGGTTAGGGTTAGGGTTAGGGCTAGGGATAGGGGTAGGGTTGGGGTTGGTCATCGGGTGTTTCTTTGTGTTTGAGGTTGATTATTGTGATGGTTAAGGTATCTAGGTATTGTAAAAGTTGGCTTTTAACTTAGAAAATTATGTCATTCTGTTCACAAGTGTTTAGATTGGTAGATAGGTACTATGCGATCACTTCCATTGGCTGAGAGTTCGATTGATTATGAGCCACGCTAGTGGTTGAGATCT 7 | + 8 | 69+26933-:7;;135,53<>7<692(?2=9:**;<=##################################################################################################################################################################################################################### 9 | @H06JUADXX130110:1:1108:6424:55322/1 10 | AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACTCTAACCCTAACCCTAACCCTAACGGTAACCCTTACCCTTACTGTAACGCTTATCCTAAATCAAATTCTTCCTCTTAAGATCGCTGTTAAAATTAATCCTATTAGAACAGGTCTTCTGGCACCAAGTTATGTCAATATCCCTTACTCTAAACATGCCTTGATCTCTCATGCATCACTTCAGCACAGCTCTTATGGATCTAGGATCCTCAGT 11 | + 12 | =>;=?=@@=?@?@@9>7@=?=;=?@>29?=?;=>@;4@*0878;40'=@;(3399@9>7@:A############################################################################################################################################################################################ 13 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/proper_pairs_2.fq: -------------------------------------------------------------------------------- 1 | @H06HDADXX130110:2:2116:3345:91806/2 2 | TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTACCCCTAACCCTAACCCTAACCCTAACCCGTACCCTAAACCCAACCCTAACCACAAAGCAAATCCCAACCTTAACCGGAACCCGAAATCTCGCAGCAAATCTGCAGTAGAGACGCAGACTCAACCATGCGTCTATTAGTACGCATTATCATTGCCTCATGCTTCTTAAGTACAGAGAGATGAC 3 | + 4 | 
==;@@@<>>@??<>>???<=>>?>:><@?4=:>7=5=>:<=@;'@A?######################################################################################################################################################################################################## 5 | @H06HDADXX130110:1:2103:11970:57672/2 6 | AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTATCGTCAAACCTTACCTCCTCCCTAGCCTCCACCCTGACCATGACACCAACCATCAGCCTTATAGAAAACCCCAGAGATGCTCTTATCCTATACCACAATTACCCCATAACGAAAGAAAGGACTGAAAACAAATAAGTAAAATTCGTACAAATTATATCTATGAGTATGTCCCTGAGTGTAGGTGTAGGTGCATCC 7 | + 8 | =>:=>@=?<>>??>;:=;:8;=(5)0-6;1:>?<>############################################################################################################################################################################################################## 9 | @H06JUADXX130110:1:1108:6424:55322/2 10 | AGGGATAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGATAGGGCTAGGGTTAGGGATAGGGATAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTATCGATAGGGATAGGGATAGGGATAGAGTTAGGGCTATGGGTAGGGTTAGAGTCAGGGAAAGAGATAGGGATGGAGATGGGGTTAAAAAGAAGTCAAGGAATTAAGGTAGGGAAACGGTTCGAGATCTGTAAAGGGCAACGA 11 | + 12 | >>;>*9?:@??@@????@????>@?>>@>@?>?????@@???????=?;+A?@?>89?@###################################################################################################################################################################################### 13 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/read_names_with_index_sequences_interleaved.fq: -------------------------------------------------------------------------------- 1 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1116:2123 1:N:0:ATCANG 2 | TCTGTGTAAATTACCCAGCCTCACGTATTCCTTTAGAGCAATGCAAAACAGACTAGACAAAAGGCTTTTAAAAGTCTAATCTGAGATTCCTGACCAAATGT 3 | + 4 | CCCFFFFFHHHHHJJJJJJJJJJJJHIJJJJJJJJJIJJJJJJJJJJJJJJJJJJJHIJGHJIJJIJJJJJHHHHHHHFFFFFFFEDDEEEEDDDDDDDDD 5 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1116:2123 2:N:0:ATCANG 6 | 
NTAATAATGAGTGCACAATAGTTTTTCTCCTGAAACATAATTATTCTCTCAATCATCCCCATCCCCACCAAAGTCAATCACGGGAAGATCAATCAGCCTGC 7 | + 8 | #1=DFFFFHHHFHIJJIIIJJJIJJJIJJJJJJJIJJJJJJIJJJJJJJJJJJJJIJJJJJJJJJJIJIJIJJHHHHHHHFFFDDDDDDDDDDDDDDDDDD 9 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1129:2182 1:N:0:ATCANG 10 | AAGCTGACTTGTGTTGGGAGCTCATCTGTTCCCTTGACTTCTCTTTTTCCAGTTCTTCGTCAAGGCCACAGGTGCTGCGGGAAAATCAGTAACTAATGAAC 11 | + 12 | @C@FFFDFFFDFBHGDHH;EHHGHE?EBHCHIGGI>BFFECGGIIIIGIHEBBGCHHIG);;FEEADGH@CCDCCC@CCC 13 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1129:2182 2:N:0:ATCANG 14 | TCCTCCCACTTCTGTCTCCCTCAGCAGCCTCTCATATTGCTGCTGTCTGCCTGGCCTATAGGCTTCTGAGTTATGACACTGGTGTGAAGAGAAAAGGCTTN 15 | + 16 | 1?@DABDDDDF+AE?EBFHIIII>G>?;8?3?EEFB;5<5(: 17 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1158:2217 1:N:0:ATCANG 18 | AGCTGACATGAGAAAAGCCTGGTAAATCCGGGGCAAGTGACTGAAATGAAAGAATCCAATCAGATTCCAGCTCCAAGGGCCGCTAATTGTAGTAACTGGCT 19 | + 20 | CCCFDFFFHFFFFGIIGGHJEB@DDEDE@>?? 21 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1158:2217 2:N:0:ATCANG 22 | ATATTAAGCCACTTGCAGCAAGACAGCCTGAAACTTCGTGACTCCCTGGAGCTTTTGGTGGTGGACGAAGCTGACCTTCTTTTTTCCTTTGGCTTTGANNN 23 | + 24 | C@CFDFFFHGHGHIJJJIGHHHIJJJIIEHIJJJJJIHDFGBFGIIJGGHIGIJJGEHCFH@EHEEEDFCDCDDDDDDDDDDDDDDDDDDCDDCDDCDDCC 25 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1499:2087 1:N:0:ATCANG 26 | TCATTCCACATCTCAATCTCTCCTAGGAAGTTTTCCGGCCTTGTTGACAGGTTTAATTGAAAGGAGAAGCCAAATGTTGAGTAAACAGATTGCAAAAACTG 27 | + 28 | CCCFFFFFHHFHHJJJJJIJIIJHJJJJJJIIJIJJGIIIIJIJJJJJJJJBFIIJJJJJJJJJJJJJJHHHHHFF?DFFFEEEEEEDDCDDDDDDDDDDC 29 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1499:2087 2:N:0:ATCANG 30 | NNNNNNNNNNNNNNNNNNNNNGGATAANNANCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 31 | + 32 | #####################22@@??##1#0############.############################################++8::<====== 33 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/read_names_with_index_sequences_pair1.fq: -------------------------------------------------------------------------------- 1 | 
@HISEQ_HU01:89:H7YRLADXX:1:1101:1116:2123 1:N:0:ATCACG 2 | TCTGTGTAAATTACCCAGCCTCACGTATTCCTTTAGAGCAATGCAAAACAGACTAGACAAAAGGCTTTTAAAAGTCTAATCTGAGATTCCTGACCAAATGT 3 | + 4 | CCCFFFFFHHHHHJJJJJJJJJJJJHIJJJJJJJJJIJJJJJJJJJJJJJJJJJJJHIJGHJIJJIJJJJJHHHHHHHFFFFFFFEDDEEEEDDDDDDDDD 5 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1129:2182 1:N:0:ATCACG 6 | AAGCTGACTTGTGTTGGGAGCTCATCTGTTCCCTTGACTTCTCTTTTTCCAGTTCTTCGTCAAGGCCACAGGTGCTGCGGGAAAATCAGTAACTAATGAAC 7 | + 8 | @C@FFFDFFFDFBHGDHH;EHHGHE?EBHCHIGGI>BFFECGGIIIIGIHEBBGCHHIG);;FEEADGH@CCDCCC@CCC 9 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1158:2217 1:N:0:ATCACG 10 | AGCTGACATGAGAAAAGCCTGGTAAATCCGGGGCAAGTGACTGAAATGAAAGAATCCAATCAGATTCCAGCTCCAAGGGCCGCTAATTGTAGTAACTGGCT 11 | + 12 | CCCFDFFFHFFFFGIIGGHJEB@DDEDE@>?? 13 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1499:2087 1:N:0:ATCACG 14 | TCATTCCACATCTCAATCTCTCCTAGGAAGTTTTCCGGCCTTGTTGACAGGTTTAATTGAAAGGAGAAGCCAAATGTTGAGTAAACAGATTGCAAAAACTG 15 | + 16 | CCCFFFFFHHFHHJJJJJIJIIJHJJJJJJIIJIJJGIIIIJIJJJJJJJJBFIIJJJJJJJJJJJJJJHHHHHFF?DFFFEEEEEEDDCDDDDDDDDDDC -------------------------------------------------------------------------------- /adam-core/src/test/resources/read_names_with_index_sequences_pair2.fq: -------------------------------------------------------------------------------- 1 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1116:2123 2:N:0:ATCACG 2 | NTAATAATGAGTGCACAATAGTTTTTCTCCTGAAACATAATTATTCTCTCAATCATCCCCATCCCCACCAAAGTCAATCACGGGAAGATCAATCAGCCTGC 3 | + 4 | #1=DFFFFHHHFHIJJIIIJJJIJJJIJJJJJJJIJJJJJJIJJJJJJJJJJJJJIJJJJJJJJJJIJIJIJJHHHHHHHFFFDDDDDDDDDDDDDDDDDD 5 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1129:2182 2:N:0:ATCACG 6 | TCCTCCCACTTCTGTCTCCCTCAGCAGCCTCTCATATTGCTGCTGTCTGCCTGGCCTATAGGCTTCTGAGTTATGACACTGGTGTGAAGAGAAAAGGCTTN 7 | + 8 | 1?@DABDDDDF+AE?EBFHIIII>G>?;8?3?EEFB;5<5(: 9 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1158:2217 2:N:0:ATCACG 10 | ATATTAAGCCACTTGCAGCAAGACAGCCTGAAACTTCGTGACTCCCTGGAGCTTTTGGTGGTGGACGAAGCTGACCTTCTTTTTTCCTTTGGCTTTGANNN 11 | + 12 | 
C@CFDFFFHGHGHIJJJIGHHHIJJJIIEHIJJJJJIHDFGBFGIIJGGHIGIJJGEHCFH@EHEEEDFCDCDDDDDDDDDDDDDDDDDDCDDCDDCDDCC 13 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1499:2087 2:N:0:ATCACG 14 | NNNNNNNNNNNNNNNNNNNNNGGATAANNANCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 15 | + 16 | #####################22@@??##1#0############.############################################++8::<====== -------------------------------------------------------------------------------- /adam-core/src/test/resources/readname_sorted.sam: -------------------------------------------------------------------------------- 1 | @HD VN:1.6 SO:queryname 2 | @SQ SN:1 LN:1000 3 | @SQ SN:chr2 LN:1000 4 | @SQ SN:3 LN:1000 5 | @SQ SN:4 LN:2000 6 | A 0 1 1 50 10M * 0 0 ACACACACAC ********** 7 | B 0 3 11 40 4M2I4M * 0 0 ACACACACAC ********** 8 | C 0 4 1001 25 8M * 0 0 ACACACAC ******** 9 | D 0 chr2 501 55 10M2S * 0 0 ACACACACACAC ************ 10 | E 0 chr2 101 45 10M * 0 0 ACACACACAC ********** 11 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/reads-0-2-0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/reads-0-2-0 -------------------------------------------------------------------------------- /adam-core/src/test/resources/sample_coverage.bed: -------------------------------------------------------------------------------- 1 | chr1 1 10 sequence_feature 3.0 ? 2 | chr1 15 20 sequence_feature 2.0 ? 3 | chr2 15 20 sequence_feature 2.0 ? 
4 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/small.1.bed: -------------------------------------------------------------------------------- 1 | 1 143 26423 2 | 1 14397230 26472788 3 | 1 169801934 169801939 4 | 1 240997788 240997796 -------------------------------------------------------------------------------- /adam-core/src/test/resources/small.1.narrowPeak: -------------------------------------------------------------------------------- 1 | 1 26472784 26472859 simread:1:26472783:false 0 + 0 -1 -1 -1 2 | 1 240997788 240997863 simread:1:240997787:true 0 + 0 -1 -1 -1 3 | 1 189606654 189606729 simread:1:189606653:true 0 + 0 -1 -1 -1 4 | 1 207027739 207027814 simread:1:207027738:true 0 + 0 -1 -1 -1 5 | 1 14397234 14397309 simread:1:14397233:false 0 + 0 -1 -1 -1 6 | 1 240344443 240344518 simread:1:240344442:true 0 + 0 -1 -1 -1 7 | 1 153978725 153978800 simread:1:153978724:false 0 + 0 -1 -1 -1 8 | 1 237728410 237728485 simread:1:237728409:true 0 + 0 -1 -1 -1 9 | 1 231911907 231911982 simread:1:231911906:false 0 + 0 -1 -1 -1 10 | 1 50683372 50683447 simread:1:50683371:false 0 + 0 -1 -1 -1 11 | 1 37577446 37577521 simread:1:37577445:false 0 + 0 -1 -1 -1 12 | 1 195211966 195212041 simread:1:195211965:false 0 + 0 -1 -1 -1 13 | 1 163841414 163841489 simread:1:163841413:false 0 + 0 -1 -1 -1 14 | 1 101556379 101556454 simread:1:101556378:false 0 + 0 -1 -1 -1 15 | 1 20101801 20101876 simread:1:20101800:true 0 + 0 -1 -1 -1 16 | 1 186794284 186794359 simread:1:186794283:true 0 + 0 -1 -1 -1 17 | 1 165341383 165341458 simread:1:165341382:true 0 + 0 -1 -1 -1 18 | 1 5469107 5469182 simread:1:5469106:true 0 + 0 -1 -1 -1 19 | 1 89554253 89554328 simread:1:89554252:false 0 + 0 -1 -1 -1 20 | 1 169801934 169802009 simread:1:169801933:true 0 + 0 -1 -1 -1 21 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/small.1_12.bed: 
-------------------------------------------------------------------------------- 1 | 1 143 26423 line1 0.0 . 150 26400 0,0,0 . . . 2 | 1 14397230 26472788 line2 100.0 + 14397230 26472700 255,0,0 1 12075558 14397230 3 | 1 169801934 169801939 line3 200.0 - . . 0,255,0 2 100,200 169801934,169801739 4 | 1 240997788 240997796 line4 with a space 1000.0 ? . . 0,0,255 . . . 5 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/small_realignment_targets.intervals: -------------------------------------------------------------------------------- 1 | 702290 702324 2 | 807756 3 | 808685 4 | 857250 857251 5 | 858175 858176 6 | 869645 869648 7 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/small_realignment_targets_README.txt: -------------------------------------------------------------------------------- 1 | 2 | Rough summary for generation of test cases. 3 | 4 | The reads were hand-picked from an input generated by the 5 | Mason read simulator. Indel realignment intervals extracted 6 | by hand from GATK output of the RealignmentTargetCreator. 
7 | 8 | Mouse reference from: 9 | 10 | wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Mus_musculus/GRCm38/Primary_Assembly/assembled_chromosomes/FASTA/chrY.fa.gz 11 | 12 | Mason from: 13 | 14 | http://www.seqan.de/projects/mason/ 15 | 16 | Mouse reads created by: 17 | 18 | ./bin/mason illumina -sq -n 100 -hn 2 -pi 0.005 -pd 0.005 -mp -i -rn 1 /home/andre/biotools/mouse_chrY.fa 19 | 20 | Convert sam to bam and index: 21 | 22 | samtools view -bS mouse_chrY.fa.fastq.sam > mouse_chrY.fa.fastq.bam 23 | samtools index mouse_chrY.fa.fastq.bam 24 | 25 | Fix read groups and such: 26 | 27 | ./picard-tools.sh AddOrReplaceReadGroups.jar I= mouse_chrY.fa.fastq.bam O= mouse_chrY.fa.fastq.fixed.bam SORT_ORDER=coordinate RGID="read_group_id" RGLB="library" RGPL="illumina" RGPU="platform_unit" RGSM="sequencing_center" CREATE_INDEX=True; 28 | 29 | Reference sequence dictionary: 30 | 31 | ./picard-tools.sh CreateSequenceDictionary.jar R= mouse_chrY.fa O= mouse_chrY.dict 32 | 33 | Here we notice that the true bam fails GATK's mapping quality test?! 34 | 35 | bwa index mouse_chrY.fa 36 | bwa mem -M -t 4 mouse_chrY.fa mouse_chrY.fa_1.fastq mouse_chrY.fa_2.fastq > mouse_chrY.fa.bwa.sam 37 | 38 | ... 
and repeat the steps from above 39 | samtools view -bS mouse_chrY.fa.bwa.sam > mouse_chrY.fa.bwa.bam 40 | samtools sort mouse_chrY.fa.bwa.bam mouse_chrY.fa.bwa.sorted 41 | samtools index mouse_chrY.fa.bwa.sorted.bam 42 | ./picard-tools.sh AddOrReplaceReadGroups.jar I= mouse_chrY.fa.bwa.sorted.bam O= mouse_chrY.fa.bwa.sorted.fixed.bam SORT_ORDER=coordinate RGID="read_group_id" RGLB="library" RGPL="illumina" RGPU="platform_unit" RGSM="sequencing_center" CREATE_INDEX=True; 43 | 44 | Generate samtools mpileup for comparison: 45 | 46 | samtools mpileup -f /home/andre/biotools/mouse_chrY.fa small_realignment_targets.bam > small_realignment_targets.pileup 47 | 48 | Notice that MD tag is missing so generate it: 49 | 50 | samtools calmd small_realignment_targets.bam /home/andre/biotools/mouse_chrY.fa > small_realignment_targets.sam_new 51 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/sorted.sam: -------------------------------------------------------------------------------- 1 | @HD VN:1.6 SO:coordinate 2 | @SQ SN:1 LN:1000 3 | @SQ SN:3 LN:1000 4 | @SQ SN:4 LN:2000 5 | @SQ SN:chr2 LN:1000 6 | A 0 1 1 50 10M * 0 0 ACACACACAC ********** 7 | B 0 3 11 40 4M2I4M * 0 0 ACACACACAC ********** 8 | C 0 4 1001 25 8M * 0 0 ACACACAC ******** 9 | E 0 chr2 101 45 10M * 0 0 ACACACACAC ********** 10 | D 0 chr2 501 55 10M2S * 0 0 ACACACACACAC ************ 11 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/tab5_to_usam.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | import sys 5 | 6 | # read lines from stdin 7 | lines = sys.stdin.readlines() 8 | 9 | # print sam header 10 | print("@HD\tVN:1.5\tSO:unsorted") 11 | 12 | # loop and print sam lines 13 | for line in lines: 14 | fields = line.split() 15 | readName = fields[0] 16 | firstSequence = fields[1] 17 
| firstQualities = fields[2] 18 | secondSequence = fields[3] 19 | secondQualities = fields[4] 20 | 21 | # flags: 22 | # 1 = paired (we assume that in this script) 23 | # 4 = unmapped 24 | # 8 = mate unmapped 25 | # 64 = first of pair 26 | # 128 = second of pair 27 | firstFlags = 64 | 8 | 4 | 1 28 | secondFlags = 128 | 8 | 4 | 1 29 | 30 | # sam is the following tab-delimited columns: 31 | # 32 | # 1. read name 33 | # 2. flags 34 | # 3. ref (* = unaligned) 35 | # 4. pos (0 = unaligned) 36 | # 5. map qual (0 if unmapped) 37 | # 6. cigar (* = unavailable) 38 | # 7. mate ref (* = unaligned) 39 | # 8. mate pos (0 = unaligned) 40 | # 9. tlen (0 = unknown) 41 | # 10. sequence 42 | # 11. qualities 43 | print("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (readName + "/1", 44 | firstFlags, 45 | firstSequence, 46 | firstQualities)) 47 | print("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (readName + "/2", 48 | secondFlags, 49 | secondSequence, 50 | secondQualities)) 51 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/tab6_to_usam.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | import sys 5 | 6 | # read lines from stdin 7 | lines = sys.stdin.readlines() 8 | 9 | # print sam header 10 | print("@HD\tVN:1.5\tSO:unsorted") 11 | 12 | # loop and print sam lines 13 | for line in lines: 14 | fields = line.split() 15 | firstReadName = fields[0] 16 | firstSequence = fields[1] 17 | firstQualities = fields[2] 18 | secondReadName = fields[3] 19 | secondSequence = fields[4] 20 | secondQualities = fields[5] 21 | 22 | # flags: 23 | # 1 = paired (we assume that in this script) 24 | # 4 = unmapped 25 | # 8 = mate unmapped 26 | # 64 = first of pair 27 | # 128 = second of pair 28 | firstFlags = 64 | 8 | 4 | 1 29 | secondFlags = 128 | 8 | 4 | 1 30 | 31 | # sam is the following tab-delimited columns: 32 | # 33 | # 1. read name 34 | # 2. 
flags 35 | # 3. ref (* = unaligned) 36 | # 4. pos (0 = unaligned) 37 | # 5. map qual (0 if unmapped) 38 | # 6. cigar (* = unavailable) 39 | # 7. mate ref (* = unaligned) 40 | # 8. mate pos (0 = unaligned) 41 | # 9. tlen (0 = unknown) 42 | # 10. sequence 43 | # 11. qualities 44 | print("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (firstReadName, 45 | firstFlags, 46 | firstSequence, 47 | firstQualities)) 48 | print("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (secondReadName, 49 | secondFlags, 50 | secondSequence, 51 | secondQualities)) 52 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/tag.sam: -------------------------------------------------------------------------------- 1 | @SQ SN:1 LN:249250621 2 | @SQ SN:2 LN:243199373 3 | @PG ID:p1 PN:myProg CL:"myProg 123" VN:1.0.0 4 | @PG ID:p2 PN:myProg CL:"myProg 456" VN:1.0.0 PP:p1 5 | simread:1:26472783:false 16 1 26472784 60 75M * 0 0 GTATAAGAGCAGCCTTATTCCTATTTATAATCAGGGTGAAACACCTGTGCCAATGCCAAGACAGGGGTGCCAAGA * NM:i:0 AS:i:75 XS:i:0 Zb:B:c,-1,0,1 ZB:B:C,1,0,1 Zi:B:i,-1,0,1,2 ZI:B:I,1,0,1,2 Zs:B:s,-2,0,2 ZS:B:S,2,0,2 ZF:B:f,-1.100000,0.000000,1.100000 -------------------------------------------------------------------------------- /adam-core/src/test/resources/tags.sam: -------------------------------------------------------------------------------- 1 | @HD VN:1.4 SO:coordinate 2 | @SQ SN:1 LN:1000 3 | StandardTags 0 1 1 255 10M * 0 0 ACACACACAC ********** NM:i:0 MD:Z:10 XS:A:- 4 | MDTagWithEdits 0 1 1 255 10M * 0 0 ACAGACACTC ********** NM:i:2 MD:Z:3G4T1 5 | HexByteArray 0 1 1 255 10M * 0 0 ACACACACAC ********** NM:i:0 MD:Z:10 XB:H:010203 6 | LengthOneArrays 0 1 1 255 10M * 0 0 ACACACACAC ********** NM:i:0 MD:Z:10 XB:B:c,1 XI:B:i,1 XS:B:s,1 XF:B:f,1 7 | LongerArrays 0 1 1 255 10M * 0 0 ACACACACAC ********** NM:i:0 MD:Z:10 XB:B:c,1,2,3 XI:B:i,1,2,3 XS:B:s,1,2,3 XS:B:f,1,2,3 8 | SignedArrays 0 1 1 255 10M * 0 0 ACACACACAC ********** NM:i:0 MD:Z:10 XB:B:c,-1 XI:B:i,-1 
XS:B:s,-1 9 | UnsignedArrays 0 1 1 255 10M * 0 0 ACACACACAC ********** NM:i:0 MD:Z:10 XB:B:C,1,2,3 XI:B:I,1,2,3 XS:B:S,1,2,3 10 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/test.compressed.bcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/test.compressed.bcf -------------------------------------------------------------------------------- /adam-core/src/test/resources/test.conf: -------------------------------------------------------------------------------- 1 | 2 | accessKey=accessKey 3 | secretKey=secretKey 4 | 5 | accessKey_s3 = accessKey_s3 6 | secretKey_s3=secretKey_s3 7 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/test.uncompressed.bcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/test.uncompressed.bcf -------------------------------------------------------------------------------- /adam-core/src/test/resources/test.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##fileDate=20090805 3 | ##source=myImputationProgramV3.1 4 | ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta 5 | ##contig= 6 | ##phasing=partial 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##FILTER= 14 | ##FILTER= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##FORMAT= 19 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 20 | 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 21 | 20 17330 . 
T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 22 | 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 23 | 20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 24 | 20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 25 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/test.vcf.bgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/test.vcf.bgz -------------------------------------------------------------------------------- /adam-core/src/test/resources/test.vcf.bgzf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/test.vcf.bgzf.gz -------------------------------------------------------------------------------- /adam-core/src/test/resources/test.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/test.vcf.gz -------------------------------------------------------------------------------- /adam-core/src/test/resources/test_command.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # pipe input to a file 4 | tee $1 > /dev/null 5 | 6 | # print out another file 7 | cat $2 -------------------------------------------------------------------------------- /adam-core/src/test/resources/test_rowgroup_rangeindex.1.txt: -------------------------------------------------------------------------------- 1 | 
s3://TEST/test-parquet 0 chr22:1000-2000,chr21:10000-20000 2 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/timeout.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | import sys 5 | import time 6 | 7 | # read lines from stdin 8 | lines = sys.stdin.readlines() 9 | 10 | def print_lines(skip_header=False): 11 | for line in lines: 12 | if not (skip_header and line.startswith('@')): 13 | print(line.strip().rstrip()) 14 | 15 | print_lines() 16 | sys.stdout.flush() 17 | 18 | time.sleep(10) 19 | 20 | print_lines(skip_header=True) 21 | sys.stdout.flush() 22 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/unsorted.sam: -------------------------------------------------------------------------------- 1 | @HD VN:1.6 SO:unsorted 2 | @SQ SN:1 LN:1000 3 | @SQ SN:chr2 LN:1000 4 | @SQ SN:3 LN:1000 5 | @SQ SN:4 LN:2000 6 | B 0 3 11 40 4M2I4M * 0 0 ACACACACAC ********** 7 | E 0 chr2 101 45 10M * 0 0 ACACACACAC ********** 8 | C 0 4 1001 25 8M * 0 0 ACACACAC ******** 9 | A 0 1 1 50 10M * 0 0 ACACACACAC ********** 10 | D 0 chr2 501 55 10M2S * 0 0 ACACACACACAC ************ 11 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/wgEncodeOpenChromDnaseGm19238Pk.trunc10.narrowPeak: -------------------------------------------------------------------------------- 1 | chr1 713849 714434 chr1.1 1000 . 0.2252 9.16 -1 263 2 | chr1 740180 740393 chr1.2 595 . 0.0473 1.94 -1 104 3 | chr1 752735 753037 chr1.3 613 . 0.0536 2.2 -1 135 4 | chr1 762137 763263 chr1.4 1000 . 0.3077 12.5 -1 742 5 | chr1 773142 773478 chr1.5 571 . 0.0387 1.59 -1 200 6 | chr1 773831 773990 chr1.6 566 . 0.0370 1.52 -1 66 7 | chr1 791738 791783 chr1.7 551 . 0.0315 1.3 -1 13 8 | chr1 793311 793670 chr1.8 690 . 
0.0812 3.32 -1 165 9 | chr1 793756 794115 chr1.9 588 . 0.0447 1.84 -1 144 10 | chr1 794221 794336 chr1.10 553 . 0.0323 1.33 -1 57 11 | -------------------------------------------------------------------------------- /adam-core/src/test/resources/wgs_calling_regions.hg38.interval_list: -------------------------------------------------------------------------------- 1 | @HD VN:1.5 SO:coordinate 2 | @SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd AS:38 UR:/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta SP:Homo sapiens 3 | @SQ SN:chr2 LN:242193529 M5:f98db672eb0993dcfdabafe2a882905c AS:38 UR:/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta SP:Homo sapiens 4 | @PG ID:1 CL:picard.util.IntervalListTools INPUT=[HG38excludeNs.interval_list, genome.interval_list] OUTPUT=wgs_calling_regions.v3.interval_list SORT=true ACTION=INTERSECT PADDING=0 UNIQUE=false SCATTER_COUNT=1 INCLUDE_FILTERED=false BREAK_BANDS_AT_MULTIPLES_OF=0 SUBDIVISION_MODE=INTERVAL_SUBDIVISION INVERT=false VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json PN:IntervalListTools 5 | chr1 10001 207666 + . intersection ACGTmer 6 | chr1 257667 297968 + . intersection ACGTmer 7 | chr1 347969 535988 + . intersection ACGTmer 8 | chr1 585989 2702781 + . intersection ACGTmer 9 | chr1 2746291 12954384 + . intersection ACGTmer 10 | chr1 13004385 16799163 + . intersection ACGTmer 11 | chr1 16849164 29552233 + . intersection ACGTmer 12 | chr1 29553836 121976459 + . intersection ACGTmer 13 | chr1 122026460 124977944 + . intersection ACGTmer 14 | chr1 124978327 125130246 + . intersection ACGTmer 15 | chr1 125131848 125171347 + . intersection ACGTmer 16 | chr1 125173584 125184587 + . intersection ACGTmer 17 | chr1 143184588 223558935 + . intersection ACGTmer 18 | chr1 223608936 228558364 + . 
intersection ACGTmer 19 | chr1 228608365 248946422 + . intersection ACGTmer 20 | chr2 10001 16145119 + . intersection ACGTmer 21 | chr2 16146120 32867130 + . intersection ACGTmer 22 | chr2 32868131 32916625 + . intersection ACGTmer 23 | chr2 32917626 89330679 + . intersection ACGTmer 24 | chr2 89530680 89685992 + . intersection ACGTmer 25 | chr2 89753993 90402511 + . intersection ACGTmer 26 | chr2 91402512 92138145 + . intersection ACGTmer 27 | chr2 92188146 94090557 + . intersection ACGTmer 28 | chr2 94140558 94293015 + . intersection ACGTmer 29 | chr2 94496016 97439618 + . intersection ACGTmer 30 | chr2 97489619 238903659 + . intersection ACGTmer 31 | chr2 238904048 242183529 + . intersection ACGTmer 32 | -------------------------------------------------------------------------------- /adam-core/src/test/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromReadsSuite.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Big Data Genomics (BDG) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The BDG licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * Exercises ConsensusGeneratorFromReads.findConsensus on the reads loaded
 * from the artificial.sam test resource.
 */
class ConsensusGeneratorFromReadsSuite extends ADAMFunSuite {

  val cg = new ConsensusGeneratorFromReads

  /** Loads the artificial.sam test resource as an RDD of alignments. */
  def artificial_reads: RDD[Alignment] =
    sc.loadAlignments(testFile("artificial.sam")).rdd

  sparkTest("checking search for consensus list for artificial reads") {
    // wrap every read, then bring them to the driver as a local Seq
    val richReads = artificial_reads
      .map(read => new RichAlignment(read))
      .collect()
      .toSeq

    val consensus = cg.findConsensus(richReads)

    assert(consensus.size === 2)
  }
}
/**
 * Checks Consensus.insertIntoReference for an insertion, a deletion and an
 * empty consensus.
 */
class ConsensusSuite extends FunSuite {

  test("test the insertion of a consensus insertion into a reference") {
    val reference = "AAAAAAAAAA"
    val insertion = Consensus("TCGA", ReferenceRegion("0", 10L, 11L))

    val spliced = insertion.insertIntoReference(
      reference,
      ReferenceRegion("0", 5L, 16L))

    assert(spliced === "AAAAAATCGAAAAA")
  }

  test("test the insertion of a consensus deletion into a reference") {
    val reference = "AAAAATTTTT"
    val deletion = Consensus("", ReferenceRegion("0", 10L, 16L))

    val spliced = deletion.insertIntoReference(
      reference,
      ReferenceRegion("0", 5L, 16L))

    // only the leading As are expected to remain
    assert(spliced === "AAAAA")
  }

  test("inserting empty consensus returns the reference") {
    val reference = "AAAAAAAAAAAAA"
    val emptyConsensus = new Consensus("", ReferenceRegion("0", 0L, 1L))

    val result = emptyConsensus.insertIntoReference(
      reference,
      ReferenceRegion("0", 0, reference.length))

    assert(reference === result)
  }

}
/**
 * Runs the tests inherited from OuterRegionJoinSuite against
 * LeftOuterShuffleRegionJoin.
 *
 * NOTE(review): partitionMap is not referenced anywhere in this class;
 * confirm whether the constructor parameter is still needed and whether the
 * test runner can instantiate a suite that requires a constructor argument.
 */
class LeftOuterShuffleRegionJoinSuite(partitionMap: Seq[Option[(ReferenceRegion, ReferenceRegion)]])
    extends OuterRegionJoinSuite {

  val partitionSize = 3
  var seqDict: SequenceDictionary = _

  before {
    // rebuild the two-contig dictionary ahead of each test
    val chr1 = SequenceRecord("chr1", 15, url = "test://chrom1")
    val chr2 = SequenceRecord("chr2", 15, url = "test://chrom2")
    seqDict = SequenceDictionary(chr1, chr2)
  }

  /**
   * Joins with the two inputs passed in reverse order and swaps every
   * resulting pair, so the optional element comes first in the output.
   *
   * @param leftRdd Keyed alignments for the left side.
   * @param rightRdd Keyed alignments for the right side.
   * @return Pairs of an optional alignment and an alignment.
   */
  def runJoin(leftRdd: RDD[(ReferenceRegion, Alignment)],
              rightRdd: RDD[(ReferenceRegion, Alignment)]): RDD[(Option[Alignment], Alignment)] = {
    val joined = LeftOuterShuffleRegionJoin[Alignment, Alignment](rightRdd, leftRdd)
      .compute()
    joined.map(pair => pair.swap)
  }
}
/**
 * Runs the tests inherited from OuterRegionJoinSuite against
 * RightOuterTreeRegionJoin.
 */
class RightOuterTreeRegionJoinSuite extends OuterRegionJoinSuite {

  /**
   * Builds an IntervalArray from the left input and joins it against the
   * right input with broadcastAndJoin.
   *
   * @param leftRdd Keyed alignments placed into the interval array.
   * @param rightRdd Keyed alignments joined against that array.
   * @return Pairs of an optional alignment and an alignment.
   */
  def runJoin(leftRdd: RDD[(ReferenceRegion, Alignment)],
              rightRdd: RDD[(ReferenceRegion, Alignment)]): RDD[(Option[Alignment], Alignment)] = {
    val join = RightOuterTreeRegionJoin[Alignment, Alignment]()
    val leftIntervals = IntervalArray[ReferenceRegion, Alignment](
      leftRdd,
      AlignmentArray.apply(_, _))
    join.broadcastAndJoin(leftIntervals, rightRdd)
  }
}
/**
 * Checks that GFF3HeaderWriter writes a file containing exactly one line,
 * equal to GFF3HeaderWriter.HEADER_STRING.
 */
class GFF3HeaderWriterSuite extends ADAMFunSuite {

  sparkTest("write gff3 header pragma") {
    val tmp = tmpFile(".gff3")
    GFF3HeaderWriter(tmp, sc)

    // Read the lines eagerly and always close the handle; the Source was
    // previously left open and consumed lazily.
    val source = Source.fromFile(tmp)
    val lines = try {
      source.getLines().toList
    } finally {
      source.close()
    }

    assert(lines.size === 1)
    assert(lines.head === GFF3HeaderWriter.HEADER_STRING)
  }
}
/**
 * Checks ModPartitioner(123): the reported partition count, the partition
 * assigned to positive and negative integer keys, and the rejection of a
 * key that is not an integer.
 */
class ModPartitionerSuite extends FunSuite {

  val partitioner = ModPartitioner(123)

  test("report number of partitions correctly") {
    val reported = partitioner.numPartitions
    assert(reported === 123)
  }

  test("partition a number that is lower than the number of partitions and positive") {
    val assigned = partitioner.getPartition(12)
    assert(assigned === 12)
  }

  test("partition a number that is greater than the number of partitions and positive") {
    val assigned = partitioner.getPartition(321)
    assert(assigned === 75)
  }

  test("partition a number that is lower than the number of partitions and negative") {
    // this test expects -21 to map to partition 21
    val assigned = partitioner.getPartition(-21)
    assert(assigned === 21)
  }

  test("partition a number that is greater than the number of partitions and negative") {
    val assigned = partitioner.getPartition(-1234)
    assert(assigned === 4)
  }

  test("fire an exception if input is not an integer") {
    val notAnInteger = "a string"
    intercept[IllegalArgumentException] {
      partitioner.getPartition(notAnInteger)
    }
  }
}
/**
 * Builds a RecalibrationTable from two observed covariates and checks the
 * quality characters it returns for their keys.
 */
class RecalibrationTableSuite extends FunSuite {

  // presumably the second CovariateKey argument is a quality character with
  // an offset of 33 -- TODO confirm against CovariateKey
  val observedCovariates = Map(
    CovariateKey(0, (50 + 33).toChar, 2, 'A', 'C') -> new Aggregate(1000000, 1, 10.0),
    CovariateKey(0, (40 + 33).toChar, 1, 'N', 'N') -> new Aggregate(100000, 1, 10.0))

  val table = RecalibrationTable(new ObservationTable(observedCovariates))

  test("look up quality scores in table") {
    // look the keys up in the same order the map iterates them
    val keys = observedCovariates.keys.toArray
    val scores = table(keys)

    assert(scores.size === 2)
    assert(scores(0) === (50 + 33).toChar)
    assert(scores(1) === (47 + 33).toChar)
  }
}
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.bdgenomics.adam.io 19 | 20 | import org.bdgenomics.adam.util.ADAMFunSuite 21 | import org.apache.spark.rdd.RDD 22 | import org.apache.hadoop.io.Text 23 | 24 | class InterleavedFastqInputFormatSuite extends ADAMFunSuite { 25 | (1 to 5) foreach { testNumber => 26 | val inputName = "interleaved_fastq_sample%d.ifq".format(testNumber) 27 | val expectedOutputName = inputName + ".output" 28 | val expectedOutputPath = testFile(expectedOutputName) 29 | val expectedOutputData = scala.io.Source.fromFile(expectedOutputPath).mkString 30 | 31 | sparkTest("interleaved FASTQ hadoop reader: %s->%s".format(inputName, expectedOutputName)) { 32 | def ifq_reader: RDD[(Void, Text)] = { 33 | val path = testFile(inputName) 34 | sc.newAPIHadoopFile(path, 35 | classOf[InterleavedFastqInputFormat], 36 | classOf[Void], 37 | classOf[Text]) 38 | } 39 | 40 | val ifq_reads = ifq_reader.collect() 41 | 42 | val testOutput = new StringBuilder() 43 | 44 | ifq_reads.foreach(pair => { 45 | testOutput.append(">>>interleaved fastq record start>>>\n") 46 | testOutput.append(pair._2) 47 | testOutput.append("<< 26 | val inputName = "fastq_sample%d.fq".format(testNumber) 27 | val expectedOutputName = "single_" + inputName + ".output" 28 | val expectedOutputPath = testFile(expectedOutputName) 29 | val expectedOutputData = scala.io.Source.fromFile(expectedOutputPath).mkString 30 | 31 | sparkTest("FASTQ hadoop reader: %s->%s".format(inputName, expectedOutputName)) { 32 | def ifq_reader: RDD[(Void, Text)] = { 33 | val path = testFile(inputName) 34 | 
sc.newAPIHadoopFile(path, 35 | classOf[SingleFastqInputFormat], 36 | classOf[Void], 37 | classOf[Text]) 38 | } 39 | 40 | val ifq_reads = ifq_reader.collect() 41 | 42 | val testOutput = new StringBuilder() 43 | 44 | ifq_reads.foreach(pair => { 45 | testOutput.append(">>>fastq record start>>>\n") 46 | testOutput.append(pair._2) 47 | testOutput.append("<< "org.apache.spark.serializer.KryoSerializer", 32 | "spark.kryo.registrator" -> "org.bdgenomics.adam.serialization.ADAMKryoRegistrator", 33 | "spark.kryo.referenceTracking" -> "true", 34 | "spark.kryo.registrationRequired" -> "true" 35 | ) 36 | 37 | } 38 | 39 | -------------------------------------------------------------------------------- /adam-core/src/test/scala/org/bdgenomics/adam/util/FileMergerSuite.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Big Data Genomics (BDG) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The BDG licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * Checks that FileMerger.mergeFiles rejects invalid argument combinations
 * with an IllegalArgumentException.
 */
class FileMergerSuite extends ADAMFunSuite {

  sparkTest("cannot write both empty gzip block and cram eof") {
    intercept[IllegalArgumentException] {
      // we don't need to pass real paths here
      FileMerger.mergeFiles(sc,
        FileSystem.getLocal(sc.hadoopConfiguration),
        new Path("output"),
        new Path("head"),
        writeEmptyGzipBlock = true,
        writeCramEOF = true)
    }
  }

  // Renamed from "buffer size must be non-negative": the test asserts that a
  // buffer size of 0, which is non-negative, is rejected.
  sparkTest("buffer size must be positive") {
    intercept[IllegalArgumentException] {
      // we don't need to pass real paths here
      FileMerger.mergeFiles(sc,
        FileSystem.getLocal(sc.hadoopConfiguration),
        new Path("output"),
        new Path("head"),
        optBufferSize = Some(0))
    }
  }
}
See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The BDG licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | global-exclude *.py[cod] __pycache__ .DS_Store 20 | recursive-include deps/jars *.jar 21 | include version.py 22 | -------------------------------------------------------------------------------- /adam-python/README.md: -------------------------------------------------------------------------------- 1 | # ADAM 2 | 3 | ADAM is a library and command line tool that enables the use of [Apache 4 | Spark](https://spark.apache.org) to parallelize genomic data analysis across 5 | cluster/cloud computing environments. ADAM uses a set of schemas to describe 6 | genomic sequences, reads, variants/genotypes, and features, and can be used 7 | with data in legacy genomic file formats such as SAM/BAM/CRAM, BED/GFF3/GTF, 8 | and VCF, as well as data stored in the columnar 9 | [Apache Parquet](https://parquet.apache.org) format. On a single node, ADAM 10 | provides competitive performance to optimized multi-threaded tools, while 11 | enabling scale out to clusters with more than a thousand cores. ADAM's APIs 12 | can be used from Scala, Java, Python, R, and SQL. 13 | 14 | ## Documentation 15 | 16 | ADAM's documentation is hosted at [readthedocs](http://adam.readthedocs.io). 
17 | 18 | ## Python Requirements 19 | 20 | ADAM depends on having PySpark installed. -------------------------------------------------------------------------------- /adam-python/bdgenomics/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to Big Data Genomics (BDG) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The BDG licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | __path__ = __import__('pkgutil').extend_path(__path__, __name__) 17 | -------------------------------------------------------------------------------- /adam-python/bdgenomics/adam/.gitignore: -------------------------------------------------------------------------------- 1 | schemas.py -------------------------------------------------------------------------------- /adam-python/bdgenomics/adam/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to Big Data Genomics (BDG) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. 
The BDG licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | r""" 19 | ======================= 20 | bdgenomics.adam Package 21 | ======================= 22 | .. currentmodule:: bdgenomics.adam 23 | 24 | ADAM's Python API wraps the ADAMContext and GenomicDataset APIs so they can be used from PySpark. 25 | The Python API is feature complete relative to ADAM's Java API. 26 | 27 | .. automodule:: bdgenomics.adam.adamContext 28 | .. automodule:: bdgenomics.adam.models 29 | .. automodule:: bdgenomics.adam.ds 30 | .. automodule:: bdgenomics.adam.stringency 31 | 32 | """ 33 | -------------------------------------------------------------------------------- /adam-python/bdgenomics/adam/models.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to Big Data Genomics (BDG) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The BDG licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. 
r"""
======
models
======
.. currentmodule:: bdgenomics.adam.models
.. autosummary::
   :toctree: _generate/

   ReferenceRegion
"""


class ReferenceRegion:
    """
    Represents a contiguous region of the reference genome.
    """

    def __init__(self, referenceName, start, end):
        """
        Represents a contiguous region of the reference genome.

        :param referenceName: The name of the sequence (chromosome) in the
           reference genome.
        :param start: The 0-based residue-coordinate for the start of the
           region.
        :param end: The 0-based residue-coordinate for the first residue past
           the end of the region, i.e. [start, end) defines a 0-based
           half-open interval.
        """

        self.referenceName = referenceName
        self.start = start
        self.end = end

    def _toJava(self, jvm):
        """
        Converts to an org.bdgenomics.adam.models.ReferenceRegion

        Should not be called from user code.

        :param jvm: Py4j JVM handle.
        :return: The result of calling ReferenceRegion.fromGenomicRange on the
           JVM with this region's name, start and end.
        """

        return jvm.org.bdgenomics.adam.models.ReferenceRegion.fromGenomicRange(
            self.referenceName, self.start, self.end)
r"""
==========
stringency
==========
.. currentmodule:: bdgenomics.adam.stringency
.. autosummary::
   :toctree: _generate/

   STRICT
   LENIENT
   SILENT
"""

STRICT = 2
"""
htsjdk.samtools.ValidationStringency.STRICT
"""
LENIENT = 1
"""
htsjdk.samtools.ValidationStringency.LENIENT
"""
SILENT = 0
"""
htsjdk.samtools.ValidationStringency.SILENT
"""


def _toJava(stringency, jvm):
    """
    Converts to an HTSJDK ValidationStringency enum.

    Should not be called from user code.

    :param bdgenomics.adam.stringency stringency: The desired stringency level.
    :param jvm: Py4j JVM handle.
    :return: The result of ValidationStringency.valueOf for the matching name.
    :raises RuntimeError: If stringency equals none of STRICT, LENIENT or
       SILENT.
    """

    # Compare by value: `is` tests object identity, which only works for ints
    # by accident of interpreter caching and rejects equal values such as int
    # subclasses.
    if stringency == STRICT:
        name = "STRICT"
    elif stringency == LENIENT:
        name = "LENIENT"
    elif stringency == SILENT:
        name = "SILENT"
    else:
        raise RuntimeError("Received %s. Stringency must be one of STRICT (%d), LENIENT (%d), or SILENT (%s)."
                           % (stringency, STRICT, LENIENT, SILENT))

    # The JVM handle is only touched once the level is known to be valid.
    return jvm.htsjdk.samtools.ValidationStringency.valueOf(name)
class SparkTestCase(unittest.TestCase):
    """
    Base class for tests that need a local SparkSession.

    setUp creates the session as ``self.ss`` and exposes its SparkContext as
    ``self.sc``; tearDown stops the context and restores ``sys.path``.
    """

    def resourceFile(self, filename, module='adam-core'):
        """
        Builds the path of a test resource.

        :param str filename: Name of the file under src/test/resources.
        :param str module: Directory of the module that owns the resource.
        :return: Path under the parent of the current working directory.
        """

        adamRoot = os.path.dirname(os.getcwd())
        return os.path.join(adamRoot,
                            "%s/src/test/resources" % module,
                            filename)

    def tmpFile(self):
        """
        Returns a fresh temporary path.

        The temporary file is created and closed immediately, which deletes
        it, so only its name is handed back.
        """

        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        return tempFile.name

    def checkFiles(self, file1, file2):
        """
        Asserts that two files have identical text contents.

        :param str file1: Path of the first file.
        :param str file2: Path of the second file.
        """

        # Nested context managers close the first handle even if opening the
        # second file fails.
        with open(file1) as f1:
            with open(file2) as f2:
                self.assertEqual(f1.read(), f2.read())

    def setUp(self):
        # remember sys.path so tearDown can restore it
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        self.ss = SparkSession.builder \
            .master('local[4]') \
            .appName(class_name) \
            .getOrCreate()
        self.sc = self.ss.sparkContext

    def tearDown(self):
        self.sc.stop()
        sys.path = self._old_sys_path
class VariantDatasetTest(SparkTestCase):
    """Tests for loading, saving and transforming variant datasets."""

    def test_vcf_round_trip(self):
        """Saving variants as VCF and reloading them keeps the record count."""

        testFile = self.resourceFile("small.vcf")
        ac = ADAMContext(self.ss)

        variants = ac.loadVariants(testFile)

        tmpPath = self.tmpFile() + ".vcf"
        variants.toVariantContexts().saveAsVcf(tmpPath)

        # Reload the file that was just written; reloading the input file
        # would compare the dataset with itself and never exercise the save.
        savedVariants = ac.loadVariants(tmpPath)

        self.assertEqual(variants._jvmDataset.jrdd().count(),
                         savedVariants._jvmDataset.jrdd().count())

    def test_transform(self):
        """``transform`` applies a DataFrame function to the dataset."""

        variantPath = self.resourceFile("small.vcf")
        ac = ADAMContext(self.ss)

        variants = ac.loadVariants(variantPath)

        transformedVariants = variants.transform(lambda x: x.filter(x.start < 19190))

        self.assertEqual(transformedVariants.toDF().count(), 3)
develop 35 | 36 | 37 | 38 | 39 | test-python 40 | process-test-resources 41 | 42 | exec 43 | 44 | 45 | make 46 | 47 | test 48 | 49 | ${skipTests} 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /adam-python/version.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to Big Data Genomics (BDG) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The BDG licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | from __future__ import print_function 20 | 21 | version = '1.1a0' 22 | 23 | if __name__ == '__main__': 24 | print(version) 25 | -------------------------------------------------------------------------------- /adam-r/.gitignore: -------------------------------------------------------------------------------- 1 | !bdgenomics.adam 2 | -------------------------------------------------------------------------------- /adam-r/bdgenomics.adam/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: bdgenomics.adam 2 | Type: Package 3 | Version: 1.0.1 4 | Title: R Frontend for Big Data Genomics/ADAM 5 | Description: ADAM is a genomics analysis platform with specialized file formats built using Apache Avro, Apache Spark and Parquet. 6 | Author: Big Data Genomics 7 | Maintainer: Frank Austin Nothaft 8 | Authors@R: c(person("Frank Austin", "Nothaft", role = c("aut", "cre"), 9 | email = "fnothaft@alumni.stanford.edu"), 10 | person(family = "Big Data Genomics", role = c("aut", "cph"))) 11 | License: Apache License (== 2.0) 12 | URL: http://www.bdgenomics.org https://github.com/bigdatagenomics/adam 13 | BugReports: https://github.com/bigdatagenomics/adam/issues 14 | Imports: 15 | methods, 16 | SparkR (>= 2.1.0) 17 | Depends: 18 | R (>= 3.0) 19 | Suggests: 20 | testthat 21 | Collate: 22 | 'generics.R' 23 | 'adam-context.R' 24 | 'ds.R' 25 | RoxygenNote: 7.1.1 26 | -------------------------------------------------------------------------------- /adam-r/bdgenomics.adam/tests/testthat.R: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to Big Data Genomics (BDG) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. 
The BDG licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | library(testthat) 19 | library(bdgenomics.adam) 20 | 21 | test_check("bdgenomics.adam") 22 | -------------------------------------------------------------------------------- /adam-r/bdgenomics.adam/tests/testthat/helpers.R: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to Big Data Genomics (BDG) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The BDG licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
# Builds the path of a test resource that lives in another submodule.
#
# The root is taken four directory levels above the current working
# directory; the submodule name, "src/test/resources" and the file name are
# joined with "/" and appended to it.
#
# fileName:  name of the resource file.
# submodule: module directory holding the resource (default "adam-core").
# Returns the resulting path as a character string; existence is not checked.
resourceFile <- function(fileName, submodule="adam-core") {
    file.path(dirname(dirname(dirname(dirname(getwd())))),
              paste(submodule, "src/test/resources", sep = "/",
                    fileName))
}

# Expects that two text files contain the same lines.
#
# newFile:      path of the file produced by the test.
# originalFile: path of the reference file.
expect_files_match <- function(newFile, originalFile) {
    expect_equal(readLines(newFile), readLines(originalFile))
}
#!/usr/bin/env bash
#
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Starts spark-shell with the ADAM assembly jar attached and Kryo
# serialization configured. Every command-line argument is passed through
# to spark-shell unchanged.

set -e

# ADAM_OPTS is no longer honored: warn when it is set and no arguments were
# given on the command line. "$#" is used instead of testing "$@", which
# would implicitly concatenate the arguments inside [[ ]].
if [[ $# -eq 0 && -n "${ADAM_OPTS:-}" ]]; then
  echo "WARNING: Passing Spark arguments via ADAM_OPTS was recently removed." 1>&2
  # NOTE(review): the usage hint below looks truncated (the placeholder for
  # the Spark arguments seems to be missing) - confirm against upstream.
  echo "Run adam-shell instead as adam-shell " 1>&2
fi

# Quote every path so an installation directory with spaces still works.
SOURCE_DIR=$(dirname "${BASH_SOURCE[0]}")

ADAM_CLI_JAR=$("${SOURCE_DIR}/find-adam-assembly.sh")

SPARK_SHELL=$("${SOURCE_DIR}/find-spark.sh" spark-shell)
echo "Using SPARK_SHELL=$SPARK_SHELL" 1>&2

# submit the job to Spark
"$SPARK_SHELL" \
  --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
  --conf spark.kryo.registrator=org.bdgenomics.adam.serialization.ADAMKryoRegistrator \
  --jars "${ADAM_CLI_JAR}" \
  "$@"
#!/usr/bin/env bash
#
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Prints the path of the single ADAM cli assembly jar on stdout.
# Exits with status 1 (message on stderr) when no jar or more than one jar
# is found.

set -e

SOURCE_DIR=$(dirname "${BASH_SOURCE[0]}")
# find-adam-home sets ADAM_HOME when it is not already defined.
. "${SOURCE_DIR}/find-adam-home"

# Find ADAM cli assembly jar: prefer "repo", then "jars", and fall back to
# the Maven build output directory.
if [ -d "$ADAM_HOME/repo" ]; then
  ASSEMBLY_DIR="$ADAM_HOME/repo"
elif [ -d "$ADAM_HOME/jars" ]; then
  ASSEMBLY_DIR="$ADAM_HOME/jars"
else
  ASSEMBLY_DIR="$ADAM_HOME/adam-assembly/target"
fi

# Collect candidates with a glob instead of parsing "ls" output. The regex
# keeps the original name filter; javadoc and sources jars are skipped.
jar_pattern='^adam[0-9A-Za-z._-]*\.jar$'
ASSEMBLY_JARS=()
shopt -s nullglob
for candidate in "$ASSEMBLY_DIR"/adam*.jar; do
  jar_name=${candidate##*/}
  if [[ "$jar_name" =~ $jar_pattern \
        && "$jar_name" != *javadoc* \
        && "$jar_name" != *sources* ]]; then
    ASSEMBLY_JARS+=("$jar_name")
  fi
done
shopt -u nullglob
num_jars=${#ASSEMBLY_JARS[@]}

if [ "$num_jars" -eq 0 ]; then
  echo "Failed to find ADAM cli assembly in $ASSEMBLY_DIR." 1>&2
  echo "You need to build ADAM before running this program." 1>&2
  exit 1
fi

if [ "$num_jars" -gt 1 ]; then
  echo "Found multiple ADAM cli assembly jars in $ASSEMBLY_DIR:" 1>&2
  printf '%s\n' "${ASSEMBLY_JARS[@]}" 1>&2
  echo "Please remove all but one jar." 1>&2
  exit 1
fi

printf '%s\n' "${ASSEMBLY_DIR}/${ASSEMBLY_JARS[0]}"
#!/usr/bin/env bash
#
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Prints the path of the single ADAM Python egg on stdout.
# Exits with status 1 (message on stderr) when no egg or more than one egg
# is found.

set -e

SOURCE_DIR=$(dirname "${BASH_SOURCE[0]}")
# find-adam-home sets ADAM_HOME when it is not already defined.
. "${SOURCE_DIR}/find-adam-home"

# Find ADAM python egg
if [ -d "$ADAM_HOME/repo" ]; then
  DIST_DIR="$ADAM_HOME/repo"
else
  DIST_DIR="$ADAM_HOME/adam-python/dist"
fi

# Collect candidates with a glob instead of parsing "ls" output. The dot in
# front of "egg" is escaped so that only a real ".egg" suffix matches.
egg_pattern='^bdgenomics\.adam[0-9A-Za-z._-]*\.egg$'
DIST_EGG=()
shopt -s nullglob
for candidate in "$DIST_DIR"/bdgenomics.adam*.egg; do
  egg_name=${candidate##*/}
  if [[ "$egg_name" =~ $egg_pattern ]]; then
    DIST_EGG+=("$egg_name")
  fi
done
shopt -u nullglob
num_egg=${#DIST_EGG[@]}

if [ "$num_egg" -eq 0 ]; then
  echo "Failed to find ADAM egg in $DIST_DIR." 1>&2
  echo "You need to build ADAM before running this program." 1>&2
  exit 1
fi

if [ "$num_egg" -gt 1 ]; then
  echo "Found multiple ADAM eggs in $DIST_DIR:" 1>&2
  printf '%s\n' "${DIST_EGG[@]}" 1>&2
  echo "Please remove all but one egg." 1>&2
  exit 1
fi

printf '%s\n' "${DIST_DIR}/${DIST_EGG[0]}"
#!/usr/bin/env bash
#
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -e

# Attempts to find a proper value for ADAM_HOME. Should be included using "source" directive.
# Because the file is sourced, "$0" refers to the script that includes it.

FIND_ADAM_HOME_PYTHON_SCRIPT="$(cd "$(dirname "$0")" && pwd)/find_adam_home.py"

# Short circuit if the user already has this set.
if [ -n "${ADAM_HOME}" ]; then
  true
elif [ ! -f "$FIND_ADAM_HOME_PYTHON_SCRIPT" ]; then
  # If we are not in the same directory as find_adam_home.py we are not pip installed so we don't
  # need to search the different Python directories for an ADAM installation.
  # Not only that, if the user has pip installed adam but is directly calling pyadam or
  # adam-submit in another directory we want to use that version of adam rather than the
  # pip installed version of adam.
  # Assign first and export afterwards so a failing "cd" is not masked.
  ADAM_HOME="$(cd "$(dirname "$0")/.." && pwd)"
  export ADAM_HOME
else
  # We are pip installed, use the Python script to resolve a reasonable ADAM_HOME
  # Default to standard python interpreter unless told otherwise
  if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then
    PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}"
  fi
  # Separate assignment and export: "export VAR=$(cmd)" always succeeds, which
  # would hide a failing resolver script even under "set -e".
  ADAM_HOME=$("$PYSPARK_DRIVER_PYTHON" "$FIND_ADAM_HOME_PYTHON_SCRIPT")
  export ADAM_HOME
fi
#!/usr/bin/env bash
#
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Prints the path of a Spark launcher script on stdout.
# Arguments: $1 - launcher name (defaults to spark-submit)
# Exits with status 1 when SPARK_HOME is unset and the launcher is not on PATH.

set -e

SPARK_CMD=${1:-spark-submit}

# Find spark-submit script
if [ -z "$SPARK_HOME" ]; then
  # "command -v" is a shell builtin, so the lookup does not depend on an
  # external "which" binary; "|| true" keeps set -e from aborting on a miss.
  SPARK_SUBMIT=$(command -v "${SPARK_CMD}" || true)
else
  SPARK_SUBMIT="${SPARK_HOME}/bin/${SPARK_CMD}"
fi
if [ -z "$SPARK_SUBMIT" ]; then
  echo "SPARK_HOME not set and ${SPARK_CMD} not on PATH; Aborting." 1>&2
  exit 1
fi

# Quoted so that whitespace inside the path is preserved.
printf '%s\n' "${SPARK_SUBMIT}"
18 | # 19 | 20 | set -e 21 | 22 | SOURCE_DIR=$(dirname ${BASH_SOURCE[0]}) 23 | 24 | ADAM_CLI_JAR=$(${SOURCE_DIR}/find-adam-assembly.sh) 25 | ADAM_EGG=$(${SOURCE_DIR}/find-adam-egg.sh) 26 | 27 | PYSPARK=$(${SOURCE_DIR}/find-spark.sh pyspark) 28 | echo "Using PYSPARK=$PYSPARK" 1>&2 29 | 30 | # submit the job to Spark 31 | "$PYSPARK" \ 32 | --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ 33 | --conf spark.kryo.registrator=org.bdgenomics.adam.serialization.ADAMKryoRegistrator \ 34 | --jars ${ADAM_CLI_JAR} \ 35 | --driver-class-path ${ADAM_CLI_JAR} \ 36 | --py-files ${ADAM_EGG} \ 37 | "$@" 38 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build -------------------------------------------------------------------------------- /docs/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/_static/favicon.ico -------------------------------------------------------------------------------- /docs/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/_static/logo.png -------------------------------------------------------------------------------- /docs/algorithms/dm.rst: -------------------------------------------------------------------------------- 1 | Duplicate Marking Implementation 2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 3 | 4 | Reads may be duplicated during sequencing, either due to clonal 5 | duplication via PCR before sequencing, or due to optical duplication 6 | while on the sequencer. To identify duplicated reads, we apply a 7 | heuristic algorithm that looks at read fragments that have a consistent 8 | mapping signature. 
First, we bucket together reads that are from the 9 | same sequenced fragment by grouping reads together on the basis of read 10 | name and read group. Per read bucket, we then identify the 5' mapping 11 | positions of the primarily aligned reads. We mark as duplicates all read 12 | pairs that have the same pair alignment locations, and all unpaired 13 | reads that map to the same sites. Only the highest scoring read/read 14 | pair is kept, where the score is the sum of all quality scores in the 15 | read that are greater than 15. 16 | -------------------------------------------------------------------------------- /docs/algorithms/joins.rst: -------------------------------------------------------------------------------- 1 | ShuffleRegionJoin Load Balancing 2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 3 | 4 | ShuffleRegionJoins perform a sort-merge join on distributed genomic 5 | data. The current standard for distributing genomic data is to use a 6 | binning approach where ranges of genomic data are assigned to a 7 | particular partition. This approach has a significant limitation that we 8 | aim to solve: no matter how fine-grained the bins created, they can 9 | never resolve extremely skewed data. ShuffleRegionJoin also requires 10 | that the data be sorted, so we keep track of the sort order 11 | through the join so we can reuse this knowledge downstream. 12 | 13 | The first step in ShuffleRegionJoin is to sort and balance the data. 14 | This is done with a sampling method, and the data are sorted if they were 15 | not sorted previously. When we shuffle the data, we also store the region 16 | ranges for all the data on this partition. Storing these partition 17 | bounds allows us to copartition the right dataset by assigning all 18 | records to a partition if the record falls within the partition bounds. 19 | After the right data are colocated with the correct records in the left 20 | dataset, we perform the join locally on each partition.
21 | 22 | Maintaining the sorted knowledge and partition bounds are extremely 23 | useful for downstream applications that can take advantage of sorted 24 | data. Subsequent joins, for example, will be much faster because the 25 | data are already relatively balanced and sorted. Additional set theory 26 | and aggregation primitives, such as counting nearby regions, grouping 27 | and clustering nearby regions, and finding the set difference will all 28 | benefit from the sorted knowledge because each of these primitives 29 | requires that the data be sorted first. 30 | 31 | -------------------------------------------------------------------------------- /docs/api/img/join_examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/api/img/join_examples.png -------------------------------------------------------------------------------- /docs/api/img/join_rdds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/api/img/join_rdds.png -------------------------------------------------------------------------------- /docs/api/python.rst: -------------------------------------------------------------------------------- 1 | ADAM Python Documentation 2 | ========================= 3 | 4 | .. automodule:: bdgenomics.adam 5 | -------------------------------------------------------------------------------- /docs/architecture/evidence.rst: -------------------------------------------------------------------------------- 1 | Interacting with data through ADAM's evidence access layer 2 | ---------------------------------------------------------- 3 | 4 | ADAM exposes access to distributed datasets of genomic data through the 5 | `ADAMContext <../api/adamContext.html>`__ entrypoint. 
The ADAMContext wraps Apache 6 | Spark's SparkContext, which tracks the configuration and state of the 7 | currently running Spark application. On top of the SparkContext, the 8 | ADAMContext provides data loading functions which yield 9 | `GenomicDataset <../api/genomicDataset.html>`__\ s. The GenomicDataset classes provide a 10 | wrapper around Apache Spark's two APIs for manipulating distributed 11 | datasets: the legacy Resilient Distributed Dataset (Zaharia et al. 2012) 12 | and the new Spark SQL Dataset/DataFrame API (Armbrust et al. 2015). 13 | Additionally, the GenomicDataset is enriched with genomics-specific metadata 14 | such as computational lineage and sample metadata, and optimized 15 | genomics-specific query patterns such as `region joins <../api/joins.html>`__ and 16 | the `auto-parallelizing pipe API <../api/pipes.html>`__ for running legacy tools 17 | using Apache Spark. 18 | 19 | .. figure:: img/grdd.png 20 | :alt: The GenomicDataset Class Hierarchy 21 | 22 | The GenomicDataset Class Hierarchy 23 | 24 | All GenomicDatasets include a sequence dictionary which describes the 25 | reference genome that the data in the genomic dataset are aligned to, if one is 26 | known. Additionally, ReadGroupGenomicDataset stores a dictionary with read 27 | groups that are attached to the reads/fragments. Similarly, the 28 | MultisampleGenomicDataset includes a list of samples that are present in the 29 | dataset.
30 | -------------------------------------------------------------------------------- /docs/architecture/img/grdd.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/architecture/img/grdd.pdf -------------------------------------------------------------------------------- /docs/architecture/img/grdd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/architecture/img/grdd.png -------------------------------------------------------------------------------- /docs/architecture/img/stack-model.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/architecture/img/stack-model.pdf -------------------------------------------------------------------------------- /docs/architecture/img/stack-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/architecture/img/stack-model.png -------------------------------------------------------------------------------- /docs/architecture/schemas.rst: -------------------------------------------------------------------------------- 1 | The bdg-formats schemas 2 | ----------------------- 3 | 4 | The schemas that comprise ADAM's narrow waist are defined in the 5 | `bdg-formats `__ 6 | project, using the `Apache Avro `__ schema 7 | description language. This schema definition language automatically 8 | generates implementations of this schema for multiple common languages, 9 | including Java, C, C++, and Python. 
bdg-formats contains several core 10 | schemas: 11 | 12 | - The *Alignment* schema represents a genomic read, along with 13 | that read's alignment to a reference genome, if available. 14 | - The *Feature* schema represents a generic genomic feature. This 15 | record can be used to tag a region of the genome with an annotation, 16 | such as coverage observed over that region, or the coordinates of an 17 | exon. 18 | - The *Fragment* schema represents a set of read alignments that came 19 | from a single sequenced fragment. 20 | - The *Genotype* schema represents a genotype call, along with 21 | annotations about the quality/read support of the called genotype. 22 | - The *Sequence* and *Slice* schemas represent sequences and slices of 23 | sequences, respectively. 24 | - The *Variant* schema represents a sequence variant, along with 25 | statistics about that variant's support across a group of samples, 26 | and annotations about the effect of the variant. 27 | 28 | The bdg-formats schemas are designed so that common fields are easy to 29 | query, while maintaining extensibility and the ability to interoperate 30 | with common genomics file formats. Where necessary, the bdg-formats 31 | schemas are nested, which allows for the description of complex nested 32 | features and groupings (such as the Fragment record, which groups 33 | together Alignments). All fields in the bdg-formats schemas are 34 | nullable, and the schemas themselves do not contain invariants around 35 | valid values for a field. Instead, we validate data on ingress and 36 | egress to/from a conventional genomic file format. This allows users to 37 | take advantage of features such as field projection, which can improve 38 | the performance of queries like `flagstat <#flagstat>`__ by an order of 39 | magnitude.
40 | -------------------------------------------------------------------------------- /docs/benchmarks/img/bam.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/bam.pdf -------------------------------------------------------------------------------- /docs/benchmarks/img/bam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/bam.png -------------------------------------------------------------------------------- /docs/benchmarks/img/bed.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/bed.pdf -------------------------------------------------------------------------------- /docs/benchmarks/img/bed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/bed.png -------------------------------------------------------------------------------- /docs/benchmarks/img/gff.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/gff.pdf -------------------------------------------------------------------------------- /docs/benchmarks/img/gff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/gff.png -------------------------------------------------------------------------------- 
/docs/benchmarks/img/plot-speedup.py: -------------------------------------------------------------------------------- 1 | from pylab import loglog, ylabel, xlabel, title, grid, savefig, show, legend, xticks, yticks, figure, xlim, ylim 2 | # Draws log-log scaling plots for the benchmark series defined below and saves each figure with savefig. 3 | def setup(n, st): 4 | exp_n = [1.0 / st, (n [-1] / n [0]) / st] 5 | l_n = [n [0], n [-1]] 6 | # start a new figure and draw the "Ideal Speedup" reference line from (n[0], 1/st) to (n[-1], (n[-1]/n[0])/st) 7 | figure () 8 | loglog (l_n, exp_n, 'k-', basex=2, basey=2, label="Ideal Speedup") 9 | # plot: draw 1/m for every m in mt against n, using the given line pattern and legend label 10 | def plot (n, mt, label_name, pattern): 11 | # NOTE(review): basex/basey were replaced by base in newer Matplotlib releases -- confirm the targeted version 12 | speedup = [] 13 | 14 | for m in mt: 15 | 16 | speedup.append (1.0 / m) 17 | 18 | loglog (n, speedup, pattern, basex=2, basey=2, label=label_name) 19 | # label: replace the tick labels, set axis labels, legend, title and grid, then save the figure to `name` 20 | def label(name, t, lloc=2): 21 | locs,labels = xticks() 22 | xn = ["", "32", "64", "128", "256", "512", "1024", ""] 23 | xticks(locs, xn) 24 | 25 | yn = ["", "32K", "16K", "8K", "4K", "2K", "1K", "500", ""] 26 | locs,labels = yticks() 27 | yticks(locs, yn) 28 | 29 | ylabel ("Runtime (seconds)") 30 | xlabel ("Number of Threads") 31 | legend (loc=lloc) 32 | title (t) 33 | grid (True) 34 | savefig (name) 35 | 36 | n_ideal = [32, 1024] 37 | # presumably measured runtimes in seconds for each value of n (the y-axis is labelled "Runtime (seconds)") -- TODO confirm 38 | n = [32, 128, 256, 512, 1024] 39 | markdup = [16639.22, 4438.37, 2005.25, 1247.36, 844.03] 40 | frag_md = [8249.56, 2594.44, 1409.86, 868.19, 529.19] 41 | gatk_md = [17068.58, 4036.25, 1737.97, 991.62, 589.37] 42 | bqsr = [27034.11, 7461.35, 4663.84, 2977.69, 2108.43] 43 | gatk_bqsr = [(28232.96 + 2931.97), (8473.64 + 1312.90), (5578.24 + 732.01), (3465.61 + 551.55), (2410.03 + 487.16)] 44 | ir = [23808.67, 6476.63, 3507.99, 2407.57, 1242.10] 45 | 46 | setup(n_ideal, frag_md[0]) 47 | 48 | plot(n, markdup, 'ADAM Mark Duplicates', 'bx-') 49 | plot(n, frag_md, 'ADAM Fragments Mark Duplicates', 'bo--') 50 | plot(n, gatk_md, 'GATK4 Mark Duplicates', 'c.--') 51 | 52 | label("speedup-md.pdf", 53 | "Duplicate Marking Speedup on NA12878 (High Coverage)", lloc=4) 54 | 55 | setup(n_ideal, bqsr[0]) 56 | 57 | plot(n, bqsr, 'ADAM BQSR', 'bx-') 58 | plot(n, gatk_bqsr, 'GATK4 BQSR', 'c.--') 59 | 60 |
label("speedup-bqsr.pdf", 61 | "Base Recalibration Speedup on NA12878 (High Coverage)") 62 | 63 | setup(n_ideal, ir[0]) 64 | 65 | plot(n, ir, 'INDEL Realignment', 'bx-') 66 | 67 | label("speedup-ir.pdf", 68 | "INDEL Realignment Speedup on NA12878 (High Coverage)") 69 | -------------------------------------------------------------------------------- /docs/benchmarks/img/speedup-bqsr.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/speedup-bqsr.pdf -------------------------------------------------------------------------------- /docs/benchmarks/img/speedup-bqsr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/speedup-bqsr.png -------------------------------------------------------------------------------- /docs/benchmarks/img/speedup-ir.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/speedup-ir.pdf -------------------------------------------------------------------------------- /docs/benchmarks/img/speedup-ir.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/speedup-ir.png -------------------------------------------------------------------------------- /docs/benchmarks/img/speedup-md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/speedup-md.pdf 
-------------------------------------------------------------------------------- /docs/benchmarks/img/speedup-md.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/speedup-md.png -------------------------------------------------------------------------------- /docs/benchmarks/img/vcf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/vcf.pdf -------------------------------------------------------------------------------- /docs/benchmarks/img/vcf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/vcf.png -------------------------------------------------------------------------------- /docs/citing.rst: -------------------------------------------------------------------------------- 1 | Citing ADAM 2 | =========== 3 | 4 | ADAM has been described in two manuscripts. The first, 5 | `a tech report `__, 6 | came out in 2013 and described the rationale behind using schemas for genomics, 7 | and presented an early implementation of some of the preprocessing algorithms. 8 | 9 | To cite this paper, please cite:: 10 | 11 | @techreport{massie13, 12 | title={{ADAM}: Genomics Formats and Processing Patterns for Cloud Scale Computing}, 13 | author={Massie, Matt and Nothaft, Frank and Hartl, Christopher and Kozanitis, Christos and Schumacher, Andr{\'e} and Joseph, Anthony D and Patterson, David A}, 14 | year={2013}, 15 | institution={UCB/EECS-2013-207, EECS Department, University of California, Berkeley} 16 | } 17 | 18 | 19 | The second, 20 | `a conference paper `__, 21 | appeared in the SIGMOD 2015 Industrial Track. 
This paper described how ADAM's 22 | design was influenced by database systems, expanded upon the concept of a stack 23 | architecture for scientific analyses, presented more results comparing ADAM to 24 | state-of-the-art single node genomics tools, and demonstrated how the 25 | architecture generalized beyond genomics. 26 | 27 | To cite this paper, please cite:: 28 | 29 | @inproceedings{nothaft15, 30 | title={Rethinking Data-Intensive Science Using Scalable Analytics Systems}, 31 | author={Nothaft, Frank A and Massie, Matt and Danford, Timothy and Zhang, Zhao and Laserson, Uri and Yeksigian, Carl and Kottalam, Jey and Ahuja, Arun and Hammerbacher, Jeff and Linderman, Michael and Franklin, Michael and Joseph, Anthony D. and Patterson, David A.}, 32 | booktitle={Proceedings of the 2015 International Conference on Management of Data (SIGMOD '15)}, 33 | year={2015}, 34 | organization={ACM} 35 | } 36 | 37 | 38 | We prefer that you cite both papers, but if you can only cite one paper, we 39 | prefer that you cite the SIGMOD 2015 manuscript. 40 | -------------------------------------------------------------------------------- /docs/downstream/overview.rst: -------------------------------------------------------------------------------- 1 | Building Downstream Applications 2 | ================================ 3 | 4 | ADAM is packaged so that it can be used interactively via the ADAM 5 | shell, called from the command line interface (CLI), or included as a 6 | library when building downstream applications. 
7 | 8 | This document covers three patterns for building applications downstream 9 | of ADAM: 10 | 11 | - Extend the ADAM CLI by `adding new commands `__ 12 | - Extend the ADAM CLI by `adding new commands in an external repository `__ 13 | - Use ADAM as a `library in new applications `__ 14 | -------------------------------------------------------------------------------- /docs/img/bdgenomics-stack.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/img/bdgenomics-stack.key -------------------------------------------------------------------------------- /docs/img/bdgenomics-stack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/img/bdgenomics-stack.png -------------------------------------------------------------------------------- /docs/img/stack-model.ai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/img/stack-model.ai -------------------------------------------------------------------------------- /docs/installation/example.rst: -------------------------------------------------------------------------------- 1 | Running an example command 2 | ========================== 3 | 4 | flagstat 5 | -------- 6 | 7 | Once you have data converted to ADAM, you can gather statistics from the 8 | ADAM file using flagstat_. This command will output 9 | stats identically to the samtools ``flagstat`` command. 10 | 11 | .. 
code:: bash 12 | 13 | adam-submit flagstat NA12878_chr20.adam 14 | 15 | Outputs: 16 | 17 | :: 18 | 19 | 51554029 + 0 in total (QC-passed reads + QC-failed reads) 20 | 0 + 0 duplicates 21 | 50849935 + 0 mapped (98.63%:0.00%) 22 | 51554029 + 0 paired in sequencing 23 | 25778679 + 0 read1 24 | 25775350 + 0 read2 25 | 49874394 + 0 properly paired (96.74%:0.00%) 26 | 50145841 + 0 with itself and mate mapped 27 | 704094 + 0 singletons (1.37%:0.00%) 28 | 158721 + 0 with mate mapped to a different chr 29 | 105812 + 0 with mate mapped to a different chr (mapQ>=5) 30 | 31 | In practice, you will find that the ADAM ``flagstat`` command takes 32 | considerably less time than samtools to compute these statistics. 33 | For example, on a MacBook Pro, the command above took 17 seconds to run 34 | while ``samtools flagstat NA12878_chr20.bam`` took 55 seconds. On larger 35 | files, the difference in speed is even more dramatic. ADAM is faster 36 | because it is multi-threaded, distributed and uses a columnar storage 37 | format (with a projected schema that only materializes the read flags 38 | instead of the whole read). 39 | 40 | Running on a cluster 41 | -------------------- 42 | 43 | We provide the ``adam-submit`` and ``adam-shell`` commands under the 44 | ``bin`` directory. These can be used to submit ADAM jobs to a Spark 45 | cluster, or to run ADAM interactively. 46 | 47 | -------------------------------------------------------------------------------- /docs/installation/pip.rst: -------------------------------------------------------------------------------- 1 | Installing ADAM using Pip 2 | ========================= 3 | 4 | ADAM is available through the `Python Package Index`_ and thus can be installed 5 | using pip. To install ADAM using pip, run: 6 | 7 | .. code:: bash 8 | 9 | pip install bdgenomics.adam 10 | 11 | Pip will install the bdgenomics.adam Python binding, as well as the ADAM CLI. 12 | 13 | ..
_Python Package Index: https://pypi.python.org/pypi 14 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==1.7.7 2 | -------------------------------------------------------------------------------- /scripts/move_to_scala_2.11.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set +x 4 | 5 | grep "" pom.xml | grep -q 2.11 6 | if [[ $? == 0 ]]; 7 | then 8 | echo "Scala version is already set to 2.11 (Scala artifacts have _2.11 version suffix in artifact name)." 9 | echo "Cowardly refusing to move to Scala 2.11 a second time..." 10 | 11 | exit 1 12 | fi 13 | # rewrite every pom.xml in place; dots in the match are escaped so only literal version strings change (an unescaped . also matches e.g. 2012) 14 | find . -name "pom.xml" -exec sed -e "s/2\.12\.10/2.11.12/g" \ 15 | -e "s/2\.12/2.11/g" \ 16 | -i.2.11.bak '{}' \; 17 | find . -name "*.2.11.*bak" -exec rm -f {} \; 18 | -------------------------------------------------------------------------------- /scripts/move_to_scala_2.12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set +x 4 | 5 | grep "" pom.xml | grep -q 2.12 6 | if [[ $? == 0 ]]; 7 | then 8 | echo "Scala version is already set to 2.12 (Scala artifacts have _2.12 version suffix in artifact name)." 9 | echo "Cowardly refusing to move to Scala 2.12 a second time..." 10 | 11 | exit 1 12 | fi 13 | # rewrite every pom.xml in place; dots in the match are escaped so only literal version strings change (an unescaped . also matches e.g. 2011) 14 | find . -name "pom.xml" -exec sed -e "s/2\.11\.12/2.12.10/g" \ 15 | -e "s/2\.11/2.12/g" \ 16 | -i.2.12.bak '{}' \; 17 | find . -name "*.2.12.*bak" -exec rm -f {} \; 18 | -------------------------------------------------------------------------------- /scripts/move_to_spark_2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set +x 4 | 5 | grep -q "spark2" pom.xml 6 | if [[ $?
== 0 ]]; 7 | then 8 | echo "POM is already set up for Spark 2 (Spark 2 artifacts have -spark2 suffix in artifact names)." 9 | echo "Cowardly refusing to move to Spark 2 a second time..." 10 | 11 | exit 1 12 | fi 13 | 14 | svp="\${scala.version.prefix}" 15 | substitution_cmd="s/-spark3_$svp/-spark2_$svp/g" 16 | 17 | find . -name "pom.xml" -exec sed \ 18 | -e "/adam-/ s/-spark3_2\.1/-spark2_2\.1/" \ 19 | -e "/adam-/ $substitution_cmd" \ 20 | -e "/utils-/ s/-spark3_2\.1/-spark2_2\.1/" \ 21 | -e "/utils-/ $substitution_cmd" \ 22 | -e "/spark.version/ s/3.1.2/2.4.7/g" \ 23 | -i.spark2.bak '{}' \; 24 | -------------------------------------------------------------------------------- /scripts/move_to_spark_3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set +x 4 | 5 | grep -q "spark3" pom.xml 6 | if [[ $? == 0 ]]; 7 | then 8 | echo "POM is already set up for Spark 3 (Spark 3 artifacts have -spark3 suffix in artifact names)." 9 | echo "Cowardly refusing to move to Spark 3 a second time..." 10 | 11 | exit 1 12 | fi 13 | 14 | svp="\${scala.version.prefix}" 15 | substitution_cmd="s/-spark2_$svp/-spark3_$svp/g" 16 | 17 | find . -name "pom.xml" -exec sed \ 18 | -e "/adam-/ s/-spark2_2\.1/-spark3_2\.1/" \ 19 | -e "/adam-/ $substitution_cmd" \ 20 | -e "/utils-/ s/-spark2_2\.1/-spark3_2\.1/" \ 21 | -e "/utils-/ $substitution_cmd" \ 22 | -e "/spark.version/ s/2.4.7/3.1.2/g" \ 23 | -i.spark3.bak '{}' \; 24 | -------------------------------------------------------------------------------- /scripts/release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # do we have enough arguments? the three positional arguments read below are required; compare with -lt, because a bare less-than sign inside [ ] is an input redirection 4 | if [ "$#" -lt 3 ]; then 5 | echo "Usage:" 6 | echo 7 | echo "./release.sh RELEASE_VERSION DEVELOPMENT_VERSION MILESTONE_ID" 8 | exit 1 9 | fi 10 | 11 | # pick arguments 12 | release=$1 13 | devel=$2 14 | milestone=$3 15 | 16 | # get current branch 17 | branch=$(git status -bs | awk '{ print $2 }' | awk -F'.'
'{ print $1 }' | head -n 1) 18 | 19 | # update changelog per Github milestone 20 | mvn com.github.heuermh.maven.plugin.changes:github-changes-maven-plugin:1.2:github-changes -DmilestoneId=${milestone} 21 | git commit -a -m "Modifying changelog." 22 | 23 | # update R version 24 | sed -i -e "s/Version: [0-9.]*/Version: $1/g" adam-r/bdgenomics.adam/DESCRIPTION 25 | git commit -a -m "Bumping R version to $1." 26 | 27 | commit=$(git log --pretty=format:"%H" | head -n 1) 28 | echo "releasing from ${commit} on branch ${branch}" 29 | 30 | git push origin ${branch} 31 | 32 | # do spark 3, scala 2.12 release 33 | git checkout -b maint_spark3_2.12-${release} ${branch} 34 | 35 | mvn --batch-mode \ 36 | -P distribution \ 37 | -Dresume=false \ 38 | -Dtag=adam-parent-spark3_2.12-${release} \ 39 | -DreleaseVersion=${release} \ 40 | -DdevelopmentVersion=${devel} \ 41 | -DbranchName=adam-spark3_2.12-${release} \ 42 | release:clean \ 43 | release:prepare \ 44 | release:perform 45 | 46 | if [ $? != 0 ]; then 47 | echo "Releasing Spark 3, Scala 2.12 version failed." 48 | exit 1 49 | fi 50 | 51 | if [ "$branch" = "master" ]; then 52 | # if original branch was master, update versions on original branch 53 | git checkout ${branch} 54 | mvn versions:set -DnewVersion=${devel} \ 55 | -DgenerateBackupPoms=false 56 | git commit -a -m "Modifying pom.xml files for new development after ${release} release." 57 | git push origin ${branch} 58 | fi 59 | --------------------------------------------------------------------------------