├── .github
    └── workflows
    │   ├── ci.yml
    │   └── deploy.yml
├── .gitignore
├── CHANGES.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── KEYS
├── LICENSE.txt
├── LICENSE_header.txt
├── README.md
├── SUPPORT.md
├── adam-apis
    ├── pom.xml
    └── src
    │   ├── main
    │       └── scala
    │       │   └── org
    │       │       └── bdgenomics
    │       │           └── adam
    │       │               └── api
    │       │                   ├── java
    │       │                       ├── GenomicDatasetConverters.scala
    │       │                       ├── GenomicRDDConverters.scala
    │       │                       └── JavaADAMContext.scala
    │       │                   └── python
    │       │                       └── DataFrameConversionWrapper.scala
    │   └── test
    │       ├── java
    │           └── org
    │           │   └── bdgenomics
    │           │       └── adam
    │           │           └── api
    │           │               └── java
    │           │                   ├── JavaADAMCoverageConduit.java
    │           │                   ├── JavaADAMFeatureConduit.java
    │           │                   ├── JavaADAMFragmentConduit.java
    │           │                   ├── JavaADAMGenotypeConduit.java
    │           │                   ├── JavaADAMReadConduit.java
    │           │                   ├── JavaADAMSequenceConduit.java
    │           │                   ├── JavaADAMSliceConduit.java
    │           │                   └── JavaADAMVariantConduit.java
    │       ├── resources
    │           └── indexed_bams
    │           │   ├── sorted.bam
    │           │   └── sorted.bam.bai
    │       └── scala
    │           └── org
    │               └── bdgenomics
    │                   └── adam
    │                       └── api
    │                           └── java
    │                               └── JavaADAMContextSuite.scala
├── adam-assembly
    ├── pom.xml
    └── src
    │   └── main
    │       └── scala
    │           └── org
    │               └── bdgenomics
    │                   └── adam
    │                       └── assembly
    │                           └── Assembly.scala
├── adam-cli
    ├── .gitignore
    ├── pom.xml
    └── src
    │   ├── main
    │       ├── java-templates
    │       │   └── org
    │       │   │   └── bdgenomics
    │       │   │       └── adam
    │       │   │           └── cli
    │       │   │               └── About.java
    │       └── scala
    │       │   └── org
    │       │       └── bdgenomics
    │       │           └── adam
    │       │               └── cli
    │       │                   ├── ADAM2Fastq.scala
    │       │                   ├── ADAMMain.scala
    │       │                   ├── CountReadKmers.scala
    │       │                   ├── CountSliceKmers.scala
    │       │                   ├── Coverage.scala
    │       │                   ├── CramArgs.scala
    │       │                   ├── FileSystemUtils.scala
    │       │                   ├── FlagStat.scala
    │       │                   ├── MergeShards.scala
    │       │                   ├── PrintADAM.scala
    │       │                   ├── TransformAlignments.scala
    │       │                   ├── TransformFeatures.scala
    │       │                   ├── TransformFragments.scala
    │       │                   ├── TransformGenotypes.scala
    │       │                   ├── TransformSequences.scala
    │       │                   ├── TransformSlices.scala
    │       │                   ├── TransformVariants.scala
    │       │                   └── View.scala
    │   └── test
    │       ├── resources
    │           ├── artificial.counts.txt
    │           ├── artificial.fa
    │           ├── artificial.fa.fai
    │           ├── bqsr1-r1.fq
    │           ├── bqsr1-r2.fq
    │           ├── bqsr1.sam
    │           ├── chr5.phyloP46way.trunc.wigFix
    │           ├── contigs.fa
    │           ├── flag-values.sam
    │           ├── gencode.v7.annotation.trunc10.bed
    │           ├── log4j.properties
    │           ├── small.vcf
    │           ├── sorted.bam
    │           ├── sorted.bam.bai
    │           ├── sorted.counts.txt
    │           ├── sorted.lex.vcf
    │           └── sorted.vcf
    │       └── scala
    │           └── org
    │               └── bdgenomics
    │                   └── adam
    │                       └── cli
    │                           ├── ADAM2FastqSuite.scala
    │                           ├── ADAMMainSuite.scala
    │                           ├── AboutSuite.scala
    │                           ├── CountReadKmersSuite.scala
    │                           ├── CountSliceKmersSuite.scala
    │                           ├── CoverageSuite.scala
    │                           ├── MergeShardsSuite.scala
    │                           ├── ParquetLister.scala
    │                           ├── TransformAlignmentsSuite.scala
    │                           ├── TransformFeaturesSuite.scala
    │                           ├── TransformFragmentsSuite.scala
    │                           ├── TransformGenotypesSuite.scala
    │                           ├── TransformVariantsSuite.scala
    │                           └── ViewSuite.scala
├── adam-codegen
    ├── pom.xml
    └── src
    │   └── main
    │       └── scala
    │           └── org
    │               └── bdgenomics
    │                   └── adam
    │                       └── codegen
    │                           ├── DumpSchemasToProduct.scala
    │                           ├── DumpSchemasToProjectionEnums.scala
    │                           ├── Generator.scala
    │                           └── ReflectSchema.scala
├── adam-core
    ├── .gitignore
    ├── pom.xml
    └── src
    │   ├── main
    │       ├── java
    │       │   └── org
    │       │   │   └── bdgenomics
    │       │   │       └── adam
    │       │   │           └── io
    │       │   │               ├── FastqInputFormat.java
    │       │   │               ├── FastqRecordReader.java
    │       │   │               ├── InterleavedFastqInputFormat.java
    │       │   │               ├── ResettableCompressedSplitLineReader.java
    │       │   │               └── SingleFastqInputFormat.java
    │       └── scala
    │       │   └── org
    │       │       └── bdgenomics
    │       │           └── adam
    │       │               ├── algorithms
    │       │                   ├── consensus
    │       │                   │   ├── Consensus.scala
    │       │                   │   ├── ConsensusGenerator.scala
    │       │                   │   ├── ConsensusGeneratorFromKnowns.scala
    │       │                   │   ├── ConsensusGeneratorFromReads.scala
    │       │                   │   ├── ConsensusGeneratorFromSmithWaterman.scala
    │       │                   │   ├── NormalizationUtils.scala
    │       │                   │   └── UnionConsensusGenerator.scala
    │       │                   └── smithwaterman
    │       │                   │   ├── SmithWaterman.scala
    │       │                   │   ├── SmithWatermanConstantGapScoring.scala
    │       │                   │   └── SmithWatermanGapScoringFromFn.scala
    │       │               ├── converters
    │       │                   ├── AlignmentConverter.scala
    │       │                   ├── DefaultHeaderLines.scala
    │       │                   ├── FastaConverters.scala
    │       │                   ├── FastqRecordConverter.scala
    │       │                   ├── FragmentConverter.scala
    │       │                   ├── TranscriptEffectConverter.scala
    │       │                   └── VariantContextConverter.scala
    │       │               ├── ds
    │       │                   ├── ADAMContext.scala
    │       │                   ├── ADAMParquetInputFormat.scala
    │       │                   ├── ADAMSaveAnyArgs.scala
    │       │                   ├── GenomeBins.scala
    │       │                   ├── GenomicBroadcast.scala
    │       │                   ├── GenomicDataset.scala
    │       │                   ├── GenomicDatasetConversion.scala
    │       │                   ├── GenomicPartitioners.scala
    │       │                   ├── InFormatter.scala
    │       │                   ├── OutFormatter.scala
    │       │                   ├── ReferencePartitioner.scala
    │       │                   ├── RegionJoin.scala
    │       │                   ├── SAMHeaderWriter.scala
    │       │                   ├── ShuffleRegionJoin.scala
    │       │                   ├── TreeRegionJoin.scala
    │       │                   ├── VCFHeaderUtils.scala
    │       │                   ├── feature
    │       │                   │   ├── BEDInFormatter.scala
    │       │                   │   ├── BEDOutFormatter.scala
    │       │                   │   ├── CoverageDataset.scala
    │       │                   │   ├── FeatureDataset.scala
    │       │                   │   ├── FeatureParser.scala
    │       │                   │   ├── Features.scala
    │       │                   │   ├── GFF3HeaderWriter.scala
    │       │                   │   ├── GFF3InFormatter.scala
    │       │                   │   ├── GFF3OutFormatter.scala
    │       │                   │   ├── GTFInFormatter.scala
    │       │                   │   ├── GTFOutFormatter.scala
    │       │                   │   ├── NarrowPeakInFormatter.scala
    │       │                   │   └── NarrowPeakOutFormatter.scala
    │       │                   ├── fragment
    │       │                   │   ├── FragmentDataset.scala
    │       │                   │   ├── InterleavedFASTQInFormatter.scala
    │       │                   │   ├── Tab5InFormatter.scala
    │       │                   │   └── Tab6InFormatter.scala
    │       │                   ├── read
    │       │                   │   ├── ADAMBAMOutputFormat.scala
    │       │                   │   ├── ADAMCRAMOutputFormat.scala
    │       │                   │   ├── ADAMSAMOutputFormat.scala
    │       │                   │   ├── AlignmentDataset.scala
    │       │                   │   ├── AnySAMInFormatter.scala
    │       │                   │   ├── AnySAMOutFormatter.scala
    │       │                   │   ├── BAMInFormatter.scala
    │       │                   │   ├── BinQualities.scala
    │       │                   │   ├── FASTQInFormatter.scala
    │       │                   │   ├── FlagStat.scala
    │       │                   │   ├── MDTagging.scala
    │       │                   │   ├── MarkDuplicates.scala
    │       │                   │   ├── ReadDataset.scala
    │       │                   │   ├── ReferencePositionPair.scala
    │       │                   │   ├── RepairPartitions.scala
    │       │                   │   ├── SAMInFormatter.scala
    │       │                   │   ├── SingleReadBucket.scala
    │       │                   │   ├── realignment
    │       │                   │   │   ├── IndelRealignmentTarget.scala
    │       │                   │   │   ├── ModPartitioner.scala
    │       │                   │   │   ├── RealignIndels.scala
    │       │                   │   │   └── RealignmentTargetFinder.scala
    │       │                   │   └── recalibration
    │       │                   │   │   ├── Aggregate.scala
    │       │                   │   │   ├── BaseQualityRecalibration.scala
    │       │                   │   │   ├── Covariate.scala
    │       │                   │   │   ├── CovariateKey.scala
    │       │                   │   │   ├── CovariateSpace.scala
    │       │                   │   │   ├── CycleCovariate.scala
    │       │                   │   │   ├── DinucCovariate.scala
    │       │                   │   │   ├── Observation.scala
    │       │                   │   │   ├── ObservationTable.scala
    │       │                   │   │   ├── RecalibrationTable.scala
    │       │                   │   │   └── Recalibrator.scala
    │       │                   ├── sequence
    │       │                   │   ├── FASTAInFormatter.scala
    │       │                   │   ├── FlankSlices.scala
    │       │                   │   ├── SequenceDataset.scala
    │       │                   │   └── SliceDataset.scala
    │       │                   └── variant
    │       │                   │   ├── ADAMVCFOutputFormat.scala
    │       │                   │   ├── GenotypeDataset.scala
    │       │                   │   ├── VCFInFormatter.scala
    │       │                   │   ├── VCFOutFormatter.scala
    │       │                   │   ├── VariantContextDataset.scala
    │       │                   │   └── VariantDataset.scala
    │       │               ├── models
    │       │                   ├── Alphabet.scala
    │       │                   ├── Attribute.scala
    │       │                   ├── Coverage.scala
    │       │                   ├── IndelTable.scala
    │       │                   ├── MdTag.scala
    │       │                   ├── NonoverlappingRegions.scala
    │       │                   ├── ReadGroupDictionary.scala
    │       │                   ├── ReferencePosition.scala
    │       │                   ├── ReferenceRegion.scala
    │       │                   ├── SAMFileHeaderWritable.scala
    │       │                   ├── SequenceDictionary.scala
    │       │                   ├── SnpTable.scala
    │       │                   ├── VCFHeaderWritable.scala
    │       │                   └── VariantContext.scala
    │       │               ├── projections
    │       │                   ├── FieldEnumeration.scala
    │       │                   └── Projection.scala
    │       │               ├── rich
    │       │                   ├── RichAlignment.scala
    │       │                   ├── RichCigar.scala
    │       │                   └── RichVariant.scala
    │       │               ├── serialization
    │       │                   └── ADAMKryoRegistrator.scala
    │       │               ├── sql
    │       │                   └── VariantContext.scala
    │       │               └── util
    │       │                   ├── ADAMShell.scala
    │       │                   ├── ASCIITable.scala
    │       │                   ├── AttributeUtils.scala
    │       │                   ├── FileExtensions.scala
    │       │                   ├── FileMerger.scala
    │       │                   ├── GenomeFileReader.scala
    │       │                   ├── IndexedFastaFile.scala
    │       │                   ├── ManualRegionPartitioner.scala
    │       │                   ├── ParallelFileMerger.scala
    │       │                   ├── ParquetFileTraversable.scala
    │       │                   ├── ParquetLogger.scala
    │       │                   ├── PhredUtils.scala
    │       │                   ├── ReferenceFile.scala
    │       │                   ├── ReferenceMap.scala
    │       │                   ├── SequenceDictionaryReader.scala
    │       │                   ├── TextAlignment.scala
    │       │                   ├── TextRddWriter.scala
    │       │                   └── TwoBitFile.scala
    │   └── test
    │       ├── resources
    │           ├── HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_all.fixed-phase-set.excerpt.vcf
    │           ├── HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_all.fixed-phase-set.excerpt.vcf.README
    │           ├── HLA_DQB1_05_01_01_02.dict
    │           ├── HLA_DQB1_05_01_01_02.fa
    │           ├── HLA_DQB1_05_01_01_02.fa.fai
    │           ├── Homo_sapiens.GRCh37.75.trun100.gtf
    │           ├── Homo_sapiens.GRCh37.75.trun20.gtf
    │           ├── Hs_Ensembl_example_genes.gtf
    │           ├── NA12878.1_854950_855150.sam
    │           ├── NA12878.1_922305.G_GC_hom.sam
    │           ├── NA12878.chr22.tiny.freebayes.vcf
    │           ├── NA12878.sam
    │           ├── SeqCap_EZ_Exome_v3.hg19.interval_list
    │           ├── artificial.README.txt
    │           ├── artificial.cram
    │           ├── artificial.fa
    │           ├── artificial.fa.fai
    │           ├── artificial.realigned.sam
    │           ├── artificial.sam
    │           ├── badheader.sam
    │           ├── bams
    │           │   └── small.bam
    │           ├── bqsr1-r1.fq
    │           ├── bqsr1-r2.fq
    │           ├── bqsr1-ref.observed
    │           ├── bqsr1.sam
    │           ├── bqsr1.snps
    │           ├── bqsr1.vcf
    │           ├── bqsr1.vcf.tbi
    │           ├── chr20.250k.fa.gz
    │           ├── chromInfo.txt
    │           ├── combined_2018-05-18.9900-10050.fastq
    │           ├── ctg123.fasta.gff3
    │           ├── dict_with_accession.dict
    │           ├── dvl1.200.bed
    │           ├── dvl1.200.gff3
    │           ├── dvl1.200.gtf
    │           ├── env_test_command.sh
    │           ├── example_intervals.list
    │           ├── fastq_nobases.fq
    │           ├── fastq_noqual.fq
    │           ├── fastq_sample1.fq
    │           ├── fastq_sample1.fq.bgz
    │           ├── fastq_sample1.fq.bz2
    │           ├── fastq_sample1.fq.gz
    │           ├── fastq_sample2.fq
    │           ├── fastq_sample3.fq
    │           ├── fastq_sample4.fq
    │           ├── fastq_to_usam.py
    │           ├── gencode.chr20.transcript_names.head10.txt
    │           ├── gencode.v19.annotation.chr20.250k.gtf
    │           ├── gencode.v19.pc_transcripts.250k.fa.gz
    │           ├── gencode.v7.annotation.trunc10.bed
    │           ├── gvcf_dir
    │           │   ├── gvcf_multiallelic.g.vcf
    │           │   └── gvcf_multiallelic_noPLs.g.vcf
    │           ├── gvcf_multiallelic
    │           │   └── multiallelic.vcf
    │           ├── hg19.chrM.2bit
    │           ├── hg19.genome
    │           ├── hg19.genome.txt
    │           ├── hs37d5.dict
    │           ├── hs38DH_chr1_10.fa
    │           ├── human_g1k_v37_chr1_59kb.2bit
    │           ├── human_g1k_v37_chr1_59kb.fasta
    │           ├── improper_pairs_1.fq
    │           ├── improper_pairs_2.fq
    │           ├── indexed_bams
    │           │   ├── sorted.2.bai
    │           │   ├── sorted.2.bam
    │           │   ├── sorted.bam
    │           │   └── sorted.bam.bai
    │           ├── inf_float_values.vcf
    │           ├── interleaved_fastq_sample1.ifq
    │           ├── interleaved_fastq_sample1.ifq.bgz
    │           ├── interleaved_fastq_sample1.ifq.bz2
    │           ├── interleaved_fastq_sample1.ifq.gz
    │           ├── interleaved_fastq_sample1.ifq.output
    │           ├── interleaved_fastq_sample2.ifq
    │           ├── interleaved_fastq_sample2.ifq.output
    │           ├── interleaved_fastq_sample3.ifq
    │           ├── interleaved_fastq_sample3.ifq.output
    │           ├── interleaved_fastq_sample4.ifq
    │           ├── interleaved_fastq_sample4.ifq.output
    │           ├── interleaved_fastq_sample5.ifq
    │           ├── interleaved_fastq_sample5.ifq.output
    │           ├── invalid
    │           │   ├── small.INFO_flag.vcf
    │           │   └── truth_small_variants.vcf
    │           ├── legacy.fa
    │           ├── log4j.properties
    │           ├── multi_chr.sam
    │           ├── multiline_fastq.fq
    │           ├── nan_float_values.vcf
    │           ├── ordered.sam
    │           ├── proper_pairs_1.fq
    │           ├── proper_pairs_2.fq
    │           ├── queryname.sam
    │           ├── random.vcf
    │           ├── read_names_with_index_sequences_interleaved.fq
    │           ├── read_names_with_index_sequences_pair1.fq
    │           ├── read_names_with_index_sequences_pair2.fq
    │           ├── readname_sorted.sam
    │           ├── reads-0-2-0
    │           ├── reads12.sam
    │           ├── reads12_diff1.sam
    │           ├── reads13.sam
    │           ├── reads21.sam
    │           ├── sample1.query.sam
    │           ├── sample1.queryname.sam
    │           ├── sample_coverage.bed
    │           ├── single_fastq_sample1.fq.output
    │           ├── single_fastq_sample2.fq.output
    │           ├── single_fastq_sample3.fq.output
    │           ├── single_fastq_sample4.fq.output
    │           ├── small.1.bed
    │           ├── small.1.narrowPeak
    │           ├── small.1.sam
    │           ├── small.1_12.bed
    │           ├── small.addctg.vcf
    │           ├── small.badheader.sam
    │           ├── small.sam
    │           ├── small.vcf
    │           ├── small_missing.vcf
    │           ├── small_realignment_targets.intervals
    │           ├── small_realignment_targets.pileup
    │           ├── small_realignment_targets.sam
    │           ├── small_realignment_targets_README.txt
    │           ├── small_snpeff.vcf
    │           ├── sorted-variants.lex.vcf
    │           ├── sorted-variants.vcf
    │           ├── sorted.lex.vcf
    │           ├── sorted.sam
    │           ├── sorted.vcf
    │           ├── tab5_to_usam.py
    │           ├── tab6_to_usam.py
    │           ├── tag.sam
    │           ├── tags.sam
    │           ├── test.compressed.bcf
    │           ├── test.conf
    │           ├── test.uncompressed.bcf
    │           ├── test.vcf
    │           ├── test.vcf.bgz
    │           ├── test.vcf.bgzf.gz
    │           ├── test.vcf.gz
    │           ├── test_command.sh
    │           ├── test_rowgroup_rangeindex.1.txt
    │           ├── timeout.py
    │           ├── trinity.fa
    │           ├── unmapped.sam
    │           ├── unordered.sam
    │           ├── unsorted.sam
    │           ├── vcf_dir
    │           │   ├── 1.vcf
    │           │   ├── 2.vcf
    │           │   ├── 3.vcf
    │           │   └── zero.vcf
    │           ├── wgEncodeOpenChromDnaseGm19238Pk.trunc10.narrowPeak
    │           └── wgs_calling_regions.hg38.interval_list
    │       └── scala
    │           └── org
    │               └── bdgenomics
    │                   └── adam
    │                       ├── algorithms
    │                           ├── consensus
    │                           │   ├── ConsensusGeneratorFromKnownsSuite.scala
    │                           │   ├── ConsensusGeneratorFromReadsSuite.scala
    │                           │   ├── ConsensusSuite.scala
    │                           │   └── NormalizationUtilsSuite.scala
    │                           └── smithwaterman
    │                           │   └── SmithWatermanSuite.scala
    │                       ├── converters
    │                           ├── AlignmentConverterSuite.scala
    │                           ├── FastqRecordConverterSuite.scala
    │                           ├── FragmentConverterSuite.scala
    │                           ├── TranscriptEffectConverterSuite.scala
    │                           └── VariantContextConverterSuite.scala
    │                       ├── ds
    │                           ├── ADAMContextSuite.scala
    │                           ├── GenomicDatasetSuite.scala
    │                           ├── GenomicPositionPartitionerSuite.scala
    │                           ├── InnerShuffleRegionJoinSuite.scala
    │                           ├── InnerTreeRegionJoinSuite.scala
    │                           ├── LeftOuterShuffleRegionJoinAndGroupByLeftSuite.scala
    │                           ├── LeftOuterShuffleRegionJoinSuite.scala
    │                           ├── OuterRegionJoinSuite.scala
    │                           ├── RightOuterTreeRegionJoinSuite.scala
    │                           ├── SortedGenomicDatasetSuite.scala
    │                           ├── TreeRegionJoinSuite.scala
    │                           ├── feature
    │                           │   ├── CoverageDatasetSuite.scala
    │                           │   ├── FeatureDatasetSuite.scala
    │                           │   └── GFF3HeaderWriterSuite.scala
    │                           ├── fragment
    │                           │   └── FragmentDatasetSuite.scala
    │                           ├── read
    │                           │   ├── AlignmentDatasetSuite.scala
    │                           │   ├── BinQualitiesSuite.scala
    │                           │   ├── FlagStatSuite.scala
    │                           │   ├── MDTaggingSuite.scala
    │                           │   ├── MarkDuplicatesSuite.scala
    │                           │   ├── ReadDatasetSuite.scala
    │                           │   ├── RepairPartitionsSuite.scala
    │                           │   ├── SingleReadBucketSuite.scala
    │                           │   ├── realignment
    │                           │   │   ├── IndelRealignmentTargetSuite.scala
    │                           │   │   ├── ModPartitionerSuite.scala
    │                           │   │   └── RealignIndelsSuite.scala
    │                           │   └── recalibration
    │                           │   │   ├── BaseQualityRecalibrationSuite.scala
    │                           │   │   ├── CycleCovariateSuite.scala
    │                           │   │   ├── DinucCovariateSuite.scala
    │                           │   │   ├── RecalibrationTableSuite.scala
    │                           │   │   └── RecalibratorSuite.scala
    │                           ├── sequence
    │                           │   ├── FlankSlicesSuite.scala
    │                           │   ├── SequenceDatasetSuite.scala
    │                           │   └── SliceDatasetSuite.scala
    │                           └── variant
    │                           │   ├── GenotypeDatasetSuite.scala
    │                           │   ├── VariantContextDatasetSuite.scala
    │                           │   └── VariantDatasetSuite.scala
    │                       ├── io
    │                           ├── InterleavedFastqInputFormatSuite.scala
    │                           └── SingleFastqInputFormatSuite.scala
    │                       ├── models
    │                           ├── AlphabetSuite.scala
    │                           ├── CoverageSuite.scala
    │                           ├── IndelTableSuite.scala
    │                           ├── MdTagSuite.scala
    │                           ├── NonoverlappingRegionsSuite.scala
    │                           ├── ReadGroupDictionarySuite.scala
    │                           ├── ReferencePositionSuite.scala
    │                           ├── ReferenceRegionSuite.scala
    │                           ├── SequenceDictionarySuite.scala
    │                           └── SnpTableSuite.scala
    │                       ├── rich
    │                           ├── RichAlignmentSuite.scala
    │                           └── RichCigarSuite.scala
    │                       └── util
    │                           ├── ADAMFunSuite.scala
    │                           ├── AttributeUtilsSuite.scala
    │                           ├── FileExtensionsSuite.scala
    │                           ├── FileMergerSuite.scala
    │                           ├── IndexedFastaFileSuite.scala
    │                           ├── ParallelFileMergerSuite.scala
    │                           ├── PhredUtilsSuite.scala
    │                           └── TwoBitFileSuite.scala
├── adam-distribution
    ├── pom.xml
    └── src
    │   └── main
    │       └── assembly
    │           └── assembly.xml
├── adam-python
    ├── .gitignore
    ├── MANIFEST.in
    ├── Makefile
    ├── README.md
    ├── bdgenomics
    │   ├── __init__.py
    │   └── adam
    │   │   ├── .gitignore
    │   │   ├── __init__.py
    │   │   ├── adamContext.py
    │   │   ├── ds.py
    │   │   ├── find_adam_home.py
    │   │   ├── models.py
    │   │   ├── stringency.py
    │   │   └── test
    │   │       ├── __init__.py
    │   │       ├── adamContext_test.py
    │   │       ├── alignmentDataset_test.py
    │   │       ├── coverageDataset_test.py
    │   │       ├── featureDataset_test.py
    │   │       ├── genotypeDataset_test.py
    │   │       └── variantDataset_test.py
    ├── pom.xml
    ├── setup.py
    └── version.py
├── adam-r
    ├── .gitignore
    ├── bdgenomics.adam
    │   ├── DESCRIPTION
    │   ├── NAMESPACE
    │   ├── R
    │   │   ├── adam-context.R
    │   │   ├── ds.R
    │   │   └── generics.R
    │   └── tests
    │   │   ├── testthat.R
    │   │   └── testthat
    │   │       ├── helpers.R
    │   │       ├── test_adamContext.R
    │   │       ├── test_alignmentDataset.R
    │   │       ├── test_featureDataset.R
    │   │       ├── test_genotypeDataset.R
    │   │       └── test_variantDataset.R
    └── pom.xml
├── bin
    ├── adam-shell
    ├── adam-submit
    ├── adamR
    ├── find-adam-assembly.sh
    ├── find-adam-egg.sh
    ├── find-adam-home
    ├── find-spark.sh
    └── pyadam
├── docs
    ├── .gitignore
    ├── Makefile
    ├── _static
    │   ├── favicon.ico
    │   └── logo.png
    ├── algorithms
    │   ├── bqsr.rst
    │   ├── dm.rst
    │   ├── joins.rst
    │   ├── reads.rst
    │   └── ri.rst
    ├── api
    │   ├── adamContext.rst
    │   ├── genomicDataset.rst
    │   ├── img
    │   │   ├── join_examples.png
    │   │   └── join_rdds.png
    │   ├── joins.rst
    │   ├── overview.rst
    │   ├── pipes.rst
    │   └── python.rst
    ├── architecture
    │   ├── evidence.rst
    │   ├── img
    │   │   ├── grdd.pdf
    │   │   ├── grdd.png
    │   │   ├── stack-model.pdf
    │   │   └── stack-model.png
    │   ├── overview.rst
    │   ├── schemas.rst
    │   └── stackModel.rst
    ├── benchmarks
    │   ├── algorithms.rst
    │   ├── img
    │   │   ├── bam.pdf
    │   │   ├── bam.png
    │   │   ├── bed.pdf
    │   │   ├── bed.png
    │   │   ├── gff.pdf
    │   │   ├── gff.png
    │   │   ├── plot-speedup.py
    │   │   ├── speedup-bqsr.pdf
    │   │   ├── speedup-bqsr.png
    │   │   ├── speedup-ir.pdf
    │   │   ├── speedup-ir.png
    │   │   ├── speedup-md.pdf
    │   │   ├── speedup-md.png
    │   │   ├── vcf.pdf
    │   │   └── vcf.png
    │   └── storage.rst
    ├── citing.rst
    ├── cli
    │   ├── actions.rst
    │   ├── conversions.rst
    │   ├── overview.rst
    │   └── printers.rst
    ├── conf.py
    ├── deploying
    │   ├── aws.rst
    │   ├── gcp.rst
    │   ├── slurm.rst
    │   ├── toil.rst
    │   └── yarn.rst
    ├── downstream
    │   ├── cli.rst
    │   ├── library.rst
    │   └── overview.rst
    ├── img
    │   ├── bdgenomics-stack.key
    │   ├── bdgenomics-stack.png
    │   ├── source
    │   │   └── file_benchmarks.py
    │   └── stack-model.ai
    ├── index.rst
    ├── installation
    │   ├── example.rst
    │   ├── pip.rst
    │   └── source.rst
    ├── references.rst
    ├── requirements.txt
    ├── source
    │   └── bibliography.bib
    └── template.tex
├── pom.xml
└── scripts
    ├── fastq-interleaver.py
    ├── make-flag-values-sam.py
    ├── move_to_scala_2.11.sh
    ├── move_to_scala_2.12.sh
    ├── move_to_spark_2.sh
    ├── move_to_spark_3.sh
    └── release.sh


/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI  # Pull-request validation: builds and tests the Maven project.
 2 | 
 3 | on: pull_request  # Runs for every pull request event.
 4 | 
 5 | jobs:
 6 |   build:
 7 |     runs-on: ubuntu-latest
 8 | 
 9 |     env:
10 |       SPARK_LOCAL_IP: localhost  # NOTE(review): presumably pins Spark to loopback on the runner - confirm.
11 |     steps:
12 |       - uses: actions/checkout@v4  # Current major, in line with actions/cache@v4 below.
13 |       - uses: actions/setup-java@v4  # Current major; same java-version/distribution inputs.
14 |         with:
15 |           java-version: '11'
16 |           distribution: 'temurin'
17 |       - uses: actions/cache@v4  # Caches ~/.m2 between runs.
18 |         with:
19 |           path: ~/.m2
20 |           key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}  # Key changes whenever any pom.xml changes.
21 |           restore-keys: ${{ runner.os }}-m2  # Prefix fallback when there is no exact key match.
22 |       - run: mvn --batch-mode --update-snapshots clean package
23 | 


--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
 1 | name: Deploy Snapshot  # Runs `mvn deploy` on every push to master.
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |     - master
 7 | 
 8 | jobs:
 9 |   build:
10 |     runs-on: ubuntu-latest
11 |     if: ${{ github.repository == 'bigdatagenomics/adam' }}  # Only runs in this repository, so forks skip the job.
12 |     env:
13 |       SPARK_LOCAL_IP: localhost  # NOTE(review): presumably pins Spark to loopback on the runner - confirm.
14 |     steps:
15 |       - uses: actions/checkout@v4  # Current major, in line with actions/cache@v4 below.
16 |       - uses: actions/setup-java@v4  # Current major; same inputs as before.
17 |         with:
18 |           java-version: '11'
19 |           distribution: 'temurin'
20 |           server-id: sonatype-nexus-snapshots  # NOTE(review): should match the snapshot repository id in pom.xml - confirm.
21 |           server-username: MAVEN_USERNAME  # Name of an environment variable, not the value itself.
22 |           server-password: MAVEN_PASSWORD  # Name of an environment variable, not the value itself.
23 |       - uses: actions/cache@v4  # Caches ~/.m2 between runs.
24 |         with:
25 |           path: ~/.m2
26 |           key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}  # Key changes whenever any pom.xml changes.
27 |           restore-keys: ${{ runner.os }}-m2  # Prefix fallback when there is no exact key match.
28 |       - env:
29 |           MAVEN_USERNAME: ${{ secrets.OSS_SONATYPE_USERNAME }}  # Supplies the variable named by server-username.
30 |           MAVEN_PASSWORD: ${{ secrets.OSS_SONATYPE_PASSWORD }}  # Supplies the variable named by server-password.
31 |         run: mvn --batch-mode -DskipTests=true deploy  # Tests are skipped for this deploy step.
32 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea
 2 | *.iml
 3 | target
 4 | adam*.jar
 5 | build
 6 | *~
 7 | \#*
 8 | *.bak
 9 | *.bam*
10 | *.adam*
11 | *.log
12 | .*.swp
13 | .DS_Store
14 | *#*
15 | .Rproj.user
16 | .Rhistory
17 | .RData
18 | 


--------------------------------------------------------------------------------
/LICENSE_header.txt:
--------------------------------------------------------------------------------
 1 | Licensed to Big Data Genomics (BDG) under one
 2 | or more contributor license agreements.  See the NOTICE file
 3 | distributed with this work for additional information
 4 | regarding copyright ownership.  The BDG licenses this file
 5 | to you under the Apache License, Version 2.0 (the
 6 | "License"); you may not use this file except in compliance
 7 | with the License.  You may obtain a copy of the License at
 8 | 
 9 |     http://www.apache.org/licenses/LICENSE-2.0
10 | 
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
16 | 


--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
 1 | How to get support for ADAM
 2 | ===========================
 3 | 
 4 | ### Join the ADAM Gitter channel
 5 | 
 6 | The primary mechanism for communication between ADAM developers and users is the [ADAM Gitter channel](https://gitter.im/bigdatagenomics/adam).
 7 | 
 8 | 
 9 | ### Join the ADAM IRC channel
10 | 
11 | If you prefer IRC, you can often find ADAM developers and users on [Libera.Chat](https://libera.chat/) in the #adamdev room.
12 | 
13 | 
14 | ### Join the ADAM developers mailing list
15 | 
16 | The ADAM project also hosts a developers mailing list; see http://bdgenomics.org/mail/ for details.
17 | 
18 | 
19 | ### Search the Github issue tracker and pull requests
20 | 
21 | Before creating a new issue, please search the [ADAM issue tracker](https://github.com/bigdatagenomics/adam/issues)
22 | and [ADAM open pull requests](https://github.com/bigdatagenomics/adam/pulls) on GitHub.
23 | 
24 | 
25 | ### Create a new issue on Github
26 | 
27 | If you have identified a new issue, please [create a new issue](https://github.com/bigdatagenomics/adam/issues/new)
28 | on the ADAM issue tracker on GitHub and prepare supporting material, such as scripts, test cases, and data that
29 | provide context for your issue.  The [How to submit a contribution](https://opensource.guide/how-to-contribute/)
30 | Open Source Guide is very helpful in this regard.
31 | 
32 | Thank you for contributing to ADAM!
33 | 


--------------------------------------------------------------------------------
/adam-apis/src/main/scala/org/bdgenomics/adam/api/python/DataFrameConversionWrapper.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.api.python
19 | 
20 | import org.apache.spark.api.java.function.{ Function => JFunction }
21 | import org.apache.spark.sql.DataFrame
22 | 
23 | // Function whose result is always the DataFrame supplied at construction.
24 | class DataFrameConversionWrapper(newDf: DataFrame)
25 |     extends JFunction[DataFrame, DataFrame] {
26 | 
27 |   // v1 is never read; the wrapped DataFrame is handed back unchanged
28 |   def call(v1: DataFrame): DataFrame = newDf
29 | }
30 | 


--------------------------------------------------------------------------------
/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMCoverageConduit.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.api.java;
19 | 
20 | import java.io.IOException;
21 | import java.nio.file.Files;
22 | import java.nio.file.Path;
23 | import org.bdgenomics.adam.ds.ADAMContext;
24 | import org.bdgenomics.adam.ds.feature.CoverageDataset;
25 | 
26 | /**
27 |  * A simple test class for the JavaADAMRDD/Context. Writes an RDD of coverage to
28 |  * disk and reads it back.
29 |  */
30 | final class JavaADAMCoverageConduit {
31 |     public static CoverageDataset conduit(final CoverageDataset recordRdd,
32 |                                           final ADAMContext ac) throws IOException {
33 | 
34 |         // the saved file lives beneath a freshly created temporary directory
35 |         final Path scratchDir = Files.createTempDirectory("javaAC");
36 |         final String savedPath = scratchDir.toString() + "/testRdd.coverage.adam";
37 |         recordRdd.save(savedPath, false, false);
38 | 
39 |         // read it back through a JavaADAMContext wrapping the supplied
40 |         // ADAMContext
41 |         return new JavaADAMContext(ac).loadCoverage(savedPath);
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMFeatureConduit.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.api.java;
19 | 
20 | import java.io.IOException;
21 | import java.nio.file.Files;
22 | import java.nio.file.Path;
23 | import org.bdgenomics.adam.ds.ADAMContext;
24 | import org.bdgenomics.adam.ds.feature.FeatureDataset;
25 | 
26 | /**
27 |  * A simple test class for the JavaADAMRDD/Context. Writes an RDD of features to
28 |  * disk and reads it back.
29 |  */
30 | final class JavaADAMFeatureConduit {
31 |     public static FeatureDataset conduit(final FeatureDataset recordRdd,
32 |                                          final ADAMContext ac) throws IOException {
33 | 
34 |         // the saved file lives beneath a freshly created temporary directory
35 |         final Path scratchDir = Files.createTempDirectory("javaAC");
36 |         final String savedPath = scratchDir.toString() + "/testRdd.feature.adam";
37 |         recordRdd.save(savedPath, false, false);
38 | 
39 |         // read it back through a JavaADAMContext wrapping the supplied
40 |         // ADAMContext
41 |         return new JavaADAMContext(ac).loadFeatures(savedPath);
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMFragmentConduit.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.api.java;
19 | 
20 | import java.io.IOException;
21 | import java.nio.file.Files;
22 | import java.nio.file.Path;
23 | import org.bdgenomics.adam.ds.ADAMContext;
24 | import org.bdgenomics.adam.ds.fragment.FragmentDataset;
25 | 
26 | /**
27 |  * A simple test class for the JavaADAMRDD/Context. Writes an RDD of fragments to
28 |  * disk and reads it back.
29 |  */
30 | final class JavaADAMFragmentConduit {
31 |     public static FragmentDataset conduit(final FragmentDataset recordRdd,
32 |                                           final ADAMContext ac) throws IOException {
33 | 
34 |         // the saved file lives beneath a freshly created temporary directory
35 |         final Path scratchDir = Files.createTempDirectory("javaAC");
36 |         final String savedPath = scratchDir.toString() + "/testRdd.fragment.adam";
37 |         recordRdd.save(savedPath);
38 | 
39 |         // read it back through a JavaADAMContext wrapping the supplied
40 |         // ADAMContext
41 |         return new JavaADAMContext(ac).loadFragments(savedPath);
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMGenotypeConduit.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.api.java;
19 | 
20 | import java.io.IOException;
21 | import java.nio.file.Files;
22 | import java.nio.file.Path;
23 | import org.bdgenomics.adam.ds.ADAMContext;
24 | import org.bdgenomics.adam.ds.variant.GenotypeDataset;
25 | 
26 | /**
27 |  * A simple test class for the JavaADAMRDD/Context. Writes an RDD of genotypes to
28 |  * disk and reads it back.
29 |  */
30 | final class JavaADAMGenotypeConduit {
31 |     public static GenotypeDataset conduit(final GenotypeDataset recordRdd,
32 |                                           final ADAMContext ac) throws IOException {
33 | 
34 |         // the Parquet output lives beneath a freshly created temporary directory
35 |         final Path scratchDir = Files.createTempDirectory("javaAC");
36 |         final String savedPath = scratchDir.toString() + "/testRdd.genotype.adam";
37 |         recordRdd.saveAsParquet(savedPath);
38 | 
39 |         // read it back through a JavaADAMContext wrapping the supplied
40 |         // ADAMContext
41 |         return new JavaADAMContext(ac).loadGenotypes(savedPath);
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMReadConduit.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.api.java;
19 | 
20 | import java.io.IOException;
21 | import java.nio.file.Files;
22 | import java.nio.file.Path;
23 | import org.bdgenomics.adam.ds.ADAMContext;
24 | import org.bdgenomics.adam.ds.read.AlignmentDataset;
25 | 
26 | /**
27 |  * A simple test class for the JavaADAMRDD/Context. Writes an RDD of reads to
28 |  * disk and reads it back.
29 |  */
30 | class JavaADAMReadConduit {
31 |     public static AlignmentDataset conduit(final AlignmentDataset recordRdd,
32 |                                            final ADAMContext ac) throws IOException {
33 | 
34 |         // the saved file lives beneath a freshly created temporary directory
35 |         final Path scratchDir = Files.createTempDirectory("javaAC");
36 |         final String savedPath = scratchDir.toString() + "/testRdd.read.adam";
37 |         recordRdd.save(savedPath, false);
38 | 
39 |         // read it back through a JavaADAMContext wrapping the supplied
40 |         // ADAMContext
41 |         return new JavaADAMContext(ac).loadAlignments(savedPath);
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSequenceConduit.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.api.java;
19 | 
20 | import java.io.IOException;
21 | import java.nio.file.Files;
22 | import java.nio.file.Path;
23 | 
24 | import org.bdgenomics.adam.ds.ADAMContext;
25 | import org.bdgenomics.adam.ds.sequence.SequenceDataset;
26 | 
27 | /**
28 |  * A simple test class for the JavaADAMRDD/Context. Writes an RDD of sequences
29 |  * to disk and reads it back.
30 |  */
31 | final class JavaADAMSequenceConduit {
32 |     public static SequenceDataset conduit(final SequenceDataset sequenceDataset,
33 |                                           final ADAMContext ac) throws IOException {
34 | 
35 |         // the saved file lives beneath a freshly created temporary directory
36 |         final Path scratchDir = Files.createTempDirectory("javaAC");
37 |         final String savedPath = scratchDir.toString() + "/testRdd.sequences.adam";
38 |         sequenceDataset.save(savedPath, true, true);
39 | 
40 |         // read it back as DNA sequences through a JavaADAMContext wrapping
41 |         // the supplied ADAMContext
42 |         return new JavaADAMContext(ac).loadDnaSequences(savedPath);
43 |     }
44 | }
45 | 


--------------------------------------------------------------------------------
/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSliceConduit.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.api.java;
19 | 
20 | import java.io.IOException;
21 | import java.nio.file.Files;
22 | import java.nio.file.Path;
23 | 
24 | import org.bdgenomics.adam.ds.ADAMContext;
25 | import org.bdgenomics.adam.ds.sequence.SliceDataset;
26 | 
27 | /**
28 |  * A simple test class for the JavaADAMRDD/Context. Writes an RDD of slices
29 |  * to disk and reads it back.
30 |  */
31 | final class JavaADAMSliceConduit {
32 |     public static SliceDataset conduit(final SliceDataset sliceDataset,
33 |                                        final ADAMContext ac) throws IOException {
34 | 
35 |         // the saved file lives beneath a freshly created temporary directory
36 |         final Path scratchDir = Files.createTempDirectory("javaAC");
37 |         final String savedPath = scratchDir.toString() + "/testRdd.slices.adam";
38 |         sliceDataset.save(savedPath, true, true);
39 | 
40 |         // read it back through a JavaADAMContext wrapping the supplied
41 |         // ADAMContext, passing 10000 as the second loadSlices argument
42 |         return new JavaADAMContext(ac).loadSlices(savedPath, 10000);
43 |     }
44 | }
45 | 


--------------------------------------------------------------------------------
/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMVariantConduit.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.api.java;
19 | 
20 | import java.io.IOException;
21 | import java.nio.file.Files;
22 | import java.nio.file.Path;
23 | import org.bdgenomics.adam.ds.ADAMContext;
24 | import org.bdgenomics.adam.ds.variant.VariantDataset;
25 | 
26 | /**
27 |  * A simple test class for the JavaADAMRDD/Context. Writes an RDD of variants to
28 |  * disk and reads it back.
29 |  */
30 | final class JavaADAMVariantConduit {
31 |     public static VariantDataset conduit(final VariantDataset recordRdd,
32 |                                          final ADAMContext ac) throws IOException {
33 | 
34 |         // the Parquet output lives beneath a freshly created temporary directory
35 |         final Path scratchDir = Files.createTempDirectory("javaAC");
36 |         final String savedPath = scratchDir.toString() + "/testRdd.variant.adam";
37 |         recordRdd.saveAsParquet(savedPath);
38 | 
39 |         // read it back through a JavaADAMContext wrapping the supplied
40 |         // ADAMContext
41 |         return new JavaADAMContext(ac).loadVariants(savedPath);
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/adam-apis/src/test/resources/indexed_bams/sorted.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-apis/src/test/resources/indexed_bams/sorted.bam


--------------------------------------------------------------------------------
/adam-apis/src/test/resources/indexed_bams/sorted.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-apis/src/test/resources/indexed_bams/sorted.bam.bai


--------------------------------------------------------------------------------
/adam-assembly/src/main/scala/org/bdgenomics/adam/assembly/Assembly.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.assembly
19 | 
20 | /**
21 |  * Empty assembly object, allows Maven build to create sources and javadoc artifacts.
22 |  */
23 | object Assembly {
24 |   // no members by design
25 | }
26 | 


--------------------------------------------------------------------------------
/adam-cli/.gitignore:
--------------------------------------------------------------------------------
1 | dependency-reduced-pom.xml
2 | 


--------------------------------------------------------------------------------
/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CramArgs.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.cli
19 | 
20 | import org.apache.spark.SparkContext
21 | import org.bdgenomics.utils.cli._
22 | import org.kohsuke.args4j.{ Option => Args4jOption }
23 | import org.seqdoop.hadoop_bam.CRAMInputFormat
24 | 
25 | /**
26 |  * Abstract arguments that capture CRAM format configuration.
27 |  */
28 | private[cli] trait CramArgs {
29 | 
30 |   @Args4jOption(required = false, name = "-cram_reference", usage = "CRAM format reference, if necessary")
31 |   var cramReference: String = null
32 | 
33 |   /**
34 |    * Sets the CRAM reference source path on the Hadoop configuration, if one was given.
35 |    *
36 |    * @param sc Spark context whose Hadoop configuration is updated
37 |    */
38 |   def configureCramFormat(sc: SparkContext) = {
39 |     for (reference <- Option(cramReference)) yield {
40 |       sc.hadoopConfiguration.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY, reference)
41 |     }
42 |   }
43 | }
44 | 


--------------------------------------------------------------------------------
/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FileSystemUtils.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.cli
19 | 
20 | import org.apache.hadoop.conf.Configuration
21 | import org.apache.hadoop.fs.Path
22 | import org.apache.hadoop.mapred.FileAlreadyExistsException
23 | 
24 | /**
25 |  * Utility methods for file systems.
26 |  */
27 | private[cli] object FileSystemUtils {
28 |   // True when the path exists on the file system that conf resolves it to.
29 |   private def exists(pathName: String, conf: Configuration): Boolean = {
30 |     val target = new Path(pathName)
31 |     target.getFileSystem(conf).exists(target)
32 |   }
33 | 
34 |   // Throws if the path already exists. Move to BDGSparkCommand in bdg-utils?
35 |   def checkWriteablePath(pathName: String, conf: Configuration): Unit = {
36 |     val alreadyThere = exists(pathName, conf)
37 |     if (alreadyThere)
38 |       throw new FileAlreadyExistsException("Cannot write to path name, %s already exists".format(pathName))
39 |   }
40 | }
41 | 


--------------------------------------------------------------------------------
/adam-cli/src/test/resources/artificial.counts.txt:
--------------------------------------------------------------------------------
 1 | GGGGGGGGAAAAAAAAAAGGG	1
 2 | AAAAAAAAAAAAAAAAAAAAG	1
 3 | GGGGAAAAAAAAAAGGGGGGG	1
 4 | GGGGGGGAAAAAAAAAAGGGG	1
 5 | AGGGGGGGGGGAAAAAAAAAA	2
 6 | AAAAAAAAAAAAAGGGGGGGG	1
 7 | GGGGGAAAAAAAAAAGGGGGG	1
 8 | GAAAAAAAAAAAAAAAAAAAA	1
 9 | AAAAAAAAAAAAAAAGGGGGG	1
10 | GGGGGGGGGAAAAAAAAAAAA	1
11 | AAAAAAAAAAAAAAAAAAAGG	1
12 | AAAGGGGGGGGGGAAAAAAAA	2
13 | GGGGGGGGAAAAAAAAAAAAA	1
14 | GGAAAAAAAAAAGGGGGGGGG	1
15 | GGGGGGAAAAAAAAAAGGGGG	1
16 | AAAAGGGGGGGGGGAAAAAAA	2
17 | GGGGGGGAAAAAAAAAAAAAA	1
18 | AAAAAAAAAAAAAAGGGGGGG	1
19 | GGGGGGGGGGAAAAAAAAAAG	1
20 | AAAAAAAAAGGGGGGGGGGAA	2
21 | AAGGGGGGGGGGAAAAAAAAA	2
22 | GGAAAAAAAAAAAAAAAAAAA	1
23 | AAAAAAAAAAAAAAAAGGGGG	1
24 | AAAAAAAAAAAGGGGGGGGGG	1
25 | AAAAAAAAAAAAAAAAAGGGG	1
26 | GGGGGGGGGGAAAAAAAAAAA	1
27 | GGGAAAAAAAAAAAAAAAAAA	1
28 | GGGGGGGGGAAAAAAAAAAGG	1
29 | GAAAAAAAAAAGGGGGGGGGG	1
30 | AAAAAAGGGGGGGGGGAAAAA	2
31 | AAAAAGGGGGGGGGGAAAAAA	2
32 | GGGAAAAAAAAAAGGGGGGGG	1
33 | GGGGGAAAAAAAAAAAAAAAA	1
34 | AAAAAAAAAAAAAAAAAAAAA	1050
35 | AAAAAAAAAAAAAAAAAAGGG	1
36 | GGGGGGAAAAAAAAAAAAAAA	1
37 | AAAAAAAAGGGGGGGGGGAAA	2
38 | GGGGAAAAAAAAAAAAAAAAA	1
39 | AAAAAAAGGGGGGGGGGAAAA	2
40 | AAAAAAAAAAGGGGGGGGGGA	2
41 | AAAAAAAAAAAAGGGGGGGGG	1
42 | 


--------------------------------------------------------------------------------
/adam-cli/src/test/resources/artificial.fa:
--------------------------------------------------------------------------------
1 | ../../../../adam-core/src/test/resources/artificial.fa


--------------------------------------------------------------------------------
/adam-cli/src/test/resources/artificial.fa.fai:
--------------------------------------------------------------------------------
1 | ../../../../adam-core/src/test/resources/artificial.fa.fai


--------------------------------------------------------------------------------
/adam-cli/src/test/resources/chr5.phyloP46way.trunc.wigFix:
--------------------------------------------------------------------------------
 1 | fixedStep chrom=chr5 start=13940 step=1
 2 | 0.067
 3 | 0.075
 4 | 0.075
 5 | -2.162
 6 | -2.294
 7 | 0.075
 8 | fixedStep chrom=chr5 start=15296 step=1
 9 | 0.139
10 | 0.155
11 | 0.155
12 | 0.139
13 | 


--------------------------------------------------------------------------------
/adam-cli/src/test/resources/gencode.v7.annotation.trunc10.bed:
--------------------------------------------------------------------------------
 1 | chr1	11869	14409	gene	.	+	"pseudogene"
 2 | chr1	11869	14409	transcript	.	+	"processed_transcript"
 3 | chr1	11869	12227	exon	.	+	"processed_transcript"
 4 | chr1	12613	12721	exon	.	+	"processed_transcript"
 5 | chr1	13221	14409	exon	.	+	"processed_transcript"
 6 | chr1	12010	13670	transcript	.	+	"transcribed_unprocessed_pseudogene"
 7 | chr1	12010	12057	exon	.	+	"transcribed_unprocessed_pseudogene"
 8 | chr1	12179	12227	exon	.	+	"transcribed_unprocessed_pseudogene"
 9 | chr1	12613	12697	exon	.	+	"transcribed_unprocessed_pseudogene"
10 | chr1	12975	13052	exon	.	+	"transcribed_unprocessed_pseudogene"
11 | 


--------------------------------------------------------------------------------
/adam-cli/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | # Root logger option
 2 | log4j.rootLogger=INFO, stderr, logfile
 3 | 
 4 | # Direct log messages to stderr
 5 | log4j.appender.stderr=org.apache.log4j.ConsoleAppender
 6 | log4j.appender.stderr.Target=System.err
 7 | log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
 8 | log4j.appender.stderr.threshold=WARN
 9 | log4j.appender.stderr.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
10 | 
11 | # Log at INFO level to file
12 | log4j.appender.logfile=org.apache.log4j.FileAppender
13 | log4j.appender.logfile.append=true
14 | log4j.appender.logfile.file=adam.log
15 | log4j.appender.logfile.threshold=INFO
16 | log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
17 | log4j.appender.logfile.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
18 | log4j.appender.logfile.encoding=UTF-8
19 | 
20 | # Silence Parquet logging below ERROR level
21 | log4j.logger.org.apache.parquet=ERROR
22 | 


--------------------------------------------------------------------------------
/adam-cli/src/test/resources/sorted.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-cli/src/test/resources/sorted.bam


--------------------------------------------------------------------------------
/adam-cli/src/test/resources/sorted.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-cli/src/test/resources/sorted.bam.bai


--------------------------------------------------------------------------------
/adam-cli/src/test/resources/sorted.counts.txt:
--------------------------------------------------------------------------------
1 | ACACACAC	1
2 | ACACACACACAC	1
3 | ACACACACAC	3
4 | 


--------------------------------------------------------------------------------
/adam-cli/src/test/scala/org/bdgenomics/adam/cli/AboutSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.cli
19 | 
20 | import org.scalatest.FunSuite
21 | 
22 | /**
23 |  * Checks that the templated properties exposed by About were filled in.
24 |  */
25 | class AboutSuite extends FunSuite {
26 |   val about = new About()
27 | 
28 |   test("template variables have been replaced") {
29 |     // each property paired with the raw placeholder it must no longer equal
30 |     val placeholders = Seq(
31 |       about.artifactId -> "${project.artifactId}",
32 |       about.buildTimestamp -> "${maven.build.timestamp}",
33 |       about.scalaVersion -> "${scala.version}",
34 |       about.sparkVersion -> "${spark.version}",
35 |       about.version -> "${version}")
36 | 
37 |     placeholders.foreach {
38 |       case (resolved, placeholder) => assert(resolved !== placeholder)
39 |     }
40 |   }
41 | 
42 |   test("templated values are not empty") {
43 |     val resolvedValues = Seq(
44 |       about.artifactId,
45 |       about.buildTimestamp,
46 |       about.scalaVersion,
47 |       about.sparkVersion,
48 |       about.version)
49 | 
50 |     resolvedValues.foreach(value => assert(value.nonEmpty))
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/adam-cli/src/test/scala/org/bdgenomics/adam/cli/CountReadKmersSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.cli
19 | 
20 | import org.bdgenomics.adam.ds.ADAMContext._
21 | import org.bdgenomics.adam.util.ADAMFunSuite
22 | 
23 | /**
24 |  * Runs the CountReadKmers command and compares its output with a stored file.
25 |  */
26 | class CountReadKmersSuite extends ADAMFunSuite {
27 |   sparkTest("count kmers to single file") {
28 |     val samPath = copyResource("sorted.sam")
29 |     val observedCounts = tmpFile("sorted.counts.txt")
30 |     val expectedCounts = copyResource("sorted.counts.txt")
31 | 
32 |     // NOTE(review): the trailing "21" is presumably the k-mer length -- confirm
33 |     // against the command's argument definitions
34 |     val command = CountReadKmers(Array("-single", samPath, observedCounts, "21"))
35 |     command.run(sc)
36 | 
37 |     checkFiles(expectedCounts, observedCounts)
38 |   }
39 | }
40 | 


--------------------------------------------------------------------------------
/adam-cli/src/test/scala/org/bdgenomics/adam/cli/CountSliceKmersSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.cli
19 | 
20 | import org.bdgenomics.adam.ds.ADAMContext._
21 | import org.bdgenomics.adam.util.ADAMFunSuite
22 | 
23 | /**
24 |  * Runs the CountSliceKmers command and compares its output with a stored file.
25 |  */
26 | class CountSliceKmersSuite extends ADAMFunSuite {
27 |   sparkTest("count slice kmers to single file") {
28 |     val fastaPath = copyResource("artificial.fa")
29 |     val observedCounts = tmpFile("artificial.counts.txt")
30 |     val expectedCounts = copyResource("artificial.counts.txt")
31 | 
32 |     // NOTE(review): the trailing "21" is presumably the k-mer length -- confirm
33 |     // against the command's argument definitions
34 |     val command = CountSliceKmers(Array("-single", fastaPath, observedCounts, "21"))
35 |     command.run(sc)
36 | 
37 |     checkFiles(expectedCounts, observedCounts)
38 |   }
39 | }
40 | 


--------------------------------------------------------------------------------
/adam-cli/src/test/scala/org/bdgenomics/adam/cli/CoverageSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.cli
19 | 
20 | import org.bdgenomics.adam.ds.ADAMContext._
21 | import org.bdgenomics.adam.util.ADAMFunSuite
22 | import org.bdgenomics.utils.cli.Args4j
23 | 
24 | /**
25 |  * Runs the Coverage command on a small SAM resource and spot-checks the result.
26 |  */
27 | class CoverageSuite extends ADAMFunSuite {
28 | 
29 |   sparkTest("correctly calculates coverage from small sam file") {
30 |     val inputPath = copyResource("artificial.sam")
31 |     val outputPath = tmpFile("coverage.adam")
32 | 
33 |     val command = new Coverage(Args4j[CoverageArgs](Array(inputPath, outputPath)))
34 |     command.run(sc)
35 | 
36 |     // reload what the command saved and pick the first record starting at 30
37 |     val reloaded = sc.loadCoverage(outputPath)
38 |     val startsAtThirty = reloaded.flatten.rdd.filter(record => record.start == 30)
39 |     assert(startsAtThirty.first.count == 5)
40 |   }
41 | }
42 | 
43 | 


--------------------------------------------------------------------------------
/adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformFeaturesSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.cli
19 | 
20 | import java.io._
21 | import org.bdgenomics.adam.util.ADAMFunSuite
22 | import org.bdgenomics.adam.ds.ADAMContext._
23 | import org.bdgenomics.utils.cli.Args4j
24 | 
25 | /**
26 |  * Checks that TransformFeatures converts a small BED resource and that the
27 |  * saved output can be loaded back as features.
28 |  */
29 | class TransformFeaturesSuite extends ADAMFunSuite {
30 | 
31 |   sparkTest("can convert a simple BED file") {
32 | 
33 |     val loader = Thread.currentThread().getContextClassLoader
34 |     val inputPath = loader.getResource("gencode.v7.annotation.trunc10.bed").getPath
35 |     val outputFile = File.createTempFile("adam-cli.TransformFeaturesSuite", ".adam")
36 |     val outputPath = outputFile.getAbsolutePath
37 | 
38 |     // Hand the two paths over as separate arguments. Joining them into one
39 |     // string and splitting on whitespace would break up any path that itself
40 |     // contains whitespace (e.g. a temp directory with a space in its name).
41 |     val argLine = Array(inputPath, outputPath)
42 | 
43 |     // We have to do this, since the features2adam won't work if the file already exists,
44 |     // but the "createTempFile" method actually creates the file (on some systems?)
45 |     assert(outputFile.delete(), "Couldn't delete (empty) temp file")
46 | 
47 |     val args: TransformFeaturesArgs = Args4j.apply[TransformFeaturesArgs](argLine)
48 | 
49 |     val features2Adam = new TransformFeatures(args)
50 |     features2Adam.run(sc)
51 | 
52 |     val converted = sc.loadFeatures(outputPath).rdd.collect
53 | 
54 |     // expect ten converted records, every one of them on chr1
55 |     assert(converted.size === 10)
56 |     assert(converted.forall(_.getReferenceName == "chr1"))
57 |   }
58 | }
59 | 


--------------------------------------------------------------------------------
/adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformVariantsSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.cli
19 | 
20 | import org.bdgenomics.adam.util.ADAMFunSuite
21 | 
22 | /**
23 |  * Chains two TransformVariants runs and compares the final output with an
24 |  * expected resource.
25 |  */
26 | class TransformVariantsSuite extends ADAMFunSuite {
27 | 
28 |   /**
29 |    * Runs TransformVariants on random.vcf into an intermediate path, runs it
30 |    * again from that path with the given sort flag plus -single, and compares
31 |    * the result with the expected resource of the same name.
32 |    *
33 |    * @param intermediateName File name for the intermediate output.
34 |    * @param vcfName File name of both the final output and the expected resource.
35 |    * @param sortFlag Command line flag passed to the second run.
36 |    */
37 |   private def checkSortedSave(intermediateName: String,
38 |                               vcfName: String,
39 |                               sortFlag: String): Unit = {
40 |     val sourceVcf = copyResource("random.vcf")
41 |     val intermediate = tmpFile(intermediateName)
42 |     val observedVcf = tmpFile(vcfName)
43 |     val expectedVcf = copyResource(vcfName)
44 | 
45 |     val firstRun = TransformVariants(Array(sourceVcf, intermediate))
46 |     firstRun.run(sc)
47 | 
48 |     val secondRun = TransformVariants(Array(intermediate, observedVcf, sortFlag, "-single"))
49 |     secondRun.run(sc)
50 | 
51 |     checkFiles(expectedVcf, observedVcf)
52 |   }
53 | 
54 |   sparkTest("save a file sorted by contig index") {
55 |     checkSortedSave("variants.adam", "sorted-variants.vcf", "-sort_on_save")
56 |   }
57 | 
58 |   sparkTest("save a lexicographically sorted file") {
59 |     checkSortedSave("variants.lex.adam", "sorted-variants.lex.vcf", "-sort_lexicographically_on_save")
60 |   }
61 | }
62 | 


--------------------------------------------------------------------------------
/adam-codegen/src/main/scala/org/bdgenomics/adam/codegen/Generator.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.codegen
19 | 
20 | import java.io.FileWriter
21 | 
22 | /**
23 |  * Mixin that supplies the common preamble written at the top of generated files.
24 |  */
25 | trait Generator {
26 | 
27 |   /**
28 |    * Writes the license banner, a package clause for packageName, a blank
29 |    * line and two scala.collection conversion imports, joined by newlines.
30 |    * No newline is written after the final import.
31 |    *
32 |    * @param fw The writer that receives the preamble.
33 |    * @param packageName The package name placed in the package clause.
34 |    */
35 |   protected def writeHeader(fw: FileWriter, packageName: String): Unit = {
36 |     val licenseBanner = Seq(
37 |       "/**",
38 |       "* Licensed to Big Data Genomics (BDG) under one",
39 |       "* or more contributor license agreements.  See the NOTICE file",
40 |       "* distributed with this work for additional information",
41 |       "* regarding copyright ownership.  The BDG licenses this file",
42 |       "* to you under the Apache License, Version 2.0 (the",
43 |       "* \"License\"); you may not use this file except in compliance",
44 |       "* with the License.  You may obtain a copy of the License at",
45 |       "*",
46 |       "*     http://www.apache.org/licenses/LICENSE-2.0",
47 |       "*",
48 |       "* Unless required by applicable law or agreed to in writing, software",
49 |       "* distributed under the License is distributed on an \"AS IS\" BASIS,",
50 |       "* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.",
51 |       "* See the License for the specific language governing permissions and",
52 |       "* limitations under the License.",
53 |       "*/")
54 | 
55 |     val packageAndImports = Seq(
56 |       s"package $packageName",
57 |       "",
58 |       "import scala.collection.JavaConversions._",
59 |       "import scala.collection.JavaConverters._")
60 | 
61 |     val preamble = (licenseBanner ++ packageAndImports).mkString("\n")
62 |     fw.write(preamble)
63 |   }
64 | }
65 | 


--------------------------------------------------------------------------------
/adam-codegen/src/main/scala/org/bdgenomics/adam/codegen/ReflectSchema.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.codegen
19 | 
20 | import org.apache.avro.reflect.ReflectData
21 | import org.apache.avro.Schema
22 | 
23 | /**
24 |  * Helper for obtaining Avro schemas from classes via reflection.
25 |  */
26 | object ReflectSchema {
27 | 
28 |   /**
29 |    * Loads the named class with the current thread's context class loader
30 |    * and returns the schema that Avro's ReflectData reports for it.
31 |    *
32 |    * @param className The name of the class to load.
33 |    * @return The schema obtained through reflection.
34 |    */
35 |   private[codegen] def getSchemaByReflection(className: String): Schema = {
36 |     val targetClass = Thread.currentThread().getContextClassLoader().loadClass(className)
37 |     ReflectData.get().getSchema(targetClass)
38 |   }
39 | }
40 | 


--------------------------------------------------------------------------------
/adam-core/.gitignore:
--------------------------------------------------------------------------------
1 | dependency-reduced-pom.xml
2 | src/main/resources/git.properties
3 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/ADAMSaveAnyArgs.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds
19 | 
20 | import org.bdgenomics.utils.cli.SaveArgs
21 | 
22 | /**
23 |  * Argument configuration for saving any output format; extends SaveArgs.
24 |  */
25 | trait ADAMSaveAnyArgs extends SaveArgs {
26 | 
27 |   /**
28 |    * If true and saving as FASTQ, the output is sorted by read name.
29 |    */
30 |   var sortFastqOutput: Boolean
31 | 
32 |   /**
33 |    * If true and saving as a legacy format, shards are written so that they
34 |    * can be merged into a single file.
35 |    *
36 |    * @see deferMerging
37 |    */
38 |   var asSingleFile: Boolean
39 | 
40 |   /**
41 |    * If true and asSingleFile is true, the shards are not merged once they
42 |    * are written, and are left for the user to merge later. If false and
43 |    * asSingleFile is true, the shards are merged on write. If asSingleFile
44 |    * is false, this flag is ignored.
45 |    *
46 |    * @see asSingleFile
47 |    */
48 |   var deferMerging: Boolean
49 | 
50 |   /**
51 |    * If asSingleFile is true and deferMerging is false, disables the use of the
52 |    * fast file concatenation engine when merging.
53 |    */
54 |   var disableFastConcat: Boolean
55 | }
56 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/GenomicBroadcast.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds
19 | 
20 | import org.apache.spark.broadcast.Broadcast
21 | import org.bdgenomics.adam.models.ReferenceRegion
22 | import org.bdgenomics.utils.interval.array.IntervalArray
23 | // Pairs a genomic dataset with a Spark broadcast of an IntervalArray[ReferenceRegion, T]; the constructor is private to the ds package.
24 | case class GenomicBroadcast[T, U <: Product, V <: GenomicDataset[T, U, V]] private[ds] (
25 |     backingDataset: V,
26 |     broadcastTree: Broadcast[IntervalArray[ReferenceRegion, T]]) {
27 | }
28 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/GenomicDatasetConversion.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds
19 | 
20 | import org.apache.spark.api.java.function.Function2
21 | import org.apache.spark.sql.Dataset
22 | import scala.reflect.runtime.universe.TypeTag
23 | // A Spark Java Function2 that takes a genomic dataset of type V and a Dataset[Y] and yields a genomic dataset of type Z.
24 | trait GenomicDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V], X, Y <: Product, Z <: GenomicDataset[X, Y, Z]] extends Function2[V, Dataset[Y], Z] {
25 |   // Type tag for Y, the Product type of the rows in the Dataset handed to call.
26 |   val yTag: TypeTag[Y]
27 |   // Produces the Z result from the genomic dataset v1 and the Dataset[Y] v2.
28 |   def call(v1: V, v2: Dataset[Y]): Z
29 | }
30 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/ReferencePartitioner.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds
19 | 
20 | import org.apache.spark.Partitioner
21 | import org.bdgenomics.adam.models.{
22 |   ReferencePosition,
23 |   ReferenceRegion,
24 |   SequenceDictionary
25 | }
26 | 
27 | /**
28 |  * Partitioner that places every key on the partition belonging to its
29 |  * reference, using the position of the reference name within the supplied
30 |  * sequence dictionary as the partition index.
31 |  *
32 |  * @param sd The sequence dictionary whose records define the partitions.
33 |  */
34 | case class ReferencePartitioner(sd: SequenceDictionary) extends Partitioner {
35 | 
36 |   // names of the dictionary's records, in the order the dictionary lists them
37 |   private val referenceNames = sd.records.map(_.name)
38 | 
39 |   /** One partition per record in the sequence dictionary. */
40 |   override def numPartitions: Int = referenceNames.length
41 | 
42 |   /**
43 |    * Looks up the index of a reference name; fails an assertion if absent.
44 |    */
45 |   private def partitionFromName(name: String): Int = {
46 |     val partitionIndex = referenceNames.indexOf(name)
47 | 
48 |     // surface the dictionary and the offending key to help debugging
49 |     assert(partitionIndex != -1, s"Reference not found in $sd for key $name")
50 | 
51 |     partitionIndex
52 |   }
53 | 
54 |   /**
55 |    * @param key A ReferencePosition or ReferenceRegion.
56 |    * @return The partition index for the key's reference name.
57 |    * @throws IllegalArgumentException if the key is of any other type.
58 |    */
59 |   override def getPartition(key: Any): Int = {
60 |     val referenceName = key match {
61 |       case position: ReferencePosition => position.referenceName
62 |       case region: ReferenceRegion     => region.referenceName
63 |       case _ =>
64 |         throw new IllegalArgumentException("Only ReferencePositions or ReferenceRegions can be used as a key.")
65 |     }
66 |     partitionFromName(referenceName)
67 |   }
68 | }
69 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/RegionJoin.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds
19 | 
20 | import org.bdgenomics.adam.models.ReferenceRegion
21 | import org.apache.spark.rdd.RDD
22 | import scala.reflect.ClassTag
23 | 
24 | /**
25 |  * An abstract, serializable join in the genomic coordinate space between two
26 |  * RDDs whose values are keyed by a ReferenceRegion.
27 |  *
28 |  * @tparam T The type of the values in the left RDD.
29 |  * @tparam U The type of the values in the right RDD.
30 |  * @tparam RT The type of data yielded by the left RDD at the output of the
31 |  *   join. This may not match T if the join is an outer join, etc.
32 |  * @tparam RU The type of data yielded by the right RDD at the output of the
33 |  *   join.
34 |  */
35 | abstract class RegionJoin[T: ClassTag, U: ClassTag, RT, RU] extends Serializable {
36 | 
37 |   /**
38 |    * Performs a region join between two RDDs keyed by ReferenceRegion.
39 |    *
40 |    * @param baseRDD The 'left' side of the join.
41 |    * @param joinedRDD The 'right' side of the join.
42 |    * @return An RDD of pairs (x, y), where x is from baseRDD, y is from joinedRDD, and the region
43 |    *         corresponding to x overlaps the region corresponding to y.
44 |    */
45 |   def partitionAndJoin(
46 |     baseRDD: RDD[(ReferenceRegion, T)],
47 |     joinedRDD: RDD[(ReferenceRegion, U)]): RDD[(RT, RU)]
48 | }
49 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/feature/GFF3HeaderWriter.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.feature
19 | 
20 | import org.apache.hadoop.fs.Path
21 | import org.apache.spark.SparkContext
22 | 
23 | /**
24 |  * Writes the header for a GFF3 file to an otherwise empty file.
25 |  */
26 | private[feature] object GFF3HeaderWriter {
27 | 
28 |   /** The GFF3 version pragma that apply writes to the file. */
29 |   val HEADER_STRING = "##gff-version 3.2.1"
30 | 
31 |   /**
32 |    * Writes a GFF3 header pragma, followed by a newline, to a file.
33 |    *
34 |    * The output stream is closed even if a write fails, so a failed write
35 |    * does not leak the underlying stream.
36 |    *
37 |    * @param filePath The path to write the file to.
38 |    * @param sc The SparkContext, to access the Hadoop FS Configuration.
39 |    */
40 |   def apply(filePath: String,
41 |             sc: SparkContext): Unit = {
42 |     val path = new Path(filePath)
43 |     val fs = path.getFileSystem(sc.hadoopConfiguration)
44 |     val os = fs.create(path)
45 |     try {
46 |       os.write(HEADER_STRING.getBytes)
47 |       os.write("\n".getBytes)
48 |     } finally {
49 |       // always release the stream, including when a write throws
50 |       os.close()
51 |     }
52 |   }
53 | }
54 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/read/BAMInFormatter.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.read
19 | 
20 | import htsjdk.samtools.{
21 |   SAMFileHeader,
22 |   SAMFileWriter,
23 |   SAMFileWriterFactory
24 | }
25 | import java.io.OutputStream
26 | import org.bdgenomics.adam.converters.AlignmentConverter
27 | import org.bdgenomics.adam.models.ReadGroupDictionary
28 | 
/**
 * Companion object of the BAMInFormatter case class. Provides the
 * makeFormatter factory for building an InFormatter that streams BAM.
 */
object BAMInFormatter extends AnySAMInFormatterCompanion[BAMInFormatter] {

  /**
   * Builds a BAMInFormatter from its three constituent parts.
   *
   * @param header The SAM file header for the formatter.
   * @param readGroups The read group dictionary for the formatter.
   * @param converter The alignment converter for the formatter.
   * @return A new BAMInFormatter wrapping the three arguments.
   */
  protected def makeFormatter(header: SAMFileHeader,
                              readGroups: ReadGroupDictionary,
                              converter: AlignmentConverter): BAMInFormatter =
    new BAMInFormatter(header, readGroups, converter)
}
40 | 
/**
 * InFormatter whose writer emits BAM onto the supplied output stream.
 *
 * @param header The SAM file header handed to the BAM writer.
 * @param readGroups The read group dictionary (not used directly in this class).
 * @param converter The alignment converter (not used directly in this class).
 */
case class BAMInFormatter private (
    header: SAMFileHeader,
    readGroups: ReadGroupDictionary,
    converter: AlignmentConverter) extends AnySAMInFormatter[BAMInFormatter] {

  protected val companion = BAMInFormatter

  /**
   * @param os The stream that the BAM output is written to.
   * @return A BAM writer for this formatter's header, bound to the stream.
   */
  protected def makeWriter(os: OutputStream): SAMFileWriter = {
    val writerFactory = new SAMFileWriterFactory()
    // the boolean is htsjdk's "presorted" argument
    writerFactory.makeBAMWriter(header, true, os)
  }
}
53 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/read/SAMInFormatter.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.read
19 | 
20 | import htsjdk.samtools.{
21 |   SAMFileHeader,
22 |   SAMFileWriter,
23 |   SAMFileWriterFactory
24 | }
25 | import java.io.OutputStream
26 | import org.bdgenomics.adam.converters.AlignmentConverter
27 | import org.bdgenomics.adam.models.ReadGroupDictionary
28 | 
/**
 * Companion object of the SAMInFormatter case class. Provides the
 * makeFormatter factory for building an InFormatter that streams SAM.
 */
object SAMInFormatter extends AnySAMInFormatterCompanion[SAMInFormatter] {

  /**
   * Builds a SAMInFormatter from its three constituent parts.
   *
   * @param header The SAM file header for the formatter.
   * @param readGroups The read group dictionary for the formatter.
   * @param converter The alignment converter for the formatter.
   * @return A new SAMInFormatter wrapping the three arguments.
   */
  protected def makeFormatter(header: SAMFileHeader,
                              readGroups: ReadGroupDictionary,
                              converter: AlignmentConverter): SAMInFormatter =
    new SAMInFormatter(header, readGroups, converter)
}
40 | 
/**
 * InFormatter whose writer emits SAM onto the supplied output stream.
 *
 * @param header The SAM file header handed to the SAM writer.
 * @param readGroups The read group dictionary (not used directly in this class).
 * @param converter The alignment converter (not used directly in this class).
 */
case class SAMInFormatter private (
    header: SAMFileHeader,
    readGroups: ReadGroupDictionary,
    converter: AlignmentConverter) extends AnySAMInFormatter[SAMInFormatter] {

  /**
   * No-argument constructor; every field is left as null.
   */
  def this() = this(null, null, null)

  protected val companion = SAMInFormatter

  /**
   * @param os The stream that the SAM output is written to.
   * @return A SAM writer for this formatter's header, bound to the stream.
   */
  protected def makeWriter(os: OutputStream): SAMFileWriter = {
    val writerFactory = new SAMFileWriterFactory()
    // the boolean is htsjdk's "presorted" argument
    writerFactory.makeSAMWriter(header, true, os)
  }
}
57 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/read/realignment/ModPartitioner.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.read.realignment
19 | 
20 | import org.apache.spark.Partitioner
21 | 
/**
 * Partitioner that maps an Int key to a partition by taking the key's
 * absolute value modulo the number of partitions.
 *
 * @param numPartitions The number of partitions to spread keys over.
 */
private[realignment] case class ModPartitioner(numPartitions: Int) extends Partitioner {

  /**
   * @param key The key to place; must be an Int.
   * @return The partition index for the key.
   * @throws IllegalArgumentException if the key is not an Int.
   */
  def getPartition(key: Any): Int = key match {
    // the outer abs covers Int.MinValue, whose absolute value is still negative
    case intKey: Int => math.abs(math.abs(intKey) % numPartitions)
    case _ =>
      throw new IllegalArgumentException("Key %s is not an Int.".format(key))
  }
}
33 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/read/recalibration/Covariate.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.read.recalibration
19 | 
20 | import org.bdgenomics.formats.avro.Alignment
21 | 
/**
 * Base class for a covariate, i.e. a predictor (a "feature" or "independent
 * variable").
 *
 * @tparam T The type of the values this covariate takes.
 */
private[recalibration] abstract class Covariate[T] {

  /**
   * Evaluates this covariate over a read, yielding one value per residue.
   *
   * @param read The read to observe.
   * @return The covariate values, one for each base in the read.
   */
  def compute(read: Alignment): Array[T]

  /**
   * Renders a covariate value as a single cell of GATK-compatible CSV output.
   * The default rendering is the value's toString.
   *
   * @param cov The covariate value to render.
   * @return The value rendered as one CSV cell.
   */
  def toCSV(cov: T): String = cov.toString

  /**
   * Short name of this covariate, used in the CSV output header.
   */
  val csvFieldName: String
}
54 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/read/recalibration/ObservationTable.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.read.recalibration
19 | 
20 | import org.bdgenomics.adam.models.ReadGroupDictionary
21 | 
/**
 * Table holding the empirical mismatch frequency observed for each set of
 * covariate values.
 *
 * @param entries The error covariate &rarr; observed error frequency mapping.
 */
private[adam] class ObservationTable(
    val entries: scala.collection.Map[CovariateKey, Observation]) extends Serializable {

  /**
   * Renders one line per entry, with key and observation separated by a tab.
   */
  override def toString = {
    val lines = entries.map { case (key, obs) => "%s\t%s".format(key, obs) }
    lines.mkString("\n")
  }

  /**
   * Renders the table as CSV: a header line followed by one line per entry.
   *
   * @param readGroups The read groups that generated the reads in this table.
   * @return This table as CSV.
   */
  def toCSV(readGroups: ReadGroupDictionary): String = {
    val rows = entries.map {
      case (key, obs) =>
        val cells = CovariateSpace.toCSV(key, readGroups) ++ obs.toCSV
        // entries whose key contains a None get a trailing "**" marker cell
        if (key.containsNone) cells ++ Seq("**") else cells
    }
    val table = Seq(csvHeader) ++ rows
    table.map(_.mkString(",")).mkString("\n")
  }

  /**
   * The covariate column names followed by the observation column names.
   */
  private def csvHeader: Seq[String] = {
    val observationColumns = Seq("TotalCount", "MismatchCount", "EmpiricalQ", "IsSkipped")
    CovariateSpace.csvHeader ++ observationColumns
  }
}
52 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/ds/variant/ADAMVCFOutputFormat.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.variant
19 | 
20 | import htsjdk.variant.vcf.{ VCFHeaderLine, VCFHeader }
21 | import org.apache.hadoop.fs.{ FileSystem, Path }
22 | import org.apache.hadoop.mapreduce.{ RecordWriter, TaskAttemptContext }
23 | import org.bdgenomics.adam.models.SequenceDictionary
24 | import org.seqdoop.hadoop_bam.{
25 |   KeyIgnoringVCFOutputFormat,
26 |   KeyIgnoringVCFRecordWriter,
27 |   VariantContextWritable,
28 |   VCFFormat
29 | }
30 | 
/**
 * Wrapper for Hadoop-BAM to work around requirement for no-args constructor.
 *
 * The VCF header is not passed to the constructor; it is read from the file
 * named by the "org.bdgenomics.adam.rdd.variant.vcf_header_path" configuration
 * key when a record writer is requested.
 *
 * @tparam K The key type. Keys are not written.
 */
class ADAMVCFOutputFormat[K] extends KeyIgnoringVCFOutputFormat[K](VCFFormat.VCF) with Serializable {

  /**
   * Loads the VCF header from the configured header path, then delegates to
   * the superclass to build the record writer.
   *
   * @param context The task attempt context that supplies the configuration.
   * @return The record writer built by the superclass.
   * @throws IllegalArgumentException if the header path is not set in the
   *   configuration.
   */
  override def getRecordWriter(context: TaskAttemptContext): RecordWriter[K, VariantContextWritable] = {
    val conf = context.getConfiguration()

    // where is our header file?
    val headerPathKey = "org.bdgenomics.adam.rdd.variant.vcf_header_path"
    val headerPath = conf.get(headerPathKey)
    require(headerPath != null,
      "VCF header path is not set; expected configuration key %s.".format(headerPathKey))
    val path = new Path(headerPath)

    // read the header file, resolving the file system from the path itself so
    // that a header stored outside the default file system can still be read
    readHeaderFrom(path, path.getFileSystem(conf))

    // return record writer
    super.getRecordWriter(context)
  }
}
51 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/models/VCFHeaderWritable.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.models
19 | 
20 | import htsjdk.variant.vcf.VCFHeader
21 | 
/**
 * Case class wrapper around a VCF header.
 *
 * @param header The VCF header being wrapped.
 */
private[adam] case class VCFHeaderWritable(header: VCFHeader)
29 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/sql/VariantContext.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.sql
19 | 
20 | import org.bdgenomics.adam.models.{
21 |   ReferencePosition,
22 |   VariantContext => VariantContextModel
23 | }
24 | import org.bdgenomics.adam.rich.RichVariant
25 | 
/**
 * Companion of the sql VariantContext case class.
 */
object VariantContext {

  /**
   * Converts a model VariantContext into its sql counterpart.
   *
   * @param vc The model variant context to convert.
   * @return A sql VariantContext carrying the reference name, start, end,
   *   variant and genotypes of the model.
   */
  def fromModel(vc: VariantContextModel): VariantContext = {
    val position = vc.position
    val avroVariant = vc.variant.variant

    new VariantContext(
      position.referenceName,
      position.start,
      avroVariant.getEnd,
      Variant.fromAvro(avroVariant),
      vc.genotypes.map(Genotype.fromAvro(_)).toSeq)
  }
}
36 | 
/**
 * sql representation of a variant context.
 *
 * @param referenceName The name of the reference the variant is on.
 * @param start The start position of the variant.
 * @param end The end position of the variant.
 * @param variant The variant.
 * @param genotypes The genotypes attached to the variant.
 */
case class VariantContext(referenceName: String,
                          start: Long,
                          end: Long,
                          variant: Variant,
                          genotypes: Seq[Genotype]) {

  /**
   * Converts this record back into a model VariantContext. The model is built
   * from the reference name, start, variant and genotypes; the end field is
   * not passed along.
   *
   * @return The equivalent model VariantContext.
   */
  def toModel(): VariantContextModel = {
    val position = new ReferencePosition(referenceName, start)
    val richVariant = RichVariant(variant.toAvro)
    val avroGenotypes = genotypes.map(g => g.toAvro)

    new VariantContextModel(position, richVariant, avroGenotypes)
  }
}
49 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/util/GenomeFileReader.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.util
19 | 
20 | import org.apache.spark.SparkContext
21 | import org.bdgenomics.adam.models.{ SequenceDictionary, SequenceRecord }
22 | 
/**
 * Object for reading Bedtools genome files from disk. Also supports
 * UCSC Genome Browser chromInfo files.
 */
object GenomeFileReader {

  /**
   * Populates a SequenceDictionary from a .genome file on disk.
   *
   * Each non-blank line is split on tabs. The first field is used as the
   * sequence name and the second as its length; when a third field is
   * present it is stored as the record's URL. Blank lines are skipped.
   *
   * @param filePath The path to read the dictionary from.
   * @param sc The SparkContext to use for configuration.
   * @return Returns a populated sequence dictionary.
   */
  def apply(filePath: String,
            sc: SparkContext): SequenceDictionary = {

    val records = sc
      .textFile(filePath)
      // tolerate blank lines instead of failing on a missing second field
      .filter(line => line.trim.nonEmpty)
      .map(line => {
        val tokens = line.split("\t")
        // fail with the offending line rather than a bare index error
        require(tokens.length >= 2,
          "Expected at least two tab-separated fields (name, length) but found: %s".format(line))

        if (tokens.length > 2) {
          SequenceRecord(
            tokens(0),
            tokens(1).toLong,
            url = Some(tokens(2)),
            md5 = None,
            refseq = None,
            genbank = None,
            assembly = None,
            species = None,
            index = None
          )
        } else {
          SequenceRecord(tokens(0), tokens(1).toLong)
        }
      })
      .collect

    new SequenceDictionary(records.toVector)
  }
}
62 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/util/ManualRegionPartitioner.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.util
19 | 
20 | import org.apache.spark.Partitioner
21 | 
/**
 * Partitioner for keys that already carry their destination partition.
 *
 * @param partitions The number of partitions.
 */
private[adam] case class ManualRegionPartitioner[V](partitions: Int) extends Partitioner {

  override def numPartitions: Int = partitions

  /**
   * @param key Either a pair whose second element is an Int, or a bare Int.
   * @return The Int carried by the key, returned unchanged.
   * @throws Exception if the key has neither of the two supported shapes.
   */
  def getPartition(key: Any): Int = key match {
    case (_, destination: Int) => destination
    case destination: Int      => destination
    case _ =>
      throw new Exception("Unable to partition key %s without destination assignment.".format(key))
  }
}
36 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/util/ParquetLogger.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.util
19 | 
20 | import java.util.logging.{ Level, Logger }
21 | 
/**
 * Helper object for setting the logging level for Parquet.
 */
object ParquetLogger {

  // java.util.logging keeps only weak references to the loggers it hands out,
  // so a logger that nothing else references may be garbage collected and
  // recreated later without the configured level. Holding it in a field keeps
  // the level that was set.
  private val parquetHadoopLogger = Logger.getLogger("org.apache.parquet.hadoop")

  /**
   * Sets the level of the java.util.logging logger named
   * "org.apache.parquet.hadoop".
   */
  val hadoopLoggerLevel: Level => Unit = (level: Level) => {
    parquetHadoopLogger.setLevel(level)
  }
}
35 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceFile.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.util
19 | 
20 | import org.bdgenomics.adam.models.{ ReferenceRegion, SequenceDictionary }
21 | 
/**
 * A file holding a reference assembly. The trait is Serializable so that
 * implementations can be broadcast.
 */
trait ReferenceFile extends Serializable {

  /**
   * Looks up the reference sequence covered by a region.
   *
   * @param region The desired ReferenceRegion to extract.
   * @return The reference sequence at the desired locus.
   */
  def extract(region: ReferenceRegion): String

  /**
   * @return The SequenceDictionary for this ReferenceFile.
   */
  def references: SequenceDictionary
}
39 | 


--------------------------------------------------------------------------------
/adam-core/src/main/scala/org/bdgenomics/adam/util/TextAlignment.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.util
19 | 
/**
 * Enumeration of text alignments: Left, Right and Center, declared in that
 * order.
 */
private[util] object TextAlignment extends Enumeration {
  type TextAlignment = Value

  val Left = Value
  val Right = Value
  val Center = Value
}
24 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_all.fixed-phase-set.excerpt.vcf.README:
--------------------------------------------------------------------------------
 1 | Excerpt from file
 2 | ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv3.3.2/GRCh38/supplementaryFiles/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_all.vcf.gz
 3 | 
 4 | Genome in a Bottle (GIAB) sample HG001 (aka NA12878)
 5 | 
 6 | http://www.nature.com/nbt/journal/v32/n3/full/nbt.2835.html (doi:10.1038/nbt.2835)
 7 | http://www.nature.com/articles/sdata201625 (doi:10.1038/sdata.2016.25)
 8 | 
 9 | ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv3.3.2/README_NISTv3.3.2.txt
10 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/HLA_DQB1_05_01_01_02.dict:
--------------------------------------------------------------------------------
1 | @HD	VN:1.5
2 | @SQ	SN:HLA-DQB1*05:01:01:02	LN:7090	M5:0f304adf7acf3bd4b7c54c1394c85a4b	UR:file:/Users/akmorrow/ADAM/adam/adam-core/src/test/resources/HLA_DQB1_05_01_01_02.fa
3 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/HLA_DQB1_05_01_01_02.fa.fai:
--------------------------------------------------------------------------------
1 | HLA-DQB1*05:01:01:02	7090	39	72	73
2 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/artificial.README.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | Description
 3 | 
 4 | artificial.fa and artificial.sam are hand crafted. The idea is that
 5 | there are two candidate indel consensus locations (deletions) but
 6 | the reads support one strictly more than the other one.
 7 | 
 8 | Relevant commands:
 9 | 
10 | After changing the sam do (to fix the MD tags):
11 | 
12 | samtools view -bS artificial.sam | samtools calmd - /home/andre/biotools/artificial.fa | samtools view -bS - > artificial.bam
13 | 
14 | Observe pileup via:
15 | 
16 | samtools mpileup -BIf artificial.fa artificial.bam | less
17 | 
18 | For comparison with GATK:
19 | 
20 | a) (only if new reads were added use Picard to add missing readgroup data):
21 | java -jar AddOrReplaceReadGroups.jar I= artificial.bam O= artificial.fixed.bam SORT_ORDER=coordinate RGID="read_group_id" RGLB="library" RGPL="illumina" RGPU="platform_unit" RGSM="sequencing_center" CREATE_INDEX=True;
22 | 
23 | b) java -jar GenomeAnalysisTK.jar -T RealignerTargetCreator -R artificial.fa -I artificial.fixed.bam -o target.intervals 
24 | 
25 | c) java -jar /home/andre/biotools/gatk/GenomeAnalysisTK.jar -T IndelRealigner -R /home/andre/biotools/artificial.fa -I artificial.fixed.bam -o artificial.realigned.bam -targetIntervals target.intervals
26 | 
27 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/artificial.cram:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/artificial.cram


--------------------------------------------------------------------------------
/adam-core/src/test/resources/artificial.fa:
--------------------------------------------------------------------------------
 1 | >artificial fasta
 2 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGGGGGGGGAAAAAAAAAAGGGGGGGGGGAAAAAA
 3 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
 4 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
 5 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
 6 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
 7 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
 8 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
 9 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
10 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
11 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
12 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
13 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
14 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
15 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
16 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
17 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
18 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/artificial.fa.fai:
--------------------------------------------------------------------------------
1 | artificial	1120	18	70	71
2 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/artificial.sam:
--------------------------------------------------------------------------------
 1 | @HD	VN:1.3	SO:coordinate
 2 | @SQ	SN:artificial	LN:1120
 3 | read1	67	artificial	6	90	29M10D31M	=	1	0	AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	AS:i:70	XS:i:70	NM:i:20	MD:Z:29^GGGGGGGGGG10G0G0G0G0G0G0G0G0G0G11
 4 | read2	67	artificial	11	90	44M10D16M	=	1	0	AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGGGGGGGGAAAAAA	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	AS:i:70	XS:i:70	NM:i:30	MD:Z:24G0G0G0G0G0G0G0G0G0G10^GGGGGGGGGG0A0A0A0A0A0A0A0A0A0A6
 5 | read3	67	artificial	16	90	19M10D41M	=	1	0	AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	AS:i:70	XS:i:70	NM:i:20	MD:Z:19^GGGGGGGGGG10G0G0G0G0G0G0G0G0G0G21
 6 | read4	67	artificial	21	90	34M10D26M	=	1	0	AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGGGGGGGGAAAAAAAAAAAAAAAA	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	AS:i:70	XS:i:70	NM:i:30	MD:Z:14G0G0G0G0G0G0G0G0G0G10^GGGGGGGGGG0A0A0A0A0A0A0A0A0A0A16
 7 | read5	67	artificial	26	90	9M10D51M	=	1	0	AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	AS:i:70	XS:i:70	NM:i:20	MD:Z:9^GGGGGGGGGG10G0G0G0G0G0G0G0G0G0G31
 8 | read1	131	artificial	106	90	60M	=	1	0	AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	NM:i:0	AS:i:70	XS:i:70	MD:Z:60
 9 | read2	131	artificial	111	90	60M	=	1	0	AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	NM:i:0	AS:i:70	XS:i:70	MD:Z:60
10 | read3	131	artificial	116	90	60M	=	1	0	AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	NM:i:0	AS:i:70	XS:i:70	MD:Z:60
11 | read4	131	artificial	121	90	60M	=	1	0	AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	NM:i:0	AS:i:70	XS:i:70	MD:Z:60
12 | read5	131	artificial	126	90	60M	=	1	0	AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	NM:i:0	AS:i:70	XS:i:70	MD:Z:60
13 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/badheader.sam:
--------------------------------------------------------------------------------
1 | @SQ SNN:1	LN:249250621
2 | @SQ	SN:2	LN:243199373
3 | simread:1:26472783:false	16	1	26472784	60	75M	*	0	0	GTATAAGAGCAGCCTTATTCCTATTTATAATCAGGGTGAAACACCTGTGCCAATGCCAAGACAGGGGTGCCAAGA	*	NM:i:0	AS:i:75	XS:i:0
4 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/bams/small.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/bams/small.bam


--------------------------------------------------------------------------------
/adam-core/src/test/resources/bqsr1.vcf.tbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/bqsr1.vcf.tbi


--------------------------------------------------------------------------------
/adam-core/src/test/resources/chr20.250k.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/chr20.250k.fa.gz


--------------------------------------------------------------------------------
/adam-core/src/test/resources/ctg123.fasta.gff3:
--------------------------------------------------------------------------------
 1 | ##gff-version 3.2.1
 2 | ##sequence-region ctg123 1 1497228
 3 | ctg123	.	gene	1000	9000	.	+	.	ID=gene00001;Name=EDEN
 4 | ctg123	.	TF_binding_site	1000	1012	.	+	.	ID=tfbs00001;Parent=gene00001
 5 | ctg123	.	mRNA	1050	9000	.	+	.	ID=mRNA00001;Parent=gene00001;Name=EDEN.1
 6 | ctg123	.	five_prime_UTR	1050	1200	.	+	.	Parent=mRNA00001
 7 | ctg123	.	CDS	1201	1500	.	+	0	ID=cds00001;Parent=mRNA00001
 8 | ctg123	.	CDS	3000	3902	.	+	0	ID=cds00001;Parent=mRNA00001
 9 | ctg123	.	CDS	5000	5500	.	+	0	ID=cds00001;Parent=mRNA00001
10 | ctg123	.	CDS	7000	7600	.	+	0	ID=cds00001;Parent=mRNA00001
11 | ctg123	.	three_prime_UTR	7601	9000	.	+	.	Parent=mRNA00001
12 | ctg123	.	cDNA_match	1050	1500	5.8e-42	+	.	ID=match00001;Target=cdna0123+12+462
13 | ctg123	.	cDNA_match	5000	5500	8.1e-43	+	.	ID=match00001;Target=cdna0123+463+963
14 | ctg123	.	cDNA_match	7000	9000	1.4e-40	+	.	ID=match00001;Target=cdna0123+964+2964
15 | ##FASTA
16 | >ctg123
17 | cttctgggcgtacccgattctcggagaacttgccgcaccattccgccttg
18 | tgttcattgctgcctgcatgttcattgtctacctcggctacgtgtggcta
19 | tctttcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaa
20 | aagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttat
21 | aatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaat
22 | cttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattc
23 | gtatttgatttgggtttactatcgaataatgagaattttcaggcttaggc
24 | ttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggctt
25 | aggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttag
26 | aatctagctagctatccgaaattcgaggcctgaaaagtgtgacgccattc
27 | >cnda0123
28 | ttcaagtgctcagtcaatgtgattcacagtatgtcaccaaatattttggc
29 | agctttctcaagggatcaaaattatggatcattatggaatacctcggtgg
30 | aggctcagcgctcgatttaactaaaagtggaaagctggacgaaagtcata
31 | tcgctgtgattcttcgcgaaattttgaaaggtctcgagtatctgcatagt
32 | gaaagaaaaatccacagagatattaaaggagccaacgttttgttggaccg
33 | tcaaacagcggctgtaaaaatttgtgattatggttaaagg
34 | >aa0123
35 | mapgsvtsdispsststagssrspesekpgpshggvppggpshsslpvgr
36 | rhppvlrmvlealqageqrrgtsvaaiklyilhkyptvdvlrfkyllkqa
37 | latgmrrgllarplnskarg
38 | >prot0123
39 | MAPGSVTSDISPSSTSTAGSSRSPESEKPGPSHGGVPPGGPSHSSLPVGR
40 | RHPPVLRMVLEALQAGEQRRGTSVAAIKLYILHKYPTVDVLRFKYLLKQA
41 | LATGMRRGLLARPLNSKARGATGSFKLVPKHKKKIQPRKMAPATAPRRAG
42 | EAKGKGPKKPSEAKEDPPNVGKVKKAAKRPAKVQKPPPKPGAATEKARKQ
43 | GGAAKDTRAQSGEARKVPPKPDKAMRAPSSAGGLSRKAKAKGSRSSQGDA
44 | EAYRKTKAESKSSKPTASKVKNGAASPTKKKVVAKAKAPKAGQGPNTKAA
45 | APAKGSGSKVVPAHLSRKTEAPKGPRKAGLPIKASSSKVSSQRAEA
46 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/dict_with_accession.dict:
--------------------------------------------------------------------------------
1 | @HD	VN:1.4	SO:unsorted
2 | @SQ	SN:1	LN:249250621	UR:file:/gs01/projects/ngs/resources/gatk/2.3/human_g1k_v37.fasta	M5:1b22b98cdeb4a9304cb5d48026a85128	REFSEQ:NC_000001.10	GENBANK:CM000663.1
3 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/env_test_command.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Reads two environment variables: OUTPUT_PATH and INPUT_PATH.
4 | # Expansions are quoted so that paths containing whitespace or glob
5 | # characters are passed to tee/cat as a single argument.
6 | 
7 | # pipe input (stdin) to a file; tee's own stdout is discarded
8 | tee "${OUTPUT_PATH}" > /dev/null
9 | 
10 | # print out another file, also copying it to ${OUTPUT_PATH}_2
11 | cat "${INPUT_PATH}" | tee "${OUTPUT_PATH}_2"


--------------------------------------------------------------------------------
/adam-core/src/test/resources/example_intervals.list:
--------------------------------------------------------------------------------
 1 | @HD	VN:1.0	SO:coordinate
 2 | @SQ	SN:1	LN:249250621	AS:GRCh37	UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta	M5:1b22b98cdeb4a9304cb5d48026a85128	SP:Homo Sapiens
 3 | @SQ	SN:2	LN:243199373	AS:GRCh37	UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta	M5:a0d9851da00400dec1098a9255ac712e	SP:Homo Sapiens
 4 | 1	30366	30503	+	target_1
 5 | 1	69089	70010	+	target_2
 6 | 1	367657	368599	+	target_3
 7 | 1	621094	622036	+	target_4
 8 | 1	861320	861395	+	target_5
 9 | 1	865533	865718	+	target_6
10 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/fastq_nobases.fq:
--------------------------------------------------------------------------------
1 | @nobases/1
2 | 
3 | +
4 | 
5 | @nobases/2
6 | 
7 | +
8 | 
9 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/fastq_noqual.fq:
--------------------------------------------------------------------------------
1 | @noqual/1
2 | GATTACA
3 | +
4 | *
5 | @noqual/2
6 | ACATTAG
7 | +
8 | *
9 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/fastq_sample1.fq.bgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/fastq_sample1.fq.bgz


--------------------------------------------------------------------------------
/adam-core/src/test/resources/fastq_sample1.fq.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/fastq_sample1.fq.bz2


--------------------------------------------------------------------------------
/adam-core/src/test/resources/fastq_sample1.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/fastq_sample1.fq.gz


--------------------------------------------------------------------------------
/adam-core/src/test/resources/fastq_to_usam.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from __future__ import print_function
 4 | import sys
 5 | 
 6 | # read lines from stdin
 7 | lines = sys.stdin.readlines()
 8 | 
 9 | # must have multiple of 8
10 | assert len(lines) % 8 == 0, "Expected multiple of 8 lines (got %d -> %s)" % (len(lines), lines)
11 | fastq_records = len(lines) // 4
12 | 
13 | # print sam header
14 | print("@HD\tVN:1.5\tSO:unsorted")
15 | 
16 | # loop and print sam lines
17 | for i in range(fastq_records):
18 | 
19 |     # fastq is:
20 |     #
21 |     # @readname
22 |     # sequence
23 |     # +<optional readname>
24 |     # qualities
25 |     rn1 = lines[4 * i].strip()
26 |     assert rn1[0] == '@'
27 |     readName = rn1[1:-2]
28 |     readNum = rn1[-1:]
29 |     sequence = lines[4 * i + 1].strip()
30 |     rn2 = lines[4 * i + 2]
31 |     assert rn2[0] == '+'
32 |     assert len(rn2.strip()) == 1 or rn2[1:] == readName
33 |     qualities = lines[4 * i + 3].strip()
34 | 
35 |     # flags:
36 |     # 1 = paired (we assume that in this script)
37 |     # 4 = unmapped
38 |     # 8 = mate unmapped
39 |     # 64 = first of pair
40 |     # 128 = second of pair
41 |     flags = 8 | 4 | 1
42 |     if readNum == '1':
43 |         flags |= 64
44 |     elif readNum == '2':
45 |         flags |= 128
46 |     else:
47 |         assert 1 <= readNum <= 2, "Read num must be 1 or 2: %s from %s" % (readNum, rn1)
48 | 
49 |     # sam is the following tab-delimited columns:
50 |     #
51 |     # 1. read name
52 |     # 2. flags
53 |     # 3. ref (* = unaligned)
54 |     # 4. pos (0 = unaligned)
55 |     # 5. map qual (0 if unmapped)
56 |     # 6. cigar (* = unavailable)
57 |     # 7. mate ref (* = unaligned)
58 |     # 8. mate pos (0 = unaligned)
59 |     # 9. tlen (0 = unknown)
60 |     # 10. sequence
61 |     # 11. qualities
62 |     print("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (readName,
63 |                                                      flags,
64 |                                                      sequence,
65 |                                                      qualities))
66 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/gencode.chr20.transcript_names.head10.txt:
--------------------------------------------------------------------------------
 1 | ENST00000608838.1
 2 | ENST00000382410.2
 3 | ENST00000382398.3
 4 | ENST00000542572.1
 5 | ENST00000382388.3
 6 | ENST00000334391.4
 7 | ENST00000544961.1
 8 | ENST00000246105.4
 9 | ENST00000382376.3
10 | ENST00000608495.1
11 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/gencode.v19.pc_transcripts.250k.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/gencode.v19.pc_transcripts.250k.fa.gz


--------------------------------------------------------------------------------
/adam-core/src/test/resources/gencode.v7.annotation.trunc10.bed:
--------------------------------------------------------------------------------
 1 | chr1	11869	14409	gene	.	+	"pseudogene"
 2 | chr1	11869	14409	transcript	.	+	"processed_transcript"
 3 | chr1	11869	12227	exon	.	+	"processed_transcript"
 4 | chr1	12613	12721	exon	.	+	"processed_transcript"
 5 | chr1	13221	14409	exon	.	+	"processed_transcript"
 6 | chr1	12010	13670	transcript	.	+	"transcribed_unprocessed_pseudogene"
 7 | chr1	12010	12057	exon	.	+	"transcribed_unprocessed_pseudogene"
 8 | chr1	12179	12227	exon	.	+	"transcribed_unprocessed_pseudogene"
 9 | chr1	12613	12697	exon	.	+	"transcribed_unprocessed_pseudogene"
10 | chr1	12975	13052	exon	.	+	"transcribed_unprocessed_pseudogene"
11 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/hg19.chrM.2bit:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/hg19.chrM.2bit


--------------------------------------------------------------------------------
/adam-core/src/test/resources/hg19.genome:
--------------------------------------------------------------------------------
 1 | chr1	249250621
 2 | chr2	243199373
 3 | chr3	198022430
 4 | chr4	191154276
 5 | chr5	180915260
 6 | chr6	171115067
 7 | chr7	159138663
 8 | chrX	155270560
 9 | chr8	146364022
10 | chr9	141213431
11 | chr10	135534747
12 | chr11	135006516
13 | chr12	133851895
14 | chr13	115169878
15 | chr14	107349540
16 | chr15	102531392
17 | chr16	90354753
18 | chr17	81195210
19 | chr18	78077248
20 | chr20	63025520
21 | chrY	59373566
22 | chr19	59128983
23 | chr22	51304566
24 | chr21	48129895
25 | chr6_ssto_hap7	4928567
26 | chr6_mcf_hap5	4833398
27 | chr6_cox_hap2	4795371
28 | chr6_mann_hap4	4683263
29 | chr6_apd_hap1	4622290
30 | chr6_qbl_hap6	4611984
31 | chr6_dbb_hap3	4610396
32 | chr17_ctg5_hap1	1680828
33 | chr4_ctg9_hap1	590426
34 | chr1_gl000192_random	547496
35 | chrUn_gl000225	211173
36 | chr4_gl000194_random	191469
37 | chr4_gl000193_random	189789
38 | chr9_gl000200_random	187035
39 | chrUn_gl000222	186861
40 | chrUn_gl000212	186858
41 | chr7_gl000195_random	182896
42 | chrUn_gl000223	180455
43 | chrUn_gl000224	179693
44 | chrUn_gl000219	179198
45 | chr17_gl000205_random	174588
46 | chrUn_gl000215	172545
47 | chrUn_gl000216	172294
48 | chrUn_gl000217	172149
49 | chr9_gl000199_random	169874
50 | chrUn_gl000211	166566
51 | chrUn_gl000213	164239
52 | chrUn_gl000220	161802
53 | chrUn_gl000218	161147
54 | chr19_gl000209_random	159169
55 | chrUn_gl000221	155397
56 | chrUn_gl000214	137718
57 | chrUn_gl000228	129120
58 | chrUn_gl000227	128374
59 | chr1_gl000191_random	106433
60 | chr19_gl000208_random	92689
61 | chr9_gl000198_random	90085
62 | chr17_gl000204_random	81310
63 | chrUn_gl000233	45941
64 | chrUn_gl000237	45867
65 | chrUn_gl000230	43691
66 | chrUn_gl000242	43523
67 | chrUn_gl000243	43341
68 | chrUn_gl000241	42152
69 | chrUn_gl000236	41934
70 | chrUn_gl000240	41933
71 | chr17_gl000206_random	41001
72 | chrUn_gl000232	40652
73 | chrUn_gl000234	40531
74 | chr11_gl000202_random	40103
75 | chrUn_gl000238	39939
76 | chrUn_gl000244	39929
77 | chrUn_gl000248	39786
78 | chr8_gl000196_random	38914
79 | chrUn_gl000249	38502
80 | chrUn_gl000246	38154
81 | chr17_gl000203_random	37498
82 | chr8_gl000197_random	37175
83 | chrUn_gl000245	36651
84 | chrUn_gl000247	36422
85 | chr9_gl000201_random	36148
86 | chrUn_gl000235	34474
87 | chrUn_gl000239	33824
88 | chr21_gl000210_random	27682
89 | chrUn_gl000231	27386
90 | chrUn_gl000229	19913
91 | chrM	16571
92 | chrUn_gl000226	15008
93 | chr18_gl000207_random	4262
94 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/hs38DH_chr1_10.fa:
--------------------------------------------------------------------------------
 1 | >chr1  AC:CM000663.2  gi:568336023  LN:248956422  rl:Chromosome  M5:6aef897c3d6ff0c78aff06ac189178dd  AS:GRCh38
 2 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
 3 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
 4 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
 5 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
 6 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
 7 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
 8 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
 9 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
10 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
11 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/human_g1k_v37_chr1_59kb.2bit:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/human_g1k_v37_chr1_59kb.2bit


--------------------------------------------------------------------------------
/adam-core/src/test/resources/improper_pairs_1.fq:
--------------------------------------------------------------------------------
 1 | @H06HDADXX130110:2:2116:3345:91806/1
 2 | GTTAGGGTTAGGGTTGGGTTAGGGTTAGGGTTAGGGTTAGGGGTAGGGTTAGGGTTAGGGGTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTAGGGCTAGGGTTAAGGGTAGGGTTAGCGAAAGGGCTGGGGTTAGGGGTGCGGGTACGCGTAGCATTAGGGCTAGAAGTAGGATCTGCAGTGCCTGACCGCGTCTGCGCGGCGACTGCCCAAAGCCTGGGGCCGACTCCAGGCTGAAGCTCAT
 3 | +
 4 | >=<=???>?>???=??>>8<?><=2=<===1194<?;:?>>?#3==>###########################################################################################################################################################################################################
 5 | @H06HDADXX130110:1:2103:11970:57672/1
 6 | GGATAGGGTTAGGGTTAGGGTTAGGGCTAGGGATAGGGGTAGGGTTGGGGTTGGTCATCGGGTGTTTCTTTGTGTTTGAGGTTGATTATTGTGATGGTTAAGGTATCTAGGTATTGTAAAAGTTGGCTTTTAACTTAGAAAATTATGTCATTCTGTTCACAAGTGTTTAGATTGGTAGATAGGTACTATGCGATCACTTCCATTGGCTGAGAGTTCGATTGATTATGAGCCACGCTAGTGGTTGAGATCT
 7 | +
 8 | 69+26933-:7;;135,53<>7<692(?2=9:**;<=#####################################################################################################################################################################################################################
 9 | @H06JUADXX130110:1:1108:6424:55322/1
10 | AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACTCTAACCCTAACCCTAACCCTAACGGTAACCCTTACCCTTACTGTAACGCTTATCCTAAATCAAATTCTTCCTCTTAAGATCGCTGTTAAAATTAATCCTATTAGAACAGGTCTTCTGGCACCAAGTTATGTCAATATCCCTTACTCTAAACATGCCTTGATCTCTCATGCATCACTTCAGCACAGCTCTTATGGATCTAGGATCCTCAGT
11 | +
12 | =>;=?=@@=?@?@@9>7@=?=;=?@>29?=?;=>@;4@*0878;40'=@;(3399@9>7@:A############################################################################################################################################################################################
13 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/improper_pairs_2.fq:
--------------------------------------------------------------------------------
 1 | @H06HDADXX130110:2:2116:3345:91806/2
 2 | TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTACCCCTAACCCTAACCCTAACCCTAACCCGTACCCTAAACCCAACCCTAACCACAAAGCAAATCCCAACCTTAACCGGAACCCGAAATCTCGCAGCAAATCTGCAGTAGAGACGCAGACTCAACCATGCGTCTATTAGTACGCATTATCATTGCCTCATGCTTCTTAAGTACAGAGAGATGAC
 3 | +
 4 | ==;<?>@@@<>>@??<>>???<=>>?>:><@?4=:>7=5=>:<=@;'@A?########################################################################################################################################################################################################
 5 | @H06HDADXX130110:1:2103:11970:57672/2
 6 | AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTATCGTCAAACCTTACCTCCTCCCTAGCCTCCACCCTGACCATGACACCAACCATCAGCCTTATAGAAAACCCCAGAGATGCTCTTATCCTATACCACAATTACCCCATAACGAAAGAAAGGACTGAAAACAAATAAGTAAAATTCGTACAAATTATATCTATGAGTATGTCCCTGAGTGTAGGTGTAGGTGCATCC
 7 | +
 8 | =>:=>@=?<>>??>;:<?<=;<<?>=;:8;=(5)0-6;1:>?<>##############################################################################################################################################################################################################
 9 | 
10 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/indexed_bams/sorted.2.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/indexed_bams/sorted.2.bai


--------------------------------------------------------------------------------
/adam-core/src/test/resources/indexed_bams/sorted.2.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/indexed_bams/sorted.2.bam


--------------------------------------------------------------------------------
/adam-core/src/test/resources/indexed_bams/sorted.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/indexed_bams/sorted.bam


--------------------------------------------------------------------------------
/adam-core/src/test/resources/indexed_bams/sorted.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/indexed_bams/sorted.bam.bai


--------------------------------------------------------------------------------
/adam-core/src/test/resources/interleaved_fastq_sample1.ifq.bgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/interleaved_fastq_sample1.ifq.bgz


--------------------------------------------------------------------------------
/adam-core/src/test/resources/interleaved_fastq_sample1.ifq.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/interleaved_fastq_sample1.ifq.bz2


--------------------------------------------------------------------------------
/adam-core/src/test/resources/interleaved_fastq_sample1.ifq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/interleaved_fastq_sample1.ifq.gz


--------------------------------------------------------------------------------
/adam-core/src/test/resources/interleaved_fastq_sample2.ifq.output:
--------------------------------------------------------------------------------
 1 | >>>interleaved fastq record start>>>
 2 | @H06HDADXX130110:1:2103:11970:57672_1
 3 | GGATAGGGTTAGGGTTAGGGTTAGGGCTAGGGATAGGGGTAGGGTTGGGGTTGGTCATCGGGTGTTTCTTTGTGTTTGAGGTTGATTATTGTGATGGTTAAGGTATCTAGGTATTGTAAAAGTTGGCTTTTAACTTAGAAAATTATGTCATTCTGTTCACAAGTGTTTAGATTGGTAGATAGGTACTATGCGATCACTTCCATTGGCTGAGAGTTCGATTGATTATGAGCCACGCTAGTGGTTGAGATCT
 4 | +
 5 | 69+26933-:7;;135,53<>7<692(?2=9:**;<=#####################################################################################################################################################################################################################
 6 | @H06HDADXX130110:1:2103:11970:57672_2
 7 | AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTATCGTCAAACCTTACCTCCTCCCTAGCCTCCACCCTGACCATGACACCAACCATCAGCCTTATAGAAAACCCCAGAGATGCTCTTATCCTATACCACAATTACCCCATAACGAAAGAAAGGACTGAAAACAAATAAGTAAAATTCGTACAAATTATATCTATGAGTATGTCCCTGAGTGTAGGTGTAGGTGCATCC
 8 | +
 9 | =>:=>@=?<>>??>;:<?<=;<<?>=;:8;=(5)0-6;1:>?<>##############################################################################################################################################################################################################
10 | <<<interleaved fastq record end<<<
11 | >>>interleaved fastq record start>>>
12 | @H06JUADXX130110:1:1108:6424:55322_1
13 | AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACTCTAACCCTAACCCTAACCCTAACGGTAACCCTTACCCTTACTGTAACGCTTATCCTAAATCAAATTCTTCCTCTTAAGATCGCTGTTAAAATTAATCCTATTAGAACAGGTCTTCTGGCACCAAGTTATGTCAATATCCCTTACTCTAAACATGCCTTGATCTCTCATGCATCACTTCAGCACAGCTCTTATGGATCTAGGATCCTCAGT
14 | +
15 | =>;=?=@@=?@?@@9>7@=?=;=?@>29?=?;=>@;4@*0878;40'=@;(3399@9>7@:A############################################################################################################################################################################################
16 | @H06JUADXX130110:1:1108:6424:55322_2
17 | AGGGATAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGATAGGGCTAGGGTTAGGGATAGGGATAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTATCGATAGGGATAGGGATAGGGATAGAGTTAGGGCTATGGGTAGGGTTAGAGTCAGGGAAAGAGATAGGGATGGAGATGGGGTTAAAAAGAAGTCAAGGAATTAAGGTAGGGAAACGGTTCGAGATCTGTAAAGGGCAACGA
18 | +
19 | >>;>*9?:@??@@????@????>@?>>@>@?>?????@@???????=<??8;*;:>?;+A?@?>89?@######################################################################################################################################################################################
20 | <<<interleaved fastq record end<<<
21 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/invalid/small.INFO_flag.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.1
2 | ##INFO=<ID=ABADFLAG,Number=.,Type=Flag,Description="A no good, very bad flag.">
3 | ##contig=<ID=1,length=249250621>
4 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA12878	NA12891	NA12892
5 | 1	14397	.	CTGT	C	139.12	IndelQD	AC=2;AF=0.333;AN=6;BaseQRankSum=1.800;ClippingRankSum=0.138;DP=69;FS=7.786;MLEAC=2;MLEAF=0.333;MQ=26.84;MQ0=0;MQRankSum=-1.906;QD=1.55;ReadPosRankSum=0.384	GT:AD:DP:FT:GQ:PL	0/1:16,4:20:rd:99:120,0,827	0/1:8,2:10:dp;rd:60:60,0,414	0/0:39,0:39:PASS:99:0,116,2114


--------------------------------------------------------------------------------
/adam-core/src/test/resources/legacy.fa:
--------------------------------------------------------------------------------
 1 | ;LCBO - Prolactin precursor - Bovine
 2 | ; a sample sequence in FASTA format
 3 | MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSS
 4 | EMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL
 5 | VTEVRGMKGAPDAILSRAIEIEEENKRLLEGMEMIFGQVIPGAKETEPYPVWSGLPSLQTKDED
 6 | ARYSAFYNLLHCLRRDSSKIDTYLKLLNCRIIYNNNC*
 7 | 
 8 | >MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken
 9 | ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID
10 | FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA
11 | DIDGDGQVNYEEFVQMMTAK*
12 | 
13 | >gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
14 | LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV
15 | EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG
16 | LLILILL---LALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL
17 | GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX
18 | IENY
19 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | # Root logger option
 2 | log4j.rootLogger=INFO, stderr, logfile
 3 | 
 4 | # Direct log messages to stderr
 5 | log4j.appender.stderr=org.apache.log4j.ConsoleAppender
 6 | log4j.appender.stderr.Target=System.err
 7 | log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
 8 | log4j.appender.stderr.threshold=WARN
 9 | log4j.appender.stderr.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
10 | 
11 | # Log at INFO level to file
12 | log4j.appender.logfile=org.apache.log4j.FileAppender
13 | log4j.appender.logfile.append=true
14 | log4j.appender.logfile.file=adam.log
15 | log4j.appender.logfile.threshold=INFO
16 | log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
17 | log4j.appender.logfile.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
18 | log4j.appender.logfile.encoding=UTF-8
19 | 
20 | # Tell Parquet to shut up
21 | log4j.logger.org.apache.parquet=ERROR
22 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/multi_chr.sam:
--------------------------------------------------------------------------------
1 | @SQ	SN:1	LN:249250621
2 | @SQ	SN:2	LN:243199373
3 | @PG	ID:p1	PN:myProg	CL:"myProg 123"	VN:1.0.0
4 | @PG	ID:p2	PN:myProg	CL:"myProg 456"	VN:1.0.0	PP:p1
5 | simread:1:26472783:false	16	1	26472784	60	75M	*	0	0	GTATAAGAGCAGCCTTATTCCTATTTATAATCAGGGTGAAACACCTGTGCCAATGCCAAGACAGGGGTGCCAAGA	*	NM:i:0	AS:i:75	XS:i:0
6 | simread:1:240997787:true	0	1	240997788	60	75M	*	0	0	CTTTATTTTTATTTTTAAGGTTTTTTTTGTTTGTTTGTTTTGAGATGGAGTCTCGCTCCACCGCCCAGACTGGAG	*	NM:i:0	AS:i:75	XS:i:39
7 | simread:1:189606653:true	0	2	189606654	60	75M	*	0	0	TGTATCTTCCTCCCCTGCTGTATGTTTCCTGCCCTCAAACATCACACTCCACGTTCTTCAGCTTTAGGACTTGGA	*	NM:i:0	AS:i:75	XS:i:0
8 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/proper_pairs_1.fq:
--------------------------------------------------------------------------------
 1 | @H06HDADXX130110:2:2116:3345:91806/1
 2 | GTTAGGGTTAGGGTTGGGTTAGGGTTAGGGTTAGGGTTAGGGGTAGGGTTAGGGTTAGGGGTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTAGGGCTAGGGTTAAGGGTAGGGTTAGCGAAAGGGCTGGGGTTAGGGGTGCGGGTACGCGTAGCATTAGGGCTAGAAGTAGGATCTGCAGTGCCTGACCGCGTCTGCGCGGCGACTGCCCAAAGCCTGGGGCCGACTCCAGGCTGAAGCTCAT
 3 | +
 4 | >=<=???>?>???=??>>8<?><=2=<===1194<?;:?>>?#3==>###########################################################################################################################################################################################################
 5 | @H06HDADXX130110:1:2103:11970:57672/1
 6 | GGATAGGGTTAGGGTTAGGGTTAGGGCTAGGGATAGGGGTAGGGTTGGGGTTGGTCATCGGGTGTTTCTTTGTGTTTGAGGTTGATTATTGTGATGGTTAAGGTATCTAGGTATTGTAAAAGTTGGCTTTTAACTTAGAAAATTATGTCATTCTGTTCACAAGTGTTTAGATTGGTAGATAGGTACTATGCGATCACTTCCATTGGCTGAGAGTTCGATTGATTATGAGCCACGCTAGTGGTTGAGATCT
 7 | +
 8 | 69+26933-:7;;135,53<>7<692(?2=9:**;<=#####################################################################################################################################################################################################################
 9 | @H06JUADXX130110:1:1108:6424:55322/1
10 | AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACTCTAACCCTAACCCTAACCCTAACGGTAACCCTTACCCTTACTGTAACGCTTATCCTAAATCAAATTCTTCCTCTTAAGATCGCTGTTAAAATTAATCCTATTAGAACAGGTCTTCTGGCACCAAGTTATGTCAATATCCCTTACTCTAAACATGCCTTGATCTCTCATGCATCACTTCAGCACAGCTCTTATGGATCTAGGATCCTCAGT
11 | +
12 | =>;=?=@@=?@?@@9>7@=?=;=?@>29?=?;=>@;4@*0878;40'=@;(3399@9>7@:A############################################################################################################################################################################################
13 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/proper_pairs_2.fq:
--------------------------------------------------------------------------------
 1 | @H06HDADXX130110:2:2116:3345:91806/2
 2 | TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTACCCCTAACCCTAACCCTAACCCTAACCCGTACCCTAAACCCAACCCTAACCACAAAGCAAATCCCAACCTTAACCGGAACCCGAAATCTCGCAGCAAATCTGCAGTAGAGACGCAGACTCAACCATGCGTCTATTAGTACGCATTATCATTGCCTCATGCTTCTTAAGTACAGAGAGATGAC
 3 | +
 4 | ==;<?>@@@<>>@??<>>???<=>>?>:><@?4=:>7=5=>:<=@;'@A?########################################################################################################################################################################################################
 5 | @H06HDADXX130110:1:2103:11970:57672/2
 6 | AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTATCGTCAAACCTTACCTCCTCCCTAGCCTCCACCCTGACCATGACACCAACCATCAGCCTTATAGAAAACCCCAGAGATGCTCTTATCCTATACCACAATTACCCCATAACGAAAGAAAGGACTGAAAACAAATAAGTAAAATTCGTACAAATTATATCTATGAGTATGTCCCTGAGTGTAGGTGTAGGTGCATCC
 7 | +
 8 | =>:=>@=?<>>??>;:<?<=;<<?>=;:8;=(5)0-6;1:>?<>##############################################################################################################################################################################################################
 9 | @H06JUADXX130110:1:1108:6424:55322/2
10 | AGGGATAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGATAGGGCTAGGGTTAGGGATAGGGATAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTATCGATAGGGATAGGGATAGGGATAGAGTTAGGGCTATGGGTAGGGTTAGAGTCAGGGAAAGAGATAGGGATGGAGATGGGGTTAAAAAGAAGTCAAGGAATTAAGGTAGGGAAACGGTTCGAGATCTGTAAAGGGCAACGA
11 | +
12 | >>;>*9?:@??@@????@????>@?>>@>@?>?????@@???????=<??8;*;:>?;+A?@?>89?@######################################################################################################################################################################################
13 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/read_names_with_index_sequences_interleaved.fq:
--------------------------------------------------------------------------------
 1 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1116:2123 1:N:0:ATCANG
 2 | TCTGTGTAAATTACCCAGCCTCACGTATTCCTTTAGAGCAATGCAAAACAGACTAGACAAAAGGCTTTTAAAAGTCTAATCTGAGATTCCTGACCAAATGT
 3 | +
 4 | CCCFFFFFHHHHHJJJJJJJJJJJJHIJJJJJJJJJIJJJJJJJJJJJJJJJJJJJHIJGHJIJJIJJJJJHHHHHHHFFFFFFFEDDEEEEDDDDDDDDD
 5 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1116:2123 2:N:0:ATCANG
 6 | NTAATAATGAGTGCACAATAGTTTTTCTCCTGAAACATAATTATTCTCTCAATCATCCCCATCCCCACCAAAGTCAATCACGGGAAGATCAATCAGCCTGC
 7 | +
 8 | #1=DFFFFHHHFHIJJIIIJJJIJJJIJJJJJJJIJJJJJJIJJJJJJJJJJJJJIJJJJJJJJJJIJIJIJJHHHHHHHFFFDDDDDDDDDDDDDDDDDD
 9 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1129:2182 1:N:0:ATCANG
10 | AAGCTGACTTGTGTTGGGAGCTCATCTGTTCCCTTGACTTCTCTTTTTCCAGTTCTTCGTCAAGGCCACAGGTGCTGCGGGAAAATCAGTAACTAATGAAC
11 | +
12 | @C@FFFDFFFDFBHGDHH;EHHGHE?EBHCHIGGI>BFFECGGIIIIGIHEBBGCHHIG);;FEEADGH<ACAEDDBCCCBBCCCBCCC>@CCDCCC@CCC
13 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1129:2182 2:N:0:ATCANG
14 | TCCTCCCACTTCTGTCTCCCTCAGCAGCCTCTCATATTGCTGCTGTCTGCCTGGCCTATAGGCTTCTGAGTTATGACACTGGTGTGAAGAGAAAAGGCTTN
15 | +
16 | 1?@DABDDDDF+AE?EBFHIIII>G>?;8?3?EEF<FF9@<DFGDD9?D93?BD889=)=<C3=C)=7DAD77=?7?A:EB2?96;@DB@?=;>B;5<5(:
17 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1158:2217 1:N:0:ATCANG
18 | AGCTGACATGAGAAAAGCCTGGTAAATCCGGGGCAAGTGACTGAAATGAAAGAATCCAATCAGATTCCAGCTCCAAGGGCCGCTAATTGTAGTAACTGGCT
19 | +
20 | CCCFDFFFHFFFFGIIGGHJEB<CEFIJJIIIHGECG9BFDHGGIGHJFHIGEHHIIAHHGGEEFDDDFDBCEDECCBB@5=BDDDDEC:>@DDEDE@>??
21 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1158:2217 2:N:0:ATCANG
22 | ATATTAAGCCACTTGCAGCAAGACAGCCTGAAACTTCGTGACTCCCTGGAGCTTTTGGTGGTGGACGAAGCTGACCTTCTTTTTTCCTTTGGCTTTGANNN
23 | +
24 | C@CFDFFFHGHGHIJJJIGHHHIJJJIIEHIJJJJJIHDFGBFGIIJGGHIGIJJGEHCFH@EHEEEDFCDCDDDDDDDDDDDDDDDDDDCDDCDDCDDCC
25 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1499:2087 1:N:0:ATCANG
26 | TCATTCCACATCTCAATCTCTCCTAGGAAGTTTTCCGGCCTTGTTGACAGGTTTAATTGAAAGGAGAAGCCAAATGTTGAGTAAACAGATTGCAAAAACTG
27 | +
28 | CCCFFFFFHHFHHJJJJJIJIIJHJJJJJJIIJIJJGIIIIJIJJJJJJJJBFIIJJJJJJJJJJJJJJHHHHHFF?DFFFEEEEEEDDCDDDDDDDDDDC
29 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1499:2087 2:N:0:ATCANG
30 | NNNNNNNNNNNNNNNNNNNNNGGATAANNANCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
31 | +
32 | #####################22@@??##1#0############.############################################++8::<======
33 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/read_names_with_index_sequences_pair1.fq:
--------------------------------------------------------------------------------
 1 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1116:2123 1:N:0:ATCACG
 2 | TCTGTGTAAATTACCCAGCCTCACGTATTCCTTTAGAGCAATGCAAAACAGACTAGACAAAAGGCTTTTAAAAGTCTAATCTGAGATTCCTGACCAAATGT
 3 | +
 4 | CCCFFFFFHHHHHJJJJJJJJJJJJHIJJJJJJJJJIJJJJJJJJJJJJJJJJJJJHIJGHJIJJIJJJJJHHHHHHHFFFFFFFEDDEEEEDDDDDDDDD
 5 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1129:2182 1:N:0:ATCACG
 6 | AAGCTGACTTGTGTTGGGAGCTCATCTGTTCCCTTGACTTCTCTTTTTCCAGTTCTTCGTCAAGGCCACAGGTGCTGCGGGAAAATCAGTAACTAATGAAC
 7 | +
 8 | @C@FFFDFFFDFBHGDHH;EHHGHE?EBHCHIGGI>BFFECGGIIIIGIHEBBGCHHIG);;FEEADGH<ACAEDDBCCCBBCCCBCCC>@CCDCCC@CCC
 9 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1158:2217 1:N:0:ATCACG
10 | AGCTGACATGAGAAAAGCCTGGTAAATCCGGGGCAAGTGACTGAAATGAAAGAATCCAATCAGATTCCAGCTCCAAGGGCCGCTAATTGTAGTAACTGGCT
11 | +
12 | CCCFDFFFHFFFFGIIGGHJEB<CEFIJJIIIHGECG9BFDHGGIGHJFHIGEHHIIAHHGGEEFDDDFDBCEDECCBB@5=BDDDDEC:>@DDEDE@>??
13 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1499:2087 1:N:0:ATCACG
14 | TCATTCCACATCTCAATCTCTCCTAGGAAGTTTTCCGGCCTTGTTGACAGGTTTAATTGAAAGGAGAAGCCAAATGTTGAGTAAACAGATTGCAAAAACTG
15 | +
16 | CCCFFFFFHHFHHJJJJJIJIIJHJJJJJJIIJIJJGIIIIJIJJJJJJJJBFIIJJJJJJJJJJJJJJHHHHHFF?DFFFEEEEEEDDCDDDDDDDDDDC


--------------------------------------------------------------------------------
/adam-core/src/test/resources/read_names_with_index_sequences_pair2.fq:
--------------------------------------------------------------------------------
 1 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1116:2123 2:N:0:ATCACG
 2 | NTAATAATGAGTGCACAATAGTTTTTCTCCTGAAACATAATTATTCTCTCAATCATCCCCATCCCCACCAAAGTCAATCACGGGAAGATCAATCAGCCTGC
 3 | +
 4 | #1=DFFFFHHHFHIJJIIIJJJIJJJIJJJJJJJIJJJJJJIJJJJJJJJJJJJJIJJJJJJJJJJIJIJIJJHHHHHHHFFFDDDDDDDDDDDDDDDDDD
 5 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1129:2182 2:N:0:ATCACG
 6 | TCCTCCCACTTCTGTCTCCCTCAGCAGCCTCTCATATTGCTGCTGTCTGCCTGGCCTATAGGCTTCTGAGTTATGACACTGGTGTGAAGAGAAAAGGCTTN
 7 | +
 8 | 1?@DABDDDDF+AE?EBFHIIII>G>?;8?3?EEF<FF9@<DFGDD9?D93?BD889=)=<C3=C)=7DAD77=?7?A:EB2?96;@DB@?=;>B;5<5(:
 9 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1158:2217 2:N:0:ATCACG
10 | ATATTAAGCCACTTGCAGCAAGACAGCCTGAAACTTCGTGACTCCCTGGAGCTTTTGGTGGTGGACGAAGCTGACCTTCTTTTTTCCTTTGGCTTTGANNN
11 | +
12 | C@CFDFFFHGHGHIJJJIGHHHIJJJIIEHIJJJJJIHDFGBFGIIJGGHIGIJJGEHCFH@EHEEEDFCDCDDDDDDDDDDDDDDDDDDCDDCDDCDDCC
13 | @HISEQ_HU01:89:H7YRLADXX:1:1101:1499:2087 2:N:0:ATCACG
14 | NNNNNNNNNNNNNNNNNNNNNGGATAANNANCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
15 | +
16 | #####################22@@??##1#0############.############################################++8::<======


--------------------------------------------------------------------------------
/adam-core/src/test/resources/readname_sorted.sam:
--------------------------------------------------------------------------------
 1 | @HD	VN:1.6	SO:queryname
 2 | @SQ	SN:1	LN:1000
 3 | @SQ	SN:chr2	LN:1000
 4 | @SQ	SN:3	LN:1000
 5 | @SQ	SN:4	LN:2000
 6 | A	0	1	1	50	10M	*	0	0	ACACACACAC	**********
 7 | B	0	3	11	40	4M2I4M	*	0	0	ACACACACAC	**********
 8 | C	0	4	1001	25	8M	*	0	0	ACACACAC	********
 9 | D	0	chr2	501	55	10M2S	*	0	0	ACACACACACAC	************
10 | E	0	chr2	101	45	10M	*	0	0	ACACACACAC	**********
11 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/reads-0-2-0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/reads-0-2-0


--------------------------------------------------------------------------------
/adam-core/src/test/resources/sample_coverage.bed:
--------------------------------------------------------------------------------
1 | chr1	1	10	sequence_feature	3.0	?
2 | chr1	15	20	sequence_feature	2.0	?
3 | chr2	15	20	sequence_feature	2.0	?
4 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/small.1.bed:
--------------------------------------------------------------------------------
1 | 1	143	26423
2 | 1	14397230	26472788
3 | 1	169801934	169801939
4 | 1	240997788	240997796


--------------------------------------------------------------------------------
/adam-core/src/test/resources/small.1.narrowPeak:
--------------------------------------------------------------------------------
 1 | 1	26472784	26472859	simread:1:26472783:false	0	+	0	-1	-1	-1
 2 | 1	240997788	240997863	simread:1:240997787:true	0	+	0	-1	-1	-1
 3 | 1	189606654	189606729	simread:1:189606653:true	0	+	0	-1	-1	-1
 4 | 1	207027739	207027814	simread:1:207027738:true	0	+	0	-1	-1	-1
 5 | 1	14397234	14397309	simread:1:14397233:false	0	+	0	-1	-1	-1
 6 | 1	240344443	240344518	simread:1:240344442:true	0	+	0	-1	-1	-1
 7 | 1	153978725	153978800	simread:1:153978724:false	0	+	0	-1	-1	-1
 8 | 1	237728410	237728485	simread:1:237728409:true	0	+	0	-1	-1	-1
 9 | 1	231911907	231911982	simread:1:231911906:false	0	+	0	-1	-1	-1
10 | 1	50683372	50683447	simread:1:50683371:false	0	+	0	-1	-1	-1
11 | 1	37577446	37577521	simread:1:37577445:false	0	+	0	-1	-1	-1
12 | 1	195211966	195212041	simread:1:195211965:false	0	+	0	-1	-1	-1
13 | 1	163841414	163841489	simread:1:163841413:false	0	+	0	-1	-1	-1
14 | 1	101556379	101556454	simread:1:101556378:false	0	+	0	-1	-1	-1
15 | 1	20101801	20101876	simread:1:20101800:true	0	+	0	-1	-1	-1
16 | 1	186794284	186794359	simread:1:186794283:true	0	+	0	-1	-1	-1
17 | 1	165341383	165341458	simread:1:165341382:true	0	+	0	-1	-1	-1
18 | 1	5469107	5469182	simread:1:5469106:true	0	+	0	-1	-1	-1
19 | 1	89554253	89554328	simread:1:89554252:false	0	+	0	-1	-1	-1
20 | 1	169801934	169802009	simread:1:169801933:true	0	+	0	-1	-1	-1
21 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/small.1_12.bed:
--------------------------------------------------------------------------------
1 | 1	143	26423	line1	0.0	.	150	26400	0,0,0	.	.	.
2 | 1	14397230	26472788	line2	100.0	+	14397230	26472700	255,0,0	1	12075558	14397230
3 | 1	169801934	169801939	line3	200.0	-	.	.	0,255,0	2	100,200	169801934,169801739
4 | 1	240997788	240997796	line4 with a space	1000.0	?	.	.	0,0,255	.	.	.
5 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/small_realignment_targets.intervals:
--------------------------------------------------------------------------------
1 | 702290 702324
2 | 807756
3 | 808685
4 | 857250 857251
5 | 858175 858176
6 | 869645 869648
7 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/small_realignment_targets_README.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | Rough summary of how the test cases were generated.
 3 | 
 4 | The reads were hand-picked from an input generated by the
 5 | Mason read simulator. Indel realignment intervals were extracted
 6 | by hand from GATK output of the RealignmentTargetCreator.
 7 | 
 8 | Mouse reference from:
 9 | 
10 | wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Mus_musculus/GRCm38/Primary_Assembly/assembled_chromosomes/FASTA/chrY.fa.gz
11 | 
12 | Mason from:
13 | 
14 | http://www.seqan.de/projects/mason/
15 | 
16 | Mouse reads created by:
17 | 
18 | ./bin/mason illumina -sq -n 100 -hn 2 -pi 0.005 -pd 0.005 -mp -i -rn 1 /home/andre/biotools/mouse_chrY.fa
19 | 
20 | Convert sam to bam and index:
21 | 
22 | samtools view -bS mouse_chrY.fa.fastq.sam > mouse_chrY.fa.fastq.bam
23 | samtools index mouse_chrY.fa.fastq.bam
24 | 
25 | Fix read groups and such:
26 | 
27 | ./picard-tools.sh AddOrReplaceReadGroups.jar I= mouse_chrY.fa.fastq.bam O= mouse_chrY.fa.fastq.fixed.bam SORT_ORDER=coordinate RGID="read_group_id" RGLB="library" RGPL="illumina" RGPU="platform_unit" RGSM="sequencing_center" CREATE_INDEX=True;
28 | 
29 | Reference sequence dictionary:
30 | 
31 | ./picard-tools.sh CreateSequenceDictionary.jar R= mouse_chrY.fa O= mouse_chrY.dict
32 | 
33 | Here we notice that the true bam fails GATK's mapping quality test?!
34 | 
35 | bwa index mouse_chrY.fa
36 | bwa mem -M -t 4 mouse_chrY.fa mouse_chrY.fa_1.fastq mouse_chrY.fa_2.fastq > mouse_chrY.fa.bwa.sam
37 | 
38 | ... and repeat the steps from above
39 | samtools view -bS mouse_chrY.fa.bwa.sam > mouse_chrY.fa.bwa.bam
40 | samtools sort mouse_chrY.fa.bwa.bam mouse_chrY.fa.bwa.sorted
41 | samtools index mouse_chrY.fa.bwa.sorted.bam
42 | ./picard-tools.sh AddOrReplaceReadGroups.jar I= mouse_chrY.fa.bwa.sorted.bam O= mouse_chrY.fa.bwa.sorted.fixed.bam SORT_ORDER=coordinate RGID="read_group_id" RGLB="library" RGPL="illumina" RGPU="platform_unit" RGSM="sequencing_center" CREATE_INDEX=True;
43 | 
44 | Generate samtools mpileup for comparison:
45 | 
46 | samtools mpileup -f /home/andre/biotools/mouse_chrY.fa small_realignment_targets.bam > small_realignment_targets.pileup
47 | 
48 | Notice that the MD tag is missing, so generate it:
49 | 
50 | samtools calmd small_realignment_targets.bam /home/andre/biotools/mouse_chrY.fa > small_realignment_targets.sam_new
51 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/sorted.sam:
--------------------------------------------------------------------------------
 1 | @HD	VN:1.6	SO:coordinate
 2 | @SQ	SN:1	LN:1000
 3 | @SQ	SN:3	LN:1000
 4 | @SQ	SN:4	LN:2000
 5 | @SQ	SN:chr2	LN:1000
 6 | A	0	1	1	50	10M	*	0	0	ACACACACAC	**********
 7 | B	0	3	11	40	4M2I4M	*	0	0	ACACACACAC	**********
 8 | C	0	4	1001	25	8M	*	0	0	ACACACAC	********
 9 | E	0	chr2	101	45	10M	*	0	0	ACACACACAC	**********
10 | D	0	chr2	501	55	10M2S	*	0	0	ACACACACACAC	************
11 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/tab5_to_usam.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from __future__ import print_function
 4 | import sys
 5 | 
 6 | # read lines from stdin
 7 | lines = sys.stdin.readlines()
 8 | 
 9 | # print sam header
10 | print("@HD\tVN:1.5\tSO:unsorted")
11 | 
12 | # loop and print sam lines
13 | for line in lines:
14 |     fields = line.split()
15 |     readName = fields[0]
16 |     firstSequence = fields[1]
17 |     firstQualities = fields[2]
18 |     secondSequence = fields[3]
19 |     secondQualities = fields[4]
20 | 
21 |     # flags:
22 |     # 1 = paired (we assume that in this script)
23 |     # 4 = unmapped
24 |     # 8 = mate unmapped
25 |     # 64 = first of pair
26 |     # 128 = second of pair
27 |     firstFlags = 64 | 8 | 4 | 1
28 |     secondFlags = 128 | 8 | 4 | 1
29 | 
30 |     # sam is the following tab-delimited columns:
31 |     #
32 |     # 1. read name
33 |     # 2. flags
34 |     # 3. ref (* = unaligned)
35 |     # 4. pos (0 = unaligned)
36 |     # 5. map qual (0 if unmapped)
37 |     # 6. cigar (* = unavailable)
38 |     # 7. mate ref (* = unaligned)
39 |     # 8. mate pos (0 = unaligned)
40 |     # 9. tlen (0 = unknown)
41 |     # 10. sequence
42 |     # 11. qualities
43 |     print("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (readName + "/1",
44 |                                                    firstFlags,
45 |                                                    firstSequence,
46 |                                                    firstQualities))
47 |     print("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (readName + "/2",
48 |                                                    secondFlags,
49 |                                                    secondSequence,
50 |                                                    secondQualities))
51 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/tab6_to_usam.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from __future__ import print_function
 4 | import sys
 5 | 
 6 | # read lines from stdin
 7 | lines = sys.stdin.readlines()
 8 | 
 9 | # print sam header
10 | print("@HD\tVN:1.5\tSO:unsorted")
11 | 
12 | # loop and print sam lines
13 | for line in lines:
14 |     fields = line.split()
15 |     firstReadName = fields[0]
16 |     firstSequence = fields[1]
17 |     firstQualities = fields[2]
18 |     secondReadName = fields[3]
19 |     secondSequence = fields[4]
20 |     secondQualities = fields[5]
21 | 
22 |     # flags:
23 |     # 1 = paired (we assume that in this script)
24 |     # 4 = unmapped
25 |     # 8 = mate unmapped
26 |     # 64 = first of pair
27 |     # 128 = second of pair
28 |     firstFlags = 64 | 8 | 4 | 1
29 |     secondFlags = 128 | 8 | 4 | 1
30 | 
31 |     # sam is the following tab-delimited columns:
32 |     #
33 |     # 1. read name
34 |     # 2. flags
35 |     # 3. ref (* = unaligned)
36 |     # 4. pos (0 = unaligned)
37 |     # 5. map qual (0 if unmapped)
38 |     # 6. cigar (* = unavailable)
39 |     # 7. mate ref (* = unaligned)
40 |     # 8. mate pos (0 = unaligned)
41 |     # 9. tlen (0 = unknown)
42 |     # 10. sequence
43 |     # 11. qualities
44 |     print("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (firstReadName,
45 |                                                    firstFlags,
46 |                                                    firstSequence,
47 |                                                    firstQualities))
48 |     print("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (secondReadName,
49 |                                                    secondFlags,
50 |                                                    secondSequence,
51 |                                                    secondQualities))
52 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/tag.sam:
--------------------------------------------------------------------------------
1 | @SQ	SN:1	LN:249250621
2 | @SQ	SN:2	LN:243199373
3 | @PG	ID:p1	PN:myProg	CL:"myProg 123"	VN:1.0.0
4 | @PG	ID:p2	PN:myProg	CL:"myProg 456"	VN:1.0.0	PP:p1
5 | simread:1:26472783:false	16	1	26472784	60	75M	*	0	0	GTATAAGAGCAGCCTTATTCCTATTTATAATCAGGGTGAAACACCTGTGCCAATGCCAAGACAGGGGTGCCAAGA	*	NM:i:0	AS:i:75	XS:i:0	Zb:B:c,-1,0,1	ZB:B:C,1,0,1	Zi:B:i,-1,0,1,2	ZI:B:I,1,0,1,2	Zs:B:s,-2,0,2	ZS:B:S,2,0,2	ZF:B:f,-1.100000,0.000000,1.100000


--------------------------------------------------------------------------------
/adam-core/src/test/resources/tags.sam:
--------------------------------------------------------------------------------
 1 | @HD	VN:1.4	SO:coordinate
 2 | @SQ	SN:1	LN:1000
 3 | StandardTags	0	1	1	255	10M	*	0	0	ACACACACAC	**********	NM:i:0	MD:Z:10 XS:A:-
 4 | MDTagWithEdits	0	1	1	255	10M	*	0	0	ACAGACACTC	**********	NM:i:2	MD:Z:3G4T1
 5 | HexByteArray	0	1	1	255	10M	*	0	0	ACACACACAC	**********	NM:i:0	MD:Z:10	XB:H:010203
 6 | LengthOneArrays	0	1	1	255	10M	*	0	0	ACACACACAC	**********	NM:i:0	MD:Z:10	XB:B:c,1	XI:B:i,1	XS:B:s,1	XF:B:f,1
 7 | LongerArrays	0	1	1	255	10M	*	0	0	ACACACACAC	**********	NM:i:0	MD:Z:10	XB:B:c,1,2,3	XI:B:i,1,2,3	XS:B:s,1,2,3	XS:B:f,1,2,3
 8 | SignedArrays	0	1	1	255	10M	*	0	0	ACACACACAC	**********	NM:i:0	MD:Z:10	XB:B:c,-1	XI:B:i,-1	XS:B:s,-1
 9 | UnsignedArrays	0	1	1	255	10M	*	0	0	ACACACACAC	**********	NM:i:0	MD:Z:10	XB:B:C,1,2,3	XI:B:I,1,2,3	XS:B:S,1,2,3
10 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/test.compressed.bcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/test.compressed.bcf


--------------------------------------------------------------------------------
/adam-core/src/test/resources/test.conf:
--------------------------------------------------------------------------------
1 | 
2 | accessKey=accessKey
3 | secretKey=secretKey
4 | 
5 | accessKey_s3 = accessKey_s3
6 | secretKey_s3=secretKey_s3
7 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/test.uncompressed.bcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/test.uncompressed.bcf


--------------------------------------------------------------------------------
/adam-core/src/test/resources/test.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##fileDate=20090805
 3 | ##source=myImputationProgramV3.1
 4 | ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
 5 | ##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
 6 | ##phasing=partial
 7 | ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
 8 | ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
 9 | ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
10 | ##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
11 | ##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
12 | ##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
13 | ##FILTER=<ID=q10,Description="Quality below 10">
14 | ##FILTER=<ID=s50,Description="Less than 50% of samples have data">
15 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
16 | ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
17 | ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
18 | ##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
19 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003
20 | 20	14370	rs6054257	G	A	29	PASS	NS=3;DP=14;AF=0.5;DB;H2	GT:GQ:DP:HQ	0|0:48:1:51,51	1|0:48:8:51,51	1/1:43:5:.,.
21 | 20	17330	.	T	A	3	q10	NS=3;DP=11;AF=0.017	GT:GQ:DP:HQ	0|0:49:3:58,50	0|1:3:5:65,3	0/0:41:3
22 | 20	1110696	rs6040355	A	G,T	67	PASS	NS=2;DP=10;AF=0.333,0.667;AA=T;DB	GT:GQ:DP:HQ	1|2:21:6:23,27	2|1:2:0:18,2	2/2:35:4
23 | 20	1230237	.	T	.	47	PASS	NS=3;DP=13;AA=T	GT:GQ:DP:HQ	0|0:54:7:56,60	0|0:48:4:51,51	0/0:61:2
24 | 20	1234567	microsat1	GTC	G,GTCT	50	PASS	NS=3;DP=9;AA=G	GT:GQ:DP	0/1:35:4	0/2:17:2	1/1:40:3
25 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/test.vcf.bgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/test.vcf.bgz


--------------------------------------------------------------------------------
/adam-core/src/test/resources/test.vcf.bgzf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/test.vcf.bgzf.gz


--------------------------------------------------------------------------------
/adam-core/src/test/resources/test.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/adam-core/src/test/resources/test.vcf.gz


--------------------------------------------------------------------------------
/adam-core/src/test/resources/test_command.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # $1: file that receives everything read from stdin
4 | # $2: file whose contents are written to stdout
5 | # Both are quoted so a path containing spaces stays a single argument.
6 | tee "$1" > /dev/null
7 | 
8 | # print out another file
9 | cat "$2"


--------------------------------------------------------------------------------
/adam-core/src/test/resources/test_rowgroup_rangeindex.1.txt:
--------------------------------------------------------------------------------
1 | s3://TEST/test-parquet	0	chr22:1000-2000,chr21:10000-20000
2 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/timeout.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from __future__ import print_function
 4 | import sys
 5 | import time
 6 | 
 7 | # read lines from stdin
 8 | lines = sys.stdin.readlines()
 9 | 
10 | def print_lines(skip_header=False):
11 |     for line in lines:
12 |         if not (skip_header and line.startswith('@')):
13 |             print(line.strip().rstrip())
14 | 
15 | print_lines()
16 | sys.stdout.flush()
17 | 
18 | time.sleep(10)
19 | 
20 | print_lines(skip_header=True)
21 | sys.stdout.flush()
22 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/unsorted.sam:
--------------------------------------------------------------------------------
 1 | @HD	VN:1.6	SO:unsorted
 2 | @SQ	SN:1	LN:1000
 3 | @SQ	SN:chr2	LN:1000
 4 | @SQ	SN:3	LN:1000
 5 | @SQ	SN:4	LN:2000
 6 | B	0	3	11	40	4M2I4M	*	0	0	ACACACACAC	**********
 7 | E	0	chr2	101	45	10M	*	0	0	ACACACACAC	**********
 8 | C	0	4	1001	25	8M	*	0	0	ACACACAC	********
 9 | A	0	1	1	50	10M	*	0	0	ACACACACAC	**********
10 | D	0	chr2	501	55	10M2S	*	0	0	ACACACACACAC	************
11 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/wgEncodeOpenChromDnaseGm19238Pk.trunc10.narrowPeak:
--------------------------------------------------------------------------------
 1 | chr1	713849	714434	chr1.1	1000	.	0.2252	9.16	-1	263
 2 | chr1	740180	740393	chr1.2	595	.	0.0473	1.94	-1	104
 3 | chr1	752735	753037	chr1.3	613	.	0.0536	 2.2	-1	135
 4 | chr1	762137	763263	chr1.4	1000	.	0.3077	12.5	-1	742
 5 | chr1	773142	773478	chr1.5	571	.	0.0387	1.59	-1	200
 6 | chr1	773831	773990	chr1.6	566	.	0.0370	1.52	-1	66
 7 | chr1	791738	791783	chr1.7	551	.	0.0315	 1.3	-1	13
 8 | chr1	793311	793670	chr1.8	690	.	0.0812	3.32	-1	165
 9 | chr1	793756	794115	chr1.9	588	.	0.0447	1.84	-1	144
10 | chr1	794221	794336	chr1.10	553	.	0.0323	1.33	-1	57
11 | 


--------------------------------------------------------------------------------
/adam-core/src/test/resources/wgs_calling_regions.hg38.interval_list:
--------------------------------------------------------------------------------
 1 | @HD	VN:1.5	SO:coordinate
 2 | @SQ	SN:chr1	LN:248956422	M5:6aef897c3d6ff0c78aff06ac189178dd	AS:38	UR:/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta	SP:Homo sapiens
 3 | @SQ	SN:chr2	LN:242193529	M5:f98db672eb0993dcfdabafe2a882905c	AS:38	UR:/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta	SP:Homo sapiens
 4 | @PG	ID:1	CL:picard.util.IntervalListTools INPUT=[HG38excludeNs.interval_list, genome.interval_list] OUTPUT=wgs_calling_regions.v3.interval_list SORT=true ACTION=INTERSECT    PADDING=0 UNIQUE=false SCATTER_COUNT=1 INCLUDE_FILTERED=false BREAK_BANDS_AT_MULTIPLES_OF=0 SUBDIVISION_MODE=INTERVAL_SUBDIVISION INVERT=false VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json	PN:IntervalListTools
 5 | chr1	10001	207666	+	. intersection ACGTmer
 6 | chr1	257667	297968	+	. intersection ACGTmer
 7 | chr1	347969	535988	+	. intersection ACGTmer
 8 | chr1	585989	2702781	+	. intersection ACGTmer
 9 | chr1	2746291	12954384	+	. intersection ACGTmer
10 | chr1	13004385	16799163	+	. intersection ACGTmer
11 | chr1	16849164	29552233	+	. intersection ACGTmer
12 | chr1	29553836	121976459	+	. intersection ACGTmer
13 | chr1	122026460	124977944	+	. intersection ACGTmer
14 | chr1	124978327	125130246	+	. intersection ACGTmer
15 | chr1	125131848	125171347	+	. intersection ACGTmer
16 | chr1	125173584	125184587	+	. intersection ACGTmer
17 | chr1	143184588	223558935	+	. intersection ACGTmer
18 | chr1	223608936	228558364	+	. intersection ACGTmer
19 | chr1	228608365	248946422	+	. intersection ACGTmer
20 | chr2	10001	16145119	+	. intersection ACGTmer
21 | chr2	16146120	32867130	+	. intersection ACGTmer
22 | chr2	32868131	32916625	+	. intersection ACGTmer
23 | chr2	32917626	89330679	+	. intersection ACGTmer
24 | chr2	89530680	89685992	+	. intersection ACGTmer
25 | chr2	89753993	90402511	+	. intersection ACGTmer
26 | chr2	91402512	92138145	+	. intersection ACGTmer
27 | chr2	92188146	94090557	+	. intersection ACGTmer
28 | chr2	94140558	94293015	+	. intersection ACGTmer
29 | chr2	94496016	97439618	+	. intersection ACGTmer
30 | chr2	97489619	238903659	+	. intersection ACGTmer
31 | chr2	238904048	242183529	+	. intersection ACGTmer
32 | 


--------------------------------------------------------------------------------
/adam-core/src/test/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromReadsSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.algorithms.consensus
19 | 
20 | import org.apache.spark.rdd.RDD
21 | import org.bdgenomics.adam.ds.ADAMContext._
22 | import org.bdgenomics.adam.rich.RichAlignment
23 | import org.bdgenomics.adam.util.ADAMFunSuite
24 | import org.bdgenomics.formats.avro.Alignment
25 | 
26 | class ConsensusGeneratorFromReadsSuite extends ADAMFunSuite {
27 | 
28 |   val cg = new ConsensusGeneratorFromReads
29 | 
30 |   // Alignments loaded from the artificial.sam test resource.
31 |   def artificial_reads: RDD[Alignment] =
32 |     sc.loadAlignments(testFile("artificial.sam")).rdd
33 | 
34 |   sparkTest("checking search for consensus list for artificial reads") {
35 |     // wrap each read, collect everything locally, then run the search
36 |     val richReads = artificial_reads.map(new RichAlignment(_)).collect().toSeq
37 |     val consensus = cg.findConsensus(richReads)
38 | 
39 |     // size of the collection findConsensus returns for this fixture
40 |     assert(consensus.size === 2)
41 |   }
42 | }
43 | 
44 | 


--------------------------------------------------------------------------------
/adam-core/src/test/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.algorithms.consensus
19 | 
20 | import org.bdgenomics.adam.models.ReferenceRegion
21 | import org.scalatest.FunSuite
22 | 
23 | class ConsensusSuite extends FunSuite {
24 | 
25 |   test("test the insertion of a consensus insertion into a reference") {
26 |     // "TCGA" placed at 0:10-11 within a reference covering 0:5-16
27 |     val reference = "AAAAAAAAAA"
28 |     val insertion = Consensus("TCGA", ReferenceRegion("0", 10L, 11L))
29 | 
30 |     val spliced = insertion.insertIntoReference(reference, ReferenceRegion("0", 5L, 16L))
31 | 
32 |     assert(spliced === "AAAAAATCGAAAAA")
33 |   }
34 | 
35 |   test("test the insertion of a consensus deletion into a reference") {
36 |     // empty consensus sequence over 0:10-16 within a reference covering 0:5-16
37 |     val reference = "AAAAATTTTT"
38 |     val deletion = Consensus("", ReferenceRegion("0", 10L, 16L))
39 | 
40 |     val spliced = deletion.insertIntoReference(reference, ReferenceRegion("0", 5L, 16L))
41 | 
42 |     assert(spliced === "AAAAA")
43 |   }
44 | 
45 |   test("inserting empty consensus returns the reference") {
46 |     val reference = "AAAAAAAAAAAAA"
47 |     val emptyConsensus = new Consensus("", ReferenceRegion("0", 0L, 1L))
48 | 
49 |     val result = emptyConsensus.insertIntoReference(reference, ReferenceRegion("0", 0, reference.length))
50 | 
51 |     assert(reference === result)
52 |   }
53 | 
54 | }
55 | 


--------------------------------------------------------------------------------
/adam-core/src/test/scala/org/bdgenomics/adam/ds/LeftOuterShuffleRegionJoinSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds
19 | 
20 | import org.apache.spark.rdd.RDD
21 | import org.bdgenomics.adam.models.{
22 |   ReferenceRegion,
23 |   SequenceDictionary,
24 |   SequenceRecord
25 | }
26 | import org.bdgenomics.formats.avro.Alignment
27 | 
/**
 * Runs the shared OuterRegionJoinSuite tests against LeftOuterShuffleRegionJoin.
 *
 * NOTE(review): the constructor parameter `partitionMap` is never read in this
 * class, and a suite without a no-argument constructor may not be picked up
 * by test discovery -- confirm whether this parameter is intentional.
 */
class LeftOuterShuffleRegionJoinSuite(partitionMap: Seq[Option[(ReferenceRegion, ReferenceRegion)]])
    extends OuterRegionJoinSuite {

  val partitionSize = 3
  var seqDict: SequenceDictionary = _

  before {
    // rebuild the two-contig dictionary ahead of every test
    val chrom1 = SequenceRecord("chr1", 15, url = "test://chrom1")
    val chrom2 = SequenceRecord("chr2", 15, url = "test://chrom2")
    seqDict = SequenceDictionary(chrom1, chrom2)
  }

  /**
   * Executes the join with the two inputs exchanged, then flips each output
   * pair so the result has the (Option[Alignment], Alignment) shape.
   *
   * @param leftRdd Keyed alignments passed as the second input of the join.
   * @param rightRdd Keyed alignments passed as the first input of the join.
   * @return The join output with each tuple's elements swapped.
   */
  def runJoin(leftRdd: RDD[(ReferenceRegion, Alignment)],
              rightRdd: RDD[(ReferenceRegion, Alignment)]): RDD[(Option[Alignment], Alignment)] = {
    val joined = LeftOuterShuffleRegionJoin[Alignment, Alignment](rightRdd, leftRdd)
      .compute()
    joined.map(pair => pair.swap)
  }
}
46 | 


--------------------------------------------------------------------------------
/adam-core/src/test/scala/org/bdgenomics/adam/ds/RightOuterTreeRegionJoinSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds
19 | 
20 | import org.apache.spark.rdd.RDD
21 | import org.bdgenomics.adam.models.ReferenceRegion
22 | import org.bdgenomics.adam.ds.read.AlignmentArray
23 | import org.bdgenomics.formats.avro.Alignment
24 | import org.bdgenomics.utils.interval.array.IntervalArray
25 | 
/**
 * Runs the shared OuterRegionJoinSuite tests against RightOuterTreeRegionJoin.
 */
class RightOuterTreeRegionJoinSuite extends OuterRegionJoinSuite {

  /**
   * Builds an interval array from the left RDD and joins it against the
   * right RDD through broadcastAndJoin.
   *
   * @param leftRdd Keyed alignments used to build the interval array.
   * @param rightRdd Keyed alignments joined against that array.
   * @return The result of broadcastAndJoin.
   */
  def runJoin(leftRdd: RDD[(ReferenceRegion, Alignment)],
              rightRdd: RDD[(ReferenceRegion, Alignment)]): RDD[(Option[Alignment], Alignment)] = {
    val leftIntervals = IntervalArray[ReferenceRegion, Alignment](leftRdd,
      AlignmentArray.apply(_, _))
    val treeJoin = RightOuterTreeRegionJoin[Alignment, Alignment]()
    treeJoin.broadcastAndJoin(leftIntervals, rightRdd)
  }
}
36 | 


--------------------------------------------------------------------------------
/adam-core/src/test/scala/org/bdgenomics/adam/ds/feature/GFF3HeaderWriterSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.feature
19 | 
20 | import org.bdgenomics.adam.util.ADAMFunSuite
21 | import scala.io.Source
22 | 
/**
 * Checks that GFF3HeaderWriter writes a file containing exactly one line,
 * equal to GFF3HeaderWriter.HEADER_STRING.
 */
class GFF3HeaderWriterSuite extends ADAMFunSuite {

  sparkTest("write gff3 header pragma") {
    val tmp = tmpFile(".gff3")
    GFF3HeaderWriter(tmp, sc)

    // Read the file eagerly and always release the handle: getLines is lazy,
    // so the lines must be materialized before the source is closed.
    val source = Source.fromFile(tmp)
    val lines = try {
      source.getLines().toList
    } finally {
      source.close()
    }

    assert(lines.size === 1)
    assert(lines.head === GFF3HeaderWriter.HEADER_STRING)
  }
}
35 | 


--------------------------------------------------------------------------------
/adam-core/src/test/scala/org/bdgenomics/adam/ds/read/realignment/ModPartitionerSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.read.realignment
19 | 
20 | import org.scalatest.FunSuite
21 | 
/**
 * Unit tests for ModPartitioner built with 123 partitions: partition count,
 * positive and negative integer keys, and rejection of non-integer keys.
 */
class ModPartitionerSuite extends FunSuite {

  val partitioner = ModPartitioner(123)

  test("report number of partitions correctly") {
    val reported = partitioner.numPartitions
    assert(reported === 123)
  }

  test("partition a number that is lower than the number of partitions and positive") {
    val assigned = partitioner.getPartition(12)
    assert(assigned == 12)
  }

  test("partition a number that is greater than the number of partitions and positive") {
    val assigned = partitioner.getPartition(321)
    assert(assigned == 75)
  }

  test("partition a number that is lower than the number of partitions and negative") {
    val assigned = partitioner.getPartition(-21)
    assert(assigned == 21)
  }

  test("partition a number that is greater than the number of partitions and negative") {
    val assigned = partitioner.getPartition(-1234)
    assert(assigned == 4)
  }

  test("fire an exception if input is not an integer") {
    val nonIntegerKey = "a string"
    intercept[IllegalArgumentException] {
      partitioner.getPartition(nonIntegerKey)
    }
  }
}
52 | 


--------------------------------------------------------------------------------
/adam-core/src/test/scala/org/bdgenomics/adam/ds/read/recalibration/RecalibrationTableSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.ds.read.recalibration
19 | 
20 | import org.bdgenomics.formats.avro.Alignment
21 | import org.scalatest.FunSuite
22 | 
/**
 * Checks quality score lookup in a RecalibrationTable built from a
 * two-entry observation table.
 */
class RecalibrationTableSuite extends FunSuite {

  // two covariate keys, each paired with its aggregate
  val observedCovariates = Map(
    CovariateKey(0, (50 + 33).toChar, 2, 'A', 'C') -> new Aggregate(1000000, 1, 10.0),
    CovariateKey(0, (40 + 33).toChar, 1, 'N', 'N') -> new Aggregate(100000, 1, 10.0))
  val table = RecalibrationTable(new ObservationTable(
    observedCovariates))

  test("look up quality scores in table") {
    // query the table with the same keys, in map iteration order
    val keys = observedCovariates.map(_._1).toArray
    val scores = table(keys)

    assert(scores.size === 2)
    assert(scores(0) === (50 + 33).toChar)
    assert(scores(1) === (47 + 33).toChar)
  }
}
46 | 


--------------------------------------------------------------------------------
/adam-core/src/test/scala/org/bdgenomics/adam/io/InterleavedFastqInputFormatSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.io
19 | 
20 | import org.bdgenomics.adam.util.ADAMFunSuite
21 | import org.apache.spark.rdd.RDD
22 | import org.apache.hadoop.io.Text
23 | 
/**
 * Registers one Spark test per interleaved FASTQ sample file (1 through 5).
 * Each test reads the sample through InterleavedFastqInputFormat, wraps every
 * record in start/end marker lines, and compares the result against the
 * matching ".output" resource.
 */
class InterleavedFastqInputFormatSuite extends ADAMFunSuite {
  (1 to 5) foreach { testNumber =>
    val inputName = "interleaved_fastq_sample%d.ifq".format(testNumber)
    val expectedOutputName = inputName + ".output"
    val expectedOutputPath = testFile(expectedOutputName)
    // Load the expected text while the suite is constructed, closing the
    // file handle instead of leaving it open once per sample.
    val expectedOutputData = {
      val expectedSource = scala.io.Source.fromFile(expectedOutputPath)
      try expectedSource.mkString finally expectedSource.close()
    }

    sparkTest("interleaved FASTQ hadoop reader: %s->%s".format(inputName, expectedOutputName)) {
      def ifq_reader: RDD[(Void, Text)] = {
        val path = testFile(inputName)
        sc.newAPIHadoopFile(path,
          classOf[InterleavedFastqInputFormat],
          classOf[Void],
          classOf[Text])
      }

      val ifq_reads = ifq_reader.collect()

      val testOutput = new StringBuilder()

      // only the value of each pair is written; the key is ignored
      ifq_reads.foreach(pair => {
        testOutput.append(">>>interleaved fastq record start>>>\n")
        testOutput.append(pair._2)
        testOutput.append("<<<interleaved fastq record end<<<\n")
      })

      assert(testOutput.toString() == expectedOutputData)
    }
  }
}
54 | 
55 | 


--------------------------------------------------------------------------------
/adam-core/src/test/scala/org/bdgenomics/adam/io/SingleFastqInputFormatSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.io
19 | 
20 | import org.bdgenomics.adam.util.ADAMFunSuite
21 | import org.apache.spark.rdd.RDD
22 | import org.apache.hadoop.io.Text
23 | 
/**
 * Registers one Spark test per FASTQ sample file (1 through 4). Each test
 * reads the sample through SingleFastqInputFormat, wraps every record in
 * start/end marker lines, and compares the result against the matching
 * "single_*.output" resource.
 */
class SingleFastqInputFormatSuite extends ADAMFunSuite {
  (1 to 4) foreach { testNumber =>
    val inputName = "fastq_sample%d.fq".format(testNumber)
    val expectedOutputName = "single_" + inputName + ".output"
    val expectedOutputPath = testFile(expectedOutputName)
    // Load the expected text while the suite is constructed, closing the
    // file handle instead of leaving it open once per sample.
    val expectedOutputData = {
      val expectedSource = scala.io.Source.fromFile(expectedOutputPath)
      try expectedSource.mkString finally expectedSource.close()
    }

    sparkTest("FASTQ hadoop reader: %s->%s".format(inputName, expectedOutputName)) {
      def fastqReader: RDD[(Void, Text)] = {
        val path = testFile(inputName)
        sc.newAPIHadoopFile(path,
          classOf[SingleFastqInputFormat],
          classOf[Void],
          classOf[Text])
      }

      val fastqReads = fastqReader.collect()

      val testOutput = new StringBuilder()

      // only the value of each pair is written; the key is ignored
      fastqReads.foreach(pair => {
        testOutput.append(">>>fastq record start>>>\n")
        testOutput.append(pair._2)
        testOutput.append("<<<fastq record end<<<\n")
      })

      assert(testOutput.toString() == expectedOutputData)
    }
  }
}
54 | 
55 | 


--------------------------------------------------------------------------------
/adam-core/src/test/scala/org/bdgenomics/adam/util/ADAMFunSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.util
19 | 
20 | import htsjdk.samtools.util.Log
21 | import java.util.logging.Level
22 | import org.bdgenomics.utils.misc.SparkFunSuite
23 | 
/**
 * Base class for ADAM test suites. Extends SparkFunSuite, lowers the htsjdk
 * global log level, and overrides the application name and Spark properties.
 */
abstract class ADAMFunSuite extends SparkFunSuite {

  // added to resolve #1280
  // runs whenever a suite instance is constructed; the htsjdk setting is global
  Log.setGlobalLogLevel(Log.LogLevel.ERROR)

  // NOTE(review): presumably read by SparkFunSuite when it creates the Spark
  // context -- confirm against SparkFunSuite.
  override val appName: String = "adam"
  // Kryo serialization with ADAM's registrator; registrationRequired is set
  // to "true" alongside reference tracking.
  override val properties: Map[String, String] = Map(
    "spark.serializer" -> "org.apache.spark.serializer.KryoSerializer",
    "spark.kryo.registrator" -> "org.bdgenomics.adam.serialization.ADAMKryoRegistrator",
    "spark.kryo.referenceTracking" -> "true",
    "spark.kryo.registrationRequired" -> "true"
  )

}
38 | 
39 | 


--------------------------------------------------------------------------------
/adam-core/src/test/scala/org/bdgenomics/adam/util/FileMergerSuite.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Big Data Genomics (BDG) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The BDG licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.bdgenomics.adam.util
19 | 
20 | import org.apache.hadoop.fs.{ FileSystem, Path }
21 | 
/**
 * Argument validation tests for FileMerger.mergeFiles. Both tests expect an
 * IllegalArgumentException; the paths passed are placeholders.
 */
class FileMergerSuite extends ADAMFunSuite {

  sparkTest("cannot write both empty gzip block and cram eof") {
    intercept[IllegalArgumentException] {
      // placeholder paths: the call is expected to fail on its arguments
      val localFs = FileSystem.getLocal(sc.hadoopConfiguration)
      val outputPath = new Path("output")
      val headPath = new Path("head")
      FileMerger.mergeFiles(sc,
        localFs,
        outputPath,
        headPath,
        writeEmptyGzipBlock = true,
        writeCramEOF = true)
    }
  }

  // NOTE(review): the test name says "non-negative", yet a buffer size of 0
  // is expected to be rejected -- confirm whether "positive" is meant.
  sparkTest("buffer size must be non-negative") {
    intercept[IllegalArgumentException] {
      // placeholder paths: the call is expected to fail on its arguments
      val localFs = FileSystem.getLocal(sc.hadoopConfiguration)
      val outputPath = new Path("output")
      val headPath = new Path("head")
      FileMerger.mergeFiles(sc,
        localFs,
        outputPath,
        headPath,
        optBufferSize = Some(0))
    }
  }
}
47 | 


--------------------------------------------------------------------------------
/adam-distribution/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 3 |   <modelVersion>4.0.0</modelVersion>
 4 |   <parent>
 5 |     <groupId>org.bdgenomics.adam</groupId>
 6 |     <artifactId>adam-parent-spark3_2.12</artifactId>
 7 |     <version>1.1-SNAPSHOT</version>
 8 |     <relativePath>../pom.xml</relativePath>
 9 |   </parent>
10 | 
11 |   <artifactId>adam-distribution-spark3_2.12</artifactId>
12 |   <packaging>pom</packaging>
13 |   <name>ADAM_${scala.version.prefix}: Distribution</name>
  <!-- Build the distribution archive with the Maven assembly plugin. -->
  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <!-- Assembly layout is defined in this module's descriptor file. -->
          <descriptors>
            <descriptor>src/main/assembly/assembly.xml</descriptor>
          </descriptors>
        </configuration>
        <executions>
          <!-- Run the "single" goal during the package phase. -->
          <execution>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
35 | </project>
36 | 


--------------------------------------------------------------------------------
/adam-python/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .cache


--------------------------------------------------------------------------------
/adam-python/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed to Big Data Genomics (BDG) under one
 3 | # or more contributor license agreements.  See the NOTICE file
 4 | # distributed with this work for additional information
 5 | # regarding copyright ownership.  The BDG licenses this file
 6 | # to you under the Apache License, Version 2.0 (the
 7 | # "License"); you may not use this file except in compliance
 8 | # with the License.  You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | 
19 | global-exclude *.py[cod] __pycache__ .DS_Store
20 | recursive-include deps/jars *.jar
21 | include version.py
22 | 


--------------------------------------------------------------------------------
/adam-python/README.md:
--------------------------------------------------------------------------------
 1 | # ADAM
 2 | 
 3 | ADAM is a library and command line tool that enables the use of [Apache
 4 | Spark](https://spark.apache.org) to parallelize genomic data analysis across
 5 | cluster/cloud computing environments. ADAM uses a set of schemas to describe
 6 | genomic sequences, reads, variants/genotypes, and features, and can be used
 7 | with data in legacy genomic file formats such as SAM/BAM/CRAM, BED/GFF3/GTF,
 8 | and VCF, as well as data stored in the columnar
 9 | [Apache Parquet](https://parquet.apache.org) format. On a single node, ADAM
provides performance competitive with optimized multi-threaded tools, while
enabling scale-out to clusters with more than a thousand cores. ADAM's APIs
12 | can be used from Scala, Java, Python, R, and SQL.
13 | 
14 | ## Documentation
15 | 
16 | ADAM's documentation is hosted at [readthedocs](http://adam.readthedocs.io).
17 | 
18 | ## Python Requirements
19 | 
20 | ADAM depends on having PySpark installed.


--------------------------------------------------------------------------------
/adam-python/bdgenomics/__init__.py:
--------------------------------------------------------------------------------
 1 | # Licensed to Big Data Genomics (BDG) under one
 2 | # or more contributor license agreements.  See the NOTICE file
 3 | # distributed with this work for additional information
 4 | # regarding copyright ownership.  The BDG licenses this file
 5 | # to you under the Apache License, Version 2.0 (the
 6 | # "License"); you may not use this file except in compliance
 7 | # with the License.  You may obtain a copy of the License at
 8 | # 
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | # 
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
# Extend this package's search path with pkgutil so that `bdgenomics`
# subpackages found in other directories on sys.path are importable too.
__path__ = __import__('pkgutil').extend_path(__path__, __name__)
17 | 


--------------------------------------------------------------------------------
/adam-python/bdgenomics/adam/.gitignore:
--------------------------------------------------------------------------------
1 | schemas.py


--------------------------------------------------------------------------------
/adam-python/bdgenomics/adam/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed to Big Data Genomics (BDG) under one
 3 | # or more contributor license agreements.  See the NOTICE file
 4 | # distributed with this work for additional information
 5 | # regarding copyright ownership.  The BDG licenses this file
 6 | # to you under the Apache License, Version 2.0 (the
 7 | # "License"); you may not use this file except in compliance
 8 | # with the License.  You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | r"""
19 | =======================
20 | bdgenomics.adam Package
21 | =======================
22 | .. currentmodule:: bdgenomics.adam
23 | 
24 | ADAM's Python API wraps the ADAMContext and GenomicDataset APIs so they can be used from PySpark.
25 | The Python API is feature complete relative to ADAM's Java API.
26 | 
27 | .. automodule:: bdgenomics.adam.adamContext
28 | .. automodule:: bdgenomics.adam.models
29 | .. automodule:: bdgenomics.adam.ds
30 | .. automodule:: bdgenomics.adam.stringency
31 | 
32 | """
33 | 


--------------------------------------------------------------------------------
/adam-python/bdgenomics/adam/models.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed to Big Data Genomics (BDG) under one
 3 | # or more contributor license agreements.  See the NOTICE file
 4 | # distributed with this work for additional information
 5 | # regarding copyright ownership.  The BDG licenses this file
 6 | # to you under the Apache License, Version 2.0 (the
 7 | # "License"); you may not use this file except in compliance
 8 | # with the License.  You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | r"""
19 | ======
20 | models
21 | ======
22 | .. currentmodule:: bdgenomics.adam.models
23 | .. autosummary::
24 |    :toctree: _generate/
25 | 
26 |    ReferenceRegion
27 | """
28 | 
class ReferenceRegion:
    """
    A contiguous span on a named reference sequence, held as a
    (referenceName, start, end) triple.
    """

    def __init__(self, referenceName, start, end):
        """
        Stores the three coordinates that describe the region.

        :param referenceName: The name of the sequence (chromosome) in the
          reference genome.
        :param start: The 0-based residue-coordinate for the start of the
          region.
        :param end: The 0-based residue-coordinate of the first residue past
          the region, so that [start, end) is a 0-based half-open interval.
        """

        self.referenceName, self.start, self.end = referenceName, start, end


    def _toJava(self, jvm):
        """
        Builds the org.bdgenomics.adam.models.ReferenceRegion counterpart of
        this object by calling its fromGenomicRange method.

        Internal helper; not meant to be called from user code.

        :param jvm: Py4j JVM handle.
        :return: Whatever ReferenceRegion.fromGenomicRange returns on the JVM.
        """

        javaRegionClass = jvm.org.bdgenomics.adam.models.ReferenceRegion
        return javaRegionClass.fromGenomicRange(self.referenceName,
                                                self.start,
                                                self.end)
60 | 


--------------------------------------------------------------------------------
/adam-python/bdgenomics/adam/stringency.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed to Big Data Genomics (BDG) under one
 3 | # or more contributor license agreements.  See the NOTICE file
 4 | # distributed with this work for additional information
 5 | # regarding copyright ownership.  The BDG licenses this file
 6 | # to you under the Apache License, Version 2.0 (the
 7 | # "License"); you may not use this file except in compliance
 8 | # with the License.  You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | r"""
19 | ==========
20 | stringency
21 | ==========
22 | .. currentmodule:: bdgenomics.adam.stringency
23 | .. autosummary::
24 |    :toctree: _generate/
25 | 
26 |    STRICT
27 |    LENIENT
28 |    SILENT
29 | """
30 | 
STRICT = 2
"""
    htsjdk.samtools.ValidationStringency.STRICT
"""
LENIENT = 1
"""
    htsjdk.samtools.ValidationStringency.LENIENT
"""
SILENT = 0
"""
    htsjdk.samtools.ValidationStringency.SILENT
"""

def _toJava(stringency, jvm):
    """
    Converts to an HTSJDK ValidationStringency enum.

    Should not be called from user code.

    :param bdgenomics.adam.stringency stringency: The desired stringency level.
    :param jvm: Py4j JVM handle.
    :return: The result of ValidationStringency.valueOf for the matching name.
    :raises RuntimeError: If stringency equals none of STRICT, LENIENT, SILENT.
    """

    # Compare by value rather than identity: `is` only works for integers that
    # happen to be the same object, which is an interpreter implementation
    # detail and fails for equal values such as int subclasses.
    if stringency == STRICT:
        return jvm.htsjdk.samtools.ValidationStringency.valueOf("STRICT")
    elif stringency == LENIENT:
        return jvm.htsjdk.samtools.ValidationStringency.valueOf("LENIENT")
    elif stringency == SILENT:
        return jvm.htsjdk.samtools.ValidationStringency.valueOf("SILENT")
    else:
        raise RuntimeError("Received %s. Stringency must be one of STRICT (%d), LENIENT (%d), or SILENT (%s)." % (stringency, STRICT, LENIENT, SILENT))
62 | 


--------------------------------------------------------------------------------
/adam-python/bdgenomics/adam/test/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed to Big Data Genomics (BDG) under one
 3 | # or more contributor license agreements.  See the NOTICE file
 4 | # distributed with this work for additional information
 5 | # regarding copyright ownership.  The BDG licenses this file
 6 | # to you under the Apache License, Version 2.0 (the
 7 | # "License"); you may not use this file except in compliance
 8 | # with the License.  You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | 
19 | 
20 | import os
21 | import sys
22 | import tempfile
23 | import unittest
24 | 
25 | from pyspark.sql import SparkSession
26 | 
class SparkTestCase(unittest.TestCase):
    """
    Base test case that creates a local SparkSession before each test and
    stops its SparkContext afterwards, plus helpers for locating test
    resources and comparing files.
    """

    def resourceFile(self, filename, module='adam-core'):
        """
        Builds the path of a test resource file.

        The repository root is taken to be the parent of the current working
        directory, so the result depends on where the tests are run from.

        :param str filename: Name of the file under src/test/resources.
        :param str module: Name of the module directory that holds the file.
        :return: The joined path <root>/<module>/src/test/resources/<filename>.
        """

        adamRoot = os.path.dirname(os.getcwd())
        return os.path.join(adamRoot,
                            "%s/src/test/resources" % module,
                            filename)

    def tmpFile(self):
        """
        Returns a fresh temporary path.

        The temporary file is created and immediately closed with
        delete=True, so only its name is handed back to the caller.

        :return: The name of the (already closed) temporary file.
        """

        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        return tempFile.name

    def checkFiles(self, file1, file2):
        """
        Asserts that two files have identical contents.

        :param str file1: Path to the first file.
        :param str file2: Path to the second file.
        """

        # Context managers close both handles on every path, including when
        # opening the second file fails after the first is already open.
        with open(file1) as f1, open(file2) as f2:
            self.assertEqual(f1.read(), f2.read())

    def setUp(self):
        """
        Saves sys.path and gets (or creates) a local[4] SparkSession named
        after the concrete test class.
        """
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        self.ss = SparkSession.builder \
                              .master('local[4]') \
                              .appName(class_name) \
                              .getOrCreate()
        self.sc = self.ss.sparkContext

    def tearDown(self):
        """
        Stops the SparkContext and restores sys.path.
        """
        # Restore sys.path even if stopping the context raises, so one failing
        # teardown cannot leak a modified path into later tests.
        try:
            self.sc.stop()
        finally:
            sys.path = self._old_sys_path
70 | 


--------------------------------------------------------------------------------
/adam-python/bdgenomics/adam/test/variantDataset_test.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed to Big Data Genomics (BDG) under one
 3 | # or more contributor license agreements.  See the NOTICE file
 4 | # distributed with this work for additional information
 5 | # regarding copyright ownership.  The BDG licenses this file
 6 | # to you under the Apache License, Version 2.0 (the
 7 | # "License"); you may not use this file except in compliance
 8 | # with the License.  You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | 
19 | 
20 | from bdgenomics.adam.adamContext import ADAMContext
21 | from bdgenomics.adam.test import SparkTestCase
22 | 
23 | 
class VariantDatasetTest(SparkTestCase):
    """
    Tests for loading, saving, and transforming variants via ADAMContext.
    """

    def test_vcf_round_trip(self):
        """
        Loads small.vcf, saves it as VCF to a temporary path, reloads the
        saved copy, and checks that the record count is unchanged.
        """

        testFile = self.resourceFile("small.vcf")
        ac = ADAMContext(self.ss)

        variants = ac.loadVariants(testFile)

        tmpPath = self.tmpFile() + ".vcf"
        variants.toVariantContexts().saveAsVcf(tmpPath)

        # Reload the file that was just written. Reloading testFile would
        # compare the input with itself and never exercise the saved output.
        savedVariants = ac.loadVariants(tmpPath)

        self.assertEqual(variants._jvmDataset.jrdd().count(),
                         savedVariants._jvmDataset.jrdd().count())

    def test_transform(self):
        """
        Filters the variants to those with start < 19190 through transform()
        and checks that three records remain.
        """

        variantPath = self.resourceFile("small.vcf")
        ac = ADAMContext(self.ss)

        variants = ac.loadVariants(variantPath)

        transformedVariants = variants.transform(lambda x: x.filter(x.start < 19190))

        self.assertEqual(transformedVariants.toDF().count(), 3)
53 | 


--------------------------------------------------------------------------------
/adam-python/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 3 |   <modelVersion>4.0.0</modelVersion>
 4 |   <parent>
 5 |     <groupId>org.bdgenomics.adam</groupId>
 6 |     <artifactId>adam-parent-spark3_2.12</artifactId>
 7 |     <version>1.1-SNAPSHOT</version>
 8 |     <relativePath>../pom.xml</relativePath>
 9 |   </parent>
10 | 
11 |   <artifactId>adam-python-spark3_2.12</artifactId>
12 |   <packaging>jar</packaging>
13 |   <name>ADAM_${scala.version.prefix}: Python APIs</name>
14 |   <properties>
15 |     <timestamp>${maven.build.timestamp}</timestamp>
16 |     <maven.build.timestamp.format>yyyy-MM-dd</maven.build.timestamp.format>
17 |   </properties>
18 | 
19 |   <build>
20 |     <plugins>
21 |       <plugin>
22 |         <groupId>org.codehaus.mojo</groupId>
23 |         <artifactId>exec-maven-plugin</artifactId>
24 |         <executions>
25 |           <execution>
26 |             <id>dev-python</id>
27 |             <phase>process-resources</phase>
28 |             <goals>
29 |               <goal>exec</goal>
30 |             </goals>
31 |             <configuration>
32 |               <executable>make</executable>
33 |               <arguments>
34 |                 <argument>develop</argument>
35 |               </arguments>
36 |             </configuration>
37 |           </execution>
38 |           <execution>
39 |             <id>test-python</id>
40 |             <phase>process-test-resources</phase>
41 |             <goals>
42 |               <goal>exec</goal>
43 |             </goals>
44 |             <configuration>
45 |               <executable>make</executable>
46 |               <arguments>
47 |                 <argument>test</argument>
48 |               </arguments>
49 |               <skip>${skipTests}</skip>
50 |             </configuration>
51 |           </execution>
52 |         </executions>
53 |       </plugin>
54 |     </plugins>
55 |   </build>
56 | </project>
57 | 


--------------------------------------------------------------------------------
/adam-python/version.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed to Big Data Genomics (BDG) under one
 3 | # or more contributor license agreements.  See the NOTICE file
 4 | # distributed with this work for additional information
 5 | # regarding copyright ownership.  The BDG licenses this file
 6 | # to you under the Apache License, Version 2.0 (the
 7 | # "License"); you may not use this file except in compliance
 8 | # with the License.  You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | 
19 | from __future__ import print_function
20 | 
# Version string exposed by this module.
version = '1.1a0'

# When run as a script, print the version string to stdout.
if __name__ == '__main__':
    print(version)
25 | 


--------------------------------------------------------------------------------
/adam-r/.gitignore:
--------------------------------------------------------------------------------
1 | !bdgenomics.adam
2 | 


--------------------------------------------------------------------------------
/adam-r/bdgenomics.adam/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: bdgenomics.adam
 2 | Type: Package
 3 | Version: 1.0.1
 4 | Title: R Frontend for Big Data Genomics/ADAM
 5 | Description: ADAM is a genomics analysis platform with specialized file formats built using Apache Avro, Apache Spark and Parquet.
 6 | Author: Big Data Genomics
 7 | Maintainer: Frank Austin Nothaft <fnothaft@alumni.stanford.edu>
 8 | Authors@R: c(person("Frank Austin", "Nothaft", role = c("aut", "cre"),
 9 |                     email = "fnothaft@alumni.stanford.edu"),
10 |              person(family = "Big Data Genomics", role = c("aut", "cph")))
11 | License: Apache License (== 2.0)
12 | URL: http://www.bdgenomics.org https://github.com/bigdatagenomics/adam
13 | BugReports: https://github.com/bigdatagenomics/adam/issues
14 | Imports:
15 |     methods,
16 |     SparkR (>= 2.1.0)
17 | Depends:
18 |     R (>= 3.0)
19 | Suggests:
20 |     testthat
21 | Collate:
22 |     'generics.R'
23 |     'adam-context.R'
24 |     'ds.R'
25 | RoxygenNote: 7.1.1
26 | 


--------------------------------------------------------------------------------
/adam-r/bdgenomics.adam/tests/testthat.R:
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed to Big Data Genomics (BDG) under one
 3 | # or more contributor license agreements.  See the NOTICE file
 4 | # distributed with this work for additional information
 5 | # regarding copyright ownership.  The BDG licenses this file
 6 | # to you under the Apache License, Version 2.0 (the
 7 | # "License"); you may not use this file except in compliance
 8 | # with the License.  You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | library(testthat)
19 | library(bdgenomics.adam)
20 | 
21 | test_check("bdgenomics.adam")
22 | 


--------------------------------------------------------------------------------
/adam-r/bdgenomics.adam/tests/testthat/helpers.R:
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed to Big Data Genomics (BDG) under one
 3 | # or more contributor license agreements.  See the NOTICE file
 4 | # distributed with this work for additional information
 5 | # regarding copyright ownership.  The BDG licenses this file
 6 | # to you under the Apache License, Version 2.0 (the
 7 | # "License"); you may not use this file except in compliance
 8 | # with the License.  You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | 
# Build the path of a test resource file.
#
# The root directory is four levels above the current working directory; the
# file is looked up under <root>/<submodule>/src/test/resources/<fileName>.
resourceFile <- function(fileName, submodule="adam-core") {
    rootDir <- getwd()
    for (level in seq_len(4)) {
        rootDir <- dirname(rootDir)
    }
    relativePath <- paste(submodule, "src/test/resources", fileName, sep = "/")
    file.path(rootDir, relativePath)
}
24 | 
# Expect two files to have identical contents, compared line by line.
#
# newFile:      path of the file produced by the test.
# originalFile: path of the file holding the expected contents.
expect_files_match <- function(newFile, originalFile) {
    expect_equal(readLines(newFile), readLines(originalFile))
}
28 | 


--------------------------------------------------------------------------------
/adam-r/bdgenomics.adam/tests/testthat/test_variantDataset.R:
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed to Big Data Genomics (BDG) under one
 3 | # or more contributor license agreements.  See the NOTICE file
 4 | # distributed with this work for additional information
 5 | # regarding copyright ownership.  The BDG licenses this file
 6 | # to you under the Apache License, Version 2.0 (the
 7 | # "License"); you may not use this file except in compliance
 8 | # with the License.  You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | library(bdgenomics.adam)
19 | 
20 | context("manipulating variants")
21 | 
22 | ac <- createADAMContext()
23 | 
# Round trip: load small.vcf, save it as VCF to a temporary path, reload the
# saved copy, and expect both to contain the same number of records.
test_that("round trip vcf", {
    testFile <- resourceFile("small.vcf")
    variants <- loadVariants(ac, testFile)
    tmpPath <- tempfile(fileext = ".vcf")
    saveAsVcf(toVariantContexts(variants), tmpPath)

    # Compare record counts of the original and the reloaded dataset.
    expect_equal(count(toDF(variants)), count(toDF(loadVariants(ac, tmpPath))))
})
32 | 


--------------------------------------------------------------------------------
/bin/adam-shell:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Licensed to Big Data Genomics (BDG) under one
 4 | # or more contributor license agreements.  See the NOTICE file
 5 | # distributed with this work for additional information
 6 | # regarding copyright ownership.  The BDG licenses this file
 7 | # to you under the Apache License, Version 2.0 (the
 8 | # "License"); you may not use this file except in compliance
 9 | # with the License.  You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 
set -e

# Warn when ADAM_OPTS is set but no arguments were passed: Spark arguments are
# no longer read from ADAM_OPTS. "$*" makes the concatenation of all arguments
# into a single string explicit instead of relying on $@ inside [[ ]].
if [[ -z "$*" && -n "$ADAM_OPTS" ]]; then
    echo "WARNING: Passing Spark arguments via ADAM_OPTS was recently removed." 1>&2
    echo "Run adam-shell instead as adam-shell <spark-args>" 1>&2
fi

# Path expansions are quoted so that a directory containing whitespace is
# passed as a single word.
SOURCE_DIR=$(dirname "${BASH_SOURCE[0]}")

# Locate the ADAM assembly jar via the sibling helper script.
ADAM_CLI_JAR=$("${SOURCE_DIR}/find-adam-assembly.sh")

# Locate the spark-shell launcher via the sibling helper script.
SPARK_SHELL=$("${SOURCE_DIR}/find-spark.sh" spark-shell)
echo "Using SPARK_SHELL=$SPARK_SHELL" 1>&2

# submit the job to Spark
"$SPARK_SHELL" \
    --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
    --conf spark.kryo.registrator=org.bdgenomics.adam.serialization.ADAMKryoRegistrator \
    --jars "${ADAM_CLI_JAR}" \
    "$@"
41 | 


--------------------------------------------------------------------------------
/bin/adamR:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Licensed to Big Data Genomics (BDG) under one
 4 | # or more contributor license agreements.  See the NOTICE file
 5 | # distributed with this work for additional information
 6 | # regarding copyright ownership.  The BDG licenses this file
 7 | # to you under the Apache License, Version 2.0 (the
 8 | # "License"); you may not use this file except in compliance
 9 | # with the License.  You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 
set -e

# Path expansions are quoted so that a directory containing whitespace is
# passed as a single word.
SOURCE_DIR=$(dirname "${BASH_SOURCE[0]}")

# Locate the ADAM assembly jar via the sibling helper script.
ADAM_CLI_JAR=$("${SOURCE_DIR}/find-adam-assembly.sh")

# Locate the sparkR launcher via the sibling helper script.
SPARKR=$("${SOURCE_DIR}/find-spark.sh" sparkR)
echo "Using SPARKR=$SPARKR" 1>&2

# submit the job to Spark
"$SPARKR" \
    --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
    --conf spark.kryo.registrator=org.bdgenomics.adam.serialization.ADAMKryoRegistrator \
    --jars "${ADAM_CLI_JAR}" \
    --driver-class-path "${ADAM_CLI_JAR}" \
    "$@"
36 | 
37 | 


--------------------------------------------------------------------------------
/bin/find-adam-assembly.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Licensed to Big Data Genomics (BDG) under one
 4 | # or more contributor license agreements.  See the NOTICE file
 5 | # distributed with this work for additional information
 6 | # regarding copyright ownership.  The BDG licenses this file
 7 | # to you under the Apache License, Version 2.0 (the
 8 | # "License"); you may not use this file except in compliance
 9 | # with the License.  You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 
set -e

# Path expansions are quoted so that a directory containing whitespace is
# passed as a single word.
SOURCE_DIR=$(dirname "${BASH_SOURCE[0]}")
. "${SOURCE_DIR}/find-adam-home"

# Find ADAM cli assembly jar: prefer $ADAM_HOME/repo, then $ADAM_HOME/jars,
# and fall back to the build output directory.
if [ -d "$ADAM_HOME/repo" ]; then
  ASSEMBLY_DIR="$ADAM_HOME/repo"
elif [ -d "$ADAM_HOME/jars" ]; then
  ASSEMBLY_DIR="$ADAM_HOME/jars"
else
  ASSEMBLY_DIR="$ADAM_HOME/adam-assembly/target"
fi

# List candidate jars, excluding javadoc and sources artifacts. `|| true`
# keeps `set -e` from aborting when grep matches nothing, so the friendlier
# error below is printed instead.
ASSEMBLY_JARS=$(ls -1 "$ASSEMBLY_DIR" | grep "^adam[0-9A-Za-z\.\_\-]*\.jar$" | grep -v javadoc | grep -v sources || true)
# Left unquoted on purpose: word splitting lets wc count the jar names.
num_jars=$(echo ${ASSEMBLY_JARS} | wc -w)

if [ "$num_jars" -eq "0" ]; then
  echo "Failed to find ADAM cli assembly in $ASSEMBLY_DIR." 1>&2
  echo "You need to build ADAM before running this program." 1>&2
  exit 1
fi

if [ "$num_jars" -gt "1" ]; then
  echo "Found multiple ADAM cli assembly jars in $ASSEMBLY_DIR:" 1>&2
  echo "$ASSEMBLY_JARS" 1>&2
  echo "Please remove all but one jar." 1>&2
  exit 1
fi

# Exactly one jar was found; print its full path on stdout for the caller.
echo "${ASSEMBLY_DIR}/${ASSEMBLY_JARS}"
52 | 


--------------------------------------------------------------------------------
/bin/find-adam-egg.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Licensed to Big Data Genomics (BDG) under one
 4 | # or more contributor license agreements.  See the NOTICE file
 5 | # distributed with this work for additional information
 6 | # regarding copyright ownership.  The BDG licenses this file
 7 | # to you under the Apache License, Version 2.0 (the
 8 | # "License"); you may not use this file except in compliance
 9 | # with the License.  You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 
set -e

# Path expansions are quoted so that a directory containing whitespace is
# passed as a single word.
SOURCE_DIR=$(dirname "${BASH_SOURCE[0]}")
. "${SOURCE_DIR}/find-adam-home"

# Find ADAM python egg: prefer $ADAM_HOME/repo, else the Python dist directory.
if [ -d "$ADAM_HOME/repo" ]; then
  DIST_DIR="$ADAM_HOME/repo"
else
  DIST_DIR="$ADAM_HOME/adam-python/dist"
fi

# The dot before "egg" is escaped so only a literal ".egg" suffix matches;
# unescaped it matched any character (for example "...Xegg"). `|| true`
# keeps `set -e` from aborting when grep matches nothing.
DIST_EGG=$(ls -1 "$DIST_DIR" | grep "^bdgenomics\.adam[0-9A-Za-z\.\_\-]*\.egg$" || true)
# Left unquoted on purpose: word splitting lets wc count the egg names.
num_egg=$(echo ${DIST_EGG} | wc -w)

if [ "$num_egg" -eq "0" ]; then
  echo "Failed to find ADAM egg in $DIST_DIR." 1>&2
  echo "You need to build ADAM before running this program." 1>&2
  exit 1
fi

if [ "$num_egg" -gt "1" ]; then
  echo "Found multiple ADAM eggs in $DIST_DIR:" 1>&2
  echo "$DIST_EGG" 1>&2
  echo "Please remove all but one egg." 1>&2
  exit 1
fi

# Exactly one egg was found; print its full path on stdout for the caller.
echo "${DIST_DIR}/${DIST_EGG}"
49 | 


--------------------------------------------------------------------------------
/bin/find-adam-home:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Licensed to Big Data Genomics (BDG) under one
 4 | # or more contributor license agreements.  See the NOTICE file
 5 | # distributed with this work for additional information
 6 | # regarding copyright ownership.  The BDG licenses this file
 7 | # to you under the Apache License, Version 2.0 (the
 8 | # "License"); you may not use this file except in compliance
 9 | # with the License.  You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 
set -e

# Attempts to find a proper value for ADAM_HOME. Should be included using "source" directive.

# Expected location of find_adam_home.py: the directory of $0. When this file
# is sourced, $0 is the invoking script, so the lookup is relative to that
# script's directory.
FIND_ADAM_HOME_PYTHON_SCRIPT="$(cd "$(dirname "$0")"; pwd)/find_adam_home.py"

# Short circuit if the user already has this set.
if [ ! -z "${ADAM_HOME}" ]; then
   true
elif [ ! -f "$FIND_ADAM_HOME_PYTHON_SCRIPT" ]; then
  # If we are not in the same directory as find_adam_home.py we are not pip installed so we don't
  # need to search the different Python directories for an ADAM installation.
  # Note also that, if the user has pip installed adam but is directly calling pyadam or
  # adam-submit in another directory we want to use that version of adam rather than the
  # pip installed version of adam.
  export ADAM_HOME="$(cd "$(dirname "$0")"/..; pwd)"
else
  # We are pip installed, use the Python script to resolve a reasonable ADAM_HOME
  # Default to standard python interpreter unless told otherwise
  if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then
     PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}"
  fi
  export ADAM_HOME=$($PYSPARK_DRIVER_PYTHON "$FIND_ADAM_HOME_PYTHON_SCRIPT")
fi
44 | 


--------------------------------------------------------------------------------
/bin/find-spark.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Licensed to Big Data Genomics (BDG) under one
 4 | # or more contributor license agreements.  See the NOTICE file
 5 | # distributed with this work for additional information
 6 | # regarding copyright ownership.  The BDG licenses this file
 7 | # to you under the Apache License, Version 2.0 (the
 8 | # "License"); you may not use this file except in compliance
 9 | # with the License.  You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 
set -e

# Name of the Spark launcher to locate; defaults to spark-submit.
SPARK_CMD=${1:-spark-submit}

# Find the requested Spark script: on PATH when SPARK_HOME is unset,
# otherwise under $SPARK_HOME/bin.
if [ -z "$SPARK_HOME" ]; then
  # `command -v` is the POSIX way to resolve a command and does not depend on
  # an external `which` binary. `|| echo` yields an empty result instead of
  # tripping `set -e` when the command is not found.
  SPARK_SUBMIT=$(command -v "${SPARK_CMD}" || echo)
else
  SPARK_SUBMIT="${SPARK_HOME}/bin/${SPARK_CMD}"
fi
if [ -z "$SPARK_SUBMIT" ]; then
  echo "SPARK_HOME not set and ${SPARK_CMD} not on PATH; Aborting." 1>&2
  exit 1
fi

# Quoted so a path containing whitespace is printed unchanged.
echo "${SPARK_SUBMIT}"
36 | 


--------------------------------------------------------------------------------
/bin/pyadam:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Licensed to Big Data Genomics (BDG) under one
 4 | # or more contributor license agreements.  See the NOTICE file
 5 | # distributed with this work for additional information
 6 | # regarding copyright ownership.  The BDG licenses this file
 7 | # to you under the Apache License, Version 2.0 (the
 8 | # "License"); you may not use this file except in compliance
 9 | # with the License.  You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 
set -e

# Path expansions are quoted so that a directory containing whitespace is
# passed as a single word.
SOURCE_DIR=$(dirname "${BASH_SOURCE[0]}")

# Locate the ADAM assembly jar and Python egg via the sibling helper scripts.
ADAM_CLI_JAR=$("${SOURCE_DIR}/find-adam-assembly.sh")
ADAM_EGG=$("${SOURCE_DIR}/find-adam-egg.sh")

# Locate the pyspark launcher via the sibling helper script.
PYSPARK=$("${SOURCE_DIR}/find-spark.sh" pyspark)
echo "Using PYSPARK=$PYSPARK" 1>&2

# submit the job to Spark
"$PYSPARK" \
    --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
    --conf spark.kryo.registrator=org.bdgenomics.adam.serialization.ADAMKryoRegistrator \
    --jars "${ADAM_CLI_JAR}" \
    --driver-class-path "${ADAM_CLI_JAR}" \
    --py-files "${ADAM_EGG}" \
    "$@"
38 | 


--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | _build


--------------------------------------------------------------------------------
/docs/_static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/_static/favicon.ico


--------------------------------------------------------------------------------
/docs/_static/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/_static/logo.png


--------------------------------------------------------------------------------
/docs/algorithms/dm.rst:
--------------------------------------------------------------------------------
 1 | Duplicate Marking Implementation
 2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 3 | 
 4 | Reads may be duplicated during sequencing, either due to clonal
 5 | duplication via PCR before sequencing, or due to optical duplication
 6 | while on the sequencer. To identify duplicated reads, we apply a
 7 | heuristic algorithm that looks at read fragments that have a consistent
 8 | mapping signature. First, we bucket together reads that are from the
 9 | same sequenced fragment by grouping reads together on the basis of read
10 | name and read group. Per read bucket, we then identify the 5' mapping
11 | positions of the primarily aligned reads. We mark as duplicates all read
12 | pairs that have the same pair alignment locations, and all unpaired
13 | reads that map to the same sites. Only the highest scoring read/read
14 | pair is kept, where the score is the sum of all quality scores in the
15 | read that are greater than 15.
16 | 


--------------------------------------------------------------------------------
/docs/algorithms/joins.rst:
--------------------------------------------------------------------------------
 1 | ShuffleRegionJoin Load Balancing
 2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 3 | 
 4 | ShuffleRegionJoins perform a sort-merge join on distributed genomic
 5 | data. The current standard for distributing genomic data is to use a
 6 | binning approach where ranges of genomic data are assigned to a
 7 | particular partition. This approach has a significant limitation that we
 8 | aim to solve: no matter how fine-grained the bins created, they can
 9 | never resolve extremely skewed data. ShuffleRegionJoin also requires
10 | that the data be sorted, so we carry the knowledge that the data are
11 | sorted through the join so we can reuse this knowledge downstream.
12 | 
13 | The first step in ShuffleRegionJoin is to sort and balance the data.
14 | This is done with a sampling method, and the data are sorted if they were
15 | not sorted previously. When we shuffle the data, we also store the region
16 | ranges for all the data on this partition. Storing these partition
17 | bounds allows us to copartition the right dataset by assigning all
18 | records to a partition if the record falls within the partition bounds.
19 | After the right data are colocated with the correct records in the left
20 | dataset, we perform the join locally on each partition.
21 | 
22 | Maintaining the sorted knowledge and partition bounds is extremely
23 | useful for downstream applications that can take advantage of sorted
24 | data. Subsequent joins, for example, will be much faster because the
25 | data are already relatively balanced and sorted. Additional set theory
26 | and aggregation primitives, such as counting nearby regions, grouping
27 | and clustering nearby regions, and finding the set difference will all
28 | benefit from the sorted knowledge because each of these primitives
29 | requires that the data be sorted first.
30 | 
31 | 


--------------------------------------------------------------------------------
/docs/api/img/join_examples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/api/img/join_examples.png


--------------------------------------------------------------------------------
/docs/api/img/join_rdds.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/api/img/join_rdds.png


--------------------------------------------------------------------------------
/docs/api/python.rst:
--------------------------------------------------------------------------------
1 | ADAM Python Documentation
2 | =========================
3 | 
4 | .. automodule:: bdgenomics.adam
5 | 


--------------------------------------------------------------------------------
/docs/architecture/evidence.rst:
--------------------------------------------------------------------------------
 1 | Interacting with data through ADAM's evidence access layer
 2 | ----------------------------------------------------------
 3 | 
 4 | ADAM exposes access to distributed datasets of genomic data through the
 5 | `ADAMContext <../api/adamContext.html>`__ entrypoint. The ADAMContext wraps Apache
 6 | Spark's SparkContext, which tracks the configuration and state of the
 7 | current running Spark application. On top of the SparkContext, the
 8 | ADAMContext provides data loading functions which yield
 9 | `GenomicDataset <../api/genomicDataset.html>`__\ s. The GenomicDataset classes provide a
10 | wrapper around Apache Spark's two APIs for manipulating distributed
11 | datasets: the legacy Resilient Distributed Dataset (Zaharia et al. 2012)
12 | and the new Spark SQL Dataset/DataFrame API (Armbrust et al. 2015).
13 | Additionally, the GenomicDataset is enriched with genomics-specific metadata
14 | such as computational lineage and sample metadata, and optimized
15 | genomics-specific query patterns such as `region joins <../api/joins.html>`__ and
16 | the `auto-parallelizing pipe API <../api/pipes.html>`__ for running legacy tools
17 | using Apache Spark.
18 | 
19 | .. figure:: img/grdd.png
20 |    :alt: The GenomicDataset Class Hierarchy
21 | 
22 |    The GenomicDataset Class Hierarchy
23 | 
24 | All GenomicDatasets include a sequence dictionary which describes the
25 | reference genome that the data in the genomic dataset are aligned to, if one is
26 | known. Additionally, the ReadGroupGenomicDataset stores a dictionary of the read
27 | groups that are attached to the reads/fragments. Similarly, the
28 | MultisampleGenomicDataset includes a list of the samples that are present in the
29 | dataset.
30 | 


--------------------------------------------------------------------------------
/docs/architecture/img/grdd.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/architecture/img/grdd.pdf


--------------------------------------------------------------------------------
/docs/architecture/img/grdd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/architecture/img/grdd.png


--------------------------------------------------------------------------------
/docs/architecture/img/stack-model.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/architecture/img/stack-model.pdf


--------------------------------------------------------------------------------
/docs/architecture/img/stack-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/architecture/img/stack-model.png


--------------------------------------------------------------------------------
/docs/architecture/schemas.rst:
--------------------------------------------------------------------------------
 1 | The bdg-formats schemas
 2 | -----------------------
 3 | 
 4 | The schemas that comprise ADAM's narrow waist are defined in the
 5 | `bdg-formats <https://github.com/bigdatagenomics/bdg-formats>`__
 6 | project, using the `Apache Avro <https://avro.apache.org>`__ schema
 7 | description language. This schema definition language automatically
 8 | generates implementations of this schema for multiple common languages,
 9 | including Java, C, C++, and Python. bdg-formats contains several core
10 | schemas:
11 | 
12 | -  The *Alignment* schema represents a genomic read, along with
13 |    that read's alignment to a reference genome, if available.
14 | -  The *Feature* schema represents a generic genomic feature. This
15 |    record can be used to tag a region of the genome with an annotation,
16 |    such as coverage observed over that region, or the coordinates of an
17 |    exon.
18 | -  The *Fragment* schema represents a set of read alignments that came
19 |    from a single sequenced fragment.
20 | -  The *Genotype* schema represents a genotype call, along with
21 |    annotations about the quality/read support of the called genotype.
22 | -  The *Sequence* and *Slice* schemas represent sequences and slices of
23 |    sequences, respectively.
24 | -  The *Variant* schema represents a sequence variant, along with
25 |    statistics about that variant's support across a group of samples,
26 |    and annotations about the effect of the variant.
27 | 
28 | The bdg-formats schemas are designed so that common fields are easy to
29 | query, while maintaining extensibility and the ability to interoperate
30 | with common genomics file formats. Where necessary, the bdg-formats
31 | schemas are nested, which allows for the description of complex nested
32 | features and groupings (such as the Fragment record, which groups
33 | together Alignments). All fields in the bdg-formats schemas are
34 | nullable, and the schemas themselves do not contain invariants around
35 | valid values for a field. Instead, we validate data on ingress and
36 | egress to/from a conventional genomic file format. This allows users to
37 | take advantage of features such as field projection, which can improve
38 | the performance of queries like `flagstat <#flagstat>`__ by an order of
39 | magnitude.
40 | 


--------------------------------------------------------------------------------
/docs/benchmarks/img/bam.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/bam.pdf


--------------------------------------------------------------------------------
/docs/benchmarks/img/bam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/bam.png


--------------------------------------------------------------------------------
/docs/benchmarks/img/bed.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/bed.pdf


--------------------------------------------------------------------------------
/docs/benchmarks/img/bed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/bed.png


--------------------------------------------------------------------------------
/docs/benchmarks/img/gff.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/gff.pdf


--------------------------------------------------------------------------------
/docs/benchmarks/img/gff.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/gff.png


--------------------------------------------------------------------------------
/docs/benchmarks/img/plot-speedup.py:
--------------------------------------------------------------------------------
 1 | from pylab import loglog, ylabel, xlabel, title, grid, savefig, show, legend, xticks, yticks, figure, xlim, ylim
 2 | 
 3 | def setup(n, st):
 4 |     exp_n = [1.0 / st, (n [-1] / n [0]) / st]
 5 |     l_n = [n [0], n [-1]]
 6 | 
 7 |     figure ()
 8 |     loglog (l_n, exp_n, 'k-', basex=2, basey=2, label="Ideal Speedup")
 9 | 
10 | def plot (n, mt, label_name, pattern):
11 | 
12 |     speedup = []
13 |     
14 |     for m in mt:
15 |         
16 |         speedup.append (1.0 / m)
17 | 
18 |     loglog (n, speedup, pattern, basex=2, basey=2, label=label_name)
19 | 
20 | def label(name, t, lloc=2):
21 |     locs,labels = xticks()
22 |     xn = ["", "32", "64", "128", "256", "512", "1024", ""]
23 |     xticks(locs, xn)
24 | 
25 |     yn = ["", "32K", "16K", "8K", "4K", "2K", "1K", "500", ""]
26 |     locs,labels = yticks()
27 |     yticks(locs, yn)
28 | 
29 |     ylabel ("Runtime (seconds)")
30 |     xlabel ("Number of Threads")
31 |     legend (loc=lloc)
32 |     title (t)
33 |     grid (True)
34 |     savefig (name)
35 | 
36 | n_ideal = [32, 1024]
37 | 
38 | n = [32, 128, 256, 512, 1024]
39 | markdup = [16639.22, 4438.37, 2005.25, 1247.36, 844.03]
40 | frag_md = [8249.56, 2594.44, 1409.86, 868.19, 529.19]
41 | gatk_md = [17068.58, 4036.25, 1737.97, 991.62, 589.37]
42 | bqsr = [27034.11, 7461.35, 4663.84, 2977.69, 2108.43]
43 | gatk_bqsr = [(28232.96 + 2931.97), (8473.64 + 1312.90), (5578.24 + 732.01), (3465.61 + 551.55), (2410.03 + 487.16)]
44 | ir = [23808.67, 6476.63, 3507.99, 2407.57, 1242.10]
45 | 
46 | setup(n_ideal, frag_md[0])
47 | 
48 | plot(n, markdup, 'ADAM Mark Duplicates', 'bx-')
49 | plot(n, frag_md, 'ADAM Fragments Mark Duplicates', 'bo--')
50 | plot(n, gatk_md, 'GATK4 Mark Duplicates', 'c.--')
51 | 
52 | label("speedup-md.pdf",
53 |       "Duplicate Marking Speedup on NA12878 (High Coverage)", lloc=4)
54 | 
55 | setup(n_ideal, bqsr[0])
56 | 
57 | plot(n, bqsr, 'ADAM BQSR', 'bx-')
58 | plot(n, gatk_bqsr, 'GATK4 BQSR', 'c.--')
59 | 
60 | label("speedup-bqsr.pdf",
61 |       "Base Recalibration Speedup on NA12878 (High Coverage)")
62 | 
63 | setup(n_ideal, ir[0])
64 | 
65 | plot(n, ir, 'INDEL Realignment', 'bx-')
66 | 
67 | label("speedup-ir.pdf",
68 |       "INDEL Realignment Speedup on NA12878 (High Coverage)")
69 | 


--------------------------------------------------------------------------------
/docs/benchmarks/img/speedup-bqsr.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/speedup-bqsr.pdf


--------------------------------------------------------------------------------
/docs/benchmarks/img/speedup-bqsr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/speedup-bqsr.png


--------------------------------------------------------------------------------
/docs/benchmarks/img/speedup-ir.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/speedup-ir.pdf


--------------------------------------------------------------------------------
/docs/benchmarks/img/speedup-ir.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/speedup-ir.png


--------------------------------------------------------------------------------
/docs/benchmarks/img/speedup-md.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/speedup-md.pdf


--------------------------------------------------------------------------------
/docs/benchmarks/img/speedup-md.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/speedup-md.png


--------------------------------------------------------------------------------
/docs/benchmarks/img/vcf.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/vcf.pdf


--------------------------------------------------------------------------------
/docs/benchmarks/img/vcf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/benchmarks/img/vcf.png


--------------------------------------------------------------------------------
/docs/citing.rst:
--------------------------------------------------------------------------------
 1 | Citing ADAM
 2 | ===========
 3 | 
 4 | ADAM has been described in two manuscripts. The first,
 5 | `a tech report <https://www2.eecs.berkeley.edu/Pubs/TechRpts/2013/EECS-2013-207.pdf>`__,
 6 | came out in 2013 and described the rationale behind using schemas for genomics,
 7 | and presented an early implementation of some of the preprocessing algorithms.
 8 | 
 9 | To cite this paper, please cite::
10 | 
11 |   @techreport{massie13,
12 |      title={{ADAM}: Genomics Formats and Processing Patterns for Cloud Scale Computing},
13 |      author={Massie, Matt and Nothaft, Frank and Hartl, Christopher and Kozanitis, Christos and Schumacher, Andr{\'e} and Joseph, Anthony D and Patterson, David A},
14 |      year={2013},
15 |      institution={UCB/EECS-2013-207, EECS Department, University of California, Berkeley}
16 |    }
17 | 
18 | 
19 | The second,
20 | `a conference paper <http://dl.acm.org/ft_gateway.cfm?ftid=1586788&id=2742787>`__,
21 | appeared in the SIGMOD 2015 Industrial Track. This paper described how ADAM's
22 | design was influenced by database systems, expanded upon the concept of a stack
23 | architecture for scientific analyses, presented more results comparing ADAM to
24 | state-of-the-art single node genomics tools, and demonstrated how the
25 | architecture generalized beyond genomics.
26 | 
27 | To cite this paper, please cite::
28 | 
29 |   @inproceedings{nothaft15,
30 |      title={Rethinking Data-Intensive Science Using Scalable Analytics Systems},
31 |      author={Nothaft, Frank A and Massie, Matt and Danford, Timothy and Zhang, Zhao and Laserson, Uri and Yeksigian, Carl and Kottalam, Jey and Ahuja, Arun and Hammerbacher, Jeff and Linderman, Michael and Franklin, Michael and Joseph, Anthony D. and Patterson, David A.},
32 |      booktitle={Proceedings of the 2015 International Conference on Management of Data (SIGMOD '15)},
33 |      year={2015},
34 |      organization={ACM}
35 |    }
36 | 
37 | 
38 | We prefer that you cite both papers, but if you can only cite one paper, we
39 | prefer that you cite the SIGMOD 2015 manuscript.
40 | 


--------------------------------------------------------------------------------
/docs/downstream/overview.rst:
--------------------------------------------------------------------------------
 1 | Building Downstream Applications
 2 | ================================
 3 | 
 4 | ADAM is packaged so that it can be used interactively via the ADAM
 5 | shell, called from the command line interface (CLI), or included as a
 6 | library when building downstream applications.
 7 | 
 8 | This document covers three patterns for building applications downstream
 9 | of ADAM:
10 | 
11 | -  Extend the ADAM CLI by `adding new commands <cli.html#extend-the-adam-cli-by-adding-new-commands>`__
12 | -  Extend the ADAM CLI by `adding new commands in an external repository <cli.html#extend-the-adam-cli-by-adding-new-commands-in-an-external-repository>`__
13 | -  Use ADAM as a `library in new applications <library.html#use-adam-as-a-library-in-new-applications>`__
14 | 


--------------------------------------------------------------------------------
/docs/img/bdgenomics-stack.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/img/bdgenomics-stack.key


--------------------------------------------------------------------------------
/docs/img/bdgenomics-stack.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/img/bdgenomics-stack.png


--------------------------------------------------------------------------------
/docs/img/stack-model.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdatagenomics/adam/6f2aec83d2289f48104801a88c326b894ec7df2c/docs/img/stack-model.ai


--------------------------------------------------------------------------------
/docs/installation/example.rst:
--------------------------------------------------------------------------------
 1 | Running an example command
 2 | ==========================
 3 | 
 4 | flagstat
 5 | --------
 6 | 
 7 | Once you have data converted to ADAM, you can gather statistics from the
 8 | ADAM file using flagstat_. This command will output
 9 | stats identically to the samtools ``flagstat`` command.
10 | 
11 | .. code:: bash
12 | 
13 |     adam-submit flagstat NA12878_chr20.adam
14 | 
15 | Outputs:
16 | 
17 | ::
18 | 
19 |     51554029 + 0 in total (QC-passed reads + QC-failed reads)
20 |     0 + 0 duplicates
21 |     50849935 + 0 mapped (98.63%:0.00%)
22 |     51554029 + 0 paired in sequencing
23 |     25778679 + 0 read1
24 |     25775350 + 0 read2
25 |     49874394 + 0 properly paired (96.74%:0.00%)
26 |     50145841 + 0 with itself and mate mapped
27 |     704094 + 0 singletons (1.37%:0.00%)
28 |     158721 + 0 with mate mapped to a different chr
29 |     105812 + 0 with mate mapped to a different chr (mapQ>=5)
30 | 
31 | In practice, you will find that the ADAM ``flagstat`` command takes
32 | orders of magnitude less time than samtools to compute these statistics.
33 | For example, on a MacBook Pro, the command above took 17 seconds to run
34 | while ``samtools flagstat NA12878_chr20.bam`` took 55 seconds. On larger
35 | files, the difference in speed is even more dramatic. ADAM is faster
36 | because it is multi-threaded, distributed and uses a columnar storage
37 | format (with a projected schema that only materializes the read flags
38 | instead of the whole read).
39 | 
40 | Running on a cluster
41 | --------------------
42 | 
43 | We provide the ``adam-submit`` and ``adam-shell`` commands under the
44 | ``bin`` directory. These can be used to submit ADAM jobs to a Spark
45 | cluster, or to run ADAM interactively.
46 | 
47 | 


--------------------------------------------------------------------------------
/docs/installation/pip.rst:
--------------------------------------------------------------------------------
 1 | Installing ADAM using Pip
 2 | =========================
 3 | 
 4 | ADAM is available through the `Python Package Index`_ and thus can be installed
 5 | using pip. To install ADAM using pip, run:
 6 | 
 7 | .. code:: bash
 8 | 
 9 |     pip install bdgenomics.adam
10 | 
11 | Pip will install the bdgenomics.adam Python binding, as well as the ADAM CLI.
12 | 
13 | .. _Python Package Index: https://pypi.python.org/pypi
14 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx==1.7.7
2 | 


--------------------------------------------------------------------------------
/scripts/move_to_scala_2.11.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Rewrites every pom.xml under the current directory from Scala 2.12 to
 4 | # Scala 2.11. Exits 1 without editing anything if the <scala.version> line
 5 | # of the pom.xml in the current directory already mentions 2.11.
 6 | 
 7 | set +x
 8 | 
 9 | # guard against running the conversion twice
10 | if grep "<scala\.version>" pom.xml | grep -q "2\.11"
11 | then
12 |     echo "Scala version is already set to 2.11 (Scala artifacts have _2.11 version suffix in artifact name)."
13 |     echo "Cowardly refusing to move to Scala 2.11 a second time..."
14 | 
15 |     exit 1
16 | fi
17 | 
18 | # Dots in the patterns are escaped so that only literal version strings are
19 | # rewritten; an unescaped "2.12" would also match text such as "2012".
20 | # sed -i writes a .2.11.bak backup next to each pom it edits.
21 | find . -name "pom.xml" -exec sed -e "s/2\.12\.10/2.11.12/g" \
22 |     -e "s/2\.12/2.11/g" \
23 |     -i.2.11.bak '{}' \;
24 | # remove the backups written by sed above
25 | find . -name "*.2.11.*bak" -exec rm -f {} \;
26 | 


--------------------------------------------------------------------------------
/scripts/move_to_scala_2.12.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Rewrites every pom.xml under the current directory from Scala 2.11 to
 4 | # Scala 2.12. Exits 1 without editing anything if the <scala.version> line
 5 | # of the pom.xml in the current directory already mentions 2.12.
 6 | 
 7 | set +x
 8 | 
 9 | # guard against running the conversion twice
10 | if grep "<scala\.version>" pom.xml | grep -q "2\.12"
11 | then
12 |     echo "Scala version is already set to 2.12 (Scala artifacts have _2.12 version suffix in artifact name)."
13 |     echo "Cowardly refusing to move to Scala 2.12 a second time..."
14 | 
15 |     exit 1
16 | fi
17 | 
18 | # Dots in the patterns are escaped so that only literal version strings are
19 | # rewritten; an unescaped "2.11" would also match text such as "2011".
20 | # sed -i writes a .2.12.bak backup next to each pom it edits.
21 | find . -name "pom.xml" -exec sed -e "s/2\.11\.12/2.12.10/g" \
22 |     -e "s/2\.11/2.12/g" \
23 |     -i.2.12.bak '{}' \;
24 | # remove the backups written by sed above
25 | find . -name "*.2.12.*bak" -exec rm -f {} \;
26 | 


--------------------------------------------------------------------------------
/scripts/move_to_spark_2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Rewrites every pom.xml under the current directory from the Spark 3
 4 | # artifact suffixes and version to the Spark 2 ones. Exits 1 without editing
 5 | # anything if the pom.xml in the current directory already mentions "spark2".
 6 | # sed leaves a .spark2.bak backup next to each file it edits.
 7 | 
 8 | set +x
 9 | 
10 | # guard against running the conversion twice
11 | if grep -q "spark2" pom.xml
12 | then
13 |     echo "POM is already set up for Spark 2 (Spark 2 artifacts have -spark2 suffix in artifact names)."
14 |     echo "Cowardly refusing to move to Spark 2 a second time..."
15 | 
16 |     exit 1
17 | fi
18 | 
19 | # the backslash keeps the shell from expanding this; sed receives the
20 | # literal text ${scala.version.prefix}
21 | svp="\${scala.version.prefix}"
22 | substitution_cmd="s/-spark3_$svp/-spark2_$svp/g"
23 | 
24 | # Artifact suffixes are rewritten only on lines containing adam- or utils-;
25 | # the version number only on lines containing spark.version. Dots in the
26 | # version patterns are escaped so they match literally.
27 | find . -name "pom.xml" -exec sed \
28 |     -e "/adam-/ s/-spark3_2\.1/-spark2_2\.1/" \
29 |     -e "/adam-/ $substitution_cmd" \
30 |     -e "/utils-/ s/-spark3_2\.1/-spark2_2\.1/" \
31 |     -e "/utils-/ $substitution_cmd" \
32 |     -e "/spark\.version/ s/3\.1\.2/2.4.7/g" \
33 |     -i.spark2.bak '{}' \;
34 | 


--------------------------------------------------------------------------------
/scripts/move_to_spark_3.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Rewrites every pom.xml under the current directory from the Spark 2
 4 | # artifact suffixes and version to the Spark 3 ones. Exits 1 without editing
 5 | # anything if the pom.xml in the current directory already mentions "spark3".
 6 | # sed leaves a .spark3.bak backup next to each file it edits.
 7 | 
 8 | set +x
 9 | 
10 | # guard against running the conversion twice
11 | if grep -q "spark3" pom.xml
12 | then
13 |     echo "POM is already set up for Spark 3 (Spark 3 artifacts have -spark3 suffix in artifact names)."
14 |     echo "Cowardly refusing to move to Spark 3 a second time..."
15 | 
16 |     exit 1
17 | fi
18 | 
19 | # the backslash keeps the shell from expanding this; sed receives the
20 | # literal text ${scala.version.prefix}
21 | svp="\${scala.version.prefix}"
22 | substitution_cmd="s/-spark2_$svp/-spark3_$svp/g"
23 | 
24 | # Artifact suffixes are rewritten only on lines containing adam- or utils-;
25 | # the version number only on lines containing spark.version. Dots in the
26 | # version patterns are escaped so they match literally.
27 | find . -name "pom.xml" -exec sed \
28 |     -e "/adam-/ s/-spark2_2\.1/-spark3_2\.1/" \
29 |     -e "/adam-/ $substitution_cmd" \
30 |     -e "/utils-/ s/-spark2_2\.1/-spark3_2\.1/" \
31 |     -e "/utils-/ $substitution_cmd" \
32 |     -e "/spark\.version/ s/2\.4\.7/3.1.2/g" \
33 |     -i.spark3.bak '{}' \;
34 | 


--------------------------------------------------------------------------------
/scripts/release.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #
 3 | # Release driver. Updates the changelog and the R package version on the
 4 | # current branch and pushes it, then runs the Maven release plugin for the
 5 | # Spark 3 / Scala 2.12 build on a new maint_spark3_2.12-<release> branch.
 6 | # When started from master, master is afterwards set to the development
 7 | # version and pushed.
 8 | 
 9 | # do we have enough arguments?
10 | # -lt compares numerically; a "<" inside [ ] is parsed by the shell as an
11 | # input redirection from a file named after the right-hand operand
12 | if [ $# -lt 3 ]; then
13 |     echo "Usage:"
14 |     echo
15 |     echo "./release.sh <release version> <development version> <milestone id>"
16 |     exit 1
17 | fi
18 | 
19 | # pick arguments
20 | release=$1
21 | devel=$2
22 | milestone=$3
23 | 
24 | # get current branch; rev-parse keeps branch names that contain dots intact
25 | branch=$(git rev-parse --abbrev-ref HEAD)
26 | 
27 | # update changelog per Github milestone
28 | mvn com.github.heuermh.maven.plugin.changes:github-changes-maven-plugin:1.2:github-changes -DmilestoneId=${milestone}
29 | git commit -a -m "Modifying changelog."
30 | 
31 | # update R version
32 | sed -i -e "s/Version: [0-9.]*/Version: ${release}/g" adam-r/bdgenomics.adam/DESCRIPTION
33 | git commit -a -m "Bumping R version to ${release}."
34 | 
35 | commit=$(git log --pretty=format:"%H" | head -n 1)
36 | echo "releasing from ${commit} on branch ${branch}"
37 | 
38 | git push origin "${branch}"
39 | 
40 | # do spark 3, scala 2.12 release
41 | git checkout -b "maint_spark3_2.12-${release}" "${branch}"
42 | 
43 | mvn --batch-mode \
44 |   -P distribution \
45 |   -Dresume=false \
46 |   -Dtag=adam-parent-spark3_2.12-${release} \
47 |   -DreleaseVersion=${release} \
48 |   -DdevelopmentVersion=${devel} \
49 |   -DbranchName=adam-spark3_2.12-${release} \
50 |   release:clean \
51 |   release:prepare \
52 |   release:perform
53 | 
54 | if [ $? != 0 ]; then
55 |   echo "Releasing Spark 3, Scala 2.12 version failed."
56 |   exit 1
57 | fi
58 | 
59 | # quoted so the test stays well-formed even if the branch name is empty
60 | if [ "$branch" = "master" ]; then
61 |   # if original branch was master, update versions on original branch
62 |   git checkout "${branch}"
63 |   mvn versions:set -DnewVersion=${devel} \
64 |     -DgenerateBackupPoms=false
65 |   git commit -a -m "Modifying pom.xml files for new development after ${release} release."
66 |   git push origin "${branch}"
67 | fi
68 | 


--------------------------------------------------------------------------------