├── .clang-format ├── .dir-locals.el ├── .gitignore ├── .travis.yml ├── CODING_STYLE.txt ├── LICENSE.txt ├── Makefile ├── README.md ├── __init__.py ├── alignmentDepth ├── Makefile └── halAlignmentDepth.cpp ├── analysis ├── Makefile ├── __init__.py ├── constraintTurnover │ ├── __init__.py │ └── turnoverModel.py ├── halContiguousRegions.py ├── halContiguousRegionsTest.py ├── neutralIndel │ ├── __init__.py │ ├── backgroundRate.py │ ├── bedConservation.py │ ├── bedHistogram.py │ ├── bedMutations.py │ ├── estimateTurnoverParams.py │ ├── getBedLength.py │ ├── halTreeNIBackground.py │ ├── halTreeNIConservation.py │ ├── halTreeNITurnover.py │ └── turnoverRate.py └── syntenyRates.py ├── api ├── Makefile ├── __init__.py ├── doc │ ├── doxy.cfg │ └── naming-conventions.txt ├── hdf5_impl │ ├── hdf5Alignment.cpp │ ├── hdf5Alignment.h │ ├── hdf5BottomSegment.cpp │ ├── hdf5BottomSegment.h │ ├── hdf5Common.h │ ├── hdf5DnaArray.h │ ├── hdf5DnaDriver.cpp │ ├── hdf5DnaDriver.h │ ├── hdf5ExternalArray.cpp │ ├── hdf5ExternalArray.h │ ├── hdf5Genome.cpp │ ├── hdf5Genome.h │ ├── hdf5MetaData.cpp │ ├── hdf5MetaData.h │ ├── hdf5Sequence.cpp │ ├── hdf5Sequence.h │ ├── hdf5SequenceIterator.cpp │ ├── hdf5SequenceIterator.h │ ├── hdf5TopSegment.cpp │ ├── hdf5TopSegment.h │ ├── hdf5UDCFuseDriver.cpp │ └── hdf5UDCFuseDriver.h ├── hdf5_tests │ ├── allTests.cpp │ ├── allTests.h │ ├── hdf5DnaTypeTest.cpp │ ├── hdf5ExternalArrayTest.cpp │ ├── hdf5SegmentTypeTest.cpp │ ├── hdf5SequenceTypeTest.cpp │ ├── hdf5Test.cpp │ └── hdf5Test.h ├── impl │ ├── halAlignmentInstance.cpp │ ├── halBottomSegment.cpp │ ├── halBottomSegmentIterator.cpp │ ├── halCLParser.cpp │ ├── halColumnIterator.cpp │ ├── halCommon.cpp │ ├── halGappedBottomSegmentIterator.cpp │ ├── halGappedTopSegmentIterator.cpp │ ├── halGenome.cpp │ ├── halMappedSegment.cpp │ ├── halPositionCache.cpp │ ├── halRearrangement.cpp │ ├── halSegment.cpp │ ├── halSegmentIterator.cpp │ ├── halSegmentMapper.cpp │ ├── halSequence.cpp │ ├── halTopSegment.cpp │ 
├── halTopSegmentIterator.cpp │ ├── halValidate.cpp │ └── udc2.c ├── inc │ ├── hal.h │ ├── halAlignment.h │ ├── halAlignmentInstance.h │ ├── halBottomSegment.h │ ├── halBottomSegmentIterator.h │ ├── halCLParser.h │ ├── halColumnIterator.h │ ├── halColumnIteratorStack.h │ ├── halCommon.h │ ├── halDefs.h │ ├── halDnaDriver.h │ ├── halDnaIterator.h │ ├── halGappedBottomSegmentIterator.h │ ├── halGappedSegmentIterator.h │ ├── halGappedTopSegmentIterator.h │ ├── halGenome.h │ ├── halMappedSegment.h │ ├── halMappedSegmentContainers.h │ ├── halMetaData.h │ ├── halPositionCache.h │ ├── halRearrangement.h │ ├── halSegment.h │ ├── halSegmentIterator.h │ ├── halSegmentMapper.h │ ├── halSegmentedSequence.h │ ├── halSequence.h │ ├── halSequenceIterator.h │ ├── halSlicedSegment.h │ ├── halTopSegment.h │ ├── halTopSegmentIterator.h │ ├── halValidate.h │ └── udc2.h ├── mmap_impl │ ├── mmapAlignment.cpp │ ├── mmapAlignment.h │ ├── mmapArray.h │ ├── mmapBottomSegment.cpp │ ├── mmapBottomSegment.h │ ├── mmapBottomSegmentData.h │ ├── mmapDnaDriver.cpp │ ├── mmapDnaDriver.h │ ├── mmapFile.cpp │ ├── mmapFile.h │ ├── mmapGenome.cpp │ ├── mmapGenome.h │ ├── mmapGenomeSiteMap.cpp │ ├── mmapGenomeSiteMap.h │ ├── mmapMetaData.h │ ├── mmapPerfectHashTable.cpp │ ├── mmapPerfectHashTable.h │ ├── mmapPhf.cpp │ ├── mmapPhf.h │ ├── mmapPhf.md │ ├── mmapRbTree.cpp │ ├── mmapRbTree.h │ ├── mmapSequence.cpp │ ├── mmapSequence.h │ ├── mmapSequenceData.h │ ├── mmapSequenceIterator.h │ ├── mmapString.h │ ├── mmapTopSegment.cpp │ ├── mmapTopSegment.h │ └── mmapTopSegmentData.h └── tests │ ├── halAlignmentTreesTest.cpp │ ├── halApiTestSupport.cpp │ ├── halApiTestSupport.h │ ├── halBottomSegmentTest.cpp │ ├── halColumnIteratorTest.cpp │ ├── halGappedSegmentIteratorTest.cpp │ ├── halGenomeTest.cpp │ ├── halMappedSegmentTest.cpp │ ├── halMetaDataTest.cpp │ ├── halRandNumberGen.h │ ├── halRandomData.cpp │ ├── halRandomData.h │ ├── halRearrangementTest.cpp │ ├── halSegmentTestSupport.h │ ├── 
halSequenceTest.cpp │ ├── halTopSegmentTest.cpp │ ├── halValidateTest.cpp │ └── udc2Test.c ├── assemblyHub ├── Makefile ├── README.md ├── __init__.py ├── alignabilityTrack.py ├── assemblyHubCommon.py ├── bedCommon.py ├── bedTrack.py ├── conservationTrack.py ├── docs │ ├── __init__.py │ ├── alignabilityDocs.py │ ├── conservationDocs.py │ ├── gcPercentDocs.py │ ├── hubCentralDocs.py │ ├── makeDocs.py │ └── repeatMaskerDocs.py ├── gcPercentTrack.py ├── groupExclusiveRegions.py ├── hal2assemblyHub.py ├── hal2assemblyHubDoc.pdf ├── halGenerateComparisonHub.py ├── prepareHubFiles.py ├── prepareLodFiles.py ├── rmskTrack.py ├── snakeTrack.py ├── treeCommon.py └── wigTrack.py ├── benchmarks ├── benchMark.py ├── mafMutations.cpp ├── results │ └── .gitignore └── runAndGetResources.py ├── blockViz ├── Makefile ├── __init__.py ├── impl │ ├── hal2chain.cpp │ └── halBlockViz.cpp ├── inc │ └── halBlockViz.h └── tests │ ├── blockVizBed.cpp │ ├── blockVizBenchmark.py │ ├── blockVizMaf.cpp │ ├── blockVizTest.cpp │ ├── expected │ ├── blockVizHdf5Tests.out │ └── blockVizMmapTests.out │ └── timing.sh ├── doc └── to-do.org ├── extra ├── dotplot │ ├── README.md │ ├── example.png │ ├── plotDotplot.R │ └── runDotplot.py └── insertionStats │ ├── getInsertionStats.py │ └── plotInsertionStats.R ├── extract ├── Makefile ├── __init__.py ├── impl │ ├── hal4dExtract.cpp │ ├── hal4dExtractMain.cpp │ ├── halAlignedExtract.cpp │ ├── halExtract.cpp │ ├── halMaskExtractMain.cpp │ ├── halMaskExtractor.cpp │ └── halSingleCopyRegionsExtract.cpp ├── inc │ ├── hal4dExtract.h │ └── halMaskExtractor.h └── tests │ ├── hal4dExtractTest.cpp │ └── input │ └── small.mmap1.0.hal.bz2 ├── fasta ├── Makefile └── hal2fasta.cpp ├── include.mk ├── liftover ├── Makefile ├── halLiftoverStatus.py ├── impl │ ├── halBedLine.cpp │ ├── halBedScanner.cpp │ ├── halBlockLiftover.cpp │ ├── halBlockMapper.cpp │ ├── halColumnLiftover.cpp │ ├── halLiftover.cpp │ ├── halLiftoverMain.cpp │ ├── halWiggleLiftover.cpp │ ├── 
halWiggleLiftoverMain.cpp │ ├── halWiggleLoader.cpp │ └── halWiggleScanner.cpp ├── inc │ ├── halBedLine.h │ ├── halBedScanner.h │ ├── halBlockLiftover.h │ ├── halBlockMapper.h │ ├── halColumnLiftover.h │ ├── halLiftover.h │ ├── halWiggleLiftover.h │ ├── halWiggleLoader.h │ ├── halWiggleScanner.h │ └── halWiggleTiles.h └── tests │ ├── expected │ ├── halLiftoverBed12ExtraTest.bed │ ├── halLiftoverBed12Test.bed │ ├── halLiftoverBed3Test.bed │ ├── halLiftoverBed4ExtraTest.bed │ ├── halLiftoverPsl12Test.psl │ └── halLiftoverPsl3Test.psl │ ├── halLiftoverTests.cpp │ ├── halLiftoverTests.h │ └── input │ ├── test1.bed12 │ ├── test1.bed12+2 │ ├── test1.bed3 │ └── test1.bed4+2 ├── lod ├── Makefile ├── __init__.py ├── halLodBenchmark.py ├── halLodInterpolate.py ├── impl │ ├── halLodBlock.cpp │ ├── halLodExtract.cpp │ ├── halLodExtractMain.cpp │ ├── halLodGraph.cpp │ ├── halLodManager.cpp │ └── halLodSegment.cpp └── inc │ ├── halLodBlock.h │ ├── halLodExtract.h │ ├── halLodGraph.h │ ├── halLodManager.h │ └── halLodSegment.h ├── maf ├── Makefile ├── __init__.py ├── hal2mafMP.py ├── impl │ ├── hal2maf.cpp │ ├── halMafBed.cpp │ ├── halMafBlock.cpp │ ├── halMafExport.cpp │ ├── halMafScanDimensions.cpp │ ├── halMafScanReference.cpp │ ├── halMafScanner.cpp │ ├── halMafWriteGenomes.cpp │ ├── maf2hal.cpp │ └── naiveLiftUp.py ├── inc │ ├── halMafBed.h │ ├── halMafBlock.h │ ├── halMafExport.h │ ├── halMafScanDimensions.h │ ├── halMafScanReference.h │ ├── halMafScanner.h │ └── halMafWriteGenomes.h └── tests │ ├── expected │ ├── hal2mafMPBySeqTest_Genome_0_seq.maf │ ├── hal2mafMPRefTargetsGenomesTest.maf │ ├── hal2mafMPTargetGenomesTest.maf │ ├── hal2mafSeqPartTest.maf │ ├── hal2mafSeqTest.maf │ └── hal2mafSmallTest.maf │ ├── halMafBlockTest.cpp │ ├── halMafBlockTest.h │ ├── halMafExportTest.cpp │ ├── halMafTests.cpp │ ├── halMafTests.h │ └── input │ └── small-Genome_0.bed ├── modify ├── Makefile ├── ancestorsML.cpp ├── ancestorsML.h ├── ancestorsMLBed.cpp ├── ancestorsMLBed.h ├── 
ancestorsMLMP.py ├── ancestorsMLMain.cpp ├── ancestorsMLTest.cpp ├── findRegionsExclusivelyInGroup.cpp ├── halAddToBranch.cpp ├── halAppendSubtree.cpp ├── halRemoveGenome.cpp ├── halRemoveSubtree.cpp ├── halRenameGenomes.cpp ├── halRenameSequences.cpp ├── halReplaceGenome.cpp ├── halSetMetadata.cpp ├── halUpdateBranchLengths.cpp ├── halWriteNucleotides.cpp ├── markAncestors.cpp ├── markAncestors.h ├── renameFile.cpp └── renameFile.h ├── mutations ├── Makefile ├── __init__.py ├── impl │ ├── __init__.py │ ├── halBranchMutations.cpp │ ├── halBranchMutationsMain.cpp │ ├── halIndels.cpp │ ├── halMutationsStats.cpp │ ├── halSnps.cpp │ ├── halSummarizeMutations.cpp │ ├── halSummarizeMutationsMain.cpp │ └── halTreeMutations.py └── inc │ ├── halAverage.h │ ├── halBranchMutations.h │ ├── halMutationsStats.h │ └── halSummarizeMutations.h ├── paf ├── Makefile ├── hal2paf.cpp └── tests │ ├── expected │ ├── hal2pafMouseRatTest.paf.gz │ └── hal2pafSmallMMapTest.paf.gz │ └── input │ └── mr.hal ├── phyloP ├── Makefile ├── halPhyloPMP.py ├── halPhyloPTrain.py ├── halTreePhyloP.py ├── impl │ ├── halPhyloP.cpp │ ├── halPhyloPBed.cpp │ └── halPhyloPMain.cpp ├── inc │ ├── halPhyloP.h │ └── halPhyloPBed.h └── test │ ├── blanchette.hal │ ├── blanchette.mod │ └── test.sh ├── randgen ├── Makefile ├── __init__.py ├── halRandGen.cpp └── halTestGen.cpp ├── requirements.txt ├── rules.mk ├── stats ├── Makefile ├── __init__.py ├── halStats.py ├── halcoverage-table.py ├── impl │ ├── halCoverage.cpp │ ├── halPctIdentity.cpp │ ├── halStats.cpp │ └── halStatsMain.cpp └── inc │ └── halStats.h ├── synteny ├── Makefile ├── README.md ├── impl │ ├── hal2psl.cpp │ ├── halSynteny.cpp │ ├── psl_io.cpp │ └── psl_merger.cpp ├── inc │ ├── hal2psl.h │ ├── psl.h │ ├── psl_io.h │ └── psl_merger.h └── tests │ └── expected │ └── test1.psl ├── testdata └── mammals.mod └── validate ├── Makefile ├── __init__.py └── halValidateMain.cpp /.clang-format: 
-------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | AllowShortBlocksOnASingleLine: false 3 | AllowShortFunctionsOnASingleLine: None 4 | AllowShortIfStatementsOnASingleLine: false 5 | AllowShortLoopsOnASingleLine: false 6 | # BinPackArguments: false 7 | # BinPackParameters: false 8 | # BreakBeforeBinaryOperators: false 9 | # BreakBeforeBraces: Attach 10 | # BreakAfterJavaFieldAnnotations: true 11 | ColumnLimit: 128 12 | # IndentCaseLabels: true 13 | IndentWidth: 4 14 | MaxEmptyLinesToKeep: 1 15 | NamespaceIndentation: All 16 | TabWidth: 4 17 | UseTab: Never 18 | 19 | -------------------------------------------------------------------------------- /.dir-locals.el: -------------------------------------------------------------------------------- 1 | ;;; Directory Local Variables 2 | ;;; For more information see (info "(emacs) Directory Variables") 3 | 4 | ((c++-mode 5 | (##) 6 | (c-file-style . "k&r") 7 | (c-basic-offset . 4) 8 | (c-file-offsets 9 | (innamespace . 
4)))) 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .pydevproject 2 | .project 3 | .cproject 4 | *.o 5 | *.pyc 6 | /bin 7 | /lib 8 | /objs 9 | *.depend 10 | .emacs.bak 11 | TAGS 12 | include.local.mk 13 | output 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | dist: bionic 3 | language: cpp 4 | 5 | before_install: 6 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo apt-get -qq update; fi 7 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo apt-get install -y libhdf5-serial-dev python3 python3-pip libpython3-dev; fi 8 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then HOMEBREW_NO_AUTO_UPDATE=1 brew install hdf5 python3 python3-pip || echo "a brew error code when installing gcc is expected"; fi 9 | - git clone https://github.com/ComparativeGenomicsToolkit/sonLib.git 10 | install: 11 | - sudo pip3 install setuptools --upgrade 12 | - pip3 install -r requirements.txt 13 | - sh -c 'cd sonLib && make' 14 | script: 15 | - alias python=python3 16 | - alias pip=pip3 17 | - export sonLibRootDir=`pwd`/sonLib 18 | - make -j 4 19 | - PYTHONPATH=..:.:$PYTHONPATH PATH=./bin:$PATH make -k test 20 | os: 21 | - linux 22 | - osx 23 | env: 24 | - CGL_DEBUG=1 25 | - CGL_DEBUG=ultra ASAN_OPTIONS=detect_leaks=0 26 | matrix: 27 | exclude: 28 | # The default OSX env doesn't have a version that supports 29 | # -fsanitize=address. 30 | - env: CGL_DEBUG=ultra ASAN_OPTIONS=detect_leaks=0 31 | os: osx 32 | -------------------------------------------------------------------------------- /CODING_STYLE.txt: -------------------------------------------------------------------------------- 1 | 2 | Coding style is based on the K&R style. 
It was reformatted using clang-format's 3 | LLVM style, with some changes to remove special cases for consistency and 4 | simplicity: 5 | 6 | * Line width is set to 128 characters, since no one has seen a Wyse 60 in a very long time. 7 | * Standard indentation is used for namespaces 8 | 9 | For emacs users, this is set automatically from `.dir-locals.el`. 10 | 11 | The `clang-format` program was run with `.clang-format` as its configuration. 12 | 13 | Other conventions: 14 | - member variables: begin with _ 15 | - never use "using namespace" in a header file 16 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 2 | Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | 22 | 23 | This license excludes certain files in the 'externalTools' sub-directory, please 24 | see for copyright information and license information. 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | rootDir = . 2 | include include.mk 3 | 4 | modules = api stats randgen validate mutations fasta alignmentDepth liftover lod maf blockViz extract analysis phyloP modify assemblyHub synteny paf 5 | 6 | 7 | .PHONY: all libs %.libs progs %.progs clean %.clean doxy %.doxy 8 | 9 | all : libs progs 10 | 11 | libs: ${modules:%=%.libs} 12 | %.libs: 13 | cd $* && ${MAKE} libs 14 | 15 | progs: ${modules:%=%.progs} 16 | %.progs: libs 17 | cd $* && ${MAKE} progs 18 | 19 | clean: ${modules:%=%.clean} 20 | rm -f hal 21 | rm -rf lib bin objs 22 | rm -f *.pyc */*.pyc */*/*.pyc 23 | rm -rf __pycache__ */__pycache__ */*/__pycache__ 24 | 25 | %.clean: 26 | cd $* && ${MAKE} clean 27 | 28 | # create symbolic links for test so that python packages work without assuming name of 29 | # directory, but then remove, as it can cause grief for naive copy programs 30 | test: 31 | rm -f hal 32 | ln -sf . hal 33 | ${MAKE} doTests 34 | rm -f hal 35 | 36 | 37 | doTests: ${modules:%=%.test} 38 | 39 | %.test: all 40 | cd $* && ${MAKE} test 41 | 42 | 43 | doxy : ${modules:%=doxy.%} 44 | 45 | doxy.%: 46 | cd api && ${MAKE} doxy 47 | 48 | etags: 49 | etags $$(find . 
-name '*.h' -o -name '*.cpp') 50 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/__init__.py -------------------------------------------------------------------------------- /alignmentDepth/Makefile: -------------------------------------------------------------------------------- 1 | rootDir = .. 2 | include ${rootDir}/include.mk 3 | modObjDir = ${objDir}/alignmentDepth 4 | 5 | halAlignmentDepth_srcs = halAlignmentDepth.cpp 6 | halAlignmentDepth_objs = ${halAlignmentDepth_srcs:%.cpp=${modObjDir}/%.o} 7 | srcs = ${halAlignmentDepth_srcs} 8 | objs = ${srcs:%.cpp=${modObjDir}/%.o} 9 | depends = ${srcs:%.cpp=%.depend} 10 | progs = ${binDir}/halAlignmentDepth 11 | 12 | all: progs 13 | libs: 14 | progs: ${progs} 15 | 16 | clean: 17 | rm -f ${objs} ${progs} ${depends} 18 | 19 | test: 20 | 21 | include ${rootDir}/rules.mk 22 | 23 | # don't fail on missing dependencies, they are first time the .o is generates 24 | -include ${depends} 25 | 26 | 27 | # Local Variables: 28 | # mode: makefile-gmake 29 | # End: 30 | 31 | -------------------------------------------------------------------------------- /analysis/Makefile: -------------------------------------------------------------------------------- 1 | rootDir = .. 
import os
import sys
import argparse


def getBedLength(bedPath):
    """Return the summed length of all intervals in a BED file.

    Each non-blank line that does not start with ``#`` and has at least
    three whitespace-separated fields contributes ``end - start``, where
    ``start`` and ``end`` are the second and third fields.  Lines with
    fewer than three fields are skipped.  Overlapping intervals are not
    merged; every line is counted independently.

    Args:
        bedPath: path to the BED file to read.

    Returns:
        The sum of ``end - start`` over all counted lines (0 for an empty
        file).

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a start or end field is not an integer.
    """
    length = 0
    # The context manager guarantees the handle is closed even when a
    # malformed coordinate makes int() raise part-way through the file.
    with open(bedPath) as bedFile:
        for line in bedFile:
            clnLine = line.strip()
            if not clnLine or clnLine[0] == "#":
                continue
            toks = clnLine.split()
            if len(toks) > 2:
                length += int(toks[2]) - int(toks[1])
    return length


def main(argv=None):
    """Command-line entry point: print the total interval length of a BED file.

    Args:
        argv: full argument vector including the program name at index 0;
            defaults to ``sys.argv``.

    Returns:
        None, so ``sys.exit(main())`` exits with status 0.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser()
    parser.add_argument("bedPath", type=str,
                        help="path to bed file")

    # Parse the vector that was actually passed in (minus the program
    # name) so that callers supplying their own argv are honoured.
    args = parser.parse_args(argv[1:])
    print(getBedLength(args.bedPath))


if __name__ == "__main__":
    sys.exit(main())
halValidateTest_objs) 33 | $(foreach prog,${halApiTest_names},$(eval ${prog}_objs = ${modObjDir}/tests/${prog}.o ${halApiTestSupportLibs})) 34 | 35 | ifdef ENABLE_UDC 36 | udc2Tests_srcs = $(wildcard tests/udc2Test.c) 37 | udc2Tests_objs = ${udc2Tests_srcs:%.c=${modObjDir}/%.o} 38 | endif 39 | srcs = ${libHal_srcs} ${halApiTests_srcs} ${halHdf5Tests_srcs} 40 | c_srcs = ${udc2Tests_srcs} 41 | objs = ${srcs:%.cpp=${modObjDir}/%.o} ${c_srcs:%.c=${modObjDir}/%.o} 42 | depends = ${srcs:%.cpp=%.depend} ${c_srcs:%.c=%.depend} 43 | 44 | progs = ${halHdf5Tests_progs} ${halApiTest_progs} 45 | inclSpec += -Ihdf5_impl -Immap_impl 46 | ifdef ENABLE_UDC 47 | # FIXME: standarize var names 48 | inclSpec += -I${KENTSRC}/inc -I${KENTSRC}/htslib 49 | progs += ${binDir}/udc2Tests 50 | endif 51 | 52 | 53 | all : libs progs 54 | libs: ${libHal} ${halApiTestSupportLibs} 55 | progs: ${progs} 56 | 57 | # only udc2.c should include kent .h files, to preven conflicts with older versions of 58 | # phast 59 | ${modObjDir}/impl/udc2.o: impl/udc2.c 60 | @mkdir -p $(dir $@) 61 | ${CC} -MM -MT $@ ${CFLAGS} ${UDCCFLAGS} ${inclSpec} -c $< >$*.depend 62 | ${CC} ${CFLAGS} ${inclSpec} -c $< -o $@ 63 | 64 | 65 | clean : 66 | rm -f ${libHal} ${objs} ${progs} ${depends} 67 | 68 | test: halHdf5Tests halApiTests 69 | 70 | halHdf5Tests: all 71 | ${binDir}/halHdf5Tests 72 | 73 | 74 | halApiTests: hdf5.halApiTestsStorage mmap.halApiTestsStorage 75 | 76 | %.halApiTestsStorage: 77 | ${MAKE} runHalApiTest halStorageFormat=$* 78 | 79 | runHalApiTest: ${halApiTest_names:%=%.runHalApiTest} 80 | 81 | %.runHalApiTest: 82 | ${binDir}/$* ${halStorageFormat} 83 | 84 | doxy : 85 | doxygen doc/doxy.cfg 86 | 87 | etags: 88 | (cd .. 
&& ${MAKE} etags} 89 | 90 | include ${rootDir}/rules.mk 91 | 92 | # don't fail on missing dependencies; they are created the first time the .o is generated 93 | -include ${depends} 94 | 95 | 96 | # Local Variables: 97 | # mode: makefile-gmake 98 | # End: 99 | -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/api/__init__.py -------------------------------------------------------------------------------- /api/doc/naming-conventions.txt: -------------------------------------------------------------------------------- 1 | 2 | - Variable naming for consistency, which may be prefixed (e.g. fredSegIt) 3 | SequenceIterator: seqIt 4 | TopSegment: topSeg 5 | TopSegmentIterator: topSegIt 6 | BottomSegment: botSeg 7 | BottomSegmentIterator: botSegIt 8 | GappedTopSegmentIterator: gapTopSegIt 9 | GappedBottomSegmentIterator: gapBotSegIt 10 | DnaIterator: dnaIt 11 | ColumnIterator: colit 12 | LinkedTopIterator: linkTopIt 13 | LinkedBotIterator: linkBotIt 14 | -------------------------------------------------------------------------------- /api/hdf5_impl/hdf5Common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | #ifndef _HDF5COMMON_H 8 | #define _HDF5COMMON_H 9 | #include 10 | 11 | /* Class to disable HDF5 exception printing. It is re-enabled when the 12 | * object is destroyed. The dontPrint() call can be commented out for debugging. 
13 | */ 14 | class HDF5DisableExceptionPrinting { 15 | public: 16 | HDF5DisableExceptionPrinting() { 17 | H5::Exception::getAutoPrint(_func, &_clientData); 18 | H5::Exception::dontPrint(); 19 | } 20 | ~HDF5DisableExceptionPrinting() { 21 | H5::Exception::setAutoPrint(_func, _clientData); 22 | } 23 | 24 | private: 25 | H5E_auto2_t _func; 26 | void *_clientData; 27 | }; 28 | 29 | #endif 30 | 31 | // Local Variables: 32 | // mode: c++ 33 | // End: 34 | -------------------------------------------------------------------------------- /api/hdf5_impl/hdf5DnaArray.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HDF5DNAARRAY_H 9 | #define _HDF5DNAARRAY_H 10 | 11 | #include "rawH5ExternalArray.h" 12 | #include 13 | #include 14 | 15 | namespace hal { 16 | 17 | /** 18 | * Wraps the RawH5ExternalArray with interface tailored to storing and accessing 19 | * only DNA characters 20 | */ 21 | class Hdf5DnaArray { 22 | public: 23 | /** Constructor */ 24 | Hdf5DnaArray(); 25 | 26 | /** Destructor */ 27 | virtual ~Hdf5DnaArray(); 28 | 29 | /** Create a new array (overloads method in parent) 30 | * @param file HDF5 file in which to add new array dataset 31 | * @param path location of new array in file 32 | * @param size Fixed length of the new array 33 | * @param cparams Creation parameters for new array (chunking, zipping) */ 34 | void create(H5File *file, const std::string &path, hsize_t size, 35 | const H5::DSetCreatPropList &cparms = H5::DSetCreatPropList::DEFAULT); 36 | 37 | /** Open an existing array 38 | * @param file HDF5 file containing array to open 39 | * @param path location of array in file */ 40 | void open(H5File *file, const std::string &path); 41 | 42 | /** Write any unsaved buffer contents back to the file */ 43 | 
void write(); 44 | 45 | /** Get read/write iterator 46 | * @param offset position of iterator in array */ 47 | Hdf5DnaIterator getDnaIterator(hsize_t offset = 0); 48 | 49 | /** Get read-only iterator 50 | * @param offset position of iterator in array */ 51 | Hdf5DnaConstIterator getDnaConstIterator(hsize_t offset = 0); 52 | 53 | /** Get size of array */ 54 | hsize_t size(); 55 | 56 | private: 57 | RawH5ExternalArray _array; 58 | }; 59 | 60 | // INLINE METHODS 61 | 62 | inline Hdf5DnaArray::getDnaIterator(hsize_t offset) { 63 | assert(offset < size()); 64 | return DnaIterator(_array, offset); 65 | } 66 | 67 | inline Hdf5DnaConstIterator(hsize_t offset) { 68 | assert(offset < size()); 69 | return DnaIterator(_array, offset); 70 | } 71 | } 72 | // Local Variables: 73 | // mode: c++ 74 | // End: 75 | -------------------------------------------------------------------------------- /api/hdf5_impl/hdf5DnaDriver.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | #include "hdf5DnaDriver.h" 8 | #include "hdf5Alignment.h" 9 | #include "hdf5Genome.h" 10 | 11 | using namespace hal; 12 | 13 | HDF5DnaAccess::HDF5DnaAccess(Hdf5Genome *genome, Hdf5ExternalArray *dnaArray, hal_index_t index) 14 | : DnaAccess(0, 0, NULL), _dnaArray(dnaArray) { 15 | fetch(index); 16 | } 17 | 18 | void HDF5DnaAccess::flush() { 19 | // ensure that marked dirty 20 | if (_dirty) { 21 | _dnaArray->setDirty(); 22 | } 23 | _dirty = false; 24 | } 25 | 26 | void HDF5DnaAccess::fetch(hal_index_t index) const { 27 | if (_dirty) { 28 | _dnaArray->setDirty(); 29 | } 30 | _dnaArray->page(index / 2); 31 | // DANGER _dnaArray is close-ended 32 | _startIndex = 2 * _dnaArray->getBufStart(); 33 | _endIndex = 2 * (_dnaArray->getBufEnd() + 1); 34 | _buffer = _dnaArray->getBuf(); 35 | 
/**
 * HDF5 implementation of DnaAccess.
 *
 * Serves bases out of the page currently buffered by an Hdf5ExternalArray.
 * The array is indexed at index / 2 while DnaAccess indices are per base,
 * so the cached window is expressed as twice the array's buffer bounds.
 */
class HDF5DnaAccess : public DnaAccess {
  public:
    /** Create an accessor over dnaArray and immediately fetch the page
     * containing index.
     * @param genome   not used by the constructor body visible in
     *                 hdf5DnaDriver.cpp
     * @param dnaArray array holding the packed DNA; not owned
     * @param index    base index whose page is loaded first */
    HDF5DnaAccess(Hdf5Genome *genome, Hdf5ExternalArray *dnaArray, hal_index_t index);

    /** Destructor; the DnaAccess base destructor throws if the accessor is
     * still dirty, so flush() must be called first. */
    virtual ~HDF5DnaAccess() {
    }

    /** If bases were modified, mark the underlying array dirty and clear
     * the local dirty flag. */
    void flush();

  protected:
    /** Page the array so that index is inside the buffer and refresh the
     * cached window; pending modifications mark the array dirty first. */
    virtual void fetch(hal_index_t index) const;

  private:
    // backing array of packed bases; not owned by this object
    Hdf5ExternalArray *_dnaArray;
};
&value); 29 | const std::string &get(const std::string &key) const; 30 | bool has(const std::string &key) const; 31 | const std::map &getMap() const; 32 | 33 | void write(); 34 | 35 | void open(H5::PortableH5Location *parent, const std::string &name); 36 | 37 | private: 38 | static const std::string MetaGroupName; 39 | 40 | H5::PortableH5Location *_parent; 41 | H5::Group _group; 42 | std::map _map; 43 | bool _dirty; 44 | std::string _name; 45 | }; 46 | } 47 | #endif 48 | 49 | // Local Variables: 50 | // mode: c++ 51 | // End: 52 | -------------------------------------------------------------------------------- /api/hdf5_impl/hdf5SequenceIterator.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | #include "hdf5SequenceIterator.h" 8 | #include 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | using namespace H5; 14 | using namespace hal; 15 | 16 | Hdf5SequenceIterator::Hdf5SequenceIterator(Hdf5Genome *genome, hal_index_t index) 17 | : _sequence(genome, &genome->_sequenceIdxArray, &genome->_sequenceNameArray, index) { 18 | } 19 | 20 | Hdf5SequenceIterator::~Hdf5SequenceIterator() { 21 | } 22 | 23 | SequenceIteratorPtr Hdf5SequenceIterator::clone() const { 24 | Hdf5SequenceIterator *seqIt = new Hdf5SequenceIterator(_sequence._genome, _sequence._index); 25 | return SequenceIteratorPtr(seqIt); 26 | } 27 | 28 | void Hdf5SequenceIterator::toNext() { 29 | ++_sequence._index; 30 | } 31 | 32 | void Hdf5SequenceIterator::toPrev() { 33 | --_sequence._index; 34 | } 35 | 36 | bool Hdf5SequenceIterator::atEnd() const { 37 | return (_sequence._index < 0) or (_sequence._index >= (hal_index_t)_sequence._genome->_sequenceNameArray.getSize()); 38 | } 39 | 40 | Sequence *Hdf5SequenceIterator::getSequence() { 41 | assert(_sequence._index 
>= 0 && _sequence._index < (hal_index_t)_sequence._genome->_sequenceNameArray.getSize()); 42 | // don't return local sequence pointer. give cached pointer from 43 | // genome instead (so it will not expire when iterator moves!) 44 | return _sequence._genome->getSequence(_sequence.getName()); 45 | } 46 | 47 | bool Hdf5SequenceIterator::equals(SequenceIteratorPtr other) const { 48 | const Hdf5SequenceIterator *h5Other = reinterpret_cast(other.get()); 49 | assert(_sequence.getGenome() == h5Other->_sequence.getGenome()); 50 | return _sequence._index == h5Other->_sequence._index; 51 | } 52 | -------------------------------------------------------------------------------- /api/hdf5_impl/hdf5SequenceIterator.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HDF5SEQUENCEITERATOR_H 9 | #define _HDF5SEQUENCEITERATOR_H 10 | 11 | #include "halSequenceIterator.h" 12 | #include "hdf5ExternalArray.h" 13 | #include "hdf5Genome.h" 14 | #include "hdf5Sequence.h" 15 | #include 16 | 17 | namespace hal { 18 | 19 | class Hdf5SequenceIterator : public SequenceIterator { 20 | public: 21 | Hdf5SequenceIterator(Hdf5Genome *genome, hal_index_t index); 22 | ~Hdf5SequenceIterator(); 23 | 24 | // SEQUENCE ITERATOR METHODS 25 | SequenceIteratorPtr clone() const; 26 | void toNext(); 27 | void toPrev(); 28 | bool atEnd() const; 29 | Sequence *getSequence(); 30 | const Sequence *getSequence() const { 31 | return const_cast(this)->getSequence(); 32 | } 33 | bool equals(SequenceIteratorPtr other) const; 34 | 35 | private: 36 | Hdf5Sequence _sequence; 37 | }; 38 | } 39 | #endif 40 | // Local Variables: 41 | // mode: c++ 42 | // End: 43 | -------------------------------------------------------------------------------- /api/hdf5_impl/hdf5UDCFuseDriver.h: 
-------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * Copyright by The HDF Group. * 3 | * Copyright by the Board of Trustees of the University of Illinois. * 4 | * All rights reserved. * 5 | * * 6 | * This file is part of HDF5. The full HDF5 copyright notice, including * 7 | * terms governing use, modification, and redistribution, is contained in * 8 | * the files COPYING and Copyright.html. COPYING can be found at the root * 9 | * of the source code distribution tree; Copyright.html can be found at the * 10 | * root level of an installed copy of the electronic HDF5 document set and * 11 | * is linked from the top-level documents page. It can also be found at * 12 | * http://hdfgroup.org/HDF5/doc/Copyright.html. If you do not have * 13 | * access to either file, you may request a copy from help@hdfgroup.org. * 14 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 15 | 16 | /* 17 | * Programmer: Robb Matzke 18 | * Monday, August 2, 1999 19 | * 20 | * Purpose: The public header file for the sec2 driver. 
21 | */ 22 | #ifndef H5FDudc_fuse_H 23 | #define H5FDudc_fuse_H 24 | 25 | #include "H5Ipublic.h" 26 | 27 | #define H5FD_UDC_FUSE (H5FD_udc_fuse_init()) 28 | 29 | extern "C" { 30 | 31 | H5_DLL hid_t H5FD_udc_fuse_init(void); 32 | H5_DLL void H5FD_udc_fuse_term(void); 33 | H5_DLL herr_t H5Pset_fapl_udc_fuse(hid_t fapl_id); 34 | 35 | void H5FD_udc_fuse_set_cache_dir(const char *cacheDir); 36 | } 37 | 38 | #endif 39 | // Local Variables: 40 | // mode: c++ 41 | // End: 42 | -------------------------------------------------------------------------------- /api/hdf5_tests/allTests.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | #include "allTests.h" 8 | #include 9 | #include 10 | 11 | int hdf5RunAllTests(void) { 12 | CuString *output = CuStringNew(); 13 | CuSuite *suite = CuSuiteNew(); 14 | // CuSuiteAddSuite(suite, hdf5TestSuite()); 15 | // CuSuiteAddSuite(suite, hdf5ExternalArrayTestSuite()); 16 | // CuSuiteAddSuite(suite, hdf5DNATypeTestSuite()); 17 | // CuSuiteAddSuite(suite, hdf5SegmentTypeTestSuite()); 18 | // CuSuiteAddSuite(suite, hdf5SequenceTypeTestSuite()); 19 | CuSuiteRun(suite); 20 | CuSuiteSummary(suite, output); 21 | CuSuiteDetails(suite, output); 22 | printf("%s\n", output->buffer); 23 | return suite->failCount > 0; 24 | } 25 | 26 | int main(int argc, char *argv[]) { 27 | H5::Exception::dontPrint(); 28 | return hdf5RunAllTests(); 29 | } 30 | -------------------------------------------------------------------------------- /api/hdf5_tests/allTests.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see 
/* Fill outString with this segment's DNA: obtain a DNA iterator from the
 * segment's genome at the segment's start position and ask it to read
 * getLength() bases into outString. */
void BottomSegment::getString(std::string &outString) const {
    DnaIteratorPtr dnaIt(getGenome()->getDnaIterator(getStartPosition()));
    dnaIt->readString(outString, getLength());
}
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | #include "halPositionCache.h" 8 | 9 | using namespace std; 10 | using namespace hal; 11 | 12 | bool PositionCache::insert(hal_index_t pos) { 13 | IntervalSet::iterator i; 14 | if (_prev != _set.end() && _prev->first == pos - 1) { 15 | ++_prev; 16 | i = _prev; 17 | assert(i == _set.lower_bound(pos)); 18 | } else { 19 | i = _set.lower_bound(pos); 20 | } 21 | _prev = i; 22 | 23 | IntervalSet::iterator j; 24 | if (i != _set.end() && i->second <= pos) { 25 | assert(i->first >= pos); 26 | return false; 27 | } 28 | 29 | // merge to beginning of existing interval 30 | if (i != _set.end() && i->second == pos + 1) { 31 | --i->second; 32 | } else { 33 | // set hint to position before pos in set. according to the docs, the 34 | // hint works differently in C++11 where it wants the position *after* 35 | // so we try to detect the compiler below... 
36 | j = i; 37 | #if __cplusplus < 201103L 38 | if (j != _set.begin()) { 39 | --j; 40 | } 41 | #endif 42 | // create new unit interval 43 | i = _set.insert(j, pair(pos, pos)); 44 | _prev = i; 45 | } 46 | assert(i->second <= i->first); 47 | // merge abutting left interval 48 | if (i != _set.begin()) { 49 | j = i; 50 | --j; 51 | if (j->first == i->second - 1) { 52 | i->second = j->second; 53 | assert(i->second <= i->first); 54 | _set.erase(j); 55 | } 56 | } 57 | 58 | // merge abutting right interval 59 | j = i; 60 | ++j; 61 | if (j != _set.end() && j->second == i->first + 1) { 62 | j->second = i->second; 63 | assert(j->second <= j->first); 64 | _set.erase(i); 65 | } 66 | 67 | ++_size; 68 | assert(find(pos) == true); 69 | return true; 70 | } 71 | 72 | bool PositionCache::find(hal_index_t pos) const { 73 | IntervalSet::const_iterator i = _set.lower_bound(pos); 74 | if (i != _set.end() && i->second <= pos) { 75 | return true; 76 | } 77 | return false; 78 | } 79 | 80 | void PositionCache::clear() { 81 | _set.clear(); 82 | _size = 0; 83 | _prev = _set.begin(); 84 | } 85 | 86 | // for debugging 87 | bool PositionCache::check() const { 88 | hal_size_t size = 0; 89 | for (IntervalSet::const_iterator i = _set.begin(); i != _set.end(); ++i) { 90 | size += (i->first + 1) - i->second; 91 | IntervalSet::const_iterator j = i; 92 | ++j; 93 | if (j != _set.end()) { 94 | // test overlap 95 | if (j->second <= i->first || i->second >= j->first) { 96 | return false; 97 | } 98 | // test merge 99 | if (j->second == i->first + 1) { 100 | return false; 101 | } 102 | } 103 | } 104 | return size == _size; 105 | } 106 | -------------------------------------------------------------------------------- /api/impl/halSegment.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see 
LICENSE.txt 6 | */ 7 | #include "halSegment.h" 8 | #include "halDnaIterator.h" 9 | #include "halGenome.h" 10 | 11 | using namespace hal; 12 | 13 | bool Segment::isMissingData(double nThreshold) const { 14 | DnaIteratorPtr dnaIt(getGenome()->getDnaIterator(getStartPosition())); 15 | size_t length = getLength(); 16 | size_t maxNs = nThreshold * (double)length; 17 | size_t Ns = 0; 18 | char c; 19 | for (size_t i = 0; i < length; ++i, dnaIt->toRight()) { 20 | c = dnaIt->getBase(); 21 | if (c == 'N' || c == 'n') { 22 | ++Ns; 23 | } 24 | if (Ns > maxNs) { 25 | return true; 26 | } 27 | if ((length - i) < (maxNs - Ns)) { 28 | return false; 29 | } 30 | } 31 | return false; 32 | } 33 | -------------------------------------------------------------------------------- /api/impl/halSequence.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #include "halDefs.h" 9 | #include "halSequence.h" 10 | #include "halGenome.h" 11 | 12 | 13 | /* thrown when sequence not found in genome */ 14 | hal::SequenceNotFoundException::SequenceNotFoundException(const Genome* genome, 15 | const std::string &name): 16 | hal_exception("Sequence '" + name + "' not found in genome '" + genome->getName() + "'") { 17 | } 18 | -------------------------------------------------------------------------------- /api/impl/halTopSegment.cpp: -------------------------------------------------------------------------------- 1 | #include "halTopSegment.h" 2 | #include "halDnaIterator.h" 3 | 4 | using namespace hal; 5 | 6 | void TopSegment::getString(std::string &outString) const { 7 | DnaIteratorPtr dnaIt(getGenome()->getDnaIterator(getStartPosition())); 8 | dnaIt->readString(outString, getLength()); 9 | } 10 | 
-------------------------------------------------------------------------------- /api/inc/halDnaDriver.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALDNADRIVER_H 9 | #define _HALDNADRIVER_H 10 | #include "halCommon.h" 11 | 12 | namespace hal { 13 | /** 14 | * Class for access a genome's DNA sequence. It handles buffering data and only goes 15 | * to the storage layer when the buffer needs filled, inlining of base access. 16 | * There can be multiple object active independently on a given genome. Assumes 17 | * that DNA is nibble-encode and handles encoding and decoding. 18 | */ 19 | class DnaAccess { 20 | public: 21 | /* Destructor */ 22 | virtual ~DnaAccess() noexcept(false) { 23 | if (_dirty) { 24 | throw hal_exception("DnaAccess is dirty, flush() should have been called"); 25 | } 26 | } 27 | 28 | /* flush dirty buffer if necessary */ 29 | virtual void flush() = 0; 30 | 31 | /* get a base at the specified index. */ 32 | inline char getBase(hal_index_t index) const { 33 | hal_index_t relIndex = access(index); 34 | return dnaUnpack(relIndex, _buffer[relIndex / 2]); 35 | } 36 | 37 | /* set a base at the specified index. 
*/ 38 | inline void setBase(hal_index_t index, char base) { 39 | hal_index_t relIndex = access(index); 40 | _buffer[relIndex / 2] = dnaPack(base, relIndex, _buffer[relIndex / 2]); 41 | _dirty = true; 42 | } 43 | 44 | protected: 45 | /* constructor */ 46 | DnaAccess(hal_index_t startIndex, hal_index_t endIndex, char *buffer) 47 | : _startIndex(startIndex), _endIndex(endIndex), _buffer(buffer), _dirty(false) { 48 | } 49 | 50 | /* refresh the buffer if needed and return relative index */ 51 | inline hal_index_t access(hal_index_t index) const { 52 | if ((index < _startIndex) or (index >= _endIndex)) { 53 | fetch(index); 54 | } 55 | return index - _startIndex; 56 | } 57 | 58 | /* refreshed the buffer, const since it is abstracted as a cache */ 59 | virtual void fetch(hal_index_t index) const = 0; 60 | 61 | /* Cached buffer. Index will always be even (first nibble) */ 62 | mutable hal_index_t _startIndex; 63 | mutable hal_index_t _endIndex; 64 | mutable char *_buffer; 65 | mutable bool _dirty; 66 | }; 67 | } 68 | #endif 69 | // Local Variables: 70 | // mode: c++ 71 | // End: 72 | -------------------------------------------------------------------------------- /api/inc/halGappedSegmentIterator.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALGAPPEDSEGMENTITERATOR_H 9 | #define _HALGAPPEDSEGMENTITERATOR_H 10 | 11 | #include "halSegmentIterator.h" 12 | 13 | namespace hal { 14 | 15 | /** 16 | * Interface for general gappedSegment iterator. Behaves like 17 | * a regular iterator, but operates on a linear sequence of 18 | * segments that are consistent modulo gaps. 
/**
 * Interface for general gappedSegment iterator.  Behaves like
 * a regular iterator, but operates on a linear sequence of
 * segments that are consistent modulo gaps.  Is only really used
 * internally at the moment so I haven't spent the effort to
 * make gapped iterators fully general (such as including the
 * Top/Bottom Segment interfaces)
 */
class GappedSegmentIterator : virtual public SegmentIterator {
  public:
    /** Destructor */
    virtual ~GappedSegmentIterator() {
    }

    /** Get the gap length threshold.  This is the maximum length (in sites)
     * of an indel such that it can be considered a gap (and therefore ignored
     * in the rearrangement analysis) */
    virtual hal_size_t getGapThreshold() const = 0;

    /** When the gap iterator is in atomic mode, it allows for no gaps.
     * Also, it will not merge consistent segments together if they are adjacent
     * and there are no gaps (something that could happen when the threshold is
     * set to zero but atomic is false).  This is mostly a hack to get the gapped
     * iterator to behave as a single segment */
    virtual bool getAtomic() const = 0;

    /** Gapped iterators are tied to a specific parent child pair.  The child
     * index is the index of the child genome within the parent */
    virtual hal_size_t getChildIndex() const = 0;

    /** Get the number of segments that have been agglomerated together within
     * the gapped iterator */
    virtual hal_size_t getNumSegments() const = 0;

    /** Get the number of gaps within segments that have been agglomerated
     * together within the gapped iterator */
    virtual hal_size_t getNumGaps() const = 0;

    /** Get the number of bases within gaps within segments that have been
     * agglomerated together within the gapped iterator */
    virtual hal_size_t getNumGapBases() const = 0;

    /** Get the Segment array index of the left segment of the iterator */
    virtual hal_index_t getLeftArrayIndex() const = 0;

    /** Get the Segment array index of the right segment of the iterator */
    virtual hal_index_t getRightArrayIndex() const = 0;
};
/* Functor for set compare; implemented in halMappedSegment.cpp.  The
 * implementation is split from this definition because the compare
 * functions reference MappedSegment.
 * NOTE(review): the original comment is garbled at this point; it appears
 * to say the functor lives in its own header because it is used by
 * hal::Segment, which halMappedSegment.h in turn requires -- confirm. */
struct MappedSegmentLess {
    /* compare two mapped segments by reference */
    bool operator()(const hal::MappedSegment &m1, const hal::MappedSegment &m2) const;
    /* compare two mapped segments held by MappedSegmentPtr */
    bool operator()(const hal::MappedSegmentPtr &m1, const hal::MappedSegmentPtr &m2) const;
};

/* set of MappedSegments objects */
// NOTE(review): the template arguments of std::set are missing here (they
// look lost in extraction); presumably the element type is a mapped-segment
// pointer ordered by MappedSegmentLess -- restore from the upstream header.
class MappedSegmentSet : public std::set {};
21 | */ 22 | class MetaData { 23 | public: 24 | /** Destructor */ 25 | virtual ~MetaData() { 26 | } 27 | 28 | /** Get read-only reference to the map of metadata */ 29 | virtual const std::map &getMap() const = 0; 30 | 31 | /** Get the value associated with a key (throws error if key doesn't exist) 32 | * @param key MetaData key */ 33 | virtual const std::string &get(const std::string &key) const = 0; 34 | 35 | /** Set a key-value pair (create's if doesn't exist, updates if does) 36 | * @param key Key to update 37 | * @param value Value to update */ 38 | virtual void set(const std::string &key, const std::string &value) = 0; 39 | 40 | /** Determine if key exists in metadata 41 | * @param key Key to test */ 42 | virtual bool has(const std::string &key) const = 0; 43 | }; 44 | } 45 | #endif 46 | // Local Variables: 47 | // mode: c++ 48 | // End: 49 | -------------------------------------------------------------------------------- /api/inc/halPositionCache.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALPOSITIONCACHE_H 9 | #define _HALPOSITIONCACHE_H 10 | 11 | #include "halDefs.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace hal { 18 | 19 | /** keep track of bases by storing 2d intervals 20 | * For example, if we want to flag positions in a genome 21 | * that we have visited, this structure will be fairly 22 | * efficient provided positions are clustered into intervals */ 23 | class PositionCache { 24 | public: 25 | PositionCache() : _size(0), _prev(_set.begin()) { 26 | } 27 | PositionCache(const PositionCache &positionCache) 28 | : _set(*positionCache.getIntervalSet()), _size(positionCache.size()), _prev(_set.begin()) { 29 | } 30 | // sorted by last index, so each interval is (last, 
first) 31 | typedef std::map IntervalSet; 32 | 33 | bool insert(hal_index_t pos); 34 | bool find(hal_index_t pos) const; 35 | void clear(); 36 | bool check() const; 37 | hal_size_t size() const { 38 | return _size; 39 | } 40 | hal_size_t numIntervals() const { 41 | return _set.size(); 42 | } 43 | 44 | const IntervalSet *getIntervalSet() const { 45 | return &_set; 46 | } 47 | 48 | private: 49 | IntervalSet _set; 50 | hal_size_t _size; 51 | IntervalSet::iterator _prev; 52 | }; 53 | } 54 | 55 | #endif 56 | 57 | // Local Variables: 58 | // mode: c++ 59 | // End: 60 | -------------------------------------------------------------------------------- /api/inc/halSegmentMapper.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALSEGMENTMAPPER_H 9 | #define _HALSEGMENTMAPPER_H 10 | #include "halDefs.h" 11 | #include "halSegmentIterator.h" 12 | #include 13 | 14 | namespace hal { 15 | class Segment; 16 | class MappedSegmentSet; 17 | class Genome; 18 | 19 | /** Get homologous segments in target genome. Returns the number 20 | * of mapped segments found. 21 | * @param source Input. 22 | * @param outSegments Output. Mapped segments are sorted along the 23 | * *target* genome. 24 | * @param tgtGenome Target genome to map to. Can be the same as current. 25 | * @param genomesOnPath Intermediate genomes that must be visited 26 | * on the way down from coalescenceLimit to tgt. If this is 27 | * specified as NULL, then the path will be computed automatically 28 | * (using hal::getGenomesInSpanningTree(coalescenceLimit, tgtGenome)). 29 | * Specifying this can avoid recomputing the path over and over again 30 | * when, say, calling halMapSegment repeatedly for the same 31 | * source and target. 
/** Get homologous segments in target genome.  Returns the number
 * of mapped segments found.
 * @param source Input.
 * @param outSegments  Output.  Mapped segments are sorted along the
 * *target* genome.
 * @param tgtGenome  Target genome to map to.  Can be the same as current.
 * @param genomesOnPath Intermediate genomes that must be visited
 * on the way down from coalescenceLimit to tgtGenome.  If this is
 * specified as NULL, then the path will be computed automatically
 * (using hal::getGenomesInSpanningTree(coalescenceLimit, tgtGenome)).
 * Specifying this can avoid recomputing the path over and over again
 * when, say, calling halMapSegment repeatedly for the same
 * source and target.
 * @param doDupes  Specify whether paralogy edges are followed
 * @param minLength Minimum length of segments to consider.  It is
 * potentially much faster to filter using this parameter than
 * doing a second pass on the output.  If minLength is 0, then no
 * segments are filtered based on length.
 * @param coalescenceLimit Any paralogs that coalesce in or below
 * this genome will be mapped to the target as well.  Must be the
 * MRCA or higher.  By default, the coalescenceLimit is the MRCA.
 * @param mrca The MRCA of the source and target genomes.  By
 * default, it is computed automatically. */
// NOTE(review): the element type of std::set is missing below (it looks lost
// in extraction); presumably a set of Genome pointers -- restore from the
// upstream header.
hal_size_t halMapSegment(const SegmentIterator *source, MappedSegmentSet &outSegments, const Genome *tgtGenome,
                         const std::set *genomesOnPath = NULL, bool doDupes = true,
                         hal_size_t minLength = 0, const Genome *coalescenceLimit = NULL, const Genome *mrca = NULL);

/* Same parameters as halMapSegment, but the source iterator is passed as a
 * smart pointer. */
hal_size_t halMapSegmentSP(const SegmentIteratorPtr &source, MappedSegmentSet &outSegments, const Genome *tgtGenome,
                           const std::set *genomesOnPath = NULL, bool doDupes = true,
                           hal_size_t minLength = 0, const Genome *coalescenceLimit = NULL, const Genome *mrca = NULL);
18 | */ 19 | class SequenceIterator { 20 | public: 21 | /** Destructor */ 22 | virtual ~SequenceIterator() { 23 | } 24 | 25 | /** Create a duplicate iterator referring to the same sequence 26 | * which itself is not copied */ 27 | virtual SequenceIteratorPtr clone() const = 0; 28 | 29 | /** Move iterator to next sequence in the genome */ 30 | virtual void toNext() = 0; 31 | 32 | /** Move iterator to previous sequence in the genome */ 33 | virtual void toPrev() = 0; 34 | 35 | /** has the iterator reach the end of the traversal in the direction of 36 | * movement? */ 37 | virtual bool atEnd() const = 0; 38 | 39 | /** Return pointer to the sequence */ 40 | virtual const Sequence *getSequence() const = 0; 41 | 42 | /** Return pointer to the sequence */ 43 | virtual Sequence *getSequence() = 0; 44 | 45 | /** Test if iterator points to same sequence as other iterator */ 46 | virtual bool equals(SequenceIteratorPtr p2) const = 0; 47 | }; 48 | 49 | inline bool operator==(SequenceIteratorPtr p1, SequenceIteratorPtr p2) { 50 | if (p1.get() == NULL || p2.get() == NULL) { 51 | return p1.get() == NULL && p2.get() == NULL; 52 | } 53 | return p1->equals(p2); 54 | } 55 | 56 | inline bool operator!=(SequenceIteratorPtr p1, SequenceIteratorPtr p2) { 57 | return !(p1 == p2); 58 | } 59 | } 60 | #endif 61 | // Local Variables: 62 | // mode: c++ 63 | // End: 64 | -------------------------------------------------------------------------------- /api/inc/halSlicedSegment.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALSLICEDSEGMENT_H 9 | #define _HALSLICEDSEGMENT_H 10 | 11 | #include "halDefs.h" 12 | #include "halSegment.h" 13 | 14 | namespace hal { 15 | 16 | /** 17 | * Interface for a sliced segment. 
This extends the segment interface 18 | * by allowing slicing (accessing just subintervals of the segment), 19 | * along with reversing. 20 | */ 21 | class SlicedSegment : public virtual Segment { 22 | public: 23 | /** Destructor */ 24 | virtual ~SlicedSegment() { 25 | } 26 | 27 | /** switch to segment's reverse complement */ 28 | virtual void toReverse() = 0; 29 | 30 | /** switch to segment's reverse complement without affecting the 31 | * coordinates in the forward strand. Unless the segment is sliced 32 | * this will have an identical effect to toReverse(). Both methods 33 | * can be useful in different situations but the distinction is 34 | * confusing. For example, if the segment represents range [0, 10] 35 | * on the forward strand but is sliced to the subregion [3,8], 36 | * then toReverse() will result in the region [7,2], but 37 | * toReverseInPlace() would yield [8,3]. */ 38 | virtual void toReverseInPlace() = 0; 39 | 40 | /** Get the start offset of the slice in the segment */ 41 | virtual hal_offset_t getStartOffset() const = 0; 42 | 43 | /** Get the end offset of the slice in the segment */ 44 | virtual hal_offset_t getEndOffset() const = 0; 45 | 46 | /** Set the start and end offsets 47 | * @param startOffset offset from beginning of segment 48 | * @param endOffset offset from end of segment */ 49 | virtual void slice(hal_offset_t startOffset = 0, hal_offset_t endOffset = 0) = 0; 50 | 51 | /** Check whether iterator is on segment's reverse complement */ 52 | virtual bool getReversed() const = 0; 53 | 54 | protected: 55 | /** constructor */ 56 | SlicedSegment(Genome *genome, hal_index_t index) : Segment(genome, index) { 57 | } 58 | /** constructor */ 59 | SlicedSegment() : Segment() { 60 | } 61 | }; 62 | 63 | inline bool operator<(const SlicedSegment &segmentIt, hal_index_t genomePos) { 64 | return segmentIt.leftOf(genomePos); 65 | } 66 | 67 | inline bool operator>(const SlicedSegment &segmentIt, hal_index_t genomePos) { 68 | return
segmentIt.rightOf(genomePos); 69 | } 70 | } 71 | #endif 72 | // Local Variables: 73 | // mode: c++ 74 | // End: 75 | -------------------------------------------------------------------------------- /api/inc/halValidate.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALVALIDATE_H 9 | #define _HALVALIDATE_H 10 | 11 | #include "halAlignment.h" 12 | #include 13 | #include 14 | #include 15 | 16 | namespace hal { 17 | 18 | /** Go through a bottom segment, and throw an exception if anything 19 | * appears out of whack. */ 20 | void validateBottomSegment(const BottomSegment *bottomSegment); 21 | 22 | /** Go through a top segment, and throw an exception if anything 23 | * appears out of whack. */ 24 | void validateTopSegment(const TopSegment *topSegment); 25 | 26 | /** Go through a sequence, and throw an exception if anything 27 | * appears out of whack. */ 28 | void validateSequence(const Sequence *sequence); 29 | 30 | /** Go through a genome, and throw an exception if anything 31 | * appears out of whack. */ 32 | void validateGenome(const Genome *genome); 33 | 34 | /** Go through a genome, and throw an exception if any duplications 35 | * appear out of whack. */ 36 | void validateDuplications(const Genome *genome); 37 | 38 | /** Go through an alignment, and throw an exception if anything 39 | * appears out of whack.
*/ 40 | void validateAlignment(const Alignment *alignment); 41 | } 42 | #endif 43 | 44 | // Local Variables: 45 | // mode: c++ 46 | // End: 47 | -------------------------------------------------------------------------------- /api/mmap_impl/mmapArray.h: -------------------------------------------------------------------------------- 1 | #ifndef _MMAPARRAY_H 2 | #define _MMAPARRAY_H 3 | #include "halMetaData.h" 4 | 5 | namespace hal { 6 | class MMapArrayData { 7 | public: 8 | size_t _elementSize; 9 | size_t _capacity; 10 | size_t _length; 11 | }; 12 | 13 | template class MMapArray { 14 | public: 15 | // Construct & initalize new MMapArray. 16 | MMapArray(MMapAlignment *alignment) : _alignment(alignment), _data(NULL) { 17 | grow(8); 18 | }; 19 | // Construct a MMapArray representing the existing array at this offset. 20 | MMapArray(MMapAlignment *alignment, size_t offset) : _alignment(alignment), _offset(offset) { 21 | _data = (MMapArrayData *)_alignment->resolveOffset(_offset, sizeof(MMapArrayData)); 22 | }; 23 | T *operator[](size_t index) { 24 | return getSlice(index, 1); 25 | }; 26 | T *getSlice(size_t index, size_t length) { 27 | return (T *)_alignment->resolveOffset(_offset + sizeof(MMapArrayData) + index * _data->_elementSize, 28 | _data->_elementSize * length); 29 | }; 30 | size_t getOffset() { 31 | return _offset; 32 | }; 33 | size_t getCapacity() { 34 | return _data->_capacity; 35 | }; 36 | size_t getLength() { 37 | return _data->_length; 38 | }; 39 | 40 | void grow(size_t capacity) { 41 | size_t size = sizeof(MMapArrayData) + capacity * sizeof(T); 42 | size_t newOffset = _alignment->allocateNewArray(size); 43 | MMapArrayData *newData = (MMapArrayData *)_alignment->resolveOffset(newOffset, size); 44 | if (_data != NULL) { 45 | memcpy(newData, _data, sizeof(MMapArrayData) + _data->_length * _data->_elementSize); 46 | } 47 | _data = newData; 48 | _offset = newOffset; 49 | _data->_capacity = capacity; 50 | _data->_elementSize = sizeof(T); 51 | }; 52 | 53 | 
size_t setLength(size_t length) { 54 | if (length <= _data->_capacity) { 55 | _data->_length = length; 56 | } else { 57 | grow(length); 58 | _data->_length = length; 59 | } 60 | return getOffset(); 61 | } 62 | 63 | private: 64 | MMapAlignment *_alignment; 65 | size_t _offset; 66 | MMapArrayData *_data; 67 | }; 68 | } 69 | #endif 70 | // Local Variables: 71 | // mode: c++ 72 | // End: 73 | -------------------------------------------------------------------------------- /api/mmap_impl/mmapBottomSegment.cpp: -------------------------------------------------------------------------------- 1 | #include "mmapBottomSegment.h" 2 | #include "halDnaIterator.h" 3 | #include "mmapTopSegment.h" 4 | 5 | using namespace hal; 6 | /* Record this bottom segment's start and, via the next record's start, its length; throws hal_exception when a genome is set and the range falls outside its sequence length. */ 7 | void MMapBottomSegment::setCoordinates(hal_index_t startPos, hal_size_t length) { 8 | if (_genome && 9 | (startPos >= (hal_index_t)_genome->getSequenceLength() || startPos + length > _genome->getSequenceLength())) { 10 | throw hal_exception("Trying to set bottom segment coordinate out of range"); 11 | } 12 | 13 | _data->setStartPosition(startPos); 14 | getNextData()->setStartPosition(startPos + length); 15 | } 16 | 17 | hal_offset_t MMapBottomSegment::getTopParseOffset() const { 18 | assert(_index >= 0); 19 | hal_offset_t offset = 0; 20 | hal_index_t topIndex = getTopParseIndex(); 21 | if (topIndex != NULL_INDEX) { 22 | MMapTopSegment ts(getMMapGenome(), topIndex); 23 | assert(ts.getStartPosition() <= getStartPosition()); 24 | assert((hal_index_t)(ts.getStartPosition() + ts.getLength()) >= getStartPosition()); 25 | offset = getStartPosition() - ts.getStartPosition(); 26 | } 27 | return offset; 28 | } 29 | 30 | void MMapBottomSegment::print(std::ostream &os) const { 31 | os << "MMapBottomSegment" << getStartPosition() << " " << getEndPosition() << std::endl; 32 | } 33 | -------------------------------------------------------------------------------- /api/mmap_impl/mmapBottomSegmentData.h:
-------------------------------------------------------------------------------- 1 | #ifndef _MMAPBOTTOMSEGMENTDATA_H 2 | #define _MMAPBOTTOMSEGMENTDATA_H 3 | 4 | namespace hal { 5 | class MMapBottomSegmentData { 6 | public: 7 | void setStartPosition(hal_index_t startPosition) { 8 | _startPosition = startPosition; 9 | }; 10 | void setTopParseIndex(hal_index_t parseIndex) { 11 | _topParseIndex = parseIndex; 12 | }; 13 | void setChildIndex(hal_size_t child, hal_index_t childIndex) { 14 | *getChildIndexLocation(child) = childIndex; 15 | }; 16 | void setChildReversed(hal_size_t numChildren, hal_size_t child, bool childReversed) { 17 | *getChildReversedLocation(numChildren, child) = childReversed; 18 | }; 19 | 20 | hal_index_t getStartPosition() const { 21 | return _startPosition; 22 | }; 23 | hal_index_t getTopParseIndex() const { 24 | return _topParseIndex; 25 | }; 26 | hal_index_t getChildIndex(hal_size_t child) const { 27 | return *getChildIndexLocation(child); 28 | }; 29 | hal_index_t getChildReversed(hal_size_t numChildren, hal_size_t child) const { 30 | return *getChildReversedLocation(numChildren, child); 31 | }; 32 | 33 | // Get on-disk size of this element for the given genome. NB: the size is 34 | // rounded up to the next 8-byte boundary for alignment purposes. 
35 | static size_t getSize(const Genome *genome) { 36 | size_t extraAlignmentBytes = 0; 37 | if ((genome->getNumChildren() % 8) != 0) { 38 | extraAlignmentBytes = 8 - (genome->getNumChildren() % 8); 39 | } 40 | return sizeof(hal_index_t) * (2 + genome->getNumChildren()) + (genome->getNumChildren() + extraAlignmentBytes); 41 | }; 42 | 43 | private: 44 | hal_index_t *getChildIndexLocation(hal_size_t child) const { 45 | return const_cast(&_topParseIndex + 1 + child); 46 | } 47 | bool *getChildReversedLocation(hal_size_t numChildren, hal_size_t child) const { 48 | return ((bool *)getChildIndexLocation(numChildren)) + child; 49 | } 50 | hal_index_t _startPosition; 51 | hal_index_t _topParseIndex; 52 | }; 53 | } 54 | #endif 55 | // Local Variables: 56 | // mode: c++ 57 | // End: 58 | -------------------------------------------------------------------------------- /api/mmap_impl/mmapDnaDriver.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | #include "mmapDnaDriver.h" 8 | #include "mmapAlignment.h" 9 | #include "mmapGenome.h" 10 | 11 | using namespace hal; 12 | 13 | static const int UDC_FETCH_SIZE = 64 * 1024; // size to bring in for UDC access 14 | 15 | MMapDnaAccess::MMapDnaAccess(MMapGenome *genome, hal_index_t index) 16 | : DnaAccess(0, 0, NULL), _genome(genome), 17 | _isUdcProtocol(dynamic_cast(_genome->getAlignment())->getMMapFile()->isUdcProtocol()) { 18 | if (_isUdcProtocol) { 19 | fetch(index); 20 | } else { 21 | // for local mmap, just include the whole thing 22 | _endIndex = _genome->getSequenceLength(); 23 | _buffer = _genome->getDNA(0, 1); 24 | } 25 | } 26 | 27 | void MMapDnaAccess::flush() { 28 | // kernel handles page out 29 | _dirty = false; 30 | } 31 | 32 | void MMapDnaAccess::fetch(hal_index_t index) const { 33 | if 
(_isUdcProtocol) { 34 | _startIndex = 2 * (index / 2); // even boundary 35 | _endIndex = std::max(hal_size_t(_startIndex + UDC_FETCH_SIZE), _genome->getSequenceLength()); 36 | _buffer = _genome->getDNA(_startIndex / 2, (((_endIndex - _startIndex) + 1) / 2)); 37 | } else { 38 | assert(false); // this should never be called for local 39 | } 40 | _dirty = false; // keep consistent, but not actually used 41 | } 42 | -------------------------------------------------------------------------------- /api/mmap_impl/mmapDnaDriver.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | #ifndef _MMAPDNADRIVER_H 8 | #define _MMAPDNADRIVER_H 9 | #include "halDnaDriver.h" 10 | 11 | namespace hal { 12 | class MMapGenome; 13 | class MMapAlignment; 14 | 15 | /** 16 | * Mmap implementation of DnaAccess. 
17 | */ 18 | class MMapDnaAccess : public DnaAccess { 19 | public: 20 | MMapDnaAccess(MMapGenome *genome, hal_index_t index); 21 | 22 | virtual ~MMapDnaAccess() { 23 | } 24 | 25 | void flush(); 26 | 27 | protected: 28 | virtual void fetch(hal_index_t index) const; 29 | 30 | private: 31 | MMapGenome *_genome; 32 | bool _isUdcProtocol; 33 | }; 34 | } 35 | 36 | #endif 37 | // Local Variables: 38 | // mode: c++ 39 | // End: 40 | -------------------------------------------------------------------------------- /api/mmap_impl/mmapSequenceData.h: -------------------------------------------------------------------------------- 1 | #ifndef _MMAPSEQUENCEDATA_H 2 | #define _MMAPSEQUENCEDATA_H 3 | #include "mmapAlignment.h" 4 | namespace hal { 5 | class MMapSequenceData { 6 | friend class MMapSequence; 7 | friend class MMapGenome; 8 | 9 | public: 10 | const char *getName(MMapAlignment *alignment) const { 11 | return (const char *)alignment->resolveOffset(_nameOffset, _nameLength); 12 | }; 13 | void setName(MMapAlignment *alignment, const std::string &newName) { 14 | size_t size = newName.size() + 1; 15 | _nameOffset = alignment->allocateNewArray(sizeof(char) * size); 16 | strncpy((char *)alignment->resolveOffset(_nameOffset, size), newName.c_str(), size); 17 | _nameLength = size; 18 | }; 19 | 20 | private: 21 | hal_index_t _startPosition; 22 | hal_index_t _index; 23 | hal_size_t _length; 24 | hal_index_t _topSegmentStartIndex; 25 | hal_index_t _bottomSegmentStartIndex; 26 | hal_size_t _numTopSegments; 27 | hal_size_t _numBottomSegments; 28 | size_t _nameLength; 29 | size_t _nameOffset; 30 | char _reserved[256]; // 256 bytes of reserved added in mmap API 1.1 31 | }; 32 | } 33 | #endif 34 | // Local Variables: 35 | // mode: c++ 36 | // End: 37 | -------------------------------------------------------------------------------- /api/mmap_impl/mmapSequenceIterator.h: -------------------------------------------------------------------------------- 1 | #ifndef _MMAPSEQUENCEITERATOR_H 2 
| #define _MMAPSEQUENCEITERATOR_H 3 | #include "halSequenceIterator.h" 4 | namespace hal { 5 | // FIXME: the fact that _data is being moved but _index is not moved is confusing; 6 | // why does this class have an _index? 7 | class MMapSequenceIterator : public SequenceIterator { 8 | public: 9 | MMapSequenceIterator(MMapGenome *genome, hal_index_t index) 10 | : _genome(genome), _sequence(_genome, _genome->getSequenceData(index)), _index(index){}; 11 | 12 | // SEQUENCE ITERATOR METHODS 13 | SequenceIteratorPtr clone() const { 14 | return SequenceIteratorPtr(new MMapSequenceIterator(_genome, _index)); 15 | } 16 | void toNext() { 17 | _index++; 18 | _sequence._data += 1; 19 | } 20 | void toPrev() { 21 | _index--; 22 | _sequence._data -= 1; 23 | } 24 | bool atEnd() const { 25 | return (_sequence._data < _genome->getSequenceData(0)) or 26 | (_sequence._data >= _genome->getSequenceData(_genome->getNumSequences())); 27 | } 28 | const Sequence *getSequence() const { 29 | return &_sequence; 30 | } 31 | Sequence *getSequence() { 32 | return &_sequence; 33 | } 34 | bool equals(SequenceIteratorPtr other) const { 35 | const MMapSequenceIterator *mmapOther = reinterpret_cast(other.get()); 36 | assert(_sequence.getGenome() == mmapOther->_sequence.getGenome()); 37 | return _sequence.getArrayIndex() == mmapOther->_sequence.getArrayIndex(); 38 | }; 39 | 40 | private: 41 | MMapGenome *_genome; 42 | MMapSequence _sequence; 43 | hal_index_t _index; 44 | }; 45 | } 46 | #endif 47 | // Local Variables: 48 | // mode: c++ 49 | // End: 50 | -------------------------------------------------------------------------------- /api/mmap_impl/mmapString.h: -------------------------------------------------------------------------------- 1 | #ifndef _MMAP_STRING_H 2 | #define _MMAP_STRING_H 3 | #include "mmapArray.h" 4 | namespace hal { 5 | class MMapString : public MMapArray { 6 | public: 7 | MMapString(MMapAlignment *alignment, const std::string &string) : MMapArray(alignment), _string(string) { 8 | 
set(_string); 9 | }; 10 | MMapString(MMapAlignment *alignment, size_t offset) : MMapArray(alignment, offset) { 11 | read(); 12 | }; 13 | const char *c_str() { 14 | return getSlice(0, getLength()); 15 | } 16 | const std::string &get() { 17 | return _string; 18 | } 19 | size_t set(const std::string &string) { 20 | _string = string; 21 | setLength(string.size() + 1); 22 | for (size_t i = 0; i < string.size(); i++) { 23 | *(this->operator[](i)) = string[i]; 24 | } 25 | *(this->operator[](string.size())) = '\0'; 26 | return getOffset(); 27 | } 28 | 29 | private: 30 | void read() { 31 | _string = c_str(); 32 | }; 33 | std::string _string; 34 | }; 35 | } 36 | #endif 37 | // Local Variables: 38 | // mode: c++ 39 | // End: 40 | -------------------------------------------------------------------------------- /api/mmap_impl/mmapTopSegment.cpp: -------------------------------------------------------------------------------- 1 | #include "mmapTopSegment.h" 2 | #include "halDnaIterator.h" 3 | #include "mmapBottomSegment.h" 4 | 5 | using namespace hal; 6 | 7 | void MMapTopSegment::setCoordinates(hal_index_t startPos, hal_size_t length) { 8 | if (_genome && 9 | (startPos >= (hal_index_t)_genome->getSequenceLength() || startPos + length > _genome->getSequenceLength())) { 10 | throw hal_exception("Trying to set top segment coordinate out of range"); 11 | } 12 | 13 | _data->setStartPosition(startPos); 14 | (_data + 1)->setStartPosition(startPos + length); 15 | } 16 | 17 | hal_offset_t MMapTopSegment::getBottomParseOffset() const { 18 | assert(_index >= 0); 19 | hal_offset_t offset = 0; 20 | hal_index_t bottomIndex = getBottomParseIndex(); 21 | if (bottomIndex != NULL_INDEX) { 22 | MMapBottomSegment bs(getMMapGenome(), bottomIndex); 23 | assert(bs.getStartPosition() <= getStartPosition()); 24 | assert((hal_index_t)(bs.getStartPosition() + bs.getLength()) >= getStartPosition()); 25 | offset = getStartPosition() - bs.getStartPosition(); 26 | } 27 | return offset; 28 | } 29 | 30 | bool 
MMapTopSegment::isCanonicalParalog() const { 31 | bool isCanon = false; 32 | if (hasParent()) { 33 | MMapGenome *parGenome = const_cast(dynamic_cast(_genome->getParent())); 34 | 35 | MMapBottomSegment parent(parGenome, getParentIndex()); 36 | hal_index_t childGenomeIndex = parGenome->getChildIndex(_genome); 37 | isCanon = parent.getChildIndex(childGenomeIndex) == _index; 38 | } 39 | return isCanon; 40 | } 41 | 42 | void MMapTopSegment::print(std::ostream &os) const { 43 | os << "MMapTopSegment" << getStartPosition() << " " << getEndPosition() << std::endl; 44 | } 45 | -------------------------------------------------------------------------------- /api/mmap_impl/mmapTopSegmentData.h: -------------------------------------------------------------------------------- 1 | #ifndef _MMAPTOPSEGMENTDATA_H 2 | #define _MMAPTOPSEGMENTDATA_H 3 | 4 | namespace hal { 5 | class MMapTopSegmentData { 6 | public: 7 | void setStartPosition(hal_index_t startPosition) { 8 | _startPosition = startPosition; 9 | }; 10 | void setBottomParseIndex(hal_index_t parseIndex) { 11 | _bottomParseIndex = parseIndex; 12 | }; 13 | void setNextParalogyIndex(hal_index_t paralogyIndex) { 14 | _paralogyIndex = paralogyIndex; 15 | }; 16 | void setParentIndex(hal_index_t parentIndex) { 17 | _parentIndex = parentIndex; 18 | }; 19 | void setReversed(bool reversed) { 20 | _reversed = reversed; 21 | }; 22 | 23 | hal_index_t getStartPosition() const { 24 | return _startPosition; 25 | }; 26 | hal_index_t getBottomParseIndex() const { 27 | return _bottomParseIndex; 28 | }; 29 | hal_index_t getNextParalogyIndex() const { 30 | return _paralogyIndex; 31 | }; 32 | hal_index_t getParentIndex() const { 33 | return _parentIndex; 34 | }; 35 | hal_index_t getReversed() const { 36 | return _reversed; 37 | }; 38 | 39 | private: 40 | hal_index_t _startPosition; 41 | hal_index_t _bottomParseIndex; 42 | hal_index_t _paralogyIndex; 43 | hal_index_t _parentIndex; 44 | bool _reversed; 45 | }; 46 | } 47 | #endif 48 | // Local 
Variables: 49 | // mode: c++ 50 | // End: 51 | -------------------------------------------------------------------------------- /api/tests/halAlignmentTreesTest.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #include "halApiTestSupport.h" 9 | #include "halAlignment.h" 10 | #include "halGenome.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | extern "C" { 16 | #include "commonC.h" 17 | } 18 | 19 | using namespace std; 20 | using namespace hal; 21 | 22 | class AlignmentTestTrees : public AlignmentTest { 23 | public: 24 | void createCallBack(AlignmentPtr alignment) { 25 | hal_size_t alignmentSize = alignment->getNumGenomes(); 26 | CuAssertTrue(_testCase, alignmentSize == 0); 27 | 28 | alignment->addRootGenome("Root", 0); 29 | alignment->addLeafGenome("Leaf", "Root", 10); 30 | alignment->addRootGenome("NewRoot", 15); 31 | alignment->addLeafGenome("Leaf1", "Root", 4.1); 32 | alignment->addLeafGenome("Leaf2", "Root", 5.1); 33 | alignment->addLeafGenome("Leaf3", "Root", 6.1); 34 | alignment->addLeafGenome("Leaf4", "Root", 7.1); 35 | alignment->updateBranchLength("Root", "Leaf1", 3.0); 36 | alignment->updateBranchLength("Root", "Leaf2", 6.1); 37 | alignment->updateBranchLength("Root", "Leaf2", 5.1); 38 | } 39 | 40 | void checkCallBack(AlignmentConstPtr alignment) { 41 | CuAssertTrue(_testCase, alignment->getRootName() == "NewRoot"); 42 | CuAssertTrue(_testCase, 43 | alignment->getNewickTree() == "((Leaf:10,Leaf1:3,Leaf2:5.1,Leaf3:6.1,Leaf4:7.1)Root:15)NewRoot;"); 44 | CuAssertTrue(_testCase, alignment->getBranchLength("Root", "Leaf") == 10.0); 45 | CuAssertTrue(_testCase, alignment->getBranchLength("Root", "Leaf1") == 3.0); 46 | vector children = alignment->getChildNames("Root"); 47 | CuAssertTrue(_testCase, 
children.size() == 5); 48 | 49 | vector leaves = alignment->getLeafNamesBelow("Leaf"); 50 | CuAssertTrue(_testCase, leaves.size() == 0); 51 | leaves = alignment->getLeafNamesBelow("NewRoot"); 52 | CuAssertTrue(_testCase, leaves.size() == 5); 53 | for (size_t i = 0; i < leaves.size(); ++i) { 54 | CuAssertTrue(_testCase, leaves[i][0] == 'L'); 55 | } 56 | leaves = alignment->getLeafNamesBelow("Root"); 57 | CuAssertTrue(_testCase, leaves.size() == 5); 58 | for (size_t i = 0; i < leaves.size(); ++i) { 59 | CuAssertTrue(_testCase, leaves[i][0] == 'L'); 60 | } 61 | } 62 | }; 63 | 64 | static void halAlignmentTestTrees(CuTest *testCase) { 65 | AlignmentTestTrees tester; 66 | tester.check(testCase); 67 | } 68 | 69 | static CuSuite *halAlignmentTestSuite(void) { 70 | CuSuite *suite = CuSuiteNew(); 71 | SUITE_ADD_TEST(suite, halAlignmentTestTrees); 72 | return suite; 73 | } 74 | 75 | int main(int argc, char *argv[]) { 76 | return runHalTestSuite(argc, argv, halAlignmentTestSuite()); 77 | } 78 | -------------------------------------------------------------------------------- /api/tests/halApiTestSupport.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALAPITESTSUPPORT_H 9 | #define _HALAPITESTSUPPORT_H 10 | 11 | #include "halAlignmentInstance.h" 12 | #include 13 | extern "C" { 14 | #include "CuTest.h" 15 | #include "commonC.h" 16 | } 17 | 18 | using namespace hal; 19 | using namespace std; 20 | 21 | AlignmentPtr getTestAlignmentInstances(const string &storageFormat, const string &alignmentPath, unsigned mode); 22 | 23 | /** parse command line and run a test suite for the given storage driver, 24 | * return exit code */ 25 | int runHalTestSuite(int argc, char *argv[], CuSuite *suite); 26 | 27 | /* Base class for alignment 
tests. Handles setup of test HAL and has required 28 | * methods. */ 29 | class AlignmentTest { 30 | public: 31 | AlignmentTest() { 32 | } 33 | virtual ~AlignmentTest() { 34 | } 35 | void check(CuTest *testCase); 36 | virtual void createCallBack(AlignmentPtr alignment) { 37 | } 38 | virtual void checkCallBack(AlignmentConstPtr alignment) { 39 | } 40 | CuTest *_testCase; 41 | string _createPath; 42 | string _checkPath; 43 | static string randomString(hal_size_t length); 44 | void checkOne(CuTest *testCase, const string &storageFormat); 45 | }; 46 | 47 | 48 | 49 | #endif 50 | // Local Variables: 51 | // mode: c++ 52 | // End: 53 | -------------------------------------------------------------------------------- /api/tests/halMetaDataTest.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #include "halApiTestSupport.h" 9 | #include "halAlignment.h" 10 | #include "halGenome.h" 11 | #include "halMetaData.h" 12 | #include 13 | #include 14 | extern "C" { 15 | #include "commonC.h" 16 | } 17 | 18 | using namespace std; 19 | using namespace hal; 20 | 21 | struct MetaDataTest : public AlignmentTest { 22 | void createCallBack(AlignmentPtr alignment) { 23 | hal_size_t alignmentSize = alignment->getNumGenomes(); 24 | CuAssertTrue(_testCase, alignmentSize == 0); 25 | 26 | MetaData *meta = alignment->getMetaData(); 27 | CuAssertTrue(_testCase, meta->getMap().empty() == true); 28 | meta->set("colour", "red"); 29 | meta->set("number", "1"); 30 | meta->set("animal", "cat"); 31 | meta->set("colour", "black"); 32 | 33 | CuAssertTrue(_testCase, meta->get("colour") == "black"); 34 | CuAssertTrue(_testCase, meta->get("number") == "1"); 35 | CuAssertTrue(_testCase, meta->get("animal") == "cat"); 36 | 37 | CuAssertTrue(_testCase, meta->has("colour") 
== true); 38 | CuAssertTrue(_testCase, meta->has("city") == false); 39 | 40 | CuAssertTrue(_testCase, meta->getMap().size() == 3); 41 | } 42 | 43 | void checkCallBack(AlignmentConstPtr alignment) { 44 | const MetaData *meta = alignment->getMetaData(); 45 | 46 | CuAssertTrue(_testCase, meta->get("colour") == "black"); 47 | CuAssertTrue(_testCase, meta->get("number") == "1"); 48 | CuAssertTrue(_testCase, meta->get("animal") == "cat"); 49 | 50 | CuAssertTrue(_testCase, meta->has("colour") == true); 51 | CuAssertTrue(_testCase, meta->has("city") == false); 52 | 53 | CuAssertTrue(_testCase, meta->getMap().size() == 3); 54 | } 55 | }; 56 | 57 | static void halMetaDataTest(CuTest *testCase) { 58 | MetaDataTest tester; 59 | tester.check(testCase); 60 | } 61 | 62 | static CuSuite *halMetaDataTestSuite(void) { 63 | CuSuite *suite = CuSuiteNew(); 64 | // FIXME: commented this out till we implement this for mmap. 65 | if (false) { 66 | SUITE_ADD_TEST(suite, halMetaDataTest); 67 | } 68 | return suite; 69 | } 70 | 71 | int main(int argc, char *argv[]) { 72 | return runHalTestSuite(argc, argv, halMetaDataTestSuite()); 73 | } 74 | -------------------------------------------------------------------------------- /api/tests/halRandomData.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALRANDOMDATA_H 9 | #define _HALRANDOMDATA_H 10 | 11 | #include "hal.h" 12 | #include 13 | 14 | namespace hal { 15 | class RandNumberGen; 16 | 17 | void createRandomAlignment(RandNumberGen &rng, AlignmentPtr emptyAlignment, double meanDegree, double maxBranchLength, 18 | hal_size_t minGenomes, hal_size_t maxGenomes, hal_size_t minSegmentLength, 19 | hal_size_t maxSegmentLength, hal_size_t minSegments, hal_size_t maxSegments); 20 | 21 | void 
createRandomTree(RandNumberGen &rng, AlignmentPtr emptyAlignment, double meanDegree, double maxBranchLength, 22 | hal_size_t minGenomes, hal_size_t maxGenomes); 23 | 24 | void createRandomDimensions(RandNumberGen &rng, AlignmentPtr alignment, hal_size_t minSegmentLength, 25 | hal_size_t maxSegmentLength, hal_size_t minSegments, hal_size_t maxSegments); 26 | 27 | void createRandomGenome(RandNumberGen &rng, AlignmentPtr alignment, Genome *genome); 28 | 29 | void createRandomSegment(RandNumberGen &rng, Genome *genome, hal_size_t indexInParent, 30 | std::set> &edgeSet, TopSegmentIteratorPtr topIter, 31 | BottomSegmentIteratorPtr botIter, double branchLength); 32 | } 33 | #endif 34 | // Local Variables: 35 | // mode: c++ 36 | // End: 37 | -------------------------------------------------------------------------------- /assemblyHub/Makefile: -------------------------------------------------------------------------------- 1 | rootDir = .. 2 | include ${rootDir}/include.mk 3 | 4 | progs = ${binDir}/hal2assemblyHub.py 5 | 6 | all: libs progs 7 | libs: 8 | progs: ${progs} 9 | clean: 10 | rm -f ${progs} 11 | test: 12 | 13 | include ${rootDir}/rules.mk 14 | 15 | # don't fail on missing dependencies, they are first time the .o is generates 16 | -include ${depends} 17 | 18 | 19 | # Local Variables: 20 | # mode: makefile-gmake 21 | # End: 22 | 23 | -------------------------------------------------------------------------------- /assemblyHub/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/assemblyHub/__init__.py -------------------------------------------------------------------------------- /assemblyHub/alignabilityTrack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2013 by Ngan Nguyen 4 | # Copyright (C) 2012-2019 by UCSC Computational 
Genomics Lab 5 | # 6 | #Released under the MIT license, see LICENSE.txt 7 | 8 | """Creating Alignability (Alignment Depth) track for the hubs 9 | """ 10 | import os 11 | from sonLib.bioio import system 12 | from toil.job import Job 13 | 14 | class GetAlignability( Job ): 15 | def __init__(self, genomedir, genome, halfile): 16 | Job.__init__(self) 17 | self.genomedir = genomedir 18 | self.genome = genome 19 | self.halfile = halfile 20 | 21 | def run(self, fileStore): 22 | outfile = os.path.join(self.genomedir, "%s.alignability.bw" %self.genome) 23 | tempwig = os.path.join(self.genomedir, "%s.alignability.wig" %self.genome) 24 | system("halAlignmentDepth %s %s > %s" %(self.halfile, self.genome, tempwig)) 25 | chrsizefile = os.path.join(self.genomedir, "chrom.sizes") 26 | system("wigToBigWig %s %s %s" %(tempwig, chrsizefile, outfile)) 27 | system("rm -f %s" %tempwig) 28 | 29 | def writeTrackDb_alignability(f, genome, genomeCount): 30 | f.write("track alignability\n") 31 | f.write("longLabel Alignability\n") 32 | f.write("shortLabel Alignability\n") 33 | f.write("type bigWig 0 %d\n" %genomeCount) 34 | f.write("group map\n") 35 | f.write("visibility dense\n") 36 | f.write("windowingFunction Mean\n") 37 | f.write("bigDataUrl %s.alignability.bw\n" %genome) 38 | 39 | f.write("priority 2\n") 40 | f.write("autoScale On\n") 41 | f.write("maxHeightPixels 128:36:16\n") 42 | f.write("graphTypeDefault Bar\n") 43 | f.write("gridDefault OFF\n") 44 | f.write("color 0,0,0\n") 45 | f.write("altColor 128,128,128\n") 46 | f.write("viewLimits 0:%d\n" %genomeCount) 47 | f.write("html ../documentation/alignability\n") 48 | f.write("\n") 49 | 50 | def addAlignabilityOptions(parser): 51 | from optparse import OptionGroup 52 | #group = parser.add_argument_group("ALIGNABILITY", "Alignability: the number of genomes aligned to each position.") 53 | group = parser.add_argument_group("ALIGNABILITY") 54 | group.add_argument('--alignability', dest='alignability', action='store_true', default=False, 
help='If specified, make Alignability (aka Alignment Depth) tracks. ') 55 | group = parser.add_argument_group(group) 56 | 57 | -------------------------------------------------------------------------------- /assemblyHub/docs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/assemblyHub/docs/__init__.py -------------------------------------------------------------------------------- /assemblyHub/docs/alignabilityDocs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2013 by Ngan Nguyen 4 | # Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 5 | # 6 | #Released under the MIT license, see LICENSE.txt 7 | 8 | """ 9 | Mon Oct 28 09:35:55 PDT 2013 10 | Documentation for the alignability track 11 | """ 12 | import sys, os 13 | 14 | def writeAlignabilityDocs_description(f): 15 | f.write("

Description

\n") 16 | f.write("

\n") 17 | f.write("This track shows the number of genomes aligned to each position of the reference. The values range from 0 to the total number of input genomes and imputed ancestral genomes.\n") 18 | f.write("

\n") 19 | f.write("\n") 20 | 21 | def writeAlignabilityDocs_methods(f): 22 | f.write("

Methods

\n") 23 | f.write("

\n") 24 | f.write("Alignability was generated using the halAlignability script of the HAL tools package.\n") 25 | f.write("

\n") 26 | f.write("\n") 27 | 28 | def writeAlignabilityDocs_references(f): 29 | f.write("

References

\n") 30 | f.write("

\n") 31 | f.write("Hickey et al..\n") 32 | f.write("HAL: a hierarchical format for storing and analyzing multiple genome alignments..\n") 33 | f.write("Bioinformatics. 2013 May;29(10):1341-1342.\n") 34 | f.write("

\n") 35 | f.write("\n") 36 | 37 | def writeAlignabilityDocs(file): 38 | f = open(file, 'w') 39 | writeAlignabilityDocs_description(f) 40 | writeAlignabilityDocs_methods(f) 41 | writeAlignabilityDocs_references(f) 42 | f.close() 43 | 44 | def makeAlignabilityDocs(outdir): 45 | outfile = os.path.join(outdir, "alignability.html") 46 | writeAlignabilityDocs(outfile) 47 | 48 | def main(): 49 | makeAlignabilityDocs(sys.argv[1]) 50 | 51 | if __name__ == '__main__': 52 | main() 53 | 54 | 55 | -------------------------------------------------------------------------------- /assemblyHub/docs/gcPercentDocs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2013 by Ngan Nguyen 4 | # Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 5 | # 6 | #Released under the MIT license, see LICENSE.txt 7 | 8 | """ 9 | Mon Oct 28 09:35:55 PDT 2013 10 | Documentation for the gcPercent track 11 | """ 12 | import sys, os 13 | 14 | def writeGcPercentDocs_description(f): 15 | f.write("

Description

\n") 16 | f.write("

\n") 17 | f.write("The GC percent track shows the percentage of G (guanine) and C (cytosine) bases in 5-base windows. High GC content is typically associated with gene-rich areas.\n") 18 | f.write("

\n") 19 | f.write("

\n") 20 | f.write("This track may be configured in a variety of ways to highlight different aspects of the displayed information. Click the \"Graph configuration help\" link for an explanation of the configuration options.\n") 21 | f.write("

\n") 22 | f.write("\n") 23 | 24 | def writeGcPercentDocs_methods(f): 25 | f.write("

Methods

\n") 26 | f.write("

\n") 27 | f.write("This track was generated following the UCSC GC_Percent Track Construction instructions, using the sequence information extracted from the multiple sequence alignments.\n") 28 | f.write("

\n") 29 | f.write("\n") 30 | 31 | def writeGcPercentDocs_credits(f): 32 | f.write("

References

\n") 33 | f.write("

\n") 34 | f.write("The GC Percent graph presentation is by Hiram Clawson. The data was automatically generated using the HAL tools package.\n") 35 | f.write("

\n") 36 | f.write("\n") 37 | 38 | def writeGcPercentDocs(file): 39 | f = open(file, 'w') 40 | writeGcPercentDocs_description(f) 41 | writeGcPercentDocs_methods(f) 42 | writeGcPercentDocs_credits(f) 43 | f.close() 44 | 45 | def makeGcPercentDocs(outdir): 46 | outfile = os.path.join(outdir, "gcPercent.html") 47 | writeGcPercentDocs(outfile) 48 | 49 | def main(): 50 | makeGcPercentDocs(sys.argv[1]) 51 | 52 | if __name__ == '__main__': 53 | main() 54 | 55 | -------------------------------------------------------------------------------- /assemblyHub/docs/makeDocs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2013 by Ngan Nguyen 4 | # Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 5 | # 6 | #Released under the MIT license, see LICENSE.txt 7 | 8 | """ 9 | Make track documentation html files 10 | """ 11 | from hal.assemblyHub.docs.gcPercentDocs import makeGcPercentDocs 12 | from hal.assemblyHub.docs.alignabilityDocs import makeAlignabilityDocs 13 | from hal.assemblyHub.docs.conservationDocs import makeConservationDocs 14 | from hal.assemblyHub.docs.repeatMaskerDocs import makeRepeatMaskerDocs 15 | from hal.assemblyHub.docs.hubCentralDocs import makeHubCentralDocs 16 | 17 | def writeDocFiles(outdir, options): 18 | if options.gcContent: 19 | makeGcPercentDocs(outdir) 20 | if options.alignability: 21 | makeAlignabilityDocs(outdir) 22 | if options.conservation: 23 | makeConservationDocs(outdir) 24 | if options.rmskdir: 25 | makeRepeatMaskerDocs(outdir) 26 | makeHubCentralDocs(outdir) 27 | return 28 | 29 | -------------------------------------------------------------------------------- /assemblyHub/gcPercentTrack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2013 by Ngan Nguyen 4 | # Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 5 | # 6 | #Released under the MIT license, see 
LICENSE.txt 7 | 8 | """Creating GC percent track for the hubs 9 | """ 10 | import os 11 | from sonLib.bioio import system 12 | from toil.job import Job 13 | 14 | class GetGCpercent( Job ): 15 | def __init__(self, genomedir, genome): 16 | Job.__init__(self) 17 | self.genomedir = genomedir 18 | self.genome = genome 19 | 20 | def run(self, fileStore): 21 | twobitfile = os.path.join(self.genomedir, "%s.2bit" %self.genome) 22 | tempfile = os.path.join(self.genomedir, "%s.gc.wigVarStep.gz" %self.genome) 23 | cmd = "hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 %s %s | gzip -c > %s" %(self.genome, twobitfile, tempfile) 24 | system(cmd) 25 | chrsizefile = os.path.join(self.genomedir, "chrom.sizes") 26 | gcfile = os.path.join(self.genomedir, "%s.gc.bw" %self.genome) 27 | cmd = "wigToBigWig %s %s %s" %(tempfile, chrsizefile, gcfile) 28 | system(cmd) 29 | system("rm -f %s" %tempfile) 30 | 31 | def writeTrackDb_gcPercent(f, genome): 32 | f.write("track gcPercent\n") 33 | f.write("longLabel GC Percent in 5-base Window\n") 34 | f.write("shortLabel GC Percent\n") 35 | f.write("type bigWig 0 100\n") 36 | f.write("group map\n") 37 | f.write("visibility dense\n") 38 | f.write("windowingFunction Mean\n") 39 | f.write("bigDataUrl %s.gc.bw\n" %genome) 40 | 41 | f.write("priority 2\n") 42 | f.write("autoScale Off\n") 43 | f.write("maxHeightPixels 128:36:16\n") 44 | f.write("graphTypeDefault Bar\n") 45 | f.write("gridDefault OFF\n") 46 | f.write("color 0,0,0\n") 47 | f.write("altColor 128,128,128\n") 48 | f.write("viewLimits 30:70\n") 49 | f.write("html ../documentation/gcPercent\n") 50 | f.write("\n") 51 | 52 | def addGcOptions(parser): 53 | from optparse import OptionGroup 54 | #group = parser.add_argument_group("GC PERCENT", "GC Percent in 5-base Window.") 55 | group = parser.add_argument_group("GC PERCENT") 56 | group.add_argument('--gcContent', dest='gcContent', action='store_true', default=False, help='If specified, make GC-content tracks. 
') 57 | group = parser.add_argument_group(group) 58 | 59 | -------------------------------------------------------------------------------- /assemblyHub/hal2assemblyHubDoc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/assemblyHub/hal2assemblyHubDoc.pdf -------------------------------------------------------------------------------- /assemblyHub/rmskTrack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2013 by Ngan Nguyen 4 | # Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 5 | # 6 | #Released under the MIT license, see LICENSE.txt 7 | 8 | """RepeatMasker track 9 | """ 10 | import os 11 | from sonLib.bioio import system 12 | from optparse import OptionGroup 13 | 14 | def writeTrackDb_rmsk(f, rmskdir, genomedir): 15 | if not os.path.exists(rmskdir): 16 | return 17 | f.write("track repeatMasker\n") 18 | f.write("compositeTrack on\n") 19 | f.write("shortLabel RepeatMasker\n") 20 | f.write("longLabel Repeating Elements by RepeatMasker\n") 21 | f.write("group map\n") 22 | f.write("visibility dense\n") 23 | f.write("type bed 3 .\n") 24 | f.write("noInherit on\n") 25 | f.write("html ../documentation/repeatMasker\n") 26 | f.write("\n") 27 | 28 | system("ln -s %s %s" %(os.path.abspath(rmskdir), os.path.join(genomedir, "repeatMasker"))) 29 | files = os.listdir(rmskdir) 30 | for i, file in enumerate(files): 31 | element = file.split('.')[0] 32 | f.write("\ttrack repeatMasker%s\n" %element) 33 | f.write("\tparent repeatMasker\n") 34 | f.write("\tshortLabel %s\n" %element) 35 | f.write("\tlongLabel %s Repeating Elements by RepeatMasker\n" %element) 36 | f.write("\tpriority %d\n" %i) 37 | f.write("\tspectrum on\n") 38 | f.write("\tmaxWindowToDraw 10000000\n") 39 | f.write("\tcolorByStrand 50,50,150 150,50,50\n") 40 | f.write("\ttype bigBed 
6 +\n") 41 | f.write("\tbigDataUrl repeatMasker/%s.bb\n" %element) 42 | f.write("\n") 43 | f.write("\n") 44 | 45 | def addRmskOptions(parser): 46 | #group = parser.add_argument_group("REPEATMASKER", "RepeatMasker options.") 47 | group = parser.add_argument_group("REPEATMASKER") 48 | group.add_argument('--rmskDir', dest='rmskdir', help="Directory containing repeatMasker's output files for each genome. Format: rmskDir/ then genome1/ then genome.rmsk.SINE.bb, genome.rmsk.LINE.bb, ... ") 49 | group = parser.add_argument_group(group) 50 | 51 | def checkRmskOptions(parser, options): 52 | if options.rmskdir: 53 | if not os.path.exists(options.rmskdir) or not os.path.isdir(options.rmskdir): 54 | parser.error("RepeatMasker directory %s does not exist or is not a directory.\n" %options.rmskdir) 55 | 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /assemblyHub/snakeTrack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2013 by Ngan Nguyen 4 | # Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 5 | # 6 | #Released under the MIT license, see LICENSE.txt 7 | 8 | """Snake tracks 9 | """ 10 | from optparse import OptionGroup 11 | import re 12 | 13 | def addSnakeOptions(parser): 14 | group = parser.add_argument_group("SNAKE TRACKS", "Snake track options") 15 | group.add_argument('--selfAlignmentSnakes', dest="selfAlignmentTrack", 16 | help="Produce a self-alignment snake track for every genome", 17 | action="store_true", default=False) 18 | group = parser.add_argument_group(group) 19 | 20 | def writeTrackDb_snakes(f, halfile, genomes, subgenomes, currgenome, properName, snpwidth=None, doSelfAlignment=False): 21 | for i, genome in enumerate(genomes): 22 | if not doSelfAlignment and genome == currgenome: #current genome 23 | continue 24 | #SNAKE TRACKS 25 | genomeProperName = genome 26 | if genome in properName: 27 | genomeProperName 
= properName[genome] 28 | if genome == currgenome: 29 | genomeProperName += " (self)" 30 | f.write("\t\ttrack snake%s\n" %genome) 31 | f.write("\t\tlongLabel %s\n" %genomeProperName) 32 | f.write("\t\tshortLabel %s\n" %genomeProperName) 33 | f.write("\t\totherSpecies %s\n" %genome) 34 | if genome in subgenomes: 35 | f.write("\t\tvisibility full\n") 36 | f.write("\t\tparent hubCentralAlignments\n") 37 | else: 38 | f.write("\t\tvisibility hide\n") 39 | f.write("\t\tparent hubCentralAlignments off\n") 40 | if snpwidth: 41 | f.write("\t\tshowSnpWidth %d\n" % snpwidth) 42 | f.write("\t\tpriority %d\n" %(i + 2)) 43 | f.write("\t\tbigDataUrl %s\n" % halfile) 44 | f.write("\t\ttype halSnake\n") 45 | f.write("\t\tgroup snake\n") 46 | f.write("\t\tsubGroups view=Snake orgs=%s\n" %genome) 47 | f.write("\n") 48 | 49 | -------------------------------------------------------------------------------- /benchmarks/benchMark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2012 by Glenn Hickey 4 | # Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 5 | # 6 | #Released under the MIT license, see LICENSE.txt 7 | 8 | 9 | import argparse 10 | import os 11 | import sys 12 | import traceback 13 | import time 14 | import random 15 | import resource 16 | import psutil 17 | 18 | from sonLib.bioio import getTempDirectory 19 | from sonLib.bioio import getTempFile 20 | from sonLib.bioio import popenCatch 21 | from sonLib.bioio import system 22 | 23 | def runHalGen(preset, seed, hdf5Chunk, hdf5Compression, outPath): 24 | system("halRandGen --preset %s --seed %d --hdf5Chunk %d\ 25 | --hdf5Compression %d %s" % (preset, seed, hdf5Chunk, hdf5Compression, outPath)) 26 | 27 | def runHalCons(halPath, outputPath): 28 | system("halCons %s > outputPath" % halPath) 29 | 30 | 31 | def main(argv=None): 32 | if argv is None: 33 | argv = sys.argv 34 | 35 | seed = random.randint(0, 2**31) 36 | parser = 
argparse.ArgumentParser(description='Run little hal test') 37 | parser.add_argument('--preset', type=str, 38 | help='halGenRandom preset to use [small, medium, big, large]', default='small') 39 | args = parser.parse_args() 40 | rval = 0 41 | print("chunk, comp, time(gen), time(cons), fsize(k)") 42 | try: 43 | for chunkSize in [10000, 100000, 1000000, 10000000]: 44 | for compression in [0, 2, 5, 7, 9]: 45 | try: 46 | tempDir = getTempDirectory(rootDir="./") 47 | tempFile = getTempFile(suffix=".h5", rootDir=tempDir) 48 | except: 49 | traceback.print_exc(file=sys.stdout) 50 | return 1 51 | 52 | t = time.time() 53 | runHalGen(args.preset, seed, chunkSize, compression, tempFile) 54 | fsize = os.path.getsize(tempFile) 55 | th = time.time() - t 56 | runHalCons(tempFile, getTempFile(rootDir=tempDir)) 57 | tc = time.time() - th - t 58 | print("%d, %d, %f.3, %f.3, %f.2" % ( 59 | chunkSize, compression, th, tc, fsize / 1024.)) 60 | 61 | except: 62 | traceback.print_exc(file=sys.stdout) 63 | return 1 64 | 65 | system("rm -rf %s" % tempDir) 66 | return rval 67 | 68 | if __name__ == "__main__": 69 | sys.exit(main()) 70 | -------------------------------------------------------------------------------- /benchmarks/results/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/benchmarks/results/.gitignore -------------------------------------------------------------------------------- /benchmarks/runAndGetResources.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2012 by Glenn Hickey 4 | # Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 5 | # 6 | #Released under the MIT license, see LICENSE.txt 7 | 8 | 9 | import os 10 | import sys 11 | import time 12 | 13 | from sonLib.bioio import popenCatch 14 | from sonLib.bioio import 
getTotalCpuTimeAndMemoryUsage 15 | from sonLib.bioio import system 16 | 17 | def main(argv=None): 18 | if argv is None: 19 | argv = sys.argv 20 | if len(argv) != 2: 21 | print("usage: runAndGetResources.py \'cmdline\'") 22 | exit(1) 23 | cmdline = argv[1] 24 | wallStart = time.time() 25 | output = popenCatch(cmdline) 26 | wallClock = time.time() - wallStart 27 | print((wallClock,) + getTotalCpuTimeAndMemoryUsage()) 28 | return 0 29 | 30 | if __name__ == "__main__": 31 | sys.exit(main()) 32 | -------------------------------------------------------------------------------- /blockViz/Makefile: -------------------------------------------------------------------------------- 1 | rootDir = .. 2 | include ${rootDir}/include.mk 3 | modObjDir = ${objDir}/blockViz 4 | 5 | libHalBlockViz_srcs = impl/halBlockViz.cpp 6 | libHalBlockViz_objs = ${libHalBlockViz_srcs:%.cpp=${modObjDir}/%.o} 7 | blockVizBed_srcs = tests/blockVizBed.cpp 8 | blockVizBed_objs = ${blockVizBed_srcs:%.cpp=${modObjDir}/%.o} 9 | blockVizMaf_srcs = tests/blockVizMaf.cpp 10 | blockVizMaf_objs = ${blockVizMaf_srcs:%.cpp=${modObjDir}/%.o} 11 | blockVizTest_srcs = tests/blockVizTest.cpp 12 | blockVizTest_objs = ${blockVizTest_srcs:%.cpp=${modObjDir}/%.o} 13 | srcs = ${libHalBlockViz_srcs} ${blockVizBed_srcs} \ 14 | ${blockVizMaf_srcs} ${blockVizTest_srcs} 15 | objs = ${srcs:%.cpp=${modObjDir}/%.o} 16 | depends = ${srcs:%.cpp=%.depend} 17 | inclSpec += -I${rootDir}/liftover/inc -I${rootDir}/lod/inc -I${rootDir}/maf/inc -I${halApiTestIncl} 18 | otherLibs += ${halApiTestSupportLibs} ${libHalBlockViz} ${libHalLiftover} ${libHalLod} ${libHalMaf} 19 | progs = ${binDir}/blockVizBed ${binDir}/blockVizMaf ${binDir}/blockVizTest 20 | 21 | testTmpDir = output 22 | testHdf5Hal = ${testTmpDir}/small.haf5.hal 23 | testMmapHal = ${testTmpDir}/small.mmap.hal 24 | 25 | all: libs progs 26 | libs: ${libHalBlockViz} 27 | progs: ${progs} 28 | 29 | # only blockVizTest/Maf.cpp should include kent .h files, to preven conflicts with 
older versions of 30 | # phast 31 | ${modObjDir}/tests/blockVizTest.o : tests/blockVizTest.cpp 32 | @mkdir -p $(dir $@) 33 | ${CXX} -MM -MT $@ ${CXXFLAGS} ${UDCCXXFLAGS} ${inclSpec} -c $< >$*.depend 34 | ${CXX} ${CXXFLAGS} ${UDCCXXFLAGS} ${inclSpec} -c $< -o $@ 35 | 36 | ${modObjDir}/tests/blockVizMaf.o : tests/blockVizMaf.cpp 37 | @mkdir -p $(dir $@) 38 | ${CXX} -MM -MT $@ ${CXXFLAGS} ${UDCCXXFLAGS} ${inclSpec} -c $< >$*.depend 39 | ${CXX} ${CXXFLAGS} ${UDCCXXFLAGS} ${inclSpec} -c $< -o $@ 40 | 41 | ${modObjDir}/tests/blockVizBed.o : tests/blockVizBed.cpp 42 | @mkdir -p $(dir $@) 43 | ${CXX} -MM -MT $@ ${CXXFLAGS} ${UDCCXXFLAGS} ${inclSpec} -c $< >$*.depend 44 | ${CXX} ${CXXFLAGS} ${UDCCXXFLAGS} ${inclSpec} -c $< -o $@ 45 | 46 | clean: 47 | rm -f ${libHalBlockViz} ${objs} ${progs} ${depends} 48 | rm -rf ${testTmpDir} 49 | 50 | test: blockVizHdf5Tests blockVizMmapTests 51 | 52 | blockVizHdf5Tests: ${testHdf5Hal} ${progs} 53 | ${binDir}/blockVizTest --verbose --doSeq ${testHdf5Hal} Genome_2 Genome_0 Genome_0_seq 0 3000 >${testTmpDir}/$@.out 54 | diff tests/expected/$@.out ${testTmpDir}/$@.out 55 | 56 | 57 | blockVizMmapTests: ${testMmapHal} ${progs} 58 | ${binDir}/blockVizTest --verbose --doSeq ${testMmapHal} Genome_2 Genome_0 Genome_0_seq 0 3000 >${testTmpDir}/$@.out 59 | diff tests/expected/$@.out ${testTmpDir}/$@.out 60 | 61 | randGenArgs = --preset small --seed 0 --minSegmentLength 3000 --maxSegmentLength 5000 62 | 63 | ${testHdf5Hal}: ${progs} ${binDir}/halRandGen 64 | @mkdir -p $(dir $@) 65 | ${binDir}/halRandGen ${randGenArgs} --format hdf5 $@ 66 | 67 | ${testMmapHal}: ${progs} ${binDir}/halRandGen 68 | @mkdir -p $(dir $@) 69 | ${binDir}/halRandGen ${randGenArgs} --format mmap $@ 70 | 71 | include ${rootDir}/rules.mk 72 | 73 | # don't fail on missing dependencies, they are first time the .o is generates 74 | -include ${depends} 75 | 76 | 77 | # Local Variables: 78 | # mode: makefile-gmake 79 | # End: 80 | 81 | 
-------------------------------------------------------------------------------- /blockViz/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/blockViz/__init__.py -------------------------------------------------------------------------------- /blockViz/tests/blockVizMaf.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "halBlockViz.h" 3 | #include 4 | #include 5 | #include 6 | 7 | #ifdef ENABLE_UDC 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | #include "common.h" 12 | #include "udc2.h" 13 | #include 14 | #ifdef __cplusplus 15 | } 16 | #endif 17 | #endif 18 | 19 | struct bv_args_t { 20 | char *path; 21 | char *qSpecies; 22 | char *tSpecies; 23 | char *tChrom; 24 | int tStart; 25 | int tEnd; 26 | int doSeq; 27 | int doDupes; 28 | char *udcPath; 29 | }; 30 | 31 | static int parseArgs(int argc, char **argv, bv_args_t *args) { 32 | if (argc != 7 && argc != 8 && argc != 9) { 33 | return -1; 34 | } 35 | args->path = argv[1]; 36 | args->qSpecies = argv[2]; 37 | args->tSpecies = argv[3]; 38 | args->tChrom = argv[4]; 39 | if (sscanf(argv[5], "%d", &args->tStart) != 1 || sscanf(argv[6], "%d", &args->tEnd) != 1) { 40 | return -1; 41 | } 42 | args->doSeq = 0; 43 | args->doDupes = 0; 44 | if (argc >= 8) { 45 | if (sscanf(argv[7], "%d", &args->doDupes) != 1) { 46 | return -1; 47 | } 48 | } 49 | args->udcPath = NULL; 50 | if (argc >= 9) { 51 | args->udcPath = argv[8]; 52 | } 53 | return 0; 54 | } 55 | 56 | int main(int argc, char **argv) { 57 | bv_args_t args; 58 | 59 | if (parseArgs(argc, argv, &args) != 0) { 60 | fprintf(stderr, "Usage: %s " 61 | " [doDupes=0] [udcPath=NULL]\n\n", 62 | argv[0]); 63 | return -1; 64 | } 65 | #ifdef ENABLE_UDC 66 | if (args.udcPath != NULL) { 67 | udc2SetDefaultDir(args.udcPath); 68 | } 69 | #endif 70 | 71 | int handle = 
halOpenHalOrLod(args.path, NULL); 72 | int ret = -1; 73 | if (handle >= 0) { 74 | // printStats(stdout, handle); 75 | hal_species_t qSpecies; 76 | qSpecies.name = args.qSpecies; 77 | qSpecies.next = NULL; 78 | 79 | long numBytes = halGetMaf(stdout, handle, &qSpecies, args.tSpecies, args.tChrom, args.tStart, args.tEnd, 80 | 0, // maxRefGap 81 | 0, // maxBlockLength 82 | args.doDupes, NULL); 83 | 84 | if (numBytes >= 0) { 85 | ret = 0; 86 | } 87 | printf("\nread %ld bytes\n", numBytes); 88 | } 89 | return ret; 90 | } 91 | -------------------------------------------------------------------------------- /blockViz/tests/expected/blockVizHdf5Tests.out: -------------------------------------------------------------------------------- 1 | chr:Genome_2_seq, tSt:0, qSt:0, size:3000, strand:+: tgt : GCTATCGGGG query: GCTATCGGGG 2 | chr:Genome_2_seq, tSt:3000, qSt:3000, size:113, strand:+: tgt : TAAAACGCTA query: TAAAACGCTA 3 | -------------------------------------------------------------------------------- /blockViz/tests/expected/blockVizMmapTests.out: -------------------------------------------------------------------------------- 1 | chr:Genome_2_seq, tSt:0, qSt:0, size:3000, strand:+: tgt : GCTATCGGGG query: GCTATCGGGG 2 | chr:Genome_2_seq, tSt:3000, qSt:3000, size:113, strand:+: tgt : TAAAACGCTA query: TAAAACGCTA 3 | -------------------------------------------------------------------------------- /blockViz/tests/timing.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | binSize=1000000 4 | reps=1000 5 | ref=reference 6 | tgts=EscherichiaColiWUid162011,EscherichiaColiKo11flUid162099,EscherichiaColiKo11flUid52593,ShigellaSonnei53gUid84383 7 | refChr=referencerefChr1 8 | len=9808498 9 | lodPath=http://hgwdev.cse.ucsc.edu/~nknguyen/ecoli/hub/pangenome-dups/lod.txt 10 | halPath=http://hgwdev.cse.ucsc.edu/~nknguyen/ecoli/hub/pangenome-dups/out.hal 11 | udcPath=/hive/users/hickey/udcTemp 12 | 
outPath=/hive/users/hickey/timing 13 | seed=2323 14 | python=/cluster/home/jcarmstr/Python-2.7/python 15 | 16 | ${python} ./blockVizBenchmark.py ${lodPath} ${ref} ${refChr} ${len} ${tgts} --udc ${udcPath} --reps ${reps} --binSize ${binSize} --seed ${seed} > ${outPath}/ecoli.csv 17 | 18 | ${python} ./blockVizBenchmark.py ${lodPath} ${ref} ${refChr} ${len} ${tgts} --udc ${udcPath} --reps ${reps} --binSize ${binSize} --zapUdc --seed ${seed} > ${outPath}/ecoli_noudc.csv 19 | 20 | ${python} ./blockVizBenchmark.py ${halPath} ${ref} ${refChr} ${len} ${tgts} --udc ${udcPath} --reps ${reps} --binSize ${binSize} --seed ${seed} > ${outPath}/ecoli_nolod.csv -------------------------------------------------------------------------------- /extra/dotplot/README.md: -------------------------------------------------------------------------------- 1 | # Dotplot script 2 | Get a dotplot relating two sequences in a HAL file. Requires the statistical language R. 3 | ## Usage 4 | ``` 5 | $ source PROGRESSIVE_CACTUS_DIR/environment 6 | $ ./runDotplot.py > dotplot.tsv 7 | $ ./plotDotplot.R dotplot.tsv output.pdf 8 | ``` 9 | 10 | ## Example 11 | ### Command line 12 | ``` 13 | $ source ~/progressiveCactus/environment 14 | $ ./runDotplot.py ~/progressiveCactus-phylogeny/mammals1/original.hal simHuman_chr6 simHuman.chr6 simMouse_chr6 simMouse.chr6 > t.tsv 15 | $ ./plotDotplot.R t.tsv out.pdf 16 | ``` 17 | ### Result 18 | ![dot plot](example.png) 19 | -------------------------------------------------------------------------------- /extra/dotplot/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/extra/dotplot/example.png -------------------------------------------------------------------------------- /extra/dotplot/plotDotplot.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 
args <- commandArgs(TRUE)
if (length(args) < 2) {
    print("Usage: plotDotplot.R inputDotplotTsv outputPdf")
    quit(status = 1)
}
inputFile <- args[[1]]
outputPdfFile <- args[[2]]
dots <- read.table(inputFile, header=T)
pdf(outputPdfFile)
print(plot(dots, type="l"))
dev.off()
--------------------------------------------------------------------------------
/extra/insertionStats/plotInsertionStats.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# Usage: plotInsertionStats.R insertionStatsTsv
# The input is a tab-separated table with a header row; the script reads the
# columns genome, insertionSize and maskedBases. Output names are formed by
# appending a suffix to the input filename.
# Produces: filename.pdf: density of insertion size in each genome
# filename-ecdf.pdf: cumulative % of insertions at or below X insertion size in each genome
# filename-seqFrac-density.pdf: density plot of insertion size, weighted by insertion size
# filename-seqFrac-hist.pdf: histogram of insertion size, weighted by insertion size (i.e. height of bar = fraction of total inserted sequence)
require(ggplot2)
require(data.table)

args <- commandArgs(TRUE)
# Fail with a usage message rather than a "subscript out of bounds" error.
if (length(args) < 1) {
    print("Usage: plotInsertionStats.R insertionStatsTsv")
    quit(status = 1)
}
insertionStatsFile <- args[[1]]

#ancestorNames <- list(Anc8="human-chimp", Anc7="human-rhesus", Anc5="euarchontoglires", Anc6="mouse-rat", Anc0="boreoeutherian", Anc1="laurasiatheria", Anc2="pig-cow-horse", Anc3="pig-cow", Anc4="dog-cat")

# Read the file named on the command line. The original always read the
# hard-coded "entirelyInsertedSequences.tsv" and used the argument only to
# name the output PDFs.
insertionStats <- read.table(insertionStatsFile, sep="\t", header=T)
insertionStats$maskedBasesFraction <- insertionStats$maskedBases / insertionStats$insertionSize

# Rename the 10-way ancestors
## insertionStats$genome = sapply(insertionStats$genome, function(x) {
## if (x %in% names(ancestorNames)) {
## ancestorNames[[as.character(x)]]
## } else {
## as.character(x)
## }})

# Shared base plot: insertion size on a log10 x axis, one facet per genome.
# Named sizePlot so it does not shadow base::plot.
sizePlot <- ggplot(insertionStats, aes(x=insertionSize)) + scale_x_log10() + facet_wrap(~ genome) + theme_bw()

pdf(paste(insertionStatsFile, ".pdf", sep=""), width=10, height=10)
print(sizePlot + geom_density() + ylab("Density"))
dev.off()

pdf(paste(insertionStatsFile, "-ecdf.pdf", sep=""), width=10, height=10)
print(sizePlot + stat_ecdf() + ylab("Cumulative fraction of insertions"))
dev.off()

pdf(paste(insertionStatsFile, "-seqFrac-density.pdf", sep=""), width=10, height=10)
# stolen from stackoverflow -- get per-genome total insertion size so we can weight correctly.
insertionStats.dt <- data.table(insertionStats)
insertionStats.dt[, totalInsertionSize.per.genome := sum(insertionSize), genome]
print(ggplot(insertionStats.dt, aes(x=insertionSize, weight=insertionSize/totalInsertionSize.per.genome)) + scale_x_log10() + facet_wrap(~ genome) + theme_bw() + geom_density() + ylab("Density weighted by insertion size"))
dev.off()

pdf(paste(insertionStatsFile, "-seqFrac-hist.pdf", sep=""), width=10, height=10)
print(ggplot(insertionStats.dt, aes(x=insertionSize, weight=insertionSize/totalInsertionSize.per.genome)) + scale_x_log10() + facet_wrap(~ genome) + theme_bw() + geom_histogram() + ylab("Fraction of total inserted bases in genome"))
dev.off()

--------------------------------------------------------------------------------
/extract/Makefile:
--------------------------------------------------------------------------------
1 | rootDir = .. 
2 | include ${rootDir}/include.mk 3 | modObjDir = ${objDir}/extract 4 | 5 | halExtract_srcs = impl/halExtract.cpp 6 | halExtract_objs = ${halExtract_srcs:%.cpp=${modObjDir}/%.o} 7 | halAlignedExtract_srcs = impl/halAlignedExtract.cpp 8 | halAlignedExtract_objs = ${halAlignedExtract_srcs:%.cpp=${modObjDir}/%.o} 9 | halMaskExtract_srcs = impl/halMaskExtractMain.cpp impl/halMaskExtractor.cpp 10 | halMaskExtract_objs = ${halMaskExtract_srcs:%.cpp=${modObjDir}/%.o} 11 | hal4dExtract_srcs = impl/hal4dExtractMain.cpp impl/hal4dExtract.cpp 12 | hal4dExtract_objs = ${hal4dExtract_srcs:%.cpp=${modObjDir}/%.o} 13 | halSingleCopyRegionsExtract_srcs = impl/halSingleCopyRegionsExtract.cpp 14 | halSingleCopyRegionsExtract_objs = ${halSingleCopyRegionsExtract_srcs:%.cpp=${modObjDir}/%.o} 15 | hal4dExtractTest_srcs = tests/hal4dExtractTest.cpp 16 | hal4dExtractTest_objs = ${hal4dExtractTest_srcs:%.cpp=${modObjDir}/%.o} ${modObjDir}/impl/hal4dExtract.o 17 | srcs = ${halExtract_srcs} ${halAlignedExtract_srcs} ${halMaskExtract_srcs} \ 18 | ${hal4dExtract_srcs} ${halSingleCopyRegionsExtract_srcs} ${hal4dExtractTest_srcs} 19 | objs = ${srcs:%.cpp=${modObjDir}/%.o} 20 | depends = ${srcs:%.cpp=%.depend} 21 | progs = ${binDir}/halStats ${binDir}/halCoverage 22 | inclSpec += -I${rootDir}/liftover/inc -I${halApiTestIncl} 23 | otherLibs += ${halApiTestSupportLibs} ${libHalLiftover} 24 | progs = ${binDir}/halExtract ${binDir}/halAlignedExtract ${binDir}/halMaskExtract \ 25 | ${binDir}/hal4dExtract ${binDir}/halSingleCopyRegionsExtract ${binDir}/hal4dExtractTest 26 | 27 | testTmpDir = output 28 | testHdf5Hal = ${testTmpDir}/small.haf5.hal 29 | testMmapHal = ${testTmpDir}/small.mmap.hal 30 | 31 | all: progs 32 | libs: 33 | progs: ${progs} 34 | 35 | clean : 36 | rm -f ${objs} ${progs} ${depends} 37 | rm -rf ${testTmpDir} 38 | 39 | test: hal4dExtractTest halExtactHdf5ToMmap halExtactMmapToHdf5 halExtactMmapV1.0 40 | 41 | hal4dExtractTest: 42 | ${binDir}/hal4dExtractTest 43 | 44 | 
halExtactHdf5ToMmap: ${testHdf5Hal} 45 | ${binDir}/halExtract --outputFormat mmap $< ${testTmpDir}/$@.mmap.hal 46 | 47 | halExtactMmapToHdf5: ${testMmapHal} 48 | ${binDir}/halExtract --outputFormat hdf5 $< ${testTmpDir}/$@.hdf5.hal 49 | 50 | # this tests reading V1.0 mmap files 51 | halExtactMmapV1.0: 52 | @mkdir -p $(dir $@) 53 | bzcat tests/input/small.mmap1.0.hal.bz2 > output/small.mmap1.0.hal 54 | ${binDir}/halExtract --outputFormat mmap output/small.mmap1.0.hal ${testTmpDir}/$@.mmap.hal 55 | 56 | 57 | randGenArgs = --preset small --seed 0 --minSegmentLength 3000 --maxSegmentLength 5000 58 | 59 | ${testHdf5Hal}: ${progs} ${binDir}/halRandGen 60 | @mkdir -p $(dir $@) 61 | ${binDir}/halRandGen ${randGenArgs} --format hdf5 $@ 62 | 63 | ${testMmapHal}: ${progs} ${binDir}/halRandGen 64 | @mkdir -p $(dir $@) 65 | ${binDir}/halRandGen ${randGenArgs} --format mmap $@ 66 | 67 | ${binDir}/halRandGen: 68 | cd ../randgen && ${MAKE} 69 | 70 | 71 | include ${rootDir}/rules.mk 72 | 73 | # don't fail on missing dependencies, they are first time the .o is generates 74 | -include ${depends} 75 | 76 | 77 | # Local Variables: 78 | # mode: makefile-gmake 79 | # End: 80 | 81 | -------------------------------------------------------------------------------- /extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/extract/__init__.py -------------------------------------------------------------------------------- /extract/impl/halMaskExtractMain.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #include "halMaskExtractor.h" 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 
14 | using namespace hal; 15 | 16 | int main(int argc, char **argv) { 17 | CLParser optionsParser; 18 | optionsParser.setDescription("Write masked intervals of genome into bed " 19 | "file"); 20 | optionsParser.addArgument("halFile", "path to hal file to analyze"); 21 | optionsParser.addArgument("genome", "name of genome to process"); 22 | optionsParser.addOption("maskFile", "path to bed file to write to", "stdout"); 23 | optionsParser.addOption("extend", "extend masked regions by given num. " 24 | "of bases.", 25 | 0); 26 | optionsParser.addOption("extendPct", "extend masked regions by percentage" 27 | " of their lengths", 28 | 0); 29 | 30 | string halPath; 31 | string genomeName; 32 | string bedPath; 33 | hal_size_t extend; 34 | double extendPct; 35 | try { 36 | optionsParser.parseOptions(argc, argv); 37 | halPath = optionsParser.getArgument("halFile"); 38 | genomeName = optionsParser.getArgument("genome"); 39 | bedPath = optionsParser.getOption("maskFile"); 40 | extend = optionsParser.getOption("extend"); 41 | extendPct = optionsParser.getOption("extendPct"); 42 | 43 | if (extend != 0 && extendPct != 0.) 
{ 44 | throw hal_exception("--extend and --extendPct options are exclusive."); 45 | } 46 | } catch (exception &e) { 47 | cerr << e.what() << endl; 48 | optionsParser.printUsage(cerr); 49 | exit(1); 50 | } 51 | try { 52 | AlignmentConstPtr alignment(openHalAlignment(halPath, &optionsParser)); 53 | 54 | const Genome *genome = alignment->openGenome(genomeName); 55 | if (genome == NULL) { 56 | throw hal_exception(string("Genome ") + genomeName + " not found."); 57 | } 58 | 59 | ostream *bedStream = &cout; 60 | bool newBed = false; 61 | if (bedPath != "stdout") { 62 | bedStream = new ofstream(bedPath.c_str()); 63 | newBed = true; 64 | } 65 | if (!bedStream) { 66 | throw hal_exception(string("Error opening ") + bedPath + " for writing"); 67 | } 68 | 69 | MaskExtractor mask; 70 | mask.extract(alignment, genome, bedStream, extend, extendPct); 71 | 72 | if (newBed) { 73 | delete bedStream; 74 | } 75 | } catch (hal_exception &e) { 76 | cerr << "hal exception caught: " << e.what() << endl; 77 | return 1; 78 | } catch (exception &e) { 79 | cerr << "Exception caught: " << e.what() << endl; 80 | return 1; 81 | } 82 | 83 | return 0; 84 | } 85 | -------------------------------------------------------------------------------- /extract/inc/hal4dExtract.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALEXTRACT4D_H 9 | #define _HALEXTRACT4D_H 10 | 11 | #include "halBedScanner.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace hal { 19 | 20 | /** Use the halBedScanner to parse a bed file, extracting 4d codons from 21 | * each position */ 22 | class Extract4d : public BedScanner { 23 | public: 24 | Extract4d(); 25 | virtual ~Extract4d(); 26 | 27 | void run(const Genome *refGenome, std::istream 
*inBedStream, std::ostream *outBedStream, bool conserved = false); 28 | 29 | static const char CodonPrefixTable[2][8]; 30 | 31 | protected: 32 | virtual void visitLine(); 33 | 34 | void extractBlocks4d(bool conserved); 35 | void write(); 36 | 37 | protected: 38 | std::ostream *_outBedStream; 39 | const Genome *_refGenome; 40 | const Sequence *_refSequence; 41 | std::deque _outBedLines; 42 | bool _conserved; 43 | }; 44 | } 45 | 46 | #endif 47 | // Local Variables: 48 | // mode: c++ 49 | // End: 50 | -------------------------------------------------------------------------------- /extract/inc/halMaskExtractor.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALMASKEXTRACTOR_H 9 | #define _HALMASKEXTRACTOR_H 10 | 11 | #include "hal.h" 12 | #include 13 | #include 14 | #include 15 | 16 | namespace hal { 17 | 18 | class MaskExtractor { 19 | public: 20 | MaskExtractor(); 21 | virtual ~MaskExtractor(); 22 | 23 | void extract(AlignmentConstPtr alignment, const Genome *genome, std::ostream *bedStream, hal_size_t extend, 24 | double extendPct); 25 | 26 | protected: 27 | void addMaskedBasesToCache(); 28 | void extendCachedIntervals(); 29 | void writeCachedIntervals(); 30 | 31 | protected: 32 | AlignmentConstPtr _alignment; 33 | const Genome *_genome; 34 | const Sequence *_sequence; 35 | std::ostream *_bedStream; 36 | hal_size_t _extend; 37 | double _extendPct; 38 | PositionCache _posCache; 39 | }; 40 | } 41 | 42 | #endif 43 | // Local Variables: 44 | // mode: c++ 45 | // End: 46 | -------------------------------------------------------------------------------- /extract/tests/input/small.mmap1.0.hal.bz2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/extract/tests/input/small.mmap1.0.hal.bz2 -------------------------------------------------------------------------------- /fasta/Makefile: -------------------------------------------------------------------------------- 1 | rootDir = .. 2 | include ${rootDir}/include.mk 3 | modObjDir = ${objDir}/fasta 4 | 5 | hal2fasta_srcs = hal2fasta.cpp 6 | hal2fasta_objs = ${hal2fasta_srcs:%.cpp=${modObjDir}/%.o} 7 | srcs = ${hal2fasta_srcs} 8 | objs = ${srcs:%.cpp=${modObjDir}/%.o} 9 | depends = ${srcs:%.cpp=%.depend} 10 | progs = ${binDir}/hal2fasta 11 | 12 | all: progs 13 | libs: 14 | progs: ${progs} 15 | 16 | clean: 17 | rm -f ${objs} ${progs} ${depends} 18 | test: 19 | 20 | include ${rootDir}/rules.mk 21 | 22 | # don't fail on missing dependencies, they are first time the .o is generates 23 | -include ${depends} 24 | 25 | 26 | # Local Variables: 27 | # mode: makefile-gmake 28 | # End: 29 | 30 | -------------------------------------------------------------------------------- /liftover/impl/halBedScanner.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "halBedScanner.h" 15 | 16 | using namespace std; 17 | using namespace hal; 18 | 19 | BedScanner::BedScanner() : _bedStream(NULL) { 20 | } 21 | 22 | BedScanner::~BedScanner() { 23 | } 24 | 25 | void BedScanner::scan(const string &bedPath, int bedType) { 26 | assert(_bedStream == NULL); 27 | _bedStream = new ifstream(bedPath.c_str()); 28 | try { 29 | scan(_bedStream, bedType); 30 | } catch (hal_exception &e) { 31 | delete _bedStream; 32 | _bedStream = NULL; 33 | throw 
hal_exception(string(e.what()) + " in file " + bedPath); 34 | } 35 | 36 | delete _bedStream; 37 | _bedStream = NULL; 38 | } 39 | 40 | void BedScanner::scan(istream *is, int bedType) { 41 | visitBegin(); 42 | _bedStream = is; 43 | if (_bedStream->bad()) { 44 | throw hal_exception("Error reading bed input stream"); 45 | } 46 | string lineBuffer; 47 | _lineNumber = 0; 48 | try { 49 | skipWhiteSpaces(_bedStream); 50 | while (_bedStream->good()) { 51 | ++_lineNumber; 52 | _bedLine.read(*_bedStream, lineBuffer, bedType); 53 | visitLine(); 54 | skipWhiteSpaces(_bedStream); 55 | } 56 | } catch (hal_exception &e) { 57 | throw hal_exception(string(e.what()) + " in input bed line " + std::to_string(_lineNumber)); 58 | } 59 | visitEOF(); 60 | _bedStream = NULL; 61 | } 62 | 63 | size_t BedScanner::getNumColumns(const string &bedLine) { 64 | return chopString(bedLine, "\t").size(); 65 | } 66 | void BedScanner::visitBegin() { 67 | } 68 | 69 | void BedScanner::visitLine() { 70 | } 71 | 72 | void BedScanner::visitEOF() { 73 | } 74 | 75 | void BedScanner::skipWhiteSpaces(istream *bedStream) { 76 | while (bedStream->good() && std::isspace((char)bedStream->peek())) { 77 | bedStream->get(); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /liftover/impl/halWiggleLoader.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #include "halWiggleLoader.h" 9 | #include "halWiggleLiftover.h" 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | using namespace hal; 15 | 16 | WiggleLoader::WiggleLoader() { 17 | } 18 | 19 | WiggleLoader::~WiggleLoader() { 20 | } 21 | 22 | void WiggleLoader::load(AlignmentConstPtr alignment, const Genome *genome, istream *inputFile, WiggleTiles *vals) { 23 | 
_alignment = AlignmentConstPtr(alignment); 24 | _srcGenome = genome; 25 | _srcSequence = NULL; 26 | _vals = vals; 27 | scan(inputFile); 28 | } 29 | 30 | void WiggleLoader::visitHeader() { 31 | _srcSequence = _srcGenome->getSequence(_sequenceName); 32 | if (_srcSequence == NULL) { 33 | throw hal_exception("Sequence " + _sequenceName + " not found in genome " + _srcGenome->getName()); 34 | } 35 | } 36 | 37 | void WiggleLoader::visitLine() { 38 | if (_srcSequence == NULL) { 39 | throw hal_exception("Missing Wig header"); 40 | } 41 | 42 | hal_index_t absFirst = _first + _srcSequence->getStartPosition(); 43 | hal_index_t absLast = _last + _srcSequence->getStartPosition(); 44 | 45 | for (hal_index_t absPos = absFirst; absPos <= absLast; ++absPos) { 46 | _vals->set(absPos, _value); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /liftover/inc/halBedScanner.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALBEDSCANNER_H 9 | #define _HALBEDSCANNER_H 10 | 11 | #include "hal.h" 12 | #include "halBedLine.h" 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace hal { 19 | 20 | /** Parse a BED file line by line 21 | * written independently from the bed export, and it's too much of a 22 | * bother to reuse any of that code. 
*/ 23 | class BedScanner { 24 | public: 25 | BedScanner(); 26 | virtual ~BedScanner(); 27 | virtual void scan(const std::string &bedPath, int bedType=0); 28 | virtual void scan(std::istream *bedStream, int bedType=0); 29 | 30 | static size_t getNumColumns(const std::string &bedLine); 31 | 32 | protected: 33 | virtual void visitBegin(); 34 | virtual void visitLine(); 35 | virtual void visitEOF(); 36 | 37 | static void skipWhiteSpaces(std::istream *bedStream); 38 | 39 | protected: 40 | std::istream *_bedStream; 41 | BedLine _bedLine; 42 | hal_size_t _lineNumber; 43 | }; 44 | } 45 | 46 | #endif 47 | // Local Variables: 48 | // mode: c++ 49 | // End: 50 | -------------------------------------------------------------------------------- /liftover/inc/halBlockLiftover.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALBLOCKLIFTOVER_H 9 | #define _HALBLOCKLIFTOVER_H 10 | 11 | #include "halLiftover.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace hal { 18 | 19 | class BlockLiftover : public Liftover { 20 | public: 21 | BlockLiftover(); 22 | virtual ~BlockLiftover(); 23 | 24 | protected: 25 | void liftInterval(BedList &mappedBedLines); 26 | void visitBegin(); 27 | 28 | void cleanTargetParalogies(); 29 | void readPSLInfo(std::vector &fragments, BedLine &outBedLine); 30 | 31 | protected: 32 | MappedSegmentSet _mappedSegments; 33 | SegmentIteratorPtr _refSeg; 34 | hal_index_t _lastIndex; 35 | std::set _downwardPath; 36 | const Genome *_mrca; 37 | }; 38 | } 39 | #endif 40 | // Local Variables: 41 | // mode: c++ 42 | // End: 43 | -------------------------------------------------------------------------------- /liftover/inc/halColumnLiftover.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALCOLUMNLIFTOVER_H 9 | #define _HALCOLUMNLIFTOVER_H 10 | 11 | #include "halLiftover.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace hal { 18 | 19 | class ColumnLiftover : public Liftover { 20 | public: 21 | ColumnLiftover(); 22 | virtual ~ColumnLiftover(); 23 | 24 | protected: 25 | void liftInterval(BedList &mappedBedLines); 26 | 27 | typedef ColumnIterator::DNASet DNASet; 28 | typedef ColumnIterator::ColumnMap ColumnMap; 29 | typedef PositionCache::IntervalSet IntervalSet; 30 | 31 | typedef std::pair SeqIndex; 32 | typedef std::map PositionMap; 33 | 34 | protected: 35 | ColumnIteratorPtr _colIt; 36 | std::set _missedSet; 37 | bool _outParalogy; 38 | }; 39 | } 40 | #endif 41 | // Local Variables: 42 | // mode: c++ 43 | // End: 44 | -------------------------------------------------------------------------------- /liftover/inc/halLiftover.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALLIFTOVER_H 9 | #define _HALLIFTOVER_H 10 | 11 | #include "halBedScanner.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace hal { 19 | 20 | class Liftover : public BedScanner { 21 | public: 22 | Liftover(); 23 | virtual ~Liftover(); 24 | 25 | void convert(AlignmentConstPtr alignment, const Genome *srcGenome, std::istream *inputFile, const Genome *tgtGenome, 26 | std::ostream *outputFile, int bedType = 0, 27 | bool traverseDupes = true, bool outPSL = false, bool 
outPSLWithName = false, 28 | const Genome *coalescenceLimit = NULL); 29 | 30 | protected: 31 | typedef std::list BedList; 32 | 33 | virtual void visitBegin(); 34 | virtual void visitLine(); 35 | virtual void visitEOF(); 36 | virtual void writeLineResults(); 37 | virtual void assignBlocksToIntervals(); 38 | virtual bool compatible(const BedLine &tgtBed, const BedLine &newBlock); 39 | virtual void flipBlocks(BedList &bedList); 40 | virtual void computePSLInserts(BedList &bedList); 41 | virtual void writeBlocksAsIntervals(); 42 | virtual void cleanResults(); 43 | virtual void liftBlockIntervals(); 44 | virtual void liftInterval(BedList &mappedBedLines) = 0; 45 | 46 | protected: 47 | AlignmentConstPtr _alignment; 48 | std::ostream *_outBedStream; 49 | bool _bedType; 50 | bool _traverseDupes; 51 | BedList _outBedLines; 52 | bool _outPSL; 53 | bool _outPSLWithName; 54 | 55 | BedList _mappedBlocks; 56 | 57 | const Genome *_srcGenome; 58 | const Genome *_tgtGenome; 59 | const Genome *_coalescenceLimit; 60 | const Sequence *_srcSequence; 61 | std::set _tgtSet; 62 | 63 | ColumnIteratorPtr _colIt; 64 | std::set _missedSet; 65 | }; 66 | } 67 | #endif 68 | // Local Variables: 69 | // mode: c++ 70 | // End: 71 | -------------------------------------------------------------------------------- /liftover/inc/halWiggleLiftover.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALWIGGLELIFTOVER_H 9 | #define _HALWIGGLELIFTOVER_H 10 | 11 | #include "halWiggleScanner.h" 12 | #include "halWiggleTiles.h" 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace hal { 19 | 20 | class WiggleLiftover : public WiggleScanner { 21 | public: 22 | WiggleLiftover(); 23 | virtual ~WiggleLiftover(); 24 | 25 | void 
preloadOutput(AlignmentConstPtr alignment, const Genome *tgtGenome, std::istream *inputFile); 26 | 27 | void convert(AlignmentConstPtr alignment, const Genome *srcGenome, std::istream *inputFile, const Genome *tgtGenome, 28 | std::ostream *outputFile, bool traverseDupes = true, bool unique = false); 29 | 30 | static const double DefaultValue; 31 | static const hal_size_t DefaultTileSize; 32 | 33 | protected: 34 | virtual void visitLine(); 35 | virtual void visitHeader(); 36 | virtual void visitEOF(); 37 | 38 | void mapSegment(); 39 | void mapFragments(std::vector &fragments); 40 | void write(); 41 | 42 | protected: 43 | struct CoordVal { 44 | hal_index_t _first; 45 | hal_index_t _last; 46 | double _val; 47 | }; 48 | typedef std::vector ValVec; 49 | 50 | AlignmentConstPtr _alignment; 51 | std::istream *_inStream; 52 | std::ostream *_outStream; 53 | bool _traverseDupes; 54 | bool _unique; 55 | 56 | const Genome *_srcGenome; 57 | const Genome *_tgtGenome; 58 | const Sequence *_srcSequence; 59 | std::set _tgtSet; 60 | MappedSegmentSet _mappedSegments; 61 | hal_index_t _lastIndex; 62 | 63 | SegmentIteratorPtr _segment; 64 | ValVec _cvals; 65 | WiggleTiles _outVals; 66 | hal_index_t _cvIdx; 67 | }; 68 | } 69 | #endif 70 | // Local Variables: 71 | // mode: c++ 72 | // End: 73 | -------------------------------------------------------------------------------- /liftover/inc/halWiggleLoader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALWIGGLELOADER_H 9 | #define _HALWIGGLELOADER_H 10 | 11 | #include "halWiggleScanner.h" 12 | #include "halWiggleTiles.h" 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace hal { 19 | 20 | /** Quick hack to load a wiggle into memory, in order to have wiggleLiftover 21 
| * --append option work better. Ideally would be base class of WiggleLiftover 22 | * but don't have time to refactor right now */ 23 | class WiggleLoader : public WiggleScanner { 24 | public: 25 | WiggleLoader(); 26 | virtual ~WiggleLoader(); 27 | 28 | void load(AlignmentConstPtr alignment, const Genome *genome, std::istream *inputFile, WiggleTiles *vals); 29 | 30 | protected: 31 | virtual void visitLine(); 32 | virtual void visitHeader(); 33 | 34 | AlignmentConstPtr _alignment; 35 | const Genome *_srcGenome; 36 | const Sequence *_srcSequence; 37 | WiggleTiles *_vals; 38 | }; 39 | } 40 | #endif 41 | // Local Variables: 42 | // mode: c++ 43 | // End: 44 | -------------------------------------------------------------------------------- /liftover/inc/halWiggleScanner.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALWIGGLESCANNER_H 9 | #define _HALWIGGLESCANNER_H 10 | 11 | #include "hal.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace hal { 19 | 20 | /** Parse a WIGGLE file line by line */ 21 | class WiggleScanner { 22 | public: 23 | WiggleScanner(); 24 | virtual ~WiggleScanner(); 25 | virtual void scan(const std::string &wigglePath); 26 | virtual void scan(std::istream *wiggleStream); 27 | 28 | protected: 29 | virtual void visitBegin(); 30 | virtual void visitLine(); 31 | virtual void visitHeader(); 32 | virtual void visitEOF(); 33 | 34 | virtual bool scanHeader(const std::string &lineBuffer); 35 | virtual void scanLine(const std::string &lineBuffer); 36 | static void skipWhiteSpaces(std::istream *wiggleStream); 37 | 38 | protected: 39 | std::istream *_wiggleStream; 40 | double _value; 41 | hal_index_t _first; 42 | hal_index_t _last; 43 | 44 | // header information 45 | 
std::string _sequenceName; 46 | hal_index_t _start; 47 | hal_index_t _step; 48 | hal_index_t _span; 49 | bool _fixedStep; 50 | 51 | hal_index_t _lineNumber; 52 | std::string _buffer; 53 | hal_index_t _offset; 54 | }; 55 | } 56 | 57 | #endif 58 | // Local Variables: 59 | // mode: c++ 60 | // End: 61 | -------------------------------------------------------------------------------- /liftover/tests/expected/halLiftoverBed12ExtraTest.bed: -------------------------------------------------------------------------------- 1 | Genome_2_seq 0 128 region1 100 + 0 128 0,0,0 1 128 0 Fred Betty 2 | Genome_2_seq 128 256 region2 100 + 128 256 0,0,0 1 128 0 Pebbles Dino 3 | Genome_2_seq 256 3314 region3 100 + 256 3314 0,0,0 3 128,91,91 0,1795,2967 Barney Betty 4 | -------------------------------------------------------------------------------- /liftover/tests/expected/halLiftoverBed12Test.bed: -------------------------------------------------------------------------------- 1 | Genome_2_seq 0 128 region1 100 + 0 128 0,0,0 1 128 0 2 | -------------------------------------------------------------------------------- /liftover/tests/expected/halLiftoverBed3Test.bed: -------------------------------------------------------------------------------- 1 | Genome_2_seq 0 128 2 | -------------------------------------------------------------------------------- /liftover/tests/expected/halLiftoverBed4ExtraTest.bed: -------------------------------------------------------------------------------- 1 | Genome_2_seq 0 128 region1 Fred Wilma 2 | Genome_2_seq 128 256 region2 Pebbles Dino 3 | Genome_2_seq 256 512 region3 Barney Betty 4 | Genome_2_seq 2051 2270 region3 Barney Betty 5 | Genome_2_seq 3223 3442 region3 Barney Betty 6 | -------------------------------------------------------------------------------- /liftover/tests/expected/halLiftoverPsl12Test.psl: -------------------------------------------------------------------------------- 1 | 128 0 0 0 0 0 0 0 ++ Genome_0_seq 1758 0 128 Genome_2_seq 
4270 0 128 1 128, 0, 0, 2 | -------------------------------------------------------------------------------- /liftover/tests/expected/halLiftoverPsl3Test.psl: -------------------------------------------------------------------------------- 1 | 128 0 0 0 0 0 0 0 ++ Genome_0_seq 1758 0 128 Genome_2_seq 4270 0 128 1 128, 0, 0, 2 | -------------------------------------------------------------------------------- /liftover/tests/halLiftoverTests.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALLIFTOVERTESTS_H 9 | #define _HALLIFTOVERTESTS_H 10 | 11 | #include "hal.h" 12 | #include "halApiTestSupport.h" 13 | 14 | extern "C" { 15 | #include "CuTest.h" 16 | } 17 | 18 | using namespace hal; 19 | 20 | struct BedLiftoverTest : public AlignmentTest { 21 | private: 22 | void liftAndCheck(AlignmentConstPtr alignment, 23 | const Genome *srcGenome, 24 | const Genome *tgtGenome, 25 | const std::string& inBed, 26 | const std::string& expectBed, 27 | bool outPSL = false, bool outPSLWithName = false); 28 | public: 29 | void createCallBack(AlignmentPtr alignment); 30 | void checkCallBack(AlignmentConstPtr alignment); 31 | void testOneBranchLifts(AlignmentConstPtr alignment); 32 | void testMultiBranchLifts(AlignmentConstPtr alignment); 33 | }; 34 | 35 | struct WiggleLiftoverTest : public AlignmentTest { 36 | void createCallBack(AlignmentPtr alignment); 37 | void checkCallBack(AlignmentConstPtr alignment); 38 | void testOneBranchLifts(AlignmentConstPtr alignment); 39 | void testMultiBranchLifts(AlignmentConstPtr alignment); 40 | }; 41 | 42 | CuSuite *halLiftoverTestSuite(); 43 | 44 | #endif 45 | // Local Variables: 46 | // mode: c++ 47 | // End: 48 | -------------------------------------------------------------------------------- 
/liftover/tests/input/test1.bed12: -------------------------------------------------------------------------------- 1 | Genome_0_seq 0 128 region1 100 + 0 128 0,0,0 1 128, 0, 2 | -------------------------------------------------------------------------------- /liftover/tests/input/test1.bed12+2: -------------------------------------------------------------------------------- 1 | Genome_0_seq 0 128 region1 100 + 0 128 0,0,0 1 128, 0, Fred Betty 2 | Genome_0_seq 128 256 region2 100 + 0 128 0,0,0 1 128, 0, Pebbles Dino 3 | Genome_0_seq 256 512 region3 100 + 0 128 0,0,0 1 128, 0, Barney Betty 4 | -------------------------------------------------------------------------------- /liftover/tests/input/test1.bed3: -------------------------------------------------------------------------------- 1 | Genome_0_seq 0 128 2 | -------------------------------------------------------------------------------- /liftover/tests/input/test1.bed4+2: -------------------------------------------------------------------------------- 1 | Genome_0_seq 0 128 region1 Fred Wilma 2 | Genome_0_seq 128 256 region2 Pebbles Dino 3 | Genome_0_seq 256 512 region3 Barney Betty 4 | -------------------------------------------------------------------------------- /lod/Makefile: -------------------------------------------------------------------------------- 1 | rootDir = .. 
2 | include ${rootDir}/include.mk 3 | modObjDir = ${objDir}/lod 4 | 5 | libHalLod_srcs = impl/halLodBlock.cpp impl/halLodExtract.cpp impl/halLodGraph.cpp \ 6 | impl/halLodManager.cpp impl/halLodSegment.cpp 7 | libHalLod_objs = ${libHalLod_srcs:%.cpp=${modObjDir}/%.o} 8 | halLodExtract_srcs =impl/halLodExtractMain.cpp 9 | halLodExtract_objs = ${halLodExtract_srcs:%.cpp=${modObjDir}/%.o} 10 | srcs = ${libHalLod_srcs} ${halLodExtract_srcs} 11 | objs = ${srcs:%.cpp=${modObjDir}/%.o} 12 | depends = ${srcs:%.cpp=%.depend} 13 | pyprogs = ${binDir}/halLodInterpolate.py 14 | progs = ${binDir}/halLodExtract ${pyprogs} 15 | otherLibs = ${libHalLod} 16 | 17 | all : libs progs 18 | libs: ${libHalLod} 19 | progs: ${progs} 20 | 21 | clean : 22 | rm -f ${libHalLod} ${objs} ${progs} ${depends} 23 | test: 24 | 25 | ${binDir}/%.py: %.py 26 | @mkdir -p $(dir $@) 27 | cp -f $< $@ 28 | chmod a+x,-w $@ 29 | 30 | include ${rootDir}/rules.mk 31 | 32 | # don't fail on missing dependencies, they are first time the .o is generates 33 | -include ${depends} 34 | 35 | 36 | # Local Variables: 37 | # mode: makefile-gmake 38 | # End: 39 | 40 | -------------------------------------------------------------------------------- /lod/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/lod/__init__.py -------------------------------------------------------------------------------- /lod/inc/halLodManager.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALLODMANAGER_H 9 | #define _HALLODMANAGER_H 10 | 11 | #include "hal.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace 
hal { 19 | class CLParser; 20 | 21 | /** This is a container that keeps track of LOD alignments as generated 22 | * by halLodExtract.py 23 | */ 24 | class LodManager { 25 | public: 26 | LodManager(); 27 | virtual ~LodManager(); 28 | 29 | /** Load series of alignments specified in the lodPath file. Options 30 | * from the given CLParser are applied if specified. 31 | * 32 | * If the paths of the HAL files are relative (do not begin with /) then 33 | * they will be concatenated to the directory of lodPath. If they 34 | * are absolute (beginning with /) then they will be opened directly. 35 | * Paths that contain ":/" are assumed to be 36 | * web addressed of some sort and considered absolute. */ 37 | void loadLODFile(const std::string &lodPath, const CLParser *options = NULL); 38 | 39 | /** Just use the given HAL file for everything. Same as if we gave a 40 | * lodFile containing only "0 halPath"*/ 41 | void loadSingeHALFile(const std::string &halPath, const CLParser *options = NULL); 42 | 43 | AlignmentConstPtr getAlignment(hal_size_t queryLength, bool needDNA); 44 | 45 | /** Check if query length corresponds to LOD 0 (ie original HAL) */ 46 | bool isLod0(hal_size_t queryLenth) const; 47 | 48 | /** Any query greater than this is disabled */ 49 | hal_size_t getMaxQueryLength() const; 50 | 51 | /** Maximum age of a URL in seconds such that we dont try to 52 | * preload headers for all the HAL files */ 53 | static const unsigned long MaxAgeSec; 54 | 55 | /** Token that specifies upper limit for LODs, that sits in path field */ 56 | static const std::string MaxLodToken; 57 | 58 | private: 59 | std::string resolvePath(const std::string &lodPath, const std::string &halPath); 60 | void checkMap(const std::string &lodPath); 61 | void checkAlignment(hal_size_t minQuery, const std::string &path, AlignmentConstPtr alignment); 62 | void preloadAlignments(); 63 | 64 | typedef std::pair PathAlign; 65 | typedef std::map AlignmentMap; 66 | 67 | const CLParser *_options; 68 | 
AlignmentMap _map; 69 | hal_size_t _maxLodLowerBound; 70 | }; 71 | 72 | inline hal_size_t LodManager::getMaxQueryLength() const { 73 | return _maxLodLowerBound - 1; 74 | } 75 | 76 | HAL_FORWARD_DEC_CLASS(LodManager) 77 | } 78 | 79 | #endif 80 | // Local Variables: 81 | // mode: c++ 82 | // End: 83 | -------------------------------------------------------------------------------- /maf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/maf/__init__.py -------------------------------------------------------------------------------- /maf/impl/halMafBed.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #include "halMafBed.h" 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | using namespace hal; 14 | 15 | MafBed::MafBed(std::ostream &mafStream, AlignmentConstPtr alignment, const Genome *refGenome, 16 | std::set &targetSet, MafExport &mafExport) 17 | : BedScanner(), _mafStream(mafStream), _alignment(alignment), _refGenome(refGenome), _targetSet(targetSet), 18 | _mafExport(mafExport) { 19 | } 20 | 21 | MafBed::~MafBed() { 22 | } 23 | 24 | void MafBed::visitLine() { 25 | const Sequence *refSequence = _refGenome->getSequence(_bedLine._chrName); 26 | if (refSequence == NULL) { 27 | cerr << "Line " << _lineNumber << ": BED sequence " << _bedLine._chrName << " not found in genome " 28 | << _refGenome->getName() << '\n'; 29 | return; 30 | } 31 | if (_bedLine._bedType <= 9) { 32 | if (_bedLine._end <= _bedLine._start || _bedLine._end > (hal_index_t)refSequence->getSequenceLength()) { 33 | cerr << "Line " << _lineNumber << ": BED coordinates invalid\n"; 34 | } else { 35 | 
hal_index_t start = _bedLine._start; 36 | hal_index_t end = _bedLine._end; 37 | _mafExport.convertSequence(_mafStream, _alignment, refSequence, start, end - start, _targetSet); 38 | } 39 | } else { 40 | for (size_t i = 0; i < _bedLine._blocks.size(); ++i) { 41 | if (_bedLine._blocks[i]._length == 0 || 42 | _bedLine._start + _bedLine._blocks[i]._start + _bedLine._blocks[i]._length >= 43 | (hal_index_t)refSequence->getSequenceLength()) { 44 | cerr << "Line " << _lineNumber << ", block " << i << ": BED coordinates invalid\n"; 45 | } else { 46 | hal_index_t start = _bedLine._start + _bedLine._blocks[i]._start; 47 | hal_index_t end = _bedLine._start + _bedLine._blocks[i]._start + _bedLine._blocks[i]._length; 48 | _mafExport.convertSequence(_mafStream, _alignment, refSequence, start, end - start, _targetSet); 49 | } 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /maf/impl/halMafScanReference.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | #include "halMafScanReference.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | using namespace hal; 15 | 16 | MafScanReference::MafScanReference() : MafScanner() { 17 | } 18 | 19 | MafScanReference::~MafScanReference() { 20 | } 21 | 22 | std::string MafScanReference::getRefName(const std::string &mafPath) { 23 | MafScanner::scan(mafPath, set()); 24 | return _name; 25 | } 26 | 27 | void MafScanReference::aLine() { 28 | } 29 | 30 | void MafScanReference::sLine() { 31 | Row &row = _block[_rows - 1]; 32 | // this is the first pass. 
so we do a quick sanity check 33 | if (row._sequenceName.find('.') == string::npos || row._sequenceName.find('.') == 0) { 34 | throw hal_exception("illegal sequence name found: " + row._sequenceName + 35 | ". Sequence names must be in genomeName.sequenceName format."); 36 | } 37 | 38 | _name = genomeName(row._sequenceName); 39 | _mafFile.seekg(0, ios_base::end); 40 | } 41 | 42 | void MafScanReference::end() { 43 | } 44 | -------------------------------------------------------------------------------- /maf/inc/halMafBed.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALMAFBED_H 9 | #define _HALMAFBED_H 10 | 11 | #include "halBedScanner.h" 12 | #include "halMafExport.h" 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace hal { 19 | 20 | /** Use the halBedScanner to parse a bed file, running mafExport on each 21 | * line */ 22 | class MafBed : public BedScanner { 23 | public: 24 | MafBed(std::ostream &mafStream, AlignmentConstPtr alignment, const Genome *refGenome, 25 | std::set &targetSet, MafExport &mafExport); 26 | virtual ~MafBed(); 27 | 28 | void run(std::istream *bedStream); 29 | 30 | protected: 31 | virtual void visitLine(); 32 | 33 | protected: 34 | std::ostream &_mafStream; 35 | AlignmentConstPtr _alignment; 36 | const Genome *_refGenome; 37 | const Sequence *_refSequence; 38 | hal_index_t _refStart; 39 | hal_size_t _refLength; 40 | std::set &_targetSet; 41 | MafExport &_mafExport; 42 | }; 43 | } 44 | 45 | #endif 46 | // Local Variables: 47 | // mode: c++ 48 | // End: 49 | -------------------------------------------------------------------------------- /maf/inc/halMafExport.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 
2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALMAFEXPORT_H 9 | #define _HALMAFEXPORT_H 10 | 11 | #include "halMafBlock.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace hal { 18 | 19 | class MafExport { 20 | public: 21 | MafExport(): 22 | _mafStream(NULL), _maxRefGap(0), _noDupes(false), _noAncestors(false), 23 | _ucscNames(false), _unique(false), _append(false), _printTree(false), 24 | _onlyOrthologs(false), _keepEmptyRefBlocks(false) { 25 | } 26 | 27 | virtual ~MafExport() { 28 | } 29 | 30 | void convertSequence(std::ostream &mafStream, AlignmentConstPtr alignment, const Sequence *seq, 31 | hal_index_t startPosition, hal_size_t length, const std::set &targets); 32 | 33 | // Convert all columns in the leaf genomes to MAF. Each column is 34 | // reported exactly once regardless of the unique setting, although 35 | // this may change in the future. Likewise, maxRefGap has no 36 | // effect, although noDupes will work. 
37 | void convertEntireAlignment(std::ostream &mafStream, AlignmentConstPtr alignment); 38 | 39 | void setMaxRefGap(hal_size_t maxRefGap) { 40 | _maxRefGap = maxRefGap; 41 | } 42 | void setNoDupes(bool noDupes) { 43 | _noDupes = noDupes; 44 | } 45 | void setNoAncestors(bool noAncestors) { 46 | _noAncestors = noAncestors; 47 | } 48 | void setUcscNames(bool ucscNames) { 49 | _ucscNames = ucscNames; 50 | } 51 | void setUnique(bool unique) { 52 | _unique = unique; 53 | } 54 | void setAppend(bool append) { 55 | _append = append; 56 | } 57 | void setMaxBlockLength(hal_index_t maxLength) { 58 | _mafBlock.setMaxLength(maxLength); 59 | } 60 | void setPrintTree(bool printTree) { 61 | _printTree = printTree; 62 | } 63 | void setOnlyOrthologs(bool onlyOrthologs) { 64 | _onlyOrthologs = onlyOrthologs; 65 | } 66 | void setKeepEmptyRefBlocks(bool keepEmptyRefBlocks) { 67 | _keepEmptyRefBlocks = keepEmptyRefBlocks; 68 | } 69 | 70 | protected: 71 | void writeHeader(); 72 | 73 | protected: 74 | AlignmentConstPtr _alignment; 75 | std::ostream *_mafStream; 76 | MafBlock _mafBlock; 77 | hal_size_t _maxRefGap; 78 | bool _noDupes; 79 | bool _noAncestors; 80 | bool _ucscNames; 81 | bool _unique; 82 | bool _append; 83 | bool _printTree; 84 | bool _onlyOrthologs; 85 | bool _keepEmptyRefBlocks; 86 | }; 87 | } 88 | 89 | #endif 90 | // Local Variables: 91 | // mode: c++ 92 | // End: 93 | -------------------------------------------------------------------------------- /maf/inc/halMafScanDimensions.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALMAFSCANDIMENSIONS_H 9 | #define _HALMAFSCANDIMENSIONS_H 10 | 11 | #include "halMafScanner.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace hal { 19 | 20 | 
/** Parse a MAF file line by line, getting some dimension stats 21 | * and maybe checking for some errros. */ 22 | class MafScanDimensions : public MafScanner { 23 | public: 24 | // map start position to array index 25 | struct ArrayInfo { 26 | hal_size_t _index : 42; 27 | hal_size_t _count : 20; 28 | hal_size_t _empty : 1; 29 | mutable hal_size_t _written : 1; 30 | }; 31 | typedef std::map StartMap; 32 | 33 | typedef std::pair FilePosition; 34 | typedef std::set PosSet; 35 | 36 | struct Record { 37 | hal_size_t _length; 38 | hal_size_t _numSegments; 39 | StartMap _startMap; 40 | PosSet _badPosSet; 41 | }; 42 | typedef std::map DimMap; 43 | 44 | public: 45 | MafScanDimensions(); 46 | ~MafScanDimensions(); 47 | void scan(const std::string &mafPath, const std::set &targetSet); 48 | const DimMap &getDimensions() const; 49 | 50 | protected: 51 | void aLine(); 52 | void sLine(); 53 | void end(); 54 | void updateDimensionsFromBlock(); 55 | void updateArrayIndices(); 56 | 57 | protected: 58 | DimMap _dimMap; 59 | }; 60 | } 61 | 62 | #endif 63 | // Local Variables: 64 | // mode: c++ 65 | // End: 66 | -------------------------------------------------------------------------------- /maf/inc/halMafScanReference.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALMAFSCANREFERENCE_H 9 | #define _HALMAFSCANREFERENCE_H 10 | 11 | #include "halMafScanner.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace hal { 19 | 20 | /** Parse a MAF file line by line, getting some dimension stats 21 | * and maybe checking for some errros. 
*/ 22 | class MafScanReference : private MafScanner { 23 | public: 24 | MafScanReference(); 25 | ~MafScanReference(); 26 | 27 | std::string getRefName(const std::string &mafPath); 28 | 29 | private: 30 | void aLine(); 31 | void sLine(); 32 | void end(); 33 | 34 | std::string _name; 35 | }; 36 | } 37 | 38 | #endif 39 | // Local Variables: 40 | // mode: c++ 41 | // End: 42 | -------------------------------------------------------------------------------- /maf/inc/halMafScanner.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALMAFSCANNER_H 9 | #define _HALMAFSCANNER_H 10 | 11 | #include "hal.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace hal { 19 | 20 | /** Parse a MAF file line by line 21 | * written independently from the maf export, and it's too much of a 22 | * bother to reuse any of that code. 
*/ 23 | class MafScanner { 24 | public: 25 | MafScanner(); 26 | virtual ~MafScanner(); 27 | virtual void scan(const std::string &mafPath, const std::set &targetSet); 28 | hal_size_t getNumBlocks() const { 29 | return _numBlocks; 30 | } 31 | static std::string genomeName(const std::string &fullName); 32 | static std::string sequenceName(const std::string &fullName); 33 | 34 | struct Row { 35 | std::string _sequenceName; 36 | hal_size_t _startPosition; 37 | hal_size_t _length; 38 | char _strand; 39 | hal_size_t _srcLength; 40 | std::string _line; 41 | }; 42 | typedef std::vector Block; 43 | typedef std::vector Mask; 44 | 45 | protected: 46 | virtual void aLine() = 0; 47 | virtual void sLine() = 0; 48 | virtual void end() = 0; 49 | void nextLine(); 50 | void updateMask(); 51 | 52 | std::ifstream _mafFile; 53 | std::set _targets; 54 | 55 | Block _block; 56 | size_t _rows; 57 | Mask _mask; 58 | hal_size_t _numBlocks; 59 | }; 60 | } 61 | 62 | #endif 63 | // Local Variables: 64 | // mode: c++ 65 | // End: 66 | -------------------------------------------------------------------------------- /maf/tests/halMafBlockTest.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #include "halMafBlockTest.h" 9 | #include "halMafBlock.h" 10 | 11 | using namespace std; 12 | using namespace hal; 13 | 14 | void MafBlockCreateTest::createCallBack(AlignmentPtr alignment) { 15 | } 16 | 17 | void MafBlockCreateTest::checkCallBack(AlignmentConstPtr alignment) { 18 | } 19 | 20 | void halMafBlockCreateTest(CuTest *testCase) { 21 | try { 22 | #if 0 // FIXME: test callback are empty 23 | MafBlockCreateTest tester; 24 | tester.check(testCase); 25 | #else 26 | std::cerr << "Warning: halMafBlockCreateTest are not implemented" << std::endl; 27 | #endif 28 | } 
catch (...) { 29 | CuAssertTrue(testCase, false); 30 | } 31 | } 32 | 33 | CuSuite *halMafBlockTestSuite(void) { 34 | CuSuite *suite = CuSuiteNew(); 35 | SUITE_ADD_TEST(suite, halMafBlockCreateTest); 36 | return suite; 37 | } 38 | -------------------------------------------------------------------------------- /maf/tests/halMafBlockTest.h: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 4 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 5 | * 6 | * Released under the MIT license, see LICENSE.txt 7 | */ 8 | 9 | #ifndef _HALMAFBLOCKTEST_H 10 | #define _HALMAFBLOCKTEST_H 11 | 12 | #include "hal.h" 13 | #include "halApiTestSupport.h" 14 | #include "halMafTests.h" 15 | #include 16 | 17 | using namespace hal; 18 | 19 | struct MafBlockCreateTest : public AlignmentTest { 20 | void createCallBack(AlignmentPtr alignment); 21 | void checkCallBack(AlignmentConstPtr alignment); 22 | }; 23 | 24 | #endif 25 | // Local Variables: 26 | // mode: c++ 27 | // End: 28 | -------------------------------------------------------------------------------- /maf/tests/halMafExportTest.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #include "halMafExport.h" 9 | #include "halMafTests.h" 10 | 11 | using namespace std; 12 | using namespace hal; 13 | 14 | CuSuite *halMafExportTestSuite(void) { 15 | CuSuite *suite = CuSuiteNew(); 16 | return suite; 17 | } 18 | -------------------------------------------------------------------------------- /maf/tests/halMafTests.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 
2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | #include "halMafTests.h" 8 | #include 9 | 10 | int halMafRunAllTests(void) { 11 | CuString *output = CuStringNew(); 12 | CuSuite *suite = CuSuiteNew(); 13 | CuSuiteAddSuite(suite, halMafExportTestSuite()); 14 | CuSuiteAddSuite(suite, halMafBlockTestSuite()); 15 | CuSuiteRun(suite); 16 | CuSuiteSummary(suite, output); 17 | CuSuiteDetails(suite, output); 18 | printf("%s\n", output->buffer); 19 | return suite->failCount > 0; 20 | } 21 | 22 | int main(int argc, char *argv[]) { 23 | 24 | return halMafRunAllTests(); 25 | } 26 | -------------------------------------------------------------------------------- /maf/tests/halMafTests.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALMAFTESTS_H 9 | #define _HALMAFTESTS_H 10 | 11 | #include "halApiTestSupport.h" 12 | 13 | extern "C" { 14 | #include "CuTest.h" 15 | } 16 | 17 | CuSuite *halMafExportTestSuite(); 18 | CuSuite *halMafBlockTestSuite(); 19 | 20 | #endif 21 | // Local Variables: 22 | // mode: c++ 23 | // End: 24 | -------------------------------------------------------------------------------- /maf/tests/input/small-Genome_0.bed: -------------------------------------------------------------------------------- 1 | Genome_0_seq 0 293 2 | Genome_0_seq 293 586 3 | Genome_0_seq 586 879 4 | Genome_0_seq 879 1033 5 | Genome_0_seq 1033 1172 6 | -------------------------------------------------------------------------------- /modify/ancestorsML.h: -------------------------------------------------------------------------------- 1 | #ifndef __ANCESTORSML_H_ 2 | #define __ANCESTORSML_H_ 3 | #include "halAlignment.h" 4 | #include "halDefs.h" 5 | #include 
"halGenome.h" 6 | #include "sonLibTree.h" 7 | #include 8 | extern "C" { 9 | #include "tree_model.h" 10 | } 11 | // PHAST code defines min, max macros which conflict with the reserved C++ names. 12 | #undef min 13 | #undef max 14 | typedef struct { 15 | const hal::Genome *rootGenome; 16 | hal_index_t pos; 17 | // Reversed with respect to reference? 18 | bool reversed; 19 | } rootInfo; 20 | 21 | typedef struct { 22 | // Position of this site in the genome 23 | hal_index_t pos; 24 | // Probability of leaves under this node given each nucleotide. 25 | double pLeaves[4]; 26 | // This is a terrible and incorrect name 27 | // TODO change it without breaking everything 28 | double pOtherLeaves[4]; 29 | // phast ID from the model. 30 | int phastId; 31 | // Posterior probability of this call (in case we need it later) 32 | double post; 33 | // should only be set on the leaves at first. 34 | char dna; 35 | // Reversed with respect to reference? 36 | bool reversed; 37 | // Whether this node has already been calculated. 38 | bool done; 39 | } felsensteinData; 40 | 41 | using namespace hal; 42 | 43 | void doFelsenstein(stTree *node, TreeModel *mod); 44 | 45 | void reEstimate(TreeModel *mod, AlignmentConstPtr alignment, const Genome *genome, hal_index_t startPos, hal_index_t endPos, 46 | std::map &nameToId, double threshold, bool printWrites, bool outputPosts); 47 | 48 | #endif 49 | // Local Variables: 50 | // mode: c++ 51 | // End: 52 | -------------------------------------------------------------------------------- /modify/ancestorsMLBed.cpp: -------------------------------------------------------------------------------- 1 | #ifndef __ANCESTORSMLBED_H_ 2 | #define __ANCESTORSMLBED_H_ 3 | #include "hal.h" 4 | #include 5 | #include 6 | extern "C" { 7 | #include "tree_model.h" 8 | } 9 | // PHAST code defines min, max macros which conflict with the reserved C++ names. 
10 | #undef min 11 | #undef max 12 | #include "ancestorsML.h" 13 | #include "ancestorsMLBed.h" 14 | 15 | using namespace hal; 16 | using namespace std; 17 | 18 | void AncestorsMLBed::visitLine() { 19 | hal_index_t startPos, endPos; 20 | string sequenceName = _bedLine._chrName; 21 | startPos = _bedLine._start; 22 | endPos = _bedLine._end; 23 | const Sequence *sequence = _genome->getSequenceCheck(sequenceName); 24 | if (sequence == NULL) { 25 | throw hal_exception("Sequence name not found!"); 26 | } 27 | startPos += sequence->getStartPosition(); 28 | endPos += sequence->getStartPosition(); 29 | 30 | reEstimate(_mod, _alignment, _genome, startPos, endPos, _nameToId, _threshold, _printWrites, _outputPosts); 31 | } 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /modify/ancestorsMLBed.h: -------------------------------------------------------------------------------- 1 | #include "halBedScanner.h" 2 | extern "C" { 3 | #include "tree_model.h" 4 | } 5 | // PHAST code defines min, max macros which conflict with the reserved C++ names. 
6 | #undef min 7 | #undef max 8 | 9 | using namespace hal; 10 | 11 | class AncestorsMLBed : public hal::BedScanner { 12 | public: 13 | AncestorsMLBed(TreeModel *mod, AlignmentConstPtr alignment, const Genome *genome, std::map &nameToId, 14 | double threshold, bool printWrites, bool outputPosts) 15 | : _mod(mod), _alignment(alignment), _genome(genome), _nameToId(nameToId), _threshold(threshold), 16 | _printWrites(printWrites), _outputPosts(outputPosts){}; 17 | void visitLine(); 18 | TreeModel *_mod; 19 | AlignmentConstPtr _alignment; 20 | const Genome *_genome; 21 | std::map &_nameToId; 22 | double _threshold; 23 | bool _printWrites; 24 | bool _outputPosts; 25 | }; 26 | // Local Variables: 27 | // mode: c++ 28 | // End: 29 | -------------------------------------------------------------------------------- /modify/halRemoveGenome.cpp: -------------------------------------------------------------------------------- 1 | #include "hal.h" 2 | #include "halCLParser.h" 3 | #include "markAncestors.h" 4 | 5 | using namespace std; 6 | using namespace hal; 7 | 8 | static void initParser(CLParser &optionsParser) { 9 | optionsParser.addArgument("inFile", "existing tree"); 10 | optionsParser.addArgument("deleteNode", "(leaf) genome to delete"); 11 | optionsParser.addOptionFlag("noMarkAncestors", "don't mark ancestors for" 12 | " update", 13 | false); 14 | } 15 | 16 | int main(int argc, char *argv[]) { 17 | CLParser optionsParser(WRITE_ACCESS); 18 | initParser(optionsParser); 19 | string inPath, deleteNode; 20 | bool noMarkAncestors; 21 | try { 22 | optionsParser.parseOptions(argc, argv); 23 | inPath = optionsParser.getArgument("inFile"); 24 | deleteNode = optionsParser.getArgument("deleteNode"); 25 | noMarkAncestors = optionsParser.getFlag("noMarkAncestors"); 26 | } catch (exception &e) { 27 | optionsParser.printUsage(cerr); 28 | return 1; 29 | } 30 | AlignmentPtr alignment(openHalAlignment(inPath, &optionsParser, READ_ACCESS | WRITE_ACCESS)); 31 | if (!noMarkAncestors) { 32 | 
markAncestorsForUpdate(alignment, deleteNode); 33 | } 34 | alignment->removeGenome(deleteNode); 35 | return 0; 36 | } 37 | -------------------------------------------------------------------------------- /modify/halRemoveSubtree.cpp: -------------------------------------------------------------------------------- 1 | #include "hal.h" 2 | #include "halCLParser.h" 3 | #include "markAncestors.h" 4 | 5 | using namespace std; 6 | using namespace hal; 7 | 8 | static void initParser(CLParser &optionsParser) { 9 | optionsParser.addArgument("inFile", "existing tree"); 10 | optionsParser.addArgument("root", "subtree below this node will be deleted (but not the node itself)"); 11 | optionsParser.addOptionFlag("noMarkAncestors", "don't mark ancestors for" 12 | " update", 13 | false); 14 | } 15 | 16 | void remove_recursive(AlignmentPtr aln, const string& node) { 17 | vector children = aln->getChildNames(node); 18 | for (size_t i = 0; i < children.size(); ++i) { 19 | remove_recursive(aln, children[i]); 20 | } 21 | cerr << "[halRemoveSubtree] removing " << node << endl; 22 | aln->removeGenome(node); 23 | } 24 | 25 | int main(int argc, char *argv[]) { 26 | CLParser optionsParser(WRITE_ACCESS); 27 | initParser(optionsParser); 28 | string inPath, root; 29 | bool noMarkAncestors; 30 | try { 31 | optionsParser.parseOptions(argc, argv); 32 | inPath = optionsParser.getArgument("inFile"); 33 | root = optionsParser.getArgument("root"); 34 | noMarkAncestors = optionsParser.getFlag("noMarkAncestors"); 35 | } catch (exception &e) { 36 | optionsParser.printUsage(cerr); 37 | return 1; 38 | } 39 | AlignmentPtr alignment(openHalAlignment(inPath, &optionsParser, READ_ACCESS | WRITE_ACCESS)); 40 | 41 | if (alignment->openGenome(root) == NULL) { 42 | cerr << "[halRemoveSubtree] Error: given root " << root << " not found in alignment" << endl; 43 | return 1; 44 | } 45 | 46 | if (!noMarkAncestors) { 47 | markAncestorsForUpdate(alignment, root); 48 | } 49 | 50 | // the main use case for this is 
prepping for halAppendSubtree, in which case we 51 | // want to leave the root node. 52 | vector children = alignment->getChildNames(root); 53 | if (children.empty()) { 54 | cerr << "[halRemoveSubtree] Warning: given root " << root << " is a leaf: doing nothing" << endl; 55 | } 56 | for (size_t i = 0; i < children.size(); ++i) { 57 | remove_recursive(alignment, children[i]); 58 | } 59 | return 0; 60 | } 61 | -------------------------------------------------------------------------------- /modify/halRenameGenomes.cpp: -------------------------------------------------------------------------------- 1 | #include "hal.h" 2 | #include "renameFile.h" 3 | 4 | using namespace std; 5 | using namespace hal; 6 | 7 | static void initParser(CLParser &optionsParser) { 8 | optionsParser.setDescription("Rename genomes in a HAL file in-place."); 9 | optionsParser.addArgument("halFile", "hal file"); 10 | optionsParser.addArgument("renameFile", "Tab-separated file. First column: existing genome" 11 | " name, second column: new genome name. Any " 12 | "genomes not provided will stay the same."); 13 | } 14 | 15 | int main(int argc, char *argv[]) { 16 | CLParser optionsParser(WRITE_ACCESS); 17 | initParser(optionsParser); 18 | string halPath, renamePath; 19 | try { 20 | optionsParser.parseOptions(argc, argv); 21 | halPath = optionsParser.getArgument("halFile"); 22 | renamePath = optionsParser.getArgument("renameFile"); 23 | } catch (exception &e) { 24 | cerr << e.what() << endl; 25 | optionsParser.printUsage(cerr); 26 | return 1; 27 | } 28 | 29 | AlignmentPtr alignment(openHalAlignment(halPath, &optionsParser, WRITE_ACCESS | READ_ACCESS)); 30 | map renameMap = ingestRenameFile(renamePath); 31 | 32 | // Check that the alignment has all the old genome names, and none 33 | // of the new genome names. 
34 | for (map::iterator it = renameMap.begin(); it != renameMap.end(); it++) { 35 | Genome *genome = alignment->openGenome(it->first); 36 | if (genome == NULL) { 37 | throw hal_exception("Genome " + it->first + " not found in alignment"); 38 | } 39 | 40 | genome = alignment->openGenome(it->second); 41 | if (genome != NULL) { 42 | throw hal_exception("Attempting to rename " + it->first + " to " + it->second + " failed: " + it->second + 43 | " is already in the alignment! Name it to something" 44 | " temporary first"); 45 | } 46 | } 47 | 48 | // Do the actual renaming now that we are relatively sure nothing 49 | // will go wrong. 50 | for (map::iterator it = renameMap.begin(); it != renameMap.end(); it++) { 51 | cout << "Renaming " << it->first << " to " << it->second << endl; 52 | Genome *genome = alignment->openGenome(it->first); 53 | genome->rename(it->second); 54 | } 55 | 56 | alignment->close(); 57 | } 58 | -------------------------------------------------------------------------------- /modify/halRenameSequences.cpp: -------------------------------------------------------------------------------- 1 | #include "hal.h" 2 | #include "renameFile.h" 3 | 4 | using namespace hal; 5 | using namespace std; 6 | 7 | static void initParser(CLParser &optionsParser) { 8 | optionsParser.setDescription("Rename the sequences of a genome in-place."); 9 | optionsParser.addArgument("halFile", "hal file"); 10 | optionsParser.addArgument("genome", "genome to rename the sequences of"); 11 | optionsParser.addArgument("renameFile", "Tab-separated file. First column: existing " 12 | "sequence name, second column: new sequence name." 
13 | " Any sequences not provided will stay the same."); 14 | } 15 | 16 | int main(int argc, char *argv[]) { 17 | CLParser optionsParser(WRITE_ACCESS); 18 | initParser(optionsParser); 19 | string halPath, renamePath, genomeName; 20 | try { 21 | optionsParser.parseOptions(argc, argv); 22 | halPath = optionsParser.getArgument("halFile"); 23 | genomeName = optionsParser.getArgument("genome"); 24 | renamePath = optionsParser.getArgument("renameFile"); 25 | } catch (exception &e) { 26 | cerr << e.what() << endl; 27 | optionsParser.printUsage(cerr); 28 | return 1; 29 | } 30 | 31 | AlignmentPtr alignment(openHalAlignment(halPath, &optionsParser, WRITE_ACCESS | READ_ACCESS)); 32 | Genome *genome = alignment->openGenome(genomeName); 33 | if (genome == NULL) { 34 | throw hal_exception("Genome " + genomeName + " not found in alignment"); 35 | } 36 | map renameMap = ingestRenameFile(renamePath); 37 | 38 | for (map::iterator it = renameMap.begin(); it != renameMap.end(); it++) { 39 | Sequence *sequence = genome->getSequenceCheck(it->first); 40 | 41 | sequence = genome->getSequence(it->second); 42 | if (sequence != NULL) { 43 | throw hal_exception("Attempting to rename sequence " + it->first + " to " + it->second + " failed: " + it->second + 44 | " is already in the genome! Name it to something" 45 | " temporary first"); 46 | } 47 | } 48 | 49 | // Do the actual renaming now that we are relatively sure nothing 50 | // will go wrong. 
51 | for (map::iterator it = renameMap.begin(); it != renameMap.end(); it++) { 52 | cout << "Renaming " << it->first << " to " << it->second << endl; 53 | Sequence *sequence = genome->getSequence(it->first); 54 | sequence->setName(it->second); 55 | } 56 | 57 | alignment->close(); 58 | } 59 | -------------------------------------------------------------------------------- /modify/halSetMetadata.cpp: -------------------------------------------------------------------------------- 1 | // Simple utility to set metadata for a hal genome or alignment 2 | #include "hal.h" 3 | 4 | using namespace hal; 5 | using namespace std; 6 | 7 | static void initParser(CLParser &optionsParser) { 8 | optionsParser.setDescription("Set metadata for an alignment or genome"); 9 | optionsParser.addArgument("halFile", "hal file to modify"); 10 | optionsParser.addArgument("key", "metadata key"); 11 | optionsParser.addArgument("value", "metadata value"); 12 | optionsParser.addOption("genome", "genome to set metadata for instead of " 13 | "setting it for the entire alignment", 14 | ""); 15 | } 16 | 17 | int main(int argc, char *argv[]) { 18 | string halPath, key, value, genomeName; 19 | CLParser optionsParser(WRITE_ACCESS); 20 | initParser(optionsParser); 21 | try { 22 | optionsParser.parseOptions(argc, argv); 23 | halPath = optionsParser.getArgument("halFile"); 24 | key = optionsParser.getArgument("key"); 25 | value = optionsParser.getArgument("value"); 26 | genomeName = optionsParser.getOption("genome"); 27 | } catch (exception &e) { 28 | cerr << e.what() << endl; 29 | optionsParser.printUsage(cerr); 30 | return 1; 31 | } 32 | 33 | AlignmentPtr alignment(openHalAlignment(halPath, &optionsParser, 34 | WRITE_ACCESS | READ_ACCESS)); 35 | if (genomeName == "") { 36 | // No genome to set metadata for, so set the alignment-wide metadata. 
37 | MetaData *metadata = alignment->getMetaData(); 38 | metadata->set(key, value); 39 | } else { 40 | Genome *genome = alignment->openGenome(genomeName); 41 | if (genome == NULL) { 42 | throw hal_exception("No genome named " + genomeName + " in alignment"); 43 | } 44 | MetaData *metadata = genome->getMetaData(); 45 | metadata->set(key, value); 46 | } 47 | alignment->close(); 48 | return 0; 49 | } 50 | -------------------------------------------------------------------------------- /modify/halUpdateBranchLengths.cpp: -------------------------------------------------------------------------------- 1 | #include "hal.h" 2 | #include "halAlignmentInstance.h" 3 | #include "sonLibTree.h" 4 | 5 | using namespace std; 6 | using namespace hal; 7 | 8 | static void initParser(CLParser &optionsParser) { 9 | optionsParser.addArgument("halFile", "hal file"); 10 | optionsParser.addArgument("newickTree", "newick tree (must be identical," 11 | " except for the branch lengths"); 12 | } 13 | 14 | void updateBranches(AlignmentPtr alignment, Genome *genome, stTree *newTree) { 15 | if (genome->getNumChildren() == 0) { 16 | return; 17 | } 18 | for (hal_size_t i = 0; i < genome->getNumChildren(); i++) { 19 | Genome *child = genome->getChild(i); 20 | bool found = false; 21 | for (int64_t j = 0; j < stTree_getChildNumber(newTree); j++) { 22 | stTree *newChild = stTree_getChild(newTree, j); 23 | if (child->getName() == stTree_getLabel(newChild)) { 24 | found = true; 25 | alignment->updateBranchLength(genome->getName(), child->getName(), stTree_getBranchLength(newChild)); 26 | updateBranches(alignment, child, newChild); 27 | break; 28 | } 29 | } 30 | if (!found) { 31 | throw hal_exception("Genome " + child->getName() + " not found in proper" 32 | " place in replacement newick tree."); 33 | } 34 | } 35 | } 36 | 37 | int main(int argc, char *argv[]) { 38 | string halPath, newickTree; 39 | CLParser optionsParser(WRITE_ACCESS); 40 | initParser(optionsParser); 41 | try { 42 | 
optionsParser.parseOptions(argc, argv); 43 | halPath = optionsParser.getArgument("halFile"); 44 | newickTree = optionsParser.getArgument("newickTree"); 45 | } catch (exception &e) { 46 | cerr << e.what() << endl; 47 | optionsParser.printUsage(cerr); 48 | return 1; 49 | } 50 | AlignmentPtr alignment(openHalAlignment(halPath, &optionsParser, 51 | WRITE_ACCESS | READ_ACCESS)); 52 | stTree *newTree = stTree_parseNewickString(newickTree.c_str()); 53 | // recursively update branches 54 | updateBranches(alignment, alignment->openGenome(alignment->getRootName()), newTree); 55 | } 56 | -------------------------------------------------------------------------------- /modify/halWriteNucleotides.cpp: -------------------------------------------------------------------------------- 1 | // Hacky script to allow writing to a genome's sequence from a TSV (probably 2 | // after an ancestorsML run). 3 | #include "hal.h" 4 | #include "halAlignmentInstance.h" 5 | #include 6 | 7 | using namespace hal; 8 | using namespace std; 9 | 10 | static void initParser(CLParser &optionsParser) { 11 | optionsParser.setDescription("Write changes to a hal sequence from a TSV " 12 | "containing fields " 13 | "genomeName\tpos\toldChar\tnewChar. 
Note that " 14 | "the position is in genome coordinates!"); 15 | optionsParser.addArgument("inFile", "hal file"); 16 | optionsParser.addArgument("tsvFile", "tsv file"); 17 | } 18 | 19 | int main(int argc, char *argv[]) { 20 | CLParser optionsParser(WRITE_ACCESS); 21 | initParser(optionsParser); 22 | string inPath, tsvFile; 23 | try { 24 | optionsParser.parseOptions(argc, argv); 25 | inPath = optionsParser.getArgument("inFile"); 26 | tsvFile = optionsParser.getArgument("tsvFile"); 27 | } catch (exception &e) { 28 | optionsParser.printUsage(cerr); 29 | return 1; 30 | } 31 | AlignmentPtr alignment(openHalAlignment(inPath, &optionsParser, 32 | READ_ACCESS | WRITE_ACCESS)); 33 | 34 | ifstream tsv(tsvFile.c_str()); 35 | string line; 36 | int64_t lineNum = 0; 37 | while (getline(tsv, line)) { 38 | stringstream lineStream(line); 39 | string genomeName; 40 | hal_index_t pos; 41 | char prevChar, newChar; 42 | 43 | lineNum++; 44 | if (lineNum % 100000 == 0) { 45 | cout << lineNum << endl; 46 | } 47 | lineStream >> genomeName; 48 | lineStream >> pos; 49 | lineStream >> prevChar; 50 | lineStream >> newChar; 51 | Genome *genome = alignment->openGenome(genomeName); 52 | DnaIteratorPtr dnaIt = genome->getDnaIterator(pos); 53 | if (fastUpper(dnaIt->getBase()) != prevChar) { 54 | dnaIt->toReverse(); 55 | if (fastUpper(dnaIt->getBase()) != prevChar) { 56 | throw hal_exception("previous nucleotide " + string(1, dnaIt->getBase()) + " does not match expected " + 57 | string(1, prevChar) + "! Aborting early. 
Your hal file could be invalid now."); 58 | } 59 | } 60 | 61 | dnaIt->setBase(newChar); 62 | } 63 | tsv.close(); 64 | alignment->close(); 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /modify/markAncestors.cpp: -------------------------------------------------------------------------------- 1 | #include "hal.h" 2 | 3 | using namespace std; 4 | using namespace hal; 5 | 6 | // Mark that all nodes above this one (but not this one) need to be 7 | // updated. 8 | void markAncestorsForUpdate(AlignmentPtr alignment, string node) { 9 | Genome *parent = alignment->openGenome(alignment->getParentName(node)); 10 | if (!parent) { 11 | return; 12 | } 13 | MetaData *metadata = parent->getMetaData(); 14 | metadata->set("needsUpdate", "true"); 15 | markAncestorsForUpdate(alignment, parent->getName()); 16 | alignment->closeGenome(parent); 17 | } 18 | -------------------------------------------------------------------------------- /modify/markAncestors.h: -------------------------------------------------------------------------------- 1 | #ifndef _MARK_ANCESTORS_H_ 2 | #define _MARK_ANCESTORS_H_ 3 | using namespace hal; 4 | 5 | void markAncestorsForUpdate(AlignmentPtr alignment, std::string node); 6 | #endif // _MARK_ANCESTORS_H_ 7 | // Local Variables: 8 | // mode: c++ 9 | // End: 10 | -------------------------------------------------------------------------------- /modify/renameFile.cpp: -------------------------------------------------------------------------------- 1 | #include "renameFile.h" 2 | #include "hal.h" 3 | #include "sonLib.h" 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | using namespace hal; 10 | 11 | // Parse a rename file with one "old name, TAB, new name" pair per line and 12 | // return the old-name to new-name mapping. Throws hal_exception when a line 13 | // does not have exactly two fields, when a name is listed twice, or when a 14 | // name is used both as an old name and as a new name. 15 | map hal::ingestRenameFile(string tsvPath) { 16 | map ret; 17 | // every new name accepted so far, for duplicate and collision checks 18 | set values; 19 | ifstream tsv(tsvPath.c_str()); 20 | string line; 21 | while (getline(tsv, line)) { 22 | stList *tokens = stString_splitByString(line.c_str(), "\t"); 23 | if (stList_length(tokens) != 2) { 24 | // release the token list before throwing so it is not leaked 25 | stList_destruct(tokens); 26 | throw 
hal_exception("Rename file does not have 2 tab-separated fields " 27 | "in line: " + 28 | line); 29 | } 30 | string oldName((char *)stList_get(tokens, 0)); 31 | string newName((char *)stList_get(tokens, 1)); 32 | stList_destruct(tokens); 33 | 34 | if (ret.count(oldName)) { 35 | throw hal_exception("Old name " + oldName + " is represented twice in rename file"); 36 | } 37 | if (values.count(oldName)) { 38 | throw hal_exception("New name " + oldName + " is same as an old name " + oldName + 39 | ". Collisions are not allowed, rename to " 40 | "something temporary first."); 41 | } 42 | // the same collision with the two lines in the opposite order 43 | if (ret.count(newName)) { 44 | throw hal_exception("New name " + newName + " is same as an old name " + newName + 45 | ". Collisions are not allowed, rename to " 46 | "something temporary first."); 47 | } 48 | if (values.count(newName)) { 49 | throw hal_exception("New name " + newName + " is represented twice in rename file"); 50 | } 51 | 52 | ret.insert(make_pair(oldName, newName)); 53 | values.insert(newName); 54 | } 55 | return ret; 56 | } 57 | -------------------------------------------------------------------------------- /modify/renameFile.h: -------------------------------------------------------------------------------- 1 | #ifndef _RENAME_FILE_H_ 2 | #define _RENAME_FILE_H_ 3 | #include 4 | #include 5 | namespace hal { 6 | std::map ingestRenameFile(std::string tsvPath); 7 | } 8 | #endif // _RENAME_FILE_H_ 9 | // Local Variables: 10 | // mode: c++ 11 | // End: 12 | -------------------------------------------------------------------------------- /mutations/Makefile: -------------------------------------------------------------------------------- 1 | rootDir = .. 
2 | include ${rootDir}/include.mk 3 | modObjDir = ${objDir}/mutations 4 | 5 | libHalMutations_srcs = impl/halBranchMutations.cpp impl/halMutationsStats.cpp impl/halSummarizeMutations.cpp 6 | libHalMutations_objs = ${libHalMutations_srcs:%.cpp=${modObjDir}/%.o} 7 | halIndels_srcs = impl/halIndels.cpp 8 | halIndels_objs = ${halIndels_srcs:%.cpp=${modObjDir}/%.o} 9 | halBranchMutations_srcs = impl/halBranchMutationsMain.cpp 10 | halBranchMutations_objs = ${halBranchMutations_srcs:%.cpp=${modObjDir}/%.o} 11 | halSnps_srcs = impl/halSnps.cpp 12 | halSnps_objs = ${halSnps_srcs:%.cpp=${modObjDir}/%.o} 13 | halSummarizeMutations_srcs = impl/halSummarizeMutationsMain.cpp 14 | halSummarizeMutations_objs = ${halSummarizeMutations_srcs:%.cpp=${modObjDir}/%.o} 15 | srcs = ${libHalMutations_srcs} ${halIndels_srcs} ${halBranchMutations_srcs} ${halSnps_srcs} ${halSummarizeMutations_srcs} 16 | objs = ${srcs:%.cpp=${modObjDir}/%.o} 17 | depends = ${srcs:%.cpp=%.depend} 18 | progs = ${binDir}/halIndels ${binDir}/halBranchMutations ${binDir}/halSnps ${binDir}/halSummarizeMutations 19 | otherLibs = ${libHalMutations} 20 | 21 | all : libs progs 22 | libs: ${libHalMutations} 23 | progs: ${progs} 24 | 25 | clean : 26 | rm -f ${libHalMutations} ${objs} ${progs} ${depends} 27 | test: 28 | 29 | include ${rootDir}/rules.mk 30 | 31 | # don't fail on missing dependencies, they are first time the .o is generates 32 | -include ${depends} 33 | 34 | 35 | # Local Variables: 36 | # mode: makefile-gmake 37 | # End: 38 | 39 | -------------------------------------------------------------------------------- /mutations/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | -------------------------------------------------------------------------------- /mutations/impl/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 
-------------------------------------------------------------------------------- /mutations/inc/halBranchMutations.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALBRANCHMUTATIONS_H 9 | #define _HALBRANCHMUTATIONS_H 10 | 11 | #include "hal.h" 12 | #include 13 | #include 14 | #include 15 | 16 | namespace hal { 17 | 18 | class BranchMutations { 19 | public: 20 | BranchMutations(); 21 | virtual ~BranchMutations(); 22 | 23 | void printCsv(std::ostream &outStream) const; 24 | void analyzeBranch(AlignmentConstPtr alignment, hal_size_t gapThreshold, double nThreshold, std::ostream *refBedStream, 25 | std::ostream *parentBedStream, std::ostream *snpBedStream, std::ostream *delBreakBedStream, 26 | const Genome *reference, hal_index_t startPosition, hal_size_t length); 27 | 28 | static const std::string inversionBedTag; 29 | static const std::string insertionBedTag; 30 | static const std::string deletionBedTag; 31 | static const std::string deletionBreakBedTag; 32 | static const std::string transpositionBedTag; 33 | static const std::string duplicationBedTag; 34 | static const std::string gapInsertionBedTag; 35 | static const std::string gapDeletionBedTag; 36 | static const std::string gapDeletionBreakBedTag; 37 | static std::string substitutionBedTag(char parent, char child); 38 | 39 | protected: 40 | void writeInsertionOrInversion(); 41 | void writeSubstitutions(TopSegmentIteratorPtr first, TopSegmentIteratorPtr lastPlusOne); 42 | void writeGapInsertions(); 43 | void writeDeletion(); 44 | void writeDeletionBreakPoint(); 45 | void writeDuplication(); 46 | void writeHeaders(); 47 | 48 | protected: 49 | AlignmentConstPtr _alignment; 50 | std::ostream *_refStream; 51 | std::ostream *_parentStream; 52 | std::ostream 
*_snpStream; 53 | std::ostream *_delBreakStream; 54 | hal_size_t _maxGap; 55 | double _nThreshold; 56 | const Genome *_reference; 57 | const Sequence *_sequence; 58 | hal_size_t _start; 59 | hal_size_t _length; 60 | std::string _refName; 61 | std::string _parName; 62 | 63 | RearrangementPtr _rearrangement; 64 | TopSegmentIteratorPtr _top; 65 | BottomSegmentIteratorPtr _bottom1, _bottom2; 66 | }; 67 | } 68 | 69 | #endif 70 | // Local Variables: 71 | // mode: c++ 72 | // End: 73 | -------------------------------------------------------------------------------- /mutations/inc/halMutationsStats.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALMUTATIONSSTATS_H 9 | #define _HALMUTATIONSSTATS_H 10 | 11 | #include "hal.h" 12 | #include "halAverage.h" 13 | #include 14 | #include 15 | #include 16 | 17 | namespace hal { 18 | 19 | struct MutationsStats { 20 | typedef Average Avg; 21 | // Tree Information 22 | hal_size_t _genomeLength; 23 | hal_size_t _parentLength; 24 | double _branchLength; 25 | 26 | // Substitution Information 27 | hal_size_t _subs; 28 | hal_size_t _transitions; 29 | hal_size_t _transversions; 30 | hal_size_t _matches; 31 | 32 | // Rearrangement Information 33 | Avg _nothingLength; 34 | Avg _inversionLength; 35 | Avg _insertionLength; 36 | Avg _deletionLength; 37 | Avg _transpositionLength; 38 | Avg _duplicationLength; 39 | Avg _otherLength; 40 | Avg _gapInsertionLength; 41 | Avg _gapDeletionLength; 42 | 43 | static void printHeader(std::ostream &os); 44 | }; 45 | 46 | std::ostream &operator<<(std::ostream &os, const MutationsStats &stats); 47 | 48 | MutationsStats &operator+=(MutationsStats &ms, const MutationsStats &other); 49 | MutationsStats &operator/=(MutationsStats &ms, hal_size_t N); 50 | } 51 | 52 
| #endif 53 | // Local Variables: 54 | // mode: c++ 55 | // End: 56 | -------------------------------------------------------------------------------- /mutations/inc/halSummarizeMutations.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALMUTATIONS_H 9 | #define _HALMUTATIONS_H 10 | 11 | #include "hal.h" 12 | #include "halAverage.h" 13 | #include "halMutationsStats.h" 14 | #include 15 | #include 16 | #include 17 | 18 | namespace hal { 19 | 20 | class SummarizeMutations { 21 | public: 22 | SummarizeMutations(); 23 | virtual ~SummarizeMutations(); 24 | 25 | void printCsv(std::ostream &outStream) const; 26 | void analyzeAlignmentPtr(AlignmentConstPtr alignment, hal_size_t gapThreshold, double nThreshold, bool justSubs, 27 | const std::set *targetSet = NULL); 28 | 29 | protected: 30 | void analyzeGenomeRecursive(const std::string &genomeName); 31 | void substitutionAnalysis(const Genome *genome, MutationsStats &stats); 32 | void rearrangementAnalysis(const Genome *genome, MutationsStats &stats); 33 | void subsAndGapInserts(GappedTopSegmentIteratorPtr gappedTop, MutationsStats &stats); 34 | 35 | typedef std::pair StrPair; 36 | typedef std::map BranchMap; 37 | 38 | BranchMap _branchMap; 39 | AlignmentConstPtr _alignment; 40 | hal_size_t _gapThreshold; 41 | double _nThreshold; 42 | bool _justSubs; 43 | const std::set *_targetSet; 44 | }; 45 | } 46 | 47 | inline std::ostream &operator<<(std::ostream &os, const hal::SummarizeMutations &halCons) { 48 | halCons.printCsv(os); 49 | return os; 50 | } 51 | 52 | #endif 53 | // Local Variables: 54 | // mode: c++ 55 | // End: 56 | -------------------------------------------------------------------------------- /paf/Makefile: 
-------------------------------------------------------------------------------- 1 | rootDir = .. 2 | include ${rootDir}/include.mk 3 | modObjDir = ${objDir}/paf 4 | 5 | hal2paf_srcs = hal2paf.cpp 6 | hal2paf_objs = ${hal2paf_srcs:%.cpp=${modObjDir}/%.o} 7 | srcs = ${hal2paf_srcs} 8 | objs = ${srcs:%.cpp=${modObjDir}/%.o} 9 | depends = ${srcs:%.cpp=%.depend} 10 | progs = ${binDir}/hal2paf 11 | 12 | all: progs 13 | libs: 14 | progs: ${progs} 15 | 16 | clean: 17 | rm -f ${objs} ${progs} ${depends} 18 | 19 | test: hal2pafSmallMMapTest hal2pafMouseRatTest 20 | 21 | hal2pafSmallMMapTest: tests/output/small.mmap1.0.hal tests/output/hal2pafSmallMMapTest.paf.baseline 22 | ../bin/hal2paf tests/output/small.mmap1.0.hal --onlySequenceNames > tests/output/$@.paf 23 | diff tests/output/$@.paf tests/output/hal2pafSmallMMapTest.paf.baseline 24 | 25 | hal2pafMouseRatTest: tests/output/hal2pafMouseRatTest.paf.baseline 26 | ../bin/hal2paf tests/input/mr.hal > tests/output/$@.paf 27 | diff tests/output/$@.paf tests/output/hal2pafMouseRatTest.paf.baseline 28 | 29 | tests/output/small.mmap1.0.hal: output 30 | bunzip2 -dc ../extract/tests/input/small.mmap1.0.hal.bz2 > tests/output/small.mmap1.0.hal 31 | 32 | tests/output/hal2pafSmallMMapTest.paf.baseline: output 33 | gzip -dc tests/expected/hal2pafSmallMMapTest.paf.gz > tests/output/hal2pafSmallMMapTest.paf.baseline 34 | 35 | tests/output/hal2pafMouseRatTest.paf.baseline: output 36 | gzip -dc tests/expected/hal2pafMouseRatTest.paf.gz > tests/output/hal2pafMouseRatTest.paf.baseline 37 | 38 | output: 39 | mkdir -p tests/output 40 | 41 | include ${rootDir}/rules.mk 42 | 43 | # don't fail on missing dependencies, they are first time the .o is generates 44 | -include ${depends} 45 | 46 | 47 | 48 | # Local Variables: 49 | # mode: makefile-gmake 50 | # End: 51 | 52 | -------------------------------------------------------------------------------- /paf/tests/expected/hal2pafMouseRatTest.paf.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/paf/tests/expected/hal2pafMouseRatTest.paf.gz -------------------------------------------------------------------------------- /paf/tests/expected/hal2pafSmallMMapTest.paf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/paf/tests/expected/hal2pafSmallMMapTest.paf.gz -------------------------------------------------------------------------------- /paf/tests/input/mr.hal: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/paf/tests/input/mr.hal -------------------------------------------------------------------------------- /phyloP/Makefile: -------------------------------------------------------------------------------- 1 | rootDir = .. 
2 | include ${rootDir}/include.mk 3 | modObjDir = ${objDir}/phyloP 4 | 5 | halPhyloP_srcs = impl/halPhyloPMain.cpp impl/halPhyloPBed.cpp impl/halPhyloP.cpp 6 | halPhyloP_objs = ${halPhyloP_srcs:%.cpp=${modObjDir}/%.o} 7 | srcs = ${halPhyloP_srcs} 8 | objs = ${srcs:%.cpp=${modObjDir}/%.o} 9 | depends = ${srcs:%.cpp=%.depend} 10 | progs = ${binDir}/halPhyloP ${binDir}/halPhyloPTrain.py ${binDir}/halPhyloPMP.py ${binDir}/halTreePhyloP.py 11 | otherLibs = ${libHalLiftover} 12 | inclSpec += -I${rootDir}/liftover/inc ${PHASTCXXFLAGS} 13 | 14 | ifdef ENABLE_PHYLOP 15 | all: progs 16 | libs: 17 | progs: ${progs} 18 | else 19 | all: 20 | libs: 21 | progs: 22 | endif 23 | 24 | clean: 25 | rm -f ${objs} ${progs} ${depends} 26 | test: 27 | 28 | include ${rootDir}/rules.mk 29 | 30 | # don't fail on missing dependencies, they are first time the .o is generates 31 | -include ${depends} 32 | 33 | 34 | # Local Variables: 35 | # mode: makefile-gmake 36 | # End: 37 | 38 | -------------------------------------------------------------------------------- /phyloP/inc/halPhyloP.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) and 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * Melissa Jane Hubisz (Cornell University) 5 | * 6 | * Released under the MIT license, see LICENSE.txt 7 | */ 8 | 9 | #ifndef _HALPHYLOP_H 10 | #define _HALPHYLOP_H 11 | 12 | #include "hal.h" 13 | #include 14 | #include 15 | 16 | #undef __cplusplus 17 | extern "C" { 18 | #include "fit_column.h" 19 | #include "hashtable.h" 20 | #include "msa.h" 21 | #include "sufficient_stats.h" 22 | #include "tree_model.h" 23 | } 24 | // PHAST code defines min, max macros which conflict with the reserved C++ names. 25 | #undef min 26 | #undef max 27 | 28 | namespace hal { 29 | 30 | /** Use the Phast library methods to compute a PhyloP score for a HAL 31 | * aligment, column by column. 
Thanks to Melissa Jane Hubisz. */ 32 | class PhyloP { 33 | public: 34 | PhyloP(); 35 | virtual ~PhyloP(); 36 | 37 | /** @param softMaskDups true for soft duplication mask or false for 38 | * hard duplication mask (mask entire column) 39 | * @param dupType ambiguous or all 40 | * @param phyloPMode "CONACC", "CON", "ACC", "NNEUT" are choices, 41 | * though I think we are mainly interested in CONACC 42 | * (conservation/acceleration- negative p-values indicate acceleration) 43 | * @param subtree If equal to empty string, perform phyloP test on 44 | * entire tree. Otherwise, subtree names a branch to perform test on 45 | * subtree relative to rest of tree. The subtree includes all children 46 | * of the named node as well as the branch leading to the node. NOTE(review): the declared default below is a string holding two double-quote characters, not an empty string; confirm init() treats that as the entire-tree case. 47 | */ 48 | void init(AlignmentConstPtr alignment, const std::string &modFilePath, std::ostream *outStream, 49 | bool softMaskDups = true, const std::string &dupType = "ambiguous", const std::string &phyloPMode = "CONACC", 50 | const std::string &subtree = "\"\""); 51 | 52 | void processSequence(const Sequence *sequence, hal_index_t start, hal_size_t length, hal_size_t step); 53 | 54 | protected: 55 | // return phyloP score 56 | double pval(const ColumnIterator::ColumnMap *cmap); 57 | 58 | void clear(); 59 | 60 | protected: 61 | AlignmentConstPtr _alignment; 62 | TreeModel *_mod; 63 | TreeModel *_modcpy; 64 | std::set _targetSet; 65 | std::ostream *_outStream; 66 | 67 | // 1 default = soft mask, if 0 use hard mask (mask entire column) 68 | int _softMaskDups; 69 | // 0 default = mask only ambiguous bases in dups; if 1 mask any duplication 70 | int _maskAllDups; 71 | 72 | hash_table *_seqnameHash; 73 | ColFitData *_colfitdata; 74 | ColFitData *_colfitdata2; 75 | List *_insideNodes; 76 | List *_outsideNodes; 77 | mode_type _mode; 78 | MSA *_msa; 79 | }; 80 | } 81 | #endif 82 | // Local Variables: 83 | // mode: c++ 84 | // End: 85 | -------------------------------------------------------------------------------- /phyloP/inc/halPhyloPBed.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALPHYLOPBED_H 9 | #define _HALPHYLOPBED_H 10 | 11 | #include "halBedScanner.h" 12 | #include "halPhyloP.h" 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace hal { 19 | 20 | /** Use the halBedScanner to parse a bed file, running halPhyloP on each 21 | * line */ 22 | class PhyloPBed : public BedScanner { 23 | public: 24 | PhyloPBed(AlignmentConstPtr alignment, const Genome *refGenome, const Sequence *refSequence, hal_index_t start, 25 | hal_size_t length, hal_size_t step, PhyloP &phyloP, std::ostream &outStream); 26 | virtual ~PhyloPBed(); 27 | 28 | void run(std::istream *bedStream); 29 | 30 | protected: 31 | virtual void visitLine(); 32 | 33 | protected: 34 | AlignmentConstPtr _alignment; 35 | const Genome *_refGenome; 36 | const Sequence *_refSequence; 37 | hal_index_t _refStart; 38 | hal_index_t _refLength; 39 | hal_size_t _step; 40 | PhyloP &_phyloP; 41 | std::ostream &_outStream; 42 | }; 43 | } 44 | 45 | #endif 46 | // Local Variables: 47 | // mode: c++ 48 | // End: 49 | -------------------------------------------------------------------------------- /phyloP/test/blanchette.hal: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/phyloP/test/blanchette.hal -------------------------------------------------------------------------------- /phyloP/test/blanchette.mod: -------------------------------------------------------------------------------- 1 | ALPHABET: A C G T 2 | ORDER: 0 3 | SUBST_MOD: REV 4 | BACKGROUND: 0.295000 0.205000 0.205000 0.295000 5 | RATE_MAT: 6 | -0.976030 0.165175 0.539722 0.271133 7 | 
0.237691 -0.990352 0.189637 0.563024 8 | 0.776673 0.189637 -1.248143 0.281833 9 | 0.271133 0.391254 0.195849 -0.858237 10 | TREE: ((((HUMAN:0.006969,CHIMP:0.009727)Anc6:0.025291,BABOON:0.044568)Anc5:0.11,(MOUSE:0.072818,RAT:0.081244)Anc4:0.260342)Anc3:0.02326,((DOG:0.07,CAT:0.07)Anc2:0.087381,(PIG:0.06,COW:0.06)Anc1:0.104728)Anc0:0.04)ProgressiveCactusRoot; 11 | -------------------------------------------------------------------------------- /phyloP/test/test.sh: -------------------------------------------------------------------------------- 1 | hal2maf blanchette.hal blanchette.maf --refGenome HUMAN 2 | halPhyloP blanchette.hal HUMAN blanchette.mod fromHal 3 | phyloP -i MAF --method LRT --mode CONACC --wig-scores blanchette.mod blanchette.maf > fromMaf 4 | -------------------------------------------------------------------------------- /randgen/Makefile: -------------------------------------------------------------------------------- 1 | rootDir = .. 2 | include ${rootDir}/include.mk 3 | modObjDir = ${objDir}/randgen 4 | 5 | halRandGen_srcs = halRandGen.cpp 6 | halRandGen_objs = ${halRandGen_srcs:%.cpp=${modObjDir}/%.o} 7 | halTestGen_srcs = halTestGen.cpp 8 | halTestGen_objs = ${halTestGen_srcs:%.cpp=${modObjDir}/%.o} 9 | srcs = ${halRandGen_srcs} ${halTestGen_srcs} 10 | objs = ${srcs:%.cpp=${modObjDir}/%.o} 11 | depends = ${srcs:%.cpp=%.depend} 12 | progs = ${binDir}/halRandGen ${binDir}/halTestGen 13 | 14 | inclSpec += -I${halApiTestIncl} 15 | otherLibs += ${halApiTestSupportLibs} 16 | 17 | all: progs 18 | libs: 19 | progs: ${progs} 20 | 21 | clean: 22 | rm -f ${objs} ${progs} ${depends} 23 | test: 24 | 25 | include ${rootDir}/rules.mk 26 | 27 | # don't fail on missing dependencies, they are first time the .o is generates 28 | -include ${depends} 29 | 30 | 31 | # Local Variables: 32 | # mode: makefile-gmake 33 | # End: 34 | 35 | -------------------------------------------------------------------------------- /randgen/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/randgen/__init__.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | newick 3 | toil[aws]==5.3.0 4 | pathlib 5 | -------------------------------------------------------------------------------- /rules.mk: -------------------------------------------------------------------------------- 1 | ## 2 | # common rules 3 | ## 4 | 5 | 6 | # Copy python program. This needs to precede linking of objects or some 7 | # versions of gnu make try to run the link rule 8 | ${binDir}/%.py : %.py 9 | @mkdir -p $(dir $@) 10 | cp -f $< $@ 11 | chmod a+x,a-w $@ 12 | 13 | # Generate .depend and compile objects. Because some test code is 14 | # compiled by different modules, it is possible to generate the .depend file 15 | # multiple times, so do it atomically. 16 | ${modObjDir}/%.o: %.cpp 17 | @mkdir -p $(dir $@) 18 | ${CXX} -MM -MT $@ ${CXXFLAGS} ${inclSpec} -c $< >$*.depend 19 | ${CXX} ${CXXFLAGS} ${inclSpec} -c $< -o $@ 20 | 21 | ${modObjDir}/%.o: %.c 22 | @mkdir -p $(dir $@) 23 | ${CC} -MM -MT $@ ${CFLAGS} ${inclSpec} -c $< >$*.depend 24 | ${CC} ${CFLAGS} ${inclSpec} -c $< -o $@ 25 | 26 | # compile a program. 
27 | # ${prog_objs} - has object files specific for ${prog} 28 | # otherLibs - other libraries to used 29 | .SECONDEXPANSION: 30 | ${binDir}/% : $${$$*_objs} ${libHal} ${otherLibs} ${LIBDEPENDS} 31 | @mkdir -p $(dir $@) 32 | ${CXX} ${CXXFLAGS} ${inclSpec} -I tests -o $@ ${${*}_objs} ${otherLibs} ${libHal} ${LDLIBS} 33 | 34 | 35 | # build a library 36 | # $lib_objs has objects for $lib 37 | .SECONDEXPANSION: 38 | ${libDir}/%.a: $${$$*_objs} 39 | @mkdir -p $(dir $@) 40 | ${AR} rc $@ ${$*_objs} 41 | ${RANLIB} $@ 42 | -------------------------------------------------------------------------------- /stats/Makefile: -------------------------------------------------------------------------------- 1 | rootDir = .. 2 | include ${rootDir}/include.mk 3 | modObjDir = ${objDir}/stats 4 | 5 | libHalStats_srcs = impl/halStats.cpp 6 | libHalStats_objs = ${libHalStats_srcs:%.cpp=${modObjDir}/%.o} 7 | halStats_srcs = impl/halStatsMain.cpp 8 | halStats_objs = ${halStats_srcs:%.cpp=${modObjDir}/%.o} 9 | halCoverage_srcs = impl/halCoverage.cpp 10 | halCoverage_objs = ${halCoverage_srcs:%.cpp=${modObjDir}/%.o} 11 | halPctId_srcs = impl/halPctIdentity.cpp 12 | halPctId_objs = ${halPctId_srcs:%.cpp=${modObjDir}/%.o} 13 | srcs = ${libHalStats_srcs} ${halStats_srcs} ${halCoverage_srcs} ${halPctId_srcs} 14 | objs = ${srcs:%.cpp=${modObjDir}/%.o} 15 | depends = ${srcs:%.cpp=%.depend} 16 | progs = ${binDir}/halStats ${binDir}/halCoverage ${binDir}/halPctId 17 | otherLibs = ${libHalStats} 18 | 19 | all : libs progs 20 | libs: ${libHalStats} 21 | progs: ${progs} 22 | 23 | clean : 24 | rm -f ${libHalStats} ${objs} ${progs} ${depends} 25 | test: 26 | 27 | include ${rootDir}/rules.mk 28 | 29 | # don't fail on missing dependencies, they are first time the .o is generates 30 | -include ${depends} 31 | 32 | 33 | # Local Variables: 34 | # mode: makefile-gmake 35 | # End: 36 | 37 | -------------------------------------------------------------------------------- /stats/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/stats/__init__.py -------------------------------------------------------------------------------- /stats/impl/halStats.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #include "halStats.h" 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | using namespace hal; 14 | 15 | HalStats::HalStats() { 16 | } 17 | 18 | HalStats::HalStats(AlignmentConstPtr alignment) { 19 | readAlignmentPtr(alignment); 20 | } 21 | 22 | HalStats::~HalStats() { 23 | } 24 | // Write the stored tree string, a CSV header, then one CSV row per collected genome. 25 | void HalStats::printCsv(ostream &outStream) const { 26 | outStream << _tree << endl << endl; 27 | 28 | outStream << "GenomeName, NumChildren, Length, NumSequences, " 29 | << "NumTopSegments, NumBottomSegments" << endl; 30 | 31 | vector::const_iterator i; 32 | for (i = _genomeStatsVec.begin(); i != _genomeStatsVec.end(); ++i) { 33 | outStream << i->_name << ", " << i->_numChildren << ", " << i->_length << ", " << i->_numSequences << ", " 34 | << i->_numTopSegments << ", " << i->_numBottomSegments << endl; 35 | } 36 | outStream << endl; 37 | } 38 | // Discard any previous results, then collect stats starting from the root genome. 39 | void HalStats::readAlignmentPtr(AlignmentConstPtr alignment) { 40 | _tree.clear(); 41 | _genomeStatsVec.clear(); 42 | 43 | if (alignment->getNumGenomes() > 0) { 44 | _tree = alignment->getNewickTree(); 45 | _genomeStatsVec.reserve(alignment->getNumGenomes()); 46 | const Genome *root = alignment->openGenome(alignment->getRootName()); 47 | readGenomeRecursive(alignment, root); 48 | } 49 | } 50 | // Append the stats of genome, then of each of its children in turn (parent before children). 51 | void HalStats::readGenomeRecursive(AlignmentConstPtr alignment, const Genome *genome) { 52 | assert(genome != NULL); 53 | // copy this genome's counters into a new entry 54 | GenomeStats 
genomeStats; 55 | genomeStats._name = genome->getName(); 56 | genomeStats._numChildren = genome->getNumChildren(); 57 | genomeStats._length = genome->getSequenceLength(); 58 | genomeStats._numSequences = genome->getNumSequences(); 59 | genomeStats._numTopSegments = genome->getNumTopSegments(); 60 | genomeStats._numBottomSegments = genome->getNumBottomSegments(); 61 | _genomeStatsVec.push_back(genomeStats); 62 | // NOTE(review): genomes opened below are not closed in this function 63 | vector children = alignment->getChildNames(genome->getName()); 64 | for (hal_size_t i = 0; i < children.size(); ++i) { 65 | const Genome *child = alignment->openGenome(children[i]); 66 | readGenomeRecursive(alignment, child); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /stats/inc/halStats.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #ifndef _HALSTATS_H 9 | #define _HALSTATS_H 10 | 11 | #include "hal.h" 12 | #include 13 | #include 14 | #include 15 | 16 | namespace hal { 17 | // Per-genome counters; fields not declared here presumably come from Sequence::Info. 18 | struct GenomeStats : public hal::Sequence::Info { 19 | size_t _numChildren; 20 | size_t _numSequences; 21 | }; 22 | // Collects one GenomeStats entry per genome of an alignment and prints them as CSV. 23 | class HalStats { 24 | public: 25 | HalStats(); 26 | HalStats(AlignmentConstPtr alignment); 27 | virtual ~HalStats(); 28 | 29 | void printCsv(std::ostream &outStream) const; 30 | void readAlignmentPtr(AlignmentConstPtr alignment); 31 | 32 | protected: 33 | void readGenomeRecursive(AlignmentConstPtr alignment, const Genome *genome); 34 | 35 | std::string _tree; 36 | std::vector _genomeStatsVec; 37 | }; 38 | } 39 | 40 | inline std::ostream &operator<<(std::ostream &os, const hal::HalStats &halStats) { 41 | halStats.printCsv(os); 42 | return os; 43 | } 44 | 45 | #endif 46 | // Local Variables: 47 | // mode: c++ 48 | // End: 49 | 
-------------------------------------------------------------------------------- /synteny/Makefile: -------------------------------------------------------------------------------- 1 | rootDir = .. 2 | include ${rootDir}/include.mk 3 | modObjDir = ${objDir}/synteny 4 | 5 | halSynteny_srcs = impl/halSynteny.cpp impl/hal2psl.cpp impl/psl_io.cpp impl/psl_merger.cpp 6 | halSynteny_objs = ${halSynteny_srcs:%.cpp=${modObjDir}/%.o} 7 | srcs = ${halSynteny_srcs} 8 | objs = ${srcs:%.cpp=${modObjDir}/%.o} 9 | depends = ${srcs:%.cpp=%.depend} 10 | inclSpec += -I${rootDir}/liftover/inc 11 | otherLibs += ${libHalLiftover} 12 | progs = ${binDir}/halSynteny 13 | 14 | all: progs 15 | libs: 16 | progs: ${progs} 17 | 18 | clean : 19 | rm -rf ${objs} ${progs} ${depends} output 20 | 21 | test: test1 22 | 23 | test1: output/rand1.hal 24 | ../bin/halSynteny --queryGenome "Genome_14" --targetGenome "Genome_18" $< output/$@.psl 25 | diff tests/expected/$@.psl output/$@.psl 26 | 27 | output/rand1.hal: 28 | @mkdir -p output 29 | ../bin/halRandGen --seed 0 --testRand --format hdf5 $@ 30 | 31 | 32 | include ${rootDir}/rules.mk 33 | 34 | # don't fail on missing dependencies, they are first time the .o is generates 35 | -include ${depends} 36 | 37 | 38 | # Local Variables: 39 | # mode: makefile-gmake 40 | # End: 41 | -------------------------------------------------------------------------------- /synteny/README.md: -------------------------------------------------------------------------------- 1 | DAG-based Reconstruction of Synteny Blocks 2 | ===== 3 | 4 | Run-time parameters 5 | ----- 6 | Option | Effect 7 | --- | --- 8 | `--maxAnchorDistance ` | upper bound on distance for syntenic blocks, default is 5Kb 9 | `--minBlockSize ` | lower bound on synteny block length, default is 5Kb 10 | `--queryChromosome ` | chromosome to infer synteny, default is whole genome 11 | `--queryGenome ` | source genome name 12 | `--targetGenome ` | reference genome name 13 | 14 | Other parameters describe the 
options for handling the alignment file, in particular the HDF5 options (applicable if alignments are in HAL format). Detailed information can be found in the main [README.md](https://github.com/ComparativeGenomicsToolkit/hal/blob/master/README.md). 15 | 16 | Algorithm 17 | ----- 18 | To build a set of the most continuous synteny blocks covering as much of the genomes as possible, we build a graph and apply the following algorithm: 19 | 20 | 1. Initialize weight labels of vertices and edges: 21 | 22 | * the initial weight of each vertex is the size of the corresponding alignment block 23 | 24 | * the weight of each edge coming into a vertex i equals the length of the corresponding alignment block (the initial weight of vertex i) 25 | 26 | * for each vertex A, consider its possible candidate blocks (descendants) and update the weight of each descendant vertex B if the weight of the edge coming into B plus the weight of the previous vertex A is greater than the weight of B; if the weight was changed, update the id of the previous vertex 27 | 28 | 2. Find the vertex with maximal weight and trace back 29 | 30 | 3. Remove the vertices that comprise the best path from the graph 31 | 32 | 4. 
If some vertices are not yet part of any path, go to step 2 33 | 34 | Sample Usage 35 | ----- 36 | * Create synteny blocks for the alignment cactus.hal including genomes Genome1 and Genome2 37 | 38 | `halSynteny --queryGenome Genome2 --targetGenome Genome1 --maxAnchorDistance 1000000 --minBlockSize 1000000 cactus.hal out.psl` 39 | 40 | * Create synteny blocks for alignments in [PSL](http://genome.ucsc.edu/FAQ/FAQformat#format2) format for query chromosome chrA 41 | 42 | `halSynteny --queryGenome Genome2 --targetGenome Genome1 --queryChromosome chrA --maxAnchorDistance 1000000 --minBlockSize 1000000 cactus.hal out.psl` 43 | 44 | Code Contributors 45 | ----- 46 | * Ksenia Krasheninnikova 47 | * Joel Armstrong 48 | * Mark Diekhans 49 | 50 | 51 | -------------------------------------------------------------------------------- /synteny/inc/hal2psl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 
#ifndef HAL2PSL_H
#define HAL2PSL_H

#include "halBedLine.h"
#include "halBlockLiftover.h"
#include "psl.h"

namespace hal {
    /**
     * Liftover specialisation that exposes its results as PSL blocks.
     *
     * Derives (publicly) from BlockLiftover and adds a single public entry
     * point, convert2psl(); the two helpers declared before the `public:`
     * label are private (default class access).
     *
     * NOTE(review): the element types of the std::vector parameters and return
     * values are not visible in this listing (the template arguments appear to
     * have been lost) -- confirm them against the original header.
     */
    class Hal2Psl : public BlockLiftover {

        /** Private helper; presumably appends the results of the current
         *  liftover to pslBlocks (output parameter, taken by non-const
         *  reference) -- TODO confirm against impl/hal2psl.cpp. */
        void storePslResults(std::vector &pslBlocks);

        /** Private helper; presumably builds PSL blocks from vpsl/blocks for
         *  one strand, start position and chromosome name, appending them to
         *  pslBlocks -- TODO confirm against impl/hal2psl.cpp.
         *  Note that chrName is passed by value. */
        void makeUpPsl(const std::vector &vpsl, const std::vector &blocks, const char strand,
                       const hal_index_t start, const std::string chrName, std::vector &pslBlocks);

      public:
        /** Default constructor; performs no work of its own. */
        Hal2Psl() {
        }

        /**
         * Convert the alignment between two genomes to PSL blocks.
         *
         * @param alignment  alignment to read from
         * @param srcGenome  source genome
         * @param tgtGenome  target genome
         * @param srcChrom   source chromosome name (passed by value);
         *                   presumably restricts the conversion to that
         *                   sequence -- verify against the caller
         * @return a std::vector of results (element type not visible here)
         */
        std::vector convert2psl(AlignmentConstPtr alignment, const Genome *srcGenome, const Genome *tgtGenome,
                                const std::string srcChrom);
    };
}
#endif /* HAL2PSL_H */
// Declarations for the DAG-based merging of PSL blocks into synteny blocks.
// NOTE(review): the template arguments of std::vector / std::map / std::set
// are not visible in this listing (they appear to have been lost) -- confirm
// the element types against the original header.

/** Predicate on a pair of PSL blocks; presumably true when a and b can belong
 *  to the same synteny block -- TODO confirm against impl/psl_merger.cpp. */
bool are_syntenic(const PslBlock &a, const PslBlock &b);

/** Predicate on a pair of PSL blocks with a distance threshold that defaults
 *  to 5000; judging by the name, true when a and b do not overlap and are in
 *  order -- TODO confirm the exact role of threshold. */
bool is_not_overlapping_ordered_pair(const PslBlock &a, const PslBlock &b, const hal_size_t threshold = 5000);

/** Returns a vector computed for position pos within queryGroup; presumably
 *  the candidate successor blocks lying within maxAnchorDistance (default
 *  5000) -- TODO confirm. */
std::vector get_next(const int pos, const std::vector &queryGroup, const hal_size_t maxAnchorDistance = 5000);

/** Returns a map built from group and dag, taking hiddenVertices (read-only)
 *  and maxAnchorDistance into account. dag is taken by non-const reference,
 *  so it may be modified. Presumably the weight-labelling step of the
 *  algorithm described in synteny/README.md -- TODO confirm. */
std::map> weigh_dag(const std::vector &group, std::map> &dag,
                    const std::set &hiddenVertices, const hal_size_t maxAnchorDistance);

/** Returns an int selected from weightedDag; presumably the id of the vertex
 *  with maximal weight -- TODO confirm, including the result for an empty
 *  map. */
int get_maxed_vertex(const std::map> &weightedDag);

/** Returns a vector derived from weightedDag and group. Both weightedDag and
 *  hiddenVertices are taken by non-const reference, so they may be modified;
 *  presumably the vertices of the traced path are added to hiddenVertices --
 *  TODO confirm. */
std::vector traceback(std::map> &weightedDag, std::set &hiddenVertices,
                      const std::vector &group);

/** Entry point of the merger: returns a vector of merged results for blocks.
 *  NOTE(review): `minBlockBreath` is presumably a misspelling of "breadth"
 *  (minimum size of a reported block); maxAnchorDistance has no default
 *  here, unlike in get_next(). */
std::vector> dag_merge(const std::vector &blocks, const hal_size_t minBlockBreath,
                       const hal_size_t maxAnchorDistance);
/validate/Makefile: -------------------------------------------------------------------------------- 1 | rootDir = .. 2 | include ${rootDir}/include.mk 3 | modObjDir = ${objDir}/validate 4 | 5 | halValidate_srcs = halValidateMain.cpp 6 | halValidate_objs = ${halValidate_srcs:%.cpp=${modObjDir}/%.o} 7 | srcs = ${halValidate_srcs} 8 | objs = ${srcs:%.cpp=${modObjDir}/%.o} 9 | depends = ${srcs:%.cpp=%.depend} 10 | progs = ${binDir}/halValidate 11 | 12 | inclSpec += -I${rootDir}/stats/inc/ 13 | 14 | 15 | all: progs 16 | libs: 17 | progs: ${progs} 18 | 19 | clean: 20 | rm -f ${objs} ${progs} ${depends} 21 | test: 22 | 23 | 24 | include ${rootDir}/rules.mk 25 | 26 | # don't fail on missing dependencies, they are first time the .o is generates 27 | -include ${depends} 28 | 29 | 30 | # Local Variables: 31 | # mode: makefile-gmake 32 | # End: 33 | 34 | -------------------------------------------------------------------------------- /validate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/hal/888a1d4b532b97598f19ffa3bf9ff6c9e4c9846e/validate/__init__.py -------------------------------------------------------------------------------- /validate/halValidateMain.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * Copyright (C) 2012-2019 by UCSC Computational Genomics Lab 4 | * 5 | * Released under the MIT license, see LICENSE.txt 6 | */ 7 | 8 | #include "halStats.h" 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | using namespace hal; 14 | 15 | int main(int argc, char **argv) { 16 | CLParser optionsParser; 17 | optionsParser.addArgument("halFile", "path to hal file to validate"); 18 | optionsParser.addOption("genome", "specific genome to validate instead of entire file", ""); 19 | optionsParser.setDescription("Check if hal database is valid"); 20 | 
string path, genomeName; 21 | try { 22 | optionsParser.parseOptions(argc, argv); 23 | path = optionsParser.getArgument("halFile"); 24 | genomeName = optionsParser.getOption("genome"); 25 | } catch (exception &e) { 26 | cerr << e.what() << endl; 27 | optionsParser.printUsage(cerr); 28 | exit(1); 29 | } 30 | try { 31 | AlignmentConstPtr alignment(openHalAlignment(path, &optionsParser)); 32 | if (genomeName == "") { 33 | validateAlignment(alignment.get()); 34 | } else { 35 | const Genome *genome = alignment->openGenome(genomeName); 36 | validateGenome(genome); 37 | } 38 | } catch (hal_exception &e) { 39 | cerr << "hal exception caught: " << e.what() << endl; 40 | return 1; 41 | } catch (exception &e) { 42 | cerr << "Exception caught: " << e.what() << endl; 43 | return 1; 44 | } 45 | cout << "\nFile valid" << endl; 46 | 47 | return 0; 48 | } 49 | --------------------------------------------------------------------------------