├── .gitignore ├── .gitlab-ci.yml ├── .gitmodules ├── .travis_old.yml ├── BIN-INSTALL.md ├── DEVELOPMENT.md ├── Dockerfile ├── Dockerfile.ci-base ├── Dockerfile.kegalign ├── LICENSE.txt ├── Makefile ├── README.md ├── ReleaseNotes.md ├── api ├── Makefile ├── __init__.py ├── impl │ ├── cactusBlock.c │ ├── cactusBlockPrivate.h │ ├── cactusCap.c │ ├── cactusCapPrivate.h │ ├── cactusChain.c │ ├── cactusChainPrivate.h │ ├── cactusDisk.c │ ├── cactusDiskPrivate.h │ ├── cactusEnd.c │ ├── cactusEndPrivate.h │ ├── cactusEvent.c │ ├── cactusEventPrivate.h │ ├── cactusEventTree.c │ ├── cactusEventTreePrivate.h │ ├── cactusFlower.c │ ├── cactusFlowerPrivate.h │ ├── cactusGlobalsPrivate.h │ ├── cactusGroup.c │ ├── cactusGroupPrivate.h │ ├── cactusLink.c │ ├── cactusLinkPrivate.h │ ├── cactusMisc.c │ ├── cactusSegment.c │ ├── cactusSegmentPrivate.h │ ├── cactusSequence.c │ ├── cactusSequencePrivate.h │ ├── cactusTestCommon.c │ └── cactus_params_parser.c ├── inc │ ├── cactus.h │ ├── cactusBlock.h │ ├── cactusCap.h │ ├── cactusChain.h │ ├── cactusDisk.h │ ├── cactusEnd.h │ ├── cactusEvent.h │ ├── cactusEventTree.h │ ├── cactusFlower.h │ ├── cactusGlobals.h │ ├── cactusGroup.h │ ├── cactusLink.h │ ├── cactusMisc.h │ ├── cactusSegment.h │ ├── cactusSequence.h │ ├── cactusTestCommon.h │ └── cactus_params_parser.h └── tests │ ├── allTests.c │ ├── cactusBlockTest.c │ ├── cactusBlocksTestShared.h │ ├── cactusCapTest.c │ ├── cactusChainTest.c │ ├── cactusChainsTestShared.h │ ├── cactusDiskTest.c │ ├── cactusEndTest.c │ ├── cactusEndsTestShared.h │ ├── cactusEventTest.c │ ├── cactusEventTreeTest.c │ ├── cactusFlowerTest.c │ ├── cactusGroupTest.c │ ├── cactusLinkTest.c │ ├── cactusMiscTest.c │ ├── cactusParamsTest.c │ ├── cactusSegmentTest.c │ └── cactusSequenceTest.c ├── bar ├── Makefile ├── __init__.py ├── impl │ ├── adjacencySequences.c │ ├── bar.c │ ├── endAligner.c │ ├── flowerAligner.c │ ├── poaBarAligner.c │ └── rescue.c ├── inc │ ├── adjacencySequences.h │ ├── endAligner.h │ ├── 
flowerAligner.h │ ├── poaBarAligner.h │ └── rescue.h └── tests │ ├── adjacencySequencesTest.c │ ├── allTests.c │ ├── endAlignerTest.c │ ├── flowerAlignerTest.c │ ├── flowersShared.h │ ├── poaBarTest.c │ └── rescueTest.c ├── build-tools ├── KegAlign.commit ├── downloadMafTools ├── downloadPangenomeTools ├── downloadPhast ├── downloadUcscLib ├── downloadVCFWave ├── makeBinRelease ├── makeCpuDockerRelease ├── makeGpuDockerRelease ├── makeSrcRelease ├── quayTagRelease └── releaseLib.sh ├── caf ├── Makefile ├── __init__.py ├── impl │ ├── addAdjacencies.c │ ├── annealing.c │ ├── cactus_setup.c │ ├── caf.c │ ├── filtering.c │ ├── finishing.c │ ├── giantComponent.c │ ├── melting.c │ ├── phylogeny.c │ ├── pinchIterator.c │ └── pinchToCactus.c ├── inc │ ├── stCaf.h │ ├── stCafPhylogeny.h │ ├── stGiantComponent.h │ └── stPinchIterator.h └── tests │ ├── allTests.c │ ├── annealingTest.c │ ├── filteringTest.c │ ├── giantComponentTest.c │ ├── phylogenyTests.c │ ├── pinchIteratorTest.c │ └── recoverableChainsTest.c ├── conftest.py ├── doc ├── INSTALL.txt ├── README.pages ├── README.pdf ├── add-genome-fig-github.png ├── cactus-update-prepare.md ├── grch38-alt-pg-lrc_kir.png ├── grch38-alt-pg-mhc.png ├── mc-pangenomes │ ├── 10-chicken-pg-2022-09-23-commands.md │ ├── 10-chicken-pg-2022-09-23-seqfile.txt │ ├── 10-chicken-pg-2023-06-27-commands.md │ ├── 10-chicken-pg-2023-06-27-seqfile.txt │ ├── 10-t2t-apes-mc-2023v2.README.md │ ├── 10-t2t-apes-mc-2023v2.seqfile.txt │ ├── 16-fly-pg-2022-05-26-commands.md │ ├── 16-fly-pg-2022-05-26-seqfile.txt │ ├── 16-fly-pg-2023-08-25-commands.md │ ├── 16-fly-pg-2023-08-25-seqfile.txt │ ├── 17-soybean-pg-2022-09-26-commands.md │ ├── 17-soybean-pg-2022-09-26-seqfile.txt │ ├── 30-mouse-pg-2022-09-23-commands.md │ ├── 30-mouse-pg-2022-09-23-seqfile.txt │ ├── 4-t2t-orangs-mc-2023v2.README.md │ ├── 4-t2t-orangs-mc-2023v2.seqfile.txt │ ├── 5-cow-pg-2022-09-22-commands.md │ ├── 5-cow-pg-2022-09-22-seqfile.txt │ ├── 5-cow-pg-2023-03-31-commands.md │ ├── 
5-cow-pg-2023-03-31-seqfile.txt │ ├── 9-dog-pg-2022-09-23-commands.md │ ├── 9-dog-pg-2022-09-23-seqfile.txt │ ├── 9-dog-pg-2023-06-27-commands.md │ ├── 9-dog-pg-2023-06-27-seqfile.txt │ ├── README.md │ ├── grch38-alts-pg-2023-04-13-commands.md │ ├── grch38-alts-pg-2023-04-13-seqfile.txt │ ├── hprc-v1.1-mc-input-contigs.bed │ ├── hprc-v1.1-mc.md │ └── hprc-v1.1-mc.seqfile ├── mc-paper │ ├── README.md │ ├── fly-sra.tsv │ ├── fly │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── Snakefile │ │ ├── Snakefile_aws │ │ ├── annotate-repeats.R │ │ ├── cluster-svs.R │ │ ├── compare-freebayes-calls-hetrate.R │ │ ├── compare-freebayes-calls.R │ │ ├── mapstats-analysis.R │ │ ├── scripts │ │ │ ├── compute_mapping_stats.py │ │ │ ├── decompose_graph_variants.py │ │ │ ├── decompose_graph_variants_simple.py │ │ │ ├── drop_inconsistent_sites.py │ │ │ ├── freebayes_stats.R │ │ │ ├── read_svs.R │ │ │ ├── remove_duplicates.py │ │ │ └── rename_chr_vcf.py │ │ ├── snakemake_log.compare_freebayes_calls.log │ │ ├── snakemake_log.decompose_pangenome.log │ │ ├── snakemake_log.decompose_vg_calls.log │ │ ├── snakemake_log.mapping_stats_dm6_bwa.log │ │ ├── snakemake_log.mapping_stats_mc_giraffe.log │ │ ├── snakemake_log.merge_freebayes_dm6_bwa_calls.log │ │ ├── snakemake_log.merge_freebayes_mc_giraffe_calls.log │ │ └── svs-analysis.R │ └── hprc │ │ ├── GRCh38-f1g-90-mc-jun1.minigraph.split.log.gz │ │ ├── README.md │ │ ├── Snakefile_eval_chm13 │ │ ├── Snakefile_eval_grch38 │ │ ├── Snakefile_mapstats │ │ ├── calls-evaluation-exploration.R │ │ ├── chm13-f1g-90-mc-jun1.minigraph.split.log.gz │ │ ├── contig.inclusion.stats.R │ │ ├── eval-stratification-grch38-chm13.R │ │ ├── graphs-pangenie.R │ │ ├── mapstats-analysis.R │ │ ├── resources │ │ ├── compute_mapping_stats.py │ │ ├── compute_mapping_stats_gaf.py │ │ ├── compute_mapping_stats_jq.py │ │ ├── eval-roc-summary.R │ │ ├── eval-summary.R │ │ ├── get_stratification_on_sample.R │ │ ├── giab.truthset.paths.tsv │ │ ├── prepare_stratification_regions.R │ │ 
└── rename_chr_vcf.py │ │ ├── snakemake_config_chm13.yaml │ │ ├── snakemake_config_giab1-2-5.yaml │ │ ├── snakemake_config_hg002.yaml │ │ ├── snakemake_config_lifted.yaml │ │ ├── snakemake_config_lifted_cmrg.yaml │ │ ├── snarls-stats.R │ │ └── terra-files │ │ ├── CHM13.path_list.txt │ │ ├── GRCh38.path_list.txt │ │ ├── bwa-deepvariant-chm13.json │ │ ├── bwa_deepvariant.wdl │ │ ├── giraffe-deepvariant-chm13_pangenome-chm13_projection.json │ │ ├── giraffe-deepvariant-chm13_pangenome-grch38_projection.json │ │ └── giraffe-deepvariant-grch38_pangenome-grch38_projection.json ├── pangenome.md ├── progressive.md ├── running-in-aws.md ├── sa_refgraph_hackathon_2023.md ├── updating-alignments.md ├── yeast-pg-chrI.full.draw.png ├── yeast-pg-chrI.full.viz.png ├── yeast-pg-chrII.full.viz.png ├── yeast-pg-chunk-view.png └── yeast-pg-chunk-viz.png ├── examples ├── 10way-mhc.txt ├── evolverMammals.txt ├── evolverPrimates.txt ├── par1_cb.txt ├── par1_hcb.txt ├── par1_hcbg.txt └── yeastPangenome.txt ├── hal ├── Makefile ├── __init__.py ├── impl │ ├── fasta.c │ └── hal.c ├── inc │ └── hal.h └── tests │ └── allTests.c ├── include.mk ├── pipeline ├── Makefile ├── cactus_consolidated.c ├── docker_test_script.py ├── impl │ ├── convertAlignmentCoordinates.c │ └── traverseFlowers.c ├── inc │ ├── convertAlignmentCoordinates.h │ └── traverseFlowers.h └── tests │ └── allTests.c ├── preprocessor ├── Makefile ├── cactus_analyseAssembly.c ├── cactus_filterSmallFastaSequences.py ├── cactus_makeAlphaNumericHeaders.py ├── cactus_redPrefilter.c ├── cactus_sanitizeFastaHeaders.c ├── cactus_softmask2hardmask.c └── lastzRepeatMasking │ ├── Makefile │ ├── cactus_covered_intervals.c │ ├── cactus_fasta_fragments.py │ └── cactus_fasta_softmask_intervals.py ├── pytest.ini ├── reference ├── Makefile ├── __init__.py ├── impl │ ├── addReferenceCoordinates.c │ ├── blockMLString.c │ ├── buildReference.c │ ├── getReferenceSequences.c │ └── recursiveThreadBuilder.c ├── inc │ ├── addReferenceCoordinates.h │ ├── 
blockMLString.h │ ├── cactusReference.h │ └── recursiveThreadBuilder.h └── tests │ ├── addReferenceCoordinatesTest.c │ ├── allTests.c │ ├── buildReferenceTest.c │ └── recursiveThreadBuilderTest.c ├── runtime └── wrapper.sh ├── setup.py ├── setup ├── Makefile ├── __init__.py ├── impl │ └── setup.c └── inc │ └── cactus_setup.h ├── src ├── __init__.py └── cactus │ ├── __init__.py │ ├── attcc-alpha.knm │ ├── bar │ └── cactus_barTest.py │ ├── blast │ ├── __init__.py │ └── cactus_blast.py │ ├── cactus_progressive_config.xml │ ├── hal │ ├── __init__.py │ └── cactus_halTest.py │ ├── maf │ ├── __init__.py │ ├── cactus_hal2chains.py │ ├── cactus_hal2maf.py │ └── cactus_maf2bigmaf.py │ ├── paf │ ├── __init__.py │ ├── last_scoring.py │ ├── local_alignment.py │ ├── paf.py │ └── pafTest.py │ ├── pipeline │ ├── __init__.py │ ├── cactus_evolverTest.py │ ├── cactus_workflow.py │ └── cactus_workflowTest.py │ ├── preprocessor │ ├── __init__.py │ ├── cactus_preprocessor.py │ ├── cactus_preprocessorTest.py │ ├── checkUniqueHeaders.py │ ├── cutHeaders.py │ ├── dnabrnnMasking.py │ ├── fileMasking.py │ ├── lastzRepeatMasking │ │ ├── __init__.py │ │ ├── cactus_lastzRepeatMask.py │ │ └── cactus_lastzRepeatMaskTest.py │ ├── preprocessorTest.py │ └── redMasking.py │ ├── progressive │ ├── __init__.py │ ├── cactus_constructFromIntermediates.py │ ├── cactus_prepare.py │ ├── cactus_progressive.py │ ├── cactus_progressiveTest.py │ ├── cactus_terra_helper.py │ ├── multiCactusTree.py │ ├── multiCactusTreeTest.py │ ├── outgroup.py │ ├── outgroupTest.py │ ├── progressive_decomposition.py │ └── seqFile.py │ ├── reference │ ├── __init__.py │ └── cactus_referenceTest.py │ ├── refmap │ ├── __init__.py │ ├── apply_dipcall_bed_filter.py │ ├── cactus_graphmap.py │ ├── cactus_graphmap_join.py │ ├── cactus_graphmap_split.py │ ├── cactus_minigraph.py │ ├── cactus_pangenome.py │ ├── cactus_refmap.py │ ├── fasta_preprocessing.py │ └── paf_to_lastz.py │ ├── setup │ ├── __init__.py │ └── cactus_align.py │ ├── 
shared │ ├── __init__.py │ ├── common.py │ ├── commonTest.py │ ├── configWrapper.py │ └── test.py │ └── update │ ├── __init__.py │ └── cactus_update_prepare.py ├── test ├── evolverMammals-default.comp.xml ├── evolverPrimates-default.comp.xml └── evolverTest.py └── toil-requirement.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .pydevproject 2 | .project 3 | .cproject 4 | bin/* 5 | lib/* 6 | *.o 7 | *.pyc 8 | /bin 9 | /lib 10 | tmp_* 11 | temporaryCactusDisk 12 | build 13 | dist 14 | *.egg-info 15 | *.a 16 | cactusSequences 17 | *~ 18 | .cache 19 | src/cactus/shared/version.py -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: quay.io/comparative-genomics-toolkit/cactus-ci-base:latest 2 | 3 | variables: 4 | GIT_SUBMODULE_STRATEGY: recursive 5 | 6 | before_script: 7 | - whoami 8 | - sudo apt-get -q -y update 9 | - sudo apt-get -q -y install --no-upgrade bcftools parallel libdeflate-dev cmake libjemalloc-dev python3-distutils pybind11-dev autoconf libzstd-dev libhts-dev rsync 10 | - startdocker || true 11 | - docker info 12 | 13 | after_script: 14 | - stopdocker || true 15 | 16 | stages: 17 | - test 18 | 19 | test-job: 20 | stage: test 21 | script: 22 | - make clean 23 | - virtualenv -p python3.9 venv 24 | - source venv/bin/activate 25 | - python3.9 -m pip install -r toil-requirement.txt 26 | - python3.9 -m pip install -U . 
27 | # downgrade pysam to work around import pysam error: "module 'pysam.libcalignedsegment' has no attribute 'CMATCH'" 28 | - python3.9 -m pip install pysam==0.21.0 29 | # these are the old travis tests, followed by its docker push 30 | - git clone https://github.com/ComparativeGenomicsToolkit/cactusTestData 31 | - export ASAN_OPTIONS="detect_leaks=0" 32 | - CGL_DEBUG=ultra make -j 8 33 | - CACTUS_BINARIES_MODE=local SON_TRACE_DATASETS=$(pwd)/cactusTestData CACTUS_TEST_CHOICE=normal make test 34 | - pip install -U newick attrs 35 | - make -j 8 hal_test 36 | # rebuild without all the debug flags 37 | - make clean 38 | - make -j 8 39 | - numcpu=8 build-tools/downloadPangenomeTools 40 | - CACTUS_LEGACY_ARCH=1 numcpu=8 build-tools/downloadMafTools 41 | - python3.9 -m pip install -U . 42 | # force local docker image to use legacy mode so tests run 43 | - sed -i Dockerfile -e 's/ENV avx2 1/ENV CACTUS_LEGACY_ARCH 1/g' 44 | - make docker 45 | - make -j 8 evolver_test 46 | 47 | artifacts: 48 | # Let Gitlab see the junit report 49 | #reports: 50 | # junit: test-report.xml 51 | #when: always 52 | 53 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "submodules/pinchesAndCacti"] 2 | path = submodules/pinchesAndCacti 3 | url = https://github.com/ComparativeGenomicsToolkit/pinchesAndCacti 4 | [submodule "submodules/matchingAndOrdering"] 5 | path = submodules/matchingAndOrdering 6 | url = https://github.com/ComparativeGenomicsToolkit/matchingAndOrdering 7 | [submodule "submodules/cPecan"] 8 | path = submodules/cPecan 9 | url = https://github.com/ComparativeGenomicsToolkit/cPecan 10 | [submodule "submodules/sonLib"] 11 | path = submodules/sonLib 12 | url = https://github.com/ComparativeGenomicsToolkit/sonLib 13 | [submodule "submodules/hal"] 14 | path = submodules/hal 15 | url = https://github.com/ComparativeGenomicsToolkit/hal 16 | 
[submodule "submodules/cactus2hal"] 17 | path = submodules/cactus2hal 18 | url = https://github.com/ComparativeGenomicsToolkit/cactus2hal 19 | [submodule "submodules/kyoto"] 20 | path = submodules/kyoto 21 | url = https://github.com/ComparativeGenomicsToolkit/kyoto.git 22 | [submodule "submodules/abPOA"] 23 | path = submodules/abPOA 24 | url = https://github.com/yangao07/abPOA.git 25 | [submodule "submodules/lastz"] 26 | path = submodules/lastz 27 | url = https://github.com/ComparativeGenomicsToolkit/lastz.git 28 | [submodule "submodules/paffy"] 29 | path = submodules/paffy 30 | url = https://github.com/ComparativeGenomicsToolkit/paffy.git 31 | [submodule "submodules/red"] 32 | path = submodules/red 33 | url = https://github.com/glennhickey/red.git 34 | [submodule "submodules/collapse-bubble"] 35 | path = submodules/collapse-bubble 36 | url = https://github.com/Han-Cao/collapse-bubble.git 37 | [submodule "submodules/FASTGA"] 38 | path = submodules/FASTGA 39 | url = https://github.com/thegenemyers/FASTGA.git 40 | -------------------------------------------------------------------------------- /.travis_old.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | dist: bionic 3 | language: python 4 | 5 | python: 6 | - "3.7" 7 | 8 | before_install: 9 | - | 10 | if [[ "$TRAVIS_OS_NAME" == "linux" && "$CACTUS_BINARIES_MODE" == "local" ]]; then 11 | sudo apt-get -qq update 12 | sudo apt-get install -y git gcc g++ build-essential zlib1g-dev wget valgrind libbz2-dev pkg-config libhdf5-dev liblzo2-dev 13 | fi 14 | - | 15 | if [[ "$TRAVIS_OS_NAME" == "linux" && "$CACTUS_BINARIES_MODE" == "local" ]]; then 16 | sudo apt-get install -y git gcc g++ build-essential zlib1g-dev wget valgrind libbz2-dev pkg-config libhdf5-dev liblzo2-dev 17 | fi 18 | - | 19 | if [[ ! -z "$SON_TRACE_DATASETS" ]]; then 20 | git clone https://github.com/ComparativeGenomicsToolkit/cactusTestData 21 | fi 22 | 23 | # Push docker containers. 
We skip this on PR builds because Travis 24 | # doesn't include the user/password in those builds. 25 | - | 26 | if [[ "$CACTUS_BINARIES_MODE" == "docker" ]]; then 27 | if [[ "$TRAVIS_PULL_REQUEST" == "false" ]]; then 28 | docker login --username $QUAY_USERNAME --password $QUAY_PASSWORD quay.io 29 | make push 30 | else 31 | make docker 32 | fi 33 | fi 34 | script: 35 | # install toil / cactus 36 | - pip install -r toil-requirement.txt 37 | - pip install -U . 38 | - if [[ "$CACTUS_TEST_CHOICE" == "normal" ]]; then export MAKE_TARGET=test_nonblast; fi 39 | - if [[ "$CACTUS_TEST_CHOICE" == "blast" ]]; then export MAKE_TARGET=test_blast; fi 40 | - if [[ "$CACTUS_BINARIES_MODE" == "local" ]]; then make && PATH=`pwd`/bin:$PATH PYTHONPATH=`pwd`:`pwd`/src travis_wait 50 make ${MAKE_TARGET}; fi 41 | - if [[ "$CACTUS_BINARIES_MODE" == "docker" ]]; then travis_wait 40 make ${MAKE_TARGET}; fi 42 | os: 43 | - linux 44 | services: 45 | - docker 46 | env: 47 | # Assert-enabled tests using extra-large test data, but restricted to blast tests only to keep under 50 mins. 48 | - CGL_DEBUG=1 CACTUS_BINARIES_MODE=local SON_TRACE_DATASETS=$TRAVIS_BUILD_DIR/cactusTestData CACTUS_TEST_CHOICE=blast 49 | # Assert-enabled tests using extra-large test data, but restricted to non-blast tests only to keep under 50 mins. 50 | - CGL_DEBUG=1 CACTUS_BINARIES_MODE=local SON_TRACE_DATASETS=$TRAVIS_BUILD_DIR/cactusTestData CACTUS_TEST_CHOICE=normal 51 | # Docker container push and test. 52 | - CACTUS_BINARIES_MODE=docker CACTUS_TEST_CHOICE=normal 53 | # Paranoia about memory corruption. Leak detection is disabled 54 | # because many utilities are short-lived and don't bother cleaning 55 | # up memory before exit. 
56 | - CGL_DEBUG=ultra ASAN_OPTIONS=detect_leaks=0 CACTUS_BINARIES_MODE=local CACTUS_TEST_CHOICE=normal 57 | -------------------------------------------------------------------------------- /BIN-INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installation of the Cactus binary distribution 2 | 3 | This describes the steps required to install the Cactus 4 | pre-compiled, statically linked binary distribution. 5 | 6 | ## Extracting 7 | If you have not already done so, extract the distribution and cd into the cactus directory: 8 | ``` 9 | tar -xzf cactus-bin-v2.9.8.tar.gz 10 | cd cactus-bin-v2.9.8 11 | ``` 12 | 13 | ## Setup 14 | 15 | To build a python virtualenv and activate, do the following steps. This requires Python version >= 3.9 (so Ubuntu 18.04 users should use `-p python3.9` below): 16 | ``` 17 | virtualenv -p python3 venv-cactus-v2.9.8 18 | printf "export PATH=$(pwd)/bin:\$PATH\nexport PYTHONPATH=$(pwd)/lib:\$PYTHONPATH\nexport LD_LIBRARY_PATH=$(pwd)/lib:\$LD_LIBRARY_PATH\n" >> venv-cactus-v2.9.8/bin/activate 19 | source venv-cactus-v2.9.8/bin/activate 20 | python3 -m pip install -U setuptools pip wheel 21 | python3 -m pip install -U . 22 | python3 -m pip install -U -r ./toil-requirement.txt 23 | ``` 24 | 25 | Some tools required for `hal2assemblyHub.py`, `cactus-hal2chains` and `cactus-maf2bigmaf` are not included and must be downloaded separately. 26 | They are `wigToBigWig faToTwoBit bedToBigBed bigBedToBed axtChain pslPosTarget bedSort hgGcPercent mafToBigMaf hgLoadMafSummary hgLoadChain`. More information 27 | can be found [here](https://hgdownload.cse.ucsc.edu/admin/exe/). Note that some may require 28 | a license for commercial use. 
Static binaries are not available, but the following command 29 | should set them up successfully on many 64 bit Linux systems: 30 | ``` 31 | cd bin && for i in wigToBigWig faToTwoBit bedToBigBed bigBedToBed axtChain pslPosTarget bedSort hgGcPercent mafToBigMaf hgLoadMafSummary hgLoadChain; do wget -q http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/${i}; chmod +x ${i}; done 32 | ``` 33 | 34 | `vcfwave` isn't included in the release binaries (but is in the docker image). You can try building it and adding it to `bin/` with the following command: 35 | ``` 36 | build-tools/downloadVCFWave 37 | ``` 38 | 39 | ## Testing 40 | 41 | To test Cactus, the following will run a tiny simulated alignment. 42 | ``` 43 | cactus ./jobstore ./examples/evolverMammals.txt ./evolverMammals.hal 44 | ``` 45 | -------------------------------------------------------------------------------- /DEVELOPMENT.md: -------------------------------------------------------------------------------- 1 | # Notes on developing and debugging cactus 2 | 3 | ## Overriding make settings 4 | A file include.local.mk can be created in the root directory 5 | to override make variables, including setting environment variables. 6 | This should not be committed. 7 | 8 | ## Environment variables controlling how cactus is run 9 | - CACTUS_BINARIES_MODE - how are cactus programs found? 10 | - docker 11 | - singularity 12 | - local 13 | - CACTUS_DOCKER_MODE - is Docker being used? 14 | - 1 15 | - 0 16 | - CACTUS_USE_LOCAL_IMAGE - is Docker image on local server? 
17 | - 0 18 | - 1 19 | 20 | ## Environment variables controlling tests 21 | - SON_TRACE_DATASETS location of test data set, currently available with 22 | git clone https://github.com/ComparativeGenomicsToolkit/cactusTestData 23 | 24 | - SONLIB_TEST_LENGTH filters tests by maximum run time length category (case-insensitive) 25 | - SHORT - tests taking less than ~10 seconds, with some exceptions 26 | - MEDIUM - tests taking less than ~100 seconds 27 | - LONG - tests taking less than ~1000 seconds 28 | - VERY_LONG - tests taking even longer 29 | 30 | - CACTUS_TEST_LOG_LEVEL - Sets the log level used for the tests; it may not take effect for every test, but is very useful for Toil. 31 | 32 | 33 | 34 | ## Running tests with docker in single machine mode 35 | make docker 36 | export CACTUS_USE_LOCAL_IMAGE=1 37 | make test 38 | 39 | ## Debugging hints 40 | - The main Cactus Python process will print out a stack trace of all of the Python 41 | threads if sent a SIGUSR1 signal. They will then continue execution. This 42 | may be useful in determining the state of cactus. 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /Dockerfile.ci-base: -------------------------------------------------------------------------------- 1 | # Building Cactus on Gitlab is slow and often crashes with errors like: 2 | # E: Failed to fetch http://archive.ubuntu.com/ubuntu/pool/main/m/make-dfsg/make_4.1-9.1ubuntu1_amd64.deb Undetermined Error [IP: 91.189.88.142 80] 3 | # E: Unable to fetch some archives, maybe run apt-get update or try with --fix-missing? 4 | # So we try to avoid that by keeping a base image around with dependencies pre-installed that CI can use. 
5 | 6 | FROM quay.io/vgteam/dind 7 | 8 | # apt dependencies for build 9 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential git python3.9 python3.9-dev python3-pip zlib1g-dev wget libbz2-dev pkg-config libhdf5-dev liblzo2-dev libtokyocabinet-dev wget liblzma-dev libxml2-dev libssl-dev libpng-dev uuid-dev libcurl4-gnutls-dev libffi-dev 10 | 11 | # apt dependencies for ci 12 | RUN DEBIAN_FRONTEND=noninteractive apt-get install -y default-jre wget docker.io python3-virtualenv libcurl4-gnutls-dev libgnutls28-dev 13 | -------------------------------------------------------------------------------- /Dockerfile.kegalign: -------------------------------------------------------------------------------- 1 | # Reminder: if updating this image, also update it in build-tools/makeGpuDockerRelease 2 | FROM nvidia/cuda:11.7.1-devel-ubuntu22.04 as builder 3 | 4 | # Prevent dpkg from trying to ask any questions, ever 5 | ENV DEBIAN_FRONTEND noninteractive 6 | 7 | # system dependencies are installed by ./installUbuntu.sh below, but we need sudo first 8 | RUN apt-get -qq -y update && \ 9 | apt-get -qq -y upgrade && \ 10 | apt-get -qq -y install \ 11 | git build-essential wget curl jq dos2unix 12 | 13 | # Use the commit from the file 14 | COPY build-tools/KegAlign.commit / 15 | 16 | # clone KegAlign 17 | RUN git clone https://github.com/galaxyproject/KegAlign.git && cd KegAlign && git checkout $(cat /KegAlign.commit) && git submodule update --init --recursive 18 | 19 | # make the conda environment 20 | RUN cd /KegAlign && \ 21 | dos2unix ./scripts/*.bash ./scripts/*.py ./scripts/run_kegalign && \ 22 | ./scripts/make-conda-env.bash -dev 23 | 24 | # build KegAlign 25 | RUN cd /KegAlign && \ 26 | mkdir build && \ 27 | bash -c "source ./conda-env-dev.bash && cd build && cmake -DCMAKE_BUILD_TYPE=Release .. 
&& make -j $(nproc)" 28 | 29 | # Create a thinner final Docker image with only runtime dependencies 30 | FROM nvidia/cuda:11.7.1-runtime-ubuntu22.04 31 | 32 | # Install runtime dependencies 33 | RUN apt-get -qq -y update && \ 34 | apt-get -qq -y upgrade && \ 35 | apt-get -qq -y install mbuffer 36 | 37 | # copy kegalign runtime essentials (for historic reasons, cactus is in /home so we put kegalign there too) 38 | RUN mkdir /home/KegAlign /home/KegAlign/bin 39 | COPY --from=builder /KegAlign/build/kegalign /home/KegAlign/bin/ 40 | COPY --from=builder /KegAlign/scripts /home/KegAlign/scripts 41 | COPY --from=builder /KegAlign/.conda/miniforge3/envs/kegalign-dev/lib /home/KegAlign/lib 42 | 43 | # add the library path 44 | ENV LD_LIBRARY_PATH="/home/KegAlign/lib:${LD_LIBRARY_PATH}" 45 | 46 | # add the kegalign path 47 | ENV PATH="/home/KegAlign/bin:/home/KegAlign/scripts:${PATH}" 48 | 49 | # remember that commit 50 | COPY --from=builder /KegAlign.commit / 51 | 52 | # UCSC convention is to work in /data 53 | RUN mkdir -p /data 54 | WORKDIR /data 55 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2011 by Benedict Paten (benedictpaten@gmail.com) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 
12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | 21 | 22 | This license excludes certain files in the 'externalTools' sub-directory, please 23 | see for copyright information and license information. -------------------------------------------------------------------------------- /api/Makefile: -------------------------------------------------------------------------------- 1 | rootPath = .. 2 | include ${rootPath}/include.mk 3 | 4 | 5 | libSources = impl/cactus*.c 6 | libHeaders = inc/cactus*.h 7 | libInternalHeaders = impl/*.h 8 | libTests = tests/cactus*.c 9 | libTestsHeaders = tests/cactus*.h 10 | 11 | CPPFLAGS += -I{tests} -Iimpl 12 | 13 | all: all_libs all_progs 14 | all_libs: ${LIBDIR}/cactusLib.a 15 | all_progs: all_libs 16 | ${MAKE} ${BINDIR}/cactusAPITests 17 | 18 | clean : 19 | rm -f ${LIBDIR}/cactusLib.a ${LIBDIR}/cactus*.h ${BINDIR}/cactusAPITests *.o 20 | 21 | ${LIBDIR}/cactusLib.a : ${libSources} ${libHeaders} ${libInternalHeaders} 22 | ${CC} ${CPPFLAGS} ${CFLAGS} -I inc -I ${LIBDIR}/ -c ${libSources} 23 | ${AR} rc cactusLib.a *.o 24 | ${RANLIB} cactusLib.a 25 | mv cactusLib.a ${LIBDIR}/ 26 | 27 | ${BINDIR}/cactusAPITests : ${libTests} ${libTestsHeaders} ${libSources} ${libHeaders} ${libInternalHeaders} tests/allTests.c ${LIBDIR}/cactusLib.a ${LIBDEPENDS} 28 | ${CC} ${CPPFLAGS} ${CFLAGS} ${LDFLAGS} -o ${BINDIR}/cactusAPITests tests/allTests.c ${libTests} ${LIBDIR}/cactusLib.a ${LDLIBS} 29 | -------------------------------------------------------------------------------- /api/__init__.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 4 | # 5 | #Released under the MIT license, see LICENSE.txt 6 | -------------------------------------------------------------------------------- /api/impl/cactusBlockPrivate.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_ATOM_PRIVATE_H_ 8 | #define CACTUS_ATOM_PRIVATE_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | struct _block_instanceIterator { 13 | Segment *segment; 14 | Block *block; 15 | }; 16 | 17 | //////////////////////////////////////////////// 18 | //////////////////////////////////////////////// 19 | //////////////////////////////////////////////// 20 | //Block functions. 21 | //////////////////////////////////////////////// 22 | //////////////////////////////////////////////// 23 | //////////////////////////////////////////////// 24 | 25 | /* 26 | * Constructs the block, but not its ends. 27 | */ 28 | Block *block_construct2(Name name, int64_t length, End *leftEnd, End *rightEnd, Flower *flower); 29 | 30 | /* 31 | * Destructs the block and all segments it contains. 32 | */ 33 | void block_destruct(Block *block); 34 | 35 | /* 36 | * Adds in the instance to the block. 37 | */ 38 | void block_addInstance(Block *block, Segment *segment); 39 | 40 | /* 41 | * Removes the instance from the block. 
42 | */ 43 | void block_removeInstance(Block *block, Segment *segment); 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /api/impl/cactusCapPrivate.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_END_INSTANCE_PRIVATE_H_ 8 | #define CACTUS_END_INSTANCE_PRIVATE_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | typedef struct _capCoreContents { 13 | Name instance; 14 | int64_t coordinate; 15 | void *eventOrSequence; 16 | } CapCoreContents; 17 | 18 | typedef struct _capContents { 19 | CapCoreContents core; 20 | Cap *adjacency; 21 | End *end; 22 | Cap *nCap; // Links together different caps in the end 23 | } CapContents; 24 | 25 | typedef struct _segmentCapContents { 26 | CapCoreContents core; 27 | Cap *leftAdjacency; 28 | Cap *rightAdjacency; 29 | Block *block; 30 | Segment *nSegment; // Links together different segments in the block 31 | } SegmentCapContents; 32 | 33 | struct _cap { // Note, a segment has the same structure 34 | char bits; 35 | }; 36 | 37 | //////////////////////////////////////////////// 38 | //////////////////////////////////////////////// 39 | //////////////////////////////////////////////// 40 | //End instance functions. 
41 | //////////////////////////////////////////////// 42 | //////////////////////////////////////////////// 43 | //////////////////////////////////////////////// 44 | 45 | /* 46 | * These functions deal with the internal structures 47 | * used to represent caps and caps+segments 48 | */ 49 | bool cap_forward(Cap *cap); 50 | bool cap_partOfSegment(Cap *cap); 51 | bool cap_isSegment(Cap *cap); 52 | bool cap_left(Cap *cap); 53 | bool cap_getHasEventNotSequence(Cap *cap); 54 | SegmentCapContents *cap_getSegmentContents(Cap *cap); 55 | SegmentCapContents *segment_getContents(Segment *segment); 56 | CapContents *cap_getContents(Cap *cap); 57 | CapCoreContents *cap_getCoreContents(Cap *cap); 58 | 59 | 60 | /* 61 | * Constructs a cap, but not its connecting objects. Instance is the suffix m of the instance name n.m. 62 | */ 63 | Cap *cap_construct3(Name name, Event *event, End *end); 64 | 65 | /* 66 | * As default constructor, but also sets the instance's coordinates and event. 67 | */ 68 | Cap *cap_construct4(Name name, End *end, int64_t startCoordinate, 69 | bool strand, Sequence *sequence); 70 | 71 | /* 72 | * Destructs the cap, but not any connecting objects. 
73 | */ 74 | void cap_destruct(Cap *cap); 75 | 76 | #endif 77 | -------------------------------------------------------------------------------- /api/impl/cactusChainPrivate.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_CHAIN_PRIVATE_H_ 8 | #define CACTUS_CHAIN_PRIVATE_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | struct _chain { 13 | Name name; 14 | Flower *flower; 15 | Link *link; 16 | Link *endLink; // Could be removed if we build back to front 17 | //int64_t linkNumber; // can easily be removed 18 | //int64_t chainIndex; 19 | }; 20 | 21 | //////////////////////////////////////////////// 22 | //////////////////////////////////////////////// 23 | //////////////////////////////////////////////// 24 | //Chain functions. 25 | //////////////////////////////////////////////// 26 | //////////////////////////////////////////////// 27 | //////////////////////////////////////////////// 28 | 29 | /* 30 | * Constructs a chain, which in turn holds links. 31 | */ 32 | Chain *chain_construct2(Name name, Flower *flower); 33 | 34 | /* 35 | * Add the link to the chain. 36 | */ 37 | void chain_addLink(Chain *chain, Link *childLink); 38 | 39 | /* 40 | * Sets the flower containing the chain. 41 | */ 42 | void chain_setFlower(Chain *chain, Flower *flower); 43 | 44 | /* 45 | * Joins two chains together where the _5Chain abuts at the 3' end with the _3Chain. 
46 | */ 47 | void chain_join(Chain *_5Chain, Chain *_3Chain); 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /api/impl/cactusDiskPrivate.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_DISK_PRIVATE_H_ 8 | #define CACTUS_DISK_PRIVATE_H_ 9 | 10 | #include "cactusGlobals.h" 11 | #if defined(_OPENMP) 12 | #include <omp.h> 13 | #endif 14 | 15 | struct _cactusDisk { 16 | stSortedSet *sequences; 17 | stSortedSet *flowers; 18 | EventTree *eventTree; 19 | #if defined(_OPENMP) 20 | omp_lock_t writelock; // This lock is used to gate access to concurrently accessed variables 21 | #endif 22 | stHash *allStrings; // If the strings are all being stored in memory, a map of names to strings 23 | Name currentName; // Used as a counter for issuing names 24 | }; 25 | 26 | //////////////////////////////////////////////// 27 | //////////////////////////////////////////////// 28 | //////////////////////////////////////////////// 29 | //Cactus disk functions. 30 | //////////////////////////////////////////////// 31 | //////////////////////////////////////////////// 32 | //////////////////////////////////////////////// 33 | 34 | /* 35 | * Adds a newly constructed flower to the memory of the cactusDisk. 36 | */ 37 | void cactusDisk_addFlower(CactusDisk *cactusDisk, Flower *flower); 38 | 39 | /* 40 | * Registers that the flower is being freed from memory. 41 | */ 42 | void cactusDisk_removeFlower(CactusDisk *cactusDisk, Flower *flower); 43 | 44 | /* 45 | * Functions on meta sequences. 46 | */ 47 | 48 | /* 49 | * Adds a newly constructed meta sequence to the memory of the cactusDisk. 50 | */ 51 | void cactusDisk_addSequence(CactusDisk *cactusDisk, 52 | Sequence *sequence); 53 | 54 | /* 55 | * Registers that the meta sequence is being freed from memory.
56 | */ 57 | void cactusDisk_removeSequence(CactusDisk *cactusDisk, 58 | Sequence *sequence); 59 | 60 | /* 61 | * Functions on strings stored by the cactus disk. 62 | */ 63 | 64 | /* 65 | * Adds the sequence string to the database. 66 | */ 67 | Name cactusDisk_addString(CactusDisk *cactusDisk, const char *string); 68 | 69 | /* 70 | * Retrieves a string from the bucket of sequence. 71 | */ 72 | char *cactusDisk_getString(CactusDisk *cactusDisk, Name name, 73 | int64_t start, int64_t length, int64_t strand, int64_t totalSequenceLength); 74 | 75 | /* 76 | * Set the event tree for this disk. (Hopefully this only happens once.) 77 | */ 78 | void cactusDisk_setEventTree(CactusDisk *cactusDisk, EventTree *eventTree); 79 | 80 | #endif 81 | -------------------------------------------------------------------------------- /api/impl/cactusEndPrivate.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_END_PRIVATE_H_ 8 | #define CACTUS_END_PRIVATE_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | typedef struct _endContents { 13 | Name name; 14 | Cap *firstCap; 15 | Group *group; 16 | Flower *flower; 17 | End *nEnd; 18 | } EndContents; 19 | 20 | typedef struct _blockEndContents { 21 | Name name; 22 | Segment *firstSegment; 23 | int64_t length; 24 | Flower *flower; 25 | Group *leftGroup; 26 | Group *rightGroup; 27 | End *nEndLeft; 28 | End *nEndRight; 29 | } BlockEndContents; 30 | 31 | struct _end { 32 | char bits; 33 | }; 34 | 35 | struct _end_instanceIterator { 36 | Cap *cap; 37 | End *end; 38 | }; 39 | 40 | bool end_isBlock(End *end); 41 | BlockEndContents *end_getBlockEndContents(End *end); 42 | 43 | //////////////////////////////////////////////// 44 | //////////////////////////////////////////////// 45 | //////////////////////////////////////////////// 46 | //End functions. 
47 | //////////////////////////////////////////////// 48 | //////////////////////////////////////////////// 49 | //////////////////////////////////////////////// 50 | 51 | /* 52 | * Get the block contents object 53 | */ 54 | BlockEndContents *block_getContents(Block *block); 55 | 56 | /* 57 | * Get the contents object shared between the ends. 58 | */ 59 | EndContents *end_getContents(End *end); 60 | 61 | /* 62 | * Constructs the end, but not any attached block. 63 | */ 64 | End *end_construct3(Name name, int64_t isAttached, 65 | int64_t side, Flower *flower); 66 | 67 | /* 68 | * Destructs the end and any contained caps. 69 | */ 70 | void end_destruct(End *end); 71 | 72 | /* 73 | * Adds the cap to the end. 74 | */ 75 | void end_addInstance(End *end, Cap *cap); 76 | 77 | /* 78 | * Removes the instance from the end. 79 | */ 80 | void end_removeInstance(End *end, Cap *cap); 81 | 82 | /* 83 | * Hash key for an end, uses the name of the end to hash.. hence 84 | * the key doesn't care about the orientation. 85 | */ 86 | uint64_t end_hashKey(const void *o); 87 | 88 | /* 89 | * Hash equals key, equal only if the two ends have the same name and orientation. 90 | */ 91 | int end_hashEqualsKey(const void *o, const void *o2); 92 | 93 | /* 94 | * Sets the flower associated with the end. 95 | */ 96 | void end_setFlower(End *end, Flower *flower); 97 | 98 | /* 99 | * Get pointer to next end in the group. 
100 | */ 101 | End **getNextEndPointer(End *end); 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /api/impl/cactusEventPrivate.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_EVENT_PRIVATE_H_ 8 | #define CACTUS_EVENT_PRIVATE_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | struct _event { 13 | Name name; 14 | char *header; 15 | struct List *children; 16 | float branchLength; 17 | Event *parent; 18 | EventTree *eventTree; 19 | bool isOutgroup; 20 | }; 21 | 22 | //////////////////////////////////////////////// 23 | //////////////////////////////////////////////// 24 | //////////////////////////////////////////////// 25 | //Private event functions. 26 | //////////////////////////////////////////////// 27 | //////////////////////////////////////////////// 28 | //////////////////////////////////////////////// 29 | 30 | /* 31 | * Destructs the event and also destructs any attached child events. 
32 | */ 33 | void event_destruct(Event *event); 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /api/impl/cactusEventTreePrivate.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_EVENT_TREE_PRIVATE_H_ 8 | #define CACTUS_EVENT_TREE_PRIVATE_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | struct _eventTree { 13 | Event *rootEvent; 14 | stSortedSet *events; 15 | CactusDisk *cactusDisk; 16 | }; 17 | 18 | //////////////////////////////////////////////// 19 | //////////////////////////////////////////////// 20 | //////////////////////////////////////////////// 21 | //Private event tree functions. 22 | //////////////////////////////////////////////// 23 | //////////////////////////////////////////////// 24 | //////////////////////////////////////////////// 25 | 26 | /* 27 | * Destructs the event tree and all its events. 28 | */ 29 | void eventTree_destruct(EventTree *eventTree); 30 | 31 | /* 32 | * Adds the event to the event tree. 33 | */ 34 | void eventTree_addEvent(EventTree *eventTree, Event *event); 35 | 36 | /* 37 | * Removes the event from the event tree.
38 | */ 39 | void eventTree_removeEvent(EventTree *eventTree, Event *event); 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /api/impl/cactusFlowerPrivate.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_FLOWER_PRIVATE_H_ 8 | #define CACTUS_FLOWER_PRIVATE_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | struct _flower { 13 | Name name; 14 | stList *ends; 15 | stSortedSet *ends2; 16 | stList *caps; 17 | stSortedSet *caps2; 18 | stList *groups; 19 | stList *chains; 20 | stList *sequences; 21 | Name parentFlowerName; 22 | CactusDisk *cactusDisk; 23 | bool builtBlocks; 24 | }; 25 | 26 | //////////////////////////////////////////////// 27 | //////////////////////////////////////////////// 28 | //////////////////////////////////////////////// 29 | //Private flower functions. 30 | //////////////////////////////////////////////// 31 | //////////////////////////////////////////////// 32 | //////////////////////////////////////////////// 33 | 34 | /* 35 | * Adds the event tree for the flower to the flower. 36 | * If a previous event tree exists for the flower 37 | * it will call eventTree_destruct for the existing tree 38 | * (which should not exist without the flower). 39 | */ 40 | void flower_setEventTree(Flower *flower, EventTree *eventTree); 41 | 42 | /* 43 | * This function is called by eventTree_destruct and cleans up the reference. 44 | */ 45 | void flower_removeEventTree(Flower *flower, EventTree *eventTree); 46 | 47 | /* 48 | * Adds the cap to the flower. 49 | */ 50 | void flower_addCap(Flower *flower, Cap *cap); 51 | 52 | /* 53 | * Bulk add a set of caps to the flower. 54 | */ 55 | void flower_bulkAddCaps(Flower *flower, stList *capsToAdd); 56 | 57 | /* 58 | * Adds the end to the flower.
59 | */ 60 | void flower_addEnd(Flower *flower, End *end); 61 | 62 | /* 63 | * Bulk add a set of ends to the flower. 64 | */ 65 | void flower_bulkAddEnds(Flower *flower, stList *endsToAdd); 66 | 67 | /* 68 | * Remove the end from the flower. 69 | */ 70 | void flower_removeEnd(Flower *flower, End *end); 71 | 72 | /* 73 | * Adds the group to the flower. 74 | */ 75 | void flower_addGroup(Flower *flower, Group *group); 76 | 77 | /* 78 | * Removes an empty group from the flower. 79 | */ 80 | void flower_removeGroup(Flower *flower, Group *group); 81 | 82 | /* 83 | * Sets the parent group of the flower. 84 | */ 85 | void flower_setParentGroup(Flower *flower, Group *group); 86 | 87 | /* 88 | * Adds the chain to the flower. 89 | */ 90 | void flower_addChain(Flower *flower, Chain *chain); 91 | 92 | /* 93 | * Remove the chain from the flower. 94 | */ 95 | void flower_removeChain(Flower *flower, Chain *chain); 96 | 97 | #endif 98 | -------------------------------------------------------------------------------- /api/impl/cactusGlobalsPrivate.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_GLOBALS_PRIVATE_H_ 8 | #define CACTUS_GLOBALS_PRIVATE_H_ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "CuTest.h" 23 | #include "sonLib.h" 24 | #include "commonC.h" 25 | 26 | #define NAME_STRING "%" PRIi64 "" 27 | 28 | #include "cactusGroup.h" 29 | #include "cactusGroupPrivate.h" 30 | #include "cactusBlock.h" 31 | #include "cactusSegment.h" 32 | #include "cactusSegmentPrivate.h" 33 | #include "cactusBlockPrivate.h" 34 | #include "cactusChain.h" 35 | #include "cactusChainPrivate.h" 36 | #include "cactusEnd.h" 37 | #include "cactusCap.h" 38 | #include 
"cactusCapPrivate.h" 39 | #include "cactusEndPrivate.h" 40 | #include "cactusEvent.h" 41 | #include "cactusEventPrivate.h" 42 | #include "cactusEventTree.h" 43 | #include "cactusEventTreePrivate.h" 44 | #include "cactusGlobals.h" 45 | #include "cactusLink.h" 46 | #include "cactusLinkPrivate.h" 47 | #include "cactusSequence.h" 48 | #include "cactusSequencePrivate.h" 49 | #include "cactusFlower.h" 50 | #include "cactusDisk.h" 51 | #include "cactusDiskPrivate.h" 52 | #include "cactusMisc.h" 53 | #include "cactusFlowerPrivate.h" 54 | #include "cactusTestCommon.h" 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /api/impl/cactusGroupPrivate.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_ADJACENCY_COMPONENT_PRIVATE_H_ 8 | #define CACTUS_ADJACENCY_COMPONENT_PRIVATE_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | struct _group { 13 | void *flowerOrChain; 14 | Link *nLink; 15 | Name name; 16 | End *firstEnd; // If a link, this becomes the 5end and the second is the 3end in the chain 17 | char bits; // 0 bit: is leaf, 1 bit: is link 18 | }; 19 | 20 | struct _group_endIterator { 21 | Group *group; 22 | End *end; 23 | }; 24 | 25 | //////////////////////////////////////////////// 26 | //////////////////////////////////////////////// 27 | //////////////////////////////////////////////// 28 | //Group functions. 29 | //////////////////////////////////////////////// 30 | //////////////////////////////////////////////// 31 | //////////////////////////////////////////////// 32 | 33 | /* 34 | * Constructs a nested flower without having the nested flower loaded in memory. 35 | */ 36 | Group *group_construct4(Flower *flower, Name nestedFlowerName, bool terminalGroup); 37 | 38 | /* 39 | * Removes the end from the group. 
40 | */ 41 | void group_removeEnd(Group *group, End *end); 42 | 43 | /* 44 | * Adds an end to the group (the public function is end_setGroup). 45 | */ 46 | void group_addEnd(Group *group, End *end); 47 | 48 | /* 49 | * Set as a leaf group 50 | */ 51 | void group_setLeaf(Group *group, bool isLeaf); 52 | 53 | /* 54 | * Set the group as a link 55 | */ 56 | void group_setLink(Group *group, bool isLink); 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /api/impl/cactusLinkPrivate.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_LINK_PRIVATE_H_ 8 | #define CACTUS_LINK_PRIVATE_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | //////////////////////////////////////////////// 13 | //////////////////////////////////////////////// 14 | //////////////////////////////////////////////// 15 | //Link functions. 16 | //////////////////////////////////////////////// 17 | //////////////////////////////////////////////// 18 | //////////////////////////////////////////////// 19 | 20 | /* 21 | * Destructs the link and all subsequent links in the chain 22 | */ 23 | void link_destruct(Link *link); 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /api/impl/cactusMisc.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #include "cactusGlobalsPrivate.h" 8 | #include 9 | #include 10 | 11 | //////////////////////////////////////////////// 12 | //////////////////////////////////////////////// 13 | //////////////////////////////////////////////// 14 | //Useful utility functions. 
15 | //////////////////////////////////////////////// 16 | //////////////////////////////////////////////// 17 | //////////////////////////////////////////////// 18 | // Three-way comparison of two names: 1 if name1 > name2, -1 if name1 < name2, 0 if equal. 19 | int64_t cactusMisc_nameCompare(Name name1, Name name2) { 20 | return name1 > name2 ? 1 : (name1 < name2 ? -1 : 0); 21 | } 22 | // Parses a name from the string; on failure reports to stderr and returns NULL_NAME. 23 | Name cactusMisc_stringToName(const char *stringName) { 24 | assert(stringName != NULL); 25 | Name name; 26 | int64_t i = sscanf(stringName, NAME_STRING, &name); 27 | if (i != 1) { 28 | fprintf(stderr, "Can not get a valid name from the given string: %s\n", stringName); 29 | return NULL_NAME; 30 | } 31 | return name; 32 | } 33 | // Returns a newly allocated string holding the formatted name. 34 | char *cactusMisc_nameToString(Name name) { 35 | char *cA; 36 | cA = st_malloc(sizeof(char) * 21); // 21 bytes: sign + 19 digits of an int64_t + terminating NUL 37 | snprintf(cA, 21, NAME_STRING, name); 38 | return cA; 39 | } 40 | // NOTE(review): stString_print presumably allocates a fresh copy on every call - confirm callers do not leak it. 41 | const char *cactusMisc_getDefaultReferenceEventHeader() { 42 | return stString_print("reference"); 43 | } 44 | 45 | const char *CACTUS_CHECK_EXCEPTION_ID = "CACTUS_CHECK_EXCEPTION_ID"; 46 | // Throws CACTUS_CHECK_EXCEPTION_ID via stThrowNew when the condition is false. 47 | void cactusCheck(bool condition) { 48 | if (!condition) { 49 | //assert(0); 50 | stThrowNew(CACTUS_CHECK_EXCEPTION_ID, "Cactus check condition failed"); 51 | } 52 | } 53 | // As cactusCheck, but appends a printf-style formatted message to the exception text. 54 | void cactusCheck2(bool condition, char *string, ...)
{ 55 | if(!condition) { 56 | static char cA[100000]; // static buffer: shared across calls, so not reentrant 57 | va_list ap; 58 | va_start(ap, string); 59 | vsnprintf(cA, sizeof(cA), string, ap); // bounded: an over-long message is truncated instead of overrunning cA 60 | va_end(ap); 61 | //assert(0); 62 | assert(strlen(cA) < sizeof(cA)); 63 | stThrowNew(CACTUS_CHECK_EXCEPTION_ID, "Cactus check condition failed: %s", cA); 64 | } 65 | } 66 | 67 | -------------------------------------------------------------------------------- /api/impl/cactusSegmentPrivate.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_ATOM_INSTANCE_PRIVATE_H_ 8 | #define CACTUS_ATOM_INSTANCE_PRIVATE_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | //////////////////////////////////////////////// 13 | //////////////////////////////////////////////// 14 | //////////////////////////////////////////////// 15 | //Private segment functions 16 | //////////////////////////////////////////////// 17 | //////////////////////////////////////////////// 18 | //////////////////////////////////////////////// 19 | 20 | /* 21 | * Constructs segment with the two caps, which must both have the same instance name. 22 | */ 23 | Segment *segment_construct3(Name name, Block *block, 24 | Cap *_5Cap, Cap *_3Cap); 25 | 26 | /* 27 | * Destruct the segment, does not destruct ends.
28 | */ 29 | void segment_destruct(Segment *segment); 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /api/impl/cactusSequencePrivate.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_META_SEQUENCE_PRIVATE_H_ 8 | #define CACTUS_META_SEQUENCE_PRIVATE_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | struct _sequence { 13 | Name name; 14 | Name stringName; 15 | int64_t start; 16 | int64_t length; 17 | Event *event; 18 | CactusDisk *cactusDisk; 19 | char *header; 20 | bool isTrivialSequence; //This flag is used to indicate if a sequence is trivial. 21 | }; 22 | 23 | //////////////////////////////////////////////// 24 | //////////////////////////////////////////////// 25 | //////////////////////////////////////////////// 26 | //Private meta sequence functions. 27 | //////////////////////////////////////////////// 28 | //////////////////////////////////////////////// 29 | //////////////////////////////////////////////// 30 | 31 | /* 32 | * Constructs a meta sequence using an existing reference to a sequence in the sequence file. 33 | */ 34 | Sequence *sequence_construct2(Name name, int64_t start, int64_t length, Name stringName, const char *header, 35 | Event *event, bool isTrivialSequence, CactusDisk *cactusDisk); 36 | 37 | /* 38 | * Destructs a meta sequence. 
39 | */ 40 | void sequence_destruct(Sequence *sequence); 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /api/impl/cactusTestCommon.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #include "cactusGlobalsPrivate.h" 8 | 9 | //////////////////////////////////////////////// 10 | //////////////////////////////////////////////// 11 | //////////////////////////////////////////////// 12 | //Functions shared by the test code. 13 | //////////////////////////////////////////////// 14 | //////////////////////////////////////////////// 15 | //////////////////////////////////////////////// 16 | 17 | /* 18 | * Builds the path of the temporary directory for the named test, under "test-output/tmp". 19 | */ 20 | char *testCommon_getTmpTestDir(const char *testName) { 21 | char *tmpDir = stFile_pathJoin("test-output/tmp", testName); 22 | return tmpDir; 23 | } 24 | 25 | /* 26 | * Adds a thread of random DNA to the flower: a sequence of the given length on the root event, 27 | * plus two new ends whose caps are made adjacent. Returns the name of the first cap. 28 | */ 29 | Name testCommon_addThreadToFlower(Flower *flower, char *header, int64_t length) { 30 | char *randomDna = stRandom_getRandomDNAString(length, true, true, true); 31 | EventTree *tree = flower_getEventTree(flower); 32 | assert(tree != NULL); 33 | Event *rootEvent = eventTree_getRootEvent(tree); 34 | CactusDisk *disk = flower_getCactusDisk(flower); 35 | Sequence *thread = sequence_construct(2, length, randomDna, header, rootEvent, disk); 36 | 37 | End *firstEnd = end_construct2(0, 0, flower); 38 | End *secondEnd = end_construct2(1, 0, flower); 39 | /* Cap coordinates 1 and length + 2 presumably flank the sequence, which starts at 2 - TODO confirm */ 40 | Cap *firstCap = cap_construct2(firstEnd, 1, 1, thread); 41 | Cap *secondCap = cap_construct2(secondEnd, length + 2, 1, thread); 42 | cap_makeAdjacent(firstCap, secondCap); 43 | 44 | free(randomDna); 45 | Name threadName = cap_getName(firstCap); 46 | return threadName; 47 | } 48 | -------------------------------------------------------------------------------- /api/inc/cactus.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7
| #ifndef CACTUS_H_ 8 | #define CACTUS_H_ 9 | 10 | #include "cactusGroup.h" 11 | #include "cactusBlock.h" 12 | #include "cactusSegment.h" 13 | #include "cactusChain.h" 14 | #include "cactusEnd.h" 15 | #include "cactusCap.h" 16 | #include "cactusEvent.h" 17 | #include "cactusEventTree.h" 18 | #include "cactusGlobals.h" 19 | #include "cactusLink.h" 20 | #include "cactusSequence.h" 21 | #include "cactusFlower.h" 22 | #include "cactusDisk.h" 23 | #include "cactusMisc.h" 24 | #include "cactusTestCommon.h" 25 | #include "cactus_params_parser.h" 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /api/inc/cactusChain.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_CHAIN_H_ 8 | #define CACTUS_CHAIN_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | //////////////////////////////////////////////// 13 | //////////////////////////////////////////////// 14 | //////////////////////////////////////////////// 15 | //Basic chain functions. 16 | //////////////////////////////////////////////// 17 | //////////////////////////////////////////////// 18 | //////////////////////////////////////////////// 19 | 20 | /* 21 | * Constructs a chain, which in turn holds links. 22 | */ 23 | Chain *chain_construct(Flower *flower); 24 | 25 | /* 26 | * Destructs the chain. Does not mess with groups, should be clean. 27 | */ 28 | void chain_destruct(Chain *chain); 29 | 30 | /* 31 | * Gets the first link in the chain. 32 | */ 33 | Link *chain_getFirst(Chain *chain); 34 | 35 | /* 36 | * Gets the last link in the chain. 37 | */ 38 | Link *chain_getLast(Chain *chain); 39 | 40 | /* 41 | * Gets the name of the chain in the flower. 42 | */ 43 | Name chain_getName(Chain *chain); 44 | 45 | /* 46 | * Gets the parent flower of the chain. 
47 | */ 48 | Flower *chain_getFlower(Chain *chain); 49 | 50 | /* 51 | * Returns non-zero if the chain is circular. 52 | */ 53 | bool chain_isCircular(Chain *chain); 54 | 55 | /* 56 | * Checks (amongst other things) the following: 57 | * That each link is properly contained in the chain. 58 | * Links and the contained ends are properly connected. 59 | * That each contiguous pair of link groups are bridged by a block. 60 | * If a block end is at the 5 or 3 prime end of a chain the other end of the 61 | * block is not in a link group (otherwise the chain is not maximal). 62 | * That stub ends are not the ends of the links in the chain. 63 | */ 64 | void chain_check(Chain *chain); 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /api/inc/cactusDisk.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_DISK_H_ 8 | #define CACTUS_DISK_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | // General database exception id 13 | extern const char *CACTUS_DISK_EXCEPTION_ID; 14 | 15 | //////////////////////////////////////////////// 16 | //////////////////////////////////////////////// 17 | //////////////////////////////////////////////// 18 | //Basic cactus disk functions. 19 | //////////////////////////////////////////////// 20 | //////////////////////////////////////////////// 21 | //////////////////////////////////////////////// 22 | 23 | /* 24 | * Constructs a cactus disk to hold the flower hierarchy. 25 | */ 26 | CactusDisk *cactusDisk_construct(); 27 | 28 | /* 29 | * Destructs the cactus disk and all open flowers and sequences, and 30 | * then disconnects from the cactus DB. 31 | */ 32 | void cactusDisk_destruct(CactusDisk *cactusDisk); 33 | 34 | /* 35 | * Retrieves the next unique ID. 
36 | */ 37 | int64_t cactusDisk_getUniqueID(CactusDisk *cactusDisk); 38 | 39 | /* 40 | * Retrieves a contiguous interval of unique ids starting from the return value to return value + intervalSize (exclusive). 41 | */ 42 | int64_t cactusDisk_getUniqueIDInterval(CactusDisk *cactusDisk, int64_t intervalSize); 43 | 44 | /* 45 | * Gets a flower the cactusDisk contains. If the flower is not in memory it will be loaded. If not in memory or on disk, returns NULL. 46 | */ 47 | Flower *cactusDisk_getFlower(CactusDisk *cactusDisk, Name flowerName); 48 | 49 | /* 50 | * Gets a sequence 51 | */ 52 | Sequence *cactusDisk_getSequence(CactusDisk *cactusDisk, Name sequenceName); 53 | 54 | /* 55 | * Get the event tree. 56 | */ 57 | EventTree *cactusDisk_getEventTree(CactusDisk *cactusDisk); 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /api/inc/cactusGlobals.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_GLOBALS_H_ 8 | #define CACTUS_GLOBALS_H_ 9 | 10 | #include 11 | 12 | /* 13 | * For the basic lib stuff 14 | */ 15 | #include "sonLib.h" 16 | /* 17 | * For lists 18 | */ 19 | #include "commonC.h" 20 | 21 | //////////////////////////////////////////////// 22 | //////////////////////////////////////////////// 23 | //////////////////////////////////////////////// 24 | //Basic data structure declarations (contents hidden) 25 | //////////////////////////////////////////////// 26 | //////////////////////////////////////////////// 27 | //////////////////////////////////////////////// 28 | 29 | #define NULL_NAME INT64_MAX 30 | typedef int64_t Name; 31 | typedef struct _event Event; 32 | typedef struct _eventTree EventTree; 33 | typedef struct _sequence Sequence; 34 | typedef struct _end End; 35 | typedef struct _cap Cap; 36 | 
typedef struct _cap Segment; 37 | typedef struct _end Block; 38 | typedef struct _group Group; 39 | typedef struct _group Link; 40 | typedef struct _chain Chain; 41 | typedef struct _flower Flower; 42 | typedef struct _cactusDisk CactusDisk; 43 | typedef stSortedSetIterator EventTree_Iterator; 44 | typedef struct _end_instanceIterator End_InstanceIterator; 45 | typedef struct _block_instanceIterator Block_InstanceIterator; 46 | typedef struct _group_endIterator Group_EndIterator; 47 | typedef stListIterator Flower_SequenceIterator; 48 | typedef stListIterator Flower_CapIterator; 49 | typedef stListIterator Flower_EndIterator; 50 | typedef stListIterator Flower_GroupIterator; 51 | typedef stListIterator Flower_ChainIterator; 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /api/inc/cactusLink.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_LINK_H_ 8 | #define CACTUS_LINK_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | //////////////////////////////////////////////// 13 | //////////////////////////////////////////////// 14 | //////////////////////////////////////////////// 15 | //Basic link functions. 16 | //////////////////////////////////////////////// 17 | //////////////////////////////////////////////// 18 | //////////////////////////////////////////////// 19 | 20 | /* 21 | * Construct a link. 22 | */ 23 | Link *link_construct(End *_3End, End *_5End, Group *group, Chain *parentChain); 24 | 25 | /* 26 | * Gets the next link in the chain. 27 | */ 28 | Link *link_getNextLink(Link *link); 29 | 30 | /* 31 | * Gets the group of the link. 32 | */ 33 | Group *link_getGroup(Link *link); 34 | 35 | /* 36 | * Gets the left end of the link in the chain, which will 37 | * be positively oriented and a 3' end.
38 | */ 39 | End *link_get3End(Link *link); 40 | 41 | /* 42 | * Gets the right end of the link in the chain, which will 43 | * be positively oriented and a 5' end. 44 | */ 45 | End *link_get5End(Link *link); 46 | 47 | /* 48 | * Gets the chain the link is part of. 49 | */ 50 | Chain *link_getChain(Link *link); 51 | 52 | /* 53 | * Returns true if and only if the two ends of the link are block ends and they are always adjacent (no self loops), and 54 | * all the adjacencies are trivial (empty). 55 | */ 56 | bool link_isTrivial(Link *link); 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /api/inc/cactusMisc.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_MISC_H_ 8 | #define CACTUS_MISC_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | //////////////////////////////////////////////// 13 | //////////////////////////////////////////////// 14 | //////////////////////////////////////////////// 15 | //Useful utility functions. 16 | //////////////////////////////////////////////// 17 | //////////////////////////////////////////////// 18 | //////////////////////////////////////////////// 19 | 20 | extern const char *CACTUS_CHECK_EXCEPTION_ID; 21 | 22 | /* 23 | * Compares two names, giving an ordering to names (arbitrary but consistent). 24 | */ 25 | int64_t cactusMisc_nameCompare(Name name1, Name name2); 26 | 27 | /* 28 | * Converts the string which holds the name (and nothing else), into a name. 29 | */ 30 | Name cactusMisc_stringToName(const char *stringName); 31 | 32 | /* 33 | * Creates a new string (which must be freed) representing the name as a string.
34 | */ 35 | char *cactusMisc_nameToString(Name name); 36 | 37 | /* 38 | * Creates a new string with orientation sign (which must be freed) representing the name as a string. 39 | */ 40 | char *cactusMisc_nameToStringWithOrientation(Name name, int64_t orientation); 41 | 42 | /* 43 | * Gets the default name of the reference event string. 44 | */ 45 | const char *cactusMisc_getDefaultReferenceEventHeader(); 46 | 47 | /* 48 | * Check a condition is true, if not throw an exception - a short hand to defining your own exception. 49 | */ 50 | void cactusCheck(bool condition); 51 | 52 | /* 53 | * Check a condition is true, if not throw an exception with the given string. 54 | */ 55 | void cactusCheck2(bool condition, char *string, ...); 56 | 57 | #endif 58 | -------------------------------------------------------------------------------- /api/inc/cactusSequence.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_SEQUENCE_H_ 8 | #define CACTUS_SEQUENCE_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | //////////////////////////////////////////////// 13 | //////////////////////////////////////////////// 14 | //////////////////////////////////////////////// 15 | //Meta sequence functions. 16 | //////////////////////////////////////////////// 17 | //////////////////////////////////////////////// 18 | //////////////////////////////////////////////// 19 | 20 | /* 21 | * Constructs a meta sequence, which contains all the essential info for a sequence. 22 | * 23 | * This function is NOT thread safe, do not try to have concurrent instances of this function! 24 | */ 25 | Sequence *sequence_construct(int64_t start, int64_t length, const char *string, const char *header, 26 | Event *event, CactusDisk *cactusDisk); 27 | 28 | /* 29 | * Adds the isTrivialSequence field. 
30 | */ 31 | Sequence *sequence_construct3(int64_t start, int64_t length, const char *string, const char *header, Event *event, 32 | bool isTrivialSequence, CactusDisk *cactusDisk); 33 | 34 | /* 35 | * Gets the name of the sequence. 36 | */ 37 | Name sequence_getName(Sequence *sequence); 38 | 39 | /* 40 | * Gets the start coordinate of the sequence. 41 | */ 42 | int64_t sequence_getStart(Sequence *sequence); 43 | 44 | /* 45 | * Gets the length of the sequence. 46 | */ 47 | int64_t sequence_getLength(Sequence *sequence); 48 | 49 | /* 50 | * Gets the associated event name. 51 | */ 52 | Event *sequence_getEvent(Sequence *sequence); 53 | 54 | /* 55 | * Gets a string for representing a subsequence of the meta sequence. 56 | */ 57 | char *sequence_getString(Sequence *sequence, int64_t start, int64_t length, int64_t strand); 58 | 59 | /* 60 | * Gets the header line associated with the meta sequence. 61 | */ 62 | const char *sequence_getHeader(Sequence *sequence); 63 | 64 | /* 65 | * Returns flag indicating if sequence is trivial. 66 | */ 67 | bool sequence_isTrivialSequence(Sequence *sequence); 68 | 69 | /* 70 | * Sets the header line associated with the meta sequence. 71 | */ 72 | void sequence_setHeader(Sequence *sequence, char *newHeader); 73 | 74 | #endif 75 | -------------------------------------------------------------------------------- /api/inc/cactusTestCommon.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #ifndef CACTUS_TEST_COMMON_H_ 8 | #define CACTUS_TEST_COMMON_H_ 9 | 10 | #include "cactusGlobals.h" 11 | 12 | //////////////////////////////////////////////// 13 | //////////////////////////////////////////////// 14 | //////////////////////////////////////////////// 15 | //Functions shared by the test code. 
/*
 * Cactus parameters object: a handle on the parsed xml parameters file plus
 * the node that parameter lookups are currently resolved against.
 */
typedef struct _cactusParams {
    xmlDocPtr doc; // The underlying xml document representing the parameters
    xmlNodePtr root; // The root node
    xmlNodePtr cur; // The node of the xml tree we search from to retrieve parameters.
    // can be set by cactusParams_set_root(CactusParams *p, int, ...), by default is set
    // to the root of the tree.
} CactusParams;

/*
 * Cleanup the CactusParams.
 */
void cactusParams_destruct(CactusParams *p);

/*
 * Load the CactusParams from the xml file with the given name.
 */
CactusParams *cactusParams_load(char *file_name);

/*
 * Set the node of the params tree that subsequent lookups start from.
 * num is the number of element names that follow.
 * e.g. cactusParams_set_root(p, 2, "blast", "divergence") would set the search root
 * to the cactusWorkflowConfig->blast->divergence node, and
 * cactusParams_set_root(p, 0) sets it back to the root of the tree.
 */
void cactusParams_set_root(CactusParams *p, int num, ...);
46 | */ 47 | int64_t cactusParams_get_int(CactusParams *p, int, ...); 48 | 49 | /* 50 | * Get a float parameter. 51 | */ 52 | double cactusParams_get_float(CactusParams *p, int, ...); 53 | 54 | /* 55 | * Get a sequence of integers 56 | */ 57 | int64_t *cactusParams_get_ints(CactusParams *p, int64_t *length, int, ...); 58 | 59 | #endif /* ST_CACTUS_PARAMS_PARSER_H_ */ 60 | -------------------------------------------------------------------------------- /api/tests/allTests.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #include "cactusGlobalsPrivate.h" 8 | 9 | CuSuite *cactusGroupTestSuite(); 10 | CuSuite *cactusSegmentTestSuite(); 11 | CuSuite *cactusBlockTestSuite(); 12 | CuSuite *cactusChainTestSuite(); 13 | CuSuite *cactusCapTestSuite(); 14 | CuSuite *cactusEndTestSuite(); 15 | CuSuite *cactusEventTestSuite(); 16 | CuSuite *cactusEventTreeTestSuite(); 17 | CuSuite *cactusLinkTestSuite(); 18 | CuSuite *cactusSequenceTestSuite(); 19 | CuSuite *cactusDiskTestSuite(); 20 | CuSuite *cactusMiscTestSuite(); 21 | CuSuite *cactusFlowerTestSuite(); 22 | CuSuite *cactusParamsTestSuite(void); 23 | 24 | int cactusAPIRunAllTests(void) { 25 | CuString *output = CuStringNew(); 26 | CuSuite* suite = CuSuiteNew(); 27 | CuSuiteAddSuite(suite, cactusEventTestSuite()); 28 | CuSuiteAddSuite(suite, cactusGroupTestSuite()); 29 | CuSuiteAddSuite(suite, cactusSegmentTestSuite()); 30 | CuSuiteAddSuite(suite, cactusBlockTestSuite()); 31 | CuSuiteAddSuite(suite, cactusChainTestSuite()); 32 | CuSuiteAddSuite(suite, cactusCapTestSuite()); 33 | CuSuiteAddSuite(suite, cactusEndTestSuite()); 34 | CuSuiteAddSuite(suite, cactusEventTreeTestSuite()); 35 | CuSuiteAddSuite(suite, cactusLinkTestSuite()); 36 | CuSuiteAddSuite(suite, cactusSequenceTestSuite()); 37 | CuSuiteAddSuite(suite, cactusDiskTestSuite()); 38 
| CuSuiteAddSuite(suite, cactusMiscTestSuite()); 39 | CuSuiteAddSuite(suite, cactusFlowerTestSuite()); 40 | CuSuiteAddSuite(suite, cactusParamsTestSuite()); 41 | CuSuiteRun(suite); 42 | CuSuiteSummary(suite, output); 43 | CuSuiteDetails(suite, output); 44 | printf("%s\n", output->buffer); 45 | int i= suite->failCount > 0; 46 | CuSuiteDelete(suite); 47 | CuStringDelete(output); 48 | return i; 49 | } 50 | 51 | int main(int argc, char *argv[]) { 52 | if(argc == 2) { 53 | st_setLogLevelFromString(argv[1]); 54 | } 55 | int i = cactusAPIRunAllTests(); 56 | //while(1); 57 | return i; 58 | } 59 | -------------------------------------------------------------------------------- /api/tests/cactusBlocksTestShared.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #include "cactusGlobalsPrivate.h" 8 | 9 | static CactusDisk *cactusDisk; 10 | static Flower *flower; 11 | static EventTree *eventTree; 12 | static Sequence *sequence; 13 | static Event *rootEvent; 14 | static Event *leafEvent; 15 | 16 | static Block *block; 17 | static Segment *rootSegment; 18 | static Segment *leaf1Segment; 19 | static Segment *leaf2Segment; 20 | 21 | static void cactusBlocksTestSharedTeardown(const char *testName) { 22 | if (cactusDisk != NULL) { 23 | cactusDisk_destruct(cactusDisk); 24 | cactusDisk = NULL; 25 | } 26 | } 27 | 28 | static void cactusBlocksTestSharedSetup(const char *testName) { 29 | cactusBlocksTestSharedTeardown(testName); 30 | cactusDisk = cactusDisk_construct(); 31 | flower = flower_construct(cactusDisk); 32 | 33 | eventTree = eventTree_construct2(cactusDisk); 34 | 35 | rootEvent = eventTree_getRootEvent(eventTree); 36 | leafEvent = event_construct3("LEAF1", 0.2, rootEvent, eventTree); 37 | 38 | sequence = sequence_construct(1, 10, "ACTGACTGAC", ">one", 39 | leafEvent, cactusDisk); 40 | 
flower_addSequence(flower, sequence); 41 | 42 | block = block_construct(3, flower); 43 | leaf2Segment = segment_construct2(block_getReverse(block), 4, 0, sequence); 44 | leaf1Segment = segment_construct2(block, 2, 1, sequence); 45 | rootSegment = segment_construct(block_getReverse(block), rootEvent); 46 | } 47 | -------------------------------------------------------------------------------- /api/tests/cactusEndsTestShared.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #include "cactusGlobalsPrivate.h" 8 | 9 | static CactusDisk *cactusDisk = NULL; 10 | static Flower *flower; 11 | static EventTree *eventTree; 12 | static Sequence *sequence; 13 | static End *end; 14 | 15 | static Event *rootEvent; 16 | static Event *leafEvent; 17 | 18 | static Cap *rootCap; 19 | static Cap *leaf1Cap; 20 | static Cap *leaf2Cap; 21 | static Cap *leaf3Cap; 22 | 23 | static void cactusEndsTestSharedTeardown(const char *testName) { 24 | if (cactusDisk != NULL) { 25 | cactusDisk_destruct(cactusDisk); 26 | cactusDisk = NULL; 27 | } 28 | } 29 | 30 | static void cactusEndsTestSharedSetup(const char *testName) { 31 | cactusEndsTestSharedTeardown(testName); 32 | cactusDisk = cactusDisk_construct(); 33 | flower = flower_construct(cactusDisk); 34 | 35 | eventTree = eventTree_construct2(cactusDisk); 36 | 37 | rootEvent = eventTree_getRootEvent(eventTree); 38 | leafEvent = event_construct3("LEAF2", 0.2, rootEvent, eventTree); 39 | 40 | sequence = sequence_construct(0, 10, "ACTGACTGAC", ">one", 41 | leafEvent, cactusDisk); 42 | flower_addSequence(flower, sequence); 43 | 44 | end = end_construct(1, flower); 45 | 46 | leaf3Cap = cap_construct2(end_getReverse(end), 7, 0, sequence); 47 | leaf2Cap = cap_construct2(end, 6, 0, sequence); 48 | leaf1Cap = cap_construct2(end_getReverse(end), 4, 1, sequence); 49 | 
rootCap = cap_construct(end_getReverse(end), rootEvent); 50 | } 51 | -------------------------------------------------------------------------------- /api/tests/cactusMiscTest.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #include "cactusGlobalsPrivate.h" 8 | 9 | static CactusDisk *cactusDisk = NULL; 10 | 11 | static void cactusMiscTestTeardown(CuTest* testCase) { 12 | if (cactusDisk != NULL) { 13 | cactusDisk_destruct(cactusDisk); 14 | cactusDisk = NULL; 15 | } 16 | } 17 | 18 | static void cactusMiscTestSetup(CuTest* testCase) { 19 | cactusMiscTestTeardown(testCase); 20 | cactusDisk = cactusDisk_construct(); 21 | } 22 | 23 | void testCactusMisc_nameCompare(CuTest* testCase) { 24 | cactusMiscTestSetup(testCase); 25 | Name name = cactusDisk_getUniqueID(cactusDisk); 26 | Name name2 = cactusDisk_getUniqueID(cactusDisk); 27 | CuAssertTrue(testCase, cactusMisc_nameCompare(name, name2) == -1); 28 | CuAssertTrue(testCase, cactusMisc_nameCompare(name2, name) == 1); 29 | CuAssertTrue(testCase, cactusMisc_nameCompare(name, name) == 0); 30 | cactusMiscTestTeardown(testCase); 31 | } 32 | 33 | void testCactusMisc_stringNameFns(CuTest* testCase) { 34 | cactusMiscTestSetup(testCase); 35 | int64_t i; 36 | for (i = 0; i < 1000000; i++) { 37 | Name name = cactusDisk_getUniqueID(cactusDisk); 38 | char *cA = cactusMisc_nameToString(name); 39 | CuAssertTrue( 40 | testCase, 41 | cactusMisc_nameCompare( 42 | cactusMisc_stringToName(cA), name) 43 | == 0); 44 | free(cA); 45 | } 46 | cactusMiscTestTeardown(testCase); 47 | } 48 | 49 | static void testCactusCheck(CuTest* testCase) { 50 | //While we have an assert that fails in that function to provide a stack trace. 
51 | cactusCheck(1); 52 | stTry { 53 | cactusCheck(0); 54 | CuAssertTrue(testCase, 0); 55 | } stCatch(except) { 56 | st_logInfo("This is the message %s\n", stExcept_getMsg(except)); 57 | //stExcept_free(except); 58 | } stTryEnd 59 | 60 | cactusCheck2(1, "This shouldn't throw an exception: %s", "blah"); 61 | stTry { 62 | cactusCheck2(0, "This should throw an exception: %s", "blah"); 63 | CuAssertTrue(testCase, 0); 64 | } stCatch(except) { 65 | st_logInfo("This is the message: %s\n", stExcept_getMsg(except)); 66 | //stExcept_free(except); 67 | } stTryEnd 68 | } 69 | 70 | CuSuite* cactusMiscTestSuite(void) { 71 | CuSuite* suite = CuSuiteNew(); 72 | SUITE_ADD_TEST(suite, testCactusMisc_nameCompare); 73 | SUITE_ADD_TEST(suite, testCactusMisc_stringNameFns); 74 | SUITE_ADD_TEST(suite, testCactusCheck); 75 | return suite; 76 | } 77 | -------------------------------------------------------------------------------- /api/tests/cactusParamsTest.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #include "CuTest.h" 8 | #include "sonLib.h" 9 | #include "cactus_params_parser.h" 10 | 11 | static char *params_file = "./src/cactus/cactus_progressive_config.xml"; 12 | 13 | static void testCactusParams(CuTest *testCase) { 14 | CactusParams *p = cactusParams_load(params_file); 15 | 16 | const char *c = cactusParams_get_string(p, 3, "blast", "lastzArguments", "default"); 17 | CuAssertStrEquals(testCase, "--step=1 --ambiguous=iupac,100,100 --ydrop=3000 --queryhspbest=100000", c); 18 | 19 | int64_t i = cactusParams_get_int(p, 3, "bar", "pecan", "spanningTrees"); 20 | CuAssertIntEquals(testCase, 5, i); 21 | 22 | double d = cactusParams_get_float(p, 3, "bar", "poa", "partialOrderAlignmentBandFraction"); 23 | CuAssertDblEquals(testCase, 0.1, d, 0.000001); 24 | 25 | int64_t length; 26 | int64_t *l = 
cactusParams_get_ints(p, &length, 2, "caf", "deannealingRounds"); 27 | CuAssertTrue(testCase, length >= 3); 28 | CuAssertIntEquals(testCase, l[0], 2); 29 | CuAssertIntEquals(testCase, l[1], 32); 30 | CuAssertIntEquals(testCase, l[2], 256); 31 | 32 | // Test moving the root of the search 33 | cactusParams_set_root(p, 1, "caf"); 34 | 35 | i = cactusParams_get_int(p, 1, "trim"); 36 | CuAssertIntEquals(testCase, 3, i); 37 | 38 | // Check we can set it back 39 | cactusParams_set_root(p, 0); 40 | i = cactusParams_get_int(p, 3, "bar", "pecan", "spanningTrees"); 41 | CuAssertIntEquals(testCase, 5, i); 42 | 43 | cactusParams_destruct(p); // Cleanup 44 | free(l); 45 | } 46 | 47 | CuSuite* cactusParamsTestSuite(void) { 48 | CuSuite* suite = CuSuiteNew(); 49 | SUITE_ADD_TEST(suite, testCactusParams); 50 | return suite; 51 | } 52 | -------------------------------------------------------------------------------- /bar/Makefile: -------------------------------------------------------------------------------- 1 | rootPath = .. 
2 | include ${rootPath}/include.mk 3 | 4 | libSources = impl/*.c 5 | libHeaders = inc/*.h 6 | libTests = tests/adjacencySequencesTest.c tests/allTests.c tests/endAlignerTest.c tests/flowerAlignerTest.c tests/rescueTest.c tests/poaBarTest.c 7 | libRunEndAlignment = tests/runEndAlignment.c 8 | 9 | commonBarLibs = ${LIBDIR}/stCaf.a ${LIBDIR}/stPaf.a ${sonLibDir}/stPinchesAndCacti.a ${LIBDIR}/cactusLib.a ${sonLibDir}/3EdgeConnected.a ${sonLibDir}/cPecanLib.a 10 | stBarDependencies = ${commonBarLibs} ${LIBDEPENDS} 11 | LDLIBS += ${commonBarLibs} ${sonLibDir}/sonLib.a ${databaseLibs} -lm 12 | # simde (included via abPOA) doesn't compile with --Werror --pedantic 13 | CFLAGS:=$(filter-out --pedantic,$(CFLAGS)) 14 | 15 | all: all_libs all_progs 16 | all_libs: ${LIBDIR}/cactusBarLib.a 17 | all_progs: all_libs 18 | ${MAKE} ${BINDIR}/cactus_barTests 19 | 20 | clean : 21 | rm -f ${BINDIR}/cactus_barTests ${LIBDIR}/cactusBarLib.a *.o 22 | 23 | ${BINDIR}/cactus_barTests : ${libTests} tests/*.h ${LIBDIR}/cactusBarLib.a ${stBarDependencies} 24 | ${CC} ${CPPFLAGS} ${CFLAGS} ${LDFLAGS} -Wno-error -o ${BINDIR}/cactus_barTests ${libTests} ${LIBDIR}/cactusBarLib.a ${LDLIBS} 25 | 26 | ${LIBDIR}/cactusBarLib.a : ${libSources} ${libHeaders} ${stBarDependencies} 27 | # the -Wno-unused-function is required to include abpoa.h with CGL_DEBUG defined 28 | ${CC} ${CPPFLAGS} ${CFLAGS} -c ${libSources} -Wno-unused-function 29 | ${AR} rc cactusBarLib.a *.o 30 | ${RANLIB} cactusBarLib.a 31 | mv cactusBarLib.a ${LIBDIR}/ 32 | -------------------------------------------------------------------------------- /bar/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 4 | # 5 | #Released under the MIT license, see LICENSE.txt 6 | -------------------------------------------------------------------------------- /bar/impl/adjacencySequences.c: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | /* 8 | * getSequences.c 9 | * 10 | * Created on: 24 Jun 2010 11 | * Author: benedictpaten 12 | */ 13 | 14 | #include "adjacencySequences.h" 15 | 16 | /* 17 | * Gets the raw sequence. 18 | */ 19 | static char *getAdjacencySequenceP(Cap *cap, int64_t maxLength) { 20 | Sequence *sequence = cap_getSequence(cap); 21 | assert(sequence != NULL); 22 | Cap *cap2 = cap_getAdjacency(cap); 23 | assert(cap2 != NULL); 24 | assert(!cap_getSide(cap)); 25 | 26 | if (cap_getStrand(cap)) { 27 | int64_t length = cap_getCoordinate(cap2) - cap_getCoordinate(cap) - 1; 28 | assert(length >= 0); 29 | assert(maxLength >= 0); 30 | return sequence_getString(sequence, cap_getCoordinate(cap) + 1, length 31 | > maxLength ? maxLength : length, 1); 32 | } else { 33 | int64_t length = cap_getCoordinate(cap) - cap_getCoordinate(cap2) - 1; 34 | assert(length >= 0); 35 | return sequence_getString(sequence, 36 | length > maxLength ? cap_getCoordinate(cap) - maxLength 37 | : cap_getCoordinate(cap2) + 1, 38 | length > maxLength ? maxLength : length, 0); 39 | } 40 | } 41 | 42 | AdjacencySequence *adjacencySequence_construct(Cap *cap, int64_t maxLength) { 43 | AdjacencySequence *subSequence = (AdjacencySequence *) st_malloc( 44 | sizeof(AdjacencySequence)); 45 | subSequence->string = getAdjacencySequenceP(cap, maxLength); 46 | Cap *adjacentCap = cap_getAdjacency(cap); 47 | assert(adjacentCap != NULL); 48 | assert(!cap_getSide(cap)); 49 | assert(cap_getSequence(cap) != NULL); 50 | subSequence->subsequenceIdentifier = cap_getName(cap_getStrand(cap) ? cap : adjacentCap); 51 | subSequence->strand = cap_getStrand(cap); 52 | subSequence->start = cap_getCoordinate(cap) + (cap_getStrand(cap) ? 
1 : -1); 53 | subSequence->length = strlen(subSequence->string); 54 | subSequence->hasStubEnd = end_isFree(cap_getEnd(adjacentCap)) && end_isStubEnd(cap_getEnd(adjacentCap)); 55 | return subSequence; 56 | } 57 | 58 | void adjacencySequence_destruct(AdjacencySequence *subSequence) { 59 | free(subSequence->string); 60 | free(subSequence); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /bar/inc/adjacencySequences.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | /* 8 | * adjacencySequences.h 9 | * 10 | * Created on: 1 Jul 2010 11 | * Author: benedictpaten 12 | */ 13 | 14 | #ifndef ADJACENCYSEQUENCES_H_ 15 | #define ADJACENCYSEQUENCES_H_ 16 | 17 | #include "cactus.h" 18 | #include "sonLib.h" 19 | 20 | /* 21 | * Datastructure to hold adjacency sequence. 22 | */ 23 | typedef struct _AdjacencySequence { 24 | char *string; 25 | int64_t subsequenceIdentifier; 26 | bool strand; 27 | int64_t start; 28 | int64_t length; 29 | bool hasStubEnd; 30 | } AdjacencySequence; 31 | 32 | /* 33 | * Gets an adjacency sequence struct for the given adjacency from the cap. 34 | */ 35 | AdjacencySequence *adjacencySequence_construct(Cap *cap, int64_t maxLength); 36 | 37 | /* 38 | * Destructs the adjacency sequence. 
/*
 * One side of a pairwise aligned position.
 * NOTE(review): the field meanings below are inferred from the names and from the
 * alignedPair_construct signature - confirm against endAligner.c.
 */
typedef struct _AlignedPair {
    int64_t subsequenceIdentifier; // Identifier of the subsequence this position belongs to
    int64_t position; // Position of the aligned base
    bool strand; // Strand of the aligned base
    int64_t score; // Score attached to this side of the pair
    struct _AlignedPair *reverse; // Presumably the record for the other side of the same pair - TODO confirm
} AlignedPair;

/*
 * Constructs an aligned pair from the identifier, position and strand of each of the two
 * aligned positions, plus one score per side.
 * NOTE(review): presumably the returned pair and its reverse are linked through the
 * reverse field - confirm against the implementation.
 */
AlignedPair *alignedPair_construct(int64_t subsequenceIdentifier1, int64_t position1, bool strand1,
                                   int64_t subsequenceIdentifier2, int64_t position2, bool strand2, int64_t score1, int64_t score2);

/*
 * Destruct the aligned pair.
 */
void alignedPair_destruct(AlignedPair *alignedPair);

/*
 * Compares two aligned pairs.
 */
int alignedPair_cmpFn(const AlignedPair *alignedPair1, const AlignedPair *alignedPair2);
56 | */ 57 | void writeEndAlignmentToDisk(End *end, stSortedSet *endAlignment, FILE *fileHandle); 58 | 59 | /* 60 | * Loads an end alignment from the given file. 61 | */ 62 | stSortedSet *loadEndAlignmentFromDisk(Flower *flower, FILE *fileHandle, End **end); 63 | 64 | 65 | #endif /* ENDALIGNER_H_ */ 66 | -------------------------------------------------------------------------------- /bar/inc/flowerAligner.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | /* 8 | * flowerAligner.h 9 | * 10 | * Created on: 2 Jul 2010 11 | * Author: benedictpaten 12 | */ 13 | 14 | #ifndef FLOWER_ALIGNER_H_ 15 | #define FLOWER_ALIGNER_H_ 16 | 17 | #include "pairwiseAligner.h" 18 | 19 | /* 20 | * Constructs an alignment for the flower by constructing an alignment for each end 21 | * then filtering the alignments against each other so each position is a member of only one 22 | * end alignment. Spanning trees controls the number of pairwise alignments used 23 | * to construct the alignment, maxSequenceLength is the maximum length of a sequence to consider in the end alignment. 24 | * Model parameters is the parameters of the pairwise alignment model. 25 | */ 26 | stSortedSet *makeFlowerAlignment(StateMachine *sM, Flower *flower, int64_t spanningTrees, 27 | int64_t maxSequenceLength, bool useProgressiveMerging, float gapGamma, 28 | PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, bool pruneOutStubAlignments); 29 | 30 | /* 31 | * As above, but including alignments from disk. 
/*
 * Returns an end, if one exists, that has a cap involved in every adjacency, else returns NULL.
 */
End *getDominantEnd(Flower *flower);

/*
 * Ascertain which ends should be aligned separately.
 */
stSortedSet *getEndsToAlignSeparately(Flower *flower, int64_t maxSequenceLength, int64_t largeEndSize);

/*
 * The total number of unaligned bases in adjacencies incident with the end.
 */
int64_t getTotalAdjacencyLength(End *end);

#endif /* FLOWER_ALIGNER_H_ */
19 | void rescueCoveredRegions(stPinchThread *thread, bedRegion *beds, size_t numBeds, 20 | Name name, int64_t minSegmentLength, double coveredBasesThreshold); 21 | 22 | #endif // RESCUE_H_ 23 | -------------------------------------------------------------------------------- /bar/tests/allTests.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #include "CuTest.h" 8 | #include 9 | #include 10 | #include 11 | #include "sonLib.h" 12 | 13 | CuSuite* adjacencySequenceTestSuite(void); 14 | CuSuite* endAlignerTestSuite(void); 15 | CuSuite* flowerAlignerTestSuite(void); 16 | CuSuite* rescueTestSuite(void); 17 | CuSuite* poaBarAlignerTestSuite(void); 18 | 19 | int stBaseAlignerRunAllTests(void) { 20 | CuString *output = CuStringNew(); 21 | CuSuite* suite = CuSuiteNew(); 22 | CuSuiteAddSuite(suite, adjacencySequenceTestSuite()); 23 | CuSuiteAddSuite(suite, endAlignerTestSuite()); 24 | CuSuiteAddSuite(suite, flowerAlignerTestSuite()); 25 | CuSuiteAddSuite(suite, rescueTestSuite()); 26 | CuSuiteAddSuite(suite, poaBarAlignerTestSuite()); 27 | CuSuiteRun(suite); 28 | CuSuiteSummary(suite, output); 29 | CuSuiteDetails(suite, output); 30 | printf("%s\n", output->buffer); 31 | return suite->failCount > 0; 32 | } 33 | 34 | int main(int argc, char *argv[]) { 35 | if(argc == 2) { 36 | st_setLogLevelFromString(argv[1]); 37 | } 38 | return stBaseAlignerRunAllTests(); 39 | } 40 | -------------------------------------------------------------------------------- /build-tools/KegAlign.commit: -------------------------------------------------------------------------------- 1 | d13a20ec17180dfa52aa39af258f70ddea5434ee 2 | -------------------------------------------------------------------------------- /build-tools/downloadPhast: 
#!/bin/bash
# Download and statically build Phast, which is required for halPhyloP
# all binaries get copied into cactus/bin

# set this to one to make sure everything gets built statically (necessary for binary release)
# (read before "set -u" below, so the argument stays optional; defaults to 0 = no check)
STATIC_CHECK=${1:-0}

set -beEu -o pipefail
gitrel=85f7ed179dd097a86ba4added22d571785cc3e1d

binDir="$(pwd)/bin"

# works on MacOS and Linux
numcpu=$(getconf _NPROCESSORS_ONLN)

# hal expects phast as a sister directory, so we stick it there
submodulesDir="$(pwd)/submodules"
CWD="$(pwd)"

set -x

mkdir -p "${binDir}"

# build clapack
cd "${submodulesDir}"
rm -rf clapack
wget -q http://www.netlib.org/clapack/clapack.tgz
tar -xvzf clapack.tgz
mv CLAPACK-3.2.1 clapack
cd clapack
# One command per line: "set -e" ignores failures of every command in an
# "a && b && c" list except the last, so a chained build could fail silently.
cp make.inc.example make.inc
make -j "${numcpu}" f2clib
make -j "${numcpu}" blaslib
make -j "${numcpu}" lib
CLAPACKPATH="$(pwd)"
export CLAPACKPATH
cd ..

# build phast
cd "${submodulesDir}"
rm -rf phast
git clone https://github.com/CshlSiepelLab/phast.git
cd phast
git checkout "${gitrel}"
# hack in flags support
sed -i src/make-include.mk -e 's/CFLAGS =/CFLAGS +=/' -e 's/LIBS =/LIBS +=/'
# note: phast's makefile doesn't support -j (somehow this only came up when upgrading to ubuntu 22.04)
cd src
make

# copy over the binaries
for PHASTBIN in ../bin/*
do
    # when the static check is requested, refuse any binary that ldd reports shared objects for
    if [[ ${STATIC_CHECK} -ne 1 || $(ldd "${PHASTBIN}" | grep so | wc -l) -eq 0 ]]
    then
        cp "${PHASTBIN}" "${binDir}"
    else
        echo "error: ${PHASTBIN} is not statically linked" >&2
        exit 1
    fi
done

cd "${CWD}"

set +x
We don't want this to happen when doing docker build on hgwdev, 33 | # which oddly sets the HOSTNAME to hgwdev (but not in docker run), so we just 34 | # fake the hostname here if doesn't appear to be native hgwdev by 35 | # checking for the /hive file system... yuk 36 | 37 | if [ ! -e /hive ] ; then 38 | HOSTNAME=cactusbuild 39 | fi 40 | 41 | cd src/htslib 42 | make -j ${numcpu} HOSTNAME=${HOSTNAME} 43 | cd ../../src/lib 44 | make -j ${numcpu} HOSTNAME=${HOSTNAME} 45 | 46 | set +x 47 | -------------------------------------------------------------------------------- /build-tools/downloadVCFWave: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Download vcfwave. You can run this when installing the release binaries because vcfwave 3 | # isn't included in the release (because no static build) 4 | 5 | set -beEu -o pipefail 6 | 7 | # Note: this bit below should be kept consistent with downloadPangenomeTools 8 | pangenomeBuildDir=$(realpath -m build-pangenome-tools) 9 | binDir=$(pwd)/bin 10 | libDir=$(pwd)/lib 11 | # just use cactusRootPath for now 12 | dataDir=$(pwd)/src/cactus 13 | CWD=$(pwd) 14 | # works on MacOS and Linux 15 | if [ -z ${numcpu+x} ]; then 16 | numcpu=$(getconf _NPROCESSORS_ONLN) 17 | fi 18 | 19 | set -x 20 | rm -rf ${pangenomeBuildDir} 21 | mkdir -p ${pangenomeBuildDir} 22 | mkdir -p ${binDir} 23 | mkdir -p ${libDir} 24 | 25 | cd ${pangenomeBuildDir} 26 | git clone --recursive https://github.com/vcflib/vcflib.git 27 | cd vcflib 28 | git checkout 5bae713c06aae9f4d8f40447684c6e10352a9f41 29 | mkdir build 30 | cd build 31 | cmake -DZIG=OFF -DWFA_GITMODULE=ON -DCMAKE_BUILD_TYPE=Debug .. 32 | cmake --build . 
-- -j ${numcpu} 33 | mv vcfwave vcfcreatemulti vcfbreakmulti vcfuniq vcffixup ${binDir} 34 | mv ./contrib/WFA2-lib/libwfa2.so.0 ${libDir} 35 | 36 | cd ${CWD} 37 | rm -rf ${pangenomeBuildDir} 38 | -------------------------------------------------------------------------------- /build-tools/makeCpuDockerRelease: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Make a cpu-enabled docker image and push it to quay 3 | # Todo: it'd be nice to get travis to do this to be more consistent with normal docker 4 | # Note: must be run for cactus/ directory 5 | 6 | set -x 7 | set -beEu -o pipefail 8 | mydir=$(dirname $(which $0)) 9 | source ${mydir}/releaseLib.sh 10 | 11 | buildDir=$(realpath -m build) 12 | binBuildDir="${buildDir}/cpu-docker-tmp" 13 | 14 | set -x 15 | rm -rf ${binBuildDir} 16 | mkdir -p ${binBuildDir} 17 | cd ${binBuildDir} 18 | git clone --recursive https://github.com/ComparativeGenomicsToolkit/cactus.git 19 | cd cactus 20 | git fetch --tags origin 21 | 22 | REL_TAG=$(getLatestReleaseTag) 23 | git checkout "${REL_TAG}" 24 | git submodule update --init --recursive 25 | 26 | docker build . -f Dockerfile -t ${dockname}:${REL_TAG} 27 | docker tag ${dockname}:${REL_TAG} ${dockname}:latest 28 | 29 | read -p "Are you sure you want to push ${dockname}:${REL_TAG} to quay?" 
yn 30 | case $yn in 31 | [Yy]* ) docker push ${dockname}:${REL_TAG} && docker push ${dockname}:latest ; break;; 32 | [Nn]* ) exit;; 33 | * ) echo "Please answer yes or no.";; 34 | esac 35 | popd 36 | -------------------------------------------------------------------------------- /build-tools/makeGpuDockerRelease: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Make a gpu-enabled docker image and push it to quay 3 | # Todo: it'd be nice to get travis to do this to be more consistent with normal docker 4 | # Note: must be run for cactus/ directory 5 | 6 | set -x 7 | set -beEu -o pipefail 8 | mydir=$(dirname $(which $0)) 9 | source ${mydir}/releaseLib.sh 10 | 11 | buildDir=$(realpath -m build) 12 | binBuildDir="${buildDir}/gpu-docker-tmp" 13 | 14 | set -x 15 | rm -rf ${binBuildDir} 16 | mkdir -p ${binBuildDir} 17 | cd ${binBuildDir} 18 | git clone --recursive https://github.com/ComparativeGenomicsToolkit/cactus.git 19 | cd cactus 20 | git fetch --tags origin 21 | 22 | REL_TAG=$(getLatestReleaseTag) 23 | git checkout "${REL_TAG}" 24 | git submodule update --init --recursive 25 | 26 | CFLAGS="" CXXFLAGS="" docker build . -f Dockerfile.kegalign -t kegalign:local 27 | # switch the runtime image to kegalign, and the build image to the build image from Dockerfile.kegalign 28 | # important, if the build image in Dockerfile.segaling changes, the line below needs to be updated too 29 | sed '0,/FROM/! s/FROM.*/FROM kegalign:local/' Dockerfile | sed -e '0,/FROM/s/FROM.*/FROM nvidia\/cuda:11.7.1-devel-ubuntu22.04 as builder/g' > Dockerfile.gpu 30 | # enable gpu by default 31 | sed -i src/cactus/cactus_progressive_config.xml -e 's/gpu="0"/gpu="all"/g' -e 's/realign="1"/realign="0"/' 32 | docker build . 
-f Dockerfile.gpu -t ${dockname}:${REL_TAG}-gpu 33 | # disable it again 34 | sed -i src/cactus/cactus_progressive_config.xml -e 's/gpu="all"/gpu="0"/g' -e 's/realign="0"/realign="1"/' 35 | # Ask until we get a clear yes or no; nothing is pushed without an explicit yes. 36 | # (read fails on EOF, so under set -e a non-interactive run aborts instead of looping) 37 | while true; do 38 | read -p "Are you sure you want to push ${dockname}:${REL_TAG}-gpu to quay?" yn 39 | case $yn in 40 | [Yy]* ) docker push ${dockname}:${REL_TAG}-gpu; break;; 41 | [Nn]* ) exit;; 42 | * ) echo "Please answer yes or no.";; 43 | esac 44 | done 45 | -------------------------------------------------------------------------------- /build-tools/makeSrcRelease: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Generate release tar file of source 3 | # Must be run after tree is tagged and pushed to master. 4 | # Use --keep to keep working directory for debugging. 5 | 6 | mydir=$(dirname $(which $0)) 7 | source ${mydir}/releaseLib.sh 8 | # --keep is optional: default $1 to empty so the test below is well-formed when no argument is given 9 | keep=no 10 | if [ "${1:-}" = '--keep' ] ; then 11 | keep=yes 12 | fi 13 | set -beEu -o pipefail 14 | 15 | buildDir=$(realpath -m build) 16 | srcBuildDir="${buildDir}/src-tmp" 17 | 18 | set -x 19 | rm -rf ${srcBuildDir} 20 | mkdir -p ${srcBuildDir} 21 | cd ${srcBuildDir} 22 | git clone --recursive https://github.com/ComparativeGenomicsToolkit/cactus.git 23 | cd cactus 24 | git fetch --tags origin 25 | 26 | REL_TAG=$(getLatestReleaseTag) 27 | git checkout "${REL_TAG}" 28 | git submodule update --init --recursive 29 | find submodules -name ".git" -exec rm -Rf "{}" \; 30 | cd ..
31 | mv cactus cactus-${REL_TAG} 32 | tar -czf ${buildDir}/cactus-${REL_TAG}.tar.gz cactus-${REL_TAG} 33 | if [ "$keep" = "no" ] ; then 34 | rm -Rf ${srcBuildDir} 35 | fi 36 | -------------------------------------------------------------------------------- /build-tools/quayTagRelease: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # tag the docker image at quay.io corresponding to the release 3 | 4 | set -x 5 | set -beEu -o pipefail 6 | mydir=$(dirname $(which $0)) 7 | source ${mydir}/releaseLib.sh 8 | 9 | REL_TAG=$(getLatestReleaseTag) 10 | REL_COMMIT=$(git rev-list -n 1 ${REL_TAG}) 11 | 12 | docker image tag ${dockname}:${REL_COMMIT} ${dockname}:${REL_TAG} 13 | -------------------------------------------------------------------------------- /build-tools/releaseLib.sh: -------------------------------------------------------------------------------- 1 | # definitions and functions for release bash programs 2 | 3 | PYTHON=python3.6 4 | PIP="${PYTHON} -m pip" 5 | 6 | dockstore="quay.io/comparative-genomics-toolkit" 7 | dockname=${dockstore}/cactus 8 | 9 | # describes up to 10 commits reachable from any tag and prints the first name that is exactly vX.Y.Z 10 | # get the tag for the latest release, in the form v1.2.3, from git 11 | getLatestReleaseTag() { 12 | git describe --tags $(git rev-list --tags --max-count=10) | grep -E -e '^v[0-9]+\.[0-9]+\.[0-9]+$' | head -n 1 13 | } 14 | 15 | -------------------------------------------------------------------------------- /caf/Makefile: -------------------------------------------------------------------------------- 1 | rootPath = ..
2 | include ${rootPath}/include.mk 3 | 4 | libSources = impl/*.c 5 | libHeaders = inc/*.h 6 | libTests = tests/*.c 7 | 8 | commonCafLibs = ${sonLibDir}/stPinchesAndCacti.a ${sonLibDir}/3EdgeConnected.a ${LIBDIR}/cactusLib.a ${LIBDIR}/stPaf.a 9 | stCafDependencies = ${commonCafLibs} ${LIBDEPENDS} 10 | stCafLibs = ${commonCafLibs} ${LDLIBS} 11 | 12 | all: all_libs all_progs 13 | all_libs: ${LIBDIR}/stCaf.a 14 | all_progs: all_libs 15 | ${MAKE} ${BINDIR}/stCafTests 16 | 17 | ${LIBDIR}/stCaf.a : ${libSources} ${libHeaders} ${stCafDependencies} 18 | ${CC} ${CPPFLAGS} ${CFLAGS} ${LDFLAGS} -c ${libSources} 19 | ${AR} rc stCaf.a *.o 20 | ${RANLIB} stCaf.a 21 | mv stCaf.a ${LIBDIR}/ 22 | 23 | ${BINDIR}/stCafTests : ${libTests} ${LIBDIR}/stCaf.a ${stCafDependencies} 24 | ${CC} ${CPPFLAGS} ${CFLAGS} ${LDFLAGS} -o ${BINDIR}/stCafTests ${libTests} ${libSources} ${LIBDIR}/stCaf.a ${stCafLibs} ${LDLIBS} 25 | 26 | clean : 27 | rm -f *.o 28 | rm -f ${LIBDIR}/stCaf.a ${BINDIR}/stCafTests 29 | 30 | -------------------------------------------------------------------------------- /caf/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 4 | # 5 | #Released under the MIT license, see LICENSE.txt 6 | -------------------------------------------------------------------------------- /caf/impl/addAdjacencies.c: -------------------------------------------------------------------------------- 1 | #include "cactus.h" 2 | #include "sonLib.h" 3 | 4 | static int addAdjacenciesPP(Cap *cap1, Cap *cap2) { 5 | assert(cap_getStrand(cap1) && cap_getStrand(cap2)); 6 | Sequence *sequence1 = cap_getSequence(cap1); 7 | Sequence *sequence2 = cap_getSequence(cap2); 8 | int64_t i = cactusMisc_nameCompare(sequence_getName(sequence1), sequence_getName(sequence2)); 9 | if (i == 0) { 10 | int64_t j = cap_getCoordinate(cap1); 11 | int64_t k = cap_getCoordinate(cap2); 12 | 
i = j > k ? 1 : (j < k ? -1 : 0); 13 | if (i == 0) { 14 | assert(cap_getSegment(cap1) == cap_getSegment(cap2)); 15 | j = cap_getSide(cap1); 16 | k = cap_getSide(cap2); 17 | assert((j && !k) || (!j && k)); 18 | i = j ? -1 : 1; 19 | } 20 | } 21 | return i; 22 | } 23 | 24 | void stCaf_addAdjacencies(Flower *flower) { 25 | //Build a list of caps. 26 | stList *list = stList_construct(); 27 | Flower_EndIterator *endIterator = flower_getEndIterator(flower); 28 | End *end; 29 | while ((end = flower_getNextEnd(endIterator)) != NULL) { 30 | End_InstanceIterator *instanceIterator = end_getInstanceIterator(end); 31 | Cap *cap; 32 | while ((cap = end_getNext(instanceIterator)) != NULL) { 33 | if (!cap_getStrand(cap)) { 34 | cap = cap_getReverse(cap); 35 | } 36 | stList_append(list, cap); 37 | } 38 | end_destructInstanceIterator(instanceIterator); 39 | } 40 | flower_destructEndIterator(endIterator); 41 | assert(stList_length(list) % 2 == 0); 42 | //Sort the list of caps. 43 | stList_sort(list, (int(*)(const void *, const void *)) addAdjacenciesPP); 44 | //Now make the adjacencies. 45 | for (int64_t i = 1; i < stList_length(list); i += 2) { 46 | Cap *cap = stList_get(list, i - 1); 47 | Cap *cap2 = stList_get(list, i); 48 | cap_makeAdjacent(cap, cap2); 49 | } 50 | //Clean up. 51 | stList_destruct(list); 52 | } 53 | -------------------------------------------------------------------------------- /caf/inc/stGiantComponent.h: -------------------------------------------------------------------------------- 1 | /* 2 | * giantComponent.h 3 | * 4 | * Created on: 22 Feb 2012 5 | * Author: benedictpaten 6 | */ 7 | 8 | #ifndef ST_GIANTCOMPONENT_H_ 9 | #define ST_GIANTCOMPONENT_H_ 10 | 11 | #include "sonLib.h" 12 | #include "stPinchGraphs.h" 13 | 14 | /* 15 | * Nodes is a list of integers representing the nodes. 16 | * Each edge is represented as an int tuple (weight, vertex1, vertex2). 
17 | * Returns a sublist of the edges in edges that must be deleted, so that the size of the largest component in the graph 18 | * is smaller than maxComponentSize. 19 | */ 20 | stList *stCaf_breakupComponentGreedily(stList *nodes, stList *edges, int64_t maxComponentSize); 21 | 22 | /* 23 | * Break up extra large components greedily. 24 | */ 25 | void stCaf_breakupComponentsGreedily(stPinchThreadSet *threadSet, float maximumAdjacencyComponentSizeRatio); 26 | 27 | #endif /* ST_GIANTCOMPONENT_H_ */ 28 | -------------------------------------------------------------------------------- /caf/inc/stPinchIterator.h: -------------------------------------------------------------------------------- 1 | /* 2 | * stPinchIterator.h 3 | * 4 | * Created on: 21 Mar 2012 5 | * Author: benedictpaten 6 | */ 7 | 8 | #ifndef ST_PINCH_ITERATOR_H_ 9 | #define ST_PINCH_ITERATOR_H_ 10 | 11 | #include "sonLib.h" 12 | #include "stPinchGraphs.h" 13 | 14 | typedef struct _stPinchIterator { 15 | int64_t alignmentTrim; 16 | void *alignmentArg; 17 | stPinch *(*getNextAlignment)(void *, stPinch *); 18 | void *(*startAlignmentStack)(void *); 19 | void (*destructAlignmentArg)(void *); 20 | } stPinchIterator; 21 | 22 | /* 23 | * Get next alignment from iterator. pinchToFillOut is filled out and returned. A NULL return value indicates 24 | * there are no further pinches 25 | */ 26 | stPinch *stPinchIterator_getNext(stPinchIterator *stPinchIterator, stPinch *pinchToFillOut); 27 | 28 | /* 29 | * Reset the iterator, returning again to the beginning of the sequence. 30 | */ 31 | void stPinchIterator_reset( 32 | stPinchIterator *stPinchIterator); 33 | 34 | /* 35 | * Cleanup the iterator 36 | */ 37 | void stPinchIterator_destruct( 38 | stPinchIterator *stPinchIterator); 39 | 40 | /* 41 | * Get a pairwise alignment iterator from a file. 42 | */ 43 | stPinchIterator *stPinchIterator_constructFromFile(const char *alignmentFile); 44 | 45 | /* 46 | * Constructs iterator from aligned pairs.
47 | */ 48 | stPinchIterator *stPinchIterator_constructFromAlignedPairs(stSortedSet *alignedPairs, 49 | stPinch *(*getNextAlignedPairAlignment)(stSortedSetIterator *, stPinch *)); 50 | 51 | /* 52 | * Sets the amount to trim from the ends of each pinch in bases. 53 | */ 54 | void stPinchIterator_setTrim(stPinchIterator *pinchIterator, int64_t alignmentTrim); 55 | 56 | #endif /* ST_PINCH_ITERATOR_H_ */ 57 | -------------------------------------------------------------------------------- /caf/tests/allTests.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #include "CuTest.h" 8 | #include "sonLib.h" 9 | 10 | CuSuite* annealingTestSuite(void); 11 | CuSuite* giantComponentTestSuite(void); 12 | CuSuite* pinchIteratorTestSuite(void); 13 | CuSuite* recoverableChainsTestSuite(void); 14 | CuSuite* phylogenyTestSuite(void); 15 | CuSuite* filteringTestSuite(void); 16 | 17 | int cactusCoreRunAllTests(void) { 18 | CuString *output = CuStringNew(); 19 | CuSuite* suite = CuSuiteNew(); 20 | CuSuiteAddSuite(suite, annealingTestSuite()); 21 | CuSuiteAddSuite(suite, pinchIteratorTestSuite()); 22 | CuSuiteAddSuite(suite, giantComponentTestSuite()); 23 | CuSuiteAddSuite(suite, recoverableChainsTestSuite()); 24 | CuSuiteAddSuite(suite, phylogenyTestSuite()); 25 | CuSuiteAddSuite(suite, filteringTestSuite()); 26 | 27 | CuSuiteRun(suite); 28 | CuSuiteSummary(suite, output); 29 | CuSuiteDetails(suite, output); 30 | printf("%s\n", output->buffer); 31 | return suite->failCount > 0; 32 | } 33 | 34 | int main(int argc, char *argv[]) { 35 | if(argc == 2) { 36 | st_setLogLevelFromString(argv[1]); 37 | } 38 | int i = cactusCoreRunAllTests(); 39 | //while(1); 40 | return i; 41 | } 42 | -------------------------------------------------------------------------------- /caf/tests/annealingTest.c: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #include "CuTest.h" 8 | #include "sonLib.h" 9 | #include "stCaf.h" 10 | #include "stPinchGraphs.h" 11 | 12 | void stCaf_anneal2(stPinchThreadSet *threadSet, stPinch *(*pinchIterator)(void *), void *extraArg); 13 | 14 | void stCaf_annealBetweenAdjacencyComponents2(stPinchThreadSet *threadSet, stPinch *(*pinchIterator)(void *), 15 | void *extraArg, bool (*filterFn)(stPinchSegment *, stPinchSegment *)); 16 | 17 | static stPinch *randomPinch(void *extraArg) { 18 | if(st_random() < 0.01) { 19 | return NULL; 20 | } 21 | static stPinch pinch; 22 | pinch = stPinchThreadSet_getRandomPinch(extraArg); 23 | return &pinch; 24 | } 25 | 26 | static void testAnnealing(CuTest *testCase) { 27 | //return; 28 | for (int64_t test = 0; test < 100; test++) { 29 | st_logInfo("Starting annealing random test %" PRIi64 "\n", test); 30 | stPinchThreadSet *threadSet = stPinchThreadSet_getRandomEmptyGraph(); 31 | stCaf_anneal2(threadSet, randomPinch, threadSet); 32 | } 33 | } 34 | 35 | static void testAnnealingBetweenAdjacencyComponents(CuTest *testCase) { 36 | //return; 37 | for (int64_t test = 0; test < 100; test++) { 38 | st_logInfo("Starting annealing between adjacency components random test %" PRIi64 "\n", test); 39 | stPinchThreadSet *threadSet = stPinchThreadSet_getRandomGraph(); 40 | stCaf_annealBetweenAdjacencyComponents2(threadSet, randomPinch, threadSet, NULL); 41 | } 42 | } 43 | 44 | CuSuite* annealingTestSuite(void) { 45 | CuSuite* suite = CuSuiteNew(); 46 | SUITE_ADD_TEST(suite, testAnnealing); 47 | SUITE_ADD_TEST(suite, testAnnealingBetweenAdjacencyComponents); 48 | return suite; 49 | } 50 | -------------------------------------------------------------------------------- /conftest.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def pytest_addoption(parser): 5 | parser.addoption( 6 | "--suite", default="all", choices=["blast", "nonblast", "all"], help="test suite to run" 7 | ) 8 | 9 | def pytest_collection_modifyitems(config, items): 10 | suite = config.getoption("--suite") 11 | if suite == "all": 12 | # Don't skip any tests 13 | return 14 | skip = pytest.mark.skip(reason="skipping non-selected suite") 15 | for item in items: 16 | if suite != "blast" and "blast" in item.keywords: 17 | item.add_marker(skip) 18 | if suite == "blast" and "blast" not in item.keywords: 19 | item.add_marker(skip) 20 | -------------------------------------------------------------------------------- /doc/INSTALL.txt: -------------------------------------------------------------------------------- 1 | Installing Cactus. 2 | 3 | (1) Download and install sonLib. See https://github.com/benedictpaten/sonLib 4 | 5 | (2) Download and install pinchesAndCacti. See https://github.com/benedictpaten/pinchesAndCacti 6 | 7 | (3) Download and install matchingAndOrdering. See https://github.com/benedictpaten/matchingAndOrdering 8 | 9 | (4) Download and install jobTree. See https://github.com/benedictpaten/jobTree 10 | 11 | (5) Download and install tokyo cabinet. See http://fallabs.com/tokyocabinet/ 12 | 13 | (6) [optional, but strongly advised] Download and install kyoto tycoon. See http://fallabs.com/kyototycoon/ 14 | 15 | (7) Download and install lastz. See http://www.bx.psu.edu/~rsharris/lastz/ 16 | NOTE, to use cactus_lastzRepeatMaxk.py preprocessor, you must instead: 17 | Download and install LATEST lastz. 
See http://www.bx.psu.edu/~rsharris/lastz/newer/ 18 | Add lastz's /tools directory to your path (only required for repeatmasking) 19 | 20 | (8) Install networkx and psutil python packages: 21 | easy_install networkx 22 | easy_install psutil 23 | 24 | (9) Place the directory containing Cactus on your python path, i.e. 25 | PYTHONPATH=${PYTHONPATH}:FOO 26 | where FOO/cactus is the path to the base directory of Cactus. 27 | 28 | (10) Compile the C code: 29 | Modify the include.mk file to point at where you installed sonLib. 30 | In cactus type 'make all' 31 | 32 | (11) [optional, but required if you want all tests to pass] Download the dataset 33 | http://dl.dropbox.com/u/156669/datasets.tar.bz2 and create an environment variable 34 | SON_TRACE_DATASETS to point at it. 35 | 36 | (12) Run python allTests.py to run the set of tests to check the installation 37 | 38 | See https://github.com/benedictpaten/ for links to the dependencies mentioned. 39 | 40 | 41 | -------------------------------------------------------------------------------- /doc/README.pages: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/doc/README.pages -------------------------------------------------------------------------------- /doc/README.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/doc/README.pdf -------------------------------------------------------------------------------- /doc/add-genome-fig-github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/doc/add-genome-fig-github.png -------------------------------------------------------------------------------- 
/doc/grch38-alt-pg-lrc_kir.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/doc/grch38-alt-pg-lrc_kir.png -------------------------------------------------------------------------------- /doc/grch38-alt-pg-mhc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/doc/grch38-alt-pg-mhc.png -------------------------------------------------------------------------------- /doc/mc-pangenomes/10-chicken-pg-2022-09-23-seqfile.txt: -------------------------------------------------------------------------------- 1 | galGal6 https://hgdownload.soe.ucsc.edu/goldenPath/galGal6/bigZips/galGal6.fa.gz 2 | bGalGal1_mat_broiler_GRCg7b.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/016/699/485/GCF_016699485.2_bGalGal1.mat.broiler.GRCg7b/GCF_016699485.2_bGalGal1.mat.broiler.GRCg7b_genomic.fna.gz 3 | bGalGal1_pat_whiteleghornlayer_GRCg7w_W.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/016/700/215/GCF_016700215.2_bGalGal1.pat.whiteleghornlayer.GRCg7w/GCF_016700215.2_bGalGal1.pat.whiteleghornlayer.GRCg7w_genomic.fna.gz 4 | Ogye1_0.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/002/798/355/GCA_002798355.1_Ogye1.0/GCA_002798355.1_Ogye1.0_genomic.fna.gz 5 | ASM2465302v1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/024/653/025/GCA_024653025.1_ASM2465302v1/GCA_024653025.1_ASM2465302v1_genomic.fna.gz 6 | ASM2465303v1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/024/653/035/GCA_024653035.1_ASM2465303v1/GCA_024653035.1_ASM2465303v1_genomic.fna.gz 7 | ASM2465299v1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/024/652/995/GCA_024652995.1_ASM2465299v1/GCA_024652995.1_ASM2465299v1_genomic.fna.gz 8 | ASM2465304v1.0 
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/024/653/045/GCA_024653045.1_ASM2465304v1/GCA_024653045.1_ASM2465304v1_genomic.fna.gz 9 | ASM2465298v1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/024/652/985/GCA_024652985.1_ASM2465298v1/GCA_024652985.1_ASM2465298v1_genomic.fna.gz 10 | ASM2420605v1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/024/206/055/GCA_024206055.1_ASM2420605v1/GCA_024206055.1_ASM2420605v1_genomic.fna.gz 11 | -------------------------------------------------------------------------------- /doc/mc-pangenomes/10-chicken-pg-2023-06-27-commands.md: -------------------------------------------------------------------------------- 1 | This is the same input data as "10-chicken-pg-2022-09-23", just run with the current version of Cactus on SLURM. 2 | 3 | Cactus version: v2.6.1 4 | 5 | Note: `--consMemory 256Gi` is very conservative and could be lowered by at least half (see memory usage in `10-chicken-pg-2023-06-27.log`). But unfortunately `--consMemory` is required to run this data on slurm, as some of the smaller chromosomes, e.g. chr30, use more memory than Cactus estimates based on their tiny size. If not running on slurm, this won't be an issue. 6 | 7 | ``` 8 | cactus-pangenome ./js ./10-chicken-pg-2022-09-23-seqfile.txt --outName 10-chicken-pg-2023-06-7 --outDir 10-chicken-pg-2023-06-27 --reference galGal6 --batchSystem slurm --indexCores 64 --consCores 64 --mgCores 64 --mapCores 8 --logFile 10-chicken-pg-2023-06-27.log --maxLocalJobs 1000 --gbz --gfa --vcf --giraffe --chrom-og --chrom-vg --viz --consMemory 256Gi 9 | ``` 10 | 11 | Oops!
(gaf file extension not added due to bug in this release) 12 | ``` 13 | mv 10-chicken-pg-2023-06-27/10-chicken-pg-2023-06-27 10-chicken-pg-2023-06-27/10-chicken-pg-2023-06-27.gaf.gz 14 | ``` 15 | -------------------------------------------------------------------------------- /doc/mc-pangenomes/10-chicken-pg-2023-06-27-seqfile.txt: -------------------------------------------------------------------------------- 1 | galGal6 https://hgdownload.soe.ucsc.edu/goldenPath/galGal6/bigZips/galGal6.fa.gz 2 | bGalGal1_mat_broiler_GRCg7b.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/016/699/485/GCF_016699485.2_bGalGal1.mat.broiler.GRCg7b/GCF_016699485.2_bGalGal1.mat.broiler.GRCg7b_genomic.fna.gz 3 | bGalGal1_pat_whiteleghornlayer_GRCg7w_W.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/016/700/215/GCF_016700215.2_bGalGal1.pat.whiteleghornlayer.GRCg7w/GCF_016700215.2_bGalGal1.pat.whiteleghornlayer.GRCg7w_genomic.fna.gz 4 | Ogye1_0.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/002/798/355/GCA_002798355.1_Ogye1.0/GCA_002798355.1_Ogye1.0_genomic.fna.gz 5 | ASM2465302v1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/024/653/025/GCA_024653025.1_ASM2465302v1/GCA_024653025.1_ASM2465302v1_genomic.fna.gz 6 | ASM2465303v1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/024/653/035/GCA_024653035.1_ASM2465303v1/GCA_024653035.1_ASM2465303v1_genomic.fna.gz 7 | ASM2465299v1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/024/652/995/GCA_024652995.1_ASM2465299v1/GCA_024652995.1_ASM2465299v1_genomic.fna.gz 8 | ASM2465304v1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/024/653/045/GCA_024653045.1_ASM2465304v1/GCA_024653045.1_ASM2465304v1_genomic.fna.gz 9 | ASM2465298v1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/024/652/985/GCA_024652985.1_ASM2465298v1/GCA_024652985.1_ASM2465298v1_genomic.fna.gz 10 | ASM2420605v1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/024/206/055/GCA_024206055.1_ASM2420605v1/GCA_024206055.1_ASM2420605v1_genomic.fna.gz 11 | 
-------------------------------------------------------------------------------- /doc/mc-pangenomes/10-t2t-apes-mc-2023v2.seqfile.txt: -------------------------------------------------------------------------------- 1 | hs1 https://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.fa.gz 2 | hg38 https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/analysisSet/hg38.analysisSet.fa.gz 3 | hg002.1 https://hgdownload.soe.ucsc.edu/hubs/GCA/018/852/605/GCA_018852605.2/GCA_018852605.2.fa.gz 4 | hg002.2 https://hgdownload.soe.ucsc.edu/hubs/GCA/018/852/615/GCA_018852615.2/GCA_018852615.2.fa.gz 5 | mPanTro3.1 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/858/775/GCA_028858775.2_NHGRI_mPanTro3-v2.0_pri/GCA_028858775.2_NHGRI_mPanTro3-v2.0_pri_genomic.fna.gz 6 | mPanTro3.2 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/858/805/GCA_028858805.2_NHGRI_mPanTro3-v2.0_alt/GCA_028858805.2_NHGRI_mPanTro3-v2.0_alt_genomic.fna.gz 7 | mPanPan1.1 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/858/825/GCA_028858825.2_NHGRI_mPanPan1-v2.0_pat/GCA_028858825.2_NHGRI_mPanPan1-v2.0_pat_genomic.fna.gz 8 | mPanPan1.2 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/858/845/GCA_028858845.2_NHGRI_mPanPan1-v2.0_mat/GCA_028858845.2_NHGRI_mPanPan1-v2.0_mat_genomic.fna.gz 9 | mGorGor1.1 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/885/475/GCA_028885475.2_NHGRI_mGorGor1-v2.0_pat/GCA_028885475.2_NHGRI_mGorGor1-v2.0_pat_genomic.fna.gz 10 | mGorGor1.2 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/885/495/GCA_028885495.2_NHGRI_mGorGor1-v2.0_mat/GCA_028885495.2_NHGRI_mGorGor1-v2.0_mat_genomic.fna.gz 11 | -------------------------------------------------------------------------------- /doc/mc-pangenomes/16-fly-pg-2022-05-26-seqfile.txt: -------------------------------------------------------------------------------- 1 | dm6 genomes_3_softmask/dm6.fa.masked 2 | A1.0 genomes_3_softmask/a1.scaffold.fasta.masked 3 | A2.0 genomes_3_softmask/a2.scaffold.fasta.masked 4 | A3.0 
genomes_3_softmask/a3.scaffold.fasta.masked 5 | A4.0 genomes_3_softmask/a4.scaffold.fasta.masked 6 | A5.0 genomes_3_softmask/a5.scaffold.fasta.masked 7 | A6.0 genomes_3_softmask/a6.scaffold.fasta.masked 8 | A7.0 genomes_3_softmask/a7.scaffold.fasta.masked 9 | AB8.0 genomes_3_softmask/ab8.scaffold.fasta.masked 10 | B1.0 genomes_3_softmask/b1.scaffold.fasta.masked 11 | B2.0 genomes_3_softmask/b2.scaffold.fasta.masked 12 | B3.0 genomes_3_softmask/b3.scaffold.fasta.masked 13 | B4.0 genomes_3_softmask/b4.scaffold.fasta.masked 14 | B6.0 genomes_3_softmask/b6.scaffold.fasta.masked 15 | B7.0 genomes_3_softmask/b7.scaffold.fasta.masked 16 | OreR.0 genomes_3_softmask/ore.scaffold.fasta.masked 17 | -------------------------------------------------------------------------------- /doc/mc-pangenomes/16-fly-pg-2023-08-25-commands.md: -------------------------------------------------------------------------------- 1 | # Steps to reproduce 16-fly-pg-2023-08-25 2 | 3 | First, pull in the input assemblies that we used for the pangenome in the mc paper 4 | ``` 5 | wget https://s3-us-west-2.amazonaws.com/human-pangenomics/publications/mc_2022/mc_pangenomes/16-fruitfly-mc-2022-05-26/16-fly-softmasked-fa.tar.gz 6 | tar zxf 16-fly-softmasked-fa.tar.gz 7 | ``` 8 | 9 | Make and index the pangenome with Cactus commit 1f537de69a9b7cb2c6f46ecc048023f92f911ed2 on slurm (which contains some fixes that will appear in v2.6.8) 10 | 11 | Notable options: 12 | * `--permissiveContigFilter` to help assign tiny contigs to reference chromosome 13 | * `--haplo` to make the new subsampling index (.hapl) for giraffe, which obviates the need for the filter (.d2) graph 14 | 15 | 16 | ``` 17 | cactus-pangenome ./js ./16-fly-pg-2023-08-25-seqfile.txt --outDir 16-fly-pg-2023-08-25 --outName 16-fly-pg-2023-08-25 --reference dm6 --giraffe clip filter --gbz clip filter full --gfa clip filter full --vcf --permissiveContigFilter --haplo --chrom-vg clip filter --chrom-og full --viz --consCores 32 --indexCores 32 --mgCores
64 --mapCores 8 --batchSystem slurm --logFile 16-fly-pg-2023-08-25.log 2> 16-fly-pg-2023-08-25.stderr 18 | ``` 19 | 20 | -------------------------------------------------------------------------------- /doc/mc-pangenomes/16-fly-pg-2023-08-25-seqfile.txt: -------------------------------------------------------------------------------- 1 | dm6 genomes_3_softmask/dm6.fa.masked 2 | A1 genomes_3_softmask/a1.scaffold.fasta.masked 3 | A2 genomes_3_softmask/a2.scaffold.fasta.masked 4 | A3 genomes_3_softmask/a3.scaffold.fasta.masked 5 | A4 genomes_3_softmask/a4.scaffold.fasta.masked 6 | A5 genomes_3_softmask/a5.scaffold.fasta.masked 7 | A6 genomes_3_softmask/a6.scaffold.fasta.masked 8 | A7 genomes_3_softmask/a7.scaffold.fasta.masked 9 | AB8 genomes_3_softmask/ab8.scaffold.fasta.masked 10 | B1 genomes_3_softmask/b1.scaffold.fasta.masked 11 | B2 genomes_3_softmask/b2.scaffold.fasta.masked 12 | B3 genomes_3_softmask/b3.scaffold.fasta.masked 13 | B4 genomes_3_softmask/b4.scaffold.fasta.masked 14 | B6 genomes_3_softmask/b6.scaffold.fasta.masked 15 | B7 genomes_3_softmask/b7.scaffold.fasta.masked 16 | OreR genomes_3_softmask/ore.scaffold.fasta.masked 17 | -------------------------------------------------------------------------------- /doc/mc-pangenomes/17-soybean-pg-2022-09-26-seqfile.txt: -------------------------------------------------------------------------------- 1 | Glycine_max_v4_0 s3://vg-k8s/users/hickey/soybean-pangenome/GCF_000004515.6.ucsc.fa.gz 2 | JD17_chr_final.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/021/733/175/GCA_021733175.1_JD17_chr_final/GCA_021733175.1_JD17_chr_final_genomic.fna.gz 3 | Gmax_ZH13_v2_0.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003/349/995/GCA_003349995.2_Gmax_ZH13_v2.0/GCA_003349995.2_Gmax_ZH13_v2.0_genomic.fna.gz 4 | ASM2049715v1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/020/497/155/GCA_020497155.1_ASM2049715v1/GCA_020497155.1_ASM2049715v1_genomic.fna.gz 5 | PI594527_PLATINUM_CHROMOSOMES_fasta.0 
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/019/321/705/GCA_019321705.1_PI594527.PLATINUM.CHROMOSOMES.fasta/GCA_019321705.1_PI594527.PLATINUM.CHROMOSOMES.fasta_genomic.fna.gz 6 | WHFS_GmHX3_1_0.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/014/282/145/GCA_014282145.1_WHFS_GmHX3_1.0/GCA_014282145.1_WHFS_GmHX3_1.0_genomic.fna.gz 7 | GmBRS537_a1_v2.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/012/273/815/GCA_012273815.2_GmBRS537.a1.v2/GCA_012273815.2_GmBRS537.a1.v2_genomic.fna.gz 8 | WHFS_GmZH13_1_0.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/014/282/085/GCA_014282085.1_WHFS_GmZH13_1.0/GCA_014282085.1_WHFS_GmZH13_1.0_genomic.fna.gz 9 | WHFS_GmJY_1_0.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/014/282/095/GCA_014282095.1_WHFS_GmJY_1.0/GCA_014282095.1_WHFS_GmJY_1.0_genomic.fna.gz 10 | Tianlong1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/015/227/745/GCA_015227745.1_Tianlong1/GCA_015227745.1_Tianlong1_genomic.fna.gz 11 | WHFS_GmHF25_1_0.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/014/282/065/GCA_014282065.1_WHFS_GmHF25_1.0/GCA_014282065.1_WHFS_GmHF25_1.0_genomic.fna.gz 12 | ASM2211499v1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/022/114/995/GCA_022114995.1_ASM2211499v1/GCA_022114995.1_ASM2211499v1_genomic.fna.gz 13 | WHFS_GmW82_1_0.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/014/282/185/GCA_014282185.1_WHFS_GmW82_1.0/GCA_014282185.1_WHFS_GmW82_1.0_genomic.fna.gz 14 | WHFS_GmWF7_1_0.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/014/282/035/GCA_014282035.1_WHFS_GmWF7_1.0/GCA_014282035.1_WHFS_GmWF7_1.0_genomic.fna.gz 15 | WHFS_GmZH35_1_0.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/014/282/075/GCA_014282075.1_WHFS_GmZH35_1.0/GCA_014282075.1_WHFS_GmZH35_1.0_genomic.fna.gz 16 | glyma_Lee_gnm1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/002/905/335/GCA_002905335.2_glyma.Lee.gnm1/GCA_002905335.2_glyma.Lee.gnm1_genomic.fna.gz 17 | WHFS_GsojaF_1_0.0 
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/014/282/345/GCA_014282345.1_WHFS_GsojaF_1.0/GCA_014282345.1_WHFS_GsojaF_1.0_genomic.fna.gz 18 | -------------------------------------------------------------------------------- /doc/mc-pangenomes/4-t2t-orangs-mc-2023v2.seqfile.txt: -------------------------------------------------------------------------------- 1 | mPonAbe1_pri https://hgdownload.soe.ucsc.edu/hubs/GCA/028/885/655/GCA_028885655.2/GCA_028885655.2.fa.gz 2 | mPonAbe1_alt https://hgdownload.soe.ucsc.edu/hubs/GCA/028/885/685/GCA_028885685.2/GCA_028885685.2.fa.gz 3 | mPonPyg2.1 https://hgdownload.soe.ucsc.edu/hubs/GCA/028/885/625/GCA_028885625.2/GCA_028885625.2.fa.gz 4 | mPonPyg2.2 https://hgdownload.soe.ucsc.edu/hubs/GCA/028/885/525/GCA_028885525.2/GCA_028885525.2.fa.gz 5 | 6 | -------------------------------------------------------------------------------- /doc/mc-pangenomes/5-cow-pg-2022-09-22-seqfile.txt: -------------------------------------------------------------------------------- 1 | bosTau9 https://hgdownload.soe.ucsc.edu/goldenPath/bosTau9/bigZips/bosTau9.fa.gz 2 | ARS-LIC_NZ_Jersey.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/021/234/555/GCA_021234555.1_ARS-LIC_NZ_Jersey/GCA_021234555.1_ARS-LIC_NZ_Jersey_genomic.fna.gz 3 | ROSLIN_BTI_ANK1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/905/123/885/GCA_905123885.1_ROSLIN_BTI_ANK1/GCA_905123885.1_ROSLIN_BTI_ANK1_genomic.fna.gz 4 | ROSLIN_BTT_NDA1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/905/123/515/GCA_905123515.1_ROSLIN_BTT_NDA1/GCA_905123515.1_ROSLIN_BTT_NDA1_genomic.fna.gz 5 | ARS-LIC_NZ_Holstein-Friesian_1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/021/347/905/GCA_021347905.1_ARS-LIC_NZ_Holstein-Friesian_1/GCA_021347905.1_ARS-LIC_NZ_Holstein-Friesian_1_genomic.fna.gz 6 | -------------------------------------------------------------------------------- /doc/mc-pangenomes/5-cow-pg-2023-03-31-commands.md: -------------------------------------------------------------------------------- 1 | 
This is the same input as `5-cow-pg-2022-09-22` but run with the new `cactus-pangenome` interface to do everything in one command. 2 | 3 | Cactus commit: b5982ed1fa54aaf4c22087ab4132098230b97fe3 4 | 5 | Note: If I could go back, I'd probably add a `--gbz` to get a GBZ output of the clipped graph (can be obtained here using `vg gbwt` on the `gfa` output) 6 | 7 | ``` 8 | cactus-pangenome aws:us-west-2:glennhickey-jobstore-cow-pg ./5-cow-pg-2023-03-31-seqfile.txt --outDir s3://vg-k8s/users/hickey/5-cow-pg-2023-03-31 --outName 5-cow-pg-2023-03-31 --indexCores 31 --mapCores 8 --indexCores 31 --consCores 16 --logFile 5-cow-pg-2023-03-31.log --giraffe --vcf --gfa --otherContig chrOther --refContigs $(for i in `seq 29`; do echo chr$i; done ; echo "chrX chrM") --reference bosTau9 --batchSystem mesos --provisioner aws --defaultPreemptable --nodeType r5.8xlarge:1.5 --nodeStorage 650 --maxNodes 25 --betaInertia 0 --targetTime 1 9 | ``` -------------------------------------------------------------------------------- /doc/mc-pangenomes/5-cow-pg-2023-03-31-seqfile.txt: -------------------------------------------------------------------------------- 1 | RS-LIC_NZ_Jersey.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/021/234/555/GCA_021234555.1_ARS-LIC_NZ_Jersey/GCA_021234555.1_ARS-LIC_NZ_Jersey_genomic.fna.gz 2 | ROSLIN_BTI_ANK1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/905/123/885/GCA_905123885.1_ROSLIN_BTI_ANK1/GCA_905123885.1_ROSLIN_BTI_ANK1_genomic.fna.gz 3 | ROSLIN_BTT_NDA1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/905/123/515/GCA_905123515.1_ROSLIN_BTT_NDA1/GCA_905123515.1_ROSLIN_BTT_NDA1_genomic.fna.gz 4 | ARS-LIC_NZ_Holstein-Friesian_1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/021/347/905/GCA_021347905.1_ARS-LIC_NZ_Holstein-Friesian_1/GCA_021347905.1_ARS-LIC_NZ_Holstein-Friesian_1_genomic.fna.gz 5 | -------------------------------------------------------------------------------- /doc/mc-pangenomes/9-dog-pg-2022-09-23-seqfile.txt: 
-------------------------------------------------------------------------------- 1 | canFam4 https://hgdownload.soe.ucsc.edu/goldenPath/canFam4/bigZips/canFam4.fa.gz 2 | canFam3.0 https://hgdownload.soe.ucsc.edu/goldenPath/canFam3/bigZips/canFam3.fa.gz 3 | canFam5.0 https://hgdownload.soe.ucsc.edu/goldenPath/canFam5/bigZips/canFam5.fa.gz 4 | canFam6.0 https://hgdownload.soe.ucsc.edu/goldenPath/canFam6/bigZips/canFam6.fa.gz 5 | ROS_Cfam_1_0.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/014/441/545/GCF_014441545.1_ROS_Cfam_1.0/GCF_014441545.1_ROS_Cfam_1.0_genomic.fna.gz 6 | ASM1204501v1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/012/045/015/GCA_012045015.1_ASM1204501v1/GCA_012045015.1_ASM1204501v1_genomic.fna.gz 7 | UNSW_CanFamBas_1_2.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/276/365/GCA_013276365.2_UNSW_CanFamBas_1.2/GCA_013276365.2_UNSW_CanFamBas_1.2_genomic.fna.gz 8 | ASM864105v3.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/008/641/055/GCA_008641055.3_ASM864105v3/GCA_008641055.3_ASM864105v3_genomic.fna.gz 9 | Basenji_breed-1_1.0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/004/886/185/GCA_004886185.2_Basenji_breed-1.1/GCA_004886185.2_Basenji_breed-1.1_genomic.fna.gz 10 | -------------------------------------------------------------------------------- /doc/mc-pangenomes/9-dog-pg-2023-06-27-commands.md: -------------------------------------------------------------------------------- 1 | This is the same input data as "9-dog-pg-2022-09-23", just run with the current version of Cactus on SLURM. 
2 | 3 | Cactus version: v2.6.1 4 | 5 | Note: `--consMemory 256Gi` probably very conservative and could be lowered (see memory usage in `9-dog-pg-2023-06-27.log`) 6 | 7 | ``` 8 | cactus-pangenome ./js ./9-dog-pg-2023-06-27-seqfile.txt --outName 9-dog-pg-2023-06-27 --outDir 9-dog-pg-2023-06-27 --reference canFam4 --batchSystem slurm --indexCores 64 --consCores 64 --mgCores 64 --mapCores 8 --logFile 9-dog-pg-2023-06-27.log --gbz --gfa --vcf --giraffe --chrom-og --chrom-vg --viz --consMemory 256Gi 9 | ``` 10 | 11 | Oops! (gaf file extension not added due to bug in this release) 12 | ``` 13 | mv 9-dog-pg-2023-06-27/9-dog-pg-2023-06-27 9-dog-pg-2023-06-27/9-dog-pg-2023-06-27.gaf.gz 14 | ``` 15 | 16 | -------------------------------------------------------------------------------- /doc/mc-pangenomes/9-dog-pg-2023-06-27-seqfile.txt: -------------------------------------------------------------------------------- 1 | canFam4 https://hgdownload.soe.ucsc.edu/goldenPath/canFam4/bigZips/canFam4.fa.gz 2 | canFam3 https://hgdownload.soe.ucsc.edu/goldenPath/canFam3/bigZips/canFam3.fa.gz 3 | canFam5 https://hgdownload.soe.ucsc.edu/goldenPath/canFam5/bigZips/canFam5.fa.gz 4 | canFam6 https://hgdownload.soe.ucsc.edu/goldenPath/canFam6/bigZips/canFam6.fa.gz 5 | ROS_Cfam_1_0 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/014/441/545/GCF_014441545.1_ROS_Cfam_1.0/GCF_014441545.1_ROS_Cfam_1.0_genomic.fna.gz 6 | ASM1204501v1 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/012/045/015/GCA_012045015.1_ASM1204501v1/GCA_012045015.1_ASM1204501v1_genomic.fna.gz 7 | UNSW_CanFamBas_1_2 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/276/365/GCA_013276365.2_UNSW_CanFamBas_1.2/GCA_013276365.2_UNSW_CanFamBas_1.2_genomic.fna.gz 8 | ASM864105v3 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/008/641/055/GCA_008641055.3_ASM864105v3/GCA_008641055.3_ASM864105v3_genomic.fna.gz 9 | Basenji_breed-1_1 
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/004/886/185/GCA_004886185.2_Basenji_breed-1.1/GCA_004886185.2_Basenji_breed-1.1_genomic.fna.gz 10 | -------------------------------------------------------------------------------- /doc/mc-pangenomes/grch38-alts-pg-2023-04-13-commands.md: -------------------------------------------------------------------------------- 1 | Cactus version used: v2.5.0 2 | 3 | Download hg38 and split it into one fasta file for non-alt contigs, and a separate fasta file for each alt. 4 | 5 | ``` 6 | wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz 7 | gzip -d hg38.fa.gz 8 | samtools faidx hg38.fa 9 | mkdir -p fa 10 | rm -f fa/hg38_no_alts.fa 11 | for contig in $(grep -v _alt hg38.fa.fai | awk '{print $1}'); do samtools faidx hg38.fa $contig >> fa/hg38_no_alts.fa; done 12 | bgzip fa/hg38_no_alts.fa --threads 16 13 | printf "GRCh38\tfa/hg38_no_alts.fa.gz\n" > grch38_alts.seqfile 14 | for contig in $(grep _alt hg38.fa.fai | awk '{print $1}'); \ 15 | do samtools faidx hg38.fa $contig | bgzip > fa/${contig}.fa.gz; \ 16 | printf "${contig}\tfa/${contig}.fa.gz\n" >> grch38_alts.seqfile ; done 17 | ``` 18 | 19 | Make the pangenome, setting a very permissive overlap threshold to make sure all contigs get into a chromosome 20 | 21 | ``` 22 | mkdir -p work 23 | cactus-pangenome ./js ./grch38_alts.seqfile --reference GRCh38 --gbz clip --giraffe clip --vcf --outName grch38-alts-apr13 --outDir grch38-alts --logFile grch38-alts-apr13.log --indexCores 32 --mapCores 8 --consCores 8 --refContigs $(for i in `seq 22`; do printf "chr$i "; done ; echo "chrX chrY chrM") --otherContig chrOther --permissiveContigFilter 0.05 --workDir work 24 | ``` 25 | 26 | Note: the actual command run had quotes around the refContigs, causing them all to be run at once. This slows it down but doesn't affect the output. Quotes are removed above so others don't make the same mistake. 
27 | -------------------------------------------------------------------------------- /doc/mc-paper/README.md: -------------------------------------------------------------------------------- 1 | # Methods for "Pangenome Graph Construction from Whole Genome Alignments" 2 | 3 | Links to the commands and scripts used in the paper can all be found below. The pangenome graphs discussed, among several others, can also be found [on this page](../mc-pangenomes/README.md). 4 | * [HPRC methods](hprc) 5 | * [D. melanogaster methods](fly) 6 | 7 | These are links to the VCFs used in the paper 8 | * [HPRC GIAB calls](https://s3-us-west-2.amazonaws.com/human-pangenomics/index.html?prefix=publications/mc_2022/hprc-human/) 9 | * [HPRC PanGenie SV genotypes](https://doi.org/10.5281/zenodo.7669083) 10 | * [D. Melanogaster small variants](https://s3-us-west-2.amazonaws.com/human-pangenomics/publications/mc_2022/fruitfly/surject.16-fruitfly-mc-2022-05-26-d2.vcf.gz) 11 | * [D. Melanogaster SVs](https://s3-us-west-2.amazonaws.com/human-pangenomics/publications/mc_2022/fruitfly/16-fruitfly-mc-2022-05-26-d2.100samples.decomposed.svs.vcf.gz) 12 | 13 | -------------------------------------------------------------------------------- /doc/mc-paper/fly-sra.tsv: -------------------------------------------------------------------------------- 1 | SRX021040 SRR834526 2 | SRX155996 SRR834509 3 | SRX155997 SRR834512 4 | SRX025317 SRR834541 5 | SRX021235 SRR835025 6 | SRX021255 SRR835027 7 | SRX155989 SRR835034 8 | SRX021267 SRR835037 9 | SRX021296 SRR933581 10 | SRX021384 SRR835087 11 | SRX023456 SRR835058 12 | SRX156026 SRR833244 13 | SRX021008 SRR932121 14 | SRX020759 SRR834551 15 | SRX021026 SRR834547 16 | SRX020912 SRR933563 17 | SRX155979 SRR834523 18 | SRX155994 SRR834527 19 | SRX021053 SRR834531 20 | SRX021082 SRR834507 21 | SRX155981 SRR834508 22 | SRX021063 SRR834510 23 | SRX021094 SRR834511 24 | SRX021095 SRR933569 25 | SRX156029 SRR834514 26 | SRX021101 SRR834516 27 | SRX156027 SRR834517 
28 | SRX156028 SRR834545 29 | SRX023833 SRR834537 30 | SRX023424 SRR834546 31 | SRX155984 SRR834553 32 | SRX021104 SRR834539 33 | SRX023834 SRR834543 34 | SRX021112 SRR933573 35 | SRX156013 SRR834552 36 | SRX021113 SRR834554 37 | SRX156014 SRR834519 38 | SRX021157 SRR834520 39 | SRX156015 SRR834521 40 | SRX156017 SRR834522 41 | SRX021242 SRR835023 42 | SRX021254 SRR835024 43 | SRX021245 SRR835026 44 | SRX006155 SRR933577 45 | SRX021244 SRR835028 46 | SRX021246 SRR835029 47 | SRX023835 SRR835030 48 | SRX021260 SRR835031 49 | SRX021262 SRR835033 50 | SRX021268 SRR835035 51 | SRX021270 SRR835036 52 | SRX021271 SRR835038 53 | SRX156002 SRR835039 54 | SRX021272 SRR835040 55 | SRX021273 SRR835041 56 | SRX021282 SRR835042 57 | SRX155985 SRR835043 58 | SRX156031 SRR835044 59 | SRX021290 SRR835045 60 | SRX021293 SRR835046 61 | SRX156034 SRR835047 62 | SRX006159 SRR933580 63 | SRX156032 SRR835048 64 | SRX156033 SRR835050 65 | SRX156004 SRR835096 66 | SRX155988 SRR835097 67 | SRX156003 SRR835098 68 | SRX156018 SRR835086 69 | SRX006162 SRR933585 70 | SRX006163 SRR933586 71 | SRX006164 SRR933587 72 | SRX021382 SRR933589 73 | SRX006167 SRR933591 74 | SRX023451 SRR933592 75 | SRX021383 SRR933593 76 | SRX021385 SRR933594 77 | SRX021386 SRR835088 78 | SRX021387 SRR835089 79 | SRX021388 SRR835091 80 | SRX021389 SRR835092 81 | SRX021400 SRR835095 82 | SRX021418 SRR835051 83 | SRX021419 SRR835052 84 | SRX156006 SRR835054 85 | SRX021476 SRR835055 86 | SRX021479 SRR933599 87 | SRX156036 SRR835059 88 | SRX156035 SRR835060 89 | SRX155993 SRR835061 90 | SRX021563 SRR835062 91 | SRX021492 SRR835063 92 | SRX021496 SRR835067 93 | SRX021527 SRR835069 94 | SRX021499 SRR835071 95 | SRX023838 SRR835072 96 | SRX021528 SRR835073 97 | SRX023457 SRR933601 98 | SRX021500 SRR835074 99 | SRX021501 SRR835075 100 | SRX156024 SRR835077 101 | -------------------------------------------------------------------------------- /doc/mc-paper/fly/annotate-repeats.R: 
-------------------------------------------------------------------------------- 1 | library(sveval) ## install with: BiocManager::install('jmonlong/sveval') 2 | library(GenomicRanges) ## install with: BiocManager::install('GenomicRanges') 3 | library(Biostrings) ## install with: BiocManager::install('Biostrings') 4 | library(dplyr) ## install with: install.packages('dplyr') 5 | ## install BiocManager with: install.packages('BiocManager') 6 | 7 | species='drosophila' 8 | docker.image='jmonlong/repeatmasker:release-4.0.9-p2' 9 | nb.cores = 16 10 | 11 | svs = readRDS('16-fruitfly-mc-2022-05-26-d2.100samples.decomposed.svs.site.rol90.insd100.rds') 12 | svs$id = paste0('sv', 1:length(svs)) 13 | 14 | svs.df = svs %>% as.data.frame %>% 15 | filter(size>=40) %>% 16 | group_by(svsite, type, clique, seqnames) %>% 17 | arrange(desc(af)) %>% 18 | summarize(start=min(start), end=max(end), 19 | size.min=min(size), size.max=max(size), 20 | size=size[1], alleles=n(), 21 | ac=sum(ac), af=sum(af), 22 | id=id[1], 23 | .groups='drop') 24 | 25 | svs = subset(svs, id %in% svs.df$id) 26 | 27 | seqs = svs$alt 28 | seqs[which(svs$type %in% c("DEL", "INV", "DUP"))] = svs$ref[which(svs$type %in% 29 | c("DEL", "INV", "DUP"))] 30 | names(seqs) = svs$id 31 | seqs = as(seqs, "XStringSet") 32 | 33 | temp.fa = paste0(tempfile(), ".fa") 34 | writeXStringSet(seqs, temp.fa, format = "FASTA") 35 | 36 | temp.dir = dirname(temp.fa) 37 | system2("docker", c("run", "-t", "-v", paste0(temp.dir, 38 | ":", temp.dir), docker.image, "RepeatMasker", temp.fa, 39 | "--species", species, "-pa", nb.cores)) 40 | 41 | rmout = utils::read.table(paste0(temp.fa, ".out"), skip = 3, as.is = TRUE, fill = TRUE, header=FALSE) 42 | rmout = rmout[, c(5:7, 10, 11)] 43 | colnames(rmout) = c("id", "start", "end", "repeat.name", 44 | "repeat.class.family") 45 | head(rmout) 46 | 47 | rmout$rm.w = rmout$end - rmout$start 48 | rmout = rmout %>% group_by(id, repeat.name, repeat.class.family) %>% 49 | summarize(rm.w=sum(rm.w)) %>% 50 
| group_by(id) %>% arrange(desc(rm.w)) %>% 51 | do(head(., 1)) %>% 52 | as.data.frame(stringsAsFactors = FALSE) 53 | rownames(rmout) = rmout$id 54 | 55 | svs$rmsk.classfam = rmout[svs$id, "repeat.class.family"] 56 | svs$rmsk.name = rmout[svs$id, "repeat.name"] 57 | svs$rmsk.cov = rmout[svs$id, "rm.w"]/GenomicRanges::width(seqs) 58 | 59 | saveRDS(svs, file='16-fruitfly-mc-2022-05-26-d2.100samples.decomposed.svs.site.rol90.insd100.rmsk.rds') 60 | -------------------------------------------------------------------------------- /doc/mc-paper/fly/cluster-svs.R: -------------------------------------------------------------------------------- 1 | library(sveval) ## install with: BiocManager::install('jmonlong/sveval') 2 | library(GenomicRanges) ## install with: BiocManager::install('GenomicRanges') 3 | ## install BiocManager with: install.packages('BiocManager') 4 | 5 | ## simple repeat annotation 6 | if(!file.exists('dm6.simpleRepeat.txt.gz')){ 7 | download.file('https://hgdownload.soe.ucsc.edu/goldenPath/dm6/database/simpleRepeat.txt.gz', 'dm6.simpleRepeat.txt.gz') 8 | } 9 | sr = read.table('dm6.simpleRepeat.txt.gz', as.is=TRUE, sep='\t') 10 | sr = GRanges(paste0('dm6.', sr$V2), IRanges(sr$V3, sr$V4)) 11 | sr = reduce(sr) 12 | 13 | ## SVs in the pangenome 14 | svs = lapply(list.files('svs', '*.rds'), function(fn){ 15 | gr = readRDS(paste0('svs/', fn)) 16 | gr$sample = gsub('.+\\.(.*)\\.decomposed.svs.rds', '\\1', fn) 17 | gr 18 | }) 19 | 20 | svs = clusterSVs(svs, range.seq.comp=TRUE, ins.seq.comp=TRUE, nb.cores=16, batch.maxsize=500, simprep=sr, min.rol=.9, max.ins.dist=100) 21 | 22 | saveRDS(svs, file='16-fruitfly-mc-2022-05-26.svs.site.rol90.insd100.rds') 23 | 24 | ## calls 25 | svs = readRDS('16-fruitfly-mc-2022-05-26-d2.100samples.decomposed.svs.rds') 26 | svs = subset(svs, ac>0) 27 | 28 | svs = clusterSVs(svs, range.seq.comp=TRUE, ins.seq.comp=TRUE, nb.cores=16, batch.maxsize=500, simprep=sr, min.rol=.9, max.ins.dist=100) 29 | 30 | saveRDS(svs, 
file='16-fruitfly-mc-2022-05-26-d2.100samples.decomposed.svs.site.rol90.insd100.rds') 31 | -------------------------------------------------------------------------------- /doc/mc-paper/fly/mapstats-analysis.R: -------------------------------------------------------------------------------- 1 | library(dplyr) ## install with: install.packages('dplyr') 2 | library(ggplot2) ## install with: install.packages('ggplot2') 3 | 4 | ## read dm6-bwa mapping stats 5 | map.d = list.files('dm6_mappings/') 6 | map.d = lapply(map.d, function(ff){ 7 | df = read.table(paste0('dm6_mappings/', ff), as.is=TRUE) 8 | colnames(df) = c('n', 'mapq', 'perfect') 9 | df$sample = gsub('dm6.(.*).bwa.mapstats.txt', '\\1', ff) 10 | df 11 | }) %>% bind_rows 12 | 13 | ## read pangenome-giraffe mapping stats 14 | map.g = list.files('16-fruitfly-mc-2022-05-26-d2_mappings/') 15 | map.g = lapply(map.g, function(ff){ 16 | df = read.table(paste0('16-fruitfly-mc-2022-05-26-d2_mappings/', ff), as.is=TRUE) 17 | colnames(df) = c('n', 'mapq', 'perfect') 18 | df$sample = gsub('16-fruitfly-mc-2022-05-26-d2.(.*).giraffe.mapstats.txt', '\\1', ff) 19 | df 20 | }) %>% bind_rows 21 | 22 | ## combine 23 | map.df = rbind(map.d %>% mutate(method='dm6_bwa'), 24 | map.g %>% mutate(method='cactus_giraffe') 25 | ) %>% 26 | mutate(perfect=as.logical(perfect)) 27 | 28 | ## subset to samples analyzed by both mappers 29 | map.df = map.df %>% group_by(sample) %>% filter(length(unique(method))>1) 30 | 31 | map.df = map.df %>% mutate(method=factor(method, levels=c('dm6_bwa', 'cactus_giraffe'), 32 | labels=c('dm6-BWA', 'Cactus-Giraffe'))) 33 | 34 | gp.df = map.df %>% 35 | group_by(sample, method, perfect) %>% 36 | summarize(n=sum(n)) %>% 37 | group_by(sample, method) %>% mutate(prop=n/sum(n)) %>% 38 | filter(perfect) 39 | gp.df %>% group_by(method) %>% summarize(prop=mean(prop)) 40 | ggp.perfect = ggplot(gp.df, aes(x=method, y=prop)) + 41 | geom_line(aes(group=sample), alpha=.1) + 42 | geom_boxplot(alpha=.4, fill='lightblue') + 43 
"""Tally primary SAM alignments by mapping quality and perfect-match status.

Reads SAM text from the files named on the command line (or from stdin when
none are given) and prints one tab-separated line per profile::

    <count>  <mapq>  <perfect>

* mapq: the MAPQ column as text; '-1' when the CIGAR column is '*'
  (treated as an unmapped read).
* perfect: True when the CIGAR is a single full-length match ("<len>M")
  and the first MD tag is "MD:Z:<len>".
"""
import fileinput
from collections import Counter

# SAM FLAG bits marking records that are not the primary line of a read.
SECONDARY = 0x100  # 256
SUPPLEMENTARY = 0x800  # 2048


def count_records(lines):
    """Count SAM records per "<mapq>\\t<perfect>" profile.

    Args:
        lines: iterable of SAM text lines (header lines allowed).

    Returns:
        Counter mapping "<mapq>\\t<perfect>" to the number of primary
        records with that profile, in first-seen order.
    """
    records = Counter()
    for raw in lines:
        # header lines carry no alignment
        if raw.startswith("@"):
            continue
        fields = raw.rstrip().split('\t')
        # skip secondary and supplementary alignments so each read counts once
        if int(fields[1]) & (SECONDARY | SUPPLEMENTARY):
            continue
        # unmapped reads (no CIGAR) get a sentinel mapping quality
        mapq = '-1' if fields[5] == "*" else fields[4]
        read_len = str(len(fields[9]))
        perfect = False
        if fields[5] == read_len + "M":
            # optional tags start at column 12; only the first MD tag is used
            for tag in fields[11:]:
                if tag.split(':')[0] == 'MD':
                    perfect = tag == "MD:Z:" + read_len
                    break
        records['{}\t{}'.format(mapq, perfect)] += 1
    return records


def main():
    """Print "<count>\\t<mapq>\\t<perfect>" for every profile seen in the input."""
    for rid, count in count_records(fileinput.input()).items():
        print('{}\t{}'.format(count, rid))


if __name__ == "__main__":
    main()
/doc/mc-paper/fly/scripts/read_svs.R: -------------------------------------------------------------------------------- 1 | library(methods) 2 | library(sveval) 3 | 4 | args = commandArgs(TRUE) 5 | 6 | multisamps = FALSE 7 | if(length(args)>2){ 8 | multisamps = as.logical(args[3]) 9 | } 10 | 11 | if(multisamps){ 12 | svs = readSVvcf.multisamps(args[1], keep.ins.seq=TRUE, keep.ref.seq=TRUE, check.inv=TRUE, keep.ids=TRUE, min.sv.size=30) 13 | } else { 14 | svs = readSVvcf(args[1], keep.ins.seq=TRUE, keep.ref.seq=TRUE, check.inv=TRUE, keep.ids=TRUE, min.sv.size=30) 15 | } 16 | 17 | saveRDS(svs, args[2]) 18 | -------------------------------------------------------------------------------- /doc/mc-paper/fly/scripts/rename_chr_vcf.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gzip 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('-i', help='Input gzipped VCF', required=True) 6 | parser.add_argument('-r', help='Prefix to remove', default='') 7 | parser.add_argument('-a', help='Prefix to add', default='') 8 | args = parser.parse_args() 9 | 10 | if args.r == '' and args.a == '': 11 | print('either -r or -a arguments must be specified') 12 | exit() 13 | 14 | pref_rm = False 15 | def rename_chr(contig): 16 | return(args.a + contig) 17 | if args.r != '': 18 | pref_rm = True 19 | def rename_chr(contig): 20 | return(contig.replace(args.r, '')) 21 | 22 | for line in gzip.open(args.i, 'rb'): 23 | line = line.rstrip().decode('ascii') 24 | if line[0] == '#': 25 | if 'contig' in line: 26 | line = line.replace('##contig=% mutate(sample=gsub('.+\\.(.+)\\..+\\..+\\.nofalsedup_in_chm13.stratification.rds', '\\1', fn), 11 | method=gsub('.+\\..+\\..+\\.(.+)\\.nofalsedup_in_chm13.stratification.rds', '\\1', fn)) 12 | }) %>% bind_rows 13 | 14 | ## compute numbers for all samples 15 | eval.giab.regs = eval.giab.regs %>% 16 | mutate(method=factor(method, 17 | levels=c("giraffedv", "giraffedv_chm13"), 18 | 
"""Tally primary SAM alignments per mapping profile.

Reads SAM text from the files named on the command line (or from stdin when
none are given) and counts how many reads fall in each profile
(mapping quality x perfect alignment x alignment score). Output is one
tab-separated line per profile::

    <count>  <mapq>  <perfect>  <alignment score>

* mapping quality: the MAPQ column as text; '-1' for unmapped reads
  (CIGAR column is '*').
* perfect: boolean, True when the CIGAR is a single full-length match
  ("<len>M") and the MD tag is "MD:Z:<len>".
* alignment score: value of the AS tag, or False when the record has none.
"""
import fileinput
from collections import Counter

# SAM FLAG bits marking records that are not the primary line of a read.
SECONDARY = 0x100  # 256
SUPPLEMENTARY = 0x800  # 2048


def count_records(lines):
    """Count SAM records per "<mapq>\\t<perfect>\\t<AS>" profile.

    Args:
        lines: iterable of SAM text lines (header lines allowed).

    Returns:
        Counter mapping "<mapq>\\t<perfect>\\t<AS>" to the number of primary
        records with that profile, in first-seen order.
    """
    records = Counter()
    for raw in lines:
        # header lines carry no alignment
        if raw.startswith("@"):
            continue
        fields = raw.rstrip().split('\t')
        # skip secondary and supplementary alignments so each read counts once
        if int(fields[1]) & (SECONDARY | SUPPLEMENTARY):
            continue
        # unmapped reads (no CIGAR) get a sentinel mapping quality
        mapq = '-1' if fields[5] == "*" else fields[4]
        # extract the MD and AS tags; stop scanning once both are known
        as_tag = False
        md_tag = False
        for tag in fields[11:]:
            parts = tag.split(':')
            if parts[0] == 'MD':
                md_tag = tag
            elif parts[0] == 'AS':
                as_tag = parts[2]
            if as_tag and md_tag:
                break
        # perfect = full-length match in the CIGAR and no mismatch in MD
        read_len = str(len(fields[9]))
        perfect = fields[5] == read_len + "M" and md_tag == "MD:Z:" + read_len
        records['{}\t{}\t{}'.format(mapq, perfect, as_tag)] += 1
    return records


def main():
    """Print "<count>\\t<profile>" for every profile seen in the input."""
    for rid, count in count_records(fileinput.input()).items():
        print('{}\t{}'.format(count, rid))


if __name__ == "__main__":
    main()
"""Tally reads per mapping profile from jq-flattened GAM records.

Parses a TSV-like stream created by jq from json-converted GAM records
(files named on the command line, or stdin). The input columns are:
name, identity, score, query_position, sequence.

Records with the same name on consecutive lines are treated as one read:
only the first record provides identity and score, and the following ones
are only used to decide whether the read is "split" (a later record does
not overlap the first one in read space).

Output is one tab-separated line per profile::

    <count>  <identity>  <score>  <split>

* identity: first record's identity as an integer percentage.
* score: first record's score column, kept as text.
* split: True when a later record of the read lies outside the first one.
"""
import fileinput
from collections import Counter


def count_records(lines):
    """Count reads per "<identity>\\t<score>\\t<split>" profile.

    Args:
        lines: iterable of tab-separated text lines with the columns
            name, identity, score, query_position, sequence. Records of
            the same read are assumed to be adjacent.

    Returns:
        Counter mapping "<identity>\\t<score>\\t<split>" to the number of
        reads with that profile, in first-seen order.
    """
    records = Counter()
    cur_read = None  # name of the read being accumulated, None before the first
    cur_identity = cur_score = ''
    cur_pos = cur_len = 0
    cur_split = False

    def flush():
        # tally the read accumulated so far (no-op before the first record)
        if cur_read is not None:
            records['{}\t{}\t{}'.format(cur_identity, cur_score, cur_split)] += 1

    for raw in lines:
        stripped = raw.rstrip()
        if not stripped:
            continue  # ignore blank lines such as a trailing newline
        fields = stripped.split('\t')
        if fields[0] == cur_read:
            # secondary/split record of the current read: flag the read as
            # split when this record does not overlap the first in read space
            if not cur_split:
                start = int(fields[3])
                if start > cur_pos + cur_len or start + len(fields[4]) < cur_pos:
                    cur_split = True
            continue
        # a new read starts here, so write the previous one
        flush()
        cur_read = fields[0]
        cur_identity = int(100 * float(fields[1]))
        cur_score = fields[2]
        cur_pos = int(fields[3])
        cur_len = len(fields[4])
        cur_split = False
    # the last read has no successor to trigger its tally, so write it here
    flush()
    return records


def main():
    """Print "<count>\\t<profile>" for every profile seen in the input."""
    for rid, count in count_records(fileinput.input()).items():
        print('{}\t{}'.format(count, rid))


if __name__ == "__main__":
    main()
## Merge evaluation summaries and plot the main metrics.
##
## Usage: Rscript eval-summary.R <summary.csv> [<summary.csv> ...]
##
## Inputs: summary csv path/file names. The run information is parsed from
## each file name, which must look like
##   <truthset>.<sample>.<reads>.<method>.<region>.summary.csv
## Outputs (in the working directory):
##   eval-summary.tsv  all the input tables stacked, with the parsed columns
##   eval-summary.pdf  one page per metric and region
library(dplyr)
library(ggplot2)

args = commandArgs(TRUE)
if(length(args) == 0){
  stop('No input: provide at least one <truthset>.<sample>.<reads>.<method>.<region>.summary.csv file')
}

## read each summary and annotate it with the fields encoded in its file name
df = lapply(args, function(f.path){
  fn = basename(f.path)
  read.csv(f.path, as.is=TRUE) %>%
    mutate(file=fn,
           truthset=gsub('(.*)\\..*\\..*\\..*\\..*\\.summary\\.csv', '\\1', file),
           sample=gsub('.*\\.(.*)\\..*\\..*\\..*\\.summary\\.csv', '\\1', file),
           reads=gsub('.*\\..*\\.(.*)\\..*\\..*\\.summary\\.csv', '\\1', file),
           method=gsub('.*\\..*\\..*\\.(.*)\\..*\\.summary\\.csv', '\\1', file),
           region=gsub('.*\\..*\\..*\\..*\\.(.*)\\.summary\\.csv', '\\1', file)) %>%
    select(-file)
}) %>% bind_rows

write.table(df, file='eval-summary.tsv', sep='\t', row.names=FALSE, quote=FALSE)

pdf('eval-summary.pdf', 9, 6)
for(metric in c("METRIC.F1_Score", "METRIC.Recall", "METRIC.Precision", 'TRUTH.TP', 'TRUTH.FN', 'QUERY.FP')){
  for(reg in unique(df$region)){

    ## progress header on the standard output
    cat('\n\n## ', metric, ' - ', reg, '\n\n')

    ## only the rows with Filter 'PASS' are plotted
    ggp = df %>% filter(Filter=='PASS', region==reg) %>%
      ggplot(aes_string(x='sample', color='method', y=metric)) +
      geom_point(stat='identity', position=position_dodge(.5)) +
      facet_grid(Type~truthset, space='free', scales='free') +
      theme_bw()
    print(ggp)
  }
}
## close the PDF device so the file is flushed to disk
## (this was 'def.off()', an undefined function, in the original)
dev.off()
## Summarize the evaluation of a sample's calls within each stratification
## region set stored in 'regions.all.giab.stratifications.RDS'.
##
## Arguments:
## 1. tsv made with: bcftools query -f '%CHROM\t%POS\t%END[\t%BD\t%BVT]\n' happy-annotated.vcf.gz | awk '{if(($4!="." && $4 != "N") || ($6!="." && $6 != "N")){print $0}}' | gzip
## 2. output RDS file
args = commandArgs(TRUE)
if(length(args) < 2){
  stop('Expecting two arguments: the input tsv and the output RDS file')
}

library(dplyr)
library(GenomicRanges)
library(tidyr)
## mclapply comes from 'parallel'; attach it explicitly instead of relying
## on another package to attach it
library(parallel)

## summary counts
## vars: GRanges with 'type' and 'eval' (TP/FN/FP) metadata columns
## returns one row per variant type with the TP/FN/FP counts, the number of
## called variants, and precision/recall/F1 rounded to 6 decimals
sumEval <-function(vars){
  ss.df = mcols(vars) %>% as.data.frame %>% group_by(eval, type) %>% summarize(n=n(), .groups='drop') %>%
    pivot_wider(names_from=eval, values_from=n, values_fill=0)
  ## make sure the three count columns exist even if a class is absent
  if(is.null(ss.df$FN)) ss.df$FN = 0
  if(is.null(ss.df$TP)) ss.df$TP = 0
  if(is.null(ss.df$FP)) ss.df$FP = 0
  ss.df %>%
    mutate(called=TP+FP,
           precision=TP/(TP + FP), precision=round(precision, 6),
           recall=TP/(TP + FN), recall=round(recall, 6),
           recall=ifelse(is.nan(recall), 0, recall),
           F1 = 2 * precision * recall/(precision + recall), F1 = round(F1, 6),
           F1 = ifelse(precision == 0 & recall == 0, 0, F1)) %>%
    select(type, everything())
}

## equivalent GRanges object for overlap later
## evm: data.frame with columns chr, start, end, giab, giab.type, grch38,
##      grch38.type
## TP and FN are taken from the 'giab' columns, FP from the 'grch38' columns
extractVariants <- function(evm){
  evm.gr = makeGRangesFromDataFrame(evm)
  tp.indel = evm.gr[which(evm$giab=='TP' & evm$giab.type=='INDEL')]
  tp.indel$type = 'INDEL'
  tp.indel$eval = 'TP'
  fn.indel = evm.gr[which(evm$giab=='FN' & evm$giab.type=='INDEL')]
  fn.indel$type = 'INDEL'
  fn.indel$eval = 'FN'
  fp.indel = evm.gr[which(evm$grch38=='FP' & evm$grch38.type=='INDEL')]
  fp.indel$type = 'INDEL'
  fp.indel$eval = 'FP'
  tp.snp = evm.gr[which(evm$giab=='TP' & evm$giab.type=='SNP')]
  tp.snp$type = 'SNP'
  tp.snp$eval = 'TP'
  fn.snp = evm.gr[which(evm$giab=='FN' & evm$giab.type=='SNP')]
  fn.snp$type = 'SNP'
  fn.snp$eval = 'FN'
  fp.snp = evm.gr[which(evm$grch38=='FP' & evm$grch38.type=='SNP')]
  fp.snp$type = 'SNP'
  fp.snp$eval = 'FP'
  return(c(tp.indel, fn.indel, fp.indel, tp.snp, fn.snp, fp.snp))
}

## cores to use
NB.CORES = 2

## read happy results
eval = read.table(args[1], as.is=TRUE, sep='\t', header=FALSE)
colnames(eval) = c('chr', 'start', 'end', 'giab', 'giab.type', 'grch38', 'grch38.type')

vars = extractVariants(eval)

## named list of region sets, read from the working directory
giab.regs = readRDS('regions.all.giab.stratifications.RDS')

## summarize the variants overlapping each region set
eval.giab.regs = mclapply(names(giab.regs), function(regn){
  ol = overlapsAny(vars, giab.regs[[regn]])
  sumEval(vars[which(ol),]) %>% mutate(region=regn)
}, mc.cores=NB.CORES) %>% bind_rows

saveRDS(eval.giab.regs, file=args[2])
import argparse
import gzip


# remove a prefix from the chromosome/seqnames.
# for example the prefix added in the pangenome "GRCh38.chr1" -> "chr1"


def rename_line(line, prefix):
    """Return one VCF line with *prefix* removed from its sequence names.

    Header lines (starting with '#') are only modified when they contain
    'contig'; other header lines are returned untouched. For records, only
    the first column (CHROM) is modified.

    NOTE: as in the original script, every occurrence of *prefix* in the
    targeted text is removed (str.replace), not only a leading one.

    Args:
        line: a VCF line without its line ending.
        prefix: the text to remove.

    Returns:
        The renamed line. An empty line is returned unchanged.
    """
    # an empty line has no first character to test; pass it through
    if not line:
        return line
    if line[0] == '#':
        if 'contig' in line:
            line = line.replace(prefix, '')
        return line
    fields = line.split('\t')
    fields[0] = fields[0].replace(prefix, '')
    return '\t'.join(fields)


def main():
    """Parse the command line and print the renamed VCF to stdout."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', help='Input gzipped VCF', required=True)
    parser.add_argument('-p', help='Prefix to remove', default='GRCh38.')
    args = parser.parse_args()

    # text mode + context manager: the file is decoded as UTF-8 (a superset
    # of ASCII) and is closed even if an error occurs
    with gzip.open(args.i, 'rt', encoding='utf-8') as vcf:
        for line in vcf:
            print(rename_line(line.rstrip(), args.p))


if __name__ == '__main__':
    main()
## Bar chart of the number of snarls per size class in three pangenomes.
## The size of a snarl is the difference between its max_length and
## min_length columns.
library(dplyr) ## install with: install.packages('dplyr')
library(ggplot2) ## install with: install.packages('ggplot2')
library(RColorBrewer)
library(colorspace)

## color palette: 4th 'Set1' color, then a darker and a lighter shade of
## the 1st 'Set1' color
set1.cols = brewer.pal(4, 'Set1')
pal = c(set1.cols[4], darken(set1.cols[1], .3), lighten(set1.cols[1], .2))

## read one snarl statistics table and tag it with its pangenome
readSnarls <- function(path, pangenome.name){
  read.table(path, as.is=TRUE, header=TRUE) %>% mutate(pangenome=pangenome.name)
}

## snarls
snarls.df = rbind(
  readSnarls('hprc-v1.0-mc-grch38-maxdel.10mb.dist-stats.tsv.gz', 'minigraph-cactus'),
  readSnarls('hprc-v1.0-mc-chm13-maxdel.10mb.dist-stats.tsv.gz', 'minigraph-cactus-chm13'),
  readSnarls('hprc-v1.0-minigraph-grch38.dist-stats.tsv.gz', 'minigraph')
)
snarls.df$diff_length = snarls.df$max_length - snarls.df$min_length

## how to split snarls in to size classes in the graphs
size.breaks = c(-1, 0, 10, 49, 100, 500, 1000, 5000, 1e4, 1e5, Inf)
size.labs = c('0', '1-10', '11-49', '50-100', '100-500',
              '500-1K', '1K-5K', '5K-10K', '10K-100K', '>100K')
## y-axis breaks: powers of ten
y.bks = 10^(1:10)

## pangenome display order and names
pg.levels = c('minigraph', 'minigraph-cactus', 'minigraph-cactus-chm13')
pg.labels = c('Minigraph', 'GRCh38_Minigraph-Cactus', 'CHM13_Minigraph-Cactus')

## number of snarls per pangenome and size class
snarls.n = snarls.df %>%
  mutate(diff_length=cut(diff_length, breaks=size.breaks, labels=size.labs),
         pangenome=factor(pangenome, levels=pg.levels, labels=pg.labels)) %>%
  group_by(pangenome, diff_length) %>% summarize(n=sum(n))

## n+1 is plotted so that empty classes remain drawable on the log scale;
## the text on the bars shows the actual count
## (kept as a top-level expression so that it is printed when run with Rscript)
ggplot(snarls.n, aes(x=diff_length, y=n+1, fill=pangenome)) +
  geom_bar(stat='identity', position='dodge') +
  geom_text(aes(label=paste0(format(n, big.mark=','), ' ')), angle=90,
            position=position_dodge(.9), hjust=1, color='white', size=2) +
  xlab('difference between shortest and largest paths through snarl (bp)') +
  ylab('number of snarls') +
  theme_bw() +
  scale_fill_manual(values=pal) +
  theme(legend.position=c(.99,.99), legend.justification=c(1,1),
        legend.title=element_blank()) +
  scale_y_log10(breaks=y.bks, labels=format(y.bks, big.mark=',', scientific=FALSE))
-------------------------------------------------------------------------------- /doc/mc-paper/hprc/terra-files/GRCh38.path_list.txt: -------------------------------------------------------------------------------- 1 | GRCh38.chr1 2 | GRCh38.chr2 3 | GRCh38.chr3 4 | GRCh38.chr4 5 | GRCh38.chr5 6 | GRCh38.chr6 7 | GRCh38.chr7 8 | GRCh38.chr8 9 | GRCh38.chr9 10 | GRCh38.chr10 11 | GRCh38.chr11 12 | GRCh38.chr12 13 | GRCh38.chr13 14 | GRCh38.chr14 15 | GRCh38.chr15 16 | GRCh38.chr16 17 | GRCh38.chr17 18 | GRCh38.chr18 19 | GRCh38.chr19 20 | GRCh38.chr20 21 | GRCh38.chr21 22 | GRCh38.chr22 23 | GRCh38.chrX 24 | GRCh38.chrY 25 | -------------------------------------------------------------------------------- /doc/mc-paper/hprc/terra-files/bwa-deepvariant-chm13.json: -------------------------------------------------------------------------------- 1 | { 2 | "BWADV.INPUT_READ_FILE_1": "${this.fastq1}", 3 | "BWADV.INPUT_READ_FILE_2": "${this.fastq2}", 4 | "BWADV.REFERENCE_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/chm13v2.0.plus_hs38d1_analysis_set.compact_decoys.fa", 5 | "BWADV.PATH_LIST_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/fasta-indexes-for-crams/GRCh38_full_analysis_set_plus_decoy_hla.chr1-22-X-Y.txt", 6 | "BWADV.SAMPLE_NAME": "${this.sample}" 7 | } 8 | -------------------------------------------------------------------------------- /doc/mc-paper/hprc/terra-files/giraffe-deepvariant-chm13_pangenome-chm13_projection.json: -------------------------------------------------------------------------------- 1 | { 2 | "GiraffeDeepVariant.INPUT_READ_FILE_1": "${this.fastq1}", 3 | "GiraffeDeepVariant.INPUT_READ_FILE_2": "${this.fastq2}", 4 | "GiraffeDeepVariant.XG_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hprc-v1.0-mc-chm13-minaf.0.1.grch38.xg", 5 | "GiraffeDeepVariant.GBWT_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hprc-v1.0-mc-chm13-minaf.0.1.gbwt", 6 | "GiraffeDeepVariant.MIN_FILE": 
"gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hprc-v1.0-mc-chm13-minaf.0.1.min", 7 | "GiraffeDeepVariant.DIST_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hprc-v1.0-mc-chm13-minaf.0.1.dist", 8 | "GiraffeDeepVariant.GGBWT_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hprc-v1.0-mc-chm13-minaf.0.1.gg", 9 | "GiraffeDeepVariant.DV_MODEL_META": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/giraffe-hprc-20220121/dv-giraffe-model/model.ckpt-364300.meta", 10 | "GiraffeDeepVariant.DV_MODEL_DATA": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/giraffe-hprc-20220121/dv-giraffe-model/model.ckpt-364300.data-00000-of-00001", 11 | "GiraffeDeepVariant.DV_MODEL_INDEX": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/giraffe-hprc-20220121/dv-giraffe-model/model.ckpt-364300.index", 12 | "GiraffeDeepVariant.REFERENCE_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/chm13v2.0.plus_hs38d1_analysis_set.compact_decoys.fa", 13 | "GiraffeDeepVariant.REFERENCE_PREFIX": "CHM13.", 14 | "GiraffeDeepVariant.SAMPLE_NAME": "${this.sample}", 15 | "GiraffeDeepVariant.PATH_LIST_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/CHM13.path_list.txt", 16 | "GiraffeDeepVariant.VG_CONTAINER": "quay.io/vgteam/vg:ci-412-34a822dae4e1cefc01058ccb3887de9183c834b7", 17 | "GiraffeDeepVariant.OUTPUT_GAF": "false", 18 | "GiraffeDeepVariant.OUTPUT_GAM": "true", 19 | "GiraffeDeepVariant.CALL_DISK": "100", 20 | "GiraffeDeepVariant.CALL_MEM": "100" 21 | } 22 | -------------------------------------------------------------------------------- /doc/mc-paper/hprc/terra-files/giraffe-deepvariant-chm13_pangenome-grch38_projection.json: -------------------------------------------------------------------------------- 1 | { 2 | "GiraffeDeepVariant.INPUT_READ_FILE_1": "${this.fastq1}", 3 | "GiraffeDeepVariant.INPUT_READ_FILE_2": "${this.fastq2}", 4 | "GiraffeDeepVariant.XG_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hprc-v1.0-mc-chm13-minaf.0.1.grch38.xg", 5 | "GiraffeDeepVariant.GBWT_FILE": 
"gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hprc-v1.0-mc-chm13-minaf.0.1.gbwt", 6 | "GiraffeDeepVariant.MIN_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hprc-v1.0-mc-chm13-minaf.0.1.min", 7 | "GiraffeDeepVariant.DIST_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hprc-v1.0-mc-chm13-minaf.0.1.dist", 8 | "GiraffeDeepVariant.GGBWT_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hprc-v1.0-mc-chm13-minaf.0.1.gg", 9 | "GiraffeDeepVariant.DV_MODEL_META": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/giraffe-hprc-20220121/dv-giraffe-model/model.ckpt-364300.meta", 10 | "GiraffeDeepVariant.DV_MODEL_DATA": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/giraffe-hprc-20220121/dv-giraffe-model/model.ckpt-364300.data-00000-of-00001", 11 | "GiraffeDeepVariant.DV_MODEL_INDEX": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/giraffe-hprc-20220121/dv-giraffe-model/model.ckpt-364300.index", 12 | "GiraffeDeepVariant.SAMPLE_NAME": "${this.sample}", 13 | "GiraffeDeepVariant.PATH_LIST_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/GRCh38.path_list.txt", 14 | "GiraffeDeepVariant.REFERENCE_PREFIX": "GRCh38.", 15 | "GiraffeDeepVariant.REFERENCE_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hg38.fa", 16 | "GiraffeDeepVariant.VG_CONTAINER": "quay.io/vgteam/vg:ci-412-34a822dae4e1cefc01058ccb3887de9183c834b7", 17 | "GiraffeDeepVariant.OUTPUT_GAM": "false", 18 | "GiraffeDeepVariant.OUTPUT_GAF": "true", 19 | "GiraffeDeepVariant.CALL_DISK": "100", 20 | "GiraffeDeepVariant.CALL_MEM": "100" 21 | } 22 | -------------------------------------------------------------------------------- /doc/mc-paper/hprc/terra-files/giraffe-deepvariant-grch38_pangenome-grch38_projection.json: -------------------------------------------------------------------------------- 1 | { 2 | "GiraffeDeepVariant.INPUT_READ_FILE_1": "${this.fastq1}", 3 | "GiraffeDeepVariant.INPUT_READ_FILE_2": "${this.fastq2}", 4 | "GiraffeDeepVariant.XG_FILE": 
"gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hprc-f1-D10M-clipped-indexes/GRCh38-f1g-90-mc-aug11-clip.d9.m1000.D10M.m1000.xg", 5 | "GiraffeDeepVariant.MIN_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hprc-f1-D10M-clipped-indexes/GRCh38-f1g-90-mc-aug11-clip.d9.m1000.D10M.m1000.min", 6 | "GiraffeDeepVariant.DIST_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hprc-f1-D10M-clipped-indexes/GRCh38-f1g-90-mc-aug11-clip.d9.m1000.D10M.m1000.dist", 7 | "GiraffeDeepVariant.GBWT_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hprc-f1-D10M-clipped-indexes/GRCh38-f1g-90-mc-aug11-clip.d9.m1000.D10M.m1000.gbwt", 8 | "GiraffeDeepVariant.GGBWT_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hprc-f1-D10M-clipped-indexes/GRCh38-f1g-90-mc-aug11-clip.d9.m1000.D10M.m1000.gg", 9 | "GiraffeDeepVariant.SAMPLE_NAME": "${this.sample}", 10 | "GiraffeDeepVariant.DV_MODEL_INDEX": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/giraffe-hprc-20220121/dv-giraffe-model/model.ckpt-364300.index", 11 | "GiraffeDeepVariant.DV_MODEL_DATA": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/giraffe-hprc-20220121/dv-giraffe-model/model.ckpt-364300.data-00000-of-00001", 12 | "GiraffeDeepVariant.DV_MODEL_META": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/giraffe-hprc-20220121/dv-giraffe-model/model.ckpt-364300.meta", 13 | "GiraffeDeepVariant.PATH_LIST_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/GRCh38.path_list.txt", 14 | "GiraffeDeepVariant.REFERENCE_PREFIX": "GRCh38.", 15 | "GiraffeDeepVariant.REFERENCE_FILE": "gs://fc-0e9b62da-2dd8-42e5-bb50-8d08e7cdebc3/hg38.fa", 16 | "GiraffeDeepVariant.OUTPUT_GAM": "false", 17 | "GiraffeDeepVariant.OUTPUT_GAF": "true", 18 | "GiraffeDeepVariant.SPLIT_READ_CORES": "16", 19 | "GiraffeDeepVariant.SPLIT_READ_DISK": "120" 20 | } 21 | -------------------------------------------------------------------------------- /doc/yeast-pg-chrI.full.draw.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/doc/yeast-pg-chrI.full.draw.png -------------------------------------------------------------------------------- /doc/yeast-pg-chrI.full.viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/doc/yeast-pg-chrI.full.viz.png -------------------------------------------------------------------------------- /doc/yeast-pg-chrII.full.viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/doc/yeast-pg-chrII.full.viz.png -------------------------------------------------------------------------------- /doc/yeast-pg-chunk-view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/doc/yeast-pg-chunk-view.png -------------------------------------------------------------------------------- /doc/yeast-pg-chunk-viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/doc/yeast-pg-chunk-viz.png -------------------------------------------------------------------------------- /examples/10way-mhc.txt: -------------------------------------------------------------------------------- 1 | (((((Human:0.006969,Chimp:0.009727)Anc10:0.025291,Rhesus:0.044568)Anc07:0.07,Tree_shrew:0.19)Anc03:0.03,(Kangaroo_rat:0.17,(Mouse:0.072818,Rat:0.081244)Anc08:0.11)Anc04:0.150342)Anc01:0.02326,((Dog:0.07,Cat:0.07)Anc05:0.087381,((Pig:0.06,Cow:0.06)Anc09:0.104728,Horse:0.05)Anc06:0.05)Anc02:0.04)root; 2 | 3 | Cat 
https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/10_way_mhc/Cat.mhc.fa.gz 4 | Chimp https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/10_way_mhc/Chimp.mhc.fa.gz 5 | Cow https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/10_way_mhc/Cow.mhc.fa.gz 6 | Dog https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/10_way_mhc/Dog.mhc.fa.gz 7 | Horse https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/10_way_mhc/Horse.mhc.fa.gz 8 | Human https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/10_way_mhc/Human.mhc.fa.gz 9 | Kangaroo_rat https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/10_way_mhc/Kangaroo_rat.mhc.fa.gz 10 | Mouse https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/10_way_mhc/Mouse.mhc.fa.gz 11 | Pig https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/10_way_mhc/Pig.mhc.fa.gz 12 | Rat https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/10_way_mhc/Rat.mhc.fa.gz 13 | Rhesus https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/10_way_mhc/Rhesus.mhc.fa.gz 14 | Tree_shrew https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/10_way_mhc/Tree_shrew.mhc.fa.gz 15 | -------------------------------------------------------------------------------- /examples/evolverMammals.txt: -------------------------------------------------------------------------------- 1 | ((simHuman_chr6:0.144018,(simMouse_chr6:0.084509,simRat_chr6:0.091589)mr:0.271974):0.020593,(simCow_chr6:0.18908,simDog_chr6:0.16303):0.032898); 2 | 3 | simCow_chr6 https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/evolver/mammals/loci1/simCow.chr6 4 | simDog_chr6 
https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/evolver/mammals/loci1/simDog.chr6 5 | simHuman_chr6 https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/evolver/mammals/loci1/simHuman.chr6 6 | simMouse_chr6 https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/evolver/mammals/loci1/simMouse.chr6 7 | simRat_chr6 https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/evolver/mammals/loci1/simRat.chr6 8 | -------------------------------------------------------------------------------- /examples/evolverPrimates.txt: -------------------------------------------------------------------------------- 1 | (simOrang:0.00993,((simChimp:0.00272,simHuman:0.00269)cb:0.00415,simGorilla:0.00644)hcb:0.00046); 2 | 3 | simOrang https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/evolver/primates/loci1/simOrang.chr6 4 | simChimp https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/evolver/primates/loci1/simChimp.chr6 5 | simHuman https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/evolver/primates/loci1/simHuman.chr6 6 | simGorilla https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/evolver/primates/loci1/simGorilla.chr6 7 | -------------------------------------------------------------------------------- /examples/par1_cb.txt: -------------------------------------------------------------------------------- 1 | (Pan_troglodytes:0.00272,Pan_paniscus:0.00269); 2 | 3 | Pan_troglodytes https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/T2T_primate_PAR/mPanTro3_XY_1_5000000.fa 4 | Pan_paniscus https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/T2T_primate_PAR/mPanPan1_XY_1_5000000.fa 5 | 6 | -------------------------------------------------------------------------------- 
/examples/par1_hcb.txt: -------------------------------------------------------------------------------- 1 | ((Pan_troglodytes:0.00272,Pan_paniscus:0.00269)cb:0.00415,hs1:0.00644); 2 | 3 | Pan_troglodytes https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/T2T_primate_PAR/mPanTro3_XY_1_5000000.fa 4 | Pan_paniscus https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/T2T_primate_PAR/mPanPan1_XY_1_5000000.fa 5 | hs1 https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/T2T_primate_PAR/hs1_XY_1_5000000.fa 6 | -------------------------------------------------------------------------------- /examples/par1_hcbg.txt: -------------------------------------------------------------------------------- 1 | (Gorilla_gorilla:0.00993,((Pan_troglodytes:0.00272,Pan_paniscus:0.00269)cb:0.00415,hs1:0.00644)hcb:0.00046); 2 | 3 | Gorilla_gorilla https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/T2T_primate_PAR/mGorGor1_XY_1_15000000.fa 4 | Pan_troglodytes https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/T2T_primate_PAR/mPanTro3_XY_1_5000000.fa 5 | Pan_paniscus https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/T2T_primate_PAR/mPanPan1_XY_1_5000000.fa 6 | hs1 https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/T2T_primate_PAR/hs1_XY_1_5000000.fa 7 | -------------------------------------------------------------------------------- /examples/yeastPangenome.txt: -------------------------------------------------------------------------------- 1 | S288C https://github.com/ComparativeGenomicsToolkit/cactusTestData/raw/master/yeast/S288C.fa.gz 2 | SK1 https://github.com/ComparativeGenomicsToolkit/cactusTestData/raw/master/yeast/SK1.fa.gz 3 | DBVPG6044 https://github.com/ComparativeGenomicsToolkit/cactusTestData/raw/master/yeast/DBVPG6044.fa.gz 4 | UWOPS034614 
https://github.com/ComparativeGenomicsToolkit/cactusTestData/raw/master/yeast/UWOPS034614.fa.gz 5 | Y12 https://github.com/ComparativeGenomicsToolkit/cactusTestData/raw/master/yeast/Y12.fa.gz 6 | YPS128 https://github.com/ComparativeGenomicsToolkit/cactusTestData/raw/master/yeast/YPS128.fa.gz 7 | -------------------------------------------------------------------------------- /hal/Makefile: -------------------------------------------------------------------------------- 1 | rootPath = .. 2 | include ${rootPath}/include.mk 3 | 4 | libSources = impl/*.c 5 | libHeaders = inc/*.h 6 | libTests = tests/*.c 7 | 8 | commonHalLibs = ${LIBDIR}/stReference.a ${LIBDIR}/cactusLib.a 9 | stHalDependencies = ${commonHalLibs} ${LIBDEPENDS} 10 | 11 | LDLIBS := ${commonHalLibs} ${LDLIBS} 12 | 13 | 14 | 15 | all: all_libs all_progs 16 | all_libs: 17 | all_progs: all_libs 18 | ${MAKE} ${LIBDIR}/stCactusToHal.a ${BINDIR}/cactus_halGeneratorTests 19 | 20 | clean : 21 | rm -f ${BINDIR}/cactus_halGeneratorTests 22 | 23 | ${BINDIR}/cactus_halGeneratorTests : ${libTests} ${LIBDIR}/stCactusToHal.a ${stHalDependencies} 24 | ${CC} ${CPPFLAGS} ${CFLAGS} ${LDFLAGS} -Wno-error -o ${BINDIR}/cactus_halGeneratorTests ${libTests} ${LIBDIR}/stCactusToHal.a ${LDLIBS} 25 | 26 | ${LIBDIR}/stCactusToHal.a : ${libSources} ${libHeaders} 27 | ${CC} ${CPPFLAGS} ${CFLAGS} ${LDFLAGS} -c ${libSources} 28 | ${AR} rc stCactusToHal.a *.o 29 | ${RANLIB} stCactusToHal.a 30 | mv stCactusToHal.a ${LIBDIR}/ -------------------------------------------------------------------------------- /hal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/hal/__init__.py -------------------------------------------------------------------------------- /hal/impl/fasta.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 
#include 4 | #include 5 | #include 6 | 7 | #include "cactus.h" 8 | #include "sonLib.h" 9 | #include "bioioC.h" 10 | 11 | static Name globalReferenceEventName; 12 | 13 | static int compareSequences(Sequence *sequence, Sequence *sequence2) { 14 | Event *event = sequence_getEvent(sequence); 15 | Event *event2 = sequence_getEvent(sequence2); 16 | int i = cactusMisc_nameCompare(event_getName(event), event_getName(event2)); 17 | if (i != 0) { 18 | return event_getName(event) == globalReferenceEventName ? -1 : (event_getName(event2) == globalReferenceEventName ? 1 : i); 19 | } 20 | i = cactusMisc_nameCompare(sequence_getName(sequence), sequence_getName(sequence2)); 21 | return i; 22 | } 23 | 24 | static stList *getSequences(Flower *flower, Name referenceEventName) { 25 | globalReferenceEventName = referenceEventName; 26 | stList *sequences = stList_construct(); 27 | Sequence *sequence; 28 | Flower_SequenceIterator *seqIt = flower_getSequenceIterator(flower); 29 | while ((sequence = flower_getNextSequence(seqIt)) != NULL) { 30 | stList_append(sequences, sequence); 31 | } 32 | flower_destructSequenceIterator(seqIt); 33 | stList_sort(sequences, (int (*)(const void *, const void *))compareSequences); 34 | return sequences; 35 | } 36 | 37 | void printFastaSequences(Flower *flower, FILE *fileHandle, Name referenceEventName) { 38 | stList *sequences = getSequences(flower, referenceEventName); 39 | for(int64_t i=0; i%s\n%s\n", (char *)header, string); 47 | free(string); 48 | } 49 | } 50 | stList_destruct(sequences); 51 | } 52 | -------------------------------------------------------------------------------- /hal/inc/hal.h: -------------------------------------------------------------------------------- 1 | /* 2 | * hal.h 3 | * 4 | * Created on: 21 Jun 2012 5 | * Author: benedictpaten 6 | */ 7 | 8 | #ifndef HAL_H_ 9 | #define HAL_H_ 10 | 11 | #include "sonLib.h" 12 | #include "cactus.h" 13 | #include "recursiveThreadBuilder.h" 14 | 15 | void makeHalFormat(Flower *flower, 
stKVDatabase *database, Name referenceEventName, 16 | FILE *fileHandle); 17 | 18 | void makeHalFormatNoDb(Flower *flower, RecordHolder *rh, Name referenceEventName, FILE *fileHandle); 19 | 20 | void printFastaSequences(Flower *flower, FILE *fileHandle, Name referenceEventName); 21 | 22 | #endif /* HAL_H_ */ 23 | -------------------------------------------------------------------------------- /hal/tests/allTests.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #include "CuTest.h" 8 | #include 9 | #include 10 | #include 11 | #include "sonLib.h" 12 | 13 | 14 | int halGeneratorAllTests(void) { 15 | CuString *output = CuStringNew(); 16 | CuSuite* suite = CuSuiteNew(); 17 | CuSuiteRun(suite); 18 | CuSuiteSummary(suite, output); 19 | CuSuiteDetails(suite, output); 20 | printf("%s\n", output->buffer); 21 | return suite->failCount > 0; 22 | } 23 | 24 | int main(int argc, char *argv[]) { 25 | if(argc == 2) { 26 | st_setLogLevelFromString(argv[1]); 27 | } 28 | return halGeneratorAllTests(); 29 | } 30 | -------------------------------------------------------------------------------- /pipeline/Makefile: -------------------------------------------------------------------------------- 1 | rootPath = .. 
2 | include ${rootPath}/include.mk 3 | 4 | libTests = tests/*.c 5 | libSources = impl/*.c 6 | libHeaders = inc/*.h 7 | 8 | commonCafLibs = ${LIBDIR}/stCactusToHal.a ${LIBDIR}/stReference.a ${LIBDIR}/cactusBarLib.a ${LIBDIR}/stCaf.a ${LIBDIR}/stCactusSetup.a ${sonLibDir}/stPinchesAndCacti.a ${sonLibDir}/matchingAndOrdering.a ${sonLibDir}/3EdgeConnected.a ${sonLibDir}/cPecanLib.a ${LIBDIR}/cactusLib.a ${LIBDIR}/stPaf.a 9 | # simde (included via abPOA) doesn't compile with --Werror --pedantic 10 | CFLAGS:=$(filter-out --pedantic,$(CFLAGS)) 11 | 12 | all: all_libs all_progs 13 | all_libs: 14 | all_progs: all_libs 15 | ${MAKE} ${BINDIR}/stPipelineTests ${BINDIR}/cactus_consolidated ${BINDIR}/docker_test_script 16 | 17 | ${BINDIR}/stPipelineTests : ${libTests} ${LIBDIR}/cactusLib.a ${LIBDEPENDS} 18 | ${CC} ${CPPFLAGS} ${CFLAGS} -o ${BINDIR}/stPipelineTests ${libTests} ${LIBDIR}/cactusLib.a ${LDLIBS} 19 | 20 | ${BINDIR}/cactus_consolidated : cactus_consolidated.c ${LIBDEPENDS} ${commonCafLibs} ${libSources} ${libHeaders} 21 | # the -Wno-unused-function is required to include abpoa.h with CGL_DEBUG defined 22 | ${CC} ${CPPFLAGS} ${CFLAGS} -o ${BINDIR}/cactus_consolidated cactus_consolidated.c ${libSources} ${commonCafLibs} ${LDLIBS} -Wno-unused-function 23 | 24 | ${BINDIR}/docker_test_script : docker_test_script.py 25 | cp docker_test_script.py ${BINDIR}/docker_test_script 26 | chmod +x ${BINDIR}/docker_test_script 27 | 28 | clean : 29 | rm -f *.o 30 | rm -f ${BINDIR}/cactus_workflow.py ${BINDIR}/cactus_consolidated ${BINDIR}/docker_test_script 31 | -------------------------------------------------------------------------------- /pipeline/docker_test_script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import argparse 5 | import sys 6 | 7 | def main(): 8 | for line in sys.stdin: 9 | print(line) 10 | 11 | if __name__=="__main__": 12 | main() 13 | 
-------------------------------------------------------------------------------- /pipeline/impl/traverseFlowers.c: -------------------------------------------------------------------------------- 1 | #include "traverseFlowers.h" 2 | 3 | void extendFlowers(Flower *flower, stList *extendedFlowers, int64_t minFlowerSize) { 4 | Flower_GroupIterator *groupIterator = flower_getGroupIterator(flower); 5 | Group *group; 6 | assert(flower_builtBlocks( 7 | flower)); //This recursion depends on the block structure having been properly defined for all nodes. 8 | while ((group = flower_getNextGroup(groupIterator)) != NULL) { 9 | if (group_isLeaf(group)) { // Has no nested flower 10 | int64_t size = group_getTotalBaseLength(group); 11 | assert(size >= 0); 12 | if (size >= minFlowerSize) { 13 | Flower *nestedFlower = group_makeNestedFlower(group); 14 | stList_append(extendedFlowers, nestedFlower); 15 | } 16 | } 17 | else { // Recursively search for more nested flowers to extend 18 | Flower *nestedFlower = group_getNestedFlower(group); 19 | assert(nestedFlower != NULL); 20 | extendFlowers(nestedFlower, extendedFlowers, minFlowerSize); 21 | } 22 | } 23 | flower_destructGroupIterator(groupIterator); 24 | } 25 | 26 | /* 27 | * Get all the child flowers of a given flower. 28 | */ 29 | void getChildFlowers(Flower *flower, stList *children) { 30 | if (!flower_isLeaf(flower)) { 31 | assert(flower_builtBlocks( 32 | flower)); //This recursion depends on the block structure having been properly defined for all nodes. 
33 | Flower_GroupIterator *groupIterator = flower_getGroupIterator(flower); 34 | Group *group; 35 | while ((group = flower_getNextGroup(groupIterator)) != NULL) { 36 | if (!group_isLeaf(group)) { 37 | Flower *nestedFlower = group_getNestedFlower(group); 38 | assert(nestedFlower != NULL); 39 | stList_append(children, nestedFlower); 40 | } 41 | } 42 | flower_destructGroupIterator(groupIterator); 43 | } 44 | } 45 | 46 | stList *getFlowerHierarchyInLayers(Flower *rootFlower) { 47 | stList *flowers = stList_construct(); 48 | stList_append(flowers, rootFlower); 49 | stList *flowerLayers = stList_construct3(0, (void (*)(void *)) stList_destruct); 50 | while (stList_length(flowers) > 0) { 51 | stList *childFlowers = stList_construct(); 52 | for(int64_t i=0; ibuffer); 20 | return suite->failCount > 0; 21 | } 22 | 23 | int main(int argc, char *argv[]) { 24 | if(argc == 2) { 25 | st_setLogLevelFromString(argv[1]); 26 | } 27 | int i = cactusPipelineRunAllTests(); 28 | //while(1); 29 | return i; 30 | } 31 | -------------------------------------------------------------------------------- /preprocessor/Makefile: -------------------------------------------------------------------------------- 1 | rootPath = .. 
2 | include ${rootPath}/include.mk 3 | 4 | CFLAGS += ${hiredisIncl} 5 | 6 | all: all_libs all_progs 7 | all_libs: 8 | all_progs: all_libs 9 | ${MAKE} ${BINDIR}/cactus_analyseAssembly ${BINDIR}/cactus_makeAlphaNumericHeaders.py ${BINDIR}/cactus_filterSmallFastaSequences.py ${BINDIR}/cactus_softmask2hardmask ${BINDIR}/cactus_sanitizeFastaHeaders ${BINDIR}/cactus_redPrefilter 10 | cd lastzRepeatMasking && ${MAKE} all 11 | 12 | ${BINDIR}/cactus_filterSmallFastaSequences.py : cactus_filterSmallFastaSequences.py 13 | cp cactus_filterSmallFastaSequences.py ${BINDIR}/cactus_filterSmallFastaSequences.py 14 | chmod +x ${BINDIR}/cactus_filterSmallFastaSequences.py 15 | 16 | ${BINDIR}/cactus_makeAlphaNumericHeaders.py : cactus_makeAlphaNumericHeaders.py 17 | cp cactus_makeAlphaNumericHeaders.py ${BINDIR}/cactus_makeAlphaNumericHeaders.py 18 | chmod +x ${BINDIR}/cactus_makeAlphaNumericHeaders.py 19 | 20 | ${BINDIR}/cactus_analyseAssembly : cactus_analyseAssembly.c ${LIBDEPENDS} ${LIBDIR}/cactusLib.a 21 | ${CC} ${CPPFLAGS} ${CFLAGS} ${LDFLAGS} -o ${BINDIR}/cactus_analyseAssembly cactus_analyseAssembly.c ${LIBDIR}/cactusLib.a ${LDLIBS} 22 | 23 | ${BINDIR}/cactus_softmask2hardmask : cactus_softmask2hardmask.c ${LIBDEPENDS} ${LIBDIR}/cactusLib.a 24 | ${CC} ${CPPFLAGS} ${CFLAGS} ${LDFLAGS} -o ${BINDIR}/cactus_softmask2hardmask cactus_softmask2hardmask.c ${LIBDIR}/cactusLib.a ${LDLIBS} 25 | 26 | ${BINDIR}/cactus_sanitizeFastaHeaders : cactus_sanitizeFastaHeaders.c ${LIBDEPENDS} ${LIBDIR}/cactusLib.a 27 | ${CC} ${CPPFLAGS} ${CFLAGS} ${LDFLAGS} -o ${BINDIR}/cactus_sanitizeFastaHeaders cactus_sanitizeFastaHeaders.c ${LIBDIR}/cactusLib.a ${LDLIBS} 28 | 29 | ${BINDIR}/cactus_redPrefilter : cactus_redPrefilter.c ${LIBDEPENDS} ${LIBDIR}/cactusLib.a 30 | ${CC} ${CPPFLAGS} ${CFLAGS} ${LDFLAGS} -o ${BINDIR}/cactus_redPrefilter cactus_redPrefilter.c ${LIBDIR}/cactusLib.a ${LDLIBS} 31 | 32 | 33 | clean : 34 | rm -f *.o 35 | rm -f ${BINDIR}/cactus_analyseAssembly 
${BINDIR}/cactus_softmask2hardmask 36 | cd lastzRepeatMasking && ${MAKE} clean 37 | -------------------------------------------------------------------------------- /preprocessor/cactus_makeAlphaNumericHeaders.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Removes all non-alpha-numeric characters from the fasta headers of a fasta file. 4 | """ 5 | 6 | import os 7 | from optparse import OptionParser 8 | 9 | from sonLib.bioio import fastaRead 10 | from sonLib.bioio import fastaWrite 11 | from sonLib.bioio import getTempFile 12 | 13 | def fixHeader(header): 14 | return "".join([ i for i in header if str.isalnum(i) ]) 15 | 16 | def main(): 17 | ########################################## 18 | #Construct the arguments. 19 | ########################################## 20 | 21 | usage = "usage: %prog [options] \n\n" + \ 22 | " : fasta sequence to annotate\n" 23 | description = "Ensure sequence names contain only alphanumeric characters\n" 24 | parser = OptionParser(usage=usage, description=description) 25 | 26 | options, args = parser.parse_args() 27 | 28 | if len(args) != 2: 29 | parser.print_help() 30 | return 1 31 | 32 | inputName = args[0] 33 | inputFile = open(inputName, "r") 34 | outputName = args[1] 35 | outputFile = open(outputName, "w") 36 | 37 | for header, seq in fastaRead(inputFile): 38 | fastaWrite(outputFile, fixHeader(header), seq) 39 | 40 | outputFile.close() 41 | inputFile.close() 42 | return 0 43 | 44 | if __name__ == '__main__': 45 | exit(main()) 46 | -------------------------------------------------------------------------------- /preprocessor/lastzRepeatMasking/Makefile: -------------------------------------------------------------------------------- 1 | rootPath = ../.. 
2 | include ${rootPath}/include.mk 3 | 4 | CFLAGS += ${hiredisIncl} 5 | 6 | all : ${BINDIR}/cactus_fasta_fragments.py ${BINDIR}/cactus_fasta_softmask_intervals.py ${BINDIR}/cactus_covered_intervals 7 | 8 | ${BINDIR}/cactus_covered_intervals : *.c ${LIBDEPENDS} 9 | ${CC} ${CPPFLAGS} ${CFLAGS} ${LDFLAGS} -o ${BINDIR}/cactus_covered_intervals cactus_covered_intervals.c ${LDLIBS} 10 | 11 | ${BINDIR}/cactus_fasta_fragments.py : cactus_fasta_fragments.py 12 | cp cactus_fasta_fragments.py ${BINDIR}/cactus_fasta_fragments.py 13 | chmod +x ${BINDIR}/cactus_fasta_fragments.py 14 | 15 | ${BINDIR}/cactus_fasta_softmask_intervals.py : cactus_fasta_softmask_intervals.py 16 | cp cactus_fasta_softmask_intervals.py ${BINDIR}/cactus_fasta_softmask_intervals.py 17 | chmod +x ${BINDIR}/cactus_fasta_softmask_intervals.py 18 | 19 | clean : 20 | rm -f *.o 21 | rm -f ${BINDIR}/cactus_lastzRepeatMask.py ${BINDIR}/cactus_fasta_fragments.py ${BINDIR}/cactus_fasta_softmask_intervals.py ${BINDIR}/cactus_covered_intervals 22 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | norecursedirs = venv .git submodules 3 | python_files = *_test.py *Test.py test_*.py 4 | markers = 5 | blast 6 | nonblast -------------------------------------------------------------------------------- /reference/Makefile: -------------------------------------------------------------------------------- 1 | rootPath = .. 
2 | include ${rootPath}/include.mk 3 | 4 | CFLAGS += ${hiredisIncl} 5 | 6 | libSources = impl/*.c 7 | libHeaders = inc/*.h 8 | libTests = tests/*.c 9 | 10 | commonReferenceLibs = ${sonLibDir}/matchingAndOrdering.a ${LIBDIR}/cactusLib.a 11 | stReferenceDependencies = ${commonReferenceLibs} ${LIBDEPENDS} 12 | stReferenceLibs = ${commonReferenceLibs} ${LDLIBS} 13 | 14 | all: all_libs all_progs 15 | all_libs: ${LIBDIR}/stReference.a 16 | all_progs: all_libs 17 | ${MAKE} ${BINDIR}/referenceTests 18 | 19 | ${BINDIR}/referenceTests : ${libTests} ${libSources} ${libHeaders} ${stReferenceDependencies} 20 | ${CC} ${CPPFLAGS} ${CFLAGS} ${LDFLAGS} -I${LIBDIR} -o ${BINDIR}/referenceTests ${libTests} ${libSources} ${stReferenceLibs} 21 | 22 | ${LIBDIR}/stReference.a : ${libSources} ${libHeaders} ${stReferenceDependencies} 23 | ${CC} ${CPPFLAGS} ${CFLAGS} ${LDFLAGS} -c ${libSources} 24 | ${AR} rc stReference.a *.o 25 | ${RANLIB} stReference.a 26 | mv stReference.a ${LIBDIR}/ 27 | 28 | clean : 29 | rm -f *.o 30 | rm -f ${LIBDIR}/stReference.a ${BINDIR}/referenceTests 31 | 32 | -------------------------------------------------------------------------------- /reference/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 4 | # 5 | #Released under the MIT license, see LICENSE.txt 6 | -------------------------------------------------------------------------------- /reference/impl/getReferenceSequences.c: -------------------------------------------------------------------------------- 1 | #include "cactus.h" 2 | #include "bioioC.h" 3 | 4 | static char *formatSequenceHeader(Sequence *sequence) { 5 | const char *sequenceHeader = sequence_getHeader(sequence); 6 | if (strlen(sequenceHeader) > 0) { 7 | char *cA = st_malloc(sizeof(char) * (1 + strlen(sequenceHeader))); 8 | sscanf(sequenceHeader, "%s", cA); 9 | return cA; 10 | } else { 11 | return 
cactusMisc_nameToString(sequence_getName(sequence)); 12 | } 13 | } 14 | 15 | void getReferenceSequences(FILE *fileHandle, Flower *flower, char *referenceEventString){ 16 | //get names of all the sequences in 'flower' for event with name 'referenceEventString' 17 | Sequence *sequence; 18 | Flower_SequenceIterator * seqIterator = flower_getSequenceIterator(flower); 19 | while((sequence = flower_getNextSequence(seqIterator)) != NULL) 20 | { 21 | Event* event = sequence_getEvent(sequence); 22 | const char* eventName = event_getHeader(event); 23 | if (strcmp(eventName, referenceEventString) == 0 && 24 | sequence_getLength(sequence) > 0 && 25 | !sequence_isTrivialSequence(sequence)) { 26 | char *sequenceHeader = formatSequenceHeader(sequence); 27 | st_logDebug("Sequence %s\n", sequenceHeader); 28 | char *string = sequence_getString(sequence, sequence_getStart(sequence), sequence_getLength(sequence), 1); 29 | fastaWrite(string, sequenceHeader, fileHandle); 30 | free(string); 31 | free(sequenceHeader); 32 | } 33 | } 34 | flower_destructSequenceIterator(seqIterator); 35 | return; 36 | } 37 | -------------------------------------------------------------------------------- /reference/inc/addReferenceCoordinates.h: -------------------------------------------------------------------------------- 1 | /* 2 | * addReferenceCoordinates.h 3 | * 4 | * Created on: 17 Jan 2012 5 | * Author: benedictpaten 6 | */ 7 | 8 | #ifndef ADDREFERENCECOORDINATES_H_ 9 | #define ADDREFERENCECOORDINATES_H_ 10 | 11 | #include "cactus.h" 12 | #include "recursiveThreadBuilder.h" 13 | 14 | Cap *getCapForReferenceEvent(End *end, Name referenceEventName); 15 | 16 | void bottomUp(Flower *flower, stKVDatabase *sequenceDatabase, Name referenceEventName, bool isTop, stMatrix *(*generateSubstitutionMatrix)(double)); 17 | 18 | void bottomUpNoDb(Flower *flower, RecordHolder *rh, Name referenceEventName, 19 | bool isTop, stMatrix *(*generateSubstitutionMatrix)(double)); 20 | 21 | void topDown(Flower *flower, Name 
referenceEventName); 22 | 23 | #endif /* ADDREFERENCECOORDINATES_H_ */ 24 | -------------------------------------------------------------------------------- /reference/inc/blockMLString.h: -------------------------------------------------------------------------------- 1 | /* 2 | * blockMLString.h 3 | * 4 | * Created on: Nov 26, 2014 5 | * Author: benedictpaten 6 | */ 7 | 8 | #ifndef BLOCKMLSTRING_H_ 9 | #define BLOCKMLSTRING_H_ 10 | 11 | char *getMaximumLikelihoodString(stTree *tree, Block *block); 12 | 13 | stMatrix *generateJukesCantorMatrix(double distance); 14 | 15 | stTree *getPhylogeneticTreeRootedAtGivenEvent(Event *event, stMatrix *(*generateSubstitutionMatrix)(double)); 16 | 17 | Event *getEvent(stTree *tree); 18 | 19 | stMatrix *getSubMatrix(stTree *tree); 20 | 21 | void cleanupPhylogeneticTree(stTree *tree); 22 | 23 | void maskAncestralRepeatBases(Block *block, char *mlString); 24 | 25 | #endif /* BLOCKMLSTRING_H_ */ 26 | -------------------------------------------------------------------------------- /reference/inc/cactusReference.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | /* 8 | * reference.h 9 | * 10 | * Created on: 1 Apr 2010 11 | * Author: benedictpaten 12 | * 13 | * Algorithms for building references for the cactus structure. 14 | */ 15 | 16 | #ifndef REFERENCE_H_ 17 | #define REFERENCE_H_ 18 | 19 | #include "cactus.h" 20 | #include "stMatchingAlgorithms.h" 21 | 22 | extern const char *REFERENCE_BUILDING_EXCEPTION; 23 | 24 | /* 25 | * Overall coordination function 26 | */ 27 | void cactus_make_reference(stList *flowers, char *referenceEventString, CactusDisk *cactusDisk, CactusParams *params); 28 | 29 | /* 30 | * Construct a reference for the flower, top down. 
31 | */ 32 | void buildReferenceTopDown(Flower *flower, const char *referenceEventHeader, 33 | int64_t permutations, 34 | stList *(*matchingAlgorithm)(stList *edges, int64_t nodeNumber), 35 | double (*temperature)(double), 36 | double theta, 37 | double phi, 38 | int64_t maxWalkForCalculatingZ, bool ignoreUnalignedGaps, 39 | double wiggle, int64_t numberOfNsForScaffoldGap, 40 | int64_t minNumberOfSequencesToSupportAdjacency, bool makeScaffolds); 41 | 42 | double *calculateZ(Flower *flower, stHash *endsToNodes, double theta); 43 | 44 | /* 45 | * Weights events by how informative they are for inferring the 46 | * reference event. Accounts for both distance and the sharing of 47 | * branches. Returns a hash of chosen events to weights. 48 | */ 49 | stHash *getEventWeighting(Event *referenceEvent, double phi, 50 | stSet *chosenEvents); 51 | 52 | /* 53 | * Get the reference sequences, dumping them to the given file handle. 54 | */ 55 | void getReferenceSequences(FILE *fileHandle, Flower *flower, char *referenceEventString); 56 | 57 | #endif /* REFERENCE_H_ */ 58 | -------------------------------------------------------------------------------- /reference/inc/recursiveThreadBuilder.h: -------------------------------------------------------------------------------- 1 | /* 2 | * recursiveThreadBuilder.h 3 | * 4 | * Created on: 21 Jun 2012 5 | * Author: benedictpaten 6 | */ 7 | 8 | #ifndef RECURSIVETHREADBUILDER_H_ 9 | #define RECURSIVETHREADBUILDER_H_ 10 | 11 | void buildRecursiveThreads(stKVDatabase *database, stList *caps, 12 | char *(*segmentWriteFn)(Segment *, void *), 13 | char *(*terminalAdjacencyWriteFn)(Cap *, void *), void *extraArg); 14 | 15 | stList *buildRecursiveThreadsInList(stKVDatabase *database, stList *caps, 16 | char *(*segmentWriteFn)(Segment *, void *), 17 | char *(*terminalAdjacencyWriteFn)(Cap *, void *), void *extraArg); 18 | 19 | typedef stHash RecordHolder; 20 | 21 | RecordHolder *recordHolder_construct(); 22 | 23 | void 
recordHolder_destruct(RecordHolder *rh); 24 | 25 | int64_t recordHolder_size(RecordHolder *rh); 26 | 27 | /* 28 | * Removes the records from rhToAdd and puts them in rhToAddTo, leaving rhToAdd empty. 29 | */ 30 | void recordHolder_transferAll(RecordHolder *rhToAddTo, RecordHolder *rhToAdd); 31 | 32 | void buildRecursiveThreadsNoDb(RecordHolder *rh, stList *caps, char *(*segmentWriteFn)(Segment *, void *), 33 | char *(*terminalAdjacencyWriteFn)(Cap *, void *), void *extraArg); 34 | 35 | stList *buildRecursiveThreadsInListNoDb(RecordHolder *rh, stList *caps, char *(*segmentWriteFn)(Segment *, void *), 36 | char *(*terminalAdjacencyWriteFn)(Cap *, void *), void *extraArg); 37 | 38 | #endif /* RECURSIVETHREADBUILDER_H_ */ 39 | -------------------------------------------------------------------------------- /reference/tests/allTests.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | #include "CuTest.h" 8 | #include "sonLib.h" 9 | 10 | CuSuite *buildReferenceTestSuite(void); 11 | CuSuite* addReferenceCoordinatesTestSuite(void); 12 | CuSuite* recursiveThreadBuilderTestSuite(void); 13 | 14 | int referenceRunAllTests(void) { 15 | CuString *output = CuStringNew(); 16 | CuSuite* suite = CuSuiteNew(); 17 | CuSuiteAddSuite(suite, buildReferenceTestSuite()); 18 | CuSuiteAddSuite(suite, addReferenceCoordinatesTestSuite()); 19 | CuSuiteAddSuite(suite, recursiveThreadBuilderTestSuite()); 20 | 21 | CuSuiteRun(suite); 22 | CuSuiteSummary(suite, output); 23 | CuSuiteDetails(suite, output); 24 | printf("%s\n", output->buffer); 25 | return suite->failCount > 0; 26 | } 27 | 28 | int main(int argc, char *argv[]) { 29 | if(argc == 2) { 30 | st_setLogLevelFromString(argv[1]); 31 | } 32 | int i = referenceRunAllTests(); 33 | //while(1); 34 | return i; 35 | } 36 | 
-------------------------------------------------------------------------------- /runtime/wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | # Set monitor mode: give child processes a new PGID 5 | set -m 6 | 7 | # Find pgid of the child process 8 | # Credit: https://stackoverflow.com/a/36820679 9 | pgid_from_pid() { 10 | ps -o pgid= "$pid" 2>/dev/null | egrep -o "[0-9]+" 11 | } 12 | 13 | # Forward signals to the child process tree 14 | sigint() { 15 | kill -SIGINT -- -$(pgid_from_pid) 16 | wait $pid 17 | } 18 | 19 | trap sigint SIGINT 20 | 21 | sigterm() { 22 | kill -SIGTERM -- -$(pgid_from_pid) 23 | wait $pid 24 | } 25 | 26 | trap sigterm SIGTERM 27 | 28 | options="catchsegv" 29 | for arg in "$@" 30 | do 31 | if [ "$arg" == "cactus-redirect" ]; then 32 | options="$options >" 33 | else 34 | options="$options '${arg}'" 35 | fi 36 | done 37 | 38 | >&2 echo "Running command ${options}" 39 | eval "${options}" <&0 & 40 | pid=$! 41 | wait $pid 42 | exit $? 43 | -------------------------------------------------------------------------------- /setup/Makefile: -------------------------------------------------------------------------------- 1 | rootPath = .. 
2 | include ${rootPath}/include.mk 3 | 4 | libSources = impl/*.c 5 | libHeaders = inc/*.h 6 | 7 | CFLAGS += ${hiredisIncl} 8 | 9 | all: all_libs 10 | all_progs : all_libs 11 | all_libs: ${LIBDIR}/stCactusSetup.a 12 | 13 | ${LIBDIR}/stCactusSetup.a : ${libSources} ${libHeaders} 14 | ${CC} ${CPPFLAGS} ${CFLAGS} -I inc -I ${LIBDIR}/ -c ${libSources} 15 | ${AR} rc stCactusSetup.a *.o 16 | ${RANLIB} stCactusSetup.a 17 | mv stCactusSetup.a ${LIBDIR}/ 18 | 19 | clean : 20 | rm -f *.o 21 | rm -f ${LIBDIR}/stCactusSetup.a 22 | -------------------------------------------------------------------------------- /setup/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 4 | # 5 | #Released under the MIT license, see LICENSE.txt 6 | -------------------------------------------------------------------------------- /setup/inc/cactus_setup.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Released under the MIT license, see LICENSE.txt 3 | */ 4 | 5 | #ifndef ST_CACTUS_SETUP_H_ 6 | #define ST_CACTUS_SETUP_H_ 7 | 8 | #include "cactus.h" 9 | 10 | /* 11 | * Build the first flower, adding an event tree and sequence files. 
12 | */ 13 | Flower *cactus_setup_first_flower(CactusDisk *cactusDisk, CactusParams *params, 14 | char *speciesTree, char *outgroupEvents, char *sequenceFilesAndEvents); 15 | 16 | #endif /* ST_CACTUS_SETUP_H_ */ 17 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/src/__init__.py -------------------------------------------------------------------------------- /src/cactus/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/src/cactus/__init__.py -------------------------------------------------------------------------------- /src/cactus/attcc-alpha.knm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/src/cactus/attcc-alpha.knm -------------------------------------------------------------------------------- /src/cactus/bar/cactus_barTest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 4 | # 5 | #Released under the MIT license, see LICENSE.txt 6 | import unittest 7 | import os 8 | import sys 9 | import random 10 | 11 | from sonLib.bioio import TestStatus 12 | from sonLib.bioio import getTempDirectory 13 | from sonLib.bioio import logger 14 | from sonLib.bioio import system 15 | from sonLib.bioio import fastaAlignmentRead 16 | from sonLib.bioio import fastaWrite 17 | from sonLib.bioio import fastaAlignmentWrite 18 | from sonLib.bioio import fastaReadHeaders 19 | from sonLib.bioio import getLogLevelString 20 
| 21 | from cactus.shared.common import cactus_call 22 | 23 | """Tests cactus_bar. Requires the installation of cactusTools and mafTools. 24 | """ 25 | 26 | class TestCase(unittest.TestCase): 27 | 28 | def setUp(self): 29 | self.testNo = TestStatus.getTestSetup(3, 10, 0, 0) 30 | self.batchSystem = "parasol" 31 | unittest.TestCase.setUp(self) 32 | 33 | @TestStatus.shortLength 34 | def testPosetAlignerAPI(self): 35 | """Run all the cactus base aligner CuTests, fail if any of them fail. 36 | """ 37 | cactus_call(parameters=["cactus_barTests", getLogLevelString()]) 38 | 39 | if __name__ == '__main__': 40 | unittest.main() 41 | -------------------------------------------------------------------------------- /src/cactus/blast/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 4 | # 5 | #Released under the MIT license, see LICENSE.txt 6 | -------------------------------------------------------------------------------- /src/cactus/hal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/src/cactus/hal/__init__.py -------------------------------------------------------------------------------- /src/cactus/hal/cactus_halTest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 4 | # 5 | #Released under the MIT license, see LICENSE.txt 6 | import unittest 7 | import sys 8 | 9 | from sonLib.bioio import TestStatus, system, getLogLevelString 10 | 11 | from cactus.shared.test import getCactusInputs_random 12 | from cactus.shared.test import getCactusInputs_blanchette 13 | from cactus.shared.test import runWorkflow_multipleExamples 14 | 15 | from 
cactus.shared.common import cactus_call 16 | 17 | class TestCase(unittest.TestCase): 18 | @TestStatus.mediumLength 19 | def testCactusRecursiveHalGenerator_Random(self): 20 | runWorkflow_multipleExamples(self.id(), getCactusInputs_random, 21 | testNumber=TestStatus.getTestSetup(), 22 | buildHal=True, buildFasta=True) 23 | 24 | @unittest.skip("test was never updated when changes were made to the way ancestors work (ERROR: Couldn't find reference event reference)") 25 | @TestStatus.mediumLength 26 | def testCactusRecursiveHalGenerator_Blanchette(self): 27 | runWorkflow_multipleExamples(self.id(), getCactusInputs_blanchette, 28 | buildHal=True, buildFasta=True) 29 | 30 | def testHalGeneratorFunctions(self): 31 | """Run all the CuTests, fail if any of them fail. 32 | """ 33 | cactus_call(parameters=["cactus_halGeneratorTests", getLogLevelString()]) 34 | 35 | if __name__ == '__main__': 36 | unittest.main() 37 | -------------------------------------------------------------------------------- /src/cactus/maf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/src/cactus/maf/__init__.py -------------------------------------------------------------------------------- /src/cactus/paf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/src/cactus/paf/__init__.py -------------------------------------------------------------------------------- /src/cactus/paf/pafTest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Copyright (C) 2009-2021 by Benedict Paten (benedictpaten@gmail.com) 5 | 6 | Released under the MIT license, see LICENSE.txt 7 | """ 8 | 9 | import unittest 10 | from sonLib.bioio import 
newickTreeParser 11 | from paf import * 12 | 13 | class TestCase(unittest.TestCase): 14 | def setUp(self): 15 | self.tree_string = "((simCow:0.1,simDog:0.2)anc1:0.3,(simChimp:0.4,simHuman:0.5)anc2:0.2)anc0:0.1;" 16 | self.tree = newickTreeParser(self.tree_string) 17 | 18 | def test_get_leaf_event_pairs(self): 19 | self.assertEqual(6, len(list(get_leaf_event_pairs(self.tree)))) 20 | distances = get_distances(self.tree) 21 | leaves = get_leaves(self.tree) 22 | for species_a, species_b, distance in get_leaf_event_pairs(self.tree): 23 | self.assertTrue(species_a in leaves) 24 | self.assertTrue(species_b in leaves) 25 | self.assertEqual(distances[(species_a, species_b)], distance) 26 | 27 | def test_get_distances(self): 28 | distances = get_distances(self.tree) 29 | cow = get_node(self.tree, "simCow") 30 | human = get_node(self.tree, "simHuman") 31 | dog = get_node(self.tree, "simDog") 32 | anc2 = get_node(self.tree, "anc2") 33 | self.assertAlmostEqual(distances[(cow, human)], 0.1+0.3+0.5+0.2) 34 | self.assertAlmostEqual(distances[(cow, dog)], 0.1+0.2) 35 | self.assertAlmostEqual(distances[(dog, cow)], 0.1+0.2) 36 | self.assertAlmostEqual(distances[(anc2, human)], 0.5) 37 | self.assertAlmostEqual(distances[(anc2, cow)], 0.1+0.3+0.2) 38 | 39 | def test_get_node(self): 40 | for i in "simCow", "simDog", "simChimp", "simHuman", "anc1", "anc2", "anc0": 41 | self.assertEqual(i, get_node(self.tree, i).iD) 42 | 43 | def test_get_subtree_nodes(self): 44 | self.assertEqual({ "simCow", "simDog", "simChimp", "simHuman", "anc1", "anc2", "anc0" }, { i.iD for i in get_subtree_nodes(self.tree) }) 45 | 46 | def test_get_leaves(self): 47 | self.assertEqual({ "simCow", "simDog", "simChimp", "simHuman" }, { i.iD for i in get_leaves(self.tree) }) 48 | 49 | if __name__ == '__main__': 50 | unittest.main() -------------------------------------------------------------------------------- /src/cactus/pipeline/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/src/cactus/pipeline/__init__.py -------------------------------------------------------------------------------- /src/cactus/preprocessor/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 4 | # 5 | #Released under the MIT license, see LICENSE.txt 6 | -------------------------------------------------------------------------------- /src/cactus/preprocessor/lastzRepeatMasking/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 4 | # 5 | #Released under the MIT license, see LICENSE.txt 6 | -------------------------------------------------------------------------------- /src/cactus/preprocessor/preprocessorTest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import shutil 4 | import unittest 5 | from sonLib.bioio import TestStatus 6 | from sonLib.bioio import fastaRead 7 | from sonLib.bioio import getTempDirectory 8 | 9 | from toil.job import Job 10 | 11 | """Base case used for testing the preprocessor and lastz repeat masking 12 | """ 13 | 14 | @pytest.mark.blast 15 | @TestStatus.needsTestData 16 | class TestCase(unittest.TestCase): 17 | def setUp(self): 18 | unittest.TestCase.setUp(self) 19 | self.encodeRegion = "ENm001" 20 | self.encodePath = os.path.join(TestStatus.getPathToDataSets(), "MAY-2005") 21 | self.regionPath = os.path.join(self.encodePath, self.encodeRegion) 22 | self.tempDir = getTempDirectory(os.getcwd()) 23 | self.tempOutputFile = os.path.join(self.tempDir, "results1.txt") 24 | self.toilDir = 
def getSequences(sequenceFile):
    """Read a FASTA file into a dict mapping each header to its sequence.

    If the same header occurs more than once, the last record wins.
    """
    # The context manager closes the file even if fastaRead raises part-way
    # through; the previous explicit close() was skipped on error.
    with open(sequenceFile, "r") as fileHandle:
        return {header: sequence for header, sequence in fastaRead(fileHandle)}

def getMaskedBases(sequences):
    """Return the masked positions of every sequence in ``sequences``.

    ``sequences`` maps header -> sequence string. The result is a set of
    ``(header, index, base)`` tuples, one for each base that is either
    soft-masked (differs from its upper-case form) or is a literal 'N'.
    """
    maskedBases = set()
    for header, sequence in sequences.items():
        for i, base in enumerate(sequence):
            if base.upper() != base or base == 'N':
                maskedBases.add((header, i, base))
    return maskedBases

def getLowerCaseBases(sequenceFile):
    """Count lower-case bases in the sequences of a FASTA file.

    Returns a ``(total, totalMasked)`` tuple: the total number of bases and
    the number of them that differ from their upper-case form.
    """
    # fastaRead is already imported at module level, so the function-local
    # re-import was dropped.
    totalMasked = 0
    total = 0
    with open(sequenceFile, "r") as fileHandle:
        for header, sequence in fastaRead(fileHandle):
            totalMasked += sum(1 for base in sequence if base != base.upper())
            total += len(sequence)
    return total, totalMasked
def postorder_create(node, prefix, hal):
    """Append ``node``'s subproblem to a HAL file, then recurse into its internal children.

    Reads ``<prefix>-<node.name>.c2h`` and ``<prefix>-<node.name>.hal.fa`` and
    runs ``halAppendCactusSubtree`` on them. Note that the node itself is
    processed before its children.

    Args:
        node: tree node exposing ``name`` and ``descendants``.
        prefix: prefix shared by the intermediate ``.c2h`` / ``.hal.fa`` files.
        hal: path of the HAL file to append to.

    Raises:
        RuntimeError: if ``node`` has no name.
        subprocess.CalledProcessError: if the shell pipeline or
            ``halAppendCactusSubtree`` exits with a non-zero status.
    """
    from shlex import quote

    if node.name is None:
        raise RuntimeError("Requires a tree with all ancestors labeled.")
    sys.stderr.write("working on node %r\n" % (node.name))
    c2h = prefix + '-' + node.name + '.c2h'
    hal_fa = prefix + '-' + node.name + '.hal.fa'

    # Names that belong to this subproblem (the node and its direct children);
    # built once instead of once per species line.
    local_names = {node.name}
    local_names.update(n.name for n in node.descendants)

    # get outgroup list (everything in c2h except children / anc)
    # universal_newlines=True makes check_output return str rather than bytes:
    # with bytes the comparison against the (str) node names never matched and
    # the later ",".join() raised TypeError. The path is quoted so that names
    # containing spaces or shell metacharacters reach grep as one argument.
    species_listing = check_output(
        "grep -E '^s' %s | cut -f 2 | uniq" % quote(c2h),
        shell=True, universal_newlines=True)
    outgroups = []
    for species_line in species_listing.splitlines():
        # strip ' marks on either side
        species = species_line[1:-1]
        if species not in local_names:
            outgroups.append(species)

    # get local newick string: this node plus its direct children only
    subtree = deepcopy(node)
    for child in subtree.descendants:
        child.descendants = []
    newick = dumps(subtree)

    # actually perform the addition
    cmd = ['halAppendCactusSubtree', c2h, hal_fa, newick, hal]
    if outgroups:
        cmd.extend(['--outgroups', ",".join(outgroups)])
    sys.stderr.write('Running command %r\n' % cmd)
    check_call(cmd)

    # recurse into internal children; leaves have no subproblem of their own
    for child in node.descendants:
        if len(child.descendants) == 0:
            # Leaf
            continue
        postorder_create(child, prefix, hal)
def rename_duplicate_contig_ids(assembly_files, reference, new_assembly_files):
    """
    Make contig ids unique across a set of assemblies.

    When assemblies from several sources are combined, different contigs can
    end up sharing a name. Every contig whose id has already been seen (in the
    reference or in an earlier contig) is renamed to

        <original id>_renamed_<integer>

    where the integer comes from a single counter shared by all assemblies, so
    the renaming is easy to reverse.

    assembly_files maps an assembly key to its input fasta; new_assembly_files
    maps the same keys to the fasta that is written. The reference is only read
    (its ids are assumed to be unique already) and is never rewritten.

    Returns new_assembly_files.
    """
    # Seed the set of observed ids with everything in the reference.
    seen_ids = {record.id for record in SeqIO.parse(assembly_files[reference], "fasta")}
    next_suffix = 0

    for asm_key in assembly_files:
        if asm_key == reference:
            # Reference ids were recorded above; nothing to rewrite.
            continue

        renamed_records = []
        for record in SeqIO.parse(assembly_files[asm_key], "fasta"):
            original_id = record.id
            # Keep drawing suffixes until the id collides with nothing seen so
            # far; a record whose id is already unique skips the loop entirely.
            while record.id in seen_ids:
                candidate = original_id + "_renamed_" + str(next_suffix)
                record.id = candidate
                record.description = candidate
                next_suffix += 1
            seen_ids.add(record.id)
            renamed_records.append(record)

        # Write this assembly out with its (possibly renamed) contigs.
        SeqIO.write(renamed_records, new_assembly_files[asm_key], "fasta")

    return new_assembly_files
secondary_id] 48 | else: 49 | return lastz_id 50 | -------------------------------------------------------------------------------- /src/cactus/setup/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/src/cactus/setup/__init__.py -------------------------------------------------------------------------------- /src/cactus/shared/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) 4 | # 5 | #Released under the MIT license, see LICENSE.txt 6 | -------------------------------------------------------------------------------- /src/cactus/update/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/3ac246f5afcdb1778079d2c4a2952a2f0c4662b6/src/cactus/update/__init__.py -------------------------------------------------------------------------------- /toil-requirement.txt: -------------------------------------------------------------------------------- 1 | backports.zoneinfo[tzdata];python_version<"3.9" 2 | toil[aws]==8.2.0 3 | --------------------------------------------------------------------------------