├── version.txt ├── .gitmodules ├── .gitignore ├── CITATION.md ├── test_database_update.sh ├── .github └── workflows │ ├── dockerhub.yml │ ├── trigger_action.sh │ ├── get_binary_release.sh │ ├── binary.yml │ ├── ccpp.yml │ ├── mac_ccpp.yml │ ├── conda.yml │ └── mac_conda.yml ├── test_prot.gff ├── LICENSE ├── test_disrupt.expected ├── README.md ├── curl_easy.hpp ├── test_prot.expected ├── columns.hpp ├── fasta2parts.cpp ├── test_amrfinder.sh ├── mutate.cpp ├── amrfinder_index.cpp ├── curl_easy.cpp ├── test_dna.expected ├── common.inc ├── test_prot.fa ├── gff.hpp ├── test_both.expected ├── fasta_extract.cpp ├── Makefile ├── fasta_check.cpp ├── alignment.hpp ├── gff_check.cpp ├── disruption2genesymbol.cpp ├── tsv.hpp ├── amrfinder_update.cpp ├── dna_mutation.cpp ├── test_dna_mut_all.expected └── gff.cpp /version.txt: -------------------------------------------------------------------------------- 1 | 4.2.5 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "stxtyper"] 2 | path = stx 3 | url = https://github.com/ncbi/stxtyper.git 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | *.o 3 | amr_report 4 | amrfinder 5 | amrfinder_index 6 | amrfinder_update 7 | amrfinder_customize 8 | disruption2genesymbol 9 | dna_mutation 10 | fasta_check 11 | fasta_extract 12 | fasta2parts 13 | gff_check 14 | mutate 15 | *.got 16 | -------------------------------------------------------------------------------- /CITATION.md: -------------------------------------------------------------------------------- 1 | Please see our [wiki instructions on citing AMRFinderPlus](https://github.com/ncbi/amr/wiki#citation) for more information on how to cite AMRFinderPlus. 
Importantly we ask that you include both the software version and database version in your methods so people can reproduce your results. 2 | -------------------------------------------------------------------------------- /test_database_update.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Interactive helper: rebuilds with TEST_UPDATE=1, then runs ./amrfinder -U and ./test_amrfinder.sh -n 3 | echo "test_database_update.sh - test update from staging area" 4 | git status -uno 5 | echo "" 6 | echo "Attempts to update from ftp://ftp.ncbi.nlm.nih.gov/pathogen/Technical/AMRFinder_technical/test_database/" 7 | echo "WARNING: recompiles AMRFinderPlus to use a different update URL" 8 | echo "To continue press Enter, to abort ^C" 9 | read -r _reply # POSIX read requires a variable name; a bare "read" fails under dash and would skip the pause 10 | 11 | set -x 12 | 13 | touch amrfinder_update.cpp 14 | make TEST_UPDATE=1 15 | ./amrfinder -U 16 | ./test_amrfinder.sh -n 17 | #touch amrfinder_update.cpp 18 | #make 19 | 20 | -------------------------------------------------------------------------------- /.github/workflows/dockerhub.yml: -------------------------------------------------------------------------------- 1 | name: dockerhub image 2 | 3 | on: 4 | schedule: 5 | - cron: '15 21 * * *' # 9:15pm everyday 6 | workflow_dispatch: 7 | repository_dispatch: 8 | types: [docker-test, install-test] 9 | 10 | jobs: 11 | 12 | test_dockerhub: 13 | runs-on: ubuntu-latest 14 | timeout-minutes: 30 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Test docker 18 | run: docker run hello-world 19 | - name: Software and DB version 20 | run: docker run ncbi/amr amrfinder --database_version 21 | - name: image ls for debugging 22 | run: docker image ls 23 | - name: Nucleotide 24 | run: docker run --rm -v ${PWD}:/data ncbi/amr amrfinder --plus -n test_dna.fa -O Escherichia --mutation_all test_dna_mut_all.got --print_node > test_dna.got 25 | - name: Check results 26 | run: diff test_dna.expected test_dna.got 27 | - name: Combined 28 | run: docker run --rm -v ${PWD}:/data ncbi/amr amrfinder --plus -n test_dna.fa -p test_prot.fa -g test_prot.gff -O 
Escherichia --print_node > test_both.got 29 | - name: Check combined results 30 | run: diff test_both.expected test_both.got 31 | -------------------------------------------------------------------------------- /test_prot.gff: -------------------------------------------------------------------------------- 1 | ##gff-version 3 2 | ##sequence-region contig1 1-50000 3 | contig01 . gene 101 961 . + . ID=gene1;Name=blaTEM-156 4 | contigX . gene 1 501 . + . ID=gene2;Name=nimIJ_hmm 5 | contig02 . gene 1 1191 . + . ID=gene3;Name=blaPDC-114_blast 6 | contig03 . gene 101 802 . + . ID=gene4;Name=blaOXA-436_partial 7 | contig04 . gene 101 1147 . + . ID=gene5;Name=vanG 8 | contig06 . gene 31 2616 . + . ID=gene6;Name=gyrA 9 | contig07 . gene 101 526 . + . ID=gene7;Name=50S_L22 10 | contig09 . gene 1 675 . - . Name=aph3pp-Ib_partial_5p_neg 11 | contig09 . gene 715 1377 . - . Name=sul2_partial_3p_neg 12 | contig11 . gene 113 547 . + . Name=blaTEM-internal_stop 13 | contig12 . gene 71 637 . + . Name=qacR-curated_blast 14 | contig13 . gene 1 1137 . + . Name=emrD3-suppressed-in-vibrio 15 | contig13 . gene 1141 1491 . + . Name=arsR-suppressed-in-escherichia 16 | contig14 . gene 1093 2181 . + . Name=pmrB_C84R 17 | contig16 . gene 1 423 . + . Name=nfsA_R15C_K141STOP 18 | contig18 . gene 279 1238 . + . Name=stxA2a_prot 19 | contig18 Protein Homology CDS 279 1238 . + 0 Name=stxA2a_prot 20 | contig18 . gene 1250 1519 . + . Name=stxB2a_prot 21 | contig18 Protein Homology CDS 1250 1519 . 
+ 0 Name=stxB2a_prot 22 | -------------------------------------------------------------------------------- /.github/workflows/trigger_action.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cat <&2 echo "Downloading AMRFinderPlus version $release" 20 | >&2 echo "Binaries from $URL" 21 | 22 | # download and unpack AMRFinder binaries 23 | curl --silent -L -O $URL 24 | tarball_name=$(echo $URL | perl -pe 's#^.*/(.*)#\1#') 25 | tar xfz $tarball_name 26 | rm $tarball_name 27 | 28 | # download and unpack test 29 | curl --silent \ 30 | -O https://raw.githubusercontent.com/ncbi/amr/master/test_dna.fa \ 31 | -O https://raw.githubusercontent.com/ncbi/amr/master/test_prot.fa \ 32 | -O https://raw.githubusercontent.com/ncbi/amr/master/test_prot.gff \ 33 | -O https://raw.githubusercontent.com/ncbi/amr/master/test_both.expected \ 34 | -O https://raw.githubusercontent.com/ncbi/amr/master/test_dna.expected \ 35 | -O https://raw.githubusercontent.com/ncbi/amr/master/test_prot.expected 36 | 37 | # download database 38 | ./amrfinder --update 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | PUBLIC DOMAIN NOTICE 3 | National Center for Biotechnology Information 4 | 5 | This software/database is a "United States Government Work" under the 6 | terms of the United States Copyright Act. It was written as part of 7 | the author's official duties as a United States Government employee and 8 | thus cannot be copyrighted. This software/database is freely available 9 | to the public for use. The National Library of Medicine and the U.S. 10 | Government have not placed any restriction on its use or reproduction. 11 | 12 | Although all reasonable efforts have been taken to ensure the accuracy 13 | and reliability of the software and data, the NLM and the U.S. 
14 | Government do not and cannot warrant the performance or results that 15 | may be obtained by using this software or data. The NLM and the U.S. 16 | Government disclaim all warranties, express or implied, including 17 | warranties of performance, merchantability or fitness for any particular 18 | purpose. 19 | 20 | Please cite Feldgarden, Michael, Vyacheslav Brover, Daniel H. Haft, Arjun B. 21 | Prasad, Douglas J. Slotta, Igor Tolstoy, Gregory H. Tyson et al. "Validating 22 | the AMRFinder tool and resistance gene database by using antimicrobial 23 | resistance genotype-phenotype correlations in a collection of isolates." 24 | Antimicrobial agents and chemotherapy 63, no. 11 (2019): e00483-19. 25 | https://pubmed.gov/31427293 in any work or product based on this material. 26 | 27 | -------------------------------------------------------------------------------- /.github/workflows/binary.yml: -------------------------------------------------------------------------------- 1 | name: binary tarball 2 | 3 | on: 4 | workflow_dispatch: 5 | release: 6 | branches: [ master ] 7 | repository_dispatch: 8 | types: [linux-binary-test, install-test] 9 | schedule: 10 | - cron: '15 15 * * *' # 3:15pm every day 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | timeout-minutes: 30 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: prerequisites 18 | run: | 19 | sudo apt-get update 20 | sudo apt-get install -y hmmer ncbi-blast+ curl build-essential 21 | - name: download 22 | run: bash -x .github/workflows/get_binary_release.sh 23 | - name: Software and db version 24 | run: ./amrfinder --database_version 25 | - name: run tests included with distribution 26 | run: | 27 | ./amrfinder --plus -p test_prot.fa -g test_prot.gff -O Escherichia --print_node > test_prot.got 28 | diff test_prot.expected test_prot.got 29 | ./amrfinder --plus -n test_dna.fa -O Escherichia --mutation_all test_dna_mut_all.got --print_node > test_dna.got 30 | diff test_dna.expected test_dna.got 31 | 
./amrfinder --plus -n test_dna.fa -p test_prot.fa -g test_prot.gff -O Escherichia --print_node > test_both.got 32 | diff test_both.expected test_both.got 33 | - name: Run tests 34 | run: | 35 | # temporarily we need to download the test script. Can remove 36 | # once we have a new software release and it's included in the 37 | # distribution 38 | BASE_URL="https://raw.githubusercontent.com/${GITHUB_REPOSITORY}/master" 39 | echo "BASE_URL=${BASE_URL}" 40 | curl --silent -L -O ${BASE_URL}/test_amrfinder.sh 41 | bash -x ./test_amrfinder.sh 42 | -------------------------------------------------------------------------------- /test_disrupt.expected: -------------------------------------------------------------------------------- 1 | Protein id Contig id Start Stop Strand Element symbol Element name Scope Type Subtype Class Subclass Method Target length Reference sequence length % Coverage of reference % Identity to reference Alignment length Closest reference accession Closest reference name HMM accession HMM description Hierarchy node 2 | NA cirA_A169insTer10 47 5134 + cirA_A169insTer10 Klebsiella pneumoniae cefiderocol resistant CirA core AMR POINT_DISRUPT BETA-LACTAM CEFIDEROCOL POINTX 1696 657 100.00 99.85 657 WP_002912926.1 catecholate siderophore receptor CirA NA NA cirA 3 | NA cirA_K633LfsTer8 13 3644 + cirA_K633LfsTer8 Klebsiella pneumoniae cefiderocol resistant CirA core AMR POINT_DISRUPT BETA-LACTAM CEFIDEROCOL POINTX 1210 657 100.00 100.00 657 WP_002912926.1 catecholate siderophore receptor CirA NA NA cirA 4 | NA cirA_N184NRHSEWTer 69 6059 + cirA_N184NRHSEWTer Klebsiella pneumoniae cefiderocol resistant CirA core AMR POINT_DISRUPT BETA-LACTAM CEFIDEROCOL POINTX 1997 657 100.00 100.00 657 WP_002912926.1 catecholate siderophore receptor CirA NA NA cirA 5 | NA cirA_N184NRHSEWTer 2058 2930 - blaCTX-M-15 extended-spectrum class A beta-lactamase CTX-M-15 core AMR AMR BETA-LACTAM CEPHALOSPORIN ALLELEX 291 291 100.00 100.00 291 WP_000239590.1 extended-spectrum class 
A beta-lactamase CTX-M-15 NA NA blaCTX-M-15 6 | NA cirA_Q562Ter 164 2134 - cirA_Q562Ter Klebsiella pneumoniae cefiderocol resistant CirA core AMR POINT_DISRUPT BETA-LACTAM CEFIDEROCOL POINTX 657 657 100.00 99.85 657 WP_002912926.1 catecholate siderophore receptor CirA NA NA cirA 7 | NA cirA_T98Ter 14 1973 + cirA_T98Ter Klebsiella pneumoniae cefiderocol resistant CirA core AMR POINT_DISRUPT BETA-LACTAM CEFIDEROCOL POINTX 653 657 100.00 95.59 658 WP_002912926.1 catecholate siderophore receptor CirA NA NA cirA 8 | NA cirA_Y253CfsTer5 60 833 + cirA_Y253CfsTer5 Klebsiella pneumoniae cefiderocol resistant CirA core AMR POINT BETA-LACTAM CEFIDEROCOL POINTX 258 258 100.00 100.00 258 WP_002912926.1 catecholate siderophore receptor CirA NA NA cirA 9 | NA ompK35_R60LfsTer31 18 1101 + ompK35_R60LfsTer31 Klebsiella pneumoniae carbapenem resistant OmpK35 core AMR POINT_DISRUPT BETA-LACTAM CARBAPENEM POINTX 361 359 100.00 100.00 359 WP_004141771.1 outer membrane porin OmpK35 NA NA ompK35 10 | -------------------------------------------------------------------------------- /.github/workflows/ccpp.yml: -------------------------------------------------------------------------------- 1 | name: C++ CI 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | pull_request: 7 | repository_dispatch: 8 | types: [linux-compile-test, install-test] 9 | jobs: 10 | linux_x86: 11 | runs-on: ubuntu-latest 12 | timeout-minutes: 30 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: prerequisites 16 | run: | 17 | sudo apt-get update 18 | sudo apt-get install -y hmmer ncbi-blast+ git libcurl4-openssl-dev build-essential curl 19 | - name: submodule checkout 20 | run: git submodule update --init --recursive 21 | - name: make 22 | run: cat version.txt; make -j -O 23 | - name: download db 24 | run: ./amrfinder -u 25 | - name: Software and DB version 26 | run: ./amrfinder --database_version 27 | - name: make test 28 | run: make test 29 | - name: test for no-overwrite database update (PD-3469 / 
https://github.com/ncbi/amr/issues/16) 30 | run: ./amrfinder -u 2>&1 | fgrep 'Skipping update' 31 | - name: make github_binaries 32 | run: make github_binaries 33 | - uses: actions/upload-artifact@v4 34 | with: 35 | name: release-binary 36 | path: amrfinder_binaries_v*.tar.gz 37 | linux-arm64: 38 | runs-on: ubuntu-24.04-arm 39 | timeout-minutes: 30 40 | steps: 41 | - uses: actions/checkout@v4 42 | - name: prerequisites 43 | run: | 44 | sudo apt-get update 45 | sudo apt-get install -y hmmer ncbi-blast+ git libcurl4-openssl-dev build-essential curl 46 | - name: submodule checkout 47 | run: git submodule update --init --recursive 48 | - name: make 49 | run: cat version.txt; make -j -O 50 | - name: download db 51 | run: ./amrfinder -u 52 | - name: Software and DB version 53 | run: ./amrfinder --database_version 54 | - name: make test 55 | run: make test 56 | - name: test for no-overwrite database update (PD-3469 / https://github.com/ncbi/amr/issues/16) 57 | run: ./amrfinder -u 2>&1 | fgrep 'Skipping update' 58 | - name: make github_binaries 59 | run: | 60 | make github_binaries 61 | version=`cat version.txt` 62 | mv amrfinder_binaries_v$version.tar.gz amrfinder_binaries_linux_aarch64_v$version.tar.gz 63 | - uses: actions/upload-artifact@v4 64 | with: 65 | name: release-binary-arm 66 | path: amrfinder_binaries_linux_aarch64_v*.tar.gz 67 | 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NCBI Antimicrobial Resistance Gene Finder (AMRFinderPlus) 2 | 3 | This software and the accompanying database are designed to find acquired antimicrobial resistance genes and point mutations in protein and/or assembled nucleotide sequences. We have also added "plus" stress, heat, and biocide resistance and virulence factors for [some organisms](https://github.com/evolarjun/amr/wiki/Curated-organisms). 
4 | 5 | ## See [the wiki for documentation](https://github.com/ncbi/amr/wiki) 6 | ## [Citing AMRFinderPlus](https://github.com/ncbi/amr/wiki#how-to-cite) 7 | ## Please [subscribe to our announce list](https://www.ncbi.nlm.nih.gov/mailman/listinfo/amrfinder-announce) for announcements of database and software updates. 8 | 9 | ---- 10 | # Licenses 11 | 12 | ## PUBLIC DOMAIN NOTICE 13 | 14 | ### National Center for Biotechnology Information 15 | 16 | This software/database is a "United States Government Work" under the 17 | terms of the United States Copyright Act. It was written as part of 18 | the authors' official duties as a United States Government employee and 19 | thus cannot be copyrighted. This software/database is freely available 20 | to the public for use. The National Library of Medicine and the U.S. 21 | Government have not placed any restriction on its use or reproduction. 22 | 23 | Although all reasonable efforts have been taken to ensure the accuracy 24 | and reliability of the software and data, the NLM and the U.S. 25 | Government do not and cannot warrant the performance or results that 26 | may be obtained by using this software or data. The NLM and the U.S. 27 | Government disclaim all warranties, express or implied, including 28 | warranties of performance, merchantability or fitness for any particular 29 | purpose. 30 | 31 | In any work or product derived from this material, proper attribution of the 32 | authors as the source of the software or data should be made, using the 33 | following citation: 34 | 35 | Feldgarden M, Brover V, Gonzalez-Escalona N, Frye JG, Haendiges J, Haft DH, 36 | Hoffmann M, Pettengill JB, Prasad AB, Tillman GE, Tyson GH, Klimke W. 37 | AMRFinderPlus and the Reference Gene Catalog facilitate examination of the 38 | genomic links among antimicrobial resistance, stress response, and virulence. 39 | Sci Rep. 2021 Jun 16;11(1):12728. doi: [10.1038/s41598-021-91456-0](https://doi.org/10.1038/s41598-021-91456-0). 
PMID: [34135355](https://pubmed.ncbi.nlm.nih.gov/34135355/); PMCID: [PMC8208984](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8208984/). 40 | 41 | 42 | -------------------------------------------------------------------------------- /.github/workflows/mac_ccpp.yml: -------------------------------------------------------------------------------- 1 | name: MacOS C++ CI 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | pull_request: 7 | repository_dispatch: 8 | types: [mac-compile-test, install-test] 9 | jobs: 10 | macos_arm: 11 | runs-on: macos-latest 12 | timeout-minutes: 30 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: submodule checkout 16 | run: git submodule update --init --recursive 17 | - name: prerequisites 18 | run: | 19 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)" 20 | brew install blast 21 | brew install hmmer 22 | - name: make 23 | run: make -j 24 | - name: download db 25 | run: ./amrfinder -u 26 | - name: Software and DB versions 27 | run: | 28 | cat version.txt 29 | ./amrfinder --database_version 30 | - name: make test 31 | run: make test 32 | - name: test for no-overwrite database update (PD-3469 / https://github.com/ncbi/amr/issues/16) 33 | run: ./amrfinder -u 2>&1 | fgrep 'Skipping update' 34 | - name: make github_binaries # the renamed tarball must keep the .tar.gz suffix so the upload glob below matches it 35 | run: | 36 | make github_binaries 37 | version=`cat version.txt` 38 | mv amrfinder_binaries_v$version.tar.gz amrfinder_binaries_macos_aarch64_v$version.tar.gz 39 | - uses: actions/upload-artifact@v4 40 | with: 41 | name: release-binary 42 | path: amrfinder_binaries_macos_aarch64_v*.tar.gz 43 | macos_x86_64: 44 | runs-on: macos-15-intel 45 | timeout-minutes: 30 46 | steps: 47 | - uses: actions/checkout@v4 48 | - name: submodule checkout 49 | run: git submodule update --init --recursive 50 | - name: prerequisites 51 | run: | 52 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)" 53 | brew install blast 54 | brew install 
hmmer 55 | - name: make 56 | run: make -j 57 | - name: download db 58 | run: ./amrfinder -u 59 | - name: Software and DB versions 60 | run: | 61 | cat version.txt 62 | ./amrfinder --database_version 63 | - name: make test 64 | run: make test 65 | - name: test for no-overwrite database update (PD-3469 / https://github.com/ncbi/amr/issues/16) 66 | run: ./amrfinder -u 2>&1 | fgrep 'Skipping update' 67 | - name: make github_binaries # the renamed tarball must keep the .tar.gz suffix so the upload glob below matches it 68 | run: | 69 | make github_binaries 70 | version=`cat version.txt` 71 | mv amrfinder_binaries_v$version.tar.gz amrfinder_binaries_macos_x86_64_v$version.tar.gz 72 | - uses: actions/upload-artifact@v4 73 | with: 74 | name: release-binary-macos-x86_64 # must be unique within the run: upload-artifact@v4 cannot upload to an already used artifact name 75 | path: amrfinder_binaries_macos_x86_64_v*.tar.gz 76 | -------------------------------------------------------------------------------- /.github/workflows/conda.yml: -------------------------------------------------------------------------------- 1 | name: Linux bioconda 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: '15 3 * * *' # 3:15am everyday 7 | repository_dispatch: 8 | types: [linux-bioconda-test, install-test] 9 | jobs: 10 | conda_linux_x86_64: 11 | runs-on: ubuntu-latest 12 | timeout-minutes: 30 13 | steps: 14 | - name: When was this run 15 | run: date 16 | - name: configure conda 17 | run: | 18 | . $CONDA/bin/activate 19 | conda config --add channels defaults 20 | conda config --add channels bioconda 21 | conda config --add channels conda-forge 22 | - name: install AMRFinderPlus 23 | run: | 24 | . 
$CONDA/bin/activate 25 | conda install --update-deps -c conda-forge -c bioconda -y ncbi-amrfinderplus 26 | amrfinder --version 27 | - name: download latest AMRFinderPlus database 28 | run: | 29 | source /usr/share/miniconda/bin/activate 30 | echo CONDA_PREFIX = ${CONDA_PREFIX} 31 | /usr/share/miniconda/bin/amrfinder --force_update 32 | - name: Software and DB version 33 | run: | 34 | source /usr/share/miniconda/bin/activate 35 | amrfinder --database_version 36 | - name: download tests 37 | run: | 38 | BASE_URL=https://raw.githubusercontent.com/${GITHUB_REPOSITORY}/master 39 | curl --silent -L -O ${BASE_URL}/test_amrfinder.sh 40 | - name: run tests 41 | run: | 42 | source /usr/share/miniconda/bin/activate 43 | echo CONDA_PREFIX = $CONDA_PREFIX 44 | bash ./test_amrfinder.sh -p 45 | conda_linux_aarch64: 46 | runs-on: ubuntu-24.04-arm 47 | timeout-minutes: 30 48 | steps: 49 | - uses: conda-incubator/setup-miniconda@v3 50 | with: 51 | auto-update-conda: true 52 | - name: When was this run 53 | run: date 54 | - name: configure conda 55 | shell: bash -el {0} 56 | run: | 57 | conda config --add channels defaults 58 | conda config --add channels bioconda 59 | conda config --add channels conda-forge 60 | - name: install AMRFinderPlus 61 | shell: bash -el {0} 62 | run: | 63 | conda install --update-deps -c conda-forge -c bioconda -y ncbi-amrfinderplus 64 | amrfinder --version 65 | - name: download latest AMRFinderPlus database 66 | shell: bash -el {0} 67 | run: amrfinder --force_update 68 | - name: Software and DB version 69 | shell: bash -el {0} 70 | run: amrfinder --database_version 71 | - name: download tests 72 | run: | 73 | BASE_URL=https://raw.githubusercontent.com/${GITHUB_REPOSITORY}/master 74 | curl --silent -L -O ${BASE_URL}/test_amrfinder.sh 75 | - name: run tests 76 | shell: bash -el {0} 77 | run: bash ./test_amrfinder.sh -p 78 | -------------------------------------------------------------------------------- /curl_easy.hpp: 
-------------------------------------------------------------------------------- 1 | // curl_easy.hpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 
24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * Dependencies: curl.{h,c} 30 | * 31 | * File Description: 32 | * curl_easy functions 33 | * 34 | */ 35 | 36 | 37 | #ifdef _MSC_VER 38 | #error "UNIX is required" 39 | #endif 40 | 41 | 42 | #include 43 | extern "C" 44 | { 45 | #include 46 | // Linking requires: -lcurl 47 | } 48 | 49 | #include "common.hpp" 50 | using namespace Common_sp; 51 | 52 | 53 | 54 | 55 | namespace CURL_sp 56 | { 57 | 58 | 59 | SoftwareVersion getLibVersion (); 60 | 61 | 62 | 63 | struct Curl 64 | { 65 | CURL* eh {nullptr}; 66 | 67 | 68 | Curl () 69 | : eh (curl_easy_init ()) 70 | { if (! eh) 71 | throw runtime_error ("Cannot initialize curl_easy"); 72 | // Override the libcurl system-wide default 73 | // PD-5495 / https://github.com/ncbi/amr/issues/170 74 | if (const char *env_ca_bundle = getenv ("CURL_CA_BUNDLE")) 75 | curl_easy_setopt (eh, CURLOPT_CAINFO, env_ca_bundle); 76 | } 77 | ~Curl () 78 | { curl_easy_cleanup (eh); } 79 | 80 | 81 | void download (const string &url, 82 | const string &fName); 83 | string read (const string &url); 84 | private: 85 | void process (const string &url, 86 | const string &error_msg_action); 87 | }; 88 | 89 | 90 | 91 | 92 | } // namespace 93 | 94 | 95 | -------------------------------------------------------------------------------- /test_prot.expected: -------------------------------------------------------------------------------- 1 | Protein id Contig id Start Stop Strand Element symbol Element name Scope Type Subtype Class Subclass Method Target length Reference sequence length % Coverage of reference % Identity to reference Alignment length Closest reference accession Closest reference name HMM accession HMM description Hierarchy node 2 | blaTEM-156 contig01 101 961 + blaTEM-156 class A beta-lactamase TEM-156 core AMR AMR BETA-LACTAM BETA-LACTAM ALLELEP 286 286 100.00 100.00 286 WP_061158039.1 class A 
beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase blaTEM-156 3 | blaPDC-114_blast contig02 1 1191 + blaPDC PDC family class C beta-lactamase core AMR AMR BETA-LACTAM CEPHALOSPORIN BLASTP 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NF000422.6 PDC family class C beta-lactamase blaPDC 4 | blaOXA-436_partial contig03 101 802 + blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF012161.0 class D beta-lactamase blaOXA-48_fam 5 | vanG contig04 101 1147 + vanG D-alanine--D-serine ligase VanG core AMR AMR GLYCOPEPTIDE VANCOMYCIN EXACTP 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG vanG 6 | aph3pp-Ib_partial_5p_neg contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDP 225 267 81.27 100.00 217 WP_001082319.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase aph(3'')-Ib 7 | sul2_partial_3p_neg contig09 715 1377 - sul2 sulfonamide-resistant dihydropteroate synthase Sul2 core AMR AMR SULFONAMIDE SULFONAMIDE PARTIALP 221 271 81.55 100.00 221 WP_001043265.1 sulfonamide-resistant dihydropteroate synthase Sul2 NA NA sul2 8 | blaTEM-internal_stop contig11 113 547 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 144 286 50.35 97.22 144 WP_000027057.1 broad-spectrum class A beta-lactamase TEM-1 NA NA blaTEM 9 | qacR-curated_blast contig12 71 637 + qacR multidrug-binding transcriptional regulator QacR plus STRESS BIOCIDE QUATERNARY AMMONIUM QUATERNARY AMMONIUM BLASTP 188 188 100.00 99.47 188 ADK23698.1 multidrug-binding transcriptional regulator QacR NA NA qacR 10 | emrD3-suppressed-in-vibrio contig13 1 1137 + emrD3 multidrug efflux MFS 
transporter EmrD-3 plus AMR AMR EFFLUX EFFLUX EXACTP 379 379 100.00 100.00 379 ABQ18953.1 multidrug efflux MFS transporter EmrD-3 NA NA emrD3 11 | pmrB_C84R contig14 1093 2181 + pmrB_C84R Escherichia colistin resistant PmrB core AMR POINT COLISTIN COLISTIN POINTP 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA pmrB 12 | nfsA_R15C_K141STOP contig16 1 423 + nfsA_R15C Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN POINTP 140 240 58.33 99.29 140 WP_089631889.1 nitroreductase NfsA NA NA nfsA 13 | stxA2a_prot contig18 279 1238 + stxA2 Shiga toxin Stx2 subunit A plus VIRULENCE VIRULENCE STX2 stxA2 EXACTP 319 319 100.00 100.00 319 TJA36680.1 Shiga toxin Stx2 subunit A NF041702.1 Shiga toxin Stx2 subunit A stxA2_acd 14 | stxB2a_prot contig18 1250 1519 + stxB2 Shiga toxin Stx2a subunit B plus VIRULENCE VIRULENCE STX2 stxB2a EXACTP 89 89 100.00 100.00 89 AAM90978.1 Shiga toxin Stx2a subunit B NF033660.0 Shiga toxin Stx2 subunit B stxB2a 15 | nimIJ_hmm contigX 1 501 + nimIJ NimIJ family 5-nitroimidazole reductase core AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 165 98.18 76.54 162 WP_005812825.1 NimIJ family 5-nitroimidazole reductase NF000262.1 NimIJ family 5-nitroimidazole reductase nimIJ 16 | -------------------------------------------------------------------------------- /columns.hpp: -------------------------------------------------------------------------------- 1 | // columns.hpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. 
The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * AMRFinderPlus column names 31 | * 32 | */ 33 | 34 | 35 | // Header names of the report columns, declared in the same order as the header row of the *.expected test reports 36 | // Columns 37 | // PD-5085 38 | constexpr const char* prot_colName = "Protein id"; // PD-2534 39 | constexpr const char* contig_colName = "Contig id"; 40 | // Target 41 | constexpr const char* start_colName = "Start"; 42 | constexpr const char* stop_colName = "Stop"; 43 | constexpr const char* strand_colName = "Strand"; 44 | // 45 | constexpr const char* genesymbol_colName = "Element symbol"; // PD-4924 46 | constexpr const char* elemName_colName = "Element name"; // PD-4910 47 | constexpr const char* scope_colName = "Scope"; // PD-2825 48 | // PD-1856 49 | constexpr const char* type_colName = "Type"; 50 | constexpr const char* subtype_colName = "Subtype"; 51 | constexpr const char* class_colName = "Class"; 52 | constexpr const char* subclass_colName = "Subclass"; 53 | // 54 | constexpr const char* method_colName = "Method"; 55 | constexpr const char* targetLen_colName = "Target length"; // was: "Element length" (temporarily) 56 | constexpr const char* refLen_colName = "Reference sequence length"; 57 | constexpr const char* refCov_colName = "% Coverage of reference"; 
58 | constexpr const char* refIdent_colName = "% Identity to reference"; 59 | constexpr const char* alignLen_colName = "Alignment length"; 60 | constexpr const char* closestRefAccession_colName = "Closest reference accession"; 61 | constexpr const char* closestRefName_colName = "Closest reference name"; 62 | constexpr const char* hmmAccession_colName = "HMM accession"; 63 | constexpr const char* hmmDescr_colName = "HMM description"; 64 | constexpr const char* hierarchyNode_colName = "Hierarchy node"; 65 | 66 | // NOTE(review): presumably the separator placed between the parts of a fusion gene symbol - confirm against the users of this constant 67 | // PD-5155 68 | constexpr const char* fusion_infix = "::"; // was: "/" 69 | // The *.expected test reports show this text in fields that have no value 70 | constexpr const char* na = "NA"; 71 | // NOTE(review): presumably a delimiter inside disruption identifiers - confirm against the users of this constant 72 | constexpr const char* disruption_delim = "_@"; 73 | 74 | // NOTE(review): the *.expected test reports show Method values such as PARTIALP and PARTIAL_CONTIG_ENDP, so a suffix is presumably appended elsewhere - confirm 75 | // Methods 76 | constexpr const char* frameshift_Name = "FRAMESHIFT"; 77 | constexpr const char* internalStop_Name = "INTERNAL_STOP"; 78 | constexpr const char* partial_Name = "PARTIAL"; 79 | constexpr const char* partialContigEnd_Name = "PARTIAL_CONTIG_END"; 80 | 81 | -------------------------------------------------------------------------------- /fasta2parts.cpp: -------------------------------------------------------------------------------- 1 | // fasta2parts.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 
17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Split the sequences of a FASTA file into chunks without breaking sequences 31 | * 32 | */ 33 | 34 | 35 | #undef NDEBUG 36 | 37 | #include "common.hpp" 38 | using namespace Common_sp; 39 | 40 | #include "common.inc" 41 | 42 | 43 | 44 | namespace 45 | { 46 | 47 | 48 | 49 | struct ThisApplication final : Application 50 | { 51 | ThisApplication () 52 | : Application ("Split the sequences a FASTA file into parts without breaking sequences") 53 | { 54 | addPositional ("in", "FASTA file"); 55 | addPositional ("parts_max", "Max. number of parts (>= 2)"); 56 | addPositional ("dir", "Output directory where chunks are saved named by integers starting with 1"); 57 | version = SVN_REV; 58 | } 59 | 60 | 61 | 62 | void body () const final 63 | { 64 | const string fName = getArg ("in"); 65 | const size_t parts_max = str2 (getArg ("parts_max")); 66 | const string dirName = getArg ("dir"); 67 | 68 | if (parts_max <= 1) 69 | throw runtime_error ("Number of parts must be >= 2"); 70 | 71 | 72 | const size_t chunk_min = (size_t) getFileSize (fName) / parts_max + 1; 73 | 74 | size_t part = 0; 75 | unique_ptr out; 76 | size_t seqSize = 0; 77 | LineInput f (fName); 78 | while (f. nextLine ()) 79 | { 80 | trimTrailing (f. line); 81 | if (f. line. empty ()) 82 | continue; 83 | if ( f. line [0] == '>' 84 | && seqSize >= chunk_min 85 | && part < parts_max 86 | ) 87 | { 88 | out. reset (); 89 | seqSize = 0; 90 | } 91 | if (! out. 
get ()) 92 | { 93 | part++; 94 | ASSERT (part <= parts_max); 95 | out. reset (new OFStream (dirName, toString (part), "")); 96 | } 97 | *out << f. line << endl; 98 | seqSize += f. line. size (); 99 | } 100 | } 101 | }; 102 | 103 | 104 | 105 | } // namespace 106 | 107 | 108 | 109 | int main (int argc, 110 | const char* argv[]) 111 | { 112 | ThisApplication app; 113 | return app. run (argc, argv); 114 | } 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /.github/workflows/mac_conda.yml: -------------------------------------------------------------------------------- 1 | name: Mac bioconda 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | - cron: '15 9 * * *' # 9:15am every day 6 | repository_dispatch: 7 | types: [mac-bioconda-test, install-test] 8 | jobs: 9 | conda_macos_aarch64: 10 | runs-on: macos-latest 11 | timeout-minutes: 30 12 | steps: 13 | - name: Install conda because built-in conda is borked 14 | run: | 15 | curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh 16 | bash ./Miniconda3-latest-MacOSX-x86_64.sh -b -p /Users/runner/miniconda3 17 | - name: Configure conda 18 | run: | 19 | source /Users/runner/miniconda3/bin/activate 20 | conda init 21 | # THIS DOESN'T WORK! Just install miniconda myself 22 | # . $CONDA/bin/activate 23 | conda config --add channels defaults 24 | conda config --add channels bioconda 25 | conda config --add channels conda-forge 26 | # permissions are messed up on the mac runner 27 | # Is this faster than installing miniconda myself? 
28 | # sudo chown -R 501:20 $CONDA 29 | conda update conda 30 | - name: Install AMRFinderPlus 31 | run: | 32 | source /Users/runner/miniconda3/bin/activate 33 | conda install --update-deps -c bioconda -c conda-forge -y ncbi-amrfinderplus 34 | - name: Download AMRFinderPlus database 35 | run: | 36 | source /Users/runner/miniconda3/bin/activate 37 | /Users/runner/miniconda3/bin/amrfinder -u 38 | - name: Software and DB version 39 | run: | 40 | source /Users/runner/miniconda3/bin/activate 41 | amrfinder --database_version 42 | - name: Download tests 43 | run: | 44 | BASE_URL=https://raw.githubusercontent.com/${GITHUB_REPOSITORY}/master 45 | curl --silent -L -O ${BASE_URL}/test_amrfinder.sh 46 | - name: Run tests 47 | run: | 48 | source /Users/runner/miniconda3/bin/activate 49 | echo CONDA_PREFIX = $CONDA_PREFIX 50 | bash ./test_amrfinder.sh -p 51 | conda_macos_x86_64: 52 | runs-on: macos-15-intel 53 | timeout-minutes: 30 54 | steps: 55 | - name: Install conda because built-in conda is borked 56 | run: | 57 | curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh 58 | bash ./Miniconda3-latest-MacOSX-x86_64.sh -b -p /Users/runner/miniconda3 59 | - name: Configure conda 60 | run: | 61 | source /Users/runner/miniconda3/bin/activate 62 | conda init 63 | # THIS DOESN'T WORK! Just install miniconda myself 64 | # . $CONDA/bin/activate 65 | conda config --add channels defaults 66 | conda config --add channels bioconda 67 | conda config --add channels conda-forge 68 | # permissions are messed up on the mac runner 69 | # Is this faster than installing miniconda myself? 
70 | # sudo chown -R 501:20 $CONDA 71 | conda update conda 72 | - name: Install AMRFinderPlus 73 | run: | 74 | source /Users/runner/miniconda3/bin/activate 75 | conda install --update-deps -c bioconda -c conda-forge -y ncbi-amrfinderplus 76 | - name: Download AMRFinderPlus database 77 | run: | 78 | source /Users/runner/miniconda3/bin/activate 79 | /Users/runner/miniconda3/bin/amrfinder -u 80 | - name: Software and DB version 81 | run: | 82 | source /Users/runner/miniconda3/bin/activate 83 | amrfinder --database_version 84 | - name: Download tests 85 | run: | 86 | BASE_URL=https://raw.githubusercontent.com/${GITHUB_REPOSITORY}/master 87 | curl --silent -L -O ${BASE_URL}/test_amrfinder.sh 88 | - name: Run tests 89 | run: | 90 | source /Users/runner/miniconda3/bin/activate 91 | echo CONDA_PREFIX = $CONDA_PREFIX 92 | bash ./test_amrfinder.sh -p 93 | -------------------------------------------------------------------------------- /test_amrfinder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | AMRFINDER_OPTS=" --plus --print_node --threads 6 " 4 | path=0 5 | no_download=0 6 | print_help=0 7 | while getopts "pnh" opt; do 8 | case $opt in 9 | p) path=1 ;; 10 | n) no_download=1 ;; 11 | h) print_help=1 ;; 12 | esac 13 | done 14 | 15 | if [ "$print_help" -gt 0 ] 16 | then 17 | echo "test_amrfinder.sh - Run tests" 18 | echo "Options: " 19 | echo " -p Test the amrfinder command in path instead of ./amrfinder" 20 | echo " -n Don't attempt to download fresh test data, use the test data in $PWD" 21 | echo " -h print this help message" 22 | exit 1 23 | fi 24 | 25 | # some color macros 26 | if [ "$TERM" == "" ] || [ "$TERM" == "dumb" ] || [ ! 
-t 1 ] 27 | then 28 | green='' # no colors 29 | red='' 30 | reset='' 31 | else 32 | green=`tput setaf 2` # Set green foreground color (code 2) 33 | red=`tput setaf 1` # Set red foreground color (code 1) 34 | reset=`tput sgr0` # Reset color to default 35 | fi 36 | 37 | if [ "$path" -gt 0 ] 38 | then 39 | echo "Testing amrfinder command in your \$PATH" 40 | which amrfinder 41 | AMRFINDER=amrfinder 42 | else 43 | echo "Testing ./amrfinder" 44 | AMRFINDER=./amrfinder 45 | fi 46 | 47 | if [ "$no_download" -gt 0 ] 48 | then 49 | echo "-n option detected, skipping download of test data and using tests in" 50 | echo "$PWD. Note test data may not match the latest database release." 51 | else 52 | echo Downloading fresh test data... 53 | BASE_URL=https://raw.githubusercontent.com/ncbi/amr/master 54 | curl --silent --location \ 55 | -O ${BASE_URL}/test_dna.fa \ 56 | -O ${BASE_URL}/test_prot.fa \ 57 | -O ${BASE_URL}/test_prot.gff \ 58 | -O ${BASE_URL}/test_both.expected \ 59 | -O ${BASE_URL}/test_dna.expected \ 60 | -O ${BASE_URL}/test_dna_mut_all.expected \ 61 | -O ${BASE_URL}/test_prot.expected \ 62 | -O ${BASE_URL}/test_disrupt.fa \ 63 | -O ${BASE_URL}/test_disrupt.expected 64 | 65 | if [ $? != 0 ] 66 | then 67 | echo "${red}WARNING: Could not download new test data.${reset}" 68 | echo "Will attempt to use test data in $PWD." 69 | echo "Test data included with installation may not match the latest database release." 70 | fi 71 | fi 72 | 73 | TESTS=0 74 | TEST_TEXT="" 75 | FAILURES=0 76 | 77 | function test_input_file { 78 | local test_base="$1" 79 | local options="$2" 80 | 81 | TESTS=$(( $TESTS + 1 )) 82 | 83 | if ! $AMRFINDER $options $AMRFINDER_OPTS > "$test_base.got" 84 | then 85 | echo "${red}not ok: $AMRFINDER returned a non-zero exit value indicating a failure of the software${reset}" 86 | echo "# $AMRFINDER $options $AMRFINDER_OPTS > $test_base.got" 87 | return 1 88 | else 89 | if ! 
diff -q "$test_base.expected" "$test_base.got" 90 | then 91 | echo "${red}not ok: $AMRFINDER returned output different from expected.${reset}" 92 | echo "# diff $test_base.expected $test_base.got" 93 | echo "# To approve run: " 94 | echo "# mv $test_base.got $test_base.expected" 95 | TEST_TEXT="$TEST_TEXT"$'\n'"${red}Failed $test_base${reset}"; 96 | echo "" 97 | return 1 98 | else 99 | echo "${green}ok:${reset} $test_base" 100 | return 0 101 | fi 102 | fi 103 | } 104 | 105 | test_input_file "test_prot" "-p test_prot.fa -g test_prot.gff -O Escherichia" 106 | FAILURES=$(( $? + $FAILURES )) 107 | 108 | test_input_file "test_dna" "-n test_dna.fa -O Escherichia --mutation_all test_dna_mut_all.got" 109 | FAILURES=$(( $? + $FAILURES )) 110 | 111 | test_input_file "test_both" "-n test_dna.fa -g test_prot.gff -p test_prot.fa -O Escherichia" 112 | FAILURES=$(( $? + $FAILURES )) 113 | 114 | test_input_file "test_disrupt" "-n test_disrupt.fa -O Klebsiella_pneumoniae" 115 | FAILURES=$(( $? + $FAILURES )) 116 | 117 | # gzipped input 118 | # gzip -c test_prot.fa > test_prot.fa.gz 119 | # gzip -c test_dna.fa > test_dna.fa.gz 120 | # gzip -c test_prot.gff > test_prot.gff.gz 121 | # test_input_file "test_prot" "-n test_dna.fa.gz -p test_prot.fa.gz -g test_prot.gff.gz --protein_output test_prot.gz.fa --nucleotide_output test_dna.gz.out" 122 | 123 | echo "Done." 124 | echo "$TEST_TEXT" 125 | echo "" 126 | if [ "$FAILURES" -gt 0 ] 127 | then 128 | PASSED=$(( $TESTS - $FAILURES )) 129 | echo "${red}not ok overall: $FAILURES out of $TESTS amrfinder tests failed${reset}" 130 | exit 1 131 | else 132 | echo "${green}ok: all $TESTS amrfinder tests passed ${reset}" 133 | echo "Success!" 
134 | fi 135 | -------------------------------------------------------------------------------- /mutate.cpp: -------------------------------------------------------------------------------- 1 | // mutate.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 
24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Mutate a FASTA file 31 | * 32 | */ 33 | 34 | 35 | #undef NDEBUG 36 | 37 | #include "common.hpp" 38 | using namespace Common_sp; 39 | #include "alignment.hpp" 40 | using namespace Alignment_sp; 41 | #include "seq.hpp" 42 | using namespace Seq_sp; 43 | 44 | #include "common.inc" 45 | 46 | 47 | 48 | namespace 49 | { 50 | 51 | 52 | 53 | struct ThisApplication final : Application 54 | { 55 | ThisApplication () 56 | : Application ("Mutate a FASTA file") 57 | { 58 | addPositional ("in", "Input FASTA file"); 59 | addPositional ("mut", "AmrMutation table: <1-based pos> "); 60 | addFlag ("aa", "Protein/DNA"); 61 | addFlag ("orig", "Add the original, non-mutated sequences"); 62 | 63 | version = SVN_REV; 64 | } 65 | 66 | 67 | 68 | void body () const final 69 | { 70 | const string inFName = getArg ("in"); 71 | const string mutFName = getArg ("mut"); 72 | const bool aa = getFlag ("aa"); 73 | const bool orig = getFlag ("orig"); 74 | 75 | 76 | map > id2mutation; 77 | { 78 | LineInput in (mutFName); 79 | Istringstream iss; 80 | while (in. nextLine ()) 81 | { 82 | iss. reset (in. line); 83 | string seqId; 84 | size_t pos; 85 | string mutation_std; 86 | string mutation_report; 87 | iss >> seqId >> pos >> mutation_std >> mutation_report; 88 | QC_ASSERT (! mutation_report. empty ()); 89 | AmrMutation mut (pos, mutation_std, mutation_report, "X", "X", "X"); 90 | mut. qc (); 91 | id2mutation [seqId] << std::move (mut); 92 | } 93 | } 94 | 95 | 96 | Multifasta fIn (inFName, aa); 97 | while (fIn. next ()) 98 | { 99 | unique_ptr seq; 100 | try 101 | { 102 | if (aa) 103 | { 104 | auto pep = new Peptide (fIn, 1000, false); // PAR 105 | pep->pseudo = true; 106 | seq. reset (pep); 107 | } 108 | else 109 | seq. 
reset (new Dna (fIn, 100000, false)); // PAR 110 | seq->qc (); 111 | if (orig) 112 | seq->saveText (cout); 113 | if (const Vector* muts = findPtr (id2mutation, seq->getId ())) 114 | for (const AmrMutation& mut : *muts) 115 | { 116 | unique_ptr seq1 (seq->copy ()); 117 | mut. apply (seq1->seq); 118 | if (! aa) 119 | strLower (seq1->seq); 120 | seq1->name += ":" + to_string (mut. pos_real + 1) + ":" + mut. geneMutation; 121 | seq1->qc (); 122 | seq1->saveText (cout); 123 | } 124 | } 125 | catch (const exception &e) 126 | { 127 | if (seq) 128 | throw runtime_error (seq->name + "\n" + e. what ()); 129 | throw; 130 | } 131 | } 132 | } 133 | }; 134 | 135 | 136 | 137 | } // namespace 138 | 139 | 140 | 141 | 142 | int main (int argc, 143 | const char* argv[]) 144 | { 145 | ThisApplication app; 146 | return app. run (argc, argv); 147 | } 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /amrfinder_index.cpp: -------------------------------------------------------------------------------- 1 | // amrfinder_index.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. 
The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Indexing of AMRFinder data 31 | * 32 | * Dependencies: NCBI BLAST, HMMer 33 | * 34 | * Release changes: see amrfinder.cpp 35 | * 36 | */ 37 | 38 | 39 | 40 | 41 | #ifdef _MSC_VER 42 | #error "UNIX is required" 43 | #endif 44 | 45 | #undef NDEBUG 46 | #include "common.hpp" 47 | using namespace Common_sp; 48 | 49 | #include "common.inc" 50 | 51 | 52 | 53 | 54 | namespace 55 | { 56 | 57 | 58 | 59 | // ThisApplication 60 | 61 | struct ThisApplication final : ShellApplication 62 | { 63 | ThisApplication () 64 | : ShellApplication ("Index the database for AMRFinder", true, false, true, true) 65 | { 66 | addPositional ("DATABASE", "Directory with AMRFinder database"); 67 | addKey ("blast_bin", "Directory for BLAST", "", '\0', "BLAST_DIR"); 68 | addKey ("hmmer_bin", "Directory for HMMer", "", '\0', "HMMER_DIR"); 69 | version = SVN_REV; 70 | } 71 | 72 | 73 | 74 | void shellBody () const final 75 | { 76 | string dbDir = getArg ("DATABASE"); 77 | string blast_bin = getArg ("blast_bin"); 78 | string hmmer_bin = getArg ("hmmer_bin"); 79 | 80 | addDirSlash (dbDir); 81 | addDirSlash (blast_bin); 82 | addDirSlash (hmmer_bin); 83 | 84 | 85 | const Verbose vrb (qc_on); 86 | 87 | 88 | if (! directoryExists (dbDir)) 89 | throw runtime_error ("Database directory " + dbDir + " does not exist"); 90 | 91 | if (! blast_bin. empty ()) 92 | prog2dir ["makeblastdb"] = blast_bin; 93 | findProg ("makeblastdb"); 94 | 95 | if (! hmmer_bin. empty ()) 96 | prog2dir ["hmmpress"] = hmmer_bin; 97 | findProg ("hmmpress"); 98 | 99 | 100 | // Cf. 
amrfinder_update.cpp 101 | StringVector dnaPointMuts; 102 | { 103 | LineInput f (dbDir + "taxgroup.tsv", verbose () ? 1 : 0); 104 | while (f. nextLine ()) 105 | { 106 | if (isLeft (f. line, "#")) 107 | continue; 108 | string taxgroup = f. line; 109 | const int n = str2 (rfindSplit (taxgroup, '\t')); 110 | const string gpipe = rfindSplit (taxgroup, '\t'); 111 | QC_ASSERT (n >= 0); 112 | QC_ASSERT (! contains (taxgroup, ' ')); 113 | if (n) 114 | dnaPointMuts << taxgroup; 115 | } 116 | } 117 | 118 | stderr. section ("Indexing"); 119 | exec (fullProg ("hmmpress") + " -f " + shellQuote (dbDir + "AMR.LIB") + " > /dev/null 2> " + tmp + "/hmmpress.err", tmp + "/hmmpress.err"); 120 | setSymlink (dbDir, tmp + "/db", true); 121 | exec (fullProg ("makeblastdb") + " -in " + tmp + "/db/AMRProt.fa" + " -dbtype prot -logfile " + tmp + "/makeblastdb.AMRProt", tmp + "/makeblastdb.AMRProt"); 122 | exec (fullProg ("makeblastdb") + " -in " + tmp + "/db/AMR_CDS.fa" + " -dbtype nucl -logfile " + tmp + "/makeblastdb.AMR_CDS", tmp + "/makeblastdb.AMR_CDS"); 123 | for (const string& dnaPointMut : dnaPointMuts) 124 | exec (fullProg ("makeblastdb") + " -in " + tmp + "/db/AMR_DNA-" + dnaPointMut + ".fa -dbtype nucl -logfile " + tmp + "/makeblastdb.AMR_DNA-" + dnaPointMut, tmp + "/makeblastdb.AMR_DNA-" + dnaPointMut); 125 | } 126 | }; 127 | 128 | 129 | 130 | } // namespace 131 | 132 | 133 | 134 | int main (int argc, 135 | const char* argv[]) 136 | { 137 | ThisApplication app; 138 | return app. 
run (argc, argv); 139 | } 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /curl_easy.cpp: -------------------------------------------------------------------------------- 1 | // curl_easy.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 
24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * curl_easy functions 31 | * 32 | */ 33 | 34 | 35 | #undef NDEBUG 36 | 37 | #include "curl_easy.hpp" 38 | using namespace Common_sp; 39 | 40 | #include "common.inc" 41 | 42 | 43 | 44 | namespace CURL_sp 45 | { 46 | 47 | 48 | 49 | SoftwareVersion getLibVersion () 50 | { 51 | if (const curl_version_info_data* ver = curl_version_info (CURLVERSION_NOW)) 52 | { 53 | const uint major = (ver->version_num >> 16) & 0xff; 54 | const uint minor = (ver->version_num >> 8) & 0xff; 55 | const uint patch = ver->version_num & 0xff; 56 | return SoftwareVersion (major, minor, patch); 57 | } 58 | return SoftwareVersion (); 59 | } 60 | 61 | 62 | 63 | 64 | // Curl 65 | 66 | namespace 67 | { 68 | size_t write_stream_cb (char* ptr, 69 | size_t size, 70 | size_t nMemb, 71 | void* userData) 72 | { 73 | ASSERT (ptr); 74 | ASSERT (size == 1); 75 | ASSERT (userData); 76 | 77 | OFStream& f = * static_cast (userData); 78 | FOR (size_t, i, nMemb) 79 | f << ptr [i]; 80 | 81 | return nMemb; 82 | } 83 | 84 | 85 | 86 | size_t write_string_cb (char* ptr, 87 | size_t size, 88 | size_t nMemb, 89 | void* userData) 90 | { 91 | ASSERT (ptr); 92 | ASSERT (size == 1); 93 | ASSERT (userData); 94 | 95 | string& s = * static_cast (userData); 96 | FOR (size_t, i, nMemb) 97 | s += ptr [i]; 98 | 99 | return nMemb; 100 | } 101 | } 102 | 103 | 104 | 105 | void Curl::download (const string &url, 106 | const string &fName) 107 | { 108 | ASSERT (! fName. 
empty ()); 109 | 110 | { 111 | OFStream f (fName); 112 | curl_easy_setopt (eh, CURLOPT_WRITEFUNCTION, write_stream_cb); 113 | curl_easy_setopt (eh, CURLOPT_WRITEDATA, & f); 114 | process (url, "download"); 115 | } 116 | 117 | IFStream f (fName); 118 | string s; 119 | f >> s; 120 | if (s == " (end);) 43 | #define FOR_REV(type,i,start) FOR_REV_END(type, i, 0, (start)) 44 | // FOR(type,i,a) and FOR_REV(type,i,a) iterate over the same i 45 | // FOR_START(type,i,a,b) and FOR_REV_END(type,i,a,b) iterate over the same i 46 | 47 | #define ITER(ContainerType,iter,container) \ 48 | for (ContainerType::iterator iter = (container). begin (); iter != (container). end (); iter++) 49 | #define CONST_ITER(ContainerType,iter,container) \ 50 | for (ContainerType::const_iterator iter = (container). begin (); iter != (container). end (); iter++) 51 | #define ITER_REV(ContainerType,iter,container) \ 52 | for (ContainerType::reverse_iterator iter = (container). rbegin (); iter != (container). rend (); iter++) 53 | #define CONST_ITER_REV(ContainerType,iter,container) \ 54 | for (ContainerType::const_reverse_iterator iter = (container). rbegin (); iter != (container). rend (); iter++) 55 | 56 | 57 | #define Case break; case 58 | #define Default break; default 59 | 60 | 61 | // Exceptions 62 | 63 | #include 64 | #include 65 | 66 | 67 | #ifdef _MSC_VER 68 | #define FUNC std::string (__FUNCSIG__) + ":\n" 69 | #else 70 | #define FUNC std::string (__PRETTY_FUNCTION__) + ":\n" 71 | #endif 72 | 73 | 74 | #define ERROR_MSG(msg) \ 75 | { if (! std::uncaught_exceptions ()) \ 76 | throwf (std::string ("\"" __FILE__ "\", line ") + to_string (__LINE__) + ", in " + (FUNC) + (msg)); \ 77 | exit (1); \ 78 | } 79 | #define ERROR ERROR_MSG ("ERROR") 80 | #define NOT_IMPLEMENTED ERROR_MSG ("NOT IMPLEMENTED") 81 | #define NEVER_CALL ERROR_MSG ("NEVER CALL") 82 | 83 | 84 | #define QC_ASSERT(cond) { errno = 0; if (! 
(cond)) ERROR_MSG (#cond) } 85 | 86 | 87 | namespace 88 | { 89 | const bool debugP = 90 | #ifdef NDEBUG 91 | false 92 | #else 93 | true 94 | #endif 95 | ; 96 | } 97 | 98 | 99 | // Logic errors 100 | #ifdef NDEBUG 101 | #define ASSERT(cond) 102 | #else 103 | #define ASSERT(cond) QC_ASSERT (cond) 104 | #endif 105 | 106 | 107 | #ifdef NDEBUG 108 | #define EXEC_ASSERT(cond) cond 109 | #else 110 | #define EXEC_ASSERT(cond) { const bool c_ = (cond); if (! c_) ERROR_MSG (#cond); } 111 | #endif 112 | 113 | 114 | #define IMPLY(a,b) { if (a) { ASSERT (b) }} 115 | #define QC_IMPLY(a,b) { if (a) { QC_ASSERT (b) }} 116 | 117 | #define ASSERT_EQ(x,y,delta) ASSERT (std::fabs((x) - (y)) <= (delta)) 118 | #define QC_ASSERT_EQ(x,y,delta) QC_ASSERT (std::fabs((x) - (y)) <= (delta)) 119 | 120 | 121 | 122 | #define MODULE_INIT static bool run = false; \ 123 | if (run) \ 124 | return true; \ 125 | run = true; 126 | 127 | 128 | #define LESS_PART(x,y,part) { if ((x).part < (y).part) return true; \ 129 | if ((y).part < (x).part) return false; } 130 | #define LESS_COMP(comp) { const ebool c = comp; if (c == etrue) return true; if (c == efalse) return false; } 131 | #define COMP_PART(x,y,part) { if ((x).part < (y).part) return -1; \ 132 | if ((y).part < (x).part) return 1; } 133 | 134 | 135 | 136 | #define PRINT(x) { Offset::newLn (cout); cout << #x << " = " << (x); } 137 | 138 | #define LOG(x) { if (logPtr) *logPtr << (x) << endl; } 139 | 140 | #define XSTR(s) STR(s) 141 | #define STR(s) #s 142 | 143 | 144 | -------------------------------------------------------------------------------- /test_prot.fa: -------------------------------------------------------------------------------- 1 | >blaTEM-156 2 | MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLS 3 | RVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNIGDHVTRL 4 | DRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGS 5 | 
RGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW 6 | >blaPDC-114_blast BLAST (100% length, but 1 mismatch) 7 | MRDTRFPCLCGIAASTLLFATTPAIAGEAPADRLKALVDAAVQPVMKANDIPGLAVAISLKGEPHYFSYGLASKEDGRRV 8 | TPETLFEIGSVSKTFTATLAGYALTQDKMRLDDRASQHWPALVGSRFDGISLLDLATYTAGGLPLQFPDSVQKDQAQIRD 9 | YYRQWQPTYAPGSQRLYSNPSIGLFGYLAARSLGQPFERLMEQQVFPALGLEQTHLDVPEAALAQYAQGYGKDDRPLRAG 10 | PGPLDAEGYGVKTSAADLLRFVDANLHPERLDRPWAQALDATHRGYYKVGDMTQGLGWEAYDWPISLKRLQAGNSTPMAL 11 | QPHRIARLPAPQALEGQRLLNKTGSTNGFGAYVAFVPGRDLGLVILANRNYPNAERVKIAYAILSGLEQQGKVPLKR 12 | >blaOXA-436_partial (Should be partial OXA-48 family 13 | MRALALSAVLMVTTMIGMPAVAKEWQENKSWNAHFSEHKTQGVVVLWNENTQQGFTNDLKRANQAFLPASTFKIPNSLIA 14 | LDLGVVKDEHQVFKWDGQTRDIAAWNRDHDLITAMKYSVVPVYQEFARQIGEARMSKMLHAFDYGNEDISGNLDSFWLDG 15 | GIRISATQQIAFLRKLYHNKLHVSERSQRIVKQAMLTEANADYIIRAKTGYSVRIEPKIGWWVGWIELDDNVW 16 | >vanG 17 | MQNKKIAVIFGGNSTEYEVSLQSASAVFENINTNKFDIIPIGITRSGEWYHYTGEKEKILNNTWFEDSKN 18 | LCPVVVSQNRSVKGFLEIASDKYRIIKVDLVFPVLHGKNGENGTLQGIFELAGIPVVGCDTLSSALCMDK 19 | DRAHKLVSLAGISVPKSVTFKRFNEEAAMKEIEANLTYPLFIKPVRAGSSFGITKVIEKQELDAAIELAF 20 | EHDTEVIVEETINGFEVGCAVLGIDELIVGRVDEIELSSGFFDYTEKYTLKSSKIYMPARIDAEAEKRIQ 21 | EAAVTIYKALGCSGFSRVDMFYTPSGEIVFNEVNTIPGFTSHSRYPNMMKGIGLSFSQMLDKLIGLYVE 22 | >gyrA T86I Campylobacter 23 | MENIFSKDSDIELVDIENSIKGSYLDYSMSVIIGRALPDARDGLKPVHRRILYAMQNDEAKSRTDFVKSARIVGAVIGRYHPHGDAAVYDALVRMAQDFSMRYPSITGQGNFGSIDGDSAAAMRYTEAKMSKLSHELLKD 24 | IDKDTVDFVPNYDGSESEPDVLPSRVPNLLLNGSSGIAVGMATNIPPHSLNELIDGLLYLLDSKDASLEE 25 | IMQFIKGPDFPTGGIIYGKKGIIEAYRTGRGRVKVRAKTHIEKKTNKDVIVIDELPYQTNKARLIEQIAE 26 | LVKEKQIEGISEVRDESNKEGIRVVIELKREAMSEIVLNNLFKSTTMESTFGVIMLAIHNKEPKIFSLLE 27 | LLNLFLTHRKTVIIRRTIFELQKARARAHILEGLKIALDNIDEVIALIKNSSDNNTARDSLVAKFGLSEL 28 | QANAILDMKLGRLTGLEREKIENELAELMKEIARLEEILKSETLLENLIRDELKEIRSKFDVPRITQIED 29 | DYDDIDIEDLIPNENMVVTITHRGYIKRVPSKQYEKQKRGGKGKLAVTTYDDDFIESFFTANTHDTLMFV 30 | TDRGQLYWLKVYKIPEGSRTAKGKAVVNLINLQAEEKIMAIIPTTDFDESKSLCFFTKNGIVKRTNLSEY 31 | 
QNIRSVGVRAINLDENDELVTAIIVQRDEDEIFATGGEENLENQEIENLDDENLENEESVSTQGKMLFAV 32 | TKKGMCIKFPLAKVREIGRVSRGVTAIKFKEKNDELVGAVVIENDEQEILSISAKGIGKRTNAGEYRLQS 33 | RGGKGVICMKLTEKTKDLISVVIVDETMDLMALTSSGKMIRVDMQSIRKAGRNTSGVIVVNVENDEVVSI 34 | AKCPKEENDEDELSDENFGLDL 35 | >50S_L22 Campylobacter 50S_L22:A103V 36 | MSKALIKFIRLSSTKARLIAREVQGMNAELAMASLKFMPNKGAKYIANAISSAVANGGFEANEVIVKSCRVDAAAVLKRF 37 | RPRARGSASRIRKPTSHILVEVVKAEVKAEEKKTVAKKTTTTKAPAKKTTSTKKATAKKES 38 | 39 | >nimIJ_hmm WP_027455679.1 NimIJ family nitroimidazole resistance protein [Prevotella brevis] 40 | MSEFREMRRKRQQLTDADSIAVLQKATSGTLALLGDNDYPYAVPISYVYDNGKLYFHSAMAGHKVDAIRR 41 | CNKASFCVIEKDDVRPEKYTTYFRSVIAFGRIEIVEDEAEKRTIMHMMGNRFNPNHDDALQKELESGLAH 42 | MLAIRMDIEHLTGKEAIELVRQRGGN 43 | 44 | >aph3pp-Ib_partial_5p_neg NZ_QKNQ01000001.1 Providencia rettgeri strain Pret_2032, whole genome shotgun sequence 2160922-2162737 150-1527 704-137 45 | IRKLKEPPLNRTNIFFGESHSDWLPVRGGESGDFVFRRGDGHAFAKIAPASRRGELAGERDRLIWLKGRGVACPEVINWQEEQEGACLVITAIPGVPAADLSGADLLKAWPSMGQQLGAVHSLSVDQCPFERRLSRMFGRAVDVVSRNAVNPDFLPDEDKSTPQLDLLARVERELPVRLDQERTDMVVCHGDPCMPNFMVDPKTLQCTGLIDLGRLGTADRYADL 46 | 47 | >sul2_partial_3p_neg NZ_QKNQ01000001.1 Providencia rettgeri strain Pret_2032, whole genome shotgun sequence 2160922-2162737 150-1377 2-667 48 | SSNPDAAPVSSDTEIERIAPVLDALKADGIPVSLDSYQPATQAYALSRGVAYLNDIRGFPDAAFYPQLAKSSAKLVVMHSVQDGQADRREAPAGDIMDHIAAFFDARIAALTGAGIKRNRLVLDPGMGFFLGAAPETSLSVLARFDELRLRFDLPVLLSVSRKSFLRALTGRGPGDVGAATLAAELAAAAGGADFIRTHEPRPLRDGLAVLAALKETARIR 49 | 50 | >blaTEM-internal_stop 51 | HFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVNYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSCVDAG 52 | QEQLGRSIHYSQNDLVEYSPVTEKHLTDGMTLRELCSAAITMSDNTAANLLLTTIGGPKELTA 53 | 54 | >qacR-curated_blast (ADK23698.1) 55 | MKLKDKILGVAKELFIKNGYNATTTGEIVKLSESSKGNLYYHFKTKENLFLEILNIEESKWQEQWKKEQI 56 | KCKTNREKFYLYNELSLTTEYYYPLQNAIIEFYTEYYKTNSINEKMNKLENKYIDAYHVIFKEGNLNGEW 57 | CINDVNAVSKIAANAVNGIVTFTHEQNINERIKLMNKFSQIFLNGLSK 58 | 59 | >arsR-suppressed-in-escherichia BAE77793.1 arsR_K-12 
As(III)-sensing_metalloregulatory_transcriptional_repressor_ArsR 60 | MSFLLPIQLFKILADETRLGIVLLLSELGELCVCDLCTALDQSQPKISRHLALLRESGLLLDRKQGKWVHYRLSPHIPAW 61 | AAKIIDEAWRCEQEKVQAIVRNLARQNCSGDSKNICS 62 | 63 | >emrD3-suppressed-in-vibrio ABQ18953.1 64 | MKTKPSLWLMVIMLMFPQIVETIYSPVLGSIARSFSVSDAQAAQTLSVYFLAFALGVVIWGVLADKWGRRPTMLVGLLIY 65 | GSATFIAMQTDSFTILMLARVFSAFGIAVGSVVTQTILRDVFSGHELRKVFSLMGIGISISPVLGMLLGGQLAFAGGHQL 66 | VFLALFFIALVLFVYNLCQLPETQQVKPKIALGCLVARMFKDRQVLLSALLVALYNVALFSYYQLGAFIFSDLGLDAEQF 67 | GYSGIALGLGSLIGSFLNKTLLAKQVPQRALLLLAALLLIMGTIGVSLTLDSIGFVAAMILVVIAYGMAIPNILSTALVE 68 | YKSQAGSAGALFGLLYYLLIGSGLALTGLVQRLGVVLLMCAGITLLATLARSSHIARLP 69 | 70 | >pmrB_C84R 71 | MHFLRRPISLRQRLILTIGAILLVFELISVFWLWHESTEQIQLFEQALRDNRNNDRHIMREIREAVASLIVPGVFMVSLTLFIRYQAVRRITRPLAELQKELEARTADNLTPIAIHSATLEIEAVVSALNDLVSRLTSTLDNERLFTADVAHELRTPLAGVRLHLELLAKTHHIDVAPLVARLDQMMESVSQLLQLARAGQSFSSGNYQHVKLLEDVILPSYDELSTMLDQRQQTLLLPESAADITVQGDATLLRMLLRNLVENAHRYSPQGSNIMIKLQEDDGAVMAVEDEGPGIDESKCGELSKAFVRMDSRYGGIGLGLSIVSRITQLHHGQFFLQNRQETSGTRAWVRLKKDQYVANQI 72 | 73 | >nfsA_R15C_K141STOP 74 | MTPTIELICGHRSICHFTDEPISEAQREAIINSARATSSSSFLQYSSIIRITDKALREELVTLTGGQKHVAQAAEFWVFCADFNRHLQICPDAQLGLAEQLLLGVVDTAMMAQNALIAAESLGLGGVYIGGLRNNIEAVT 75 | 76 | >stxA2a_prot EHY1938862.1 Shiga toxin Stx2a subunit A [Escherichia coli] 77 | MKCILFKWVLCLLLGFSSVSYSREFTIDFSTQQSYVSSLNSIRTEISTPLEHISQGTTSVSVINHTPPGSYFAVDIRGLD 78 | VYQARFDHLRLIIEQNNLYVAGFVNTATNTFYRFSDFTHISVPGVTTVSMTTDSSYTTLQRVAALERSGMQISRHSLVSS 79 | YLALMEFSGNTMTRDASRAVLRFVTVTAEALRFRQIQREFRQALSETAPVYTMTPGDVDLTLNWGRISNVLPEYRGEDGV 80 | RVGRISFNNISAILSTVAVILNCHHQGARSVRAVNEESQPECQITGDRPVIKINNTLWESNTAAAFLNRKSQFLYTTGK 81 | >stxB2a_prot EHY1938863.1 Shiga toxin Stx2a subunit B [Escherichia coli] 82 | MKKMFMAVLFALVSVNAMAADCAKGKIEFSKYNEDDTFTVKVDGKEYWTSRWNLQPLLQSAQLTGMTVTIKSSTCESGSG 83 | FAEVQFNND 84 | -------------------------------------------------------------------------------- /gff.hpp: -------------------------------------------------------------------------------- 
1 | // gff.hpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 
24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * .gff file reader 31 | * 32 | */ 33 | 34 | 35 | #ifndef GFF_HPP 36 | #define GFF_HPP 37 | 38 | 39 | #include "common.hpp" 40 | using namespace Common_sp; 41 | 42 | 43 | 44 | namespace GFF_sp 45 | { 46 | 47 | 48 | 49 | struct Locus 50 | { 51 | static constexpr size_t end_delta = 3; // PAR 52 | size_t lineNum {0}; 53 | // >= 1 54 | // 0 - unknown 55 | string contig; 56 | // DNA FASTA id 57 | size_t start {0}; 58 | size_t stop {0}; 59 | // start <= stop 60 | bool strand {false}; 61 | bool partial {false}; 62 | size_t contigLen {0}; 63 | // 0 <=> unknown 64 | bool crossOrigin {false}; 65 | string gene; 66 | string product; 67 | 68 | 69 | Locus (size_t lineNum_arg, 70 | const string &contig_arg, 71 | size_t start_arg, 72 | size_t stop_arg, 73 | bool strand_arg, 74 | bool partial_arg, 75 | size_t crossOriginSeqLen, 76 | string gene_arg, 77 | string product_arg); 78 | Locus () = default; 79 | 80 | 81 | bool empty () const 82 | { return contig. empty (); } 83 | void print (ostream &os) const 84 | { os << contig 85 | << ' ' << start 86 | << ' ' << stop 87 | << ' ' << strand 88 | << ' ' << contigLen 89 | << ' ' << crossOrigin 90 | << ' ' << gene 91 | << ' ' << product 92 | << endl; 93 | } 94 | bool operator< (const Locus& other) const; 95 | size_t size () const 96 | { return crossOrigin 97 | ? 
contigLen - stop + start 98 | : stop - start; 99 | } 100 | bool atContigStart () const 101 | { return start <= end_delta; } 102 | bool atContigStop () const 103 | { return contigLen && contigLen - stop <= end_delta;} 104 | }; 105 | 106 | 107 | 108 | struct Gff 109 | { 110 | enum Type {bakta, genbank, microscope, patric, pgap, prodigal, prokka, pseudomonasdb, rast, standard/*PD-4548*/}; 111 | // Alphabetic order 112 | static const StringVector names; 113 | static Type name2type (const string &name); 114 | }; 115 | 116 | 117 | 118 | struct Annot final : Root 119 | { 120 | // Protein GFF id is a function of attributes (column in GFF) 121 | map> prot2loci; 122 | map fasta2gff_prot; 123 | // empty() => protein FASTA id = protein GFF id 124 | 125 | 126 | Annot (const string &fName, 127 | Gff::Type gffType, 128 | bool protMatch, 129 | bool lcl); 130 | // GFF 131 | // https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md 132 | // https://github.com/ncbi/amr/issues/91 133 | // Input: protMatch: property of protein FASTA: 134 | // genbank: "[locus_tag=...]" in comment 135 | // microscope: ">|ID:|| 136 | // prodigal: "ID=" in comment 137 | // lcl: property of DNA FASTA: >lcl|... 138 | /* 139 | gffType protein GFF id 140 | ------- -------------- 141 | bakta ID= 142 | genbank locus_tag=[project:]acc // if pseudo or protMatch 143 | Name=[project:]acc // else 144 | microscope ID= 145 | patric ID=...;locus_tag=... 
146 | pgap Name= 147 | prodigal ID= 148 | prokka ID= 149 | pseudomonasdb Alias= (or locus=) 150 | rast ID= 151 | standard Name= 152 | 153 | ["]acc["] 154 | */ 155 | explicit Annot (const string &fName); 156 | // Bed 157 | // https://genome.ucsc.edu/FAQ/FAQformat.html#format1 158 | 159 | 160 | void load_fasta2gff_prot (const string &fName); 161 | // Input: fName: file is created by gff_check.cpp -gff_prot_match 162 | // Output: fasta2gff_prot 163 | void load_fasta2gff_dna (const string &fName); 164 | // Input: fName: file is created by gff_check.cpp -gff_dna_match 165 | // Output: Locus::contig 166 | const Set& findLoci (const string &fasta_prot) const; 167 | // Return: !empty() 168 | // throw if not found 169 | }; 170 | 171 | 172 | 173 | 174 | } 175 | 176 | 177 | 178 | #endif 179 | -------------------------------------------------------------------------------- /test_both.expected: -------------------------------------------------------------------------------- 1 | Protein id Contig id Start Stop Strand Element symbol Element name Scope Type Subtype Class Subclass Method Target length Reference sequence length % Coverage of reference % Identity to reference Alignment length Closest reference accession Closest reference name HMM accession HMM description Hierarchy node 2 | NA contig01 1 984 + blaTEMp_G162T Escherichia amoxicillin-clavulanic acid/piperacillin-tazobactam/ticarcillin-clavulanic acid resistant blaTEM promoter core AMR POINT BETA-LACTAM AMOXICILLIN-CLAVULANIC_ACID/PIPERACILLIN-TAZOBACTAM/TICARCILLIN-CLAVULANIC_ACID POINTN 984 1176 83.67 99.80 984 NZ_CP095603.1:148777-149952 blaTEM promoter region NA NA NA 3 | blaTEM-156 contig01 101 961 + blaTEM-156 class A beta-lactamase TEM-156 core AMR AMR BETA-LACTAM BETA-LACTAM ALLELEP 286 286 100.00 100.00 286 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase blaTEM-156 4 | blaPDC-114_blast contig02 1 1191 + blaPDC PDC family class C beta-lactamase core AMR AMR BETA-LACTAM 
CEPHALOSPORIN BLASTP 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NF000422.6 PDC family class C beta-lactamase blaPDC 5 | blaOXA-436_partial contig03 101 802 + blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF012161.0 class D beta-lactamase blaOXA-48_fam 6 | vanG contig04 101 1147 + vanG D-alanine--D-serine ligase VanG core AMR AMR GLYCOPEPTIDE VANCOMYCIN EXACTP 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG vanG 7 | NA contig04 1261 2391 + blaEC BlaEC family class C beta-lactamase plus AMR AMR BETA-LACTAM BETA-LACTAM BLASTX 377 377 100.00 98.14 377 WP_063610930.1 extended-spectrum class C beta-lactamase EC-15 NA NA blaEC 8 | NA contig08 1 700 + blaTEMp_G162T Escherichia amoxicillin-clavulanic acid/piperacillin-tazobactam/ticarcillin-clavulanic acid resistant blaTEM promoter core AMR POINT BETA-LACTAM AMOXICILLIN-CLAVULANIC_ACID/PIPERACILLIN-TAZOBACTAM/TICARCILLIN-CLAVULANIC_ACID POINTN 700 1176 59.52 99.71 700 NZ_CP095603.1:148777-149952 blaTEM promoter region NA NA NA 9 | NA contig08 101 700 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIAL_CONTIG_ENDX 200 286 69.93 100.00 200 WP_061158039.1 class A beta-lactamase TEM-156 NA NA blaTEM 10 | aph3pp-Ib_partial_5p_neg contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDP 225 267 81.27 100.00 217 WP_001082319.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase aph(3'')-Ib 11 | sul2_partial_3p_neg contig09 715 1377 - sul2 sulfonamide-resistant dihydropteroate synthase Sul2 core AMR AMR SULFONAMIDE SULFONAMIDE PARTIAL_CONTIG_ENDP 221 271 81.55 100.00 221 WP_001043265.1 sulfonamide-resistant 
dihydropteroate synthase Sul2 NA NA sul2 12 | NA contig10 486 1307 + blaOXA OXA-9 family oxacillin-hydrolyzing class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 274 274 100.00 99.64 274 WP_000722315.1 oxacillin-hydrolyzing class D beta-lactamase OXA-9 NA NA blaOXA-9_fam 13 | NA contig11 1 984 + blaTEMp_G162T Escherichia amoxicillin-clavulanic acid/piperacillin-tazobactam/ticarcillin-clavulanic acid resistant blaTEM promoter core AMR POINT BETA-LACTAM AMOXICILLIN-CLAVULANIC_ACID/PIPERACILLIN-TAZOBACTAM/TICARCILLIN-CLAVULANIC_ACID POINTN 984 1176 83.67 96.04 984 NZ_CP095603.1:148777-149952 blaTEM promoter region NA NA NA 14 | blaTEM-internal_stop contig11 113 547 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 144 286 50.35 97.22 144 WP_000027057.1 broad-spectrum class A beta-lactamase TEM-1 NA NA blaTEM 15 | qacR-curated_blast contig12 71 637 + qacR multidrug-binding transcriptional regulator QacR plus STRESS BIOCIDE QUATERNARY AMMONIUM QUATERNARY AMMONIUM BLASTP 188 188 100.00 99.47 188 ADK23698.1 multidrug-binding transcriptional regulator QacR NA NA qacR 16 | emrD3-suppressed-in-vibrio contig13 1 1137 + emrD3 multidrug efflux MFS transporter EmrD-3 plus AMR AMR EFFLUX EFFLUX EXACTP 379 379 100.00 100.00 379 ABQ18953.1 multidrug efflux MFS transporter EmrD-3 NA NA emrD3 17 | NA contig14 1 1089 + pmrB_C84R Escherichia colistin resistant PmrB core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA pmrB 18 | pmrB_C84R contig14 1093 2181 + pmrB_C84R Escherichia colistin resistant PmrB core AMR POINT COLISTIN COLISTIN POINTP 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA pmrB 19 | NA contig15 1 2905 + 23S_A2058T Escherichia azithromycin/erythromycin/telithromycin resistant 23S core AMR POINT MACROLIDE AZITHROMYCIN/ERYTHROMYCIN/TELITHROMYCIN POINTN 2905 2905 100.00 99.97 
2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA NA 20 | NA contig16 1 720 + nfsA_K141Ter Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA nfsA 21 | NA contig16 1 720 + nfsA_R15C Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA nfsA 22 | NA contig17 1 247 + ampC_T-14TGT Escherichia cephalosporin resistant ampC promoter core AMR POINT BETA-LACTAM CEPHALOSPORIN POINTN 247 245 100.00 99.19 247 NZ_CP041538.1:1149245-1149489 ampC/blaEC promoter region NA NA NA 23 | stxA2a_prot contig18 279 1238 + stxA2 Shiga toxin Stx2 subunit A plus VIRULENCE VIRULENCE STX2 stxA2 EXACTP 319 319 100.00 100.00 319 TJA36680.1 Shiga toxin Stx2 subunit A NF041702.1 Shiga toxin Stx2 subunit A stxA2_acd 24 | NA contig18 279 1516 + stx2a_operon stx2a operon plus VIRULENCE STX_TYPE STX2 STX2A COMPLETE 1238 NA NA 100.00 408 AAS07600.1,AAM90978.1 Shiga toxin stx2a NA NA stxA2a::stxB2a 25 | stxB2a_prot contig18 1250 1519 + stxB2 Shiga toxin Stx2a subunit B plus VIRULENCE VIRULENCE STX2 stxB2a EXACTP 89 89 100.00 100.00 89 AAM90978.1 Shiga toxin Stx2a subunit B NF033660.0 Shiga toxin Stx2 subunit B stxB2a 26 | nimIJ_hmm contigX 1 501 + nimIJ NimIJ family 5-nitroimidazole reductase core AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 165 98.18 76.54 162 WP_005812825.1 NimIJ family 5-nitroimidazole reductase NF000262.1 NimIJ family 5-nitroimidazole reductase nimIJ 27 | -------------------------------------------------------------------------------- /fasta_extract.cpp: -------------------------------------------------------------------------------- 1 | // fasta_extract.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United
States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Extract sequences out of a FASTA file 31 | * 32 | */ 33 | 34 | 35 | #undef NDEBUG 36 | 37 | #include "common.hpp" 38 | using namespace Common_sp; 39 | 40 | #include "common.inc" 41 | 42 | 43 | 44 | namespace 45 | { 46 | 47 | 48 | 49 | struct Segment 50 | // not circular 51 | { 52 | size_t start {0}; 53 | size_t stop {0}; 54 | bool strand {true}; 55 | // false <=> negative 56 | string genesymbol; 57 | string name; 58 | 59 | 60 | bool isDna () const 61 | { return stop; } 62 | size_t size () const 63 | { return stop - start; } 64 | void saveText (ostream &os) const 65 | { os << start 66 | << '\t' << stop 67 | << '\t' << strand 68 | << '\t' << genesymbol 69 | << '\t' << name 70 | << endl; 71 | } 72 | }; 73 | 74 | 75 | 76 | char complementaryNucleotide (char wildNucleotide) 77 | { 78 | char r = ' '; 79 | switch (toLower (wildNucleotide)) 80 | { 
81 | case 'a': r = 't'; break; 82 | case 'c': r = 'g'; break; 83 | case 'g': r = 'c'; break; 84 | case 't': r = 'a'; break; 85 | case 'm': r = 'k'; break; 86 | case 'r': r = 'y'; break; 87 | case 'w': r = 'w'; break; 88 | case 's': r = 's'; break; 89 | case 'y': r = 'r'; break; 90 | case 'k': r = 'm'; break; 91 | case 'v': r = 'b'; break; 92 | case 'h': r = 'd'; break; 93 | case 'd': r = 'h'; break; 94 | case 'b': r = 'v'; break; 95 | case 'n': r = 'n'; break; 96 | case '-': r = '-'; break; 97 | default: 98 | throw runtime_error ("Bad nucleotide " + to_string (wildNucleotide)); 99 | } 100 | if (isupper (wildNucleotide)) 101 | r = toUpper (r); 102 | 103 | return r; 104 | } 105 | 106 | 107 | 108 | bool process (const string &id, 109 | string &seq, 110 | const map> &id2segments) 111 | { 112 | if (id. empty ()) 113 | return false; 114 | const Vector* segments = findPtr (id2segments, id); 115 | if (! segments) 116 | return false; 117 | 118 | replaceStr (seq, "-", ""); 119 | QC_ASSERT (! seq. empty ()); 120 | 121 | for (Segment& seg : var_cast (*segments)) 122 | { 123 | cout << '>' << id; 124 | if (seg. isDna ()) 125 | { 126 | QC_ASSERT (seg. start <= seq. size ()); 127 | minimize (seg. stop, seq. size ()); 128 | QC_ASSERT (seg. start < seg. stop); 129 | cout << ':' << seg. start + 1 << '-' << seg. stop << ' ' << "strand:" << (seg. strand ? '+' : '-'); 130 | } 131 | cout << ' ' << seg. genesymbol << ' ' << seg. name << endl; 132 | string seq1 (seq); 133 | if (seg. isDna ()) 134 | { 135 | ASSERT (seg. stop <= seq1. size ()); 136 | seq1 = seq1. substr (seg. start, seg. size ()); 137 | if (! seg. strand) 138 | { 139 | reverse (seq1); 140 | for (char &c : seq1) 141 | c = complementaryNucleotide (c); 142 | } 143 | //strLower (seq1); // Letter case can indicate nucleotide quality 144 | } 145 | //else 146 | //strUpper (seq1); 147 | constexpr size_t line_len = 60; // PAR 148 | for (size_t i = 0; i < seq1. size (); i += line_len) 149 | cout << seq1. 
substr (i, line_len) << endl; 150 | } 151 | 152 | return true; 153 | } 154 | 155 | 156 | 157 | struct ThisApplication final : Application 158 | { 159 | ThisApplication () 160 | : Application ("Extract sequences out of a FASTA file") 161 | { 162 | addPositional ("fasta", "FASTA file"); 163 | addPositional ("target", "Target identifiers in the FASTA file to extract.\n\ 164 | Line format for amino acid sequences : \n\ 165 | Line format for nucleotide sequences : =1)> = start)> \ 166 | "); 167 | addFlag ("aa", "Amino acid sequenes, otherwise nucleotide"); 168 | version = SVN_REV; 169 | } 170 | 171 | 172 | 173 | void body () const final 174 | { 175 | const string fName = getArg ("fasta"); 176 | const string targetFName = getArg ("target"); 177 | const bool aa = getFlag ("aa"); 178 | 179 | 180 | map> id2segments; 181 | { 182 | LineInput f (targetFName); 183 | string id; 184 | Istringstream iss; 185 | while (f. nextLine ()) 186 | { 187 | iss. reset (f. line); 188 | Segment seg; 189 | iss >> id; 190 | if (! aa) 191 | { 192 | char strand = '\0'; 193 | iss >> seg. start >> seg. stop >> strand; 194 | QC_ASSERT (seg. start); 195 | QC_ASSERT (seg. start <= seg. stop); 196 | seg. start--; 197 | QC_ASSERT ( strand == '+' 198 | || strand == '-' 199 | ); 200 | seg. strand = (strand == '+'); 201 | } 202 | iss >> seg. genesymbol; 203 | seg. name = f. line. substr ((size_t) iss. tellg ()); 204 | trim (seg. name); 205 | QC_ASSERT (aa == ! seg. isDna ()); 206 | id2segments [id] << std::move (seg); 207 | } 208 | } 209 | if (verbose ()) 210 | for (const auto& it : id2segments) 211 | { 212 | cout << it. first << ": " << endl; 213 | for (const Segment& seg : it. second) 214 | { 215 | cout << " "; 216 | seg. saveText (cout); 217 | } 218 | } 219 | if (id2segments. empty ()) 220 | return; 221 | 222 | 223 | size_t processed = 0; 224 | { 225 | LineInput f (fName); 226 | string id; 227 | string seq; 228 | while (f. nextLine ()) 229 | { 230 | trimTrailing (f. line); 231 | if (f. line. 
empty ()) 232 | continue; 233 | if (f. line [0] == '>') 234 | { 235 | processed += process (id, seq, id2segments); 236 | size_t pos = 1; 237 | while (pos < f. line. size () && ! isspace (f. line [pos])) 238 | pos++; 239 | id = f. line. substr (1, pos - 1); 240 | seq. clear (); 241 | } 242 | else 243 | seq += f. line; 244 | } 245 | processed += process (id, seq, id2segments); 246 | } 247 | if (processed != id2segments. size ()) 248 | throw runtime_error ("Requested identifiers: " + to_string (id2segments. size ()) + ", but processed: " + to_string (processed)); 249 | // Assumed: no duplicate identifiers in FASTA 250 | } 251 | }; 252 | 253 | 254 | 255 | } // namespace 256 | 257 | 258 | 259 | int main (int argc, 260 | const char* argv[]) 261 | { 262 | ThisApplication app; 263 | return app. run (argc, argv); 264 | } 265 | 266 | 267 | 268 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | # PUBLIC DOMAIN NOTICE This software/database is "United States Government 3 | # Work" under the terms of the United States Copyright Act. It was written as 4 | # part of the authors' official duties for the United States Government and 5 | # thus cannot be copyrighted. This software/database is freely available to the 6 | # public for use without a copyright notice. Restrictions cannot be placed on 7 | # its present or future use. 8 | # 9 | # Although all reasonable efforts have been taken to ensure the accuracy and 10 | # reliability of the software and data, the National Center for Biotechnology 11 | # Information (NCBI) and the U.S. Government do not and cannot warrant the 12 | # performance or results that may be obtained by using this software or data. 13 | # NCBI, NLM, and the U.S. 
Government disclaim all warranties as to performance, 14 | # merchantability or fitness for any particular purpose. 15 | # 16 | # In any work or product derived from this material, proper attribution of the 17 | # authors as the source of the software or data should be made, using: 18 | # https://ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder/ as the 19 | # citation. 20 | ############################################################################### 21 | 22 | # the SVNREV is set automatically here for convenience, 23 | # but when actually building we should override it like: 24 | # make all SVNREV=-D\'SVN_REV=\"$VERSION\"\' or use 25 | # a version.txt file 26 | ifeq ($(wildcard version.txt),) 27 | VERSION_STRING := $(shell git describe --tags) 28 | else 29 | VERSION_STRING := $(shell cat version.txt) 30 | endif 31 | SVNREV := -D'SVN_REV="$(VERSION_STRING)"' 32 | 33 | INSTALL=install 34 | 35 | # make it possible to hard define a database directory 36 | # Define default paths 37 | # This is a little convoluted because I broke things and don't want 38 | # to change two different ways of defining the paths. This could 39 | # be simplified in a later release 40 | PREFIX ?= /usr/local 41 | ifneq '$(INSTALL_DIR)' '' 42 | bindir=$(INSTALL_DIR) 43 | endif 44 | bindir ?= $(PREFIX)/bin 45 | ifneq '$(CONDA_DB_DIR)' '' 46 | DBDIR := -D'CONDA_DB_DIR="$(CONDA_DB_DIR)"' 47 | endif 48 | ifneq '$(DEFAULT_DB_DIR)' '' 49 | DBDIR := -D'CONDA_DB_DIR="$(DEFAULT_DB_DIR)"' 50 | endif 51 | 52 | # for testing database updates using the staging update URL (build with: make TEST_UPDATE=1) 53 | ifdef TEST_UPDATE 54 | TEST_UPDATE_DB := '-D TEST_UPDATE' 55 | endif 56 | 57 | # detect system architecture and set appropriate flags 58 | # this is probably not the best way (e.g.
M1 Mac would be arm64) 59 | # but it works for Nvidia Jetson boards (aarch64) 60 | ARCH := $(shell uname -m) 61 | OS := $(shell uname -s) 62 | # "hack": if arm64 we can set to aarch64 63 | # as AArch64 and ARM64 refer to the same thing 64 | # this should build for Mac M1 and other arm64 chips 65 | ifeq ($(ARCH),arm64) 66 | ARCH := aarch64 67 | endif 68 | # report detected OS and arch in stdout 69 | $(info Dectected architecture: $(OS) $(ARCH)) 70 | # set CFLAGS based on arch 71 | ifeq ($(ARCH),aarch64) 72 | # set arm CFLAGS 73 | CPPFLAGS = -std=gnu++17 -pthread --signed-char -falign-jumps -fno-math-errno -O3 74 | else 75 | # set x86_64 CFLAGS 76 | CPPFLAGS = -std=gnu++17 -pthread -malign-double -fno-math-errno -O3 77 | endif 78 | # was: -std=gnu++14 79 | 80 | CXX=g++ 81 | COMPILE.cpp= $(CXX) $(CPPFLAGS) $(SVNREV) $(DBDIR) $(TEST_UPDATE_DB) -c 82 | 83 | 84 | .PHONY: all clean install release stxtyper test 85 | 86 | BINARIES= amr_report amrfinder amrfinder_index amrfinder_update fasta_check \ 87 | fasta_extract fasta2parts gff_check dna_mutation mutate disruption2genesymbol 88 | 89 | all: $(BINARIES) stxtyper 90 | 91 | release: clean 92 | svnversion .
> version.txt 93 | make all 94 | 95 | common.o: common.hpp common.inc 96 | curl_easy.o: curl_easy.hpp common.hpp common.inc 97 | gff.o: gff.hpp common.hpp common.inc 98 | alignment.o: alignment.hpp seq.hpp common.hpp common.inc 99 | seq.o: seq.hpp graph.hpp common.hpp common.inc 100 | 101 | amr_report.o: common.hpp common.inc gff.hpp alignment.hpp tsv.hpp seq.hpp columns.hpp version.txt 102 | amr_reportOBJS=amr_report.o common.o gff.o alignment.o seq.o graph.o 103 | amr_report: $(amr_reportOBJS) 104 | $(CXX) $(LDFLAGS) -o $@ $(amr_reportOBJS) 105 | 106 | amrfinder.o: common.hpp common.inc gff.hpp seq.hpp tsv.hpp columns.hpp version.txt 107 | amrfinderOBJS=amrfinder.o common.o gff.o tsv.o 108 | amrfinder: $(amrfinderOBJS) 109 | $(CXX) $(LDFLAGS) -o $@ $(amrfinderOBJS) -pthread $(DBDIR) 110 | 111 | amrfinder_update.o: common.hpp common.inc curl_easy.hpp version.txt 112 | amrfinder_updateOBJS=amrfinder_update.o common.o curl_easy.o 113 | amrfinder_update: $(amrfinder_updateOBJS) 114 | @if [ "$(TEST_UPDATE)" != "" ] ; \ 115 | then \ 116 | touch amrfinder_update.cpp ;\ 117 | fi # make sure the next make command rebuilds amrfinder_update 118 | $(CXX) $(LDFLAGS) -o $@ $(amrfinder_updateOBJS) -lcurl 119 | 120 | amrfinder_index.o: common.hpp common.inc version.txt 121 | amrfinder_indexOBJS=amrfinder_index.o common.o 122 | amrfinder_index: $(amrfinder_indexOBJS) 123 | $(CXX) $(LDFLAGS) -o $@ $(amrfinder_indexOBJS) 124 | 125 | fasta_check.o: common.hpp common.inc version.txt 126 | fasta_checkOBJS=fasta_check.o common.o 127 | fasta_check: $(fasta_checkOBJS) 128 | $(CXX) $(LDFLAGS) -o $@ $(fasta_checkOBJS) 129 | 130 | fasta_extract.o: common.hpp common.inc version.txt 131 | fasta_extractOBJS=fasta_extract.o common.o 132 | fasta_extract: $(fasta_extractOBJS) 133 | $(CXX) $(LDFLAGS) -o $@ $(fasta_extractOBJS) 134 | 135 | fasta2parts.o: common.hpp common.inc version.txt 136 | fasta2partsOBJS=fasta2parts.o common.o 137 | fasta2parts: $(fasta2partsOBJS) 138 | $(CXX) $(LDFLAGS) -o $@ 
$(fasta2partsOBJS) 139 | 140 | gff_check.o: common.hpp common.inc gff.hpp version.txt 141 | gff_checkOBJS=gff_check.o common.o gff.o 142 | gff_check: $(gff_checkOBJS) 143 | $(CXX) $(LDFLAGS) -o $@ $(gff_checkOBJS) 144 | 145 | dna_mutation.o: common.hpp common.inc alignment.hpp seq.hpp tsv.hpp columns.hpp version.txt 146 | dna_mutationOBJS=dna_mutation.o common.o alignment.o seq.o graph.o 147 | dna_mutation: $(dna_mutationOBJS) 148 | $(CXX) $(LDFLAGS) -o $@ $(dna_mutationOBJS) 149 | 150 | mutate.o: common.hpp common.inc alignment.hpp seq.hpp version.txt 151 | mutateOBJS=mutate.o common.o alignment.o seq.o graph.o 152 | mutate: $(mutateOBJS) 153 | $(CXX) -o $@ $(mutateOBJS) 154 | 155 | disruption2genesymbol.o: common.hpp common.inc seq.hpp version.txt 156 | disruption2genesymbolOBJS=disruption2genesymbol.o common.o alignment.o seq.o graph.o 157 | disruption2genesymbol: $(disruption2genesymbolOBJS) 158 | $(CXX) -o $@ $(disruption2genesymbolOBJS) 159 | 160 | stxtyper: 161 | $(MAKE) -C stx 162 | 163 | clean: 164 | rm -f *.o 165 | rm -f $(BINARIES) 166 | $(MAKE) -C stx clean 167 | 168 | install: 169 | @if [ ! -e $(DESTDIR)$(bindir) ]; \ 170 | then \ 171 | mkdir -p $(DESTDIR)$(bindir); \ 172 | fi 173 | $(INSTALL) $(BINARIES) $(DESTDIR)$(bindir) 174 | make -C stx install PREFIX=$(PREFIX) bindir=$(bindir) 175 | mkdir $(DESTDIR)$(bindir)/stx 176 | ln -s ../stxtyper $(DESTDIR)$(bindir)/stx/stxtyper 177 | 178 | # amrfinder binaries for github binary release 179 | GITHUB_FILE=amrfinder_binaries_v$(VERSION_STRING) 180 | GITHUB_FILES = test_amrfinder.sh test_*.expected test_*.fa test_*.gff $(BINARIES) 181 | 182 | github_binaries: 183 | @if [ ! 
-e version.txt ]; \ 184 | then \ 185 | echo >&2 "version.txt required to make a distribution file"; \ 186 | false; \ 187 | fi 188 | # first recompile amrfinder.o to pick up the new version info 189 | # and remove leaky NCBI paths 190 | make clean 191 | # make all CXX=/usr/bin/g++ LD_RUN_PATH= 192 | make all LD_RUN_PATH= 193 | mkdir $(GITHUB_FILE) 194 | echo $(VERSION_STRING) > $(GITHUB_FILE)/version.txt 195 | cp $(GITHUB_FILES) $(GITHUB_FILE) 196 | # make -C stx 197 | # make -C stx install INSTALL_DIR=../$(GITHUB_FILE)/stx CXX=/usr/bin/g++ LD_RUN_PATH= 198 | make -C stx install INSTALL_DIR=../$(GITHUB_FILE)/stx LD_RUN_PATH= 199 | cp stx/test_stxtyper.sh stx/version.txt $(GITHUB_FILE)/stx 200 | mkdir $(GITHUB_FILE)/stx/test 201 | cp -R stx/test/*.fa stx/test/*.expected $(GITHUB_FILE)/stx/test 202 | if [ -e $(GITHUB_FILE).tar.gz ]; then rm $(GITHUB_FILE).tar.gz; fi 203 | cd $(GITHUB_FILE); ln -s stx/stxtyper .; tar cvfz ../$(GITHUB_FILE).tar.gz * 204 | rm -r $(GITHUB_FILE)/* 205 | rmdir $(GITHUB_FILE) 206 | 207 | test: $(DISTFILES) Makefile *.cpp *.hpp *.inc test_dna.fa test_prot.fa test_prot.gff test_dna.fa test_dna.expected test_prot.expected test_both.expected 208 | make -C stx test 209 | # test the amrfinder in the current directory 210 | # with the data in the current directory 211 | ./test_amrfinder.sh -n 212 | -------------------------------------------------------------------------------- /fasta_check.cpp: -------------------------------------------------------------------------------- 1 | // fasta_check.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. 
This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Check the correctness of a FASTA file 31 | * 32 | */ 33 | 34 | 35 | #undef NDEBUG 36 | 37 | #include "common.hpp" 38 | using namespace Common_sp; 39 | 40 | #include "common.inc" 41 | 42 | 43 | 44 | namespace 45 | { 46 | 47 | 48 | 49 | struct ThisApplication final : Application 50 | { 51 | ThisApplication () 52 | : Application ("Check the correctness of a FASTA file. Exit with an error if it is incorrect. Print the number of sequences, max. sequence length and total sequence length") 53 | { 54 | addPositional ("in", "FASTA file"); 55 | addFlag ("aa", "Amino acid sequenes, otherwise nucleotide"); 56 | addFlag ("hyphen", "Hyphens are allowed"); 57 | addFlag ("ambig", "Ambiguous characters are allowed"); 58 | addKey ("ambig_max", "Max. 
number of ambiguous characters in sequences", "0"); 59 | addFlag ("stop_codon", "Stop codons ('*') in the protein sequence are allowed"); 60 | addKey ("len", "Output file with lines: "); 61 | addKey ("out", "Output FASTA file with some of the issues fixed"); 62 | version = SVN_REV; 63 | } 64 | 65 | 66 | 67 | void body () const final 68 | { 69 | const string fName = getArg ("in"); 70 | const bool aa = getFlag ("aa"); 71 | const bool hyphen = getFlag ("hyphen"); 72 | const bool ambig = getFlag ("ambig"); 73 | const size_t ambig_max = str2 (getArg ("ambig_max")); 74 | const bool stop_codon = getFlag ("stop_codon"); 75 | const string lenFName = getArg ("len"); 76 | const string outFName = getArg ("out"); 77 | 78 | QC_IMPLY (stop_codon, aa); 79 | 80 | 81 | unique_ptr lenF; 82 | if (! lenFName. empty ()) 83 | lenF. reset (new OFStream (lenFName)); 84 | unique_ptr outF; 85 | if (! outFName. empty ()) 86 | outF. reset (new OFStream (outFName)); 87 | size_t lines = 0; 88 | StringVector ids; ids. reserve (100000); // PAR 89 | size_t seqSize_max = 0; 90 | size_t seqSize_sum = 0; 91 | //string errorS; 92 | // One sequence 93 | size_t xs = 0; 94 | string header; 95 | string seq; 96 | 97 | auto processSeq = [&] () 98 | { 99 | if (! lines) 100 | return; 101 | ASSERT (! header. empty ()); 102 | ASSERT (! ids. empty ()); 103 | const string id (ids. back ()); 104 | if (aa && ! stop_codon) 105 | { 106 | while (! seq. empty () && seq. back () == '*') 107 | if (outF) 108 | seq. erase (seq. size () - 1); 109 | else 110 | throw runtime_error (id + ": '*' at the sequence end"); 111 | } 112 | if (seq. empty ()) 113 | throw runtime_error (id + ": Empty sequence"); 114 | bool skip = false; 115 | if (! ambig && xs > ambig_max) 116 | { 117 | if (outF) 118 | skip = true; 119 | else 120 | throw runtime_error (id + ": Too many ambiguities"); 121 | } 122 | if (skip) 123 | { LOG ("Skipping " + id); } 124 | else 125 | { 126 | if (lenF. get ()) 127 | *lenF << id << '\t' << seq. 
size () << endl; 128 | if (outF) 129 | *outF << header << endl << seq << endl; 130 | maximize (seqSize_max, seq. size ()); 131 | seqSize_sum += seq. size (); 132 | } 133 | xs = 0; 134 | header. clear (); 135 | seq. clear (); 136 | }; 137 | 138 | size_t nuc = 0; 139 | { 140 | LineInput f (fName); 141 | string id; 142 | while (f. nextLine ()) 143 | { 144 | trimTrailing (f. line); 145 | if (f. line. empty ()) 146 | continue; 147 | const string errorS ("File " + fName + ", " + f. lineStr (false) + ": "); 148 | if (f. line [0] == '>') 149 | { 150 | size_t pos = 1; 151 | while (pos < f. line. size () && ! isspace (f. line [pos])) 152 | pos++; 153 | id = f. line. substr (1, pos - 1); 154 | if (id. empty ()) 155 | throw runtime_error (errorS + "Empty sequence identifier"); 156 | #if 0 157 | if (id. size () > 1000) // PAR 158 | throw runtime_error (errorS + "Too long sequence identifier"); 159 | #endif 160 | for (const char c : id) 161 | if (! printable (c)) 162 | throw runtime_error (errorS + "Non-printable character in the sequence identifier: " + to_string ((int) c)); 163 | // BLAST: PD-4548 164 | if (! aa) 165 | { 166 | if (id. front () == '?') 167 | throw runtime_error (errorS + "Sequence identifier starts with '?'"); 168 | for (const char c : {',', ';', '.', '~'}) 169 | if (id. back () == c) 170 | throw runtime_error (errorS + "Sequence identifier ends with " + strQuote (string (1, c))); 171 | if (contains (id, "\\t")) 172 | throw runtime_error (errorS + "Sequence identifier contains '\\t'"); 173 | if (contains (id, ",,")) 174 | throw runtime_error (errorS + "Sequence identifier contains ',,'"); 175 | } 176 | processSeq (); 177 | header = f. line; 178 | ids << id; 179 | } 180 | else 181 | { 182 | if (! lines) 183 | throw runtime_error (errorS + "FASTA should start with '>'"); 184 | for (const char c : f. 
line) 185 | { 186 | bool skip = false; 187 | if (c == '-') 188 | if (hyphen) 189 | ; 190 | else 191 | { 192 | if (outF) 193 | skip = true; 194 | else 195 | throw runtime_error (errorS + "Hyphen in the sequence"); 196 | } 197 | else 198 | { 199 | const char c1 = toLower (c); 200 | if (aa) 201 | { 202 | if (! charInSet (c1, "acdefghiklmnpqrstvwyxbzjuoacdefghiklmnpqrstvwyxbzjuo*")) 203 | throw runtime_error (errorS + "Wrong amino acid character: (code = " + to_string ((int) c) + ") '" + c + "'"); 204 | if (charInSet (c1, "acgt")) 205 | nuc++; 206 | if (charInSet (c1, "xbzjuo")) 207 | xs++; 208 | } 209 | else 210 | { 211 | if (! charInSet (c1, "acgtbdhkmnrsvwyacgtbdhkmnrsvwy")) 212 | throw runtime_error (errorS + "Wrong nucleotide character: (code = " + to_string ((int) c) + ") '" + c + "'"); 213 | if (charInSet (c1, "bdhkmnrsvwy")) 214 | xs++; 215 | } 216 | } 217 | if (! skip) 218 | seq += c; 219 | } 220 | } 221 | lines++; 222 | } 223 | } 224 | processSeq (); // Last sequence 225 | if (! lines) 226 | throw runtime_error ("Empty file"); 227 | if (aa && (double) nuc / (double) seqSize_sum > 0.9) // PAR 228 | throw runtime_error ("Protein sequences looks like a nucleotide sequences"); 229 | 230 | ids. sort (); 231 | const size_t index = ids. findDuplicate (); 232 | if (index != no_index) 233 | throw runtime_error ("Duplicate identifier: " + ids [index]); 234 | 235 | cout << ids. size () << endl 236 | << seqSize_max << endl 237 | << seqSize_sum << endl; 238 | } 239 | }; 240 | 241 | 242 | 243 | } // namespace 244 | 245 | 246 | 247 | int main (int argc, 248 | const char* argv[]) 249 | { 250 | ThisApplication app; 251 | return app. 
run (argc, argv); 252 | } 253 | 254 | 255 | 256 | -------------------------------------------------------------------------------- /alignment.hpp: -------------------------------------------------------------------------------- 1 | // alignment.hpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Protein or DNA mutations library. 
31 | * 32 | */ 33 | 34 | 35 | #include "common.hpp" 36 | using namespace Common_sp; 37 | #include "seq.hpp" 38 | using namespace Seq_sp; 39 | 40 | 41 | 42 | 43 | namespace Alignment_sp 44 | { 45 | 46 | 47 | 48 | static constexpr char pm_delimiter = '_'; 49 | 50 | 51 | 52 | struct AmrMutation final : Root 53 | // Database 54 | { 55 | size_t pos_real {0}; 56 | // In whole reference sequence 57 | // = start of reference 58 | 59 | string geneMutation_std; 60 | // Function of geneMutation_std 61 | // Upper-case 62 | string reference; 63 | string allele; 64 | string gene; 65 | int pos_std {0}; 66 | size_t frameshift {no_index}; 67 | // Position of '*' after getStop() 68 | int frameshift_insertion {0}; 69 | 70 | // To be reported 71 | // !empty() 72 | string geneMutation; 73 | string classS; 74 | string subclass; 75 | string name; 76 | // Species binomial + resistance 77 | 78 | 79 | // Input: pos_arg: 1-based 80 | AmrMutation (size_t pos_real_arg, 81 | const string &geneMutation_std_arg, 82 | const string &geneMutation_arg, 83 | const string &class_arg, 84 | const string &subclass_arg, 85 | const string &name_arg); 86 | AmrMutation (size_t pos_arg, 87 | const string &geneMutation_std_arg) 88 | : AmrMutation (pos_arg, geneMutation_std_arg, geneMutation_std_arg, "X", "X", "X") 89 | {} 90 | AmrMutation () = default; 91 | private: 92 | static void parse (const string &geneMutation_std, 93 | string &reference, 94 | string &allele, 95 | string &gene, 96 | int &pos_std, 97 | size_t &frameshift, 98 | int &frameshift_insertion); 99 | public: 100 | void qc () const override; 101 | void saveText (ostream &os) const override 102 | { if (empty ()) 103 | os << "empty"; 104 | else 105 | os << pos_real + 1 106 | << ' ' << geneMutation 107 | << ' ' << frameshift_insertion 108 | << ' ' << name; 109 | } 110 | bool empty () const override 111 | { return geneMutation_std. empty (); } 112 | 113 | 114 | size_t getStop () const 115 | { return pos_real + reference. 
size (); } 116 | string wildtype () const 117 | { return gene + "_" + reference + to_string (pos_std + 1) + reference; } 118 | bool operator< (const AmrMutation &other) const; 119 | bool operator== (const AmrMutation &other) const 120 | { return geneMutation_std == other. geneMutation_std; } 121 | void apply (string &seq) const 122 | { if (pos_real >= seq. size ()) 123 | throw runtime_error ("AmrMutation position " + to_string (pos_real) + " is outside the sequence: " + seq); 124 | if (frameshift != no_index) 125 | throw runtime_error ("AmrMutation is a frameshift"); 126 | if (verbose ()) 127 | cerr << seq. substr (0, pos_real) 128 | << endl << allele 129 | << endl << seq. substr (pos_real + reference. size ()) 130 | << endl; 131 | seq = seq. substr (0, pos_real) + allele + seq. substr (pos_real + reference. size ()); 132 | } 133 | }; 134 | 135 | 136 | 137 | struct Alignment; 138 | 139 | 140 | 141 | struct SeqChange final : Root 142 | // Observation 143 | { 144 | const Alignment* al {nullptr}; 145 | // !nullptr 146 | //bool fromAllele {false}; 147 | 148 | // In alignment 149 | size_t start {0}; 150 | size_t len {0}; 151 | 152 | // No '-' 153 | string reference; 154 | // Insertion => start is artifically decremented and len is incremented => !empty() 155 | string allele; 156 | // empty() <=> frame shift 157 | 158 | size_t start_ref {0}; 159 | size_t stop_ref {0}; 160 | size_t start_target {0}; 161 | double neighborhoodMismatch {0.0}; 162 | // 0..1 163 | 164 | VectorPtr mutations; 165 | // !nullptr 166 | // Matching AmrMutation's 167 | 168 | Disruption disr; 169 | 170 | const SeqChange* replacement {nullptr}; 171 | // !nullptr => *this is replaced by *replacement 172 | 173 | 174 | SeqChange () = default; 175 | explicit SeqChange (const Alignment* al_arg/*, 176 | bool fromAllele_arg*/) 177 | : al (al_arg) 178 | //, fromAllele (fromAllele_arg) 179 | {} 180 | SeqChange (const Alignment* al_arg, 181 | const AmrMutation* mutation_arg) 182 | : al (al_arg) 183 | { mutations 
<< checkPtr (mutation_arg); } 184 | SeqChange (const Alignment* al_arg, 185 | const Disruption &disr_arg) 186 | : al (al_arg) 187 | , disr (disr_arg) 188 | {} 189 | void qc () const override; 190 | void saveText (ostream &os) const override 191 | { os << start + 1 192 | << ' ' << len 193 | << ' ' << strQuote (reference) << " -> " << strQuote (allele) 194 | << ' ' << start_ref + 1 << ".." << stop_ref 195 | << ' ' << start_target + 1 196 | << ' ' << neighborhoodMismatch; 197 | if (! disr. empty ()) 198 | disr. saveText (os); 199 | for (const AmrMutation* mutation : mutations) 200 | { os << ' ' ; 201 | mutation->saveText (os); 202 | } 203 | os << endl; 204 | } 205 | bool empty () const override 206 | { return ! len && disr. empty (); } 207 | 208 | 209 | bool hasMutation () const 210 | { return ! empty () && ! mutations. empty () && ! replacement; } 211 | bool hasFrameshift () const 212 | { return hasMutation () && mutations [0] -> frameshift != no_index; } 213 | bool isFrameshift () const 214 | { return reference. 
empty (); } 215 | string getMutationStr () const; 216 | size_t getStop () const 217 | { return start + len; } 218 | bool operator< (const SeqChange &other) const; 219 | bool better (const SeqChange &other) const; 220 | bool finish (const string &refSeq, 221 | size_t flankingLen); 222 | // Return: good match 223 | // Invokes: finishPos() 224 | bool finishPos (size_t flankingLen); 225 | // Return: good match 226 | private: 227 | void setSeq (); 228 | void setStartStopRef (); 229 | void setStartTarget (); 230 | void setNeighborhoodMismatch (size_t flankingLen); 231 | public: 232 | bool matchesMutation (const AmrMutation& mut) const; 233 | }; 234 | 235 | 236 | 237 | struct Alignment : Hsp 238 | { 239 | AmrMutation refMutation; 240 | // !empty() => qseq contains AmrMutation::allele 241 | //int ref_offset {0}; 242 | 243 | Vector seqChanges; 244 | 245 | 246 | Alignment (const string &line, 247 | bool qProt_arg, 248 | bool sProt_arg) 249 | : Hsp (line, qProt_arg, sProt_arg, qProt_arg || sProt_arg /*aProt*/, /*false*/ qProt_arg /*qStopCodon*/, true/*bacterialStartCodon*/) 250 | {} 251 | Alignment () = default; 252 | protected: 253 | void setSeqChanges (const Vector &refMutations, 254 | size_t flankingLen/*, 255 | bool allMutationsP*/); 256 | // Input: flankingLen: valid if > 0 257 | private: 258 | size_t refMutation2refSeq_pos (); 259 | // Return: no_index <=> refMutation is not detected 260 | public: 261 | void qc () const override; 262 | void saveText (ostream &os) const override 263 | { Hsp::saveText (os); 264 | if (! refMutation. empty ()) 265 | os << ' ' << refMutation; 266 | os << " #seqChanges:" << seqChanges. size (); 267 | } 268 | 269 | 270 | bool hasMutation () const 271 | { for (const SeqChange& seqChange : seqChanges) 272 | if (seqChange. hasMutation ()) 273 | return true; 274 | return false; 275 | } 276 | bool hasDeclarativeFrameshift () const 277 | { return seqChanges. size () == 1 && seqChanges [0]. 
hasFrameshift (); } 278 | }; 279 | 280 | 281 | 282 | 283 | } // namespace 284 | 285 | 286 | -------------------------------------------------------------------------------- /gff_check.cpp: -------------------------------------------------------------------------------- 1 | // gff_check.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 
24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Check the correctness of a GFF file 31 | * 32 | */ 33 | 34 | 35 | #undef NDEBUG 36 | 37 | #include "common.hpp" 38 | using namespace Common_sp; 39 | #include "gff.hpp" 40 | using namespace GFF_sp; 41 | 42 | #include "common.inc" 43 | 44 | 45 | 46 | namespace 47 | { 48 | 49 | 50 | const string locus_tagS ("[locus_tag="); 51 | const string prodigal_ID (" ID="); 52 | const string noFile ("emptystring"); 53 | 54 | 55 | 56 | struct ThisApplication final : Application 57 | { 58 | ThisApplication () 59 | : Application ("Check the correctness of a GFF file. Exit with an error if it is incorrect.") 60 | { 61 | // Input 62 | addPositional ("gff", "GFF file, if " + strQuote (noFile) + " then exit 0"); 63 | addKey ("gfftype", "Type of GFF file: " + Gff::names. toString (", "), "genbank"); 64 | addKey ("prot", "Protein FASTA file"); 65 | addKey ("dna", "DNA FASTA file"); 66 | addFlag ("lcl", "Nucleotide FASTA created by PGAP has \"lcl|\" prefix in accessions"); 67 | // Output 68 | addKey ("gff_prot_match", "Output file with pairs: \" \", \n\ 69 | where for genbank: is from " + strQuote (locus_tagS + "") + " in the protein FASTA comment, \n\ 70 | for microscope: is ID: from '>|ID:||', \n\ 71 | and for prodigal: is ID= in the protein FASTA comment\n\ 72 | "); 73 | addKey ("gff_dna_match", "Output file with pairs: \" \", where for pseudomonasdb: is the suffix after '|' in the DNA FASTA identifier"); 74 | version = SVN_REV; 75 | } 76 | 77 | 78 | 79 | void body () const final 80 | { 81 | const string gffName = getArg ("gff"); 82 | const Gff::Type type = Gff::name2type (getArg ("gfftype")); 83 | const string protFName = getArg ("prot"); 84 | const string dnaFName = getArg ("dna"); 85 | const string protMatchFName = getArg ("gff_prot_match"); 86 | const string dnaMatchFName = getArg ("gff_dna_match"); 87 | const bool 
lcl = getFlag ("lcl"); 88 | 89 | if (lcl && type != Gff::pgap) 90 | throw runtime_error ("-lcl requires type pgap"); 91 | 92 | 93 | if (isRight (gffName, noFile)) 94 | return; 95 | 96 | 97 | const Annot annot (gffName, type, ! protMatchFName. empty (), lcl); 98 | 99 | 100 | if (! protFName. empty ()) 101 | { 102 | StringVector gffIds; gffIds. reserve (10000); // PAR 103 | { 104 | OFStream outF; 105 | if (! protMatchFName. empty ()) 106 | outF. open ("", protMatchFName, ""); 107 | StringVector fastaIds; fastaIds. reserve (gffIds. capacity ()); 108 | LineInput f (protFName /*, 100 * 1024, 1*/); 109 | Istringstream iss; 110 | string line_orig; 111 | string fastaId; 112 | while (f. nextLine ()) 113 | { 114 | trimTrailing (f. line); 115 | if (f. line. empty ()) 116 | continue; 117 | if (f. line [0] != '>') 118 | continue; 119 | line_orig = f. line; 120 | iss. reset (f. line. substr (1)); 121 | fastaId. clear (); 122 | iss >> fastaId; 123 | QC_ASSERT (! fastaId. empty ()); 124 | ASSERT (! contains (fastaId, ' ')); 125 | fastaIds << fastaId; 126 | // gffId 127 | string gffId (fastaId); 128 | if (! protMatchFName. empty ()) 129 | switch (type) 130 | { 131 | case Gff::genbank: 132 | { 133 | const size_t pos = f. line. find (locus_tagS); 134 | if (pos == string::npos) 135 | throw runtime_error (__FILE__ ": " + strQuote (locus_tagS) + " is not found in: " + line_orig); 136 | gffId = f. line. substr (pos + locus_tagS. size ()); 137 | const size_t end = gffId. find (']'); 138 | if (end == string::npos) 139 | throw runtime_error (__FILE__ ": ']' is not found after " + strQuote (locus_tagS) + " in: " + line_orig); 140 | gffId. erase (end); 141 | } 142 | break; 143 | case Gff::microscope: 144 | { 145 | string s (std::move (gffId)); 146 | findSplit (s, '|'); 147 | gffId = findSplit (s, '|'); 148 | const string idS ("ID:"); 149 | if (! isLeft (gffId, idS)) 150 | throw runtime_error (__FILE__ ": 'ID:' is not found in: " + line_orig); 151 | gffId. erase (0, idS. 
size ()); 152 | } 153 | break; 154 | case Gff::prodigal: 155 | { 156 | const size_t pos = f. line. find (prodigal_ID); 157 | if (pos == string::npos) 158 | throw runtime_error (__FILE__ ": " + strQuote (prodigal_ID) + " is not found in: " + line_orig); 159 | gffId = f. line. substr (pos + prodigal_ID. size ()); 160 | const size_t end = gffId. find (';'); 161 | if (end == string::npos) 162 | throw runtime_error (__FILE__ ": ';' is not found after " + strQuote (prodigal_ID) + " in: " + line_orig); 163 | gffId. erase (end); 164 | } 165 | break; 166 | default: break; 167 | } 168 | // 169 | if (contains (gffId, ' ')) 170 | throw runtime_error (__FILE__ ": " + strQuote (gffId) + " contains space"); 171 | if (gffId. empty ()) 172 | throw runtime_error (__FILE__ ": No protein identifier in: " + line_orig); 173 | gffIds << gffId; 174 | if (outF. is_open ()) 175 | outF << fastaId << '\t' << gffId << endl; 176 | } 177 | const size_t n = fastaIds. size (); 178 | fastaIds. sort (); 179 | fastaIds. uniq (); 180 | if (fastaIds. size () != n) 181 | throw runtime_error (__FILE__ ": Duplicate FASTA ids"); 182 | gffIds. sort (); 183 | { 184 | const string* s_prev = nullptr; 185 | for (const string& s : gffIds) 186 | { 187 | if (s_prev && *s_prev == s) 188 | throw runtime_error (__FILE__ ": GFF identifier " + strQuote (s) + " is not unique"); 189 | s_prev = & s; 190 | } 191 | } 192 | ASSERT (gffIds. size () == fastaIds. size ()); 193 | } 194 | if (verbose ()) 195 | cout << "# Proteins in GFF: " << annot. prot2loci. size () << endl; 196 | for (const string& seqid : gffIds) 197 | if (! contains (annot. prot2loci, seqid)) 198 | throw runtime_error (__FILE__ ": Protein FASTA id " + strQuote (seqid) + " is not in the GFF file"); 199 | #if 0 200 | for (const auto& it : annot. prot2loci) 201 | if (! gffIds. containsFast (it. first)) 202 | throw runtime_error (__FILE__ ": GFF protein id " + strQuote (it. first) + " is not in the protein FASTA file"); // pseudogene ?? 
203 | #endif 204 | } 205 | 206 | 207 | if (! dnaFName. empty ()) 208 | { 209 | StringVector contigIds; contigIds. reserve (10000); // PAR 210 | StringVector gffIds; gffIds. reserve (10000); // PAR 211 | { 212 | OFStream outF; 213 | if (! dnaMatchFName. empty ()) 214 | outF. open ("", dnaMatchFName, ""); 215 | LineInput f (dnaFName /*, 100 * 1024, 1*/); 216 | Istringstream iss; 217 | string contigId; 218 | while (f. nextLine ()) 219 | { 220 | trimTrailing (f. line); 221 | if (f. line. empty ()) 222 | continue; 223 | if (f. line [0] != '>') 224 | continue; 225 | iss. reset (f. line. substr (1)); 226 | contigId. clear (); 227 | iss >> contigId; 228 | ASSERT (! contains (contigId, ' ')); 229 | // gffId 230 | string gffId (contigId); 231 | if (! dnaMatchFName. empty ()) 232 | switch (type) 233 | { 234 | case Gff::pseudomonasdb: 235 | { 236 | const size_t pos = gffId. rfind ('|'); 237 | if (pos != string::npos) 238 | gffId. erase (0, pos + 1); 239 | } 240 | break; 241 | default: break; 242 | } 243 | // 244 | if (gffId. empty ()) 245 | throw runtime_error (__FILE__ ": No contig identifier in:\n" + f. line); 246 | if (lcl && ! isLeft (gffId, "lcl|")) 247 | throw runtime_error (__FILE__ ": Contig identifier does not start with " + strQuote ("lcl|") + ":\n" + f. line); 248 | gffIds << gffId; 249 | contigIds << contigId; 250 | if (outF. is_open ()) 251 | outF << contigId << '\t' << gffId << endl; 252 | } 253 | } 254 | ASSERT (contigIds. size () == gffIds. size ()); 255 | gffIds. sort (); 256 | { 257 | const string* s_prev = nullptr; 258 | for (const string& s : gffIds) 259 | { 260 | if (s_prev && *s_prev == s) 261 | throw runtime_error (__FILE__ ": DNA GFF identifier " + strQuote (s) + " is not unique"); 262 | s_prev = & s; 263 | } 264 | } 265 | contigIds. 
sort (); 266 | { 267 | const string* s_prev = nullptr; 268 | for (const string& s : contigIds) 269 | { 270 | if (s_prev && *s_prev == s) 271 | throw runtime_error (__FILE__ ": DNA contig identifier " + strQuote (s) + " is not unique"); 272 | s_prev = & s; 273 | } 274 | } 275 | for (const auto& it : annot. prot2loci) 276 | for (const Locus& cds : it. second) 277 | if (! gffIds. contains (cds. contig)) 278 | throw runtime_error (__FILE__ ": GFF contig id " + strQuote (cds. contig) + " is not in the DNA FASTA file"); 279 | } 280 | } 281 | }; 282 | 283 | 284 | 285 | } // namespace 286 | 287 | 288 | 289 | int main (int argc, 290 | const char* argv[]) 291 | { 292 | ThisApplication app; 293 | return app. run (argc, argv); 294 | } 295 | 296 | 297 | 298 | -------------------------------------------------------------------------------- /disruption2genesymbol.cpp: -------------------------------------------------------------------------------- 1 | // disruption2genesymbol.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 
19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Convert Disruption::genesymbol_raw() to a gene symbol 31 | * 32 | */ 33 | 34 | 35 | #undef NDEBUG 36 | 37 | #include "common.hpp" 38 | using namespace Common_sp; 39 | #include "seq.hpp" 40 | using namespace Seq_sp; 41 | 42 | #include "common.inc" 43 | 44 | 45 | 46 | namespace 47 | { 48 | 49 | 50 | constexpr char no_aa {'?'}; 51 | 52 | 53 | 54 | struct SymbolRaw final : Root 55 | { 56 | static constexpr size_t del_size {10}; // PAR 57 | 58 | // Input 59 | string contig; 60 | string prot; 61 | Disruption::Type type {Disruption::eNone}; 62 | // 0-based 63 | size_t qstart {no_index}; 64 | size_t qend {no_index}; 65 | // aa 66 | size_t sstart {no_index}; 67 | size_t send {no_index}; 68 | // bp 69 | // 70 | Strand strand {0}; 71 | bool stop {false}; 72 | string rest; 73 | 74 | // Output 75 | string ref; 76 | string allele; 77 | 78 | 79 | explicit SymbolRaw (const string &line) 80 | { 81 | string s; 82 | { 83 | istringstream iss (line); 84 | iss >> contig >> prot >> s; 85 | QC_ASSERT (! s. empty ()); 86 | ASSERT (! contig. empty ()); 87 | ASSERT (! prot. empty ()); 88 | 89 | constexpr size_t rest_size = 1024; 90 | char rest_ [rest_size]; 91 | iss. 
getline (rest_, rest_size); 92 | rest = rest_; 93 | } 94 | 95 | if (trimSuffix (s, Disruption::stopSuf)) 96 | stop = true; 97 | 98 | // strand 99 | { 100 | const string strandS = rfindSplit (s, '_'); 101 | if (strandS == "0") 102 | strand = -1; 103 | else if (strandS == "1") 104 | strand = 1; 105 | else 106 | throw runtime_error ("Unknown strand: " + strQuote (strandS)); 107 | } 108 | 109 | send = str2 (rfindSplit (s, '_')); 110 | sstart = str2 (rfindSplit (s, '_')); 111 | qend = str2 (rfindSplit (s, '_')); 112 | qstart = str2 (rfindSplit (s, '_')); 113 | QC_ASSERT (qstart <= qend); 114 | QC_ASSERT (sstart <= send); 115 | 116 | type = Disruption::name2type (s); 117 | QC_ASSERT (type != Disruption::eNone); 118 | QC_ASSERT (type != Disruption::eSmooth); 119 | } 120 | void saveText (ostream &os) const final 121 | { 122 | ASSERT (! ref. empty ()); 123 | os << contig // 0 124 | << '\t' << prot // 1 125 | << '\t'; 126 | if (verbose ()) 127 | os << '\t' << Disruption::typeNames [type] 128 | << '\t' << qstart 129 | << '\t' << qend 130 | << '\t' << sstart 131 | << '\t' << send 132 | << '\t' << (int) strand 133 | << '\t' << stop 134 | << '\t' << ref 135 | << '\t' << allele 136 | << '\t'; 137 | ASSERT (! contains (ref, '*')); 138 | string allele_ (allele); 139 | const bool alleleStop = trimSuffix (allele_, "*"); 140 | const size_t allele_size = allele_. size (); // Without stop codon 141 | //QC_IMPLY (type != Disruption::eFrameshift, alleleStop == stop); 142 | QC_IMPLY (stop, alleleStop); 143 | constexpr size_t display_max = 1/*reference aa*/ + 5; // PAR // PD-5395 144 | if (allele_size > display_max) 145 | allele_ = "ins"; 146 | if (alleleStop) 147 | allele_ += terminatorWord; 148 | ASSERT (! contains (allele_, '*')); 149 | // Standard gene symbol 150 | // 2 151 | if (ref. size () > display_max) 152 | os << ref. front () << qstart + 1 153 | << '_' << ref. back () << qstart + ref. 
size (); 154 | else 155 | os << ref << qstart + 1; 156 | switch (type) 157 | { 158 | case Disruption::eFrameshift: 159 | ASSERT (ref. size () == 1) 160 | ASSERT (! allele. empty ()); 161 | if (alleleStop && allele_size == 0) 162 | os << terminatorWord; 163 | else 164 | os << allele [0]; 165 | os << Disruption::typeNames [type]; 166 | if (alleleStop) 167 | os << terminatorWord << allele_size; 168 | break; 169 | case Disruption::eDeletion: // Or replacement 170 | if (allele_. empty ()) 171 | os << Disruption::typeNames [type]; 172 | else 173 | { 174 | os << allele_; 175 | if (allele_size > display_max) 176 | os << allele_size - 1/*reference aa*/; 177 | } 178 | break; 179 | case Disruption::eInsertion: 180 | ASSERT (ref. size () == 1); 181 | ASSERT (! allele_. empty ()); 182 | os << allele_; 183 | if (allele_size > display_max) 184 | os << allele_size - 1/*reference aa*/; 185 | break; 186 | default: 187 | break; 188 | } 189 | // 3 190 | os << '\t' 191 | // = 192 | // Opposite to SymbolRaw::SymbolRaw(line) 193 | << Disruption::typeNames [type] << '_' << qstart << '_' << qend << '_' << sstart << '_' << send << '_' << (strand == 1 ? 1 : 0); 194 | if (stop) 195 | os << Disruption::stopSuf; 196 | // 197 | os << '\t' << rest // 4 198 | << '\n'; 199 | } 200 | 201 | 202 | char contig2aa (const Dna &dna, 203 | size_t offset, 204 | Gencode gencode) const 205 | // Input: offset: from sstart/send 206 | // Return: no_aa <=> offset is outside dna 207 | { 208 | QC_ASSERT (send <= dna. seq. size ()); 209 | 210 | if (strand == 1) 211 | { 212 | const size_t i = sstart + offset * 3; 213 | if (i + 3 > send) 214 | return no_aa; 215 | return codon2aa (& dna. seq [i], gencode, false); 216 | } 217 | 218 | ASSERT (strand == -1); 219 | if (send < (offset + 1) * 3) 220 | return no_aa; 221 | const size_t i = send - (offset + 1) * 3; 222 | ASSERT (i + 3 <= dna. seq. size ()); 223 | if (i < sstart) 224 | return no_aa; 225 | string s (dna. seq. 
substr (i, 3)); 226 | reverseDna (s); 227 | return codon2aa (s. c_str (), gencode, false); 228 | } 229 | }; 230 | 231 | 232 | 233 | struct ThisApplication final : Application 234 | { 235 | static constexpr char id_delim {'|'}; 236 | 237 | 238 | ThisApplication () 239 | : Application ("Convert Disruption::genesymbol_raw() to standard gene symbols according to https://hgvs-nomenclature.org/stable/recommendations/protein/frameshift/.\n\ 240 | A stop codon is '" + string (terminatorWord) + "'.\n\ 241 | Print: where is inserted before " 242 | ) 243 | { 244 | addPositional ("nucl", "Input nucleotide FASTA file"); 245 | addPositional ("prot", "Input protein FASTA file"); 246 | addPositional ("tab", "Table with lines: > > "); 247 | addKey ("gencode", "NCBI genetic code for translated BLAST", "11"); 248 | addKey ("prot_id_pos", string ("Position of protein id in qseqid delimited by ") + id_delim + ", 1-based. 0 - use qseqid as a whole", "0"); 249 | } 250 | 251 | 252 | 253 | void body () const final 254 | { 255 | const string nuclFName = getArg ("nucl"); 256 | const string protFName = getArg ("prot"); 257 | const string tabFName = getArg ("tab"); 258 | const Gencode gencode = (Gencode) arg2uint ("gencode"); 259 | const size_t prot_id_pos = str2 (getArg ("prot_id_pos")); 260 | 261 | 262 | Vector symbolRaws; 263 | { 264 | LineInput f (tabFName); 265 | while (f. nextLine ()) 266 | symbolRaws << std::move (SymbolRaw (f. line)); 267 | } 268 | if (symbolRaws. empty ()) 269 | return; 270 | 271 | // SymbolRaw::allele 272 | { 273 | Multifasta fa (nuclFName, false); 274 | while (fa. next ()) 275 | { 276 | const Dna dna (fa, 100000/*PAR*/, true); 277 | dna. qc (); 278 | const string id (dna. getId ()); 279 | for (SymbolRaw& symbolRaw : symbolRaws) 280 | if (symbolRaw. contig == id) 281 | for (size_t offset = 0; ; offset++) 282 | { 283 | const char aa = symbolRaw. contig2aa (dna, offset, gencode); 284 | if (aa == no_aa) 285 | break; 286 | symbolRaw. 
allele += aa; 287 | if (aa == '*') 288 | break; 289 | } 290 | } 291 | } 292 | 293 | // SymbolRaw::{ref, allele for "del"} 294 | { 295 | Multifasta fa (protFName, true); 296 | while (fa. next ()) 297 | { 298 | const Peptide pep (fa, 1000/*PAR*/, true); 299 | pep. qc (); 300 | 301 | string id; 302 | const string id_whole (pep. getId ()); 303 | if (prot_id_pos) 304 | { 305 | const StringVector vec (id_whole, id_delim, true); 306 | if (prot_id_pos - 1 >= vec. size ()) 307 | throw runtime_error ("Protein identifier position " + to_string (prot_id_pos) + " is outside of the list of identifiers: " + strQuote (id_whole)); 308 | id = vec [prot_id_pos - 1]; 309 | } 310 | else 311 | id = id_whole; 312 | 313 | for (SymbolRaw& symbolRaw : symbolRaws) 314 | if (symbolRaw. prot == id) 315 | symbolRaw. ref = pep. seq. substr (symbolRaw. qstart, symbolRaw. qend - symbolRaw. qstart); 316 | } 317 | } 318 | 319 | // symbolRaw's 320 | for (const SymbolRaw& symbolRaw : symbolRaws) 321 | symbolRaw. saveText (cout); 322 | } 323 | }; 324 | 325 | 326 | } // namespace 327 | 328 | 329 | 330 | 331 | int main (int argc, 332 | const char* argv[]) 333 | { 334 | ThisApplication app; 335 | return app. run (argc, argv); 336 | } 337 | 338 | 339 | 340 | -------------------------------------------------------------------------------- /tsv.hpp: -------------------------------------------------------------------------------- 1 | // tsv.hpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 
13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * TSV-table 31 | * 32 | */ 33 | 34 | 35 | #ifndef TSV_HPP 36 | #define TSV_HPP 37 | 38 | 39 | #include "common.hpp" 40 | using namespace Common_sp; 41 | 42 | 43 | 44 | namespace Common_sp 45 | { 46 | 47 | 48 | 49 | struct Date : Root 50 | { 51 | enum Format {fmt_Year, fmt_YMD, fmt_None}; // not complete list ?? 52 | short year {0}; 53 | char month {0}; 54 | // 0 .. 12 - 1 55 | char day {0}; 56 | // 0 .. days[month] - 1 // leap year ?? 57 | 58 | 59 | Date () = default; 60 | explicit Date (short year_arg, 61 | char month_arg = 0, 62 | char day_arg = 0) 63 | : year (year_arg) 64 | , month (month_arg) 65 | , day (day_arg) 66 | {} 67 | static bool isYear (short n) 68 | { return n > 1000 && n < 2500; } // PAR 69 | static bool isMonth (short n) 70 | { return between (n, 0, 12); } 71 | static bool isDay (short n) 72 | { return between (n, 0, 31); } // Must depend on month ?? 73 | static Date parse (const string &s, 74 | Format fmt); 75 | // Return: !empty() <=> success 76 | bool empty () const final 77 | { return ! year 78 | && ! month 79 | && ! 
day; 80 | } 81 | void saveText (ostream &os) const final 82 | { os << std::setfill('0') << std::setw(4) << year << '-' 83 | << std::setfill('0') << std::setw(2) << (int) month + 1 << '-' 84 | << std::setfill('0') << std::setw(2) << (int) day + 1; 85 | } 86 | JsonMap* toJson (JsonContainer* parent, 87 | const string& name = noString) const override 88 | { auto j = new JsonMap (parent, name); 89 | new JsonInt (year, j, "year"); 90 | new JsonInt (month, j, "month"); 91 | new JsonInt (day, j, "day"); 92 | return j; 93 | } 94 | 95 | 96 | bool operator== (const Date &other) const 97 | { return year == other. year 98 | && month == other. month 99 | && day == other. day; 100 | } 101 | bool less (const Date &other, 102 | bool equal) const; 103 | bool operator<= (const Date &other) const 104 | { return less (other, true); } 105 | bool operator< (const Date &other) const 106 | { return less (other, false); } 107 | Date operator- (const Date &other) const; 108 | // Requires: other <= *this 109 | bool year_divisible () const 110 | { return ! month && ! day; } 111 | bool quarter_divisible () const 112 | { return ! (month % 3) && ! day; } 113 | bool month_divisible () const 114 | { return ! 
day; } 115 | }; 116 | 117 | 118 | 119 | struct TextTable : Named 120 | // Tab-separated value (tsv) table with a header 121 | // name: file name or empty() 122 | { 123 | bool pound {false}; 124 | // '#' in the beginning of header 125 | bool saveHeader {true}; 126 | 127 | 128 | struct Header : Named 129 | { 130 | size_t len_max {0}; 131 | // For trim()'ed fields 132 | // Type 133 | bool numeric {true}; 134 | // Valid if numeric 135 | bool scientific {false}; 136 | streamsize decimals {0}; 137 | bool null {false}; 138 | // = can be empty() 139 | static constexpr size_t choices_max {7}; // PAR 140 | Set choices; 141 | // size() <= choices_max + 1 142 | 143 | Header () = default; 144 | explicit Header (const string &name_arg) 145 | : Named (name_arg) 146 | {} 147 | void qc () const override; 148 | void saveText (ostream& os) const override 149 | { os << name 150 | << '\t' << len_max 151 | << '\t' << (numeric ? ((scientific ? "float" : "int") + string ("(") + to_string (decimals) + ")") : "char") 152 | << '\t' << (null ? "null" : "not null"); 153 | } 154 | 155 | void saveSql (ostream& os) const; 156 | }; 157 | Vector

header; 158 | // Header::name's are unique 159 | // size() = number of columns 160 | 161 | 162 | Vector rows; 163 | // StringVector::size() = header.size() 164 | // Values are trim()'ed 165 | typedef size_t ColNum; 166 | // no_index <=> no column 167 | typedef size_t RowNum; 168 | // no_index <=> no row 169 | static constexpr char aggr_sep {','}; // PAR 170 | 171 | 172 | struct Error : runtime_error 173 | { 174 | Error (const TextTable &tab, 175 | const string &what) 176 | : runtime_error (what + "\nIn table file: " + tab. name) 177 | {} 178 | }; 179 | 180 | 181 | explicit TextTable (const string &tableFName, 182 | const string &columnSynonymsFName = noString, 183 | bool headerP = true, 184 | uint displayPeriod = 0); 185 | // Input: tableFName: format: [{'#' }* '#']

{ >}* 186 | // empty lines are skipped 187 | // columnSynonymsFName: 188 | // Rows where number of columns < header size are added empty values 189 | static constexpr const char* syn_format {"Column synonyms file with the format: {

{ }* {|}}*"}; 190 | TextTable () = default; 191 | TextTable (bool pound_arg, 192 | const Vector

&header_arg) 193 | : pound (pound_arg) 194 | , header (header_arg) 195 | {} 196 | private: 197 | void setHeader (); 198 | public: 199 | static Vector

str2header (const string &s, 200 | char sep = ',') 201 | { const StringVector vec (s, sep, true); 202 | Vector