├── version.txt ├── .gitmodules ├── .gitignore ├── CITATION.md ├── test_database_update.sh ├── .github └── workflows │ ├── dockerhub.yml │ ├── trigger_action.sh │ ├── get_binary_release.sh │ ├── binary.yml │ ├── ccpp.yml │ ├── mac_ccpp.yml │ ├── conda.yml │ └── mac_conda.yml ├── test_prot.gff ├── LICENSE ├── test_disrupt.expected ├── README.md ├── curl_easy.hpp ├── test_prot.expected ├── columns.hpp ├── fasta2parts.cpp ├── test_amrfinder.sh ├── mutate.cpp ├── amrfinder_index.cpp ├── curl_easy.cpp ├── test_dna.expected ├── common.inc ├── test_prot.fa ├── gff.hpp ├── test_both.expected ├── fasta_extract.cpp ├── Makefile ├── fasta_check.cpp ├── alignment.hpp ├── gff_check.cpp ├── disruption2genesymbol.cpp ├── tsv.hpp ├── amrfinder_update.cpp ├── dna_mutation.cpp ├── test_dna_mut_all.expected └── gff.cpp /version.txt: -------------------------------------------------------------------------------- 1 | 4.2.5 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "stxtyper"] 2 | path = stx 3 | url = https://github.com/ncbi/stxtyper.git 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | *.o 3 | amr_report 4 | amrfinder 5 | amrfinder_index 6 | amrfinder_update 7 | amrfinder_customize 8 | disruption2genesymbol 9 | dna_mutation 10 | fasta_check 11 | fasta_extract 12 | fasta2parts 13 | gff_check 14 | mutate 15 | *.got 16 | -------------------------------------------------------------------------------- /CITATION.md: -------------------------------------------------------------------------------- 1 | Please see our [wiki instructions on citing AMRFinderPlus](https://github.com/ncbi/amr/wiki#citation) for more information on how to cite AMRFinderPlus. 
Importantly we ask that you include both the software version and database version in your methods so people can reproduce your results. 2 | -------------------------------------------------------------------------------- /test_database_update.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Rebuilds AMRFinderPlus with TEST_UPDATE=1 (test-database update URL), runs the update, then runs test_amrfinder.sh 3 | echo "test_database_update.sh - test update from staging area" 4 | git status -uno 5 | echo "" 6 | echo "Attempts to update from ftp://ftp.ncbi.nlm.nih.gov/pathogen/Technical/AMRFinder_technical/test_database/" 7 | echo "WARNING: recompiles AMRFinderPlus to use a different update URL" 8 | echo "To continue press <enter>, to abort ^C" 9 | read -r _unused # POSIX read requires a variable name; shells such as dash reject a bare "read" and would not wait 10 | 11 | set -x 12 | 13 | touch amrfinder_update.cpp 14 | make TEST_UPDATE=1 15 | ./amrfinder -U 16 | ./test_amrfinder.sh -n 17 | #touch amrfinder_update.cpp 18 | #make 19 | 20 | -------------------------------------------------------------------------------- /.github/workflows/dockerhub.yml: -------------------------------------------------------------------------------- 1 | name: dockerhub image 2 | 3 | on: 4 | schedule: 5 | - cron: '15 21 * * *' # 9:15pm everyday 6 | workflow_dispatch: 7 | repository_dispatch: 8 | types: [docker-test, install-test] 9 | 10 | jobs: 11 | 12 | test_dockerhub: 13 | runs-on: ubuntu-latest 14 | timeout-minutes: 30 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Test docker 18 | run: docker run hello-world 19 | - name: Software and DB version 20 | run: docker run ncbi/amr amrfinder --database_version 21 | - name: image ls for debugging 22 | run: docker image ls 23 | - name: Protein 24 | run: docker run --rm -v ${PWD}:/data ncbi/amr amrfinder --plus -n test_dna.fa -O Escherichia --mutation_all test_dna_mut_all.got --print_node > test_dna.got 25 | - name: Check results 26 | run: diff test_dna.expected test_dna.got 27 | - name: Combined 28 | run: docker run --rm -v ${PWD}:/data ncbi/amr amrfinder --plus -n test_dna.fa -p test_prot.fa -g test_prot.gff -O
Escherichia --print_node > test_both.got 29 | - name: Check combined results 30 | run: diff test_both.expected test_both.got 31 | -------------------------------------------------------------------------------- /test_prot.gff: -------------------------------------------------------------------------------- 1 | ##gff-version 3 2 | ##sequence-region contig1 1-50000 3 | contig01 . gene 101 961 . + . ID=gene1;Name=blaTEM-156 4 | contigX . gene 1 501 . + . ID=gene2;Name=nimIJ_hmm 5 | contig02 . gene 1 1191 . + . ID=gene3;Name=blaPDC-114_blast 6 | contig03 . gene 101 802 . + . ID=gene4;Name=blaOXA-436_partial 7 | contig04 . gene 101 1147 . + . ID=gene5;Name=vanG 8 | contig06 . gene 31 2616 . + . ID=gene6;Name=gyrA 9 | contig07 . gene 101 526 . + . ID=gene7;Name=50S_L22 10 | contig09 . gene 1 675 . - . Name=aph3pp-Ib_partial_5p_neg 11 | contig09 . gene 715 1377 . - . Name=sul2_partial_3p_neg 12 | contig11 . gene 113 547 . + . Name=blaTEM-internal_stop 13 | contig12 . gene 71 637 . + . Name=qacR-curated_blast 14 | contig13 . gene 1 1137 . + . Name=emrD3-suppressed-in-vibrio 15 | contig13 . gene 1141 1491 . + . Name=arsR-suppressed-in-escherichia 16 | contig14 . gene 1093 2181 . + . Name=pmrB_C84R 17 | contig16 . gene 1 423 . + . Name=nfsA_R15C_K141STOP 18 | contig18 . gene 279 1238 . + . Name=stxA2a_prot 19 | contig18 Protein Homology CDS 279 1238 . + 0 Name=stxA2a_prot 20 | contig18 . gene 1250 1519 . + . Name=stxB2a_prot 21 | contig18 Protein Homology CDS 1250 1519 . 
+ 0 Name=stxB2a_prot 22 | -------------------------------------------------------------------------------- /.github/workflows/trigger_action.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cat <&2 echo "Downloading AMRFinderPlus version $release" 20 | >&2 echo "Binaries from $URL" 21 | 22 | # download and unpack AMRFinder binaries 23 | curl --silent -L -O $URL 24 | tarball_name=$(echo $URL | perl -pe 's#^.*/(.*)#\1#') 25 | tar xfz $tarball_name 26 | rm $tarball_name 27 | 28 | # download and unpack test 29 | curl --silent \ 30 | -O https://raw.githubusercontent.com/ncbi/amr/master/test_dna.fa \ 31 | -O https://raw.githubusercontent.com/ncbi/amr/master/test_prot.fa \ 32 | -O https://raw.githubusercontent.com/ncbi/amr/master/test_prot.gff \ 33 | -O https://raw.githubusercontent.com/ncbi/amr/master/test_both.expected \ 34 | -O https://raw.githubusercontent.com/ncbi/amr/master/test_dna.expected \ 35 | -O https://raw.githubusercontent.com/ncbi/amr/master/test_prot.expected 36 | 37 | # download database 38 | ./amrfinder --update 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | PUBLIC DOMAIN NOTICE 3 | National Center for Biotechnology Information 4 | 5 | This software/database is a "United States Government Work" under the 6 | terms of the United States Copyright Act. It was written as part of 7 | the author's official duties as a United States Government employee and 8 | thus cannot be copyrighted. This software/database is freely available 9 | to the public for use. The National Library of Medicine and the U.S. 10 | Government have not placed any restriction on its use or reproduction. 11 | 12 | Although all reasonable efforts have been taken to ensure the accuracy 13 | and reliability of the software and data, the NLM and the U.S. 
14 | Government do not and cannot warrant the performance or results that 15 | may be obtained by using this software or data. The NLM and the U.S. 16 | Government disclaim all warranties, express or implied, including 17 | warranties of performance, merchantability or fitness for any particular 18 | purpose. 19 | 20 | Please cite Feldgarden, Michael, Vyacheslav Brover, Daniel H. Haft, Arjun B. 21 | Prasad, Douglas J. Slotta, Igor Tolstoy, Gregory H. Tyson et al. "Validating 22 | the AMRFinder tool and resistance gene database by using antimicrobial 23 | resistance genotype-phenotype correlations in a collection of isolates." 24 | Antimicrobial agents and chemotherapy 63, no. 11 (2019): e00483-19. 25 | https://pubmed.gov/31427293 in any work or product based on this material. 26 | 27 | -------------------------------------------------------------------------------- /.github/workflows/binary.yml: -------------------------------------------------------------------------------- 1 | name: binary tarball 2 | 3 | on: 4 | workflow_dispatch: 5 | release: 6 | branches: [ master ] 7 | repository_dispatch: 8 | types: [linux-binary-test, install-test] 9 | schedule: 10 | - cron: '15 15 * * *' # 3:15pm every day 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | timeout-minutes: 30 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: prerequisites 18 | run: | 19 | sudo apt-get update 20 | sudo apt-get install -y hmmer ncbi-blast+ curl build-essential 21 | - name: download 22 | run: bash -x .github/workflows/get_binary_release.sh 23 | - name: Software and db version 24 | run: ./amrfinder --database_version 25 | - name: run tests included with distribution 26 | run: | 27 | ./amrfinder --plus -p test_prot.fa -g test_prot.gff -O Escherichia --print_node > test_prot.got 28 | diff test_prot.expected test_prot.got 29 | ./amrfinder --plus -n test_dna.fa -O Escherichia --mutation_all test_dna_mut_all.got --print_node > test_dna.got 30 | diff test_dna.expected test_dna.got 31 | 
./amrfinder --plus -n test_dna.fa -p test_prot.fa -g test_prot.gff -O Escherichia --print_node > test_both.got 32 | diff test_both.expected test_both.got 33 | - name: Run tests 34 | run: | 35 | # temporarily we need to download the test script. Can remove 36 | # once we have a new software release and it's included in the 37 | # distribution 38 | BASE_URL="https://raw.githubusercontent.com/${GITHUB_REPOSITORY}/master" 39 | echo "BASE_URL=${BASE_URL}" 40 | curl --silent -L -O ${BASE_URL}/test_amrfinder.sh 41 | bash -x ./test_amrfinder.sh 42 | -------------------------------------------------------------------------------- /test_disrupt.expected: -------------------------------------------------------------------------------- 1 | Protein id Contig id Start Stop Strand Element symbol Element name Scope Type Subtype Class Subclass Method Target length Reference sequence length % Coverage of reference % Identity to reference Alignment length Closest reference accession Closest reference name HMM accession HMM description Hierarchy node 2 | NA cirA_A169insTer10 47 5134 + cirA_A169insTer10 Klebsiella pneumoniae cefiderocol resistant CirA core AMR POINT_DISRUPT BETA-LACTAM CEFIDEROCOL POINTX 1696 657 100.00 99.85 657 WP_002912926.1 catecholate siderophore receptor CirA NA NA cirA 3 | NA cirA_K633LfsTer8 13 3644 + cirA_K633LfsTer8 Klebsiella pneumoniae cefiderocol resistant CirA core AMR POINT_DISRUPT BETA-LACTAM CEFIDEROCOL POINTX 1210 657 100.00 100.00 657 WP_002912926.1 catecholate siderophore receptor CirA NA NA cirA 4 | NA cirA_N184NRHSEWTer 69 6059 + cirA_N184NRHSEWTer Klebsiella pneumoniae cefiderocol resistant CirA core AMR POINT_DISRUPT BETA-LACTAM CEFIDEROCOL POINTX 1997 657 100.00 100.00 657 WP_002912926.1 catecholate siderophore receptor CirA NA NA cirA 5 | NA cirA_N184NRHSEWTer 2058 2930 - blaCTX-M-15 extended-spectrum class A beta-lactamase CTX-M-15 core AMR AMR BETA-LACTAM CEPHALOSPORIN ALLELEX 291 291 100.00 100.00 291 WP_000239590.1 extended-spectrum class 
A beta-lactamase CTX-M-15 NA NA blaCTX-M-15 6 | NA cirA_Q562Ter 164 2134 - cirA_Q562Ter Klebsiella pneumoniae cefiderocol resistant CirA core AMR POINT_DISRUPT BETA-LACTAM CEFIDEROCOL POINTX 657 657 100.00 99.85 657 WP_002912926.1 catecholate siderophore receptor CirA NA NA cirA 7 | NA cirA_T98Ter 14 1973 + cirA_T98Ter Klebsiella pneumoniae cefiderocol resistant CirA core AMR POINT_DISRUPT BETA-LACTAM CEFIDEROCOL POINTX 653 657 100.00 95.59 658 WP_002912926.1 catecholate siderophore receptor CirA NA NA cirA 8 | NA cirA_Y253CfsTer5 60 833 + cirA_Y253CfsTer5 Klebsiella pneumoniae cefiderocol resistant CirA core AMR POINT BETA-LACTAM CEFIDEROCOL POINTX 258 258 100.00 100.00 258 WP_002912926.1 catecholate siderophore receptor CirA NA NA cirA 9 | NA ompK35_R60LfsTer31 18 1101 + ompK35_R60LfsTer31 Klebsiella pneumoniae carbapenem resistant OmpK35 core AMR POINT_DISRUPT BETA-LACTAM CARBAPENEM POINTX 361 359 100.00 100.00 359 WP_004141771.1 outer membrane porin OmpK35 NA NA ompK35 10 | -------------------------------------------------------------------------------- /.github/workflows/ccpp.yml: -------------------------------------------------------------------------------- 1 | name: C++ CI 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | pull_request: 7 | repository_dispatch: 8 | types: [linux-compile-test, install-test] 9 | jobs: 10 | linux_x86: 11 | runs-on: ubuntu-latest 12 | timeout-minutes: 30 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: prerequisites 16 | run: | 17 | sudo apt-get update 18 | sudo apt-get install -y hmmer ncbi-blast+ git libcurl4-openssl-dev build-essential curl 19 | - name: submodule checkout 20 | run: git submodule update --init --recursive 21 | - name: make 22 | run: cat version.txt; make -j -O 23 | - name: download db 24 | run: ./amrfinder -u 25 | - name: Software and DB version 26 | run: ./amrfinder --database_version 27 | - name: make test 28 | run: make test 29 | - name: test for no-overwrite database update (PD-3469 / 
https://github.com/ncbi/amr/issues/16) 30 | run: ./amrfinder -u 2>&1 | fgrep 'Skipping update' 31 | - name: make github_binaries 32 | run: make github_binaries 33 | - uses: actions/upload-artifact@v4 34 | with: 35 | name: release-binary 36 | path: amrfinder_binaries_v*.tar.gz 37 | linux-arm64: 38 | runs-on: ubuntu-24.04-arm 39 | timeout-minutes: 30 40 | steps: 41 | - uses: actions/checkout@v4 42 | - name: prerequisites 43 | run: | 44 | sudo apt-get update 45 | sudo apt-get install -y hmmer ncbi-blast+ git libcurl4-openssl-dev build-essential curl 46 | - name: submodule checkout 47 | run: git submodule update --init --recursive 48 | - name: make 49 | run: cat version.txt; make -j -O 50 | - name: download db 51 | run: ./amrfinder -u 52 | - name: Software and DB version 53 | run: ./amrfinder --database_version 54 | - name: make test 55 | run: make test 56 | - name: test for no-overwrite database update (PD-3469 / https://github.com/ncbi/amr/issues/16) 57 | run: ./amrfinder -u 2>&1 | fgrep 'Skipping update' 58 | - name: make github_binaries 59 | run: | 60 | make github_binaries 61 | version=`cat version.txt` 62 | mv amrfinder_binaries_v$version.tar.gz amrfinder_binaries_linux_aarch64_v$version.tar.gz 63 | - uses: actions/upload-artifact@v4 64 | with: 65 | name: release-binary-arm 66 | path: amrfinder_binaries_linux_aarch64_v*.tar.gz 67 | 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NCBI Antimicrobial Resistance Gene Finder (AMRFinderPlus) 2 | 3 | This software and the accompanying database are designed to find acquired antimicrobial resistance genes and point mutations in protein and/or assembled nucleotide sequences. We have also added "plus" stress, heat, and biocide resistance and virulence factors for [some organisms](https://github.com/ncbi/amr/wiki/Curated-organisms).
4 | 5 | ## See [the wiki for documentation](https://github.com/ncbi/amr/wiki) 6 | ## [Citing AMRFinderPlus](https://github.com/ncbi/amr/wiki#how-to-cite) 7 | ## Please [subscribe to our announce list](https://www.ncbi.nlm.nih.gov/mailman/listinfo/amrfinder-announce) for announcements of database and software updates. 8 | 9 | ---- 10 | # Licenses 11 | 12 | ## PUBLIC DOMAIN NOTICE 13 | 14 | ### National Center for Biotechnology Information 15 | 16 | This software/database is a "United States Government Work" under the 17 | terms of the United States Copyright Act. It was written as part of 18 | the authors' official duties as a United States Government employee and 19 | thus cannot be copyrighted. This software/database is freely available 20 | to the public for use. The National Library of Medicine and the U.S. 21 | Government have not placed any restriction on its use or reproduction. 22 | 23 | Although all reasonable efforts have been taken to ensure the accuracy 24 | and reliability of the software and data, the NLM and the U.S. 25 | Government do not and cannot warrant the performance or results that 26 | may be obtained by using this software or data. The NLM and the U.S. 27 | Government disclaim all warranties, express or implied, including 28 | warranties of performance, merchantability or fitness for any particular 29 | purpose. 30 | 31 | In any work or product derived from this material, proper attribution of the 32 | authors as the source of the software or data should be made, using the 33 | following citation: 34 | 35 | Feldgarden M, Brover V, Gonzalez-Escalona N, Frye JG, Haendiges J, Haft DH, 36 | Hoffmann M, Pettengill JB, Prasad AB, Tillman GE, Tyson GH, Klimke W. 37 | AMRFinderPlus and the Reference Gene Catalog facilitate examination of the 38 | genomic links among antimicrobial resistance, stress response, and virulence. 39 | Sci Rep. 2021 Jun 16;11(1):12728. doi: [10.1038/s41598-021-91456-0](https://doi.org/10.1038/s41598-021-91456-0). 
PMID: [34135355](https://pubmed.ncbi.nlm.nih.gov/34135355/); PMCID: [PMC8208984](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8208984/). 40 | 41 | 42 | -------------------------------------------------------------------------------- /.github/workflows/mac_ccpp.yml: -------------------------------------------------------------------------------- 1 | name: MacOS C++ CI 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | pull_request: 7 | repository_dispatch: 8 | types: [mac-compile-test, install-test] 9 | jobs: 10 | macos_arm: 11 | runs-on: macos-latest 12 | timeout-minutes: 30 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: submodule checkout 16 | run: git submodule update --init --recursive 17 | - name: prerequisites 18 | run: | 19 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)" 20 | brew install blast 21 | brew install hmmer 22 | - name: make 23 | run: make -j 24 | - name: download db 25 | run: ./amrfinder -u 26 | - name: Software and DB versions 27 | run: | 28 | cat version.txt 29 | ./amrfinder --database_version 30 | - name: make test 31 | run: make test 32 | - name: test for no-overwrite database update (PD-3469 / https://github.com/ncbi/amr/issues/16) 33 | run: ./amrfinder -u 2>&1 | fgrep 'Skipping update' 34 | - name: make github_binaries 35 | run: | 36 | make github_binaries 37 | version=`cat version.txt` 38 | mv amrfinder_binaries_v$version.tar.gz amrfinder_binaries_macos_aarch64_v$version.tar.gz # keep the .tar.gz suffix so the upload path glob matches 39 | - uses: actions/upload-artifact@v4 40 | with: 41 | name: release-binary-macos-aarch64 # distinct per job: upload-artifact@v4 refuses a second upload under an existing name 42 | path: amrfinder_binaries_macos_aarch64_v*.tar.gz 43 | macos_x86_64: 44 | runs-on: macos-15-intel 45 | timeout-minutes: 30 46 | steps: 47 | - uses: actions/checkout@v4 48 | - name: submodule checkout 49 | run: git submodule update --init --recursive 50 | - name: prerequisites 51 | run: | 52 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)" 53 | brew install blast 54 | brew install
hmmer 55 | - name: make 56 | run: make -j 57 | - name: download db 58 | run: ./amrfinder -u 59 | - name: Software and DB versions 60 | run: | 61 | cat version.txt 62 | ./amrfinder --database_version 63 | - name: make test 64 | run: make test 65 | - name: test for no-overwrite database update (PD-3469 / https://github.com/ncbi/amr/issues/16) 66 | run: ./amrfinder -u 2>&1 | fgrep 'Skipping update' 67 | - name: make github_binaries 68 | run: | 69 | make github_binaries 70 | version=`cat version.txt` 71 | mv amrfinder_binaries_v$version.tar.gz amrfinder_binaries_macos_x86_64_v$version.tar.gz # keep the .tar.gz suffix so the upload path glob matches 72 | - uses: actions/upload-artifact@v4 73 | with: 74 | name: release-binary-macos-x86_64 # distinct per job: upload-artifact@v4 refuses a second upload under an existing name 75 | path: amrfinder_binaries_macos_x86_64_v*.tar.gz 76 | -------------------------------------------------------------------------------- /.github/workflows/conda.yml: -------------------------------------------------------------------------------- 1 | name: Linux bioconda 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: '15 3 * * *' # 3:15am everyday 7 | repository_dispatch: 8 | types: [linux-bioconda-test, install-test] 9 | jobs: 10 | conda_linux_x86_64: 11 | runs-on: ubuntu-latest 12 | timeout-minutes: 30 13 | steps: 14 | - name: When was this run 15 | run: date 16 | - name: configure conda 17 | run: | 18 | . $CONDA/bin/activate 19 | conda config --add channels defaults 20 | conda config --add channels bioconda 21 | conda config --add channels conda-forge 22 | - name: install AMRFinderPlus 23 | run: | 24 | .
$CONDA/bin/activate 25 | conda install --update-deps -c conda-forge -c bioconda -y ncbi-amrfinderplus 26 | amrfinder --version 27 | - name: download latest AMRFinderPlus database 28 | run: | 29 | source /usr/share/miniconda/bin/activate 30 | echo CONDA_PREFIX = ${CONDA_PREFIX} 31 | /usr/share/miniconda/bin/amrfinder --force_update 32 | - name: Software and DB version 33 | run: | 34 | source /usr/share/miniconda/bin/activate 35 | amrfinder --database_version 36 | - name: download tests 37 | run: | 38 | BASE_URL=https://raw.githubusercontent.com/${GITHUB_REPOSITORY}/master 39 | curl --silent -L -O ${BASE_URL}/test_amrfinder.sh 40 | - name: run tests 41 | run: | 42 | source /usr/share/miniconda/bin/activate 43 | echo CONDA_PREFIX = $CONDA_PREFIX 44 | bash ./test_amrfinder.sh -p 45 | conda_linux_aarch64: 46 | runs-on: ubuntu-24.04-arm 47 | timeout-minutes: 30 48 | steps: 49 | - uses: conda-incubator/setup-miniconda@v3 50 | with: 51 | auto-update-conda: true 52 | - name: When was this run 53 | run: date 54 | - name: configure conda 55 | shell: bash -el {0} 56 | run: | 57 | conda config --add channels defaults 58 | conda config --add channels bioconda 59 | conda config --add channels conda-forge 60 | - name: install AMRFinderPlus 61 | shell: bash -el {0} 62 | run: | 63 | conda install --update-deps -c conda-forge -c bioconda -y ncbi-amrfinderplus 64 | amrfinder --version 65 | - name: download latest AMRFinderPlus database 66 | shell: bash -el {0} 67 | run: amrfinder --force_update 68 | - name: Software and DB version 69 | shell: bash -el {0} 70 | run: amrfinder --database_version 71 | - name: download tests 72 | run: | 73 | BASE_URL=https://raw.githubusercontent.com/${GITHUB_REPOSITORY}/master 74 | curl --silent -L -O ${BASE_URL}/test_amrfinder.sh 75 | - name: run tests 76 | shell: bash -el {0} 77 | run: bash ./test_amrfinder.sh -p 78 | -------------------------------------------------------------------------------- /curl_easy.hpp: 
-------------------------------------------------------------------------------- 1 | // curl_easy.hpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 
24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * Dependencies: curl.{h,c} 30 | * 31 | * File Description: 32 | * curl_easy functions 33 | * 34 | */ 35 | 36 | 37 | #ifdef _MSC_VER 38 | #error "UNIX is required" 39 | #endif 40 | 41 | 42 | #include 43 | extern "C" 44 | { 45 | #include 46 | // Linking requires: -lcurl 47 | } 48 | 49 | #include "common.hpp" 50 | using namespace Common_sp; 51 | 52 | 53 | 54 | 55 | namespace CURL_sp 56 | { 57 | 58 | 59 | SoftwareVersion getLibVersion (); 60 | 61 | 62 | 63 | struct Curl 64 | { 65 | CURL* eh {nullptr}; 66 | 67 | 68 | Curl () 69 | : eh (curl_easy_init ()) 70 | { if (! eh) 71 | throw runtime_error ("Cannot initialize curl_easy"); 72 | // Override the libcurl system-wide default 73 | // PD-5495 / https://github.com/ncbi/amr/issues/170 74 | if (const char *env_ca_bundle = getenv ("CURL_CA_BUNDLE")) 75 | curl_easy_setopt (eh, CURLOPT_CAINFO, env_ca_bundle); 76 | } 77 | ~Curl () 78 | { curl_easy_cleanup (eh); } 79 | 80 | 81 | void download (const string &url, 82 | const string &fName); 83 | string read (const string &url); 84 | private: 85 | void process (const string &url, 86 | const string &error_msg_action); 87 | }; 88 | 89 | 90 | 91 | 92 | } // namespace 93 | 94 | 95 | -------------------------------------------------------------------------------- /test_prot.expected: -------------------------------------------------------------------------------- 1 | Protein id Contig id Start Stop Strand Element symbol Element name Scope Type Subtype Class Subclass Method Target length Reference sequence length % Coverage of reference % Identity to reference Alignment length Closest reference accession Closest reference name HMM accession HMM description Hierarchy node 2 | blaTEM-156 contig01 101 961 + blaTEM-156 class A beta-lactamase TEM-156 core AMR AMR BETA-LACTAM BETA-LACTAM ALLELEP 286 286 100.00 100.00 286 WP_061158039.1 class A 
beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase blaTEM-156 3 | blaPDC-114_blast contig02 1 1191 + blaPDC PDC family class C beta-lactamase core AMR AMR BETA-LACTAM CEPHALOSPORIN BLASTP 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NF000422.6 PDC family class C beta-lactamase blaPDC 4 | blaOXA-436_partial contig03 101 802 + blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF012161.0 class D beta-lactamase blaOXA-48_fam 5 | vanG contig04 101 1147 + vanG D-alanine--D-serine ligase VanG core AMR AMR GLYCOPEPTIDE VANCOMYCIN EXACTP 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG vanG 6 | aph3pp-Ib_partial_5p_neg contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDP 225 267 81.27 100.00 217 WP_001082319.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase aph(3'')-Ib 7 | sul2_partial_3p_neg contig09 715 1377 - sul2 sulfonamide-resistant dihydropteroate synthase Sul2 core AMR AMR SULFONAMIDE SULFONAMIDE PARTIALP 221 271 81.55 100.00 221 WP_001043265.1 sulfonamide-resistant dihydropteroate synthase Sul2 NA NA sul2 8 | blaTEM-internal_stop contig11 113 547 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 144 286 50.35 97.22 144 WP_000027057.1 broad-spectrum class A beta-lactamase TEM-1 NA NA blaTEM 9 | qacR-curated_blast contig12 71 637 + qacR multidrug-binding transcriptional regulator QacR plus STRESS BIOCIDE QUATERNARY AMMONIUM QUATERNARY AMMONIUM BLASTP 188 188 100.00 99.47 188 ADK23698.1 multidrug-binding transcriptional regulator QacR NA NA qacR 10 | emrD3-suppressed-in-vibrio contig13 1 1137 + emrD3 multidrug efflux MFS 
transporter EmrD-3 plus AMR AMR EFFLUX EFFLUX EXACTP 379 379 100.00 100.00 379 ABQ18953.1 multidrug efflux MFS transporter EmrD-3 NA NA emrD3 11 | pmrB_C84R contig14 1093 2181 + pmrB_C84R Escherichia colistin resistant PmrB core AMR POINT COLISTIN COLISTIN POINTP 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA pmrB 12 | nfsA_R15C_K141STOP contig16 1 423 + nfsA_R15C Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN POINTP 140 240 58.33 99.29 140 WP_089631889.1 nitroreductase NfsA NA NA nfsA 13 | stxA2a_prot contig18 279 1238 + stxA2 Shiga toxin Stx2 subunit A plus VIRULENCE VIRULENCE STX2 stxA2 EXACTP 319 319 100.00 100.00 319 TJA36680.1 Shiga toxin Stx2 subunit A NF041702.1 Shiga toxin Stx2 subunit A stxA2_acd 14 | stxB2a_prot contig18 1250 1519 + stxB2 Shiga toxin Stx2a subunit B plus VIRULENCE VIRULENCE STX2 stxB2a EXACTP 89 89 100.00 100.00 89 AAM90978.1 Shiga toxin Stx2a subunit B NF033660.0 Shiga toxin Stx2 subunit B stxB2a 15 | nimIJ_hmm contigX 1 501 + nimIJ NimIJ family 5-nitroimidazole reductase core AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 165 98.18 76.54 162 WP_005812825.1 NimIJ family 5-nitroimidazole reductase NF000262.1 NimIJ family 5-nitroimidazole reductase nimIJ 16 | -------------------------------------------------------------------------------- /columns.hpp: -------------------------------------------------------------------------------- 1 | // columns.hpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. 
The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * AMRFinderPlus column names 31 | * 32 | */ 33 | 34 | 35 | 36 | // Columns: header names of the AMRFinderPlus report, declared in the same order as the header row of test_prot.expected 37 | // PD-5085 38 | constexpr const char* prot_colName = "Protein id"; // PD-2534 39 | constexpr const char* contig_colName = "Contig id"; 40 | // Target 41 | constexpr const char* start_colName = "Start"; 42 | constexpr const char* stop_colName = "Stop"; 43 | constexpr const char* strand_colName = "Strand"; 44 | // 45 | constexpr const char* genesymbol_colName = "Element symbol"; // PD-4924 46 | constexpr const char* elemName_colName = "Element name"; // PD-4910 47 | constexpr const char* scope_colName = "Scope"; // PD-2825 48 | // PD-1856 49 | constexpr const char* type_colName = "Type"; 50 | constexpr const char* subtype_colName = "Subtype"; 51 | constexpr const char* class_colName = "Class"; 52 | constexpr const char* subclass_colName = "Subclass"; 53 | // 54 | constexpr const char* method_colName = "Method"; 55 | constexpr const char* targetLen_colName = "Target length"; // was: "Element length" (temporarily) 56 | constexpr const char* refLen_colName = "Reference sequence length"; 57 | constexpr const char* refCov_colName = "% Coverage of reference";
58 | constexpr const char* refIdent_colName = "% Identity to reference"; 59 | constexpr const char* alignLen_colName = "Alignment length"; 60 | constexpr const char* closestRefAccession_colName = "Closest reference accession"; 61 | constexpr const char* closestRefName_colName = "Closest reference name"; 62 | constexpr const char* hmmAccession_colName = "HMM accession"; 63 | constexpr const char* hmmDescr_colName = "HMM description"; 64 | constexpr const char* hierarchyNode_colName = "Hierarchy node"; 65 | 66 | 67 | // PD-5155 68 | constexpr const char* fusion_infix = "::"; // was: "/" 69 | 70 | constexpr const char* na = "NA"; // presumably the placeholder for a field without a value (the expected reports show NA) - confirm at use sites 71 | 72 | constexpr const char* disruption_delim = "_@"; 73 | 74 | 75 | // Methods: presumably base names of Method-column values (the expected reports show suffixed forms such as PARTIALP) - confirm at use sites 76 | constexpr const char* frameshift_Name = "FRAMESHIFT"; 77 | constexpr const char* internalStop_Name = "INTERNAL_STOP"; 78 | constexpr const char* partial_Name = "PARTIAL"; 79 | constexpr const char* partialContigEnd_Name = "PARTIAL_CONTIG_END"; 80 | 81 | -------------------------------------------------------------------------------- /fasta2parts.cpp: -------------------------------------------------------------------------------- 1 | // fasta2parts.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S.
17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Split the sequences of a FASTA file into chunks without breaking sequences 31 | * 32 | */ 33 | 34 | 35 | #undef NDEBUG 36 | 37 | #include "common.hpp" 38 | using namespace Common_sp; 39 | 40 | #include "common.inc" 41 | 42 | 43 | 44 | namespace 45 | { 46 | 47 | 48 | 49 | struct ThisApplication final : Application 50 | { 51 | ThisApplication () 52 | : Application ("Split the sequences a FASTA file into parts without breaking sequences") 53 | { 54 | addPositional ("in", "FASTA file"); 55 | addPositional ("parts_max", "Max. number of parts (>= 2)"); 56 | addPositional ("dir", "Output directory where chunks are saved named by integers starting with 1"); 57 | version = SVN_REV; 58 | } 59 | 60 | 61 | 62 | void body () const final 63 | { 64 | const string fName = getArg ("in"); 65 | const size_t parts_max = str2 (getArg ("parts_max")); 66 | const string dirName = getArg ("dir"); 67 | 68 | if (parts_max <= 1) 69 | throw runtime_error ("Number of parts must be >= 2"); 70 | 71 | 72 | const size_t chunk_min = (size_t) getFileSize (fName) / parts_max + 1; 73 | 74 | size_t part = 0; 75 | unique_ptr out; 76 | size_t seqSize = 0; 77 | LineInput f (fName); 78 | while (f. nextLine ()) 79 | { 80 | trimTrailing (f. line); 81 | if (f. line. empty ()) 82 | continue; 83 | if ( f. line [0] == '>' 84 | && seqSize >= chunk_min 85 | && part < parts_max 86 | ) 87 | { 88 | out. reset (); 89 | seqSize = 0; 90 | } 91 | if (! out.
get ()) 92 | { 93 | part++; 94 | ASSERT (part <= parts_max); 95 | out. reset (new OFStream (dirName, toString (part), "")); 96 | } 97 | *out << f. line << endl; 98 | seqSize += f. line. size (); 99 | } 100 | } 101 | }; 102 | 103 | 104 | 105 | } // namespace 106 | 107 | 108 | 109 | int main (int argc, 110 | const char* argv[]) 111 | { 112 | ThisApplication app; 113 | return app. run (argc, argv); 114 | } 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /.github/workflows/mac_conda.yml: -------------------------------------------------------------------------------- 1 | name: Mac bioconda 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | - cron: '15 9 * * *' # 9:15am everyday 6 | repository_dispatch: 7 | types: [mac-bioconda-test, install-test] 8 | jobs: 9 | conda_macos_aarch64: 10 | runs-on: macos-latest 11 | timeout-minutes: 30 12 | steps: 13 | - name: Install conda because built-in conda is borked # arm64 installer, matching this job's aarch64 target 14 | run: | 15 | curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh 16 | bash ./Miniconda3-latest-MacOSX-arm64.sh -b -p /Users/runner/miniconda3 17 | - name: Configure conda 18 | run: | 19 | source /Users/runner/miniconda3/bin/activate 20 | conda init 21 | # THIS DOESN"T WORK! Just install miniconda myself 22 | # . $CONDA/bin/activate 23 | conda config --add channels defaults 24 | conda config --add channels bioconda 25 | conda config --add channels conda-forge 26 | # permissions are messed up on the mac runner 27 | # Is this faster than installing miniconda myself?
28 | # sudo chown -R 501:20 $CONDA 29 | conda update conda 30 | - name: Install AMRFinderPlus 31 | run: | 32 | source /Users/runner/miniconda3/bin/activate 33 | conda install --update-deps -c bioconda -c conda-forge -y ncbi-amrfinderplus 34 | - name: Download AMRFinderPlus database 35 | run: | 36 | source /Users/runner/miniconda3/bin/activate 37 | /Users/runner/miniconda3/bin/amrfinder -u 38 | - name: Software and DB version 39 | run: | 40 | source /Users/runner/miniconda3/bin/activate 41 | amrfinder --database_version 42 | - name: Download tests 43 | run: | 44 | BASE_URL=https://raw.githubusercontent.com/${GITHUB_REPOSITORY}/master 45 | curl --silent -L -O ${BASE_URL}/test_amrfinder.sh 46 | - name: Run tests 47 | run: | 48 | source /Users/runner/miniconda3/bin/activate 49 | echo CONDA_PREFIX = $CONDA_PREFIX 50 | bash ./test_amrfinder.sh -p 51 | conda_macos_x86_64: 52 | runs-on: macos-15-intel 53 | timeout-minutes: 30 54 | steps: 55 | - name: Install conda because built-in conda is borked 56 | run: | 57 | curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh 58 | bash ./Miniconda3-latest-MacOSX-x86_64.sh -b -p /Users/runner/miniconda3 59 | - name: Configure conda 60 | run: | 61 | source /Users/runner/miniconda3/bin/activate 62 | conda init 63 | # THIS DOESN"T WORK! Just install miniconda myself 64 | # . $CONDA/bin/activate 65 | conda config --add channels defaults 66 | conda config --add channels bioconda 67 | conda config --add channels conda-forge 68 | # permissions are messed up on the mac runner 69 | # Is this faster than installing miniconda myself?
70 | # sudo chown -R 501:20 $CONDA 71 | conda update conda 72 | - name: Install AMRFinderPlus 73 | run: | 74 | source /Users/runner/miniconda3/bin/activate 75 | conda install --update-deps -c bioconda -c conda-forge -y ncbi-amrfinderplus 76 | - name: Download AMRFinderPlus database 77 | run: | 78 | source /Users/runner/miniconda3/bin/activate 79 | /Users/runner/miniconda3/bin/amrfinder -u 80 | - name: Software and DB version 81 | run: | 82 | source /Users/runner/miniconda3/bin/activate 83 | amrfinder --database_version 84 | - name: Download tests 85 | run: | 86 | BASE_URL=https://raw.githubusercontent.com/${GITHUB_REPOSITORY}/master 87 | curl --silent -L -O ${BASE_URL}/test_amrfinder.sh 88 | - name: Run tests 89 | run: | 90 | source /Users/runner/miniconda3/bin/activate 91 | echo CONDA_PREFIX = $CONDA_PREFIX 92 | bash ./test_amrfinder.sh -p 93 | -------------------------------------------------------------------------------- /test_amrfinder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | AMRFINDER_OPTS=" --plus --print_node --threads 6 " 4 | path=0 5 | no_download=0 6 | print_help=0 7 | while getopts "pnh" opt; do 8 | case $opt in 9 | p) path=1 ;; 10 | n) no_download=1 ;; 11 | h) print_help=1 ;; 12 | esac 13 | done 14 | 15 | if [ "$print_help" -gt 0 ] 16 | then 17 | echo "test_amrfinder.sh - Run tests" 18 | echo "Options: " 19 | echo " -p Test the amrfinder command in path instead of ./amrfinder" 20 | echo " -n Don't attempt to download fresh test data, use the test data in $PWD" 21 | echo " -h print this help message" 22 | exit 1 23 | fi 24 | 25 | # some color macros 26 | if [ "$TERM" == "" ] || [ "$TERM" == "dumb" ] || [ ! 
-t 1 ] 27 | then 28 | green='' # no colors 29 | red='' 30 | reset='' 31 | else 32 | green=`tput setaf 2` # Set green foreground color (code 2) 33 | red=`tput setaf 1` # Set red foreground color (code 1) 34 | reset=`tput sgr0` # Reset color to default 35 | fi 36 | 37 | if [ "$path" -gt 0 ] 38 | then 39 | echo "Testing amrfinder command in your \$PATH" 40 | which amrfinder 41 | AMRFINDER=amrfinder 42 | else 43 | echo "Testing ./amrfinder" 44 | AMRFINDER=./amrfinder 45 | fi 46 | 47 | if [ "$no_download" -gt 0 ] 48 | then 49 | echo "-n option detected, skipping download of test data and using tests in" 50 | echo "$PWD. Note test data may not match the latest database release." 51 | else 52 | echo Downloading fresh test data... 53 | BASE_URL=https://raw.githubusercontent.com/ncbi/amr/master 54 | curl --silent --location \ 55 | -O ${BASE_URL}/test_dna.fa \ 56 | -O ${BASE_URL}/test_prot.fa \ 57 | -O ${BASE_URL}/test_prot.gff \ 58 | -O ${BASE_URL}/test_both.expected \ 59 | -O ${BASE_URL}/test_dna.expected \ 60 | -O ${BASE_URL}/test_dna_mut_all.expected \ 61 | -O ${BASE_URL}/test_prot.expected \ 62 | -O ${BASE_URL}/test_disrupt.fa \ 63 | -O ${BASE_URL}/test_disrupt.expected 64 | 65 | if [ $? != 0 ] 66 | then 67 | echo "${red}WARNING: Could not download new test data.${reset}" 68 | echo "Will attempt to use test data in $PWD." 69 | echo "Test data included with installation may not match the latest database release." 70 | fi 71 | fi 72 | 73 | TESTS=0 74 | TEST_TEXT="" 75 | FAILURES=0 76 | 77 | function test_input_file { 78 | local test_base="$1" 79 | local options="$2" 80 | 81 | TESTS=$(( $TESTS + 1 )) 82 | 83 | if ! $AMRFINDER $options $AMRFINDER_OPTS > "$test_base.got" 84 | then 85 | echo "${red}not ok: $AMRFINDER returned a non-zero exit value indicating a failure of the software${reset}" 86 | echo "# $AMRFINDER $options $AMRFINDER_OPTS > $test_base.got" 87 | return 1 88 | else 89 | if ! 
diff -q "$test_base.expected" "$test_base.got" 90 | then 91 | echo "${red}not ok: $AMRFINDER returned output different from expected.${reset}" 92 | echo "# diff $test_base.expected $test_base.got" 93 | echo "# To approve run: " 94 | echo "# mv $test_base.got $test_base.expected" 95 | TEST_TEXT="$TEST_TEXT"$'\n'"${red}Failed $test_base${reset}"; 96 | echo "" 97 | return 1 98 | else 99 | echo "${green}ok:${reset} $test_base" 100 | return 0 101 | fi 102 | fi 103 | } 104 | 105 | test_input_file "test_prot" "-p test_prot.fa -g test_prot.gff -O Escherichia" 106 | FAILURES=$(( $? + $FAILURES )) 107 | 108 | test_input_file "test_dna" "-n test_dna.fa -O Escherichia --mutation_all test_dna_mut_all.got" 109 | FAILURES=$(( $? + $FAILURES )) 110 | 111 | test_input_file "test_both" "-n test_dna.fa -g test_prot.gff -p test_prot.fa -O Escherichia" 112 | FAILURES=$(( $? + $FAILURES )) 113 | 114 | test_input_file "test_disrupt" "-n test_disrupt.fa -O Klebsiella_pneumoniae" 115 | FAILURES=$(( $? + $FAILURES )) 116 | 117 | # gzipped input 118 | # gzip -c test_prot.fa > test_prot.fa.gz 119 | # gzip -c test_dna.fa > test_dna.fa.gz 120 | # gzip -c test_prot.gff > test_prot.gff.gz 121 | # test_input_file "test_prot" "-n test_dna.fa.gz -p test_prot.fa.gz -g test_prot.gff.gz --protein_output test_prot.gz.fa --nucleotide_output test_dna.gz.out" 122 | 123 | echo "Done." 124 | echo "$TEST_TEXT" 125 | echo "" 126 | if [ "$FAILURES" -gt 0 ] 127 | then 128 | PASSED=$(( $TESTS - $FAILURES )) 129 | echo "${red}not ok overall: $FAILURES out of $TESTS amrfinder tests failed${reset}" 130 | exit 1 131 | else 132 | echo "${green}ok: all $TESTS amrfinder tests passed ${reset}" 133 | echo "Success!" 
134 | fi 135 | -------------------------------------------------------------------------------- /mutate.cpp: -------------------------------------------------------------------------------- 1 | // mutate.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 
24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Mutate a FASTA file 31 | * 32 | */ 33 | 34 | 35 | #undef NDEBUG 36 | 37 | #include "common.hpp" 38 | using namespace Common_sp; 39 | #include "alignment.hpp" 40 | using namespace Alignment_sp; 41 | #include "seq.hpp" 42 | using namespace Seq_sp; 43 | 44 | #include "common.inc" 45 | 46 | 47 | 48 | namespace 49 | { 50 | 51 | 52 | 53 | struct ThisApplication final : Application 54 | { 55 | ThisApplication () 56 | : Application ("Mutate a FASTA file") 57 | { 58 | addPositional ("in", "Input FASTA file"); 59 | addPositional ("mut", "AmrMutation table: <1-based pos> "); 60 | addFlag ("aa", "Protein/DNA"); 61 | addFlag ("orig", "Add the original, non-mutated sequences"); 62 | 63 | version = SVN_REV; 64 | } 65 | 66 | 67 | 68 | void body () const final 69 | { 70 | const string inFName = getArg ("in"); 71 | const string mutFName = getArg ("mut"); 72 | const bool aa = getFlag ("aa"); 73 | const bool orig = getFlag ("orig"); 74 | 75 | 76 | map > id2mutation; 77 | { 78 | LineInput in (mutFName); 79 | Istringstream iss; 80 | while (in. nextLine ()) 81 | { 82 | iss. reset (in. line); 83 | string seqId; 84 | size_t pos; 85 | string mutation_std; 86 | string mutation_report; 87 | iss >> seqId >> pos >> mutation_std >> mutation_report; 88 | QC_ASSERT (! mutation_report. empty ()); 89 | AmrMutation mut (pos, mutation_std, mutation_report, "X", "X", "X"); 90 | mut. qc (); 91 | id2mutation [seqId] << std::move (mut); 92 | } 93 | } 94 | 95 | 96 | Multifasta fIn (inFName, aa); 97 | while (fIn. next ()) 98 | { 99 | unique_ptr seq; 100 | try 101 | { 102 | if (aa) 103 | { 104 | auto pep = new Peptide (fIn, 1000, false); // PAR 105 | pep->pseudo = true; 106 | seq. reset (pep); 107 | } 108 | else 109 | seq. 
reset (new Dna (fIn, 100000, false)); // PAR 110 | seq->qc (); 111 | if (orig) 112 | seq->saveText (cout); 113 | if (const Vector* muts = findPtr (id2mutation, seq->getId ())) 114 | for (const AmrMutation& mut : *muts) 115 | { 116 | unique_ptr seq1 (seq->copy ()); 117 | mut. apply (seq1->seq); 118 | if (! aa) 119 | strLower (seq1->seq); 120 | seq1->name += ":" + to_string (mut. pos_real + 1) + ":" + mut. geneMutation; 121 | seq1->qc (); 122 | seq1->saveText (cout); 123 | } 124 | } 125 | catch (const exception &e) 126 | { 127 | if (seq) 128 | throw runtime_error (seq->name + "\n" + e. what ()); 129 | throw; 130 | } 131 | } 132 | } 133 | }; 134 | 135 | 136 | 137 | } // namespace 138 | 139 | 140 | 141 | 142 | int main (int argc, 143 | const char* argv[]) 144 | { 145 | ThisApplication app; 146 | return app. run (argc, argv); 147 | } 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /amrfinder_index.cpp: -------------------------------------------------------------------------------- 1 | // amrfinder_index.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. 
The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Indexing of AMRFinder data 31 | * 32 | * Dependencies: NCBI BLAST, HMMer 33 | * 34 | * Release changes: see amrfinder.cpp 35 | * 36 | */ 37 | 38 | 39 | 40 | 41 | #ifdef _MSC_VER 42 | #error "UNIX is required" 43 | #endif 44 | 45 | #undef NDEBUG 46 | #include "common.hpp" 47 | using namespace Common_sp; 48 | 49 | #include "common.inc" 50 | 51 | 52 | 53 | 54 | namespace 55 | { 56 | 57 | 58 | 59 | // ThisApplication 60 | 61 | struct ThisApplication final : ShellApplication 62 | { 63 | ThisApplication () 64 | : ShellApplication ("Index the database for AMRFinder", true, false, true, true) 65 | { 66 | addPositional ("DATABASE", "Directory with AMRFinder database"); 67 | addKey ("blast_bin", "Directory for BLAST", "", '\0', "BLAST_DIR"); 68 | addKey ("hmmer_bin", "Directory for HMMer", "", '\0', "HMMER_DIR"); 69 | version = SVN_REV; 70 | } 71 | 72 | 73 | 74 | void shellBody () const final 75 | { 76 | string dbDir = getArg ("DATABASE"); 77 | string blast_bin = getArg ("blast_bin"); 78 | string hmmer_bin = getArg ("hmmer_bin"); 79 | 80 | addDirSlash (dbDir); 81 | addDirSlash (blast_bin); 82 | addDirSlash (hmmer_bin); 83 | 84 | 85 | const Verbose vrb (qc_on); 86 | 87 | 88 | if (! directoryExists (dbDir)) 89 | throw runtime_error ("Database directory " + dbDir + " does not exist"); 90 | 91 | if (! blast_bin. empty ()) 92 | prog2dir ["makeblastdb"] = blast_bin; 93 | findProg ("makeblastdb"); 94 | 95 | if (! hmmer_bin. empty ()) 96 | prog2dir ["hmmpress"] = hmmer_bin; 97 | findProg ("hmmpress"); 98 | 99 | 100 | // Cf. 
amrfinder_update.cpp 101 | StringVector dnaPointMuts; 102 | { 103 | LineInput f (dbDir + "taxgroup.tsv", verbose () ? 1 : 0); 104 | while (f. nextLine ()) 105 | { 106 | if (isLeft (f. line, "#")) 107 | continue; 108 | string taxgroup = f. line; 109 | const int n = str2 (rfindSplit (taxgroup, '\t')); 110 | const string gpipe = rfindSplit (taxgroup, '\t'); 111 | QC_ASSERT (n >= 0); 112 | QC_ASSERT (! contains (taxgroup, ' ')); 113 | if (n) 114 | dnaPointMuts << taxgroup; 115 | } 116 | } 117 | 118 | stderr. section ("Indexing"); 119 | exec (fullProg ("hmmpress") + " -f " + shellQuote (dbDir + "AMR.LIB") + " > /dev/null 2> " + tmp + "/hmmpress.err", tmp + "/hmmpress.err"); 120 | setSymlink (dbDir, tmp + "/db", true); 121 | exec (fullProg ("makeblastdb") + " -in " + tmp + "/db/AMRProt.fa" + " -dbtype prot -logfile " + tmp + "/makeblastdb.AMRProt", tmp + "/makeblastdb.AMRProt"); 122 | exec (fullProg ("makeblastdb") + " -in " + tmp + "/db/AMR_CDS.fa" + " -dbtype nucl -logfile " + tmp + "/makeblastdb.AMR_CDS", tmp + "/makeblastdb.AMR_CDS"); 123 | for (const string& dnaPointMut : dnaPointMuts) 124 | exec (fullProg ("makeblastdb") + " -in " + tmp + "/db/AMR_DNA-" + dnaPointMut + ".fa -dbtype nucl -logfile " + tmp + "/makeblastdb.AMR_DNA-" + dnaPointMut, tmp + "/makeblastdb.AMR_DNA-" + dnaPointMut); 125 | } 126 | }; 127 | 128 | 129 | 130 | } // namespace 131 | 132 | 133 | 134 | int main (int argc, 135 | const char* argv[]) 136 | { 137 | ThisApplication app; 138 | return app. 
run (argc, argv); 139 | } 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /curl_easy.cpp: -------------------------------------------------------------------------------- 1 | // curl_easy.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 
24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * curl_easy functions 31 | * 32 | */ 33 | 34 | 35 | #undef NDEBUG 36 | 37 | #include "curl_easy.hpp" 38 | using namespace Common_sp; 39 | 40 | #include "common.inc" 41 | 42 | 43 | 44 | namespace CURL_sp 45 | { 46 | 47 | 48 | 49 | SoftwareVersion getLibVersion () 50 | { 51 | if (const curl_version_info_data* ver = curl_version_info (CURLVERSION_NOW)) 52 | { 53 | const uint major = (ver->version_num >> 16) & 0xff; 54 | const uint minor = (ver->version_num >> 8) & 0xff; 55 | const uint patch = ver->version_num & 0xff; 56 | return SoftwareVersion (major, minor, patch); 57 | } 58 | return SoftwareVersion (); 59 | } 60 | 61 | 62 | 63 | 64 | // Curl 65 | 66 | namespace 67 | { 68 | size_t write_stream_cb (char* ptr, 69 | size_t size, 70 | size_t nMemb, 71 | void* userData) 72 | { 73 | ASSERT (ptr); 74 | ASSERT (size == 1); 75 | ASSERT (userData); 76 | 77 | OFStream& f = * static_cast (userData); 78 | FOR (size_t, i, nMemb) 79 | f << ptr [i]; 80 | 81 | return nMemb; 82 | } 83 | 84 | 85 | 86 | size_t write_string_cb (char* ptr, 87 | size_t size, 88 | size_t nMemb, 89 | void* userData) 90 | { 91 | ASSERT (ptr); 92 | ASSERT (size == 1); 93 | ASSERT (userData); 94 | 95 | string& s = * static_cast (userData); 96 | // One bulk append of the whole chunk (nMemb bytes, since size == 1 is asserted above) instead of nMemb single-character appends 97 | s. append (ptr, nMemb); 98 | 99 | return nMemb; 100 | } 101 | } 102 | 103 | 104 | 105 | void Curl::download (const string &url, 106 | const string &fName) 107 | { 108 | ASSERT (! fName.
empty ()); 109 | 110 | { 111 | OFStream f (fName); 112 | curl_easy_setopt (eh, CURLOPT_WRITEFUNCTION, write_stream_cb); 113 | curl_easy_setopt (eh, CURLOPT_WRITEDATA, & f); 114 | process (url, "download"); 115 | } 116 | 117 | IFStream f (fName); 118 | string s; 119 | f >> s; 120 | if (s == " (end);) 43 | #define FOR_REV(type,i,start) FOR_REV_END(type, i, 0, (start)) 44 | // FOR(type,i,a) and FOR_REV(type,i,a) iterate over the same i 45 | // FOR_START(type,i,a,b) and FOR_REV_END(type,i,a,b) iterate over the same i 46 | 47 | #define ITER(ContainerType,iter,container) \ 48 | for (ContainerType::iterator iter = (container). begin (); iter != (container). end (); iter++) 49 | #define CONST_ITER(ContainerType,iter,container) \ 50 | for (ContainerType::const_iterator iter = (container). begin (); iter != (container). end (); iter++) 51 | #define ITER_REV(ContainerType,iter,container) \ 52 | for (ContainerType::reverse_iterator iter = (container). rbegin (); iter != (container). rend (); iter++) 53 | #define CONST_ITER_REV(ContainerType,iter,container) \ 54 | for (ContainerType::const_reverse_iterator iter = (container). rbegin (); iter != (container). rend (); iter++) 55 | 56 | 57 | #define Case break; case 58 | #define Default break; default 59 | 60 | 61 | // Exceptions 62 | 63 | #include 64 | #include 65 | 66 | 67 | #ifdef _MSC_VER 68 | #define FUNC std::string (__FUNCSIG__) + ":\n" 69 | #else 70 | #define FUNC std::string (__PRETTY_FUNCTION__) + ":\n" 71 | #endif 72 | 73 | 74 | #define ERROR_MSG(msg) \ 75 | { if (! std::uncaught_exceptions ()) \ 76 | throwf (std::string ("\"" __FILE__ "\", line ") + to_string (__LINE__) + ", in " + (FUNC) + (msg)); \ 77 | exit (1); \ 78 | } 79 | #define ERROR ERROR_MSG ("ERROR") 80 | #define NOT_IMPLEMENTED ERROR_MSG ("NOT IMPLEMENTED") 81 | #define NEVER_CALL ERROR_MSG ("NEVER CALL") 82 | 83 | 84 | #define QC_ASSERT(cond) { errno = 0; if (! 
(cond)) ERROR_MSG (#cond) } 85 | 86 | 87 | namespace 88 | { 89 | const bool debugP = 90 | #ifdef NDEBUG 91 | false 92 | #else 93 | true 94 | #endif 95 | ; 96 | } 97 | 98 | 99 | // Logic errors 100 | #ifdef NDEBUG 101 | #define ASSERT(cond) 102 | #else 103 | #define ASSERT(cond) QC_ASSERT (cond) 104 | #endif 105 | 106 | 107 | #ifdef NDEBUG 108 | #define EXEC_ASSERT(cond) cond 109 | #else 110 | #define EXEC_ASSERT(cond) { const bool c_ = (cond); if (! c_) ERROR_MSG (#cond); } 111 | #endif 112 | 113 | 114 | #define IMPLY(a,b) { if (a) { ASSERT (b) }} 115 | #define QC_IMPLY(a,b) { if (a) { QC_ASSERT (b) }} 116 | 117 | #define ASSERT_EQ(x,y,delta) ASSERT (std::fabs((x) - (y)) <= (delta)) 118 | #define QC_ASSERT_EQ(x,y,delta) QC_ASSERT (std::fabs((x) - (y)) <= (delta)) 119 | 120 | 121 | 122 | #define MODULE_INIT static bool run = false; \ 123 | if (run) \ 124 | return true; \ 125 | run = true; 126 | 127 | 128 | #define LESS_PART(x,y,part) { if ((x).part < (y).part) return true; \ 129 | if ((y).part < (x).part) return false; } 130 | #define LESS_COMP(comp) { const ebool c = comp; if (c == etrue) return true; if (c == efalse) return false; } 131 | #define COMP_PART(x,y,part) { if ((x).part < (y).part) return -1; \ 132 | if ((y).part < (x).part) return 1; } 133 | 134 | 135 | 136 | #define PRINT(x) { Offset::newLn (cout); cout << #x << " = " << (x); } 137 | 138 | #define LOG(x) { if (logPtr) *logPtr << (x) << endl; } 139 | 140 | #define XSTR(s) STR(s) 141 | #define STR(s) #s 142 | 143 | 144 | -------------------------------------------------------------------------------- /test_prot.fa: -------------------------------------------------------------------------------- 1 | >blaTEM-156 2 | MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLS 3 | RVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNIGDHVTRL 4 | DRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGS 5 | 
RGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW 6 | >blaPDC-114_blast BLAST (100% length, but 1 mismatch) 7 | MRDTRFPCLCGIAASTLLFATTPAIAGEAPADRLKALVDAAVQPVMKANDIPGLAVAISLKGEPHYFSYGLASKEDGRRV 8 | TPETLFEIGSVSKTFTATLAGYALTQDKMRLDDRASQHWPALVGSRFDGISLLDLATYTAGGLPLQFPDSVQKDQAQIRD 9 | YYRQWQPTYAPGSQRLYSNPSIGLFGYLAARSLGQPFERLMEQQVFPALGLEQTHLDVPEAALAQYAQGYGKDDRPLRAG 10 | PGPLDAEGYGVKTSAADLLRFVDANLHPERLDRPWAQALDATHRGYYKVGDMTQGLGWEAYDWPISLKRLQAGNSTPMAL 11 | QPHRIARLPAPQALEGQRLLNKTGSTNGFGAYVAFVPGRDLGLVILANRNYPNAERVKIAYAILSGLEQQGKVPLKR 12 | >blaOXA-436_partial (Should be partial OXA-48 family 13 | MRALALSAVLMVTTMIGMPAVAKEWQENKSWNAHFSEHKTQGVVVLWNENTQQGFTNDLKRANQAFLPASTFKIPNSLIA 14 | LDLGVVKDEHQVFKWDGQTRDIAAWNRDHDLITAMKYSVVPVYQEFARQIGEARMSKMLHAFDYGNEDISGNLDSFWLDG 15 | GIRISATQQIAFLRKLYHNKLHVSERSQRIVKQAMLTEANADYIIRAKTGYSVRIEPKIGWWVGWIELDDNVW 16 | >vanG 17 | MQNKKIAVIFGGNSTEYEVSLQSASAVFENINTNKFDIIPIGITRSGEWYHYTGEKEKILNNTWFEDSKN 18 | LCPVVVSQNRSVKGFLEIASDKYRIIKVDLVFPVLHGKNGENGTLQGIFELAGIPVVGCDTLSSALCMDK 19 | DRAHKLVSLAGISVPKSVTFKRFNEEAAMKEIEANLTYPLFIKPVRAGSSFGITKVIEKQELDAAIELAF 20 | EHDTEVIVEETINGFEVGCAVLGIDELIVGRVDEIELSSGFFDYTEKYTLKSSKIYMPARIDAEAEKRIQ 21 | EAAVTIYKALGCSGFSRVDMFYTPSGEIVFNEVNTIPGFTSHSRYPNMMKGIGLSFSQMLDKLIGLYVE 22 | >gyrA T86I Campylobacter 23 | MENIFSKDSDIELVDIENSIKGSYLDYSMSVIIGRALPDARDGLKPVHRRILYAMQNDEAKSRTDFVKSARIVGAVIGRYHPHGDAAVYDALVRMAQDFSMRYPSITGQGNFGSIDGDSAAAMRYTEAKMSKLSHELLKD 24 | IDKDTVDFVPNYDGSESEPDVLPSRVPNLLLNGSSGIAVGMATNIPPHSLNELIDGLLYLLDSKDASLEE 25 | IMQFIKGPDFPTGGIIYGKKGIIEAYRTGRGRVKVRAKTHIEKKTNKDVIVIDELPYQTNKARLIEQIAE 26 | LVKEKQIEGISEVRDESNKEGIRVVIELKREAMSEIVLNNLFKSTTMESTFGVIMLAIHNKEPKIFSLLE 27 | LLNLFLTHRKTVIIRRTIFELQKARARAHILEGLKIALDNIDEVIALIKNSSDNNTARDSLVAKFGLSEL 28 | QANAILDMKLGRLTGLEREKIENELAELMKEIARLEEILKSETLLENLIRDELKEIRSKFDVPRITQIED 29 | DYDDIDIEDLIPNENMVVTITHRGYIKRVPSKQYEKQKRGGKGKLAVTTYDDDFIESFFTANTHDTLMFV 30 | TDRGQLYWLKVYKIPEGSRTAKGKAVVNLINLQAEEKIMAIIPTTDFDESKSLCFFTKNGIVKRTNLSEY 31 | 
QNIRSVGVRAINLDENDELVTAIIVQRDEDEIFATGGEENLENQEIENLDDENLENEESVSTQGKMLFAV 32 | TKKGMCIKFPLAKVREIGRVSRGVTAIKFKEKNDELVGAVVIENDEQEILSISAKGIGKRTNAGEYRLQS 33 | RGGKGVICMKLTEKTKDLISVVIVDETMDLMALTSSGKMIRVDMQSIRKAGRNTSGVIVVNVENDEVVSI 34 | AKCPKEENDEDELSDENFGLDL 35 | >50S_L22 Campylobacter 50S_L22:A103V 36 | MSKALIKFIRLSSTKARLIAREVQGMNAELAMASLKFMPNKGAKYIANAISSAVANGGFEANEVIVKSCRVDAAAVLKRF 37 | RPRARGSASRIRKPTSHILVEVVKAEVKAEEKKTVAKKTTTTKAPAKKTTSTKKATAKKES 38 | 39 | >nimIJ_hmm WP_027455679.1 NimIJ family nitroimidazole resistance protein [Prevotella brevis] 40 | MSEFREMRRKRQQLTDADSIAVLQKATSGTLALLGDNDYPYAVPISYVYDNGKLYFHSAMAGHKVDAIRR 41 | CNKASFCVIEKDDVRPEKYTTYFRSVIAFGRIEIVEDEAEKRTIMHMMGNRFNPNHDDALQKELESGLAH 42 | MLAIRMDIEHLTGKEAIELVRQRGGN 43 | 44 | >aph3pp-Ib_partial_5p_neg NZ_QKNQ01000001.1 Providencia rettgeri strain Pret_2032, whole genome shotgun sequence 2160922-2162737 150-1527 704-137 45 | IRKLKEPPLNRTNIFFGESHSDWLPVRGGESGDFVFRRGDGHAFAKIAPASRRGELAGERDRLIWLKGRGVACPEVINWQEEQEGACLVITAIPGVPAADLSGADLLKAWPSMGQQLGAVHSLSVDQCPFERRLSRMFGRAVDVVSRNAVNPDFLPDEDKSTPQLDLLARVERELPVRLDQERTDMVVCHGDPCMPNFMVDPKTLQCTGLIDLGRLGTADRYADL 46 | 47 | >sul2_partial_3p_neg NZ_QKNQ01000001.1 Providencia rettgeri strain Pret_2032, whole genome shotgun sequence 2160922-2162737 150-1377 2-667 48 | SSNPDAAPVSSDTEIERIAPVLDALKADGIPVSLDSYQPATQAYALSRGVAYLNDIRGFPDAAFYPQLAKSSAKLVVMHSVQDGQADRREAPAGDIMDHIAAFFDARIAALTGAGIKRNRLVLDPGMGFFLGAAPETSLSVLARFDELRLRFDLPVLLSVSRKSFLRALTGRGPGDVGAATLAAELAAAAGGADFIRTHEPRPLRDGLAVLAALKETARIR 49 | 50 | >blaTEM-internal_stop 51 | HFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVNYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSCVDAG 52 | QEQLGRSIHYSQNDLVEYSPVTEKHLTDGMTLRELCSAAITMSDNTAANLLLTTIGGPKELTA 53 | 54 | >qacR-curated_blast (ADK23698.1) 55 | MKLKDKILGVAKELFIKNGYNATTTGEIVKLSESSKGNLYYHFKTKENLFLEILNIEESKWQEQWKKEQI 56 | KCKTNREKFYLYNELSLTTEYYYPLQNAIIEFYTEYYKTNSINEKMNKLENKYIDAYHVIFKEGNLNGEW 57 | CINDVNAVSKIAANAVNGIVTFTHEQNINERIKLMNKFSQIFLNGLSK 58 | 59 | >arsR-suppressed-in-escherichia BAE77793.1 arsR_K-12 
As(III)-sensing_metalloregulatory_transcriptional_repressor_ArsR 60 | MSFLLPIQLFKILADETRLGIVLLLSELGELCVCDLCTALDQSQPKISRHLALLRESGLLLDRKQGKWVHYRLSPHIPAW 61 | AAKIIDEAWRCEQEKVQAIVRNLARQNCSGDSKNICS 62 | 63 | >emrD3-suppressed-in-vibrio ABQ18953.1 64 | MKTKPSLWLMVIMLMFPQIVETIYSPVLGSIARSFSVSDAQAAQTLSVYFLAFALGVVIWGVLADKWGRRPTMLVGLLIY 65 | GSATFIAMQTDSFTILMLARVFSAFGIAVGSVVTQTILRDVFSGHELRKVFSLMGIGISISPVLGMLLGGQLAFAGGHQL 66 | VFLALFFIALVLFVYNLCQLPETQQVKPKIALGCLVARMFKDRQVLLSALLVALYNVALFSYYQLGAFIFSDLGLDAEQF 67 | GYSGIALGLGSLIGSFLNKTLLAKQVPQRALLLLAALLLIMGTIGVSLTLDSIGFVAAMILVVIAYGMAIPNILSTALVE 68 | YKSQAGSAGALFGLLYYLLIGSGLALTGLVQRLGVVLLMCAGITLLATLARSSHIARLP 69 | 70 | >pmrB_C84R 71 | MHFLRRPISLRQRLILTIGAILLVFELISVFWLWHESTEQIQLFEQALRDNRNNDRHIMREIREAVASLIVPGVFMVSLTLFIRYQAVRRITRPLAELQKELEARTADNLTPIAIHSATLEIEAVVSALNDLVSRLTSTLDNERLFTADVAHELRTPLAGVRLHLELLAKTHHIDVAPLVARLDQMMESVSQLLQLARAGQSFSSGNYQHVKLLEDVILPSYDELSTMLDQRQQTLLLPESAADITVQGDATLLRMLLRNLVENAHRYSPQGSNIMIKLQEDDGAVMAVEDEGPGIDESKCGELSKAFVRMDSRYGGIGLGLSIVSRITQLHHGQFFLQNRQETSGTRAWVRLKKDQYVANQI 72 | 73 | >nfsA_R15C_K141STOP 74 | MTPTIELICGHRSICHFTDEPISEAQREAIINSARATSSSSFLQYSSIIRITDKALREELVTLTGGQKHVAQAAEFWVFCADFNRHLQICPDAQLGLAEQLLLGVVDTAMMAQNALIAAESLGLGGVYIGGLRNNIEAVT 75 | 76 | >stxA2a_prot EHY1938862.1 Shiga toxin Stx2a subunit A [Escherichia coli] 77 | MKCILFKWVLCLLLGFSSVSYSREFTIDFSTQQSYVSSLNSIRTEISTPLEHISQGTTSVSVINHTPPGSYFAVDIRGLD 78 | VYQARFDHLRLIIEQNNLYVAGFVNTATNTFYRFSDFTHISVPGVTTVSMTTDSSYTTLQRVAALERSGMQISRHSLVSS 79 | YLALMEFSGNTMTRDASRAVLRFVTVTAEALRFRQIQREFRQALSETAPVYTMTPGDVDLTLNWGRISNVLPEYRGEDGV 80 | RVGRISFNNISAILSTVAVILNCHHQGARSVRAVNEESQPECQITGDRPVIKINNTLWESNTAAAFLNRKSQFLYTTGK 81 | >stxB2a_prot EHY1938863.1 Shiga toxin Stx2a subunit B [Escherichia coli] 82 | MKKMFMAVLFALVSVNAMAADCAKGKIEFSKYNEDDTFTVKVDGKEYWTSRWNLQPLLQSAQLTGMTVTIKSSTCESGSG 83 | FAEVQFNND 84 | -------------------------------------------------------------------------------- /gff.hpp: -------------------------------------------------------------------------------- 
1 | // gff.hpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 
24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * .gff file reader 31 | * 32 | */ 33 | 34 | 35 | #ifndef GFF_HPP 36 | #define GFF_HPP 37 | 38 | 39 | #include "common.hpp" 40 | using namespace Common_sp; 41 | 42 | 43 | 44 | namespace GFF_sp 45 | { 46 | 47 | 48 | 49 | struct Locus 50 | { 51 | static constexpr size_t end_delta = 3; // PAR 52 | size_t lineNum {0}; 53 | // >= 1 54 | // 0 - unknown 55 | string contig; 56 | // DNA FASTA id 57 | size_t start {0}; 58 | size_t stop {0}; 59 | // start <= stop 60 | bool strand {false}; 61 | bool partial {false}; 62 | size_t contigLen {0}; 63 | // 0 <=> unknown 64 | bool crossOrigin {false}; 65 | string gene; 66 | string product; 67 | 68 | 69 | Locus (size_t lineNum_arg, 70 | const string &contig_arg, 71 | size_t start_arg, 72 | size_t stop_arg, 73 | bool strand_arg, 74 | bool partial_arg, 75 | size_t crossOriginSeqLen, 76 | string gene_arg, 77 | string product_arg); 78 | Locus () = default; 79 | 80 | 81 | bool empty () const 82 | { return contig. empty (); } 83 | void print (ostream &os) const 84 | { os << contig 85 | << ' ' << start 86 | << ' ' << stop 87 | << ' ' << strand 88 | << ' ' << contigLen 89 | << ' ' << crossOrigin 90 | << ' ' << gene 91 | << ' ' << product 92 | << endl; 93 | } 94 | bool operator< (const Locus& other) const; 95 | size_t size () const 96 | { return crossOrigin 97 | ? 
contigLen - stop + start 98 | : stop - start; 99 | } 100 | bool atContigStart () const 101 | { return start <= end_delta; } 102 | bool atContigStop () const 103 | { return contigLen && contigLen - stop <= end_delta;} 104 | }; 105 | 106 | 107 | 108 | struct Gff 109 | { 110 | enum Type {bakta, genbank, microscope, patric, pgap, prodigal, prokka, pseudomonasdb, rast, standard/*PD-4548*/}; 111 | // Alphabetic order 112 | static const StringVector names; 113 | static Type name2type (const string &name); 114 | }; 115 | 116 | 117 | 118 | struct Annot final : Root 119 | { 120 | // Protein GFF id is a function of attributes (column in GFF) 121 | map> prot2loci; 122 | map fasta2gff_prot; 123 | // empty() => protein FASTA id = protein GFF id 124 | 125 | 126 | Annot (const string &fName, 127 | Gff::Type gffType, 128 | bool protMatch, 129 | bool lcl); 130 | // GFF 131 | // https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md 132 | // https://github.com/ncbi/amr/issues/91 133 | // Input: protMatch: property of protein FASTA: 134 | // genbank: "[locus_tag=...]" in comment 135 | // microscope: ">|ID:|| 136 | // prodigal: "ID=" in comment 137 | // lcl: property of DNA FASTA: >lcl|... 138 | /* 139 | gffType protein GFF id 140 | ------- -------------- 141 | bakta ID= 142 | genbank locus_tag=[project:]acc // if pseudo or protMatch 143 | Name=[project:]acc // else 144 | microscope ID= 145 | patric ID=...;locus_tag=... 
146 | pgap Name= 147 | prodigal ID= 148 | prokka ID= 149 | pseudomonasdb Alias= (or locus=) 150 | rast ID= 151 | standard Name= 152 | 153 | ["]acc["] 154 | */ 155 | explicit Annot (const string &fName); 156 | // Bed 157 | // https://genome.ucsc.edu/FAQ/FAQformat.html#format1 158 | 159 | 160 | void load_fasta2gff_prot (const string &fName); 161 | // Input: fName: file is created by gff_check.cpp -gff_prot_match 162 | // Output: fasta2gff_prot 163 | void load_fasta2gff_dna (const string &fName); 164 | // Input: fName: file is created by gff_check.cpp -gff_dna_match 165 | // Output: Locus::contig 166 | const Set& findLoci (const string &fasta_prot) const; 167 | // Return: !empty() 168 | // throw if not found 169 | }; 170 | 171 | 172 | 173 | 174 | } 175 | 176 | 177 | 178 | #endif 179 | -------------------------------------------------------------------------------- /test_both.expected: -------------------------------------------------------------------------------- 1 | Protein id Contig id Start Stop Strand Element symbol Element name Scope Type Subtype Class Subclass Method Target length Reference sequence length % Coverage of reference % Identity to reference Alignment length Closest reference accession Closest reference name HMM accession HMM description Hierarchy node 2 | NA contig01 1 984 + blaTEMp_G162T Escherichia amoxicillin-clavulanic acid/piperacillin-tazobactam/ticarcillin-clavulanic acid resistant blaTEM promoter core AMR POINT BETA-LACTAM AMOXICILLIN-CLAVULANIC_ACID/PIPERACILLIN-TAZOBACTAM/TICARCILLIN-CLAVULANIC_ACID POINTN 984 1176 83.67 99.80 984 NZ_CP095603.1:148777-149952 blaTEM promoter region NA NA NA 3 | blaTEM-156 contig01 101 961 + blaTEM-156 class A beta-lactamase TEM-156 core AMR AMR BETA-LACTAM BETA-LACTAM ALLELEP 286 286 100.00 100.00 286 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase blaTEM-156 4 | blaPDC-114_blast contig02 1 1191 + blaPDC PDC family class C beta-lactamase core AMR AMR BETA-LACTAM 
CEPHALOSPORIN BLASTP 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NF000422.6 PDC family class C beta-lactamase blaPDC 5 | blaOXA-436_partial contig03 101 802 + blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF012161.0 class D beta-lactamase blaOXA-48_fam 6 | vanG contig04 101 1147 + vanG D-alanine--D-serine ligase VanG core AMR AMR GLYCOPEPTIDE VANCOMYCIN EXACTP 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG vanG 7 | NA contig04 1261 2391 + blaEC BlaEC family class C beta-lactamase plus AMR AMR BETA-LACTAM BETA-LACTAM BLASTX 377 377 100.00 98.14 377 WP_063610930.1 extended-spectrum class C beta-lactamase EC-15 NA NA blaEC 8 | NA contig08 1 700 + blaTEMp_G162T Escherichia amoxicillin-clavulanic acid/piperacillin-tazobactam/ticarcillin-clavulanic acid resistant blaTEM promoter core AMR POINT BETA-LACTAM AMOXICILLIN-CLAVULANIC_ACID/PIPERACILLIN-TAZOBACTAM/TICARCILLIN-CLAVULANIC_ACID POINTN 700 1176 59.52 99.71 700 NZ_CP095603.1:148777-149952 blaTEM promoter region NA NA NA 9 | NA contig08 101 700 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIAL_CONTIG_ENDX 200 286 69.93 100.00 200 WP_061158039.1 class A beta-lactamase TEM-156 NA NA blaTEM 10 | aph3pp-Ib_partial_5p_neg contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDP 225 267 81.27 100.00 217 WP_001082319.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase aph(3'')-Ib 11 | sul2_partial_3p_neg contig09 715 1377 - sul2 sulfonamide-resistant dihydropteroate synthase Sul2 core AMR AMR SULFONAMIDE SULFONAMIDE PARTIAL_CONTIG_ENDP 221 271 81.55 100.00 221 WP_001043265.1 sulfonamide-resistant 
dihydropteroate synthase Sul2 NA NA sul2 12 | NA contig10 486 1307 + blaOXA OXA-9 family oxacillin-hydrolyzing class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 274 274 100.00 99.64 274 WP_000722315.1 oxacillin-hydrolyzing class D beta-lactamase OXA-9 NA NA blaOXA-9_fam 13 | NA contig11 1 984 + blaTEMp_G162T Escherichia amoxicillin-clavulanic acid/piperacillin-tazobactam/ticarcillin-clavulanic acid resistant blaTEM promoter core AMR POINT BETA-LACTAM AMOXICILLIN-CLAVULANIC_ACID/PIPERACILLIN-TAZOBACTAM/TICARCILLIN-CLAVULANIC_ACID POINTN 984 1176 83.67 96.04 984 NZ_CP095603.1:148777-149952 blaTEM promoter region NA NA NA 14 | blaTEM-internal_stop contig11 113 547 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 144 286 50.35 97.22 144 WP_000027057.1 broad-spectrum class A beta-lactamase TEM-1 NA NA blaTEM 15 | qacR-curated_blast contig12 71 637 + qacR multidrug-binding transcriptional regulator QacR plus STRESS BIOCIDE QUATERNARY AMMONIUM QUATERNARY AMMONIUM BLASTP 188 188 100.00 99.47 188 ADK23698.1 multidrug-binding transcriptional regulator QacR NA NA qacR 16 | emrD3-suppressed-in-vibrio contig13 1 1137 + emrD3 multidrug efflux MFS transporter EmrD-3 plus AMR AMR EFFLUX EFFLUX EXACTP 379 379 100.00 100.00 379 ABQ18953.1 multidrug efflux MFS transporter EmrD-3 NA NA emrD3 17 | NA contig14 1 1089 + pmrB_C84R Escherichia colistin resistant PmrB core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA pmrB 18 | pmrB_C84R contig14 1093 2181 + pmrB_C84R Escherichia colistin resistant PmrB core AMR POINT COLISTIN COLISTIN POINTP 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA pmrB 19 | NA contig15 1 2905 + 23S_A2058T Escherichia azithromycin/erythromycin/telithromycin resistant 23S core AMR POINT MACROLIDE AZITHROMYCIN/ERYTHROMYCIN/TELITHROMYCIN POINTN 2905 2905 100.00 99.97 
2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA NA 20 | NA contig16 1 720 + nfsA_K141Ter Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA nfsA 21 | NA contig16 1 720 + nfsA_R15C Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA nfsA 22 | NA contig17 1 247 + ampC_T-14TGT Escherichia cephalosporin resistant ampC promoter core AMR POINT BETA-LACTAM CEPHALOSPORIN POINTN 247 245 100.00 99.19 247 NZ_CP041538.1:1149245-1149489 ampC/blaEC promoter region NA NA NA 23 | stxA2a_prot contig18 279 1238 + stxA2 Shiga toxin Stx2 subunit A plus VIRULENCE VIRULENCE STX2 stxA2 EXACTP 319 319 100.00 100.00 319 TJA36680.1 Shiga toxin Stx2 subunit A NF041702.1 Shiga toxin Stx2 subunit A stxA2_acd 24 | NA contig18 279 1516 + stx2a_operon stx2a operon plus VIRULENCE STX_TYPE STX2 STX2A COMPLETE 1238 NA NA 100.00 408 AAS07600.1,AAM90978.1 Shiga toxin stx2a NA NA stxA2a::stxB2a 25 | stxB2a_prot contig18 1250 1519 + stxB2 Shiga toxin Stx2a subunit B plus VIRULENCE VIRULENCE STX2 stxB2a EXACTP 89 89 100.00 100.00 89 AAM90978.1 Shiga toxin Stx2a subunit B NF033660.0 Shiga toxin Stx2 subunit B stxB2a 26 | nimIJ_hmm contigX 1 501 + nimIJ NimIJ family 5-nitroimidazole reductase core AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 165 98.18 76.54 162 WP_005812825.1 NimIJ family 5-nitroimidazole reductase NF000262.1 NimIJ family 5-nitroimidazole reductase nimIJ 27 | -------------------------------------------------------------------------------- /fasta_extract.cpp: -------------------------------------------------------------------------------- 1 | // fasta_check.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United 
States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Extract sequences out of a FASTA file 31 | * 32 | */ 33 | 34 | 35 | #undef NDEBUG 36 | 37 | #include "common.hpp" 38 | using namespace Common_sp; 39 | 40 | #include "common.inc" 41 | 42 | 43 | 44 | namespace 45 | { 46 | 47 | 48 | 49 | struct Segment 50 | // not circular 51 | { 52 | size_t start {0}; 53 | size_t stop {0}; 54 | bool strand {true}; 55 | // false <=> negative 56 | string genesymbol; 57 | string name; 58 | 59 | 60 | bool isDna () const 61 | { return stop; } 62 | size_t size () const 63 | { return stop - start; } 64 | void saveText (ostream &os) const 65 | { os << start 66 | << '\t' << stop 67 | << '\t' << strand 68 | << '\t' << genesymbol 69 | << '\t' << name 70 | << endl; 71 | } 72 | }; 73 | 74 | 75 | 76 | char complementaryNucleotide (char wildNucleotide) 77 | { 78 | char r = ' '; 79 | switch (toLower (wildNucleotide)) 80 | { 
81 | case 'a': r = 't'; break; 82 | case 'c': r = 'g'; break; 83 | case 'g': r = 'c'; break; 84 | case 't': r = 'a'; break; 85 | case 'm': r = 'k'; break; 86 | case 'r': r = 'y'; break; 87 | case 'w': r = 'w'; break; 88 | case 's': r = 's'; break; 89 | case 'y': r = 'r'; break; 90 | case 'k': r = 'm'; break; 91 | case 'v': r = 'b'; break; 92 | case 'h': r = 'd'; break; 93 | case 'd': r = 'h'; break; 94 | case 'b': r = 'v'; break; 95 | case 'n': r = 'n'; break; 96 | case '-': r = '-'; break; 97 | default: 98 | throw runtime_error ("Bad nucleotide " + to_string (wildNucleotide)); 99 | } 100 | if (isupper (wildNucleotide)) 101 | r = toUpper (r); 102 | 103 | return r; 104 | } 105 | 106 | 107 | 108 | bool process (const string &id, 109 | string &seq, 110 | const map> &id2segments) 111 | { 112 | if (id. empty ()) 113 | return false; 114 | const Vector* segments = findPtr (id2segments, id); 115 | if (! segments) 116 | return false; 117 | 118 | replaceStr (seq, "-", ""); 119 | QC_ASSERT (! seq. empty ()); 120 | 121 | for (Segment& seg : var_cast (*segments)) 122 | { 123 | cout << '>' << id; 124 | if (seg. isDna ()) 125 | { 126 | QC_ASSERT (seg. start <= seq. size ()); 127 | minimize (seg. stop, seq. size ()); 128 | QC_ASSERT (seg. start < seg. stop); 129 | cout << ':' << seg. start + 1 << '-' << seg. stop << ' ' << "strand:" << (seg. strand ? '+' : '-'); 130 | } 131 | cout << ' ' << seg. genesymbol << ' ' << seg. name << endl; 132 | string seq1 (seq); 133 | if (seg. isDna ()) 134 | { 135 | ASSERT (seg. stop <= seq1. size ()); 136 | seq1 = seq1. substr (seg. start, seg. size ()); 137 | if (! seg. strand) 138 | { 139 | reverse (seq1); 140 | for (char &c : seq1) 141 | c = complementaryNucleotide (c); 142 | } 143 | //strLower (seq1); // Letter case can indicate nucleotide quality 144 | } 145 | //else 146 | //strUpper (seq1); 147 | constexpr size_t line_len = 60; // PAR 148 | for (size_t i = 0; i < seq1. size (); i += line_len) 149 | cout << seq1. 
substr (i, line_len) << endl; 150 | } 151 | 152 | return true; 153 | } 154 | 155 | 156 | 157 | struct ThisApplication final : Application 158 | { 159 | ThisApplication () 160 | : Application ("Extract sequences out of a FASTA file") 161 | { 162 | addPositional ("fasta", "FASTA file"); 163 | addPositional ("target", "Target identifiers in the FASTA file to extract.\n\ 164 | Line format for amino acid sequences : \n\ 165 | Line format for nucleotide sequences : =1)> = start)> \ 166 | "); 167 | addFlag ("aa", "Amino acid sequenes, otherwise nucleotide"); 168 | version = SVN_REV; 169 | } 170 | 171 | 172 | 173 | void body () const final 174 | { 175 | const string fName = getArg ("fasta"); 176 | const string targetFName = getArg ("target"); 177 | const bool aa = getFlag ("aa"); 178 | 179 | 180 | map> id2segments; 181 | { 182 | LineInput f (targetFName); 183 | string id; 184 | Istringstream iss; 185 | while (f. nextLine ()) 186 | { 187 | iss. reset (f. line); 188 | Segment seg; 189 | iss >> id; 190 | if (! aa) 191 | { 192 | char strand = '\0'; 193 | iss >> seg. start >> seg. stop >> strand; 194 | QC_ASSERT (seg. start); 195 | QC_ASSERT (seg. start <= seg. stop); 196 | seg. start--; 197 | QC_ASSERT ( strand == '+' 198 | || strand == '-' 199 | ); 200 | seg. strand = (strand == '+'); 201 | } 202 | iss >> seg. genesymbol; 203 | seg. name = f. line. substr ((size_t) iss. tellg ()); 204 | trim (seg. name); 205 | QC_ASSERT (aa == ! seg. isDna ()); 206 | id2segments [id] << std::move (seg); 207 | } 208 | } 209 | if (verbose ()) 210 | for (const auto& it : id2segments) 211 | { 212 | cout << it. first << ": " << endl; 213 | for (const Segment& seg : it. second) 214 | { 215 | cout << " "; 216 | seg. saveText (cout); 217 | } 218 | } 219 | if (id2segments. empty ()) 220 | return; 221 | 222 | 223 | size_t processed = 0; 224 | { 225 | LineInput f (fName); 226 | string id; 227 | string seq; 228 | while (f. nextLine ()) 229 | { 230 | trimTrailing (f. line); 231 | if (f. line. 
empty ()) 232 | continue; 233 | if (f. line [0] == '>') 234 | { 235 | processed += process (id, seq, id2segments); 236 | size_t pos = 1; 237 | while (pos < f. line. size () && ! isspace (f. line [pos])) 238 | pos++; 239 | id = f. line. substr (1, pos - 1); 240 | seq. clear (); 241 | } 242 | else 243 | seq += f. line; 244 | } 245 | processed += process (id, seq, id2segments); 246 | } 247 | if (processed != id2segments. size ()) 248 | throw runtime_error ("Requested identifiers: " + to_string (id2segments. size ()) + ", but processed: " + to_string (processed)); 249 | // Assumed: no duplicate identifiers in FASTA 250 | } 251 | }; 252 | 253 | 254 | 255 | } // namespace 256 | 257 | 258 | 259 | int main (int argc, 260 | const char* argv[]) 261 | { 262 | ThisApplication app; 263 | return app. run (argc, argv); 264 | } 265 | 266 | 267 | 268 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | # PUBLIC DOMAIN NOTICE This software/database is "United States Government 3 | # Work" under the terms of the United States Copyright Act. It was written as 4 | # part of the authors' official duties for the United States Government and 5 | # thus cannot be copyrighted. This software/database is freely available to the 6 | # public for use without a copyright notice. Restrictions cannot be placed on 7 | # its present or future use. 8 | # 9 | # Although all reasonable efforts have been taken to ensure the accuracy and 10 | # reliability of the software and data, the National Center for Biotechnology 11 | # Information (NCBI) and the U.S. Government do not and cannot warrant the 12 | # performance or results that may be obtained by using this software or data. 13 | # NCBI, NLM, and the U.S. 
# Government disclaim all warranties as to performance,
# merchantability or fitness for any particular purpose.
#
# In any work or product derived from this material, proper attribution of the
# authors as the source of the software or data should be made, using:
# https://ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder/ as the
# citation.
###############################################################################

# the SVNREV is set automatically here for convenience,
# but when actually building we should override it like:
# make all SVNREV=-D\'SVN_REV=\"$VERSION\"\' or use
# a version.txt file
ifeq ($(wildcard version.txt),)
  VERSION_STRING := $(shell git describe --tags)
else
  VERSION_STRING := $(shell cat version.txt)
endif
SVNREV := -D'SVN_REV="$(VERSION_STRING)"'

INSTALL=install

# make it possible to hard define a database directory
# Define default paths
# This is a little convoluted because I broke things and don't want
# to change two different ways of defining the paths. This could
# be simplified in a later release
PREFIX ?= /usr/local
ifneq '$(INSTALL_DIR)' ''
  bindir=$(INSTALL_DIR)
endif
bindir ?= $(PREFIX)/bin
ifneq '$(CONDA_DB_DIR)' ''
  DBDIR := -D'CONDA_DB_DIR="$(CONDA_DB_DIR)"'
endif
ifneq '$(DEFAULT_DB_DIR)' ''
  DBDIR := -D'CONDA_DB_DIR="$(DEFAULT_DB_DIR)"'
endif

# for testing database updates: "make TEST_UPDATE=1" compiles with
# -D TEST_UPDATE (used by test_database_update.sh)
ifdef TEST_UPDATE
  TEST_UPDATE_DB := '-D TEST_UPDATE'
endif

# detect system architecture and set appropriate flags
# this is probably not the best way (i.e. M1 Mac would be arm64)
# but it works for Nvidia Jetson boards (aarch64)
ARCH := $(shell uname -m)
OS := $(shell uname -s)
# "hack": if arm64 we can set to aarch64
# as AArch64 and ARM64 refer to the same thing
# this should build for Mac M1 and other arm64 chips
ifeq ($(ARCH),arm64)
  ARCH := aarch64
endif
# report detected OS and arch in stdout
$(info Detected architecture: $(OS) $(ARCH))
# set CFLAGS based on arch
ifeq ($(ARCH),aarch64)
  # set arm CFLAGS
  CPPFLAGS = -std=gnu++17 -pthread --signed-char -falign-jumps -fno-math-errno -O3
else
  # set x86_x64 CFLAGS
  CPPFLAGS = -std=gnu++17 -pthread -malign-double -fno-math-errno -O3
endif
# was: -std=gnu++14

CXX=g++
COMPILE.cpp= $(CXX) $(CPPFLAGS) $(SVNREV) $(DBDIR) $(TEST_UPDATE_DB) -c


.PHONY: all clean install release stxtyper test github_binaries

BINARIES= amr_report amrfinder amrfinder_index amrfinder_update fasta_check \
	fasta_extract fasta2parts gff_check dna_mutation mutate disruption2genesymbol

all: $(BINARIES) stxtyper

# $(MAKE) rather than "make": the sub-make inherits command-line variables,
# flags and the job server of the parent make
release: clean
	svnversion . > version.txt
	$(MAKE) all

common.o: common.hpp common.inc
curl_easy.o: curl_easy.hpp common.hpp common.inc
gff.o: gff.hpp common.hpp common.inc
alignment.o: alignment.hpp seq.hpp common.hpp common.inc
seq.o: seq.hpp graph.hpp common.hpp common.inc

amr_report.o: common.hpp common.inc gff.hpp alignment.hpp tsv.hpp seq.hpp columns.hpp version.txt
amr_reportOBJS=amr_report.o common.o gff.o alignment.o seq.o graph.o
amr_report: $(amr_reportOBJS)
	$(CXX) $(LDFLAGS) -o $@ $(amr_reportOBJS)

amrfinder.o: common.hpp common.inc gff.hpp seq.hpp tsv.hpp columns.hpp version.txt
amrfinderOBJS=amrfinder.o common.o gff.o tsv.o
amrfinder: $(amrfinderOBJS)
	$(CXX) $(LDFLAGS) -o $@ $(amrfinderOBJS) -pthread $(DBDIR)

amrfinder_update.o: common.hpp common.inc curl_easy.hpp version.txt
amrfinder_updateOBJS=amrfinder_update.o common.o curl_easy.o
amrfinder_update: $(amrfinder_updateOBJS)
	@if [ "$(TEST_UPDATE)" != "" ] ; \
	then \
		touch amrfinder_update.cpp ;\
	fi # make sure the next make command rebuilds amrfinder_update
	$(CXX) $(LDFLAGS) -o $@ $(amrfinder_updateOBJS) -lcurl

amrfinder_index.o: common.hpp common.inc version.txt
amrfinder_indexOBJS=amrfinder_index.o common.o
amrfinder_index: $(amrfinder_indexOBJS)
	$(CXX) $(LDFLAGS) -o $@ $(amrfinder_indexOBJS)

fasta_check.o: common.hpp common.inc version.txt
fasta_checkOBJS=fasta_check.o common.o
fasta_check: $(fasta_checkOBJS)
	$(CXX) $(LDFLAGS) -o $@ $(fasta_checkOBJS)

fasta_extract.o: common.hpp common.inc version.txt
fasta_extractOBJS=fasta_extract.o common.o
fasta_extract: $(fasta_extractOBJS)
	$(CXX) $(LDFLAGS) -o $@ $(fasta_extractOBJS)

fasta2parts.o: common.hpp common.inc version.txt
fasta2partsOBJS=fasta2parts.o common.o
fasta2parts: $(fasta2partsOBJS)
	$(CXX) $(LDFLAGS) -o $@ $(fasta2partsOBJS)

gff_check.o: common.hpp common.inc gff.hpp version.txt
gff_checkOBJS=gff_check.o common.o gff.o
gff_check: $(gff_checkOBJS)
	$(CXX) $(LDFLAGS) -o $@ $(gff_checkOBJS)

dna_mutation.o: common.hpp common.inc alignment.hpp seq.hpp tsv.hpp columns.hpp version.txt
dna_mutationOBJS=dna_mutation.o common.o alignment.o seq.o graph.o
dna_mutation: $(dna_mutationOBJS)
	$(CXX) $(LDFLAGS) -o $@ $(dna_mutationOBJS)

# mutate and disruption2genesymbol are linked with $(LDFLAGS) like all other binaries
mutate.o: common.hpp common.inc alignment.hpp seq.hpp version.txt
mutateOBJS=mutate.o common.o alignment.o seq.o graph.o
mutate: $(mutateOBJS)
	$(CXX) $(LDFLAGS) -o $@ $(mutateOBJS)

disruption2genesymbol.o: common.hpp common.inc seq.hpp version.txt
disruption2genesymbolOBJS=disruption2genesymbol.o common.o alignment.o seq.o graph.o
disruption2genesymbol: $(disruption2genesymbolOBJS)
	$(CXX) $(LDFLAGS) -o $@ $(disruption2genesymbolOBJS)

stxtyper:
	$(MAKE) -C stx

clean:
	rm -f *.o
	rm -f $(BINARIES)
	$(MAKE) -C stx clean

# "mkdir -p" and "ln -sf" make the target repeatable:
# installing over an existing installation does not fail
install:
	@if [ ! -e $(DESTDIR)$(bindir) ]; \
	then \
		mkdir -p $(DESTDIR)$(bindir); \
	fi
	$(INSTALL) $(BINARIES) $(DESTDIR)$(bindir)
	$(MAKE) -C stx install PREFIX=$(PREFIX) bindir=$(bindir)
	mkdir -p $(DESTDIR)$(bindir)/stx
	ln -sf ../stxtyper $(DESTDIR)$(bindir)/stx/stxtyper

# amrfinder binaries for github binary release
GITHUB_FILE=amrfinder_binaries_v$(VERSION_STRING)
GITHUB_FILES = test_amrfinder.sh test_*.expected test_*.fa test_*.gff $(BINARIES)

github_binaries:
	@if [ ! -e version.txt ]; \
	then \
		echo >&2 "version.txt required to make a distribution file"; \
		false; \
	fi
#   first recompile amrfinder.o to pick up the new version info
#   and remove leaky NCBI paths
	$(MAKE) clean
#   make all CXX=/usr/bin/g++ LD_RUN_PATH=
	$(MAKE) all LD_RUN_PATH=
	mkdir $(GITHUB_FILE)
	echo $(VERSION_STRING) > $(GITHUB_FILE)/version.txt
	cp $(GITHUB_FILES) $(GITHUB_FILE)
#   make -C stx
#   make -C stx install INSTALL_DIR=../$(GITHUB_FILE)/stx CXX=/usr/bin/g++ LD_RUN_PATH=
	$(MAKE) -C stx install INSTALL_DIR=../$(GITHUB_FILE)/stx LD_RUN_PATH=
	cp stx/test_stxtyper.sh stx/version.txt $(GITHUB_FILE)/stx
	mkdir $(GITHUB_FILE)/stx/test
	cp -R stx/test/*.fa stx/test/*.expected $(GITHUB_FILE)/stx/test
	if [ -e $(GITHUB_FILE).tar.gz ]; then rm $(GITHUB_FILE).tar.gz; fi
#   "&&": do not create the archive from the wrong directory if cd fails
	cd $(GITHUB_FILE) && ln -s stx/stxtyper . && tar cvfz ../$(GITHUB_FILE).tar.gz *
	rm -r $(GITHUB_FILE)/*
	rmdir $(GITHUB_FILE)

test: $(DISTFILES) Makefile *.cpp *.hpp *.inc test_dna.fa test_prot.fa test_prot.gff test_dna.expected test_prot.expected test_both.expected
	$(MAKE) -C stx test
#   test the amrfinder in the current directory
#   with the data in the current directory
	./test_amrfinder.sh -n
This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Check the correctness of a FASTA file 31 | * 32 | */ 33 | 34 | 35 | #undef NDEBUG 36 | 37 | #include "common.hpp" 38 | using namespace Common_sp; 39 | 40 | #include "common.inc" 41 | 42 | 43 | 44 | namespace 45 | { 46 | 47 | 48 | 49 | struct ThisApplication final : Application 50 | { 51 | ThisApplication () 52 | : Application ("Check the correctness of a FASTA file. Exit with an error if it is incorrect. Print the number of sequences, max. sequence length and total sequence length") 53 | { 54 | addPositional ("in", "FASTA file"); 55 | addFlag ("aa", "Amino acid sequenes, otherwise nucleotide"); 56 | addFlag ("hyphen", "Hyphens are allowed"); 57 | addFlag ("ambig", "Ambiguous characters are allowed"); 58 | addKey ("ambig_max", "Max. 
number of ambiguous characters in sequences", "0"); 59 | addFlag ("stop_codon", "Stop codons ('*') in the protein sequence are allowed"); 60 | addKey ("len", "Output file with lines: "); 61 | addKey ("out", "Output FASTA file with some of the issues fixed"); 62 | version = SVN_REV; 63 | } 64 | 65 | 66 | 67 | void body () const final 68 | { 69 | const string fName = getArg ("in"); 70 | const bool aa = getFlag ("aa"); 71 | const bool hyphen = getFlag ("hyphen"); 72 | const bool ambig = getFlag ("ambig"); 73 | const size_t ambig_max = str2 (getArg ("ambig_max")); 74 | const bool stop_codon = getFlag ("stop_codon"); 75 | const string lenFName = getArg ("len"); 76 | const string outFName = getArg ("out"); 77 | 78 | QC_IMPLY (stop_codon, aa); 79 | 80 | 81 | unique_ptr lenF; 82 | if (! lenFName. empty ()) 83 | lenF. reset (new OFStream (lenFName)); 84 | unique_ptr outF; 85 | if (! outFName. empty ()) 86 | outF. reset (new OFStream (outFName)); 87 | size_t lines = 0; 88 | StringVector ids; ids. reserve (100000); // PAR 89 | size_t seqSize_max = 0; 90 | size_t seqSize_sum = 0; 91 | //string errorS; 92 | // One sequence 93 | size_t xs = 0; 94 | string header; 95 | string seq; 96 | 97 | auto processSeq = [&] () 98 | { 99 | if (! lines) 100 | return; 101 | ASSERT (! header. empty ()); 102 | ASSERT (! ids. empty ()); 103 | const string id (ids. back ()); 104 | if (aa && ! stop_codon) 105 | { 106 | while (! seq. empty () && seq. back () == '*') 107 | if (outF) 108 | seq. erase (seq. size () - 1); 109 | else 110 | throw runtime_error (id + ": '*' at the sequence end"); 111 | } 112 | if (seq. empty ()) 113 | throw runtime_error (id + ": Empty sequence"); 114 | bool skip = false; 115 | if (! ambig && xs > ambig_max) 116 | { 117 | if (outF) 118 | skip = true; 119 | else 120 | throw runtime_error (id + ": Too many ambiguities"); 121 | } 122 | if (skip) 123 | { LOG ("Skipping " + id); } 124 | else 125 | { 126 | if (lenF. get ()) 127 | *lenF << id << '\t' << seq. 
size () << endl; 128 | if (outF) 129 | *outF << header << endl << seq << endl; 130 | maximize (seqSize_max, seq. size ()); 131 | seqSize_sum += seq. size (); 132 | } 133 | xs = 0; 134 | header. clear (); 135 | seq. clear (); 136 | }; 137 | 138 | size_t nuc = 0; 139 | { 140 | LineInput f (fName); 141 | string id; 142 | while (f. nextLine ()) 143 | { 144 | trimTrailing (f. line); 145 | if (f. line. empty ()) 146 | continue; 147 | const string errorS ("File " + fName + ", " + f. lineStr (false) + ": "); 148 | if (f. line [0] == '>') 149 | { 150 | size_t pos = 1; 151 | while (pos < f. line. size () && ! isspace (f. line [pos])) 152 | pos++; 153 | id = f. line. substr (1, pos - 1); 154 | if (id. empty ()) 155 | throw runtime_error (errorS + "Empty sequence identifier"); 156 | #if 0 157 | if (id. size () > 1000) // PAR 158 | throw runtime_error (errorS + "Too long sequence identifier"); 159 | #endif 160 | for (const char c : id) 161 | if (! printable (c)) 162 | throw runtime_error (errorS + "Non-printable character in the sequence identifier: " + to_string ((int) c)); 163 | // BLAST: PD-4548 164 | if (! aa) 165 | { 166 | if (id. front () == '?') 167 | throw runtime_error (errorS + "Sequence identifier starts with '?'"); 168 | for (const char c : {',', ';', '.', '~'}) 169 | if (id. back () == c) 170 | throw runtime_error (errorS + "Sequence identifier ends with " + strQuote (string (1, c))); 171 | if (contains (id, "\\t")) 172 | throw runtime_error (errorS + "Sequence identifier contains '\\t'"); 173 | if (contains (id, ",,")) 174 | throw runtime_error (errorS + "Sequence identifier contains ',,'"); 175 | } 176 | processSeq (); 177 | header = f. line; 178 | ids << id; 179 | } 180 | else 181 | { 182 | if (! lines) 183 | throw runtime_error (errorS + "FASTA should start with '>'"); 184 | for (const char c : f. 
line) 185 | { 186 | bool skip = false; 187 | if (c == '-') 188 | if (hyphen) 189 | ; 190 | else 191 | { 192 | if (outF) 193 | skip = true; 194 | else 195 | throw runtime_error (errorS + "Hyphen in the sequence"); 196 | } 197 | else 198 | { 199 | const char c1 = toLower (c); 200 | if (aa) 201 | { 202 | if (! charInSet (c1, "acdefghiklmnpqrstvwyxbzjuoacdefghiklmnpqrstvwyxbzjuo*")) 203 | throw runtime_error (errorS + "Wrong amino acid character: (code = " + to_string ((int) c) + ") '" + c + "'"); 204 | if (charInSet (c1, "acgt")) 205 | nuc++; 206 | if (charInSet (c1, "xbzjuo")) 207 | xs++; 208 | } 209 | else 210 | { 211 | if (! charInSet (c1, "acgtbdhkmnrsvwyacgtbdhkmnrsvwy")) 212 | throw runtime_error (errorS + "Wrong nucleotide character: (code = " + to_string ((int) c) + ") '" + c + "'"); 213 | if (charInSet (c1, "bdhkmnrsvwy")) 214 | xs++; 215 | } 216 | } 217 | if (! skip) 218 | seq += c; 219 | } 220 | } 221 | lines++; 222 | } 223 | } 224 | processSeq (); // Last sequence 225 | if (! lines) 226 | throw runtime_error ("Empty file"); 227 | if (aa && (double) nuc / (double) seqSize_sum > 0.9) // PAR 228 | throw runtime_error ("Protein sequences looks like a nucleotide sequences"); 229 | 230 | ids. sort (); 231 | const size_t index = ids. findDuplicate (); 232 | if (index != no_index) 233 | throw runtime_error ("Duplicate identifier: " + ids [index]); 234 | 235 | cout << ids. size () << endl 236 | << seqSize_max << endl 237 | << seqSize_sum << endl; 238 | } 239 | }; 240 | 241 | 242 | 243 | } // namespace 244 | 245 | 246 | 247 | int main (int argc, 248 | const char* argv[]) 249 | { 250 | ThisApplication app; 251 | return app. 
run (argc, argv); 252 | } 253 | 254 | 255 | 256 | -------------------------------------------------------------------------------- /alignment.hpp: -------------------------------------------------------------------------------- 1 | // alignment.hpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Protein or DNA mutations library. 
31 | * 32 | */ 33 | 34 | 35 | #include "common.hpp" 36 | using namespace Common_sp; 37 | #include "seq.hpp" 38 | using namespace Seq_sp; 39 | 40 | 41 | 42 | 43 | namespace Alignment_sp 44 | { 45 | 46 | 47 | 48 | static constexpr char pm_delimiter = '_'; 49 | 50 | 51 | 52 | struct AmrMutation final : Root 53 | // Database 54 | { 55 | size_t pos_real {0}; 56 | // In whole reference sequence 57 | // = start of reference 58 | 59 | string geneMutation_std; 60 | // Function of geneMutation_std 61 | // Upper-case 62 | string reference; 63 | string allele; 64 | string gene; 65 | int pos_std {0}; 66 | size_t frameshift {no_index}; 67 | // Position of '*' after getStop() 68 | int frameshift_insertion {0}; 69 | 70 | // To be reported 71 | // !empty() 72 | string geneMutation; 73 | string classS; 74 | string subclass; 75 | string name; 76 | // Species binomial + resistance 77 | 78 | 79 | // Input: pos_arg: 1-based 80 | AmrMutation (size_t pos_real_arg, 81 | const string &geneMutation_std_arg, 82 | const string &geneMutation_arg, 83 | const string &class_arg, 84 | const string &subclass_arg, 85 | const string &name_arg); 86 | AmrMutation (size_t pos_arg, 87 | const string &geneMutation_std_arg) 88 | : AmrMutation (pos_arg, geneMutation_std_arg, geneMutation_std_arg, "X", "X", "X") 89 | {} 90 | AmrMutation () = default; 91 | private: 92 | static void parse (const string &geneMutation_std, 93 | string &reference, 94 | string &allele, 95 | string &gene, 96 | int &pos_std, 97 | size_t &frameshift, 98 | int &frameshift_insertion); 99 | public: 100 | void qc () const override; 101 | void saveText (ostream &os) const override 102 | { if (empty ()) 103 | os << "empty"; 104 | else 105 | os << pos_real + 1 106 | << ' ' << geneMutation 107 | << ' ' << frameshift_insertion 108 | << ' ' << name; 109 | } 110 | bool empty () const override 111 | { return geneMutation_std. empty (); } 112 | 113 | 114 | size_t getStop () const 115 | { return pos_real + reference. 
size (); } 116 | string wildtype () const 117 | { return gene + "_" + reference + to_string (pos_std + 1) + reference; } 118 | bool operator< (const AmrMutation &other) const; 119 | bool operator== (const AmrMutation &other) const 120 | { return geneMutation_std == other. geneMutation_std; } 121 | void apply (string &seq) const 122 | { if (pos_real >= seq. size ()) 123 | throw runtime_error ("AmrMutation position " + to_string (pos_real) + " is outside the sequence: " + seq); 124 | if (frameshift != no_index) 125 | throw runtime_error ("AmrMutation is a frameshift"); 126 | if (verbose ()) 127 | cerr << seq. substr (0, pos_real) 128 | << endl << allele 129 | << endl << seq. substr (pos_real + reference. size ()) 130 | << endl; 131 | seq = seq. substr (0, pos_real) + allele + seq. substr (pos_real + reference. size ()); 132 | } 133 | }; 134 | 135 | 136 | 137 | struct Alignment; 138 | 139 | 140 | 141 | struct SeqChange final : Root 142 | // Observation 143 | { 144 | const Alignment* al {nullptr}; 145 | // !nullptr 146 | //bool fromAllele {false}; 147 | 148 | // In alignment 149 | size_t start {0}; 150 | size_t len {0}; 151 | 152 | // No '-' 153 | string reference; 154 | // Insertion => start is artifically decremented and len is incremented => !empty() 155 | string allele; 156 | // empty() <=> frame shift 157 | 158 | size_t start_ref {0}; 159 | size_t stop_ref {0}; 160 | size_t start_target {0}; 161 | double neighborhoodMismatch {0.0}; 162 | // 0..1 163 | 164 | VectorPtr mutations; 165 | // !nullptr 166 | // Matching AmrMutation's 167 | 168 | Disruption disr; 169 | 170 | const SeqChange* replacement {nullptr}; 171 | // !nullptr => *this is replaced by *replacement 172 | 173 | 174 | SeqChange () = default; 175 | explicit SeqChange (const Alignment* al_arg/*, 176 | bool fromAllele_arg*/) 177 | : al (al_arg) 178 | //, fromAllele (fromAllele_arg) 179 | {} 180 | SeqChange (const Alignment* al_arg, 181 | const AmrMutation* mutation_arg) 182 | : al (al_arg) 183 | { mutations 
<< checkPtr (mutation_arg); } 184 | SeqChange (const Alignment* al_arg, 185 | const Disruption &disr_arg) 186 | : al (al_arg) 187 | , disr (disr_arg) 188 | {} 189 | void qc () const override; 190 | void saveText (ostream &os) const override 191 | { os << start + 1 192 | << ' ' << len 193 | << ' ' << strQuote (reference) << " -> " << strQuote (allele) 194 | << ' ' << start_ref + 1 << ".." << stop_ref 195 | << ' ' << start_target + 1 196 | << ' ' << neighborhoodMismatch; 197 | if (! disr. empty ()) 198 | disr. saveText (os); 199 | for (const AmrMutation* mutation : mutations) 200 | { os << ' ' ; 201 | mutation->saveText (os); 202 | } 203 | os << endl; 204 | } 205 | bool empty () const override 206 | { return ! len && disr. empty (); } 207 | 208 | 209 | bool hasMutation () const 210 | { return ! empty () && ! mutations. empty () && ! replacement; } 211 | bool hasFrameshift () const 212 | { return hasMutation () && mutations [0] -> frameshift != no_index; } 213 | bool isFrameshift () const 214 | { return reference. 
empty (); } 215 | string getMutationStr () const; 216 | size_t getStop () const 217 | { return start + len; } 218 | bool operator< (const SeqChange &other) const; 219 | bool better (const SeqChange &other) const; 220 | bool finish (const string &refSeq, 221 | size_t flankingLen); 222 | // Return: good match 223 | // Invokes: finishPos() 224 | bool finishPos (size_t flankingLen); 225 | // Return: good match 226 | private: 227 | void setSeq (); 228 | void setStartStopRef (); 229 | void setStartTarget (); 230 | void setNeighborhoodMismatch (size_t flankingLen); 231 | public: 232 | bool matchesMutation (const AmrMutation& mut) const; 233 | }; 234 | 235 | 236 | 237 | struct Alignment : Hsp 238 | { 239 | AmrMutation refMutation; 240 | // !empty() => qseq contains AmrMutation::allele 241 | //int ref_offset {0}; 242 | 243 | Vector seqChanges; 244 | 245 | 246 | Alignment (const string &line, 247 | bool qProt_arg, 248 | bool sProt_arg) 249 | : Hsp (line, qProt_arg, sProt_arg, qProt_arg || sProt_arg /*aProt*/, /*false*/ qProt_arg /*qStopCodon*/, true/*bacterialStartCodon*/) 250 | {} 251 | Alignment () = default; 252 | protected: 253 | void setSeqChanges (const Vector &refMutations, 254 | size_t flankingLen/*, 255 | bool allMutationsP*/); 256 | // Input: flankingLen: valid if > 0 257 | private: 258 | size_t refMutation2refSeq_pos (); 259 | // Return: no_index <=> refMutation is not detected 260 | public: 261 | void qc () const override; 262 | void saveText (ostream &os) const override 263 | { Hsp::saveText (os); 264 | if (! refMutation. empty ()) 265 | os << ' ' << refMutation; 266 | os << " #seqChanges:" << seqChanges. size (); 267 | } 268 | 269 | 270 | bool hasMutation () const 271 | { for (const SeqChange& seqChange : seqChanges) 272 | if (seqChange. hasMutation ()) 273 | return true; 274 | return false; 275 | } 276 | bool hasDeclarativeFrameshift () const 277 | { return seqChanges. size () == 1 && seqChanges [0]. 
hasFrameshift (); } 278 | }; 279 | 280 | 281 | 282 | 283 | } // namespace 284 | 285 | 286 | -------------------------------------------------------------------------------- /gff_check.cpp: -------------------------------------------------------------------------------- 1 | // gff_check.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 
24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Check the correctness of a GFF file 31 | * 32 | */ 33 | 34 | 35 | #undef NDEBUG 36 | 37 | #include "common.hpp" 38 | using namespace Common_sp; 39 | #include "gff.hpp" 40 | using namespace GFF_sp; 41 | 42 | #include "common.inc" 43 | 44 | 45 | 46 | namespace 47 | { 48 | 49 | 50 | const string locus_tagS ("[locus_tag="); 51 | const string prodigal_ID (" ID="); 52 | const string noFile ("emptystring"); 53 | 54 | 55 | 56 | struct ThisApplication final : Application 57 | { 58 | ThisApplication () 59 | : Application ("Check the correctness of a GFF file. Exit with an error if it is incorrect.") 60 | { 61 | // Input 62 | addPositional ("gff", "GFF file, if " + strQuote (noFile) + " then exit 0"); 63 | addKey ("gfftype", "Type of GFF file: " + Gff::names. toString (", "), "genbank"); 64 | addKey ("prot", "Protein FASTA file"); 65 | addKey ("dna", "DNA FASTA file"); 66 | addFlag ("lcl", "Nucleotide FASTA created by PGAP has \"lcl|\" prefix in accessions"); 67 | // Output 68 | addKey ("gff_prot_match", "Output file with pairs: \" \", \n\ 69 | where for genbank: is from " + strQuote (locus_tagS + "") + " in the protein FASTA comment, \n\ 70 | for microscope: is ID: from '>|ID:||', \n\ 71 | and for prodigal: is ID= in the protein FASTA comment\n\ 72 | "); 73 | addKey ("gff_dna_match", "Output file with pairs: \" \", where for pseudomonasdb: is the suffix after '|' in the DNA FASTA identifier"); 74 | version = SVN_REV; 75 | } 76 | 77 | 78 | 79 | void body () const final 80 | { 81 | const string gffName = getArg ("gff"); 82 | const Gff::Type type = Gff::name2type (getArg ("gfftype")); 83 | const string protFName = getArg ("prot"); 84 | const string dnaFName = getArg ("dna"); 85 | const string protMatchFName = getArg ("gff_prot_match"); 86 | const string dnaMatchFName = getArg ("gff_dna_match"); 87 | const bool 
lcl = getFlag ("lcl"); 88 | 89 | if (lcl && type != Gff::pgap) 90 | throw runtime_error ("-lcl requires type pgap"); 91 | 92 | 93 | if (isRight (gffName, noFile)) 94 | return; 95 | 96 | 97 | const Annot annot (gffName, type, ! protMatchFName. empty (), lcl); 98 | 99 | 100 | if (! protFName. empty ()) 101 | { 102 | StringVector gffIds; gffIds. reserve (10000); // PAR 103 | { 104 | OFStream outF; 105 | if (! protMatchFName. empty ()) 106 | outF. open ("", protMatchFName, ""); 107 | StringVector fastaIds; fastaIds. reserve (gffIds. capacity ()); 108 | LineInput f (protFName /*, 100 * 1024, 1*/); 109 | Istringstream iss; 110 | string line_orig; 111 | string fastaId; 112 | while (f. nextLine ()) 113 | { 114 | trimTrailing (f. line); 115 | if (f. line. empty ()) 116 | continue; 117 | if (f. line [0] != '>') 118 | continue; 119 | line_orig = f. line; 120 | iss. reset (f. line. substr (1)); 121 | fastaId. clear (); 122 | iss >> fastaId; 123 | QC_ASSERT (! fastaId. empty ()); 124 | ASSERT (! contains (fastaId, ' ')); 125 | fastaIds << fastaId; 126 | // gffId 127 | string gffId (fastaId); 128 | if (! protMatchFName. empty ()) 129 | switch (type) 130 | { 131 | case Gff::genbank: 132 | { 133 | const size_t pos = f. line. find (locus_tagS); 134 | if (pos == string::npos) 135 | throw runtime_error (__FILE__ ": " + strQuote (locus_tagS) + " is not found in: " + line_orig); 136 | gffId = f. line. substr (pos + locus_tagS. size ()); 137 | const size_t end = gffId. find (']'); 138 | if (end == string::npos) 139 | throw runtime_error (__FILE__ ": ']' is not found after " + strQuote (locus_tagS) + " in: " + line_orig); 140 | gffId. erase (end); 141 | } 142 | break; 143 | case Gff::microscope: 144 | { 145 | string s (std::move (gffId)); 146 | findSplit (s, '|'); 147 | gffId = findSplit (s, '|'); 148 | const string idS ("ID:"); 149 | if (! isLeft (gffId, idS)) 150 | throw runtime_error (__FILE__ ": 'ID:' is not found in: " + line_orig); 151 | gffId. erase (0, idS. 
size ()); 152 | } 153 | break; 154 | case Gff::prodigal: 155 | { 156 | const size_t pos = f. line. find (prodigal_ID); 157 | if (pos == string::npos) 158 | throw runtime_error (__FILE__ ": " + strQuote (prodigal_ID) + " is not found in: " + line_orig); 159 | gffId = f. line. substr (pos + prodigal_ID. size ()); 160 | const size_t end = gffId. find (';'); 161 | if (end == string::npos) 162 | throw runtime_error (__FILE__ ": ';' is not found after " + strQuote (prodigal_ID) + " in: " + line_orig); 163 | gffId. erase (end); 164 | } 165 | break; 166 | default: break; 167 | } 168 | // 169 | if (contains (gffId, ' ')) 170 | throw runtime_error (__FILE__ ": " + strQuote (gffId) + " contains space"); 171 | if (gffId. empty ()) 172 | throw runtime_error (__FILE__ ": No protein identifier in: " + line_orig); 173 | gffIds << gffId; 174 | if (outF. is_open ()) 175 | outF << fastaId << '\t' << gffId << endl; 176 | } 177 | const size_t n = fastaIds. size (); 178 | fastaIds. sort (); 179 | fastaIds. uniq (); 180 | if (fastaIds. size () != n) 181 | throw runtime_error (__FILE__ ": Duplicate FASTA ids"); 182 | gffIds. sort (); 183 | { 184 | const string* s_prev = nullptr; 185 | for (const string& s : gffIds) 186 | { 187 | if (s_prev && *s_prev == s) 188 | throw runtime_error (__FILE__ ": GFF identifier " + strQuote (s) + " is not unique"); 189 | s_prev = & s; 190 | } 191 | } 192 | ASSERT (gffIds. size () == fastaIds. size ()); 193 | } 194 | if (verbose ()) 195 | cout << "# Proteins in GFF: " << annot. prot2loci. size () << endl; 196 | for (const string& seqid : gffIds) 197 | if (! contains (annot. prot2loci, seqid)) 198 | throw runtime_error (__FILE__ ": Protein FASTA id " + strQuote (seqid) + " is not in the GFF file"); 199 | #if 0 200 | for (const auto& it : annot. prot2loci) 201 | if (! gffIds. containsFast (it. first)) 202 | throw runtime_error (__FILE__ ": GFF protein id " + strQuote (it. first) + " is not in the protein FASTA file"); // pseudogene ?? 
203 | #endif 204 | } 205 | 206 | 207 | if (! dnaFName. empty ()) 208 | { 209 | StringVector contigIds; contigIds. reserve (10000); // PAR 210 | StringVector gffIds; gffIds. reserve (10000); // PAR 211 | { 212 | OFStream outF; 213 | if (! dnaMatchFName. empty ()) 214 | outF. open ("", dnaMatchFName, ""); 215 | LineInput f (dnaFName /*, 100 * 1024, 1*/); 216 | Istringstream iss; 217 | string contigId; 218 | while (f. nextLine ()) 219 | { 220 | trimTrailing (f. line); 221 | if (f. line. empty ()) 222 | continue; 223 | if (f. line [0] != '>') 224 | continue; 225 | iss. reset (f. line. substr (1)); 226 | contigId. clear (); 227 | iss >> contigId; 228 | ASSERT (! contains (contigId, ' ')); 229 | // gffId 230 | string gffId (contigId); 231 | if (! dnaMatchFName. empty ()) 232 | switch (type) 233 | { 234 | case Gff::pseudomonasdb: 235 | { 236 | const size_t pos = gffId. rfind ('|'); 237 | if (pos != string::npos) 238 | gffId. erase (0, pos + 1); 239 | } 240 | break; 241 | default: break; 242 | } 243 | // 244 | if (gffId. empty ()) 245 | throw runtime_error (__FILE__ ": No contig identifier in:\n" + f. line); 246 | if (lcl && ! isLeft (gffId, "lcl|")) 247 | throw runtime_error (__FILE__ ": Contig identifier does not start with " + strQuote ("lcl|") + ":\n" + f. line); 248 | gffIds << gffId; 249 | contigIds << contigId; 250 | if (outF. is_open ()) 251 | outF << contigId << '\t' << gffId << endl; 252 | } 253 | } 254 | ASSERT (contigIds. size () == gffIds. size ()); 255 | gffIds. sort (); 256 | { 257 | const string* s_prev = nullptr; 258 | for (const string& s : gffIds) 259 | { 260 | if (s_prev && *s_prev == s) 261 | throw runtime_error (__FILE__ ": DNA GFF identifier " + strQuote (s) + " is not unique"); 262 | s_prev = & s; 263 | } 264 | } 265 | contigIds. 
sort (); 266 | { 267 | const string* s_prev = nullptr; 268 | for (const string& s : contigIds) 269 | { 270 | if (s_prev && *s_prev == s) 271 | throw runtime_error (__FILE__ ": DNA contig identifier " + strQuote (s) + " is not unique"); 272 | s_prev = & s; 273 | } 274 | } 275 | for (const auto& it : annot. prot2loci) 276 | for (const Locus& cds : it. second) 277 | if (! gffIds. contains (cds. contig)) 278 | throw runtime_error (__FILE__ ": GFF contig id " + strQuote (cds. contig) + " is not in the DNA FASTA file"); 279 | } 280 | } 281 | }; 282 | 283 | 284 | 285 | } // namespace 286 | 287 | 288 | 289 | int main (int argc, 290 | const char* argv[]) 291 | { 292 | ThisApplication app; 293 | return app. run (argc, argv); 294 | } 295 | 296 | 297 | 298 | -------------------------------------------------------------------------------- /disruption2genesymbol.cpp: -------------------------------------------------------------------------------- 1 | // disruption2genesymbol.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 
19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Convert Disruption::genesymbol_raw() to a gene symbol 31 | * 32 | */ 33 | 34 | 35 | #undef NDEBUG 36 | 37 | #include "common.hpp" 38 | using namespace Common_sp; 39 | #include "seq.hpp" 40 | using namespace Seq_sp; 41 | 42 | #include "common.inc" 43 | 44 | 45 | 46 | namespace 47 | { 48 | 49 | 50 | constexpr char no_aa {'?'}; 51 | 52 | 53 | 54 | struct SymbolRaw final : Root 55 | { 56 | static constexpr size_t del_size {10}; // PAR 57 | 58 | // Input 59 | string contig; 60 | string prot; 61 | Disruption::Type type {Disruption::eNone}; 62 | // 0-based 63 | size_t qstart {no_index}; 64 | size_t qend {no_index}; 65 | // aa 66 | size_t sstart {no_index}; 67 | size_t send {no_index}; 68 | // bp 69 | // 70 | Strand strand {0}; 71 | bool stop {false}; 72 | string rest; 73 | 74 | // Output 75 | string ref; 76 | string allele; 77 | 78 | 79 | explicit SymbolRaw (const string &line) 80 | { 81 | string s; 82 | { 83 | istringstream iss (line); 84 | iss >> contig >> prot >> s; 85 | QC_ASSERT (! s. empty ()); 86 | ASSERT (! contig. empty ()); 87 | ASSERT (! prot. empty ()); 88 | 89 | constexpr size_t rest_size = 1024; 90 | char rest_ [rest_size]; 91 | iss. 
getline (rest_, rest_size); 92 | rest = rest_; 93 | } 94 | 95 | if (trimSuffix (s, Disruption::stopSuf)) 96 | stop = true; 97 | 98 | // strand 99 | { 100 | const string strandS = rfindSplit (s, '_'); 101 | if (strandS == "0") 102 | strand = -1; 103 | else if (strandS == "1") 104 | strand = 1; 105 | else 106 | throw runtime_error ("Unknown strand: " + strQuote (strandS)); 107 | } 108 | 109 | send = str2 (rfindSplit (s, '_')); 110 | sstart = str2 (rfindSplit (s, '_')); 111 | qend = str2 (rfindSplit (s, '_')); 112 | qstart = str2 (rfindSplit (s, '_')); 113 | QC_ASSERT (qstart <= qend); 114 | QC_ASSERT (sstart <= send); 115 | 116 | type = Disruption::name2type (s); 117 | QC_ASSERT (type != Disruption::eNone); 118 | QC_ASSERT (type != Disruption::eSmooth); 119 | } 120 | void saveText (ostream &os) const final 121 | { 122 | ASSERT (! ref. empty ()); 123 | os << contig // 0 124 | << '\t' << prot // 1 125 | << '\t'; 126 | if (verbose ()) 127 | os << '\t' << Disruption::typeNames [type] 128 | << '\t' << qstart 129 | << '\t' << qend 130 | << '\t' << sstart 131 | << '\t' << send 132 | << '\t' << (int) strand 133 | << '\t' << stop 134 | << '\t' << ref 135 | << '\t' << allele 136 | << '\t'; 137 | ASSERT (! contains (ref, '*')); 138 | string allele_ (allele); 139 | const bool alleleStop = trimSuffix (allele_, "*"); 140 | const size_t allele_size = allele_. size (); // Without stop codon 141 | //QC_IMPLY (type != Disruption::eFrameshift, alleleStop == stop); 142 | QC_IMPLY (stop, alleleStop); 143 | constexpr size_t display_max = 1/*reference aa*/ + 5; // PAR // PD-5395 144 | if (allele_size > display_max) 145 | allele_ = "ins"; 146 | if (alleleStop) 147 | allele_ += terminatorWord; 148 | ASSERT (! contains (allele_, '*')); 149 | // Standard gene symbol 150 | // 2 151 | if (ref. size () > display_max) 152 | os << ref. front () << qstart + 1 153 | << '_' << ref. back () << qstart + ref. 
size (); 154 | else 155 | os << ref << qstart + 1; 156 | switch (type) 157 | { 158 | case Disruption::eFrameshift: 159 | ASSERT (ref. size () == 1) 160 | ASSERT (! allele. empty ()); 161 | if (alleleStop && allele_size == 0) 162 | os << terminatorWord; 163 | else 164 | os << allele [0]; 165 | os << Disruption::typeNames [type]; 166 | if (alleleStop) 167 | os << terminatorWord << allele_size; 168 | break; 169 | case Disruption::eDeletion: // Or replacement 170 | if (allele_. empty ()) 171 | os << Disruption::typeNames [type]; 172 | else 173 | { 174 | os << allele_; 175 | if (allele_size > display_max) 176 | os << allele_size - 1/*reference aa*/; 177 | } 178 | break; 179 | case Disruption::eInsertion: 180 | ASSERT (ref. size () == 1); 181 | ASSERT (! allele_. empty ()); 182 | os << allele_; 183 | if (allele_size > display_max) 184 | os << allele_size - 1/*reference aa*/; 185 | break; 186 | default: 187 | break; 188 | } 189 | // 3 190 | os << '\t' 191 | // = 192 | // Opposite to SymbolRaw::SymbolRaw(line) 193 | << Disruption::typeNames [type] << '_' << qstart << '_' << qend << '_' << sstart << '_' << send << '_' << (strand == 1 ? 1 : 0); 194 | if (stop) 195 | os << Disruption::stopSuf; 196 | // 197 | os << '\t' << rest // 4 198 | << '\n'; 199 | } 200 | 201 | 202 | char contig2aa (const Dna &dna, 203 | size_t offset, 204 | Gencode gencode) const 205 | // Input: offset: from sstart/send 206 | // Return: no_aa <=> offset is outside dna 207 | { 208 | QC_ASSERT (send <= dna. seq. size ()); 209 | 210 | if (strand == 1) 211 | { 212 | const size_t i = sstart + offset * 3; 213 | if (i + 3 > send) 214 | return no_aa; 215 | return codon2aa (& dna. seq [i], gencode, false); 216 | } 217 | 218 | ASSERT (strand == -1); 219 | if (send < (offset + 1) * 3) 220 | return no_aa; 221 | const size_t i = send - (offset + 1) * 3; 222 | ASSERT (i + 3 <= dna. seq. size ()); 223 | if (i < sstart) 224 | return no_aa; 225 | string s (dna. seq. 
substr (i, 3)); 226 | reverseDna (s); 227 | return codon2aa (s. c_str (), gencode, false); 228 | } 229 | }; 230 | 231 | 232 | 233 | struct ThisApplication final : Application 234 | { 235 | static constexpr char id_delim {'|'}; 236 | 237 | 238 | ThisApplication () 239 | : Application ("Convert Disruption::genesymbol_raw() to standard gene symbols according to https://hgvs-nomenclature.org/stable/recommendations/protein/frameshift/.\n\ 240 | A stop codon is '" + string (terminatorWord) + "'.\n\ 241 | Print: where is inserted before " 242 | ) 243 | { 244 | addPositional ("nucl", "Input nucleotide FASTA file"); 245 | addPositional ("prot", "Input protein FASTA file"); 246 | addPositional ("tab", "Table with lines: > > "); 247 | addKey ("gencode", "NCBI genetic code for translated BLAST", "11"); 248 | addKey ("prot_id_pos", string ("Position of protein id in qseqid delimited by ") + id_delim + ", 1-based. 0 - use qseqid as a whole", "0"); 249 | } 250 | 251 | 252 | 253 | void body () const final 254 | { 255 | const string nuclFName = getArg ("nucl"); 256 | const string protFName = getArg ("prot"); 257 | const string tabFName = getArg ("tab"); 258 | const Gencode gencode = (Gencode) arg2uint ("gencode"); 259 | const size_t prot_id_pos = str2 (getArg ("prot_id_pos")); 260 | 261 | 262 | Vector symbolRaws; 263 | { 264 | LineInput f (tabFName); 265 | while (f. nextLine ()) 266 | symbolRaws << std::move (SymbolRaw (f. line)); 267 | } 268 | if (symbolRaws. empty ()) 269 | return; 270 | 271 | // SymbolRaw::allele 272 | { 273 | Multifasta fa (nuclFName, false); 274 | while (fa. next ()) 275 | { 276 | const Dna dna (fa, 100000/*PAR*/, true); 277 | dna. qc (); 278 | const string id (dna. getId ()); 279 | for (SymbolRaw& symbolRaw : symbolRaws) 280 | if (symbolRaw. contig == id) 281 | for (size_t offset = 0; ; offset++) 282 | { 283 | const char aa = symbolRaw. contig2aa (dna, offset, gencode); 284 | if (aa == no_aa) 285 | break; 286 | symbolRaw. 
allele += aa; 287 | if (aa == '*') 288 | break; 289 | } 290 | } 291 | } 292 | 293 | // SymbolRaw::{ref, allele for "del"} 294 | { 295 | Multifasta fa (protFName, true); 296 | while (fa. next ()) 297 | { 298 | const Peptide pep (fa, 1000/*PAR*/, true); 299 | pep. qc (); 300 | 301 | string id; 302 | const string id_whole (pep. getId ()); 303 | if (prot_id_pos) 304 | { 305 | const StringVector vec (id_whole, id_delim, true); 306 | if (prot_id_pos - 1 >= vec. size ()) 307 | throw runtime_error ("Protein identifier position " + to_string (prot_id_pos) + " is outside of the list of identifiers: " + strQuote (id_whole)); 308 | id = vec [prot_id_pos - 1]; 309 | } 310 | else 311 | id = id_whole; 312 | 313 | for (SymbolRaw& symbolRaw : symbolRaws) 314 | if (symbolRaw. prot == id) 315 | symbolRaw. ref = pep. seq. substr (symbolRaw. qstart, symbolRaw. qend - symbolRaw. qstart); 316 | } 317 | } 318 | 319 | // symbolRaw's 320 | for (const SymbolRaw& symbolRaw : symbolRaws) 321 | symbolRaw. saveText (cout); 322 | } 323 | }; 324 | 325 | 326 | } // namespace 327 | 328 | 329 | 330 | 331 | int main (int argc, 332 | const char* argv[]) 333 | { 334 | ThisApplication app; 335 | return app. run (argc, argv); 336 | } 337 | 338 | 339 | 340 | -------------------------------------------------------------------------------- /tsv.hpp: -------------------------------------------------------------------------------- 1 | // tsv.hpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 
13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * TSV-table 31 | * 32 | */ 33 | 34 | 35 | #ifndef TSV_HPP 36 | #define TSV_HPP 37 | 38 | 39 | #include "common.hpp" 40 | using namespace Common_sp; 41 | 42 | 43 | 44 | namespace Common_sp 45 | { 46 | 47 | 48 | 49 | struct Date : Root 50 | { 51 | enum Format {fmt_Year, fmt_YMD, fmt_None}; // not complete list ?? 52 | short year {0}; 53 | char month {0}; 54 | // 0 .. 12 - 1 55 | char day {0}; 56 | // 0 .. days[month] - 1 // leap year ?? 57 | 58 | 59 | Date () = default; 60 | explicit Date (short year_arg, 61 | char month_arg = 0, 62 | char day_arg = 0) 63 | : year (year_arg) 64 | , month (month_arg) 65 | , day (day_arg) 66 | {} 67 | static bool isYear (short n) 68 | { return n > 1000 && n < 2500; } // PAR 69 | static bool isMonth (short n) 70 | { return between (n, 0, 12); } 71 | static bool isDay (short n) 72 | { return between (n, 0, 31); } // Must depend on month ?? 73 | static Date parse (const string &s, 74 | Format fmt); 75 | // Return: !empty() <=> success 76 | bool empty () const final 77 | { return ! year 78 | && ! month 79 | && ! 
day; 80 | } 81 | void saveText (ostream &os) const final 82 | { os << std::setfill('0') << std::setw(4) << year << '-' 83 | << std::setfill('0') << std::setw(2) << (int) month + 1 << '-' 84 | << std::setfill('0') << std::setw(2) << (int) day + 1; 85 | } 86 | JsonMap* toJson (JsonContainer* parent, 87 | const string& name = noString) const override 88 | { auto j = new JsonMap (parent, name); 89 | new JsonInt (year, j, "year"); 90 | new JsonInt (month, j, "month"); 91 | new JsonInt (day, j, "day"); 92 | return j; 93 | } 94 | 95 | 96 | bool operator== (const Date &other) const 97 | { return year == other. year 98 | && month == other. month 99 | && day == other. day; 100 | } 101 | bool less (const Date &other, 102 | bool equal) const; 103 | bool operator<= (const Date &other) const 104 | { return less (other, true); } 105 | bool operator< (const Date &other) const 106 | { return less (other, false); } 107 | Date operator- (const Date &other) const; 108 | // Requires: other <= *this 109 | bool year_divisible () const 110 | { return ! month && ! day; } 111 | bool quarter_divisible () const 112 | { return ! (month % 3) && ! day; } 113 | bool month_divisible () const 114 | { return ! 
day; } 115 | }; 116 | 117 | 118 | 119 | struct TextTable : Named 120 | // Tab-separated value (tsv) table with a header 121 | // name: file name or empty() 122 | { 123 | bool pound {false}; 124 | // '#' in the beginning of header 125 | bool saveHeader {true}; 126 | 127 | 128 | struct Header : Named 129 | { 130 | size_t len_max {0}; 131 | // For trim()'ed fields 132 | // Type 133 | bool numeric {true}; 134 | // Valid if numeric 135 | bool scientific {false}; 136 | streamsize decimals {0}; 137 | bool null {false}; 138 | // = can be empty() 139 | static constexpr size_t choices_max {7}; // PAR 140 | Set choices; 141 | // size() <= choices_max + 1 142 | 143 | Header () = default; 144 | explicit Header (const string &name_arg) 145 | : Named (name_arg) 146 | {} 147 | void qc () const override; 148 | void saveText (ostream& os) const override 149 | { os << name 150 | << '\t' << len_max 151 | << '\t' << (numeric ? ((scientific ? "float" : "int") + string ("(") + to_string (decimals) + ")") : "char") 152 | << '\t' << (null ? "null" : "not null"); 153 | } 154 | 155 | void saveSql (ostream& os) const; 156 | }; 157 | Vector
header; 158 | // Header::name's are unique 159 | // size() = number of columns 160 | 161 | 162 | Vector rows; 163 | // StringVector::size() = header.size() 164 | // Values are trim()'ed 165 | typedef size_t ColNum; 166 | // no_index <=> no column 167 | typedef size_t RowNum; 168 | // no_index <=> no row 169 | static constexpr char aggr_sep {','}; // PAR 170 | 171 | 172 | struct Error : runtime_error 173 | { 174 | Error (const TextTable &tab, 175 | const string &what) 176 | : runtime_error (what + "\nIn table file: " + tab. name) 177 | {} 178 | }; 179 | 180 | 181 | explicit TextTable (const string &tableFName, 182 | const string &columnSynonymsFName = noString, 183 | bool headerP = true, 184 | uint displayPeriod = 0); 185 | // Input: tableFName: format: [{'#' }* '#']
{ >}* 186 | // empty lines are skipped 187 | // columnSynonymsFName: 188 | // Rows where number of columns < header size are added empty values 189 | static constexpr const char* syn_format {"Column synonyms file with the format: {
{ }* {|}}*"}; 190 | TextTable () = default; 191 | TextTable (bool pound_arg, 192 | const Vector
&header_arg) 193 | : pound (pound_arg) 194 | , header (header_arg) 195 | {} 196 | private: 197 | void setHeader (); 198 | public: 199 | static Vector
str2header (const string &s, 200 | char sep = ',') 201 | { const StringVector vec (s, sep, true); 202 | Vector
header; header. reserve (vec. size ()); 203 | for (const string& name : vec) 204 | header << std::move (Header (name)); 205 | return header; 206 | } 207 | void qc () const override; 208 | void saveText (ostream &os) const override; 209 | 210 | 211 | void printHeader (ostream &os) const; 212 | ColNum col2num_ (const string &columnName) const; 213 | // Return: no_index <=> no columnName 214 | ColNum col2num (const string &columnName) const 215 | { const ColNum i = col2num_ (columnName); 216 | if (i == no_index) 217 | throw Error (*this, "Table has no column " + strQuote (columnName)); 218 | return i; 219 | } 220 | Vector columns2nums (const StringVector &columns) const 221 | { Vector nums; nums. reserve (columns. size ()); 222 | for (const string &s : columns) 223 | nums << col2num (s); 224 | return nums; 225 | } 226 | bool hasColumn (const string &columnName) const 227 | { return col2num_ (columnName) != no_index; } 228 | void duplicateColumn (const string &columnName_from, 229 | const string &columnName_to); 230 | void substitueColumn (string &columnName_from, 231 | const string &columnName_to) 232 | { duplicateColumn (columnName_from, columnName_to); 233 | columnName_from = columnName_to; 234 | } 235 | ColNum findDate (Date::Format &fmt) const; 236 | // Date column is not empty and has the same format fmt in all rows 237 | // Return: no_index <=> not found 238 | // Output: fmt, valid if return != no_index 239 | bool isKey (ColNum colNum) const; 240 | private: 241 | int compare (const StringVector& row1, 242 | const StringVector& row2, 243 | ColNum column) const; 244 | public: 245 | void filterColumns (const StringVector &newColumnNames); 246 | // Input: newColumnNames: in header::name's 247 | // can be repeated 248 | // ordered 249 | void sort (const StringVector &by); 250 | void deredundify (const StringVector &equivCols, 251 | const CompareInt& equivBetter); 252 | // Input: equivBetter(row1,row2) = 1 <=> row1 is better than row2 253 | // Requires: row1 and row2 
are in the class of equivCols-equivalent rows 254 | // Sorts by equivCols 255 | void group (const StringVector &by, 256 | const StringVector &sum, 257 | const StringVector &minV, 258 | const StringVector &maxV, 259 | const StringVector &aggr); 260 | // aggr: slow 261 | // Invokes: sort(by), filterColumns(by + sum + aggr) 262 | private: 263 | void merge (RowNum toRowNum, 264 | RowNum fromRowNum, 265 | const Vector &sum, 266 | const Vector &minV, 267 | const Vector &maxV, 268 | const Vector &aggr); 269 | public: 270 | static StringVector aggr2values (const string &aggr) 271 | { StringVector v (aggr, aggr_sep, true); 272 | v. sort (); 273 | v. uniq (); 274 | return v; 275 | } 276 | void colNumsRow2values (const Vector &colNums, 277 | RowNum row_num, 278 | StringVector &values) const; 279 | // Output: values 280 | RowNum find (const Vector &colNums, 281 | const StringVector &targetValues, 282 | RowNum rowNum_start) const; 283 | StringVector col2values (ColNum col) const; 284 | 285 | 286 | struct Key 287 | { 288 | const Vector colNums; 289 | unordered_map data; 290 | 291 | Key (const TextTable &tab, 292 | const StringVector &columns); 293 | 294 | RowNum find (const StringVector &values) const 295 | { const auto& it = data. find (values); 296 | if (it != data. end ()) 297 | return it->second; 298 | return no_index; 299 | } 300 | }; 301 | 302 | 303 | struct Index 304 | { 305 | const Vector colNums; 306 | unordered_map,StringVector::Hasher> data; 307 | 308 | Index (const TextTable &tab, 309 | const StringVector &columns); 310 | 311 | const Vector* find (const StringVector &values) const 312 | { const auto& it = data. find (values); 313 | if (it == data. 
end ()) 314 | return nullptr; 315 | return & it->second; 316 | } 317 | }; 318 | }; 319 | 320 | 321 | 322 | struct TsvOut 323 | { 324 | private: 325 | ostream* os {nullptr}; 326 | unique_ptr on; 327 | size_t lines {0}; 328 | size_t fields_max {0}; 329 | size_t fields {0}; 330 | public: 331 | bool usePound {true}; 332 | 333 | 334 | explicit TsvOut (ostream* os_arg, 335 | streamsize precision = 6, 336 | bool scientific = false) 337 | : os (os_arg) 338 | , on (os_arg ? new ONumber (*os_arg, precision, scientific) : nullptr) 339 | {} 340 | // !os_arg <=> disabled 341 | explicit TsvOut (ostream &os_arg, 342 | streamsize precision = 6, 343 | bool scientific = false) 344 | : TsvOut (& os_arg, precision, scientific) 345 | {} 346 | ~TsvOut () 347 | { if (fields) 348 | errorExitStr ("TsvOut: unfinished line with " + to_string (fields) + " fields"); 349 | } 350 | 351 | 352 | bool live () const 353 | { return os; } 354 | bool empty () const 355 | { return ! lines 356 | && ! fields_max 357 | && ! fields; 358 | } 359 | template 360 | TsvOut& operator<< (const T &field) 361 | { if (os) 362 | { if (lines && fields >= fields_max) 363 | throw runtime_error ("TsvOut: fields_max = " + to_string (fields_max)); 364 | if (fields) 365 | *os << '\t'; 366 | else if (! lines && usePound) 367 | *os << '#'; 368 | *os << field; 369 | fields++; 370 | } 371 | return *this; 372 | } 373 | void newLn () 374 | { if (! os) 375 | return; 376 | *os << endl; 377 | if (! 
lines) 378 | fields_max = fields; 379 | lines++; 380 | if (fields != fields_max) 381 | throw runtime_error ("TsvOut: fields_max = " + to_string (fields_max) + ", but fields = " + to_string (fields)); 382 | fields = 0; 383 | } 384 | }; 385 | 386 | 387 | 388 | } 389 | 390 | 391 | 392 | #endif 393 | 394 | -------------------------------------------------------------------------------- /amrfinder_update.cpp: -------------------------------------------------------------------------------- 1 | // amrfinder_update.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 
24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Updating of AMRFinder data 31 | * 32 | * Dependencies: libcurl 33 | 34 | * Release changes: see amrfinder.cpp 35 | * 36 | */ 37 | 38 | 39 | 40 | // PAR 41 | #undef TEST_UPDATE 42 | #define HTTPS 1 // 0: FTP 43 | 44 | 45 | 46 | #undef NDEBUG 47 | 48 | #include "common.hpp" 49 | using namespace Common_sp; 50 | #include "curl_easy.hpp" 51 | using namespace CURL_sp; 52 | 53 | #include "common.inc" 54 | 55 | 56 | 57 | namespace 58 | { 59 | 60 | 61 | // URL 62 | #ifdef TEST_UPDATE 63 | #define URL "https://ftp.ncbi.nlm.nih.gov/pathogen/Technical/AMRFinder_technical/test_database/" 64 | #else 65 | #define URL_SUF "://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/" 66 | #if HTTPS 67 | #define URL "https" URL_SUF 68 | #else 69 | #define URL "ftp" URL_SUF 70 | #endif 71 | #endif 72 | 73 | 74 | 75 | string getLatestMinor (Curl &curl) 76 | // Return: empty() <=> failure 77 | { 78 | StringVector dir (curl. read (URL), '\n', true); 79 | if (verbose ()) 80 | { 81 | save (cout, dir, '\n'); 82 | cout << endl; 83 | } 84 | 85 | Vector vers; 86 | for (string& line : dir) 87 | #if HTTPS 88 | if (isLeft (line, "' after '' in " + string (URL)); 99 | line. erase (pos2); 100 | 101 | istringstream iss (line); 102 | SoftwareVersion ver (iss, true); 103 | vers << std::move (ver); 104 | } 105 | catch (...) {} 106 | #else 107 | if (! contains (line, " -> ")) 108 | { 109 | trimTrailing (line); 110 | const size_t pos = line. rfind (' '); 111 | if (pos != string::npos) 112 | { 113 | istringstream iss (line. substr (pos + 1)); 114 | try 115 | { 116 | SoftwareVersion ver (iss, true); 117 | vers << std::move (ver); 118 | } 119 | catch (...) {} 120 | } 121 | } 122 | #endif 123 | if (vers. empty ()) 124 | return noString; 125 | 126 | vers. sort (); 127 | return vers. back (). 
getMinor (); 128 | } 129 | 130 | 131 | 132 | string getLatestDataVersion (Curl &curl, 133 | const string &minor) 134 | // Return: empty() <=> failure 135 | { 136 | const string url (URL + minor + "/"); 137 | StringVector dir (curl. read (url), '\n', true); 138 | if (verbose ()) 139 | { 140 | save (cout, dir, '\n'); 141 | cout << endl; 142 | } 143 | 144 | Vector dataVersions; 145 | for (string& line : dir) 146 | #if HTTPS 147 | if (isLeft (line, "' after '' in " + url); 158 | line. erase (pos2); 159 | 160 | istringstream iss (line); 161 | DataVersion dv (iss); 162 | dataVersions << std::move (dv); 163 | } 164 | catch (...) {} 165 | #else 166 | if (! contains (line, " -> ")) 167 | { 168 | trimTrailing (line); 169 | const size_t pos = line. rfind (' '); 170 | if (pos != string::npos) 171 | { 172 | istringstream iss (line. substr (pos + 1)); 173 | try 174 | { 175 | DataVersion dv (iss); 176 | dataVersions << std::move (dv); 177 | } 178 | catch (...) {} 179 | } 180 | } 181 | #endif 182 | if (dataVersions. empty ()) 183 | return noString; 184 | 185 | dataVersions. sort (); 186 | return dataVersions. back (). str (); 187 | } 188 | 189 | 190 | 191 | void fetchAMRFile (Curl &curl, 192 | const string &urlDir, 193 | const string &localDir, 194 | const string &fName) 195 | { 196 | ASSERT (isDirName (urlDir)); 197 | ASSERT (isDirName (localDir)); 198 | ASSERT (! fName. empty ()); 199 | curl. 
download (urlDir + fName, localDir + fName); 200 | } 201 | 202 | 203 | 204 | 205 | // ThisApplication 206 | 207 | struct ThisApplication final : ShellApplication 208 | { 209 | string curMinor; 210 | 211 | 212 | ThisApplication () 213 | : ShellApplication ("Update the database for AMRFinder from " URL "\n\ 214 | Requirement: the database directory contains subdirectories named by database versions.\ 215 | ", false, false, true, true) 216 | { 217 | addKey ("database", "Directory for all versions of AMRFinder databases", "$BASE/data", 'd', "DATABASE_DIR"); 218 | addKey ("blast_bin", "Directory for BLAST", "", '\0', "BLAST_DIR"); 219 | addKey ("hmmer_bin", "Directory for HMMer", "", '\0', "HMMER_DIR"); 220 | addFlag ("force_update", "Force updating the AMRFinder database"); // PD-3469 221 | version = SVN_REV; 222 | 223 | // curMinor 224 | { 225 | istringstream versionIss (version); 226 | const SoftwareVersion softwareVersion (versionIss); 227 | curMinor = softwareVersion. getMinor (); 228 | } 229 | } 230 | 231 | 232 | 233 | void createLatestLink (const string &mainDirS, 234 | const string &latestDir) const 235 | { 236 | ASSERT (! mainDirS. empty ()); 237 | ASSERT (! latestDir. empty ()); 238 | const string latestLink (mainDirS + "latest"); 239 | ::remove (latestLink. c_str ()); 240 | setSymlink (latestDir, latestLink, false); 241 | } 242 | 243 | 244 | 245 | void shellBody () const final 246 | { 247 | const string mainDirOrig = getArg ("database"); 248 | string blast_bin = getArg ("blast_bin"); 249 | string hmmer_bin = getArg ("hmmer_bin"); 250 | const bool force_update = getFlag ("force_update"); 251 | 252 | addDirSlash (blast_bin); 253 | addDirSlash (hmmer_bin); 254 | 255 | 256 | const Verbose vrb (qc_on); 257 | 258 | 259 | Curl curl; 260 | 261 | 262 | const bool screen = ! 
isRedirected (cerr); 263 | 264 | // FTP site files 265 | stderr << "Looking up the published databases at " << colorizeUrl (URL, screen) << '\n'; 266 | string load_minor = curMinor; 267 | string load_data_version; 268 | { 269 | const string published_minor (getLatestMinor (curl)); 270 | if (published_minor. empty ()) 271 | throw runtime_error ("Cannot get the software minor version of the latest published database version"); 272 | //if (qc_on) 273 | //stderr << "Latest published software minor version: " << published_minor << "\n"; 274 | // ASSERT: published_minor >= curMinor 275 | 276 | const string published_data_version (getLatestDataVersion (curl, published_minor)); 277 | if (published_data_version. empty ()) 278 | throw runtime_error ("Cannot get the latest published database version for the software minor version " + published_minor); 279 | 280 | const string cur_data_version (getLatestDataVersion (curl, curMinor)); 281 | load_data_version = cur_data_version; 282 | if (cur_data_version. empty ()) // Contents of (URL + curMinor) are empty ?? 
283 | { 284 | stderr << "\n"; 285 | const Warning w (stderr); 286 | stderr << "Cannot get the latest published database version for the current software minor version " + curMinor + ".\n" 287 | << "The latest published database version " + published_data_version + " for the latest published software minor version " + published_minor + " will be used instead"; 288 | load_minor = published_minor; 289 | load_data_version = published_data_version; 290 | } 291 | else if (cur_data_version != published_data_version) 292 | { 293 | ASSERT (cur_data_version < published_data_version); // NOTE(review): plain string comparison; presumably relies on data version strings sorting lexicographically - confirm 294 | stderr << "\n"; 295 | const Warning w (stderr); 296 | stderr << "A newer version of the database exists (" << published_data_version << "), but it requires " 297 | "a newer version of the software (" << published_minor << ") to install.\n" 298 | "See " + colorizeUrl ("https://github.com/ncbi/amr/wiki/Upgrading", screen) + " for more information.\n"; 299 | } 300 | } 301 | ASSERT (! load_data_version. empty ()); 302 | 303 | 304 | // User's files 305 | string mainDirS; 306 | { 307 | const Dir mainDir (mainDirOrig); 308 | mainDirS = mainDir. get (); 309 | } 310 | addDirSlash (mainDirS); 311 | 312 | const string versionFName ("version.txt"); 313 | const string urlDir (URL + load_minor + "/" + load_data_version + "/"); 314 | const string latestDir (mainDirS + load_data_version + "/"); 315 | 316 | stderr << "Looking for the target directory: " << colorizeDir (latestDir, screen) << "\n"; 317 | if (directoryExists (latestDir)) 318 | { 319 | if (force_update) 320 | stderr << colorizeDir (latestDir, screen) << ": already exists, overwriting what was there\n"; 321 | else 322 | { 323 | curl. download (urlDir + versionFName, tmp + "/curl"); // Published version.txt, compared below with the local copy 324 | const StringVector version_old (latestDir + versionFName, (size_t) 100, true); 325 | const StringVector version_new (tmp + "/curl", (size_t) 100, true); 326 | if ( ! version_old. empty () 327 | && ! version_new. empty () 328 | && version_old. 
front () == version_new. front () 329 | ) 330 | { 331 | const Warning w (stderr); 332 | stderr << colorizeDir (latestDir, screen) << ": contains the latest version " << version_old. front () << '\n'; 333 | stderr << "Skipping update\nUse amrfinder --force_update to overwrite the existing database"; 334 | createLatestLink (mainDirS, /*latestDir*/ load_data_version); 335 | return; 336 | } 337 | } 338 | } 339 | else 340 | Dir (latestDir). create (); 341 | 342 | stderr << "Downloading AMRFinder database version " << load_data_version << " into: " << colorizeDir (latestDir, screen) << "\n"; 343 | // Requires: Software version >= 3.13.1 344 | fetchAMRFile (curl, urlDir, latestDir, "AMR.LIB"); 345 | fetchAMRFile (curl, urlDir, latestDir, "AMRProt.fa"); 346 | fetchAMRFile (curl, urlDir, latestDir, "AMRProt-mutation.tsv"); 347 | fetchAMRFile (curl, urlDir, latestDir, "AMRProt-suppress.tsv"); 348 | fetchAMRFile (curl, urlDir, latestDir, "AMRProt-susceptible.fa"); 349 | fetchAMRFile (curl, urlDir, latestDir, "AMRProt-susceptible.tsv"); 350 | fetchAMRFile (curl, urlDir, latestDir, "AMR_CDS.fa"); 351 | fetchAMRFile (curl, urlDir, latestDir, "database_format_version.txt"); // PD-3051 352 | fetchAMRFile (curl, urlDir, latestDir, "fam.tsv"); 353 | fetchAMRFile (curl, urlDir, latestDir, "taxgroup.tsv"); 354 | fetchAMRFile (curl, urlDir, latestDir, versionFName); 355 | 356 | StringVector dnaPointMuts; 357 | { 358 | LineInput f (latestDir + "taxgroup.tsv"); 359 | while (f. nextLine ()) 360 | { 361 | if (isLeft (f. line, "#")) 362 | continue; 363 | string taxgroup, gpipe; 364 | int n = -1; 365 | istringstream iss (f. 
line); 366 | iss >> taxgroup >> gpipe >> n; 367 | if (n < 0) 368 | throw runtime_error ("Bad " + latestDir + "taxgroup.tsv"); 369 | if (n) 370 | dnaPointMuts << taxgroup; 371 | } 372 | } 373 | 374 | for (const string& dnaPointMut : dnaPointMuts) 375 | { 376 | fetchAMRFile (curl, urlDir, latestDir, "AMR_DNA-" + dnaPointMut + ".fa"); 377 | fetchAMRFile (curl, urlDir, latestDir, "AMR_DNA-" + dnaPointMut + ".tsv"); 378 | } 379 | 380 | fetchAMRFile (curl, urlDir, latestDir, "changes.txt"); 381 | 382 | createLatestLink (mainDirS, load_data_version); 383 | 384 | 385 | prog2dir ["amrfinder_index"] = execDir; 386 | exec (fullProg ("amrfinder_index") + shellQuote (latestDir) 387 | + makeKey ("blast_bin", blast_bin) 388 | + makeKey ("hmmer_bin", hmmer_bin) 389 | + ifS (getQuiet (), " -q") + ifS (qc_on, " --debug") + " > " + tmp + "/amrfinder_index.err", tmp + "/amrfinder_index.err"); 390 | } 391 | }; 392 | 393 | 394 | 395 | } // namespace 396 | 397 | 398 | 399 | int main (int argc, 400 | const char* argv[]) 401 | { 402 | ThisApplication app; 403 | return app. run (argc, argv); 404 | } 405 | 406 | 407 | 408 | -------------------------------------------------------------------------------- /dna_mutation.cpp: -------------------------------------------------------------------------------- 1 | // dna_mutation.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 
14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * Identification of mutations at DNA level 31 | * 32 | * Release changes: see amrfinder.cpp 33 | * 34 | */ 35 | 36 | 37 | #undef NDEBUG 38 | 39 | #include "common.hpp" 40 | #include "tsv.hpp" 41 | using namespace Common_sp; 42 | #include "alignment.hpp" 43 | using namespace Alignment_sp; 44 | #include "columns.hpp" 45 | 46 | #include "common.inc" 47 | 48 | 49 | 50 | namespace 51 | { 52 | 53 | map > accession2mutations; 54 | string input_name; 55 | bool print_node = false; 56 | 57 | 58 | 59 | struct BlastnAlignment final : Alignment 60 | { 61 | // PD-2001 62 | static constexpr const size_t flankingLen = 200; // PAR 63 | string organism; 64 | string refAccessionFrag; 65 | string product; 66 | string gene; 67 | 68 | 69 | BlastnAlignment (const string &line, 70 | const string &organism_arg) 71 | : Alignment (line, false, false) 72 | , organism (organism_arg) 73 | { 74 | // To match AmrMutation 75 | strUpper (qseq); 76 | strUpper (sseq); 77 | 78 | replace (organism, '_', ' '); 79 | try 80 | { 81 | // qseqid = NC_022347.1@23S_ribosomal_RNA@23S@-159:1040292-1037381 82 | // accesion_version@gene_name@gene_symbol@offset:start-stop 83 | ASSERT (! qseqid. 
empty ()); 84 | // PD-3419 85 | { 86 | string s (qseqid); 87 | refAccessionFrag = findSplit (s, '@'); 88 | product = findSplit (s, '@'); 89 | gene = findSplit (s, ':'); 90 | //ref_offset = stoi (findSplit (s, ':')); 91 | refAccessionFrag += ":" + s; 92 | } 93 | replace (product, '_', ' '); 94 | if (const Vector* refMutations = findPtr (accession2mutations, qseqid)) 95 | setSeqChanges (*refMutations, flankingLen); 96 | } 97 | catch (const exception &e) 98 | { 99 | throw runtime_error (line + "\n" + e. what ());; 100 | } 101 | } 102 | void qc () const override 103 | { if (! qc_on) 104 | return; 105 | Alignment::qc (); 106 | QC_ASSERT (! aProt); 107 | QC_ASSERT (! refAccessionFrag. empty ()); 108 | QC_ASSERT (! product. empty ()); 109 | QC_ASSERT (! gene. empty ()); 110 | QC_ASSERT (! organism. empty ()); 111 | } 112 | void report (TsvOut& td, 113 | bool mutationAll) const 114 | { for (const SeqChange& seqChange : seqChanges) 115 | { 116 | VectorPtr mutations (seqChange. mutations); 117 | if (mutations. empty ()) 118 | mutations << nullptr; 119 | for (const AmrMutation* mutation : mutations) 120 | { 121 | { 122 | bool skip = true; 123 | if (mutationAll) 124 | skip = false; 125 | if (! seqChange. empty () && mutation && ! seqChange. replacement) // resistant mutation 126 | skip = false; 127 | if (skip) 128 | continue; 129 | } 130 | ASSERT (! (seqChange. empty () && ! mutation)); 131 | if (! input_name. empty ()) 132 | td << input_name;; 133 | td << na // PD-2534 134 | << nvl (sseqid, na) 135 | << (empty () ? 0 : sInt. start + 1) 136 | << (empty () ? 0 : sInt. stop) 137 | << (empty () ? string (na) : string (1, strand2char (sInt. strand))) 138 | << (mutation 139 | ? seqChange. empty () 140 | ? mutation->wildtype () 141 | : mutation->geneMutation 142 | : gene + "_" + seqChange. getMutationStr () 143 | ) 144 | << (mutation 145 | ? seqChange. empty () 146 | ? 
organism + " " + product + " [WILDTYPE]" 147 | : mutation->name 148 | : organism + " " + product + " [UNKNOWN]" 149 | ) 150 | << "core" // PD-2825 151 | // PD-1856 152 | << "AMR" 153 | << "POINT" 154 | << (mutation ? nvl (mutation->classS, na) : na) 155 | << (mutation ? nvl (mutation->subclass, na) : na); 156 | if (empty ()) 157 | td << na 158 | << na 159 | << na 160 | << na 161 | << na 162 | << na 163 | << na 164 | << na; 165 | else 166 | td << "POINTN" // PD-2088 167 | << sInt. len () // was: targetLen // PD-3796 168 | << qlen 169 | << qRelCoverage () * 100 170 | << relIdentity () * 100 171 | << sseq. size () 172 | << refAccessionFrag // qseqid 173 | << product; // pm.gene 174 | // HMM 175 | td << na 176 | << na; 177 | if (print_node) 178 | td << na; 179 | td. newLn (); 180 | #if 0 181 | if (! seqChange. empty () && mutation && ! seqChange. replacement) // resistant mutation 182 | os << td. str () << endl; 183 | if (mutation_all. get ()) 184 | *mutation_all << td. str () << endl; 185 | #endif 186 | } 187 | } 188 | } 189 | 190 | 191 | bool good () const 192 | { return sseq. size () >= min (qlen, 2 * flankingLen + 1); } 193 | #if 0 194 | bool operator< (const BlastnAlignment &other) const 195 | { LESS_PART (*this, other, sseqid); 196 | LESS_PART (other, *this, relIdentity ()); 197 | LESS_PART (*this, other, sstart); 198 | LESS_PART (*this, other, qseqid); 199 | return false; 200 | } 201 | #endif 202 | }; 203 | 204 | 205 | 206 | 207 | struct Batch 208 | { 209 | VectorOwn blastAls; 210 | 211 | 212 | explicit Batch (const string &mutation_tab) 213 | { 214 | { 215 | LineInput f (mutation_tab); 216 | Istringstream iss; 217 | while (f. nextLine ()) 218 | { 219 | if (isLeft (f. line, "#")) 220 | continue; 221 | iss. reset (f. 
line); 222 | string accession, geneMutation_std, geneMutation_report, classS, subclass, name; 223 | int pos = 0; // Must be initialized: if the line has too few fields, >> leaves pos untouched and the check below would read an indeterminate value 224 | iss >> accession >> pos >> geneMutation_std >> geneMutation_report >> classS >> subclass >> name; 225 | QC_ASSERT (pos > 0); // Also rejects lines where pos could not be parsed (pos stays 0) 226 | QC_ASSERT (! name. empty ()); 227 | accession2mutations [accession] << std::move (AmrMutation ((size_t) pos, geneMutation_std, geneMutation_report, classS, subclass, name)); 228 | } 229 | } 230 | for (auto& it : accession2mutations) 231 | { 232 | it. second. sort (); // Sorted before the uniqueness check below 233 | if (! it. second. isUniq ()) 234 | throw runtime_error ("Duplicate reference mutations for " + it. first); 235 | } 236 | } 237 | 238 | 239 | void report (TsvOut &td, 240 | bool mutationAll) const 241 | { 242 | ASSERT (td. empty ()); 243 | 244 | // Cf. BlastnAlignment::report() 245 | if (! input_name. empty ()) 246 | td << "Name"; 247 | td << prot_colName // sseqid 248 | // Contig 249 | << contig_colName 250 | // target 251 | << start_colName 252 | << stop_colName 253 | << strand_colName 254 | // 255 | << genesymbol_colName 256 | << elemName_colName // was: "AmrMutation name" 257 | << scope_colName 258 | << type_colName 259 | << subtype_colName 260 | << class_colName 261 | << subclass_colName 262 | // 263 | << method_colName 264 | << targetLen_colName 265 | // 266 | << refLen_colName // qlen 267 | << refCov_colName // queryCoverage 268 | << refIdent_colName 269 | << alignLen_colName // length 270 | << closestRefAccession_colName 271 | << closestRefName_colName 272 | // 273 | << hmmAccession_colName 274 | << hmmDescr_colName 275 | ; 276 | if (print_node) 277 | td << hierarchyNode_colName; 278 | td. 
newLn (); 279 | 280 | for (const BlastnAlignment* blastAl : blastAls) 281 | { 282 | ASSERT (blastAl); 283 | blastAl->report (td, mutationAll); 284 | blastAl->qc (); 285 | } 286 | } 287 | }; 288 | 289 | 290 | 291 | 292 | // ThisApplication 293 | 294 | struct ThisApplication final : Application 295 | { 296 | ThisApplication () 297 | : Application ("Find mutations at DNA level and report in the format of amr_report.cpp") 298 | { 299 | addPositional ("blastn", string ("blastn output in the format: ") + Hsp::format [true] + ". qseqid is the 1st column of table"); 300 | addPositional ("mutation", "Mutations table"); 301 | addPositional ("organism", "Organism name"); 302 | addKey ("mutation_all", "File to report all mutations"); 303 | addKey ("name", "Text to be added as the first column \"name\" to all rows of the report"); 304 | addFlag ("print_node", "Print FAM.id"); 305 | version = SVN_REV; 306 | } 307 | 308 | 309 | 310 | void body () const final 311 | { 312 | const string blastnFName = getArg ("blastn"); 313 | const string mutation_tab = getArg ("mutation"); 314 | const string organism = getArg ("organism"); 315 | const string mutation_all_FName = getArg ("mutation_all"); 316 | input_name = getArg ("name"); 317 | print_node = getFlag ("print_node"); 318 | 319 | 320 | Batch batch (mutation_tab); 321 | 322 | 323 | // Input 324 | { 325 | LineInput f (blastnFName); 326 | while (f. nextLine ()) 327 | { 328 | { 329 | Unverbose unv; 330 | if (verbose ()) 331 | cout << f. line << endl; 332 | } 333 | unique_ptr al (new BlastnAlignment (f. line, organism)); 334 | al->qc (); 335 | if (al->good ()) 336 | batch. blastAls << al. release (); 337 | } 338 | } 339 | if (verbose ()) 340 | { 341 | cout << "# Good Blasts: " << batch. blastAls. size () << endl; 342 | for (const BlastnAlignment* blastAl : batch. blastAls) 343 | { 344 | ASSERT (blastAl); 345 | blastAl->saveText (cout); 346 | cout << ' ' << blastAl->seqChanges. 
size () << endl; 347 | } 348 | } 349 | 350 | 351 | // Group by sseqid and process each sseqid separately for speed ?? 352 | //Common_sp::sort (batch. blastAls); 353 | for (const BlastnAlignment* blastAl1 : batch. blastAls) 354 | for (const SeqChange& seqChange1 : blastAl1->seqChanges) 355 | { 356 | ASSERT (seqChange1. al == blastAl1); 357 | //ASSERT (seqChange1. mutation); 358 | for (const BlastnAlignment* blastAl2 : batch. blastAls) 359 | if ( blastAl2->sseqid == blastAl1->sseqid 360 | && blastAl2->sInt. strand == blastAl1->sInt. strand 361 | && blastAl2 != blastAl1 362 | ) 363 | //for (Iter> iter (var_cast (blastAl2) -> seqChanges); iter. next (); ) 364 | for (SeqChange& seqChange2 : var_cast (blastAl2) -> seqChanges) 365 | { 366 | //SeqChange& seqChange2 = *iter; 367 | ASSERT (seqChange2. al == blastAl2); 368 | //ASSERT (seqChange2. mutation); 369 | if ( seqChange1. start_target == seqChange2. start_target 370 | && seqChange1. better (seqChange2) 371 | ) 372 | //iter. erase (); 373 | seqChange2. replacement = & seqChange1; 374 | } 375 | } 376 | 377 | #if 0 378 | // [UNKNOWN] 379 | { 380 | map mutation2ptr; 381 | for (const auto& it : accession2mutations) 382 | for (const AmrMutation& mut : it. second) 383 | mutation2ptr [mut] = & mut; 384 | for (const BlastnAlignment* al : batch. blastAls) 385 | for (const SeqChange& seqChange : al->seqChanges) 386 | if (const AmrMutation* mut = seqChange. mutation) 387 | mutation2ptr. erase (*mut); 388 | for (const auto& it : mutation2ptr) 389 | { 390 | const auto al = new BlastnAlignment (* it. second); 391 | batch. blastAls << al; 392 | } 393 | } 394 | #endif 395 | 396 | 397 | // Output 398 | { 399 | TsvOut td (cout, 2, false); 400 | td. usePound = false; 401 | batch. report (td, false); 402 | } 403 | if (! mutation_all_FName. empty ()) 404 | { 405 | OFStream f (mutation_all_FName); 406 | TsvOut td (f, 2, false); 407 | td. usePound = false; 408 | batch. 
report (td, true); 409 | } 410 | } 411 | }; 412 | 413 | 414 | 415 | } // namespace 416 | 417 | 418 | 419 | int main (int argc, 420 | const char* argv[]) 421 | { 422 | ThisApplication app; 423 | return app. run (argc, argv); 424 | } 425 | 426 | 427 | 428 | -------------------------------------------------------------------------------- /test_dna_mut_all.expected: -------------------------------------------------------------------------------- 1 | Protein identifier Contig id Start Stop Strand Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description 2 | NA contig05 237 1224 - 23S_A2058A Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT MACROLIDE AZITHROMYCIN/ERYTHROMYCIN/TELITHROMYCIN POINTN 988 2905 34.01 81.08 1004 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 3 | NA contig05 237 1224 - 23S_C2611C Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT MACROLIDE ERYTHROMYCIN/TELITHROMYCIN POINTN 988 2905 34.01 81.08 1004 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 4 | NA contig05 237 1224 - 23S_G2057G Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT MULTIDRUG CHLORAMPHENICOL/ERYTHROMYCIN/TELITHROMYCIN POINTN 988 2905 34.01 81.08 1004 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 5 | NA contig05 237 1224 - 23S_G2447G Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT OXAZOLIDINONE LINEZOLID POINTN 988 2905 34.01 81.08 1004 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 6 | NA contig05 237 1224 - 23S_T2609T Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT MACROLIDE TELITHROMYCIN POINTN 988 2905 34.01 81.08 1004 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 7 | NA contig14 1 1089 + pmrB_A159A two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 
100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 8 | NA contig14 1 1089 + pmrB_C84R Escherichia colistin resistant PmrB core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 9 | NA contig14 1 1089 + pmrB_E121E two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 10 | NA contig14 1 1089 + pmrB_E166E two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 11 | NA contig14 1 1089 + pmrB_G206G two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 12 | NA contig14 1 1089 + pmrB_L10L two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 13 | NA contig14 1 1089 + pmrB_L14L two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 14 | NA contig14 1 1089 + pmrB_P94P two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 15 | NA contig14 1 1089 + pmrB_T147T two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 16 | NA contig14 1 1089 + pmrB_T156T two-component system sensor 
histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 17 | NA contig14 1 1089 + pmrB_V161V two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 18 | NA contig14 1093 2181 + pmrB_A159A two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 19 | NA contig14 1093 2181 + pmrB_C84R Escherichia colistin resistant PmrB core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 20 | NA contig14 1093 2181 + pmrB_E121E two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 21 | NA contig14 1093 2181 + pmrB_E166E two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 22 | NA contig14 1093 2181 + pmrB_G206G two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 23 | NA contig14 1093 2181 + pmrB_L10L two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 24 | NA contig14 1093 2181 + pmrB_L14L two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system 
sensor histidine kinase PmrB NA NA 25 | NA contig14 1093 2181 + pmrB_P94P two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 26 | NA contig14 1093 2181 + pmrB_T147T two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 27 | NA contig14 1093 2181 + pmrB_T156T two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 28 | NA contig14 1093 2181 + pmrB_V161V two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 29 | NA contig14 2185 3273 + pmrB_A159A two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN EXACTX 363 363 100.00 100.00 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 30 | NA contig14 2185 3273 + pmrB_C84C two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN EXACTX 363 363 100.00 100.00 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 31 | NA contig14 2185 3273 + pmrB_E121E two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN EXACTX 363 363 100.00 100.00 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 32 | NA contig14 2185 3273 + pmrB_E166E two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN EXACTX 363 363 100.00 100.00 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 33 | NA contig14 2185 3273 + pmrB_G206G two-component system 
sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN EXACTX 363 363 100.00 100.00 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 34 | NA contig14 2185 3273 + pmrB_L10L two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN EXACTX 363 363 100.00 100.00 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 35 | NA contig14 2185 3273 + pmrB_L14L two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN EXACTX 363 363 100.00 100.00 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 36 | NA contig14 2185 3273 + pmrB_P94P two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN EXACTX 363 363 100.00 100.00 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 37 | NA contig14 2185 3273 + pmrB_T147T two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN EXACTX 363 363 100.00 100.00 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 38 | NA contig14 2185 3273 + pmrB_T156T two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN EXACTX 363 363 100.00 100.00 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 39 | NA contig14 2185 3273 + pmrB_V161V two-component system sensor histidine kinase PmrB [WILDTYPE] core AMR POINT COLISTIN COLISTIN EXACTX 363 363 100.00 100.00 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA 40 | NA contig15 1 2905 + 23S_A2058T Escherichia azithromycin/erythromycin/telithromycin resistant 23S core AMR POINT MACROLIDE AZITHROMYCIN/ERYTHROMYCIN/TELITHROMYCIN POINTN 2905 2905 100.00 99.97 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 41 | NA contig15 1 2905 + 23S_C2611C Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT MACROLIDE ERYTHROMYCIN/TELITHROMYCIN POINTN 2905 
2905 100.00 99.97 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 42 | NA contig15 1 2905 + 23S_G2032G Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT LINCOSAMIDE/OXAZOLIDINONE CLINDAMYCIN/LINEZOLID POINTN 2905 2905 100.00 99.97 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 43 | NA contig15 1 2905 + 23S_G2032G Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT MACROLIDE/OXAZOLIDINONE CLARITHROMYCIN/LINEZOLID POINTN 2905 2905 100.00 99.97 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 44 | NA contig15 1 2905 + 23S_G2032G Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT OXAZOLIDINONE LINEZOLID POINTN 2905 2905 100.00 99.97 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 45 | NA contig15 1 2905 + 23S_G2057G Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT MULTIDRUG CHLORAMPHENICOL/ERYTHROMYCIN/TELITHROMYCIN POINTN 2905 2905 100.00 99.97 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 46 | NA contig15 1 2905 + 23S_G2447G Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT OXAZOLIDINONE LINEZOLID POINTN 2905 2905 100.00 99.97 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 47 | NA contig15 1 2905 + 23S_T2609T Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT MACROLIDE TELITHROMYCIN POINTN 2905 2905 100.00 99.97 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 48 | NA contig15 1 2905 + 23S_T754T Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT MACROLIDE ERYTHROMYCIN/TELITHROMYCIN POINTN 2905 2905 100.00 99.97 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 49 | NA contig15 2906 5810 + 23S_A2058A Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT MACROLIDE AZITHROMYCIN/ERYTHROMYCIN/TELITHROMYCIN POINTN 2905 2905 100.00 100.00 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 50 | NA contig15 2906 5810 + 23S_C2611C Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT MACROLIDE ERYTHROMYCIN/TELITHROMYCIN POINTN 2905 2905 100.00 100.00 2905 
NC_004431.1:237160-240064 23S ribosomal RNA NA NA 51 | NA contig15 2906 5810 + 23S_G2032G Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT LINCOSAMIDE/OXAZOLIDINONE CLINDAMYCIN/LINEZOLID POINTN 2905 2905 100.00 100.00 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 52 | NA contig15 2906 5810 + 23S_G2032G Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT MACROLIDE/OXAZOLIDINONE CLARITHROMYCIN/LINEZOLID POINTN 2905 2905 100.00 100.00 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 53 | NA contig15 2906 5810 + 23S_G2032G Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT OXAZOLIDINONE LINEZOLID POINTN 2905 2905 100.00 100.00 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 54 | NA contig15 2906 5810 + 23S_G2057G Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT MULTIDRUG CHLORAMPHENICOL/ERYTHROMYCIN/TELITHROMYCIN POINTN 2905 2905 100.00 100.00 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 55 | NA contig15 2906 5810 + 23S_G2447G Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT OXAZOLIDINONE LINEZOLID POINTN 2905 2905 100.00 100.00 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 56 | NA contig15 2906 5810 + 23S_T2609T Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT MACROLIDE TELITHROMYCIN POINTN 2905 2905 100.00 100.00 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 57 | NA contig15 2906 5810 + 23S_T754T Escherichia 23S ribosomal RNA [WILDTYPE] core AMR POINT MACROLIDE ERYTHROMYCIN/TELITHROMYCIN POINTN 2905 2905 100.00 100.00 2905 NC_004431.1:237160-240064 23S ribosomal RNA NA NA 58 | NA contig16 1 720 + nfsA_E223E nitroreductase NfsA [WILDTYPE] core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA 59 | NA contig16 1 720 + nfsA_G131G nitroreductase NfsA [WILDTYPE] core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA 60 | NA contig16 1 720 + nfsA_K141STOP Escherichia 
nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA 61 | NA contig16 1 720 + nfsA_Q44Q nitroreductase NfsA [WILDTYPE] core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA 62 | NA contig16 1 720 + nfsA_R133R nitroreductase NfsA [WILDTYPE] core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA 63 | NA contig16 1 720 + nfsA_R15C Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA 64 | NA contig16 1 720 + nfsA_R203R nitroreductase NfsA [WILDTYPE] core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA 65 | NA contig16 1 720 + nfsA_S33S nitroreductase NfsA [WILDTYPE] core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA 66 | NA contig17 1 247 + ampC_C-11C Escherichia ampC/blaEC promoter region [WILDTYPE] core AMR POINT BETA-LACTAM CEPHALOSPORIN POINTN 247 245 100.00 99.19 247 NZ_CP041538.1:1149245-1149489 ampC/blaEC promoter region NA NA 67 | NA contig17 1 247 + ampC_C-42C Escherichia ampC/blaEC promoter region [WILDTYPE] core AMR POINT BETA-LACTAM CEPHALOSPORIN POINTN 247 245 100.00 99.19 247 NZ_CP041538.1:1149245-1149489 ampC/blaEC promoter region NA NA 68 | NA contig17 1 247 + ampC_G-15G Escherichia ampC/blaEC promoter region [WILDTYPE] core AMR POINT BETA-LACTAM CEPHALOSPORIN POINTN 247 245 100.00 99.19 247 NZ_CP041538.1:1149245-1149489 ampC/blaEC promoter region NA NA 69 | NA contig17 1 247 + ampC_T-14T Escherichia ampC/blaEC promoter region [WILDTYPE] core AMR POINT BETA-LACTAM CEPHALOSPORIN POINTN 247 245 100.00 99.19 247 NZ_CP041538.1:1149245-1149489 ampC/blaEC promoter region NA NA 70 | NA contig17 1 247 + ampC_T-14TGT Escherichia 
cephalosporin resistant ampC core AMR POINT BETA-LACTAM CEPHALOSPORIN POINTN 247 245 100.00 99.19 247 NZ_CP041538.1:1149245-1149489 ampC/blaEC promoter region NA NA 71 | NA contig17 1 247 + ampC_T-32T Escherichia ampC/blaEC promoter region [WILDTYPE] core AMR POINT BETA-LACTAM CEPHALOSPORIN POINTN 247 245 100.00 99.19 247 NZ_CP041538.1:1149245-1149489 ampC/blaEC promoter region NA NA 72 | -------------------------------------------------------------------------------- /gff.cpp: -------------------------------------------------------------------------------- 1 | // gff.cpp 2 | 3 | /*=========================================================================== 4 | * 5 | * PUBLIC DOMAIN NOTICE 6 | * National Center for Biotechnology Information 7 | * 8 | * This software/database is a "United States Government Work" under the 9 | * terms of the United States Copyright Act. It was written as part of 10 | * the author's official duties as a United States Government employee and 11 | * thus cannot be copyrighted. This software/database is freely available 12 | * to the public for use. The National Library of Medicine and the U.S. 13 | * Government have not placed any restriction on its use or reproduction. 14 | * 15 | * Although all reasonable efforts have been taken to ensure the accuracy 16 | * and reliability of the software and data, the NLM and the U.S. 17 | * Government do not and cannot warrant the performance or results that 18 | * may be obtained by using this software or data. The NLM and the U.S. 19 | * Government disclaim all warranties, express or implied, including 20 | * warranties of performance, merchantability or fitness for any particular 21 | * purpose. 22 | * 23 | * Please cite the author in any work or product based on this material. 
24 | * 25 | * =========================================================================== 26 | * 27 | * Author: Vyacheslav Brover 28 | * 29 | * File Description: 30 | * .gff file reader 31 | * 32 | */ 33 | 34 | 35 | #undef NDEBUG 36 | 37 | #include "gff.hpp" 38 | 39 | #include "common.inc" 40 | 41 | 42 | 43 | namespace GFF_sp 44 | { 45 | 46 | 47 | 48 | // Locus 49 | 50 | Locus::Locus (size_t lineNum_arg, 51 | const string &contig_arg, 52 | size_t start_arg, 53 | size_t stop_arg, 54 | bool strand_arg, 55 | bool partial_arg, 56 | size_t crossOriginSeqLen_arg, 57 | string gene_arg, 58 | string product_arg) 59 | : lineNum (lineNum_arg) 60 | , contig (contig_arg) 61 | , start (start_arg) 62 | , stop (stop_arg) 63 | , strand (strand_arg) 64 | , partial (partial_arg) 65 | , contigLen (crossOriginSeqLen_arg) 66 | , crossOrigin (bool (crossOriginSeqLen_arg)) 67 | , gene (gene_arg) 68 | , product (product_arg) 69 | { 70 | //QC_ASSERT (lineNum >= 1); 71 | trim (contig); 72 | if (contig. empty ()) 73 | throw runtime_error ("Empty contig name"); 74 | if (crossOrigin) 75 | { 76 | swap (start, stop); 77 | start--; 78 | stop++; 79 | QC_ASSERT (contigLen); 80 | QC_ASSERT (stop <= contigLen); 81 | } 82 | QC_ASSERT (start < stop); 83 | } 84 | 85 | 86 | 87 | bool Locus::operator< (const Locus& other) const 88 | { 89 | LESS_PART (*this, other, contig) 90 | LESS_PART (*this, other, start) 91 | LESS_PART (*this, other, stop) 92 | LESS_PART (*this, other, strand) 93 | //LESS_PART (*this, other, contigLen); 94 | LESS_PART (*this, other, crossOrigin); 95 | return false; 96 | } 97 | 98 | 99 | 100 | 101 | // Gff 102 | 103 | const StringVector Gff::names {"bakta", "genbank", "microscope", "patric", "pgap", "prodigal", "prokka", "pseudomonasdb", "rast", "standard"}; 104 | 105 | 106 | Gff::Type Gff::name2type (const string &name) 107 | { 108 | if (name == "bakta") return bakta; 109 | if (name == "genbank") return genbank; 110 | if (name == "microscope") return microscope; 111 | if (name == 
"patric") return patric; 112 | if (name == "pgap") return pgap; 113 | if (name == "prodigal") return prodigal; 114 | if (name == "prokka") return prokka; 115 | if (name == "pseudomonasdb") return pseudomonasdb; 116 | if (name == "rast") return rast; 117 | if (name == "standard") return standard; 118 | throw runtime_error ("Unknown GFF type: " + strQuote (name)); 119 | } 120 | 121 | 122 | 123 | 124 | // Annot 125 | 126 | namespace 127 | { 128 | 129 | string unescape (const string &s) 130 | { 131 | string r (unpercent (s)); 132 | trim (r); 133 | return r; 134 | } 135 | 136 | 137 | 138 | void pgap_accession (string &accession, 139 | bool lcl) 140 | // Update: accession 141 | { 142 | static const string gnlPrefix ("gnl|"); 143 | static const string lclPrefix ("lcl|"); 144 | 145 | size_t pos = accession. rfind (':'); 146 | if (pos == string::npos) 147 | { 148 | if (lcl) 149 | accession = lclPrefix + accession; 150 | } 151 | else 152 | { 153 | if (lcl) 154 | throw runtime_error ("Accession " + strQuote (accession) + " cannot have " + strQuote (gnlPrefix) + " and " + strQuote (lclPrefix) + " at the same time"); 155 | accession [pos] = '|'; 156 | accession = gnlPrefix + accession; 157 | } 158 | 159 | QC_ASSERT (! accession. empty ()); 160 | } 161 | 162 | } 163 | 164 | 165 | 166 | Annot::Annot (const string &fName, 167 | Gff::Type gffType, 168 | bool protMatch, 169 | bool lcl) 170 | { 171 | IMPLY (protMatch, gffType == Gff::genbank 172 | || gffType == Gff::microscope 173 | || gffType == Gff::prodigal 174 | ); 175 | IMPLY (gffType == Gff::microscope, protMatch); 176 | //IMPLY (trimProject, gffType == Gff::genbank); 177 | IMPLY (lcl, gffType == Gff::pgap); 178 | 179 | if (fName. empty ()) 180 | throw runtime_error ("Empty GFF file name"); 181 | 182 | LineInput f (fName /*, 100 * 1024, 1*/); 183 | while (f. nextLine ()) 184 | { 185 | trim (f. line); 186 | 187 | if ( ( gffType == Gff::prokka 188 | || gffType == Gff::bakta 189 | ) 190 | && f. 
line == "##FASTA" 191 | ) 192 | break; 193 | 194 | if ( f. line. empty () 195 | || f. line [0] == '#' 196 | ) 197 | continue; 198 | 199 | try 200 | { 201 | /*1*/ string contig (unescape (findSplit (f. line, '\t'))); 202 | /*2*/ const string source (unescape (findSplit (f. line, '\t'))); 203 | /*3*/ const string type (unescape (findSplit (f. line, '\t'))); 204 | /*4*/ const string startS (unescape (findSplit (f. line, '\t'))); 205 | /*5*/ const string stopS (unescape (findSplit (f. line, '\t'))); 206 | /*6*/ const string score (unescape (findSplit (f. line, '\t'))); // real number 207 | /*7*/ const string strand (unescape (findSplit (f. line, '\t'))); 208 | /*8*/ const string phase (unescape (findSplit (f. line, '\t'))); // frame 209 | /*9*/ string attributes (f. line); 210 | 211 | trim (attributes); 212 | if (attributes. empty ()) 213 | throw runtime_error ("9 fields are expected in each line"); 214 | 215 | #if 0 216 | if (trimProject) 217 | if (contains (contig, ":")) 218 | findSplit (contig, ':'); // = project_id 219 | #endif 220 | if (contig. empty ()) 221 | throw runtime_error ("empty sequence indentifier"); 222 | for (const char c : contig) 223 | if (! printable (c)) 224 | throw runtime_error ("Non-printable character in the sequence identifier: " + to_string (c)); 225 | 226 | if ( type != "CDS" 227 | && type != "gene" 228 | && type != "pseudogene" 229 | ) 230 | continue; 231 | 232 | if (gffType == Gff::pgap && type != "CDS") 233 | continue; 234 | 235 | long start = -1; 236 | if (! str2 (startS, start)) 237 | throw runtime_error ("Cannot read start"); 238 | if (start <= 0) 239 | throw runtime_error ("start should be >= 1"); 240 | 241 | long stop = -1; 242 | if (! 
str2 (stopS, stop)) 243 | throw runtime_error ("Cannot read stop"); 244 | if (stop <= 0) 245 | throw runtime_error ("stop should be >= 1"); 246 | 247 | if (start > stop) 248 | throw runtime_error ("start cannot be greater than stop"); 249 | 250 | start--; 251 | 252 | if ( strand != "+" 253 | && strand != "-" 254 | ) 255 | throw runtime_error ("strand should be '+' or '-'"); 256 | 257 | const bool pseudo = contains (attributes, "pseudo=true") 258 | || contains (attributes, "gene_biotype=pseudogene") 259 | || type == "pseudogene"; 260 | //if (pseudo && type == "CDS") 261 | //continue; 262 | 263 | const bool partial = contains (attributes, "partial=true") 264 | // Gff::prodigal 265 | || contains (attributes, "partial=01") 266 | || contains (attributes, "partial=10") 267 | || contains (attributes, "partial=11"); 268 | 269 | string protAttr = "Name"; 270 | switch (gffType) 271 | { 272 | case Gff::bakta: protAttr = "ID"; break; 273 | case Gff::genbank: protAttr = (protMatch || pseudo) ? "locus_tag" : "Name"; break; 274 | case Gff::microscope: protAttr = "ID"; break; 275 | case Gff::patric: protAttr = "ID"; break; 276 | case Gff::prodigal: protAttr = "ID"; break; 277 | case Gff::prokka: protAttr = "ID"; break; 278 | case Gff::pseudomonasdb: protAttr = "Alias"; break; // for type = "gene", "locus" for type = "CDS" 279 | case Gff::rast: protAttr = "ID"; break; 280 | default: break; 281 | } 282 | ASSERT (! protAttr. empty ()); 283 | protAttr += "="; 284 | 285 | string prot_; 286 | string gene_; 287 | string product_; 288 | { 289 | string locusTag; 290 | while (! attributes. 
empty ()) 291 | { 292 | string s (findSplit (attributes, ';')); 293 | trim (s); 294 | if (isLeft (s, protAttr)) 295 | { 296 | prot_ = s; 297 | findSplit (prot_, '='); 298 | } 299 | else if (isLeft (s, "gene=")) 300 | { 301 | gene_ = s; 302 | findSplit (gene_, '='); 303 | } 304 | else if (isLeft (s, "product=")) 305 | { 306 | product_ = s; 307 | findSplit (product_, '='); 308 | //replace (product, tmpSpace, ' '); 309 | } 310 | else if (gffType == Gff::patric && isLeft (s, "locus_tag=")) 311 | { 312 | locusTag = s; 313 | findSplit (locusTag, '='); 314 | } 315 | } 316 | trimPrefix (prot_, "\""); 317 | trimSuffix (prot_, "\""); 318 | if (prot_. empty ()) 319 | continue; 320 | //throw runtime_error ("no attribute '" + protAttr + "': " + f. line); 321 | 322 | switch (gffType) 323 | { 324 | case Gff::genbank: if (contains (prot_, ":")) findSplit (prot_, ':'); break; 325 | case Gff::patric: if (! locusTag. empty ()) prot_ += "|" + locusTag; 326 | if (isLeft (contig, "accn|")) contig. erase (0, 5); 327 | break; 328 | default: break; 329 | } 330 | } 331 | QC_ASSERT (! prot_. empty ()); 332 | 333 | string prot (unescape (prot_)); 334 | const string gene (unescape (gene_)); 335 | const string product (unescape (product_)); 336 | 337 | if (gffType == Gff::pgap) 338 | { 339 | pgap_accession (prot, false); 340 | pgap_accession (contig, lcl); 341 | } 342 | QC_ASSERT (! prot. empty ()); 343 | 344 | Locus locus ((size_t) f. lineNum, contig, (size_t) start, (size_t) stop, strand == "+", partial, 0, gene, product); 345 | #if 0 346 | // DNA may be truncated 347 | if (type == "CDS" && ! pseudo && locus. size () % 3 != 0) 348 | { 349 | cout << "Locus tag: " << prot << endl; 350 | locus. print (cout); 351 | ERROR; 352 | } 353 | #endif 354 | 355 | prot2loci [prot] << std::move (locus); 356 | } 357 | catch (const exception &e) 358 | { 359 | throw runtime_error ("File " + fName + ", " + f. lineStr () + ": " + e. 
what ()); 360 | } 361 | } 362 | } 363 | 364 | 365 | 366 | Annot::Annot (const string &fName) 367 | { 368 | if (fName. empty ()) 369 | throw runtime_error ("Empty BED file name"); 370 | 371 | LineInput f (fName /*, 100 * 1024, 1*/); 372 | while (f. nextLine ()) 373 | { 374 | trim (f. line); 375 | if ( f. line. empty () 376 | || f. line [0] == '#' 377 | ) 378 | continue; 379 | 380 | replace (f. line, ' ', '_'); // to use '\t' as delimiter 381 | 382 | const string errorS ("File " + fName + ", " + f. lineStr () + ": "); 383 | 384 | string contig, prot; 385 | size_t start, stop; 386 | double score; 387 | char strand = ' '; 388 | static Istringstream iss; 389 | iss. reset (f. line); 390 | iss >> contig >> start >> stop >> prot >> score >> strand; 391 | 392 | if (strand == ' ') 393 | throw runtime_error (errorS + "at least 5 fields are expected in each line"); 394 | 395 | for (const char c : contig) 396 | if (! printable (c)) 397 | throw runtime_error (errorS + "Non-printable character in the sequence identifier: " + to_string (c)); 398 | 399 | if (start >= stop) 400 | throw runtime_error (errorS + "start should be less than stop"); 401 | 402 | if ( strand != '+' 403 | && strand != '-' 404 | ) 405 | throw runtime_error (errorS + "strand should be '+' or '-'"); 406 | 407 | trim (prot, '_'); 408 | ASSERT (! prot. empty ()); 409 | prot2loci [prot] << Locus ((size_t) f. lineNum, contig, start, stop, strand == '+', false/*partial*/, 0, noString, noString); 410 | } 411 | } 412 | 413 | 414 | 415 | void Annot::load_fasta2gff_prot (const string &fName) 416 | { 417 | ASSERT (fasta2gff_prot. empty ()); 418 | 419 | LineInput f (fName); 420 | Istringstream iss; 421 | string fasta_prot, gff_prot; 422 | while (f. nextLine ()) 423 | { 424 | iss. reset (f. line); 425 | fasta_prot. clear (); 426 | gff_prot. clear (); 427 | iss >> fasta_prot >> gff_prot; 428 | QC_ASSERT (! gff_prot. empty ()); 429 | fasta2gff_prot [fasta_prot] = gff_prot; 430 | } 431 | if (fasta2gff_prot. 
empty ()) 432 | throw runtime_error ("File " + fName + " is empty"); 433 | } 434 | 435 | 436 | 437 | void Annot::load_fasta2gff_dna (const string &fName) 438 | { 439 | map gff2fasta; 440 | { 441 | LineInput f (fName); 442 | Istringstream iss; 443 | string fasta_dna, gff_dna; 444 | while (f. nextLine ()) 445 | { 446 | iss. reset (f. line); 447 | fasta_dna. clear (); 448 | gff_dna. clear (); 449 | iss >> fasta_dna >> gff_dna; 450 | QC_ASSERT (! gff_dna. empty ()); 451 | gff2fasta [gff_dna] = fasta_dna; 452 | } 453 | } 454 | if (gff2fasta. empty ()) 455 | throw runtime_error ("File " + fName + " is empty"); 456 | 457 | for (auto& it : prot2loci) 458 | { 459 | Set& loci = it. second; 460 | for (const Locus& locus : loci) 461 | { 462 | string s; 463 | if (! find (gff2fasta, locus. contig, s)) 464 | throw runtime_error ("FASTA DNA contig " + strQuote (locus. contig) + " is not found in GFF-DNA match file " + strQuote (fName)); 465 | var_cast (locus). contig = std::move (s); 466 | } 467 | } 468 | } 469 | 470 | 471 | 472 | const Set& Annot::findLoci (const string &fasta_prot) const 473 | { 474 | ASSERT (! fasta_prot. empty ()); 475 | 476 | string gff_prot (fasta_prot); 477 | if (! fasta2gff_prot. empty ()) 478 | { 479 | string s; 480 | if (! find (fasta2gff_prot, gff_prot, s)) 481 | throw runtime_error ("FASTA protein " + strQuote (fasta_prot) + " is not found in GFF-protein match file"); 482 | gff_prot = std::move (s); 483 | } 484 | ASSERT (! gff_prot. empty ()); 485 | 486 | const Set* loci = findPtr (prot2loci, gff_prot); 487 | if (! loci) 488 | throw runtime_error ("FASTA protein " + fasta_prot + (fasta_prot == gff_prot ? "" : " (converted to GFF protein " + gff_prot +")") + " is misssing in .gff-file"); 489 | ASSERT (! loci->empty ()); 490 | 491 | return *loci; 492 | } 493 | 494 | 495 | 496 | } 497 | --------------------------------------------------------------------------------