├── .github └── workflows │ ├── dockerhub_push_release.yml │ └── tests.yml ├── .gitignore ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── README.md ├── bin ├── adapt_filter_coverage.sh ├── blast_align.sh ├── blast_to_bed.sh ├── blast_to_complete.sh ├── blast_to_link.sh ├── bowtie_mapper.sh ├── build_karyotype.sh ├── calculate_seqlen.sh ├── cdhit_cluster.sh ├── check_dependencies.sh ├── check_mandatory_files.sh ├── coordinate_adapter.sh ├── download_plasmid_database.py ├── draw_circos_images.sh ├── filter_fasta.sh ├── get_coverage.sh ├── gff_to_bed.sh ├── mash_screener.sh ├── mashclust.py ├── ncbi_database_fetcher.sh ├── process_cluster_output.sh ├── prokka_annotation.sh ├── quality_trim.sh ├── rename_from_fasta.sh ├── sam_to_bam.sh ├── spades_assembly.sh ├── summary_report_pid.py └── summary_table.sh ├── config_files ├── OR.conf ├── annotation_config_file.txt ├── circos_individual_1_3_0.conf ├── circos_individual_1_3_3.conf ├── circos_summary_1_3_0.conf ├── circos_summary_1_3_3.conf └── simple.conf ├── databases ├── ARGannot.pID.fasta ├── card.fasta └── plasmidFinder_01_26_2018.fsa ├── documents ├── ECCMID plasmidID 2018.pdf ├── Istall_dependencies.md └── PlasmidID_IWBBIO.pdf ├── environment.yml ├── img ├── 01_plasmid_track.png ├── 02_mapping_track.png ├── 03_annotation_track.png ├── 04_contig_track.png ├── 05_01_complete_contig_track.png ├── 05_complete_contig_track.png ├── Alignment.png ├── Annotation.png ├── Clustering_2.png ├── Mapping.png ├── Overlap_examples.png ├── PIPELNE TFM.png ├── SEN30_000195995_K00826.1.png ├── SEN30_000195995_NC_002305.1.png ├── SEN30_000195995_NC_003384.1.png ├── SEN30_000195995_NC_003385.1.png ├── SEN30_000195995_NC_009981.1.png ├── SEN30_000195995_NC_013365.1.png ├── SEN30_000195995_NZ_LT883154.1.png ├── SEN30_000195995_NZ_LT904853.1.png ├── SEN30_000195995_NZ_LT904874.1.png ├── SEN30_000195995_NZ_LT904880.1.png ├── SEN30_000195995_NZ_LT904895.1.png ├── SEN_summary.png ├── SEN_summary_numbers.png ├── Short_pipeline.png ├── 
Visualization.png ├── isciii_logo.jpeg ├── pipeline_pID.png ├── plasmidID_logo.png ├── summary_image_1_3.png ├── summary_image_2_3.png └── summary_image_3_3.png ├── plasmidID └── test ├── KPN_TEST_R1.fastq.gz ├── KPN_TEST_R2.fastq.gz ├── contigs_KPN_TEST.fasta ├── plasmids_TEST_database.fasta └── test.sh /.github/workflows/dockerhub_push_release.yml: -------------------------------------------------------------------------------- 1 | name: deploy release 2 | # This builds the docker image and pushes it to DockerHub 3 | on: 4 | release: 5 | types: [published] 6 | jobs: 7 | push_dockerhub: 8 | name: Push new Docker image to Docker Hub (release) 9 | runs-on: ubuntu-latest 10 | # Only run for the official repo, for releases and merged PRs 11 | if: ${{ github.repository == 'BU-ISCIII/plasmidID' }} 12 | env: 13 | DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} 14 | DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASSWORD }} 15 | steps: 16 | - name: Check out pipeline code 17 | uses: actions/checkout@v2 18 | 19 | - name: Build new docker image 20 | run: docker build --no-cache . 
-t buisciii/plasmidid:${{ github.event.release.tag_name }} 21 | 22 | - name: Push Docker image to DockerHub (develop) 23 | run: | 24 | echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin 25 | docker push buisciii/plasmidid:${{ github.event.release.tag_name }} 26 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests ci 2 | # This workflow runs the pipeline with the minimal test dataset to check that it completes without any errors 3 | on: 4 | push: 5 | branches: [develop] 6 | pull_request_target: 7 | branches: [develop] 8 | release: 9 | types: [published] 10 | 11 | jobs: 12 | push_dockerhub: 13 | name: Push new Docker image to Docker Hub (dev) 14 | runs-on: ubuntu-latest 15 | # Only run for the official repo, for releases and merged PRs 16 | if: ${{ github.repository == 'BU-ISCIII/plasmidID' }} 17 | env: 18 | DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} 19 | DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASSWORD }} 20 | steps: 21 | - name: Check out pipeline code 22 | uses: actions/checkout@v2 23 | 24 | - name: Build new docker image 25 | run: docker build --no-cache . 
-t buisciii/plasmidid:dev 26 | 27 | - name: Push Docker image to DockerHub (develop) 28 | run: | 29 | echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin 30 | docker push buisciii/plasmidid:dev 31 | run-tests: 32 | name: Run tests 33 | needs: push_dockerhub 34 | runs-on: ubuntu-latest 35 | steps: 36 | - name: Run pipeline with test data 37 | run: | 38 | docker run buisciii/plasmidid:dev bash /opt/plasmidID/test/test.sh 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Custom 2 | TEST/ 3 | TEST_DATA/*.bt2 4 | TEST_DATA/*.length 5 | TEST_DATA/*.blast* 6 | NO_GROUP/ 7 | psi_cd_hit_may_2018_log.txt 8 | plasmid.database* 9 | .vscode/ 10 | 11 | ## Trash 12 | *~ 13 | .fuse* 14 | 15 | ## Container images 16 | *.img 17 | *.simg 18 | log_* 19 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 1.6.4 - 2021-03-2020 4 | ### Added 5 | - Updated Dockerfile 6 | - Migrated tests to github actions 7 | ### Fixed 8 | - Updated environment.yml for conda. 9 | - Fixed issues #12,#14,#15,#17. Cases with no plasmids or too many. Relative paths in html images. 
10 | 11 | ## 1.4.2 - 2018-09-29 12 | ### Added 13 | - Specific config file for only reconstruct parameter 14 | 15 | ### Fixed 16 | - Protein databases can be properly used 17 | 18 | ## 1.4 - 2018-09-20 19 | ### Added 20 | - Automatically annotated genes/cds are displayed differently depending on whether they are located in forward or reverse 21 | - Psi-cd-hit and blast now handle threads 22 | - Improved error handling 23 | - Docker/Singularity compatibility 24 | - One multifasta file per reference plasmid is generated with all the similar contigs from the sample 25 | - Quick status of values applied to plasmid reconstruction 26 | 27 | ### Fixed 28 | - Some plasmids from the database were not annotated 29 | - Limit sample name to 37 characters, capped by prokka 30 | - Bug in complete contig track generator that took the wrong value and couldn't draw sequences that matched the position 0 of plasmid 31 | 32 | 33 | 34 | ## 1.3.0 - 2018-07-11 35 | ### New 36 | - Summary table can be generated with new utility 37 | - Several databases can be now annotated filling annotation_config_file.txt 38 | - --only-reconstruct is now implemented if user only needs to reconstruct and annotate contigs with small known databases 39 | ### Fixed 40 | - circos dependency is now checked 41 | - Output is now correctly redirected with -o 42 | ### Added 43 | - trimmomatic directory containing .jar can now be specified with --trimmomatic-directory 44 | - Verbose mode included. By default a log file will be created 45 | - Friendly terminal output 46 | 47 | ## 1.2.2 - 2018-06-22 48 | ### Fixed 49 | - ***IMPORTANT***: PlasmidID maps with -a mode NOW, as it should have always been. 
A bug on mapping script is now solved 50 | - Number of threads are now implemented on mapping 51 | - Some cumulative clustering temporary files are now removed 52 | 53 | ## 1.2.1 - 2018-06-14 54 | ### Fixed 55 | - All dependencies are now checked at the beginning 56 | - Path to scripts are no longer hard coded paths 57 | - Links should be now displayed on summary image 58 | 59 | ### Added 60 | - Added first utility ***ncbi_database_fetcher.sh***, a script to download FASTA databases from terms 61 | - Short scripts are now moved to /bin, which has to be added to PATH 62 | 63 | 64 | ## 1.1.1 - 2018-06-11 65 | ### Fixed 66 | - Additional database will not be required for circos executions, even though the file will be created 67 | - Fixed an issue when no plasmid matches mapping requirements 68 | - Fixed an issue when circos will throw an error message when no plasmids met mapping requirements 69 | 70 | 71 | ## 1.1.0 - 2018-06-06 72 | ### Added 73 | - Database plasmids used as scaffold are annotated after filtering. User doesn't need to annotate the initial huge plasmid database. 
74 | - User can add ONE nucleotide FASTA file that will be specifically annotated on final plasmids with a light blue color 75 | 76 | ## Unreleased 77 | 78 | - Create config files as required by user and include visual parameters 79 | - Test and adapt the --only-reconstruct option 80 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3:latest 2 | 3 | RUN mkdir /opt/plasmidID/ 4 | ADD bin /opt/plasmidID/bin 5 | ADD config_files /opt/plasmidID/config_files 6 | ADD databases /opt/plasmidID/databases 7 | ADD documents /opt/plasmidID/documents 8 | ADD img /opt/plasmidID/img 9 | ADD test /opt/plasmidID/test 10 | ADD plasmidID /opt/plasmidID/ 11 | ADD environment.yml /opt/plasmidID/ 12 | ADD CHANGELOG.md /opt/plasmidID/ 13 | ADD LICENSE /opt/plasmidID/ 14 | 15 | RUN cd /opt/plasmidID 16 | RUN /opt/conda/bin/conda env create -f /opt/plasmidID/environment.yml && /opt/conda/bin/conda clean -a 17 | RUN /opt/conda/bin/conda env export --name plasmidID > plasmidID.yml 18 | ENV PATH /opt/conda/envs/plasmidID/bin:$PATH 19 | ENV PATH /opt/plasmidID/bin:/opt/plasmidID:$PATH 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/plasmidid/README.html) 3 | [![CircleCI Build Status](https://circleci.com/gh/circleci/circleci-docs.svg?style=shield)](https://circleci.com/gh/BU-ISCIII/plasmidID) [![License: GPL v3](https://img.shields.io/badge/License-GPL%20v3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) [![Scif](https://img.shields.io/badge/Filesystem-Scientific-brightgreen.svg)](https://sci-f.github.io) 4 | 5 | # plasmidID Logo 6 | 7 |
8 |
9 | 10 | * [Introduction](#introduction) 11 | * [Requirements](#requirements) 12 | * [Software](#software) 13 | * [Plasmid database](#plasmid-database) 14 | * [Installation](#installation) 15 | * [Install from source](#install-from-source) 16 | * [Install using conda](#install-using-conda) 17 | * [Quick usage](#quick-usage) 18 | * [Usage](#usage) 19 | * [Output](#output) 20 | * [Annotation file](#annotation-file) 21 | * [Illustrated pipeline](#illustrated-pipeline) 22 | * [Docker](#use-docker) 23 | 24 | ## Introduction 25 | 26 | PlasmidID is a mapping-based, assembly-assisted plasmid identification tool that analyzes the data and gives a graphic solution for plasmid identification. 27 | 28 | PlasmidID is a **computational pipeline** implemented in **BASH** that maps Illumina reads over plasmid database sequences. The k-mer filtered, most covered sequences are clustered by identity to avoid redundancy and the longest are used as scaffold for plasmid reconstruction. Reads are assembled and annotated by automatic and specific annotation. All information generated from mapping, assembly, annotation and local alignment analyses is gathered and accurately represented in a **circular image** which allows the user to determine the plasmid composition of any bacterial sample. 
29 | 30 | ## Requirements 31 | 32 | #### Software 33 | 34 | * [Python >=3.6](https://www.python.org/) 35 | * [Trimmomatic v0.33](http://www.usadellab.org/cms/?page=trimmomatic) (Optional) 36 | * [Spades v3.8](http://cab.spbu.ru/software/spades/) (Optional) 37 | * [Perl v5.26.0](https://www.perl.org/get.html) 38 | * [NCBI_blast + v2.2.3](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download) 39 | * [Bedtools v2.25](http://bedtools.readthedocs.io/en/latest/) 40 | * [Bowtie 2 v2.2.4](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) 41 | * [SAMtools v1.2](http://samtools.sourceforge.net/) 42 | * [prokka v1.12](http://www.vicbioinformatics.com/software.prokka.shtml) 43 | * [cd-hit v4.6.6](http://weizhongli-lab.org/cd-hit/) (no longer needed since v1.6) 44 | * [circos v0.69.3](http://circos.ca/software/download/circos/) 45 | * [mash v2.2](https://github.com/marbl/Mash) 46 | 47 | #### Plasmid database 48 | 49 | Since version v1.5.1, the plasmid database can be downloaded with the following command: 50 | 51 | ```Bash 52 | download_plasmid_database.py -o FOLDER 53 | ``` 54 | 55 | ## Installation 56 | 57 | #### Install from source 58 | 59 | Install all dependencies and add them to $PATH 60 | 61 | git clone https://github.com/BU-ISCIII/plasmidID.git 62 | 63 | Add plasmidID and ./bin to $PATH 64 | 65 | #### Install using conda 66 | 67 | This option is recommended. 68 | 69 | Install [Anaconda3](https://www.anaconda.com/distribution/) 70 | 71 | ``` 72 | conda install -c conda-forge -c bioconda plasmidid 73 | ``` 74 | Wait for the environment to solve 75 | 76 | Ignore warnings/errors 77 | 78 | #### Use Docker 79 | 80 | Example: 81 | Clone the repo: 82 | ```Bash 83 | git clone git@github.com:BU-ISCIII/plasmidID.git 84 | cd plasmidID 85 | ``` 86 | Run it with the test data using docker: 87 | 88 | **Notice that the input files MUST be in your present working directory or in any folder inside it. 
For example, if I execute this command in `/home/smonzon`, my folder with the files would be in `/home/smonzon/test`.** 89 | 90 | ```Bash 91 | docker run -v $PWD:$PWD -w $PWD buisciii/plasmidid plasmidID \ 92 | -1 test/KPN_TEST_R1.fastq.gz \ 93 | -2 test/KPN_TEST_R2.fastq.gz \ 94 | -d test/plasmids_TEST_database.fasta \ 95 | -c test/contigs_KPN_TEST.fasta \ 96 | --no-trim \ 97 | -s KPN 98 | ``` 99 | 100 | ## Quick usage 101 | 102 | Illumina paired-end 103 | ``` 104 | plasmidID \ 105 | -1 SAMPLE_R1.fastq.gz \ 106 | -2 SAMPLE_R2.fastq.gz \ 107 | -d YYYY-MM-DD_plasmids.fasta \ 108 | -c SAMPLE_assembled_contigs.fasta \ 109 | --no-trim \ 110 | -s SAMPLE 111 | ``` 112 | 113 | SMRT sequencing (only contigs) 114 | ``` 115 | plasmidID \ 116 | -d YYYY-MM-DD_plasmids.fasta \ 117 | -c SAMPLE_contigs.fasta \ 118 | -s SAMPLE 119 | ``` 120 | 121 | Annotate any fasta you want 122 | ``` 123 | plasmidID \ 124 | -d YYYY-MM-DD_plasmids.fasta \ 125 | -c SAMPLE_assembled_contigs.fasta \ 126 | -a annotation_file \ 127 | -s SAMPLE 128 | ``` 129 | More info about [annotation file](#annotation-file) 130 | 131 | If there are several samples in the same GROUP folder 132 | ``` 133 | summary_report_pid.py -i NO_GROUP/ 134 | ``` 135 | ## Usage 136 | 137 | ``` 138 | usage : plasmidID <-1 R1> <-2 R2> <-d database(fasta)> <-s sample_name> [-g group_name] [options] 139 | 140 | Mandatory input data: 141 | -1 | --R1 reads corresponding to paired-end R1 (mandatory) 142 | -2 | --R2 reads corresponding to paired-end R2 (mandatory) 143 | -d | --database database to map and reconstruct (mandatory) 144 | -s | --sample sample name (mandatory), less than 37 characters 145 | 146 | Optional input data: 147 | -g | --group group name (optional). If unset, samples will be gathered in NO_GROUP group 148 | -c | --contigs file with contigs. 
If supplied, plasmidID will not assembly reads 149 | -a | --annotate file with configuration file for specific annotation 150 | -o output directory, by default is the current directory 151 | 152 | Pipeline options: 153 | --explore Relaxes default parameters to find less reliable relationships within data supplied and database 154 | --only-reconstruct Database supplied will not be filtered and all sequences will be used as scaffold 155 | This option does not require R1 and R2, instead a contig file can be supplied 156 | -w Undo winner takes it all algorithm when clustering by kmer - QUICKER MODE 157 | Trimming: 158 | --trimmomatic-directory Indicate directory holding trimmomatic .jar executable 159 | --no-trim Reads supplied will not be quality trimmed 160 | 161 | Coverage and Clustering: 162 | -C | --coverage-cutoff minimun coverage percentage to select a plasmid as scafold (0-100), default 80 163 | -S | --coverage-summary minimun coverage percentage to include plasmids in summary image (0-100), default 90 164 | -f | --cluster kmer identity to cluster plasmids into the same representative sequence (0 means identical) (0-1), default 0.5 165 | -k | --kmer identity to filter plasmids from the database with kmer approach (0-1), default 0.95 166 | 167 | Contig local alignment 168 | -i | --alignment-identity minimun identity percentage aligned for a contig to annotate, default 90 169 | -l | --alignment-percentage minimun length percentage aligned for a contig to annotate, default 20 170 | -L | --length-total minimun alignment length to filter blast analysis 171 | --extend-annotation look for annotation over regions with no homology found (base pairs), default 500bp 172 | 173 | Draw images: 174 | --config-directory directory holding config files, default config_files/ 175 | --config-file-individual file name of the individual file used to reconstruct 176 | Additional options: 177 | 178 | -M | --memory max memory allowed to use 179 | -T | --threads number of threads 180 | 
-v | --version version 181 | -h | --help display usage message 182 | 183 | example: ./plasmidID.sh -1 ecoli_R1.fastq.gz -2 ecoli_R2.fastq.gz -d database.fasta -s ECO_553 -G ENTERO 184 | ./plasmidID.sh -1 ecoli_R1.fastq.gz -2 ecoli_R2.fastq.gz -d PacBio_sample.fasta -c scaffolds.fasta -C 60 -s ECO_60 -G ENTERO --no-trim 185 | ``` 186 | 187 | ## Examples 188 | 189 | Under construction 190 | 191 | ## Output 192 | 193 | Since v1.6, the most relevant output is located in GROUP/SAMPLE folder: 194 | 195 | - **SAMPLE_final_results.html(.tab)** 196 | - id: Name of the accession number of reference 197 | - length: length of the reference sequence 198 | - species: species of the reference sequence 199 | - description: rest of reference fasta header 200 | - contig_name: number of the contigs that align the minimum required for complete contig track 201 | - SAMPLE: 202 | - Image of the reconstructed plasmid (click to open in new tab) 203 | - MAPPING % (percentage): percentage of reference covered with reads 204 | - X for contig mode (gray colour) 205 | - Orientative colouring (the closer to 100% the better) 206 | - ALIGN FR (fraction_covered): total length of contigs aligned (complete) / reference sequence length 207 | - Orientative colouring (the closer to 1 the better) 208 | 209 | 210 | ## Annotation file 211 | 212 | Under construction 213 | 214 | ## Illustrated pipeline 215 | 216 | This image summarizes the PlasmidID pipeline, including the most important steps. 217 | For further details, including: 218 | - [Results interpretation](https://github.com/BU-ISCIII/plasmidID/wiki/Understanding-the-image:-track-by-track) 219 | - and more, please visit: [**PLASMIDID WIKI**](https://github.com/BU-ISCIII/plasmidID/wiki) 220 | 221 |

workflow_small 222 | 223 |
--------------------------------------------------------------------------------
/bin/adapt_filter_coverage.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Exit immediately if a command returns a non-zero status that is not
# handled explicitly (the awk steps below handle theirs through error()).
set -e
#~set -x

#=============================================================
# HEADER
#=============================================================

#INSTITUTION:ISCIII
#CENTRE:BU-ISCIII
#AUTHOR: Pedro J. Sola
VERSION=1.0
#CREATED: 21 March 2018
#REVISION:
#DESCRIPTION:adapt_filter_coverage script that adapts percentages and filters coverage info from bedtools genomecov output
#
#OUTPUT (both files are written next to the input file):
# <input>_adapted   the first record found for every sequence other than
#                   "genome": kept untouched when its 2nd column is 0,
#                   otherwise replaced by "<name> 0 <col4> <col4> 1e-10".
# <input><suffix>   first column of the adapted records whose 5th column is
#                   strictly below 1 - (cutoff / 100).
#
#NOTE: only the first record of each sequence is inspected, so this assumes
# the depth-0 record (when present) comes first - TODO confirm against the
# bedtools genomecov output ordering.
#NOTE: -o only creates the directory; results stay next to the input file.

#================================================================
# END_OF_HEADER
#================================================================

#SHORT USAGE RULES
#LONG USAGE FUNCTION
usage() {
  cat << EOF

adapt_filter_coverage script that adapt percentages and filter coverage info from bedtools genomecov output

usage : $0 <-i inputfile(.fasta)> [-o <directory>] [-c <percentage>] [-s <suffix>] [-v] [-h]

	-i input file
	-o output directory (optional). By default the file is replaced in the same location
	-c coverage percentage cutoff (0-100), sequences covered above it are kept, default 100
	-s string to add at the end of the outputted file (list of accession numbers),
	   default _adapted_filtered_<percentage>
	-v version
	-h display usage message

example: adapt_filter_coverage.sh -i ecoli.coverage -c 70

EOF
}

#================================================================
# OPTION_PROCESSING
#================================================================
#Make sure the script is executed with arguments
if (( $# == 0 )); then
  usage >&2
  exit 1
fi

# Error handling: print a framed message and exit.
#   $1 line number, $2 script name, $3 message (optional), $4 exit code (default 1)
error() {
  local parent_lineno="$1"
  local script="$2"
  local message="$3"
  local code="${4:-1}"
  local RED='\033[0;31m'
  local NC='\033[0m'

  echo -e "\n---------------------------------------\n"
  echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
  if [[ -n "$message" ]]; then
    echo -e "MESSAGE:\n"
    echo -e "$message"
  fi
  echo -e "\n---------------------------------------\n"

  exit "${code}"
}

# Succeeds only when $1 is a plain number between 0 and 100 (decimals allowed).
# The former "[ -lt ] || [ -gt ]" test silently accepted non-numeric values.
is_percentage() {
  [[ "$1" =~ ^[0-9]+([.][0-9]+)?$ ]] \
    && awk -v value="$1" 'BEGIN { exit !(value + 0 >= 0 && value + 0 <= 100) }'
}

#DECLARE FLAGS AND VARIABLES
input_file="Input_file"
coverage_cutoff_input=100
output_dir=""
suffix=""

#PARSE VARIABLE ARGUMENTS WITH getopts
#common example with letters, for long options check longopts2getopts.sh
options=":i:o:c:s:vh"
while getopts "$options" opt; do
  case "$opt" in
    i)
      input_file=$OPTARG
      ;;
    o)
      output_dir=$OPTARG
      ;;
    c)
      if ! is_percentage "$OPTARG"; then
        echo "please, provide a percentage between 0 and 100" >&2
        usage >&2
        exit 1
      fi
      coverage_cutoff_input=$OPTARG
      ;;
    s)
      suffix=$OPTARG
      ;;
    h)
      # Asking for help or version is not an error
      usage
      exit 0
      ;;
    v)
      echo "$VERSION"
      exit 0
      ;;
    \?)
      echo "Invalid Option: -$OPTARG" >&2
      usage >&2
      exit 1
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
    *)
      echo "Unimplemented option: -$OPTARG" >&2
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))

#================================================================
# MAIN_BODY
#================================================================
##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS

echo -e "\n#Executing" "$0" "\n"

check_mandatory_files.sh "$input_file"

# Honour -s when supplied (it used to be overwritten unconditionally)
if [[ -z "$suffix" ]]; then
  suffix="_adapted_filtered_${coverage_cutoff_input}"
fi

# Maximum fraction of uncovered positions tolerated, e.g. 80 -> .20
coverage_cutoff=$(echo "(1 - ($coverage_cutoff_input/100))" | bc -l)

if [[ -z "$output_dir" ]]; then
  output_dir=$(dirname "$input_file")
fi
mkdir -p "$output_dir"

adapted_file="${input_file}_adapted"
filtered_file="${input_file}${suffix}"

if [[ -f "$adapted_file" ]]; then
  echo "Found previous $(basename "$adapted_file"), removing it"
  rm -- "$adapted_file"
fi

## Keep information about positions with 0 coverage. If no 0 coverage positions for a plasmid, create line including this info.
awk '
  BEGIN { OFS = "\t" }
  !seen[$1]++ && $1 != "genome" {
    if ($2 == 0) {
      print $0
    } else {
      # Fully covered sequence: synthetic depth-0 record with a tiny fraction
      print $1, 0, $4, $4, 0.0000000001
    }
  }
' "$input_file" > "$adapted_file" \
  || error "${LINENO}" "$(basename "$0")" "Awk command for bedtools coverage output parsing in $input_file\"_adapted\" creation. See $output_dir/logs for more information"

## Keep plasmids with coverage < 1-coverage_cutoff_input/100
awk -v cutoff="$coverage_cutoff" '
  $2 == 0 && $5 < (cutoff + 0) { print $1 }
' "$adapted_file" > "$filtered_file" \
  || error "${LINENO}" "$(basename "$0")" "Awk command for coverage filtering in $input_file$suffix creation. See $output_dir/logs for more information."

date
echo "Done filtering sequences with ${coverage_cutoff_input}% and greater coverage"
echo "Those sequences can be found at $filtered_file"
echo -e "$(wc -l < "$filtered_file") mapped equals or more than $coverage_cutoff_input \n"

--------------------------------------------------------------------------------
/bin/blast_align.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -e

#=============================================================
# HEADER
#=============================================================

#INSTITUTION:ISCIII
#CENTRE:BU-ISCIII
#AUTHOR: Pedro J. Sola
VERSION=1.0
#CREATED: 1 May 2018
#REVISION:
#DESCRIPTION:Script that blast a query against a database
#
#DOCUMENTATION
#
#Blast output6 with additions:
#1 Query label.(qseqid)
#2 Target or subject(database sequence or cluster centroid) label. (sseqid)
#3 Percent identity. (pident)
#4 Alignment length. (length)
#5 Number of mismatches. (mismatch)
#6 Number of gap opens. (gapopen)
#7 Start position in query. Query coordinates start with 1 at the first base in the sequence as it appears in the input file. For translated searches (nucleotide queries, protein targets), query start<end for +ve frame and start>end for -ve frame. (qstart)
#8 End position in query. (qend)
#9 Start position in target. Target coordinates start with 1 at the first base in sequence as it appears in the database.
For untranslated nucleotide searches, target startend for a reverse-complement alignment. (sstart) 29 | #10 End position in target. (send) 30 | #11 E-value calculated using Karlin-Altschul statistics. (evalue) 31 | #12 Bit score calculated using Karlin-Altschul statistics. (bitscore) 32 | #13 Lenght of query (qlen) 33 | #14 Length of target (slen) 34 | # 35 | # 36 | #TO DO: 37 | # 38 | #Handle all types of blast: blastn, blastp... 39 | # 40 | #================================================================ 41 | # END_OF_HEADER 42 | #================================================================ 43 | 44 | #SHORT USAGE RULES 45 | #LONG USAGE FUNCTION 46 | usage() { 47 | cat << EOF 48 | 49 | blast_align is a script that blast a query against a database 50 | 51 | usage : $0 <-i inputfile(query)> <-d inputfile(database)> [-p ] [-o ] [-t ] 52 | [-T ] [-e ] [-v] [-h] 53 | 54 | -i query file in FASTA format 55 | -d database to blast against 56 | -o output directory, default same directory as query 57 | -p prefix for blast identification (mandatory) and output file name 58 | -q type of query, nucl by default 59 | -t type of database, nucl by default 60 | -e evalue for blast analysis, default 0.0001 61 | -T number of threads 62 | -v version 63 | -h display usage message 64 | 65 | Output directory is the same as input directory by default 66 | 67 | example: blast_align -i ecoli.fasta -d plasmid_ddbb.fasta -p plasmid 68 | 69 | 70 | EOF 71 | } 72 | 73 | 74 | #================================================================ 75 | # OPTION_PROCESSING 76 | #================================================================ 77 | #Make sure the script is executed with arguments 78 | if [ $# = 0 ] ; then 79 | usage >&2 80 | exit 1 81 | fi 82 | 83 | # Error handling 84 | error(){ 85 | local parent_lineno="$1" 86 | local script="$2" 87 | local message="$3" 88 | local code="${4:-1}" 89 | 90 | RED='\033[0;31m' 91 | NC='\033[0m' 92 | 93 | if [[ -n "$message" ]] ; then 94 | echo -e 
"\n---------------------------------------\n" 95 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" 96 | echo -e "MESSAGE:\n" 97 | echo -e "$message" 98 | echo -e "\n---------------------------------------\n" 99 | else 100 | echo -e "\n---------------------------------------\n" 101 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" 102 | echo -e "\n---------------------------------------\n" 103 | fi 104 | 105 | exit "${code}" 106 | } 107 | 108 | #DECLARE FLAGS AND VARIABLES 109 | cwd="$(pwd)" 110 | group="NO_GROUP" 111 | input_file="Input_file" 112 | database="Database" 113 | query_type="nucl" 114 | database_type="nucl" 115 | evalue=0.0001 116 | threads=1 117 | blast_command="blastn" 118 | 119 | #PARSE VARIABLE ARGUMENTS WITH getops 120 | #common example with letters, for long options check longopts2getopts.sh 121 | options=":i:o:p:f:d:q:t:e:T:vh" 122 | while getopts $options opt; do 123 | case $opt in 124 | i ) 125 | input_file=$OPTARG 126 | ;; 127 | d ) 128 | database=$OPTARG 129 | ;; 130 | o ) 131 | output_dir=$OPTARG 132 | ;; 133 | p) 134 | prefix=$OPTARG 135 | ;; 136 | f) 137 | file_name=$OPTARG 138 | ;; 139 | t ) 140 | database_type=$OPTARG 141 | ;; 142 | q ) 143 | query_type=$OPTARG 144 | ;; 145 | g ) 146 | group=$OPTARG 147 | ;; 148 | e ) 149 | evalue=$OPTARG 150 | ;; 151 | T) 152 | threads=$OPTARG 153 | ;; 154 | h ) 155 | usage 156 | exit 1 157 | ;; 158 | v ) 159 | echo $VERSION 160 | exit 1 161 | ;; 162 | \?) 163 | echo "Invalid Option: -$OPTARG" 1>&2 164 | usage 165 | exit 1 166 | ;; 167 | : ) 168 | echo "Option -$OPTARG requires an argument." 
>&2 169 | exit 1 170 | ;; 171 | * ) 172 | echo "Unimplemented option: -$OPTARG" >&2; 173 | exit 1 174 | ;; 175 | 176 | esac 177 | done 178 | shift $((OPTIND-1)) 179 | 180 | #================================================================ 181 | # MAIN_BODY 182 | #================================================================ 183 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS 184 | 185 | echo -e "\n#Executing" $0 "\n" 186 | 187 | check_mandatory_files.sh $input_file $database 188 | 189 | #check_dependencies.sh blastn 190 | 191 | 192 | if [ ! $prefix ]; then 193 | echo "please provide a prefix to identify this blast analysis" 194 | exit 1 195 | fi 196 | 197 | if [ $query_type == "prot" ] || [ $query_type == "nucl" ]; then 198 | echo "query type selected as" $database_type 199 | else 200 | echo "please provide a proper query type" 201 | exit 1 202 | fi 203 | 204 | if [ $query_type == "prot" ]; then 205 | blast_command="tblastn" 206 | fi 207 | 208 | if [ ! $output_dir ]; then 209 | output_dir=$(dirname $input_file) 210 | echo "Default output directory is" $output_dir 211 | mkdir -p $output_dir 212 | else 213 | echo "Output directory is" $output_dir 214 | mkdir -p $output_dir 215 | fi 216 | 217 | if [ ! $file_name ]; then 218 | file_name=$(basename $input_file | cut -d. -f1) 219 | echo "filename is" $file_name 220 | fi 221 | 222 | database_name=$(basename $database) 223 | database_dir=$(dirname $database) 224 | 225 | ##BLAST EXECUTION 226 | 227 | echo "$(date)" 228 | echo "Blasting" $file_name "agaist" $database_name 229 | 230 | makeblastdb -in $database -out $database_dir/$database_name".blast.tmp" -dbtype $database_type || error ${LINENO} $(basename $0) "Makeblastdb command failed. See $output_dir/logs for more information." 
231 | 232 | echo "BLAST command is" $blast_command 233 | 234 | $blast_command -query $input_file \ 235 | -db $database_dir/$database_name".blast.tmp" \ 236 | -out $output_dir/$file_name"."$prefix".blast" \ 237 | -evalue $evalue \ 238 | -num_threads $threads \ 239 | -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen" || error ${LINENO} $(basename $0) "Blastn command failed. See $output_dir/logs for more information" 240 | 241 | 242 | echo "$(date)" 243 | echo "Done blasting" $file_name "agaist" $database_name 244 | echo -e "blasted file can be found in" $output_dir/$file_name"."$prefix".blast" "\n" 245 | -------------------------------------------------------------------------------- /bin/blast_to_bed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list, 4 | #or a compound command returns a non-zero status: If errors are not handled by user 5 | #set -e 6 | #set -x 7 | 8 | #============================================================= 9 | # HEADER 10 | #============================================================= 11 | 12 | #INSTITUTION:ISCIII 13 | #CENTRE:BU-ISCIII 14 | #AUTHOR: Pedro J. 
#LONG USAGE FUNCTION
#######################################
# Print the command-line help of blast_to_bed.sh.
# The defaults quoted in the text mirror the values assigned in the
# "DECLARE FLAGS AND VARIABLES" section of this script.
# Arguments: none ($0 is expanded into the synopsis)
# Outputs:   help text on stdout
#######################################
usage() {
	cat << EOF

blast_to_bed is a script that obtains a BED file with coordinates of local blast alignments matching some given conditions

usage : $0 <-i inputfile(.blast)> [-o <directory>] [-b <int(0-100)>] [-l <int(0-100)>] [-L <int>]
		[-d <delimiter>] [-D (l|r)] [-q <delimiter>] [-Q (l|r)] [-U <delimiter>] [-I] [-u] [-v] [-h]

	-i input file
	-b blast identity cutoff (0 - 100), default 90
	-l blast length percentage cutoff (0 - 100), default 10, use 90 for genes
	-L blast length alignment cutoff, default 0, use 200 or 500 for contigs
	-o output directory (optional). By default the output is written next to the input file
	-q query character delimiter, default "_"
	-Q query field to retrieve (l=left, r=right), default left
	-d database character delimiter, default "_"
	-D database field to retrieve (l=left, r=right), default right
	-I contig mode
	-u unique. Outputs only one query entry per database entry
	-U unique mode with delimiter. Outputs only one delimited query per database entry
	-v version
	-h display usage message

example: blast_to_bed.sh -i ecoli_prefix.blast -b 80 -l 50 -q - -Q r

EOF
}
provide a percentage between 0 and 100" 117 | exit 1 118 | else 119 | blast_id_cutoff=$OPTARG 120 | fi 121 | ;; 122 | o ) 123 | output_dir=$OPTARG 124 | ;; 125 | l ) 126 | if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then 127 | echo "please, provide a percentage between 0 and 100" 128 | exit 1 129 | else 130 | blast_len_percentage=$OPTARG 131 | fi 132 | ;; 133 | L ) 134 | blast_len_alignment=$OPTARG 135 | ;; 136 | d ) 137 | database_delimiter=$OPTARG 138 | ;; 139 | D ) 140 | database_field=$OPTARG 141 | ;; 142 | q ) 143 | query_delimiter=$OPTARG 144 | ;; 145 | Q ) 146 | query_field=$OPTARG 147 | ;; 148 | u ) 149 | unique=true 150 | suffix=".unique.tmp" 151 | ;; 152 | U ) 153 | unique_divider=true 154 | suffix=".unique.divider.tmp" 155 | divider_delimiter=$OPTARG 156 | ;; 157 | I) 158 | id_circos=true 159 | id_output=",\"id=\"query_name[length(query_name)]" 160 | ;; 161 | h ) 162 | usage 163 | exit 1 164 | ;; 165 | v ) 166 | echo $VERSION 167 | exit 1 168 | ;; 169 | \?) 170 | echo "Invalid Option: -$OPTARG" 1>&2 171 | usage 172 | exit 1 173 | ;; 174 | : ) 175 | echo "Option -$OPTARG requires an argument." >&2 176 | exit 1 177 | ;; 178 | * ) 179 | echo "Unimplemented option: -$OPTARG" >&2; 180 | exit 1 181 | ;; 182 | 183 | esac 184 | done 185 | shift $((OPTIND-1)) 186 | 187 | #================================================================ 188 | # MAIN_BODY 189 | #================================================================ 190 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS 191 | 192 | echo -e "\n#Executing" $0 "\n" 193 | 194 | check_mandatory_files.sh $input_file 195 | 196 | 197 | blast_len_percentage_value=$(echo "($blast_len_percentage/100)" | bc -l) 198 | #blast_len_percentage_decimal=$(echo $blast_len_percentage_value | sed 's/0\{1,\}$//') 199 | 200 | 201 | if [ ! 
##CHECK FIELDS TO RETRIEVE
# -D / -Q take a side selector: l (left) or r (right). The selector is turned
# into the awk subscript that is later spliced into the awk program:
#   l -> 1                     first piece of the split sequence name
#   r -> length(<array name>)  last piece of the split sequence name
# A quoted `case` is used so that an empty or multi-word value reaches the
# explicit error message instead of making `[` fail with a syntax error.

case "$database_field" in
	l)
		database_field="1"
		;;
	r)
		database_field="length(database_name)"
		;;
	*)
		echo "Please introduce r or l for database"
		exit 1
		;;
esac

case "$query_field" in
	l)
		query_field="1"
		;;
	r)
		query_field="length(query_name)"
		;;
	*)
		# The accepted values are l|r (the old text asked for "0 or 1")
		echo "Please introduce r or l for query"
		exit 1
		;;
esac
##OPTIONAL DE-DUPLICATION OF THE BED FILE
# When -u or -U was given, the filtering step wrote to "<name>.bed<suffix>";
# the passes below reduce that temporary file into the final "<name>.bed".
bed_final=$output_dir/$file_name".bed"
bed_tmp=$bed_final$suffix

# -u : keep the first line seen for every (column 1, column 4) pair
if [[ $unique == true ]]; then
	echo "unique option enabled"
	if ! awk '
		(!x[$1$4]++)
		' $bed_tmp > $bed_final; then
		error ${LINENO} $(basename $0) "AWK command fail in $file_name\".bed\". See $output_dir/logs for more information."
	fi
	rm $bed_tmp
fi

# -U : same idea, but column 4 is first cut at the divider delimiter and only
# its first piece takes part in the uniqueness key
if [[ $unique_divider == true ]]; then
	echo "unique delimiter option enabled"
	if ! awk '
		{split($4,query,"'"${divider_delimiter}"'")}
		(!x[query[1]$1]++)
		' $bed_tmp > $bed_final; then
		error ${LINENO} $(basename $0) "AWK command fail in $file_name\".bed\"$suffix. See $output_dir/logs for more information."
	fi
	rm $bed_tmp
fi

##FINAL REPORT
date
echo "DONE adapting blast to bed"
echo -e "File can be found at" $bed_final "\n"
#LONG USAGE FUNCTION
#######################################
# Print the command-line help of blast_to_complete.sh.
# The defaults quoted in the text mirror the values assigned in the
# "DECLARE FLAGS AND VARIABLES" section of this script
# (length percentage 15, database delimiter "-", both fields "r").
# Arguments: none ($0 is expanded into the synopsis)
# Outputs:   help text on stdout
#######################################
usage() {
	cat << EOF

blast_to_complete is a script that obtains full length of sequences from blast and adapts it to circos

usage : $0 <-i inputfile(.blast)> [-o <directory>] [-b <int(0-100)>] [-l <int(0-100)>]
		[-d <delimiter>] [-D (l|r)] [-q <delimiter>] [-Q (l|r)] [-I] [-u] [-v] [-h]

	-i input file
	-b blast identity cutoff (0 - 100), default 90
	-l blast length percentage cutoff (0 - 100), default 15, use 90 for genes
	-o output directory (optional). By default the output is written next to the input file
	-q query character delimiter, default "_"
	-Q query field to retrieve (l=left, r=right), default right
	-d database character delimiter, default "-"
	-D database field to retrieve (l=left, r=right), default right
	-I contig mode
	-u unique. Outputs only one query entry per database entry
	-v version
	-h display usage message

example: blast_to_complete.sh -i ecoli_prefix.blast
EOF
}
112 | fi 113 | ;; 114 | o ) 115 | output_dir=$OPTARG 116 | ;; 117 | l ) 118 | if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then 119 | echo "please, provide a percentage between 0 and 100" 120 | exit 1 121 | else 122 | blast_len_percentage=$OPTARG 123 | fi 124 | ;; 125 | d ) 126 | database_delimiter=$OPTARG 127 | ;; 128 | D ) 129 | database_field=$OPTARG 130 | ;; 131 | q ) 132 | query_delimiter=$OPTARG 133 | ;; 134 | Q ) 135 | query_field=$OPTARG 136 | ;; 137 | u ) 138 | unique=true 139 | suffix=".unique.tmp" 140 | ;; 141 | I) 142 | id_circos=true 143 | id_output=",\"id=\"database_name[length(database_name)]" 144 | ;; 145 | h ) 146 | usage 147 | exit 1 148 | ;; 149 | v ) 150 | echo $VERSION 151 | exit 1 152 | ;; 153 | \?) 154 | echo "Invalid Option: -$OPTARG" 1>&2 155 | usage 156 | exit 1 157 | ;; 158 | : ) 159 | echo "Option -$OPTARG requires an argument." >&2 160 | exit 1 161 | ;; 162 | * ) 163 | echo "Unimplemented option: -$OPTARG" >&2; 164 | exit 1 165 | ;; 166 | 167 | esac 168 | done 169 | shift $((OPTIND-1)) 170 | 171 | #================================================================ 172 | # MAIN_BODY 173 | #================================================================ 174 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS 175 | 176 | echo -e "\n#Executing" $0 "\n" 177 | 178 | check_mandatory_files.sh $input_file 179 | 180 | 181 | blast_len_percentage_value=$(echo "($blast_len_percentage/100)" | bc -l) 182 | #blast_len_percentage_decimal=$(echo $blast_len_percentage_value | sed 's/0\{1,\}$//') 183 | 184 | 185 | if [ ! $output_dir ]; then 186 | output_dir=$(dirname $input_file) 187 | #echo "Default output directory is" $output_dir 188 | mkdir -p $output_dir 189 | else 190 | #echo "Output directory is" $output_dir 191 | mkdir -p $output_dir 192 | fi 193 | 194 | 195 | if [ ! $file_name ]; then 196 | file_name=$(basename $input_file | cut -d. 
##CHECK FIELDS TO RETRIEVE
# -D / -Q take a side selector: l (left) or r (right). The selector is turned
# into the awk subscript spliced into the awk programs further down:
#   l -> 1                     first piece of the split sequence name
#   r -> length(<array name>)  last piece of the split sequence name
# A quoted `case` is used so that an empty or multi-word value reaches the
# explicit error message instead of making `[` fail with a syntax error.

case "$database_field" in
	l)
		database_field="1"
		;;
	r)
		database_field="length(database_name)"
		;;
	*)
		echo "Please introduce r or l for database"
		exit 1
		;;
esac

case "$query_field" in
	l)
		query_field="1"
		;;
	r)
		query_field="length(query_name)"
		;;
	*)
		# The accepted values are l|r (the old text asked for "0 or 1")
		echo "Please introduce r or l for query"
		exit 1
		;;
esac
query_name['"$query_field"'], "id="$13} }}}}}} 265 | ' \ 266 | >$output_dir/$file_name".complete"|| error ${LINENO} $(basename $0) "Awk command parsing blast output for circos input in $file_name\".complete\" creation failed. See $output_dir/logs for more information" 267 | 268 | 269 | cat $input_file |\ 270 | awk ' 271 | BEGIN{OFS="\t"} 272 | {split($1, query_name, "'"${query_delimiter}"'") 273 | split($2,database_name, "'"${database_delimiter}"'")} 274 | (($3 >= '"${blast_id_cutoff}"') && (($4/$13)>='"${blast_len_percentage_value}"') && (!x[$2$1]++)) \ 275 | {{isInverted=($10-$9) 276 | ext2=($13-$8)} 277 | {if (isInverted < 0) 278 | {pos1=$10 279 | pos2=$9} 280 | else 281 | {pos1 =$9 282 | pos2=$10}; \ 283 | {if ((isInverted < 0) && (($14 - pos2) < $7)) 284 | {coordChr1=1 285 | coordChr2=($7-($14-pos2)) 286 | {print database_name['"$database_field"'], coordChr1, coordChr2, query_name['"$query_field"'], "id="$13}} 287 | {if ((isInverted < 0) && (ext2 > pos1)) 288 | {coordChr1=($14-(ext2-pos1)) 289 | coordChr2=$14 290 | {print database_name['"$database_field"'], coordChr1, coordChr2, query_name['"$query_field"'], "id="$13}} 291 | {if ((isInverted > 0) && (pos1 < $7)) 292 | {coordChr1=($14-($7-pos1)) 293 | coordChr2=$14 294 | {print database_name['"$database_field"'], coordChr1, coordChr2, query_name['"$query_field"'], "id="$13}} 295 | {if ((isInverted > 0) && (ext2 > ($14-pos2))) 296 | {coordChr1=1 297 | coordChr2=(ext2-($14-pos2)) 298 | {print database_name['"$database_field"'], coordChr1, coordChr2, query_name['"$query_field"'], "id="$13} 299 | } 300 | }}}}}} 301 | ' \ 302 | >>$output_dir/$file_name".complete" || error ${LINENO} $(basename $0) "Awk command parsing blast output for circos input in $file_name\".complete\" second step creation failed. 
##FINAL REPORT
# Timestamp, completion notice and location of the generated file.
# The last argument must be "\n" (backslash): with echo -e it yields the
# trailing blank line the sibling scripts print; "/n" was emitted literally.
echo "$(date)"
echo "DONE adapting blast to complete"
echo -e "File can be found at" "$output_dir/$file_name.complete" "\n"
# Error handling
#######################################
# Print a framed error report on stdout and terminate the script.
# Globals:   RED, NC (written: ANSI colour escape sequences)
# Arguments: $1 - line number where the failure was detected
#            $2 - name of the failing script
#            $3 - optional message; the MESSAGE section is omitted when empty
#            $4 - optional exit status, defaults to 1
# Returns:   never returns; exits with the status in $4
#######################################
error(){
	local lineno="$1" origin="$2" detail="$3"
	local status="${4:-1}"
	local rule="\n---------------------------------------\n"

	RED='\033[0;31m'
	NC='\033[0m'

	# The frame and the headline are common to both report shapes
	echo -e "$rule"
	echo -e "${RED}ERROR${NC} in Script $origin on or near line ${lineno}; exiting with status ${status}"
	if [[ -n "$detail" ]]; then
		echo -e "MESSAGE:\n"
		echo -e "$detail"
	fi
	echo -e "$rule"

	exit "${status}"
}
#PARSE VARIABLE ARGUMENTS WITH getopts
#common example with letters, for long options check longopts2getopts.sh
options=":i:b:q:Q:d:D:o:l:L:Iuvh"

#######################################
# Succeed when $1 is a plain number between 0 and 100 (integer or decimal).
# Replaces the former `[ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]` test, which
# let non-numeric text through (both comparisons fail with status 2) and the
# value was then spliced verbatim into the awk programs.
#######################################
is_percentage() {
	local re='^0*(100(\.0+)?|[0-9]{1,2}(\.[0-9]+)?)$'
	[[ "$1" =~ $re ]]
}

#######################################
# Parse the command-line flags into the script's global variables.
# Globals:   options (read); input_file, output_dir, blast_id_cutoff,
#            blast_len_percentage, blast_len_alignment, database_delimiter,
#            database_field, query_delimiter, query_field, unique, suffix,
#            id_circos, id_output, OPTIND (written)
# Arguments: the script's positional parameters ("$@")
# Returns:   0 on success; exits 1 on -h, -v, or any invalid option/value
#######################################
parse_args() {
	OPTIND=1
	while getopts "$options" opt; do
		case "$opt" in
			i )
				input_file=$OPTARG
				;;
			b )
				if ! is_percentage "$OPTARG"; then
					echo "please, provide a percentage between 0 and 100"
					exit 1
				fi
				blast_id_cutoff=$OPTARG
				;;
			o )
				output_dir=$OPTARG
				;;
			l )
				if ! is_percentage "$OPTARG"; then
					echo "please, provide a percentage between 0 and 100"
					exit 1
				fi
				blast_len_percentage=$OPTARG
				;;
			L )
				# "L:" was declared in $options without a matching arm, so
				# -L always ended in the "Unimplemented option" branch.
				# NOTE(review): the link-building awk in this script does not
				# appear to consume blast_len_alignment yet - confirm.
				blast_len_alignment=$OPTARG
				;;
			d )
				database_delimiter=$OPTARG
				;;
			D )
				database_field=$OPTARG
				;;
			q )
				query_delimiter=$OPTARG
				;;
			Q )
				query_field=$OPTARG
				;;
			u )
				unique=true
				suffix=".unique.tmp"
				;;
			I )
				id_circos=true
				# awk expression fragment appended to the print statement
				id_output=",\"id=\"query_name[length(query_name)]"
				;;
			h )
				usage
				exit 1
				;;
			v )
				echo $VERSION
				exit 1
				;;
			\? )
				echo "Invalid Option: -$OPTARG" 1>&2
				usage
				exit 1
				;;
			: )
				echo "Option -$OPTARG requires an argument." >&2
				exit 1
				;;
			* )
				# $opt holds the option letter here; $OPTARG would be its value
				echo "Unimplemented option: -$opt" >&2
				exit 1
				;;
		esac
	done
}

parse_args "$@"
shift $((OPTIND-1))
##CHECK FIELDS TO RETRIEVE
# -D / -Q take a side selector: l (left) or r (right). The selector is turned
# into the awk subscript spliced into the awk programs further down:
#   l -> 1                     first piece of the split sequence name
#   r -> length(<array name>)  last piece of the split sequence name
# A quoted `case` is used so that an empty or multi-word value reaches the
# explicit error message instead of making `[` fail with a syntax error.

case "$database_field" in
	l)
		database_field="1"
		;;
	r)
		database_field="length(database_name)"
		;;
	*)
		echo "Please introduce r or l for database"
		exit 1
		;;
esac

case "$query_field" in
	l)
		query_field="1"
		;;
	r)
		query_field="length(query_name)"
		;;
	*)
		# The accepted values are l|r (the old text asked for "0 or 1")
		echo "Please introduce r or l for query"
		exit 1
		;;
esac
##Obtain coordinates query --> ddbb
# Two-file awk pass:
#   1st file (<name>.dict_length_percentage): one "$1$2" key per line, loaded
#            into contigPlasmid (NR==FNR branch).
#   2nd file ($input_file, tabular blast): for every row whose "$1$2" key is in
#            the dictionary, whose column 3 is >= $blast_id_cutoff and whose
#            $4/$13 ratio is >= 0.03 (hard-coded), print
#            <query piece> $7 $8 <database piece> $9 $10 [id=<last query piece>]
# The query/database pieces are selected with the subscripts stored in
# $query_field / $database_field after splitting on the configured delimiters.
# NOTE(review): column meaning presumably follows the outfmt used by
# blast_align.sh (3=pident, 4=length, 7-8=qstart/qend, 9-10=sstart/send,
# 13=qlen) - confirm against the producer of $input_file.
# No OFS is set here, so the output fields are separated by single spaces.

awk '
	NR==FNR{contigPlasmid[$1]=$1;next}
	{split($2, database_name, "'"${database_delimiter}"'")
	split($1, query_name, "'"${query_delimiter}"'")
	header=$1$2}
	{if ((header in contigPlasmid) && ($3>='"${blast_id_cutoff}"') && (($4/$13)>=0.03))
		print query_name['"$query_field"'], $7,$8,database_name['"$database_field"'],$9,$10'"$id_output"'}' \
	$output_dir/$file_name".dict_length_percentage" $input_file \
	> $output_dir/$file_name."blast.links" || error ${LINENO} $(basename $0) "Awk command in $file_name\".blast.links\" creation failed. See $output_dir/logs for more information"

##Change coordinates from query --> ddbb to ddbb-->ddbb in order to represent them in CIRCOSS
# Reads <name>.blast.links row by row and relies on rows of the same query
# (column 1) being adjacent:
#   - a row whose column 1 differs from the previous one starts a new group
#     (the chr array is emptied);
#   - any further row of the same group is printed once against every
#     database segment already stored for that group, as
#     "<$4> <$5> <$6> <stored segment> id=<query>";
#   - every row then stores its own "$4 $5 $6" segment, keyed by $4$5$6.
# Fields are joined with explicit " " literals, so the OFS="\t" set in BEGIN
# does not affect the printed line.

awk '
	BEGIN{OFS="\t"}
	{
	if($1 != savedNode)
		{savedNode= $1; delete chr}
	else{for(i in chr)
		{print $4" "$5" "$6" "chr[i]" id="savedNode}
	}
	chr[$4$5$6] = $4" "$5" "$6}' \
	$output_dir/$file_name."blast.links" \
	> $output_dir/$file_name."links" || error ${LINENO} $(basename $0) "Awk command in $file_name\".links\" creation failed. See $output_dir/logs for more information"


# The dictionary was only an intermediate lookup table for the first pass
rm $output_dir/$file_name".dict_length_percentage"

echo "$(date)"
echo "DONE adapting blast to link"
echo -e "File can be found at" $output_dir/$file_name".links" "\n"
Sola 12 | VERSION=1.0 13 | #CREATED: 15 March 2018 14 | #REVISION: 15 | # 19 March 2018: Complete usage info 16 | # 19 March 2018: Check mandatory files. folders and variables 17 | #DESCRIPTION:Script that index a database and map a supplied pair-end sequences 18 | #TODO 19 | # -Handle files extensions for bowtie, now is fastq by default 20 | #================================================================ 21 | # END_OF_HEADER 22 | #================================================================ 23 | 24 | #SHORT USAGE RULES 25 | #LONG USAGE FUNCTION 26 | usage() { 27 | cat << EOF 28 | 29 | Bowtie_mapper script index a database and map a supplied pair-end sequences 30 | 31 | usage : $0 [-i ] [-o ] <-d database(fasta)> <-s sample_name> <-1 R1> <-2 R2> 32 | [-g group_name] [-f ] [-T ] [-a] [-v] [-h] 33 | 34 | -i input directory (optional) 35 | -o output directory (optional) 36 | -d database to map (.fasta) 37 | -s sample name 38 | -g group name (optional). If unset, samples will be gathered in NO_GROUP group 39 | -1 reads corresponding to paired-end R1 40 | -2 reads corresponding to paired-end R2 41 | -f offrate index for bowtie_build (optional). Default value 1. 
# Error handling
#######################################
# Report a fatal error inside a dashed frame (stdout) and leave the script.
# Globals:   RED, NC (written: ANSI colour escape sequences)
# Arguments: $1 - line number reported in the headline
#            $2 - script name reported in the headline
#            $3 - optional free-text message
#            $4 - optional exit status (1 when omitted)
# Returns:   does not return; exits with the requested status
#######################################
error(){
	local where="$1"
	local who="$2"
	local why="$3"
	local status="${4:-1}"

	RED='\033[0;31m'
	NC='\033[0m'

	local separator="\n---------------------------------------\n"
	local headline="${RED}ERROR${NC} in Script $who on or near line ${where}; exiting with status ${status}"

	case "$why" in
		"")
			# No message supplied: frame + headline only
			echo -e "$separator"
			echo -e "$headline"
			echo -e "$separator"
			;;
		*)
			echo -e "$separator"
			echo -e "$headline"
			echo -e "MESSAGE:\n"
			echo -e "$why"
			echo -e "$separator"
			;;
	esac

	exit "${status}"
}
120 | R2=$OPTARG 121 | ;; 122 | f ) 123 | offrate=$OPTARG 124 | ;; 125 | T ) 126 | threads=$OPTARG 127 | ;; 128 | a) 129 | a_mapping="-a" 130 | ;; 131 | h ) 132 | usage 133 | exit 1 134 | ;; 135 | v ) 136 | echo $VERSION 137 | exit 1 138 | ;; 139 | \?) 140 | echo "Invalid Option: -$OPTARG" 1>&2 141 | usage 142 | exit 1 143 | ;; 144 | : ) 145 | echo "Option -$OPTARG requires an argument." >&2 146 | exit 1 147 | ;; 148 | * ) 149 | echo "Unimplemented option: -$OPTARG" >&2; 150 | exit 1 151 | ;; 152 | 153 | esac 154 | done 155 | shift $((OPTIND-1)) 156 | 157 | 158 | #================================================================ 159 | # MAIN_BODY 160 | #================================================================ 161 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS 162 | 163 | echo -e "\n#Executing" $0 "\n" 164 | 165 | check_dependencies.sh bowtie2-build bowtie2 166 | 167 | check_mandatory_files.sh $database $R1 $R2 168 | 169 | if [ ! $sample ]; then 170 | echo "ERROR: please, provide a sample name" 171 | usage 172 | exit 1 173 | fi 174 | 175 | if [ ! $output_dir ]; then 176 | output_dir=$cwd"/$group/$sample/mapping/" 177 | echo "Default output directory is" $output_dir 178 | mkdir -p $output_dir 179 | else 180 | echo "Output directory is" $output_dir 181 | mkdir -p $output_dir 182 | fi 183 | 184 | 185 | ########INDEXING############ 186 | ############################ 187 | 188 | files_bt2=$(ls $database*bt2 2> /dev/null | wc -l) 189 | 190 | 191 | if [ "$files_bt2" = "6" ];then \ 192 | echo "Found an indexed ddbb for" $(basename $database); 193 | echo "Omitting indexing" 194 | else 195 | echo "Building index of " $(basename $database); 196 | bowtie2-build \ 197 | --offrate $offrate \ 198 | $database $database || error ${LINENO} $(basename $0) "Bowtie2-build command failed. 
See $output_dir/logs for more information" 199 | fi 200 | 201 | ########MAPPING############# 202 | ############################ 203 | 204 | if [ -f $mappedDir/$sample.sorted.bam -a -f $mappedDir/$sample.sorted.bam.bai ];then \ 205 | echo "Found a mapping file for sample" $sample; 206 | echo "Omitting mapping" 207 | else 208 | echo "$(date)" 209 | echo mapping $R1 210 | echo mapping $R2 211 | 212 | bowtie2 \ 213 | -1 $R1 \ 214 | -2 $R2 \ 215 | -S $output_dir/$sample.sam \ 216 | -q \ 217 | --very-sensitive-local \ 218 | $a_mapping \ 219 | -p $threads \ 220 | -x $database || error ${LINENO} $(basename $0) "Bowtie2 command failed. See $output_dir/logs for more information" 221 | 222 | 223 | echo "$(date)" 224 | echo -e "DONE Mapping $sample of $group Group" "\n" 225 | fi 226 | 227 | -------------------------------------------------------------------------------- /bin/build_karyotype.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list, 4 | #or a compound command returns a non-zero status: If errors are not handled by user 5 | set -e 6 | #set -x 7 | 8 | #============================================================= 9 | # HEADER 10 | #============================================================= 11 | 12 | #INSTITUTION:ISCIII 13 | #CENTRE:BU-ISCIII 14 | #AUTHOR: Pedro J. 
# Sola
VERSION=1.0
#CREATED: 13 April 2018
#REVISION:
#DESCRIPTION:build_karyotype script that creates karyotype file for CIRCOS either for summary and individual image

#================================================================
# END_OF_HEADER
#================================================================

#SHORT USAGE RULES
#LONG USAGE FUNCTION
usage() {
  cat << EOF

build_karyotype script that creates karyotype file for CIRCOS either for summary and individual image

usage : $0 <-i inputfile(coverage)> [-o <directory>] [-f <file_name>] [-g <group_name>] [-R] <-k int(0-100)> <-K int(0-100)> [-v] [-h]

    -i input file
    -o output directory (optional). By default the file is replaced in the same location
    -f file name for identification
    -g group name for identification
    -R Reconstruct
    -K percentage value to display plasmids covered >= in summary image
    -k percentage value to display plasmids covered >= in individual image
    -v version
    -h display usage message

example: build_karyotype.sh -i ecoli.coverage -K 80 -k 50

EOF
}

#================================================================
# OPTION_PROCESSING
#================================================================
#Make sure the script is executed with arguments
if [[ $# -eq 0 ]]; then
  usage >&2
  exit 1
fi

# Error handling
# error LINENO SCRIPT [MESSAGE] [CODE]
#   Prints a framed error report (line, script name, optional message) and
#   terminates the script with CODE (default 1).
error() {
  local parent_lineno="$1"
  local script="$2"
  local message="$3"
  local code="${4:-1}"

  RED='\033[0;31m'
  NC='\033[0m'

  echo -e "\n---------------------------------------\n"
  echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
  if [[ -n "$message" ]]; then
    echo -e "MESSAGE:\n"
    echo -e "$message"
  fi
  echo -e "\n---------------------------------------\n"

  exit "${code}"
}

# is_percentage VALUE
#   Succeeds only when VALUE is a non-negative integer not greater than 100.
#   The plain `[ $x -lt 0 ]` test used before let non-numeric values through,
#   because a failing test inside `if` simply counts as false.
is_percentage() {
  [[ "$1" =~ ^[0-9]+$ ]] && ((10#$1 <= 100))
}

#DECLARE FLAGS AND VARIABLES
cwd="$(pwd)"
input_file="Input_file"
coverage_cutoff_input=100
reconstruct=false

#PARSE VARIABLE ARGUMENTS WITH getops
#common example with letters, for long options check longopts2getopts.sh
options=":i:o:f:g:K:k:Rvh"
while getopts "$options" opt; do
  case "$opt" in
    i)
      input_file=$OPTARG
      ;;
    o)
      output_dir=$OPTARG
      ;;
    f)
      file_name=$OPTARG
      ;;
    g)
      group_name=$OPTARG
      ;;
    R)
      reconstruct=true
      ;;
    K)
      if ! is_percentage "$OPTARG"; then
        echo "please, provide a summary percentage between 0 and 100"
        usage
        exit 1
      fi
      coverage_cutoff_summary_percentage=$OPTARG
      ;;
    k)
      if ! is_percentage "$OPTARG"; then
        echo "please, provide an individual percentage between 0 and 100"
        usage
        exit 1
      fi
      coverage_cutoff_individual_percentage=$OPTARG
      ;;
    h)
      usage
      exit 1
      ;;
    v)
      echo "$VERSION"
      exit 1
      ;;
    \?)
      echo "Invalid Option: -$OPTARG" 1>&2
      usage
      exit 1
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
    *)
      echo "Unimplemented option: -$OPTARG" >&2
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))

#================================================================
# MAIN_BODY
#================================================================
##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS

echo -e "\n#Executing" "$0" "\n"

check_mandatory_files.sh "$input_file"

# The coverage cutoffs are only used when filtering a coverage file, i.e. when
# not reconstructing. Fail early with a clear message instead of feeding an
# empty value to bc and, later, a broken program to awk.
if [[ "$reconstruct" != true ]]; then
  if [[ -z "$coverage_cutoff_summary_percentage" || -z "$coverage_cutoff_individual_percentage" ]]; then
    echo "please, provide both a summary (-K) and an individual (-k) percentage between 0 and 100" >&2
    usage
    exit 1
  fi
  # Column 5 of the coverage file is compared against 1 - percentage/100.
  coverage_cutoff_summary=$(echo "(1 - ($coverage_cutoff_summary_percentage/100))" | bc -l)
  coverage_cutoff_individual=$(echo "(1 - ($coverage_cutoff_individual_percentage/100))" | bc -l)
fi

if [[ -z "$output_dir" ]]; then
  output_dir=$(dirname "$input_file")
  #echo "Default output directory is" $output_dir
fi
mkdir -p "$output_dir"

if [[ -z "$file_name" ]]; then
  file_name=$(basename "$input_file" | cut -d. -f1)
fi

summary_file="$output_dir/$file_name.karyotype_summary.txt"
individual_file="$output_dir/$file_name.karyotype_individual.txt"

echo "FILE NAME" "$file_name"

echo "$(date)"
echo "Obtain list of cromosomes (idiogram) for CIRCOS karyotype file"
echo "Generating summary karyotype file with plasmids that mapped more than" "$coverage_cutoff_summary_percentage""%"

if [[ "$reconstruct" == true ]]; then

  # Reconstruct mode: every record becomes a chromosome, length taken from column 2.
  awk '{print "chr -", $1, $1, "0", $2, "id="$1}' "$input_file" \
    > "$summary_file" \
    || error "${LINENO}" "$(basename "$0")" "Awk command for karyotype summary in $file_name\".karyotype_summary.txt\" creation. See $output_dir/logs for more information"

  awk '{print "chr -", $1, $1, "0", $2, "id="$1}' "$input_file" \
    > "$individual_file" \
    || error "${LINENO}" "$(basename "$0")" "Awk command for karyotype individual in $file_name\".karyotype_individual.txt\" creation. See $output_dir/logs for more information."

else

  # Keep rows whose column 2 is 0 and whose column 5 is below the cutoff;
  # the chromosome length comes from column 4. The cutoff is handed to awk
  # with -v rather than spliced into the program text.
  awk -v cutoff="$coverage_cutoff_summary" '
    $2 == 0 && $5 < cutoff { print "chr -", $1, $1, "0", $4, "id="$1 }
  ' "$input_file" \
    > "$summary_file" \
    || error "${LINENO}" "$(basename "$0")" "Awk command for karyotype summary in $file_name\".karyotype_summary.txt\" creation. See $output_dir/logs for more information."

  echo "Generating individual karyotype file with plasmids that mapped more than" "$coverage_cutoff_individual_percentage""%"

  awk -v cutoff="$coverage_cutoff_individual" '
    $2 == 0 && $5 < cutoff { print "chr -", $1, $1, "0", $4, "id="$1 }
  ' "$input_file" \
    > "$individual_file" \
    || error "${LINENO}" "$(basename "$0")" "Awk command for karyotype individual in $file_name\".karyotype_individual.txt\" creation. See $output_dir/logs for more information"

fi

# $(( )) strips the padding some wc implementations add around the count.
summary_count=$(wc -l < "$summary_file")
individual_count=$(wc -l < "$individual_file")

echo "$(date)"
echo "Done Obtain list of cromosomes (idiogram) for CIRCOS karyotype file"
echo "Files can be found at" "$output_dir"
echo $((summary_count)) "sequences will be displayed on summary image"
echo -e $((individual_count)) "images will be created individually" "\n"

--------------------------------------------------------------------------------
/bin/calculate_seqlen.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Exit immediately if a pipeline, which may consist of a single simple command, a list,
#or a compound command returns a non-zero status: If errors are not handled by user
set -e

#=============================================================
# HEADER
#=============================================================

#INSTITUTION:ISCIII
#CENTRE:BU-ISCIII
#AUTHOR: Pedro J.
# Sola
VERSION=1.0
#CREATED: 20 March 2018
#REVISION:
#DESCRIPTION:Script that calculates the length of every sequence in a supplied FASTA file
#AKNOWLEDGE:
# -Adapted from klashxx: https://stackoverflow.com/questions/23992646/sequence-length-of-fasta-file/23992773
#================================================================
# END_OF_HEADER
#================================================================

#SHORT USAGE RULES
#LONG USAGE FUNCTION
usage() {
  cat << EOF

Calculate_seqlen script calculates a supplied FASTA length

usage : $0 <-i inputfile(.fasta)> [-o <directory>] [-n <filename>] [-r] [-v] [-h]

    -i input file
    -o output directory (optional). By default the file is replaced in the same location
    -n file name (optional). By default is the same name with .length extension
    -r remove ">" (greater-than) symbol from fasta header
    -v version
    -h display usage message

example: calculate_seqlen.sh -i ecoli.fasta

EOF
}

#================================================================
# OPTION_PROCESSING
#================================================================
#Make sure the script is executed with arguments
if [[ $# -eq 0 ]]; then
  usage >&2
  exit 1
fi

# Error handling
# error LINENO SCRIPT [MESSAGE] [CODE]
#   Prints a framed error report (line, script name, optional message) and
#   terminates the script with CODE (default 1).
error() {
  local parent_lineno="$1"
  local script="$2"
  local message="$3"
  local code="${4:-1}"

  RED='\033[0;31m'
  NC='\033[0m'

  echo -e "\n---------------------------------------\n"
  echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
  if [[ -n "$message" ]]; then
    echo -e "MESSAGE:\n"
    echo -e "$message"
  fi
  echo -e "\n---------------------------------------\n"

  exit "${code}"
}

#DECLARE FLAGS AND VARIABLES
remove_head=remove_head_false
cwd="$(pwd)"
file_name="file_name"
input_file="Input_file"

#PARSE VARIABLE ARGUMENTS WITH getops
#common example with letters, for long options check longopts2getopts.sh
options=":i:o:n:rvh"
while getopts "$options" opt; do
  case "$opt" in
    i)
      input_file=$OPTARG
      ;;
    o)
      output_dir=$OPTARG
      ;;
    n)
      filename=$OPTARG
      ;;
    r)
      remove_head="^>"
      ;;
    h)
      usage
      exit 1
      ;;
    v)
      echo "$VERSION"
      exit 1
      ;;
    \?)
      echo "Invalid Option: -$OPTARG" 1>&2
      usage
      exit 1
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
    *)
      echo "Unimplemented option: -$OPTARG" >&2
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))

#================================================================
# MAIN_BODY
#================================================================
##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS

echo -e "\n#Executing" "$0" "\n"

check_mandatory_files.sh "$input_file"

if [[ -z "$output_dir" ]]; then
  output_dir=$(dirname "$input_file")
  #echo "Default output directory is" $output_dir
fi
mkdir -p "$output_dir"

if [[ -z "$filename" ]]; then
  filename=$(basename "$input_file" | cut -d. -f1)
fi

# -r asks for the leading ">" of each header to be dropped from the output.
strip_marker=0
if [[ "$remove_head" == "^>" ]]; then
  strip_marker=1
fi

# One output line per record: "<first word of header>\t<sequence length>".
#  * `seen` (not the running length) decides whether a previous record has to
#    be closed, so a record with an empty sequence still gets its "0" instead
#    of having the next header glued onto the same line.
#  * The former `BEGIN {FS=="| "}` was a comparison, not an assignment, and
#    had no effect; fields are split on whitespace as before.
#  * The ">" removal is done here rather than in a trailing sed, so a failing
#    awk is no longer masked by the exit status of the last pipeline stage.
awk -v strip_marker="$strip_marker" '
  /^>/ {
    if (seen) print seqlen + 0
    name = $1
    if (strip_marker) sub(/^>/, "", name)
    printf "%s\t", name
    seen = 1
    seqlen = 0
    next
  }
  { seqlen += length($0) }
  END { if (seen || seqlen) print seqlen + 0 }
' "$input_file" \
  > "$output_dir/$filename.length" \
  || error "${LINENO}" "$(basename "$0")" "Awk command for bedtools seqlen calculation in $filename\".length\" creation. See $output_dir/logs for more information."

echo "$(date)"
echo "Done seqlen calculation"
echo "Files can be found at" "$output_dir"
--------------------------------------------------------------------------------
/bin/cdhit_cluster.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -e

#=============================================================
# HEADER
#=============================================================

#INSTITUTION:ISCIII
#CENTRE:BU-ISCIII
#AUTHOR: Pedro J. Sola
VERSION=1.0
#CREATED: 6 April 2018
#REVISION:
#DESCRIPTION:Script that uses cd-hit/psi-cd-hit to clusterize a FASTA file
#
#DOCUMENTATION
#
#
#Compare floats in BASH
#
#if [ $(echo "$cluster_cutoff > 0.7"|bc -l) -eq 1 ]; then
#	echo "YES"
#else
#	echo "NO"
#fi
#
#-d length of description in .clstr file, default 20. if set to 0,
#	it takes the fasta defline and stops at first space
#-s length difference cutoff, default 0.0
#	if set to 0.9, the shorter sequences need to be
#	at least 90% length of the representative of the cluster
#-B 1 or 0, default 0, by default, sequences are stored in RAM
#	if set to 1, sequence are stored on hard drive
#	it is recommended to use -B 1 for huge databases
#-g 1 or 0, default 0
#	By cd-hit’s default algorithm, a sequence is clustered to the first
#	cluster that meet the threshold (fast mode).
If set to 1, the program 39 | # will cluster it into the most similar cluster that meet the threshold 40 | # (accurate but slow mode) 41 | # 42 | # PSI-CD-HIT 43 | #-G (1/0) use global identity? default 1, sequence identity 44 | # calculated as total identical residues of local alignments 45 | # length of shorter sequence 46 | # 47 | #-n 5 for thresholds 0.7 ~ 1.0 48 | #-n 4 for thresholds 0.6 ~ 0.7 49 | #-n 3 for thresholds 0.5 ~ 0.6 50 | #-n 2 for thresholds 0.4 ~ 0.5 51 | 52 | #================================================================ 53 | # END_OF_HEADER 54 | #================================================================ 55 | 56 | #SHORT USAGE RULES 57 | #LONG USAGE FUNCTION 58 | usage() { 59 | cat << EOF 60 | 61 | Cdhit_cluster script uses cd-hit/psi-cd-hit to clusterize a FASTA file 62 | 63 | usage : $0 <-i inputfile(FASTA)> [-o ] [-n ] [-c ] [-H ] 64 | [-T ] [-g group_name] [-s ] [-M ][-C <(0|1)>] [-G <(0|1)>] [-b ] [p] [-v] [-h] 65 | 66 | -i input file in FASTA format 67 | -c percentage threshold to cluster, default 80 68 | -H cd-hit command, default cd-hit-est 69 | -M max available memory (Mbyte), default 400 70 | -n file name 71 | -s length difference cutoff, default 0.8 72 | -g group name (optional). If unset, samples will be gathered in NO_GROUP group 73 | -p runs psi-cd-hit instead of cd-hit 74 | -C psi-cd-hit only: circular sequences, default 1. 
If set to 0 sequence is assumed lineal 75 | -G psi-cd-hit only: gobal identity, -G 0 only takes the first local alignment for clustering 76 | -b psi-cd-hit only: choose blast program, default blastn 77 | -T number of threads 78 | -v version 79 | -h display usage message 80 | 81 | 82 | Output directory is the same as input directory 83 | 84 | example: cdhit_cluster -i ecoli.fasta -c 90 -M 50000 -T 0 85 | cdhit_cluster -H cd-hit -i ecoli.fasta -c 90 -M 50000 -T 0 86 | 87 | 88 | EOF 89 | } 90 | 91 | #================================================================ 92 | # OPTION_PROCESSING 93 | #================================================================ 94 | #Make sure the script is executed with arguments 95 | if [ $# = 0 ] ; then 96 | usage >&2 97 | exit 1 98 | fi 99 | 100 | # Error handling 101 | error(){ 102 | local parent_lineno="$1" 103 | local script="$2" 104 | local message="$3" 105 | local code="${4:-1}" 106 | 107 | RED='\033[0;31m' 108 | NC='\033[0m' 109 | 110 | if [[ -n "$message" ]] ; then 111 | echo -e "\n---------------------------------------\n" 112 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" 113 | echo -e "MESSAGE:\n" 114 | echo -e "$message" 115 | echo -e "\n---------------------------------------\n" 116 | else 117 | echo -e "\n---------------------------------------\n" 118 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" 119 | echo -e "\n---------------------------------------\n" 120 | fi 121 | 122 | exit "${code}" 123 | } 124 | 125 | #DECLARE FLAGS AND VARIABLES 126 | cwd="$(pwd)" 127 | group="NO_GROUP" 128 | input_file="Input_file" 129 | cluster_cutoff=0.8 130 | max_memory=400 131 | length_cutoff=0.8 132 | cd_hit_command=cd-hit-est 133 | is_circle=1 134 | global_psi_cd_hit=1 135 | psi_cd_hit_program=blastn 136 | word_size=0 137 | threads=0 138 | 139 | #PARSE VARIABLE ARGUMENTS WITH getops 140 | #common example with 
letters, for long options check longopts2getopts.sh 141 | options=":i:o:c:M:n:s:g:C:G:b:T:H:pvh" 142 | while getopts $options opt; do 143 | case $opt in 144 | i ) 145 | input_file=$OPTARG 146 | ;; 147 | 148 | c ) 149 | cluster_cutoff_input=$OPTARG 150 | ;; 151 | g) 152 | group=$OPTARG 153 | ;; 154 | H) 155 | cd_hit_command=$OPTARG 156 | ;; 157 | M ) 158 | max_memory=$OPTARG 159 | ;; 160 | n ) 161 | file_name=$OPTARG 162 | ;; 163 | s ) 164 | length_cutoff=$OPTARG 165 | ;; 166 | p ) 167 | cd_hit_command=psi-cd-hit.pl 168 | ;; 169 | C ) 170 | is_circle=$OPTARG 171 | ;; 172 | G) 173 | global_psi_cd_hit=$OPTARG 174 | ;; 175 | T) 176 | threads=$OPTARG 177 | ;; 178 | b) 179 | psi_cd_hit_program=$OPTARG 180 | ;; 181 | h ) 182 | usage 183 | exit 1 184 | ;; 185 | v ) 186 | echo $VERSION 187 | exit 1 188 | ;; 189 | \?) 190 | echo "Invalid Option: -$OPTARG" 1>&2 191 | usage 192 | exit 1 193 | ;; 194 | : ) 195 | echo "Option -$OPTARG requires an argument." >&2 196 | exit 1 197 | ;; 198 | * ) 199 | echo "Unimplemented option: -$OPTARG" >&2; 200 | exit 1 201 | ;; 202 | 203 | esac 204 | done 205 | shift $((OPTIND-1)) 206 | 207 | #================================================================ 208 | # MAIN_BODY 209 | #================================================================ 210 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS 211 | 212 | echo -e "\n#Executing" $0 "\n" 213 | 214 | 215 | check_mandatory_files.sh $input_file 216 | 217 | check_dependencies.sh cd-hit-est 218 | #psi-cd-hit.pl 219 | 220 | 221 | 222 | # Set word size (parameter -n for cd-hit) as author recomends 223 | #according to clustering percentage 224 | 225 | 226 | cluster_cutoff=$(echo "$cluster_cutoff_input / 100" | bc -l | sed 's/0\{1,\}$//') 227 | #cluster_cutoff=${cluster_cutoff%.*} #Remove float value 228 | 229 | 230 | if [[ "$cluster_cutoff_input" -gt 70 && "$cluster_cutoff_input" -le 100 ]]; then 231 | word_size=5 232 | elif [[ "$cluster_cutoff_input" -gt 60 && "$cluster_cutoff_input" 
-le 70 ]]; then 233 | word_size=4 234 | elif [[ "$cluster_cutoff_input" -gt 50 && "$cluster_cutoff_input" -le 60 ]]; then 235 | word_size=3 236 | elif [[ "$cluster_cutoff_input" -ge 40 && "$cluster_cutoff_input" -le 50 ]]; then 237 | word_size=2 238 | else 239 | echo "please introduce a valid cluster percentage value between 0.4 and 1" 240 | exit 1 241 | fi 242 | 243 | 244 | 245 | if [ ! $output_dir ]; then 246 | output_dir=$(dirname $input_file) 247 | echo "Default output directory is" $output_dir 248 | mkdir -p $output_dir 249 | else 250 | echo "Output directory is" $output_dir 251 | mkdir -p $output_dir 252 | fi 253 | 254 | if [ ! $file_name ]; then 255 | file_name=$(basename $input_file) 256 | echo "filename is" $file_name 257 | fi 258 | 259 | ##CD-HIT EXECUTION 260 | 261 | echo "$(date)" 262 | echo "Clustering sequences with identity" $cluster_cutoff_input"% or higher" 263 | echo "Using" $cd_hit_command "with file" $input_file 264 | seq_number_prev_clstr=$(cat $input_file | grep ">" | wc -l) 265 | 266 | cd $(dirname $input_file) 267 | 268 | if [ -f $output_dir/$file_name""_""$cluster_cutoff_input ]; then \ 269 | echo "Found a clustered file for sample" $file_name; 270 | echo "Omitting clustering process calculation" 271 | exit 1 272 | else 273 | if [ $cd_hit_command == "psi-cd-hit.pl" ]; then 274 | 275 | check_dependencies.sh psi-cd-hit.pl 276 | $cd_hit_command -i $(basename $input_file) -o $file_name""_""$cluster_cutoff_input -c $cluster_cutoff -G $global_psi_cd_hit -g 1 -prog $psi_cd_hit_program -circle $is_circle -core $threads || error ${LINENO} $(basename $0) "PSI-CD-HIT command failed. See $output_dir/logs for more information." 277 | 278 | else 279 | 280 | $cd_hit_command -i $(basename $input_file) -o $file_name""_""$cluster_cutoff_input -c $cluster_cutoff -n $word_size -d 0 -s $length_cutoff -B 1 -M $max_memory -T $threads|| error ${LINENO} $(basename $0) "CD-HIT command failed. 
See $output_dir/logs for more information" 281 | 282 | fi 283 | fi 284 | 285 | seq_number_post_clstr=$(cat $file_name""_""$cluster_cutoff_input | grep ">" | wc -l) 286 | 287 | echo "$(date)" 288 | echo "DONE Clustering sequences with identity" $cluster_cutoff_input"% or higher" 289 | echo "fasta file can be found in" $output_dir/$file_name""_""$cluster_cutoff_input 290 | echo "Previous number of sequences=" $seq_number_prev_clstr 291 | echo -e "Number of sequences after clustering=" $seq_number_post_clstr "\n" 292 | cd $cwd 293 | -------------------------------------------------------------------------------- /bin/check_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #============================================================= 4 | # HEADER 5 | #============================================================= 6 | 7 | #INSTITUTION:ISCIII 8 | #CENTRE:BU-ISCIII 9 | #AUTHOR: Pedro J. Sola 10 | VERSION=1.0 11 | #CREATED: 19 March 2018 12 | #REVISION: 12 July 2018: add formated output and colors 13 | #AKNOWLEDGE: Colored text: https://stackoverflow.com/questions/5947742/how-to-change-the-output-color-of-echo-in-linux 14 | #DESCRIPTION:Short function to evaluate if programs are on path 15 | 16 | #================================================================ 17 | # END_OF_HEADER 18 | #================================================================ 19 | 20 | #SHORT USAGE RULES 21 | #LONG USAGE FUNCTION 22 | usage() { 23 | cat << EOF 24 | 25 | Check_dependencies Short function to evaluate if files exist 26 | 27 | usage : $0 [program_name2] ... 

example: lib/check_dependencies.sh foo bar

EOF
}

if [[ $# -eq 0 ]]; then
  usage >&2
  exit 1
fi

#DECLARE FLAGS AND VARIABLES
missing_dependencies=0

#SET COLORS

RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m'

printf '\n%s\t%20s\n' "DEPENDENCY" "STATUS"
printf '%s\t%20s\n' "----------" "------"

# One table row per requested program: the name left-justified in a 29-column
# field (the width the former wc/printf arithmetic produced), then a coloured
# status. Names longer than the field are printed in full.
for command in "$@"; do
  #dependency_version=$($command --version)
  printf '%-29s' "$command"
  # `command -v` is the portable replacement for `which`.
  if command -v "$command" > /dev/null 2>&1; then
    # %b expands the colour escapes without using data as the format string.
    printf '%b\n' "${GREEN}INSTALLED${NC} "
  else
    printf '%b\n' "${RED}NOT INSTALLED${NC} "
    missing_dependencies=$((missing_dependencies + 1))
  fi
done

# A single missing program is enough to make the caller stop.
if [[ $missing_dependencies -gt 0 ]]; then
  printf '%b\n' "${RED}ERROR${NC}: $missing_dependencies missing dependencies, aborting execution" >&2
  exit 1
fi

--------------------------------------------------------------------------------
/bin/check_mandatory_files.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#=============================================================
# HEADER
#=============================================================

#INSTITUTION:ISCIII
#CENTRE:BU-ISCIII
#AUTHOR: Pedro J.
# Sola
VERSION=1.0
#CREATED: 19 March 2018
#REVISION:
#DESCRIPTION:Short function to evaluate if files exist

#================================================================
# END_OF_HEADER
#================================================================

#SHORT USAGE RULES
#LONG USAGE FUNCTION
usage() {
  cat << EOF

Check_mandatory_files Short function to evaluate if files exist

usage : $0 <file1> [file2] ...

example: lib/check_mandatory_files.sh foo.txt bar.fasta

EOF
}

# Being called without any file is a failure, not a success: a caller that
# passes an empty, unquoted variable must not sail through the check.
if [[ $# -eq 0 ]]; then
  usage >&2
  exit 1
fi

#DECLARE FLAGS AND VARIABLES
missing_files=0

# Report every argument that is not an existing regular file. The path is
# quoted so that an empty argument (which the unquoted test treated as
# present) and paths containing spaces are handled correctly.
for file in "$@"; do
  if [[ ! -f "$file" ]]; then
    echo "${file##*/}" "not supplied, please, introduce a valid file" >&2
    missing_files=$((missing_files + 1))
  fi
done

if [[ $missing_files -gt 0 ]]; then
  echo "ERROR: $missing_files missing files, aborting execution" >&2
  exit 1
fi
--------------------------------------------------------------------------------
/bin/coordinate_adapter.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Exit immediately if a pipeline, which may consist of a single simple command, a list,
#or a compound command returns a non-zero status: If errors are not handled by user
#set -e
#set -x

#=============================================================
# HEADER
#=============================================================

#INSTITUTION:ISCIII
#CENTRE:BU-ISCIII
#AUTHOR: Pedro J.
Sola 15 | VERSION=1.0 16 | #CREATED: 17 May 2018 17 | #REVISION: 18 | #DESCRIPTION:coordinate_adapter script adapt coordinates obtained with a bed file to a reference sequences in a link file 19 | # 20 | # 21 | #================================================================ 22 | # END_OF_HEADER 23 | #================================================================ 24 | 25 | #SHORT USAGE RULES 26 | #LONG USAGE FUNCTION 27 | usage() { 28 | cat << EOF 29 | 30 | coordinate_adapter script adapt coordinates obtained with a bed file to a reference sequences in a link file 31 | 32 | usage : $0 <-i inputfile(.bed)> <-l link_file> [-o ] [-n ] [-f ] [-u] [-v] [-h] 33 | 34 | -i input file in bed format 35 | -l link file with coordinates relationship within bed file ddbb and link reference 36 | -o output directory (optional). By default the file is placed in the same location as input 37 | -n length to extend annotation, default 2000 38 | -f file name 39 | -u uniq mode. Remove duplicates 40 | -p prokka mode. 
Remove suffix of prokka 41 | -v version 42 | -h display usage message 43 | 44 | example: ./coordinate_adapter.sh -i genes.bed -l ecoli.links -n 10000 45 | 46 | EOF 47 | } 48 | 49 | #================================================================ 50 | # OPTION_PROCESSING 51 | #================================================================ 52 | #Make sure the script is executed with arguments 53 | if [ $# = 0 ] ; then 54 | usage >&2 55 | exit 1 56 | fi 57 | 58 | # Error handling 59 | error(){ 60 | local parent_lineno="$1" 61 | local script="$2" 62 | local message="$3" 63 | local code="${4:-1}" 64 | 65 | RED='\033[0;31m' 66 | NC='\033[0m' 67 | 68 | if [[ -n "$message" ]] ; then 69 | echo -e "\n---------------------------------------\n" 70 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" 71 | echo -e "MESSAGE:\n" 72 | echo -e "$message" 73 | echo -e "\n---------------------------------------\n" 74 | else 75 | echo -e "\n---------------------------------------\n" 76 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" 77 | echo -e "\n---------------------------------------\n" 78 | fi 79 | 80 | exit "${code}" 81 | } 82 | 83 | #DECLARE FLAGS AND VARIABLES 84 | cwd="$(pwd)" 85 | input_file="Bed_file" 86 | link_file="Link_file" 87 | number_extension=2000 88 | unique=false 89 | prokka_mode=false 90 | suffix="" 91 | 92 | #PARSE VARIABLE ARGUMENTS WITH getops 93 | #common example with letters, for long options check longopts2getopts.sh 94 | options=":i:l:n:f:puvh" 95 | while getopts $options opt; do 96 | case $opt in 97 | i ) 98 | input_file=$OPTARG 99 | ;; 100 | l ) 101 | link_file=$OPTARG 102 | ;; 103 | o ) 104 | output_dir=$OPTARG 105 | ;; 106 | n ) 107 | number_extension=$OPTARG 108 | ;; 109 | f) 110 | file_name=$OPTARG 111 | ;; 112 | u ) 113 | unique=true 114 | suffix=".unique.tmp" 115 | ;; 116 | p ) 117 | prokka_mode=true 118 | suffix=".prokka.tmp" 119 | 
;; 120 | h ) 121 | usage 122 | exit 1 123 | ;; 124 | v ) 125 | echo $VERSION 126 | exit 1 127 | ;; 128 | \?) 129 | echo "Invalid Option: -$OPTARG" 1>&2 130 | usage 131 | exit 1 132 | ;; 133 | : ) 134 | echo "Option -$OPTARG requires an argument." >&2 135 | exit 1 136 | ;; 137 | * ) 138 | echo "Unimplemented option: -$OPTARG" >&2; 139 | exit 1 140 | ;; 141 | 142 | esac 143 | done 144 | shift $((OPTIND-1)) 145 | 146 | 147 | #================================================================ 148 | # MAIN_BODY 149 | #================================================================ 150 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS 151 | 152 | echo -e "\n#Executing" $0 "\n" 153 | 154 | check_mandatory_files.sh $input_file $link_file 155 | 156 | if [ ! $output_dir ]; then 157 | output_dir=$(dirname $input_file) 158 | #echo "Default output directory is" $output_dir 159 | mkdir -p $output_dir 160 | else 161 | #echo "Output directory is" $output_dir 162 | mkdir -p $output_dir 163 | fi 164 | 165 | 166 | if [ ! $file_name ]; then 167 | file_name=$(basename $input_file | cut -d. -f1,2) 168 | fi 169 | 170 | 171 | echo "$(date)" 172 | echo "adapting coordinates from" $input_file and $link_file 173 | echo "file name is:" $file_name 174 | 175 | #Create a dictionary file with all posibilities: Column 1 and 5 must have some common terms 176 | awk 'NR==FNR{a[NR]=$1;b[NR]=$0;next}{for(i = 1; i <= NR; ++i){if (a[i] == $1) print b[i],"\t", $0}}' \ 177 | $input_file $link_file > $output_dir/$file_name".coordinates.tmp" || error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates.tmp\" creation. See $output_dir/logs for more information." 
178 | 179 | awk '(($2 >= $6 - '"${number_extension}"' && $2 <= $7) || ($3 >= $6 && $3 <= $7 + '"${number_extension}"')) {{isInverted=($10-$9); \ 180 | genelength=($3-$2)};{if (isInverted < 0) {coordChr1=(($7-$3)+$10);} else {coordChr1=(($2-$6)+$9)}}; \ 181 | coordChr2=(coordChr1+genelength); {print $8, coordChr1, coordChr2, $4}}' $output_dir/$file_name".coordinates.tmp" > $output_dir/$file_name".coordinates.negatives"|| error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates.negatives\" creation. See $output_dir/logs for more information." 182 | 183 | 184 | #resulting in a bed file with coordinated of plasmid bur refering to contig annotation: 185 | #NZ_CP010574.1 34820 33528 arsB_1 186 | #NZ_CP008930.1 90527 89235 arsB_1 187 | #NZ_CP006927.1 44969 43677 arsB_1 188 | #NZ_CP010574.1 81021 82508 ltrA_1 189 | #NZ_CP008930.1 144220 145707 ltrA_1 190 | 191 | 192 | #Remove duplicate of several matches 193 | 194 | awk '($2 > 0) && ($3 > 0)' $output_dir/$file_name".coordinates.negatives" \ 195 | > $output_dir/$file_name".coordinates"$suffix || error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates$suffix\" creation. See $output_dir/logs for more information." 196 | 197 | 198 | if [ "$unique" == "true" ]; then 199 | awk ' 200 | (!x[$1$4]++) 201 | ' $output_dir/$file_name".coordinates"$suffix \ 202 | > $output_dir/$file_name".coordinates" || error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates\" creation. See $output_dir/logs for more information." 203 | 204 | rm $output_dir/$file_name".coordinates"$suffix 205 | fi 206 | 207 | if [ "$prokka_mode" == "true" ]; then 208 | 209 | awk ' 210 | (!uniq[$1$4]++) 211 | ' $output_dir/$file_name".coordinates"$suffix \ 212 | > $output_dir/$file_name".coordinates.prokka.unique.tmp"|| error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates.prokka.unique.tmp\" creation. See $output_dir/logs for more information." 
213 | 214 | 215 | awk ' 216 | BEGIN{OFS="\t"}{split($4, namelowbar, "_")} {$4=($4 !~ /CDS/) ? namelowbar[1] : $4}1 217 | ' $output_dir/$file_name".coordinates.prokka.unique.tmp" \ 218 | > $output_dir/$file_name".coordinates" || error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates\" creation. See $output_dir/logs for more information." 219 | 220 | rm $output_dir/$file_name".coordinates.prokka.unique.tmp" 221 | rm $output_dir/$file_name".coordinates"$suffix 222 | 223 | fi 224 | 225 | rm $output_dir/$file_name".coordinates.tmp" 226 | rm $output_dir/$file_name".coordinates.negatives" 227 | 228 | 229 | echo "$(date)" 230 | echo -e "Coordinates adapted to file" $output_dir/$file_name".coordinates" "\n" 231 | -------------------------------------------------------------------------------- /bin/download_plasmid_database.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Standard library imports 4 | import os 5 | import sys 6 | import logging 7 | 8 | # Third party imports 9 | import argparse 10 | import datetime 11 | import pandas as pd 12 | import Bio 13 | from Bio import Entrez 14 | from Bio import SeqIO 15 | 16 | logger = logging.getLogger() 17 | 18 | """ 19 | ============================================================= 20 | HEADER 21 | ============================================================= 22 | FUNCTION: Download up to date plasmid database from https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/plasmids.txt. 23 | Remove those sequences with terms not related to complete plasmid such: gene, protein, partial, putative or hypothetical 24 | 25 | INSTITUTION:CNM-ISCIII 26 | AUTHOR: Pedro J. 
Sola (pedroscampoy@gmail.com) 27 | d^v^b 28 | VERSION=0.1 29 | CREATED: 26 February 2020 30 | REVISION: 31 | 32 | TODO: 33 | add user defined terms 34 | filter by record size (len(record)) 35 | ================================================================ 36 | END_OF_HEADER 37 | ================================================================ 38 | """ 39 | 40 | 41 | def check_create_dir(path): 42 | if os.path.exists(path): 43 | pass 44 | else: 45 | os.mkdir(path) 46 | 47 | 48 | def main(): 49 | 50 | def get_arguments(): 51 | 52 | parser = argparse.ArgumentParser( 53 | prog='download_plasmid_database.py', description='Download up to date plasmid database from ncbi ftp') 54 | 55 | parser.add_argument('-o', '--output', type=str, required=True, 56 | help='REQUIRED. Output directory to extract plasmid database') 57 | 58 | arguments = parser.parse_args() 59 | 60 | return arguments 61 | 62 | args = get_arguments() 63 | 64 | output_dir = os.path.abspath(args.output) 65 | 66 | check_create_dir(output_dir) 67 | 68 | # LOGGING 69 | # Create log file with date and time 70 | today = str(datetime.date.today()) 71 | right_now_full = "".join(today.split("-")) 72 | 73 | log_filename = 'plasmidID_database' + "_" + right_now_full + ".log" 74 | log_full_path = os.path.join(output_dir, log_filename) 75 | 76 | logger = logging.getLogger() 77 | logger.setLevel(logging.DEBUG) 78 | 79 | formatter = logging.Formatter('%(asctime)s:%(message)s') 80 | 81 | file_handler = logging.FileHandler(log_full_path) 82 | file_handler.setLevel(logging.DEBUG) 83 | file_handler.setFormatter(formatter) 84 | 85 | stream_handler = logging.StreamHandler() 86 | stream_handler.setLevel(logging.INFO) 87 | # stream_handler.setFormatter(formatter) 88 | 89 | logger.addHandler(stream_handler) 90 | logger.addHandler(file_handler) 91 | 92 | #####################START PIPELINE################ 93 | 94 | logger.debug(args) 95 | 96 | plasmid_text_file = today + "_plasmids.txt" 97 | plasmid_text_path = 
os.path.join(output_dir, plasmid_text_file) 98 | 99 | plasmid_fasta_file = today + "_plasmids.fasta" 100 | plasmid_fasta_path = os.path.join(output_dir, plasmid_fasta_file) 101 | 102 | plasmid_failed_file = today + "failed_plasmids.txt" 103 | plasmid_failed_path = os.path.join(output_dir, plasmid_failed_file) 104 | 105 | try: 106 | df = pd.read_csv( 107 | 'https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/plasmids.txt', sep='\t') 108 | except: 109 | logger.info('there was a problem accessing the ftp') 110 | sys.exit(1) 111 | 112 | df.to_csv(plasmid_text_path, sep='\t', index=False) 113 | 114 | plasmid_reference = df['RefSeq'][df.RefSeq != 115 | "-"].tolist() + df['INSDC'][df.RefSeq == "-"].tolist() 116 | 117 | # remove duplicates 118 | plasmid_reference = set(plasmid_reference) 119 | # Set terms to exclude 120 | terms_to_exclude = ['gene ', 'protein', 121 | 'partial', 'putative', 'hypothetical'] 122 | # Dictionary with erroneous accession numbers to determine the reason 123 | erroneous = {} 124 | 125 | Entrez.email = "A.N.Other@example.com" 126 | 127 | total_sequences = len(plasmid_reference) 128 | current_record = 1 129 | logger.info("") 130 | logger.info("Starting plasmid database download script: " + 131 | str(total_sequences) + " will be downloaded") 132 | logger.info("This will take a while.\nCheck progress in " + log_full_path) 133 | 134 | with open(plasmid_fasta_path, 'w+') as output_handle: 135 | for plasmid_accnumber in plasmid_reference: 136 | try: 137 | handle = Entrez.efetch( 138 | db="nucleotide", id=plasmid_accnumber, rettype="fasta", retmode="text") 139 | record = SeqIO.read(handle, "fasta") 140 | terms_present = [ 141 | x in record.description for x in terms_to_exclude] 142 | handle.close() 143 | if sum(terms_present) > 0: 144 | terms_true = [terms_to_exclude[i] 145 | for i, x in enumerate(terms_present) if x == True] 146 | erroneous[record.id] = "Include terms: " + \ 147 | ', '.join(terms_true) + " => " + record.description 148 | logger.debug(" 
%s/%s Invalid terms in record %s" % 149 | (current_record, total_sequences, record.id)) 150 | else: 151 | logger.debug(" %s/%s Downloading record %s" % 152 | (current_record, total_sequences, record.id)) 153 | SeqIO.write(record, output_handle, "fasta") 154 | except: 155 | logger.debug(" %s/%s Failed to download %s" % 156 | (current_record, total_sequences, record.id)) 157 | erroneous[record.id] = "failed to download" 158 | current_record = current_record + 1 159 | 160 | if len(erroneous) > 0: 161 | with open(plasmid_failed_path, 'w+') as ferror: 162 | for acc, reason in erroneous.items(): 163 | ferror.write(acc + ": " + reason + "\n") 164 | 165 | logger.info("ALL DONE\nFASTA file is available in: " + plasmid_fasta_path) 166 | 167 | 168 | if __name__ == '__main__': 169 | try: 170 | main() 171 | except Exception as e: 172 | logger.exception(e) 173 | raise 174 | -------------------------------------------------------------------------------- /bin/draw_circos_images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list, 4 | #or a compound command returns a non-zero status: If errors are not handled by user 5 | set -e 6 | #set -x 7 | #============================================================= 8 | # HEADER 9 | #============================================================= 10 | 11 | #INSTITUTION:ISCIII 12 | #CENTRE:BU-ISCIII 13 | #AUTHOR: Pedro J. 
# Sola
VERSION=1.0
#CREATED: 01 May 2018
#REVISION:
#		11 July 2018: Apply good practices bash
#						Include independent files
#						Include several databases
#		13 July 2018: Include log file
#						manage directories
#DESCRIPTION:Script that creates and executes a circos config file for plasmidID
#
#
#
#================================================================
# END_OF_HEADER
#================================================================

#SHORT USAGE RULES
#LONG USAGE FUNCTION
usage() {
	cat << EOF

draw_circos_image script that creates and execute a cicos config file for plasmidID

usage : $0 <-i input_directory> <-d config_files_directory> <-C config_file> <-s sample> <-g <-o [-l ] [-V] [-c] [-v] [-h]

	-i input directory containing files to represent
	-d directory containing config files
	-C config file selected to draw
	-s sample
	-g group
	-l log file
	-o output directory to create config and pictures
	-c clean: remove config files
	-v version
	-V verbose
	-h display usage message

EOF
}

#================================================================
# OPTION_PROCESSING
#================================================================
#Make sure the script is executed with arguments
if [ $# = 0 ] ; then
	usage >&2
	exit 1
fi

# Error handling
# Prints a framed error report and terminates the script.
#   $1 line number where the failure was detected
#   $2 script name
#   $3 message (optional)
#   $4 exit status (optional, default 1)
error(){
	local parent_lineno="$1"
	local script="$2"
	local message="$3"
	local code="${4:-1}"

	RED='\033[0;31m'
	NC='\033[0m'

	if [[ -n "$message" ]] ; then
		echo -e "\n---------------------------------------\n"
		echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
		echo -e "MESSAGE:\n"
		echo -e "$message"
		echo -e "\n---------------------------------------\n"
	else
		echo -e "\n---------------------------------------\n"
		echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
		echo -e "\n---------------------------------------\n"
	fi

	exit "${code}"
}

# Runs circos on the config file given as $1.
# Output is appended to $log_file and, with -V, also shown on the terminal.
# Returns the exit status of circos itself (not the one of tee).
run_circos() {
	local conf_file="$1"

	if [ "$verbose" = true ]; then
		circos -conf "$conf_file" 2>&1 | tee -a "$log_file"
		return "${PIPESTATUS[0]}"
	fi

	circos -conf "$conf_file" >> "$log_file" 2>&1
}

#DECLARE FLAGS AND VARIABLES

cwd="$(pwd)"
clean=false
verbose=false
# -l is optional: without it the log output is discarded instead of
# breaking every redirection below with an empty file name
log_file=/dev/null


#PARSE VARIABLE ARGUMENTS WITH getops
#common example with letters, for long options check longopts2getopts.sh
options=":i:m:o:g:l:s:d:C:cVvh"
while getopts $options opt; do
	case $opt in
		i )
			input_dir=$OPTARG
			;;
		o )
			output_dir=$OPTARG
			;;
		d )
			config_dir=$OPTARG
			;;
		C )
			config_file_individual=$OPTARG
			;;
		l )
			log_file=$OPTARG
			;;
		g )
			group=$OPTARG
			;;
		s )
			sample=$OPTARG
			;;
		c )
			clean=true
			;;
		h )
			usage
			exit 1
			;;
		V )
			verbose=true
			;;
		v )
			echo $VERSION
			exit 1
			;;
		\?)
			echo "Invalid Option: -$OPTARG" 1>&2
			usage
			exit 1
			;;
		: )
			echo "Option -$OPTARG requires an argument." >&2
			exit 1
			;;
		* )
			echo "Unimplemented option: -$OPTARG" >&2;
			exit 1
			;;

	esac
done
shift $((OPTIND-1))

#================================================================
# MAIN_BODY
#================================================================

# Every path below is derived from these options; fail early with a clear
# message instead of letting awk/circos fail on half-built paths
for required in input_dir output_dir config_dir config_file_individual sample; do
	if [ -z "${!required}" ]; then
		usage >&2
		error ${LINENO} "$(basename "$0")" "Mandatory option missing: $required is empty"
	fi
done

imageDir=$input_dir"/data"

# Start every run with a fresh log
if [ -f "$log_file" ]; then
	rm -- "$log_file"
fi

echo -e "\n#Executing" $0 "\n" >> "$log_file" 2>&1

cdsDdbb_file=$input_dir/database/$sample".gff.bed"
cdsDdbb_file_forward=$input_dir/database/$sample".gff.forward.bed"
cdsDdbb_file_reverse=$input_dir/database/$sample".gff.reverse.bed"


circos_conf_summary="$config_dir/circos_summary_1_3_3.conf"
circos_conf_individual="$config_dir/$config_file_individual"
circosDir="$output_dir"


plasmidMapped=$imageDir/$sample".coverage_adapted_clustered_ac"

karyotype_file_individual=$imageDir/$sample".karyotype_individual.txt"
karyotype_file_summary=$imageDir/$sample".karyotype_summary.txt"
annotation_text_file=$imageDir/pID_text_annotation.coordinates
annotation_highlights_file=$imageDir/pID_highlights.conf

coverage_file=$imageDir/$sample".bedgraph_term"
cds_contig_file=$imageDir/$sample".gff.coordinates"
cds_contig_file_forward=$imageDir/$sample".gff.forward.coordinates"
cds_contig_file_reverse=$imageDir/$sample".gff.reverse.coordinates"


contig_file=$imageDir/$sample".plasmids.bed"
contig_file_complete=$imageDir/$sample".plasmids.complete"
links_file=$imageDir/$sample".plasmids.links"

imageName=$sample"_summary.png"

mkdir -p "$circosDir"

# Plasmid identifiers to draw: every whitespace separated token of the list
# file. Read once, without glob expansion, and reused by the cleaning step.
plasmids=()
if [ -s "$plasmidMapped" ]; then
	# read returns non-zero when it reaches end of file: expected with -d ''
	read -r -d '' -a plasmids < "$plasmidMapped" || true
fi


echo "Creating individual config file for SAMPLE $sample using FILE $circos_conf_individual" >> "$log_file" 2>&1

# Fill in the placeholders of the individual template. Order matters:
# PLASMID_CONTIGS_COMPLETE must be replaced before its prefix PLASMID_CONTIGS.
awk \
	-v karyotype="$karyotype_file_individual" \
	-v text_file="$annotation_text_file" \
	-v highlights="$annotation_highlights_file" \
	-v coverage="$coverage_file" \
	-v cds_contig="$cds_contig_file" \
	-v cds_forward="$cds_contig_file_forward" \
	-v cds_reverse="$cds_contig_file_reverse" \
	-v cds_ddbb="$cdsDdbb_file" \
	-v cds_ddbb_forward="$cdsDdbb_file_forward" \
	-v cds_ddbb_reverse="$cdsDdbb_file_reverse" \
	-v contigs_complete="$contig_file_complete" \
	-v contigs="$contig_file" \
	-v outdir="$circosDir" \
	'{
		gsub("PLASMID_KARYOTYPE", karyotype)
		gsub("PLASMID_SPECIFIC_TEXT", text_file)
		gsub("PID_ALL_HIGHLIGHTS", highlights)
		gsub("PLASMID_COVERAGE_GRAPH", coverage)
		gsub("PLASMID_CDS_CONTIG", cds_contig)
		gsub("PLASMID_CDS_FORWARD", cds_forward)
		gsub("PLASMID_CDS_REVERSE", cds_reverse)
		gsub("PLASMID_CDS_DDBB", cds_ddbb)
		gsub("CDS_DDBB_FORWARD", cds_ddbb_forward)
		gsub("CDS_DDBB_REVERSE", cds_ddbb_reverse)
		gsub("PLASMID_CONTIGS_COMPLETE", contigs_complete)
		gsub("PLASMID_CONTIGS", contigs)
		gsub("OUTPUTDIR", outdir)
		print $0
	}' "$circos_conf_individual" > "$circosDir/${sample}_individual.circos.conf"

echo "DONE Creating config file for circos in SAMPLE $sample" >> "$log_file" 2>&1

echo "Executing circos in SAMPLE $sample" >> "$log_file" 2>&1


for plasmid in "${plasmids[@]}"; do
	echo "Creating image for plasmid $plasmid in sample $sample" >> "$log_file" 2>&1
	plasmid_conf="$circosDir/${sample}_${plasmid}_individual.circos.conf"

	awk -v plasmid="$plasmid" -v image_name="${sample}_${plasmid}.png" '{
		gsub("SAMPLE_SHOWN", plasmid)
		gsub("IMAGENAME_SAMPLE_PLASMID", image_name)
		print $0
	}' "$circosDir/${sample}_individual.circos.conf" > "$plasmid_conf"

	# Run circos directly: wrapping it in $( ) would execute whatever circos
	# prints as a shell command
	run_circos "$plasmid_conf" || error ${LINENO} "$(basename "$0")" "Circos command for individual image has failed. See $output_dir/logs for more information"
done


if [ -s "$karyotype_file_summary" ]; then

	echo "Creating summary image for in sample" $sample "from FILE" $circos_conf_summary >> "$log_file" 2>&1

	awk \
		-v karyotype="$karyotype_file_summary" \
		-v text_file="$annotation_text_file" \
		-v highlights="$annotation_highlights_file" \
		-v coverage="$coverage_file" \
		-v cds_contig="$cds_contig_file" \
		-v cds_forward="$cds_contig_file_forward" \
		-v cds_reverse="$cds_contig_file_reverse" \
		-v cds_ddbb="$cdsDdbb_file" \
		-v contigs="$contig_file" \
		-v links="$links_file" \
		-v outdir="$circosDir" \
		-v image_name="$imageName" \
		'{
			gsub("PLASMID_KARYOTYPE", karyotype)
			gsub("PLASMID_SPECIFIC_TEXT", text_file)
			gsub("PID_ALL_HIGHLIGHTS", highlights)
			gsub("PLASMID_COVERAGE_GRAPH", coverage)
			gsub("PLASMID_CDS_CONTIG", cds_contig)
			gsub("PLASMID_CDS_FORWARD", cds_forward)
			gsub("PLASMID_CDS_REVERSE", cds_reverse)
			gsub("PLASMID_CDS_DDBB", cds_ddbb)
			gsub("PLASMID_CONTIGS", contigs)
			gsub("PLASMID_LINKS", links)
			gsub("OUTPUTDIR", outdir)
			gsub("IMAGENAME", image_name)
			print $0
		}' "$circos_conf_summary" > "$circosDir/${sample}_summary.circos.conf"

	run_circos "$circosDir/${sample}_summary.circos.conf" || error ${LINENO} "$(basename "$0")" "Circos command for summary image has failed. See $output_dir/logs for more information"

else

	echo "No plasmid matched requirements to draw the summary image"

fi


#Remove config files
# "$clean" (with the dollar sign): the bare word "clean" never equals "true",
# which silently disabled -c
if [ "$clean" = true ]; then
	for plasmid in "${plasmids[@]}"; do
		rm -f -- "$circosDir/${sample}_${plasmid}_individual.circos.conf"
	done

	# -f: the summary config does not exist when no summary image was drawn
	rm -f -- "$circosDir/${sample}_summary.circos.conf"
	rm -f -- "$circosDir/${sample}_individual.circos.conf"
fi

echo "DONE, files can be found at $circosDir"
--------------------------------------------------------------------------------
/bin/filter_fasta.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Exit immediately if a pipeline, which may consist of a single simple command, a list,
#or a compound command returns a non-zero status: If errors are not handled by user
set -e
#set -x
#=============================================================
# HEADER
#=============================================================

#INSTITUTION:ISCIII
#CENTRE:BU-ISCIII
#AUTHOR: Pedro J. Sola
VERSION=1.0
#CREATED: 21 March 2018
#REVISION:
#		22 March 2018: Handle output directory by default the same as -f file
#		13 April 2018: Include -G option to filter any file by term with both file or term
#DESCRIPTION:Script that extract sequences by term, either by key or file with a list
#AKNOWLEDGE:
#		-Multiple arguments in one flag: https://stackoverflow.com/questions/7529856/retrieving-multiple-arguments-for-a-single-option-using-getopts-in-bash
#TODO:
#		-Add and remove sequences in the same execution
#================================================================
# END_OF_HEADER
#================================================================

#SHORT USAGE RULES
#LONG USAGE FUNCTION
usage() {
	cat << EOF

Filter_fasta script that extract sequences by term, either by key or file with a list

usage : $0 <-i file.fasta> <(-l term1 -l term2 -l term3 | -f file)> [-n ] [-o ] [-G] [-N] [-v] [-h]

	-i fasta file to filter
	-o output directory (optional). By default the file is replaced in the same location
	-n file name (optional). By default is the same as -f file with .fasta extension
	-l list of key terms separated by space
	-N Use term to discard sequences with terms (Negative filter)
	-G General filter: filter any file with a list of keys
	-f file with a list of terms to filter
	-v version
	-h display usage message

example: filter_fasta.sh -i ecoli.fasta -l NC00012 -l WC52247 -l hypothetical -l partial -n NAME
	filter_fasta.sh -i ecoli.fasta -l "NC00012 WC52247 hypothetical partial"
	filter_fasta.sh -i ecoli.fasta -f list_with_terms.txt

EOF
}

#================================================================
# OPTION_PROCESSING
#================================================================
#Make sure the script is executed with arguments
if [ $# = 0 ] ; then
	usage >&2
	exit 1
fi

# Error handling
# Prints a framed error report and terminates the script.
#   $1 line number where the failure was detected
#   $2 script name
#   $3 message (optional)
#   $4 exit status (optional, default 1)
error(){
	local parent_lineno="$1"
	local script="$2"
	local message="$3"
	local code="${4:-1}"

	RED='\033[0;31m'
	NC='\033[0m'

	if [[ -n "$message" ]] ; then
		echo -e "\n---------------------------------------\n"
		echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
		echo -e "MESSAGE:\n"
		echo -e "$message"
		echo -e "\n---------------------------------------\n"
	else
		echo -e "\n---------------------------------------\n"
		echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
		echo -e "\n---------------------------------------\n"
	fi

	exit "${code}"
}

#DECLARE FLAGS AND VARIABLES
term_option=false
file_option=false
general_filter=false
negative_filter=false
cwd="$(pwd)"
input_file="Input_file"
terms_for_filtering=()

#PARSE VARIABLE ARGUMENTS WITH getops
#common example with letters, for long options check longopts2getopts.sh
options=":i:o:n:l:f:GNvh"
while getopts $options opt; do
	case $opt in
		i )
			input_file=$OPTARG
			;;
		o )
			output_dir=$OPTARG
			;;
		n )
			file_name=$OPTARG
			;;
		N )
			negative_filter=true
			;;
		G )
			general_filter=true
			;;
		l )
			# One -l may carry several space separated terms: split them into
			# words without letting the shell expand globs such as "*"
			# (read returns non-zero at end of input: expected with -d '')
			read -r -d '' -a new_terms <<< "$OPTARG" || true
			terms_for_filtering+=("${new_terms[@]}")
			term_option=true
			;;
		f )
			file_for_filtering=$OPTARG
			# Validate the terms file itself; -i may not even be parsed yet
			check_mandatory_files.sh "$file_for_filtering"
			file_option=true
			;;
		h )
			usage
			exit 1
			;;
		v )
			echo $VERSION
			exit 1
			;;
		\?)
			echo "Invalid Option: -$OPTARG" 1>&2
			usage
			exit 1
			;;
		: )
			echo "Option -$OPTARG requires an argument." >&2
			exit 1
			;;
		* )
			echo "Unimplemented option: -$OPTARG" >&2;
			exit 1
			;;

	esac
done
shift $((OPTIND-1))

#================================================================
# MAIN_BODY
#================================================================
##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS

echo -e "\n#Executing" $0 "\n"

check_mandatory_files.sh "$input_file"

if [ "$term_option" = false ] && [ "$file_option" = false ]; then
	echo "ERROR: no filtering terms supplied, use -l and/or -f" >&2
	usage >&2
	exit 1
fi

# The general filter always writes next to the input file, named after it
if [ "$general_filter" = true ]; then
	file_name=$(basename "$input_file")
	output_dir=$(dirname "$input_file")
fi

#MANAGE OUTPUT DIRECTORY
if [ "$file_option" = true ] && [ -z "$output_dir" ]; then
	output_dir=$(dirname "$file_for_filtering")
	echo "Output directory is" $output_dir
	mkdir -p "$output_dir"
elif [ "$file_option" = false ] && [ -z "$output_dir" ]; then
	echo "please, provide an output directory" $output_dir
	exit 1
else
	echo "Output directory is=" $output_dir
	mkdir -p "$output_dir"
fi

#MANAGE FILE NAME
if [ "$file_option" = true ] && [ -z "$file_name" ]; then
	file_name=$(basename "$file_for_filtering")
elif [ "$file_option" = false ] && [ -z "$file_name" ]; then
	file_name=${terms_for_filtering[0]} #First term supplied by -l
else
	echo "File name is=" $file_name
fi

#PROCESS REGULAR EXPRESSION TERMS
# Terms given with -l come first, followed by the ones listed in the -f file
all_terms=("${terms_for_filtering[@]}")

if [ "$file_option" = true ]; then

	check_mandatory_files.sh "$file_for_filtering"
	if [ ! -s "$file_for_filtering" ];then
		echo -e "ERROR: terms file empty!!"
		exit 1
	fi

	read -r -d '' -a file_terms < "$file_for_filtering" || true
	all_terms+=("${file_terms[@]}")
fi

if [ "${#all_terms[@]}" -eq 0 ]; then
	echo "ERROR: no filtering terms found" >&2
	exit 1
fi

# term1|term2|...|termN, suitable as an alternation regexp
final_list_terms_regexp=$(IFS='|'; printf '%s' "${all_terms[*]}")

#AWK SCRIPT THAT FILTER SEQUENCES#
##################################
# The regexp reaches awk through the environment instead of being pasted into
# the program text: a term containing "/" can no longer break the awk syntax

if [ "$general_filter" = true ]; then

	echo "$(date)"
	echo "General filtering terms on file" $(basename "$input_file")

	FILTER_REGEXP="$final_list_terms_regexp" awk '
		$0 ~ ENVIRON["FILTER_REGEXP"] {print $0}
		' "$input_file" \
		> "$output_dir/${file_name}_term" || error ${LINENO} "$(basename "$0")" "Awk command for fasta filtering in $file_name\"_term\" creation. See $output_dir/logs for more information."

	echo "$(date)"
	echo "Done general filtering terms on file" $(basename "$input_file")
	echo "File with filtered lines can be found in" $output_dir/$file_name"_term"

else
	echo "$(date)"
	echo "Filtering terms on file" $(basename "$input_file")
	# grep -c exits with 1 when it counts zero lines; that is not an error here
	seq_number_prev=$(grep -c ">" "$input_file" || true)

	if [ "$negative_filter" = true ]; then
		negate=1
	else
		negate=0
	fi

	# RS=">" turns every sequence (header plus residues) into one record.
	# NF > 0 drops the empty record that precedes the first ">", which the
	# negative filter would otherwise print as a bogus extra sequence.
	FILTER_REGEXP="$final_list_terms_regexp" awk -v negate="$negate" '
		BEGIN {RS=">"}
		NF > 0 && (($0 ~ ENVIRON["FILTER_REGEXP"]) != negate) {print ">"$0}
		' "$input_file" \
		> "$output_dir/${file_name}_term.fasta" || error ${LINENO} "$(basename "$0")" "Awk command for fasta filtering in $file_name\"_term.fasta\" creation. See $output_dir/logs for more information."

	echo "$(date)"
	echo "DONE Filtering terms on file" $(basename "$input_file")
	seq_number_post=$(grep -c ">" "$output_dir/${file_name}_term.fasta" || true)
	echo "File with filtered sequences can be found in" $output_dir/$file_name"_term.fasta"

	echo "Previous number of sequences=" $seq_number_prev
	echo "Post number of sequences=" $seq_number_post
	echo -e "\n"
fi
--------------------------------------------------------------------------------
/bin/get_coverage.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -e

#=============================================================
# HEADER
#=============================================================

#INSTITUTION:ISCIII
#CENTRE:BU-ISCIII
#AUTHOR: Pedro J. Sola
VERSION=1.0
#CREATED: 20 March 2018
#REVISION:
#DESCRIPTION:Script that uses bedtool to obtain coverage data from a BAMm file
#The default output format is as follows:
#
#chromosome (or entire genome)
#depth of coverage from features in input file
#number of bases on chromosome (or genome) with depth equal to column 2.
#size of chromosome (or entire genome) in base pairs
#fraction of bases on chromosome (or entire genome) with depth equal to column 2.
#
#chr1   0  980  1000  0.98
#chr1   1  20   1000  0.02
#chr2   1  500  500   1
#genome 0  980  1500  0.653333
#genome 1  520  1500  0.346667
#
#-p option is equivalent to -bga BEDGRAPH output
#
#chr1  0       554304  0
#chr1  554304  554309  5
#chr1  554309  554313  6
#chr1  554313  554314  1
#chr1  554314  554315  0
#chr1  554315  554316  6
#chr1  554316  554317  5
#chr1  554317  554318  1
#chr1  554318  554319  2
#chr1  554319  554321  6
#================================================================
# END_OF_HEADER
#================================================================

#SHORT USAGE RULES
#LONG USAGE FUNCTION
usage() {
	cat << EOF

Get_coverage script uses bedtool to obtain coverage data from a BAMm file

usage : $0 <-i inputfile(sorted.bam)> [-o ] [-d ] [-s sample_name]
		[-g group_name] [-m ] [p] [-v] [-h]

	-i input file in sorted BAM format
	-o output directory (optional)
	-d database to extract length. Fasta file used to map against
	-m max depth reported (default 500)
	-p reports genome coverage for all positions in BEDGRAPH format includig 0 positions.
		Default option is bedtools genomecov that needs the reference genome
	-s sample name
	-g group name (optional). If unset, samples will be gathered in NO_GROUP group
	-v version
	-h display usage message

example: get_coverage.sh -i ecoli.bam -d database.fasta
		get_coverage.sh -i ecoli.bam -p -m 100

EOF
}

#================================================================
# OPTION_PROCESSING
#================================================================
#Make sure the script is executed with arguments
if [ $# = 0 ] ; then
	usage >&2
	exit 1
fi

# Error handling
# Prints a framed error report and terminates the script.
#   $1 line number where the failure was detected
#   $2 script name
#   $3 message (optional)
#   $4 exit status (optional, default 1)
error(){
	local parent_lineno="$1"
	local script="$2"
	local message="$3"
	local code="${4:-1}"

	RED='\033[0;31m'
	NC='\033[0m'

	if [[ -n "$message" ]] ; then
		echo -e "\n---------------------------------------\n"
		echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
		echo -e "MESSAGE:\n"
		echo -e "$message"
		echo -e "\n---------------------------------------\n"
	else
		echo -e "\n---------------------------------------\n"
		echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
		echo -e "\n---------------------------------------\n"
	fi

	exit "${code}"
}

#DECLARE FLAGS AND VARIABLES
cwd="$(pwd)"
group="NO_GROUP"
input_file="Input_file"
database="Database"
positional=false
max_coverage=500

#PARSE VARIABLE ARGUMENTS WITH getops
#common example with letters, for long options check longopts2getopts.sh
options=":i:o:d:s:g:m:n:pvh"
while getopts $options opt; do
	case $opt in
		i )
			input_file=$OPTARG
			;;
		o )
			output_dir=$OPTARG
			;;
		s )
			sample=$OPTARG
			;;
		g)
			group=$OPTARG
			;;
		d )
			database=$OPTARG
			;;
		m )
			max_coverage=$OPTARG
			;;
		n )
			# Accepted by the option string but previously rejected as
			# "Unimplemented": base name of the output files
			filename=$OPTARG
			;;
		p )
			positional=true
			;;
		h )
			usage
			exit 1
			;;
		v )
			echo $VERSION
			exit 1
			;;
		\?)
			echo "Invalid Option: -$OPTARG" 1>&2
			usage
			exit 1
			;;
		: )
			echo "Option -$OPTARG requires an argument." >&2
			exit 1
			;;
		* )
			echo "Unimplemented option: -$OPTARG" >&2;
			exit 1
			;;

	esac
done
shift $((OPTIND-1))

#================================================================
# MAIN_BODY
#================================================================
##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS

echo -e "\n#Executing" $0 "\n"

check_mandatory_files.sh "$input_file"

check_dependencies.sh bedtools

if [ -z "$output_dir" ]; then
	output_dir=$(dirname "$input_file")
	echo "Default output directory is" $output_dir
else
	echo "Output directory is" $output_dir
fi
mkdir -p "$output_dir"

# Default base name: input file name up to its first dot
if [ -z "$filename" ]; then
	filename=$(basename "$input_file" | cut -d. -f1)
fi

bedgraph_file="$output_dir/$filename.bedgraph"
coverage_file="$output_dir/$filename.coverage"
length_file="$database.length"


if [ "$positional" = true ]; then
	# Look for the file this step actually writes; the previous check used an
	# undefined directory variable and could never find an existing bedgraph
	if [ -f "$bedgraph_file" ]; then
		echo "Found a bedgraph file for sample" $sample;
		echo "Omitting bedgraph step"
	else
		echo "$(date)"
		echo "Obtaining coverage coordinates from sequences"

		# On failure remove the partial output so that a later run does not
		# take it for a finished result
		bedtools genomecov -ibam "$input_file" -bga -max "$max_coverage" > "$bedgraph_file" || {
			rm -f -- "$bedgraph_file"
			error ${LINENO} "$(basename "$0")" "Bedtools genomecov command failed. See $output_dir/logs for more information."
		}

		echo "$(date)"
		echo "DONE obtaining coverage coordinates from sequences"
	fi
else


	check_mandatory_files.sh "$database"

	if [ -f "$length_file" ]; then
		echo "Found length file for" $(basename "$database")
		echo "Omitting length calculation"
	else
		echo "$(date)"
		echo "Creating a length file for" $(basename "$database")
		calculate_seqlen.sh -r -i "$database" > "$length_file" || {
			rm -f -- "$length_file"
			error ${LINENO} "$(basename "$0")" "calculate_seqlen script failed. See $output_dir/logs for more information."
		}
	fi

	if [ -f "$coverage_file" ]; then
		echo "Found a coverage file for sample" $sample;
		echo "Omitting coverage calculation"
	else
		echo "$(date)"
		echo "Calculating coverage for every position that mapped $filename"

		bedtools genomecov -ibam "$input_file" -g "$length_file" > "$coverage_file" || {
			rm -f -- "$coverage_file"
			error ${LINENO} "$(basename "$0")" "Bedtools genomecov command failed. See $output_dir/logs for more information."
		}

		echo "$(date)"
		echo "DONE Calculating coverage for every plamid that mapped $sample"
	fi
fi

echo -e "\n"
--------------------------------------------------------------------------------
/bin/mash_screener.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -e

#=============================================================
# HEADER
#=============================================================

#INSTITUTION:ISCIII
#CENTRE:BU-ISCIII
#AUTHOR: Pedro J.
Sola 12 | VERSION=1.0 13 | #CREATED: 27 November 2019 14 | #REVISION: 15 | 16 | #DESCRIPTION:Script that screens reads against a database using k-mers and extracts the sequence ids with identity at or above a threshold 17 | #TODO 18 | #================================================================ 19 | # END_OF_HEADER 20 | #================================================================ 21 | 22 | #SHORT USAGE RULES 23 | #LONG USAGE FUNCTION 24 | usage() { 25 | cat << EOF 26 | 27 | Bowtie_mapper script index a database and map a supplied pair-end sequences 28 | 29 | usage : $0 [-i ] [-o ] <-d database(fasta)> <-s sample_name> <-1 R1> <-2 R2> 30 | [-g group_name] [-f ] [-T ] [-a] [-v] [-h] 31 | 32 | -i input directory (optional) 33 | -o output directory (optional) 34 | -d database to screen (.fasta) 35 | -s sample name 36 | -g group name (optional). If unset, samples will be gathered in NO_GROUP group 37 | -1 reads corresponding to paired-end R1 38 | -2 reads corresponding to paired-end R2 39 | -f threshold identity value to retieve sequence ids with at least this value (default 0.9) 40 | -w use winner takes it all 41 | -T number of threads 42 | -v version 43 | -h display usage message 44 | 45 | example: mash_screener.sh -d database.fasta -s COLI -1 ecoli_1.fastq -2 ecoli_2.fastq 46 | 47 | EOF 48 | } 49 | 50 | #================================================================ 51 | # OPTION_PROCESSING 52 | #================================================================ 53 | #Make sure the script is executed with arguments 54 | if [ $# = 0 ] ; then 55 | usage >&2 56 | exit 1 57 | fi 58 | 59 | # Error handling 60 | error(){ 61 | local parent_lineno="$1" 62 | local script="$2" 63 | local message="$3" 64 | local code="${4:-1}" 65 | 66 | RED='\033[0;31m' 67 | NC='\033[0m' 68 | 69 | if [[ -n "$message" ]] ; then 70 | echo -e "\n---------------------------------------\n" 71 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" 72 | echo -e
"MESSAGE:\n" 73 | echo -e "$message" 74 | echo -e "\n---------------------------------------\n" 75 | else 76 | echo -e "\n---------------------------------------\n" 77 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" 78 | echo -e "\n---------------------------------------\n" 79 | fi 80 | 81 | exit "${code}" 82 | } 83 | 84 | #DECLARE FLAGS AND VARIABLES 85 | threads=1 86 | offrate=1 87 | filter_identity=0.9 88 | cwd="$(pwd)" 89 | w_winner="" 90 | group="NO_GROUP" 91 | database="Database" 92 | R1="R1" 93 | R2="R2" 94 | 95 | #PARSE VARIABLE ARGUMENTS WITH getops 96 | #common example with letters, for long options check longopts2getopts.sh 97 | options=":i:o:s:g:d:1:2:f:T:avwh" 98 | while getopts $options opt; do 99 | case $opt in 100 | i ) 101 | input_dir=$OPTARG 102 | ;; 103 | o ) 104 | output_dir=$OPTARG 105 | ;; 106 | s ) 107 | sample=$OPTARG 108 | ;; 109 | g) 110 | group=$OPTARG 111 | ;; 112 | d ) 113 | database=$OPTARG 114 | ;; 115 | 1 ) 116 | R1=$OPTARG 117 | ;; 118 | 2 ) 119 | R2=$OPTARG 120 | ;; 121 | f ) 122 | filter_identity=$OPTARG 123 | ;; 124 | w) 125 | w_winner="-w" 126 | ;; 127 | T ) 128 | threads=$OPTARG 129 | ;; 130 | 131 | h ) 132 | usage 133 | exit 1 134 | ;; 135 | v ) 136 | echo $VERSION 137 | exit 1 138 | ;; 139 | \?) 140 | echo "Invalid Option: -$OPTARG" 1>&2 141 | usage 142 | exit 1 143 | ;; 144 | : ) 145 | echo "Option -$OPTARG requires an argument." 
>&2 146 | exit 1 147 | ;; 148 | * ) 149 | echo "Unimplemented option: -$OPTARG" >&2; 150 | exit 1 151 | ;; 152 | 153 | esac 154 | done 155 | shift $((OPTIND-1)) 156 | 157 | 158 | #================================================================ 159 | # MAIN_BODY 160 | #================================================================ 161 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS 162 | 163 | echo -e "\n#Executing" $0 "\n" 164 | 165 | check_dependencies.sh bash mash 166 | 167 | check_mandatory_files.sh $database $R1 168 | 169 | if [ ! $sample ]; then 170 | echo "ERROR: please, provide a sample name" 171 | usage 172 | exit 1 173 | fi 174 | 175 | if [ ! $output_dir ]; then 176 | output_dir=$cwd"/$group/$sample/kmer/" 177 | echo "Default output directory is" $output_dir 178 | mkdir -p $output_dir 179 | else 180 | echo "Output directory is" $output_dir 181 | mkdir -p $output_dir 182 | fi 183 | 184 | 185 | ########SKETCH############## 186 | ############################ 187 | 188 | if [ -f $output_dir/database.msh ]; then \ 189 | echo "Found a sketch ddbb for" $(basename $database); 190 | echo "Omitting sketching" 191 | else 192 | echo "creating sketch of " $(basename $database); 193 | mash sketch -i -k 32 -s 1000 -p $threads -o $output_dir/database $database || error ${LINENO} $(basename $0) "mash screen command failed. See $output_dir/logs for more information" 194 | fi 195 | 196 | ########SCREEN############## 197 | ############################ 198 | 199 | if [ -f $output_dir/database.screen.tab ];then \ 200 | echo "Found a mash screen file for sample" $sample; 201 | echo "Omitting screening" 202 | else 203 | echo "$(date)" 204 | echo screening $R1 205 | 206 | mash screen $w_winner -p $threads $output_dir/database.msh $R1 > $output_dir/database.screen.tab || error ${LINENO} $(basename $0) "Bowtie2 command failed. 
See $output_dir/logs for more information" 207 | 208 | 209 | echo "$(date)" 210 | echo -e "DONE Screening $sample of $group Group" "\n" 211 | fi 212 | 213 | ######PARSE_RESULT########## 214 | ############################ 215 | 216 | output_mash_id=$output_dir/database.filtered_$filter_identity 217 | 218 | echo "Retrieving sequences matching more than $filter_identity identity" 219 | 220 | cat $output_dir/database.screen.tab | awk '($1 >= '"${filter_identity}"') {print $5}' > $output_mash_id 221 | 222 | 223 | #####FILTER SEQUENCES####### 224 | ############################ 225 | if [ $(cat $output_mash_id | wc -l | cut -d " " -f 1) -gt 0 ] 226 | then 227 | filter_fasta.sh -i $database -f $output_mash_id 228 | else 229 | echo "No plasmids have passed the mash identity filter!! Exiting!!" 230 | exit 0 231 | fi 232 | -------------------------------------------------------------------------------- /bin/ncbi_database_fetcher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list, 4 | #or a compound command returns a non-zero status: If errors are not handled by user 5 | set -e 6 | #set -x 7 | #============================================================= 8 | # HEADER 9 | #============================================================= 10 | 11 | #INSTITUTION:ISCIII 12 | #CENTRE:BU-ISCIII 13 | #AUTHOR: Pedro J. 
Sola 14 | VERSION=1.0 15 | #CREATED: 12 June 2018 16 | #REVISION: 17 | # 22 June 2018: include quiet mode that avoids printing the progress 18 | # 19 | # 20 | #DESCRIPTION:Script that extracts a database from the NCBI database using search terms 21 | #ACKNOWLEDGEMENTS: 22 | # -Multiple arguments in one flag: https://stackoverflow.com/questions/7529856/retrieving-multiple-arguments-for-a-single-option-using-getopts-in-bash 23 | # 24 | #================================================================ 25 | # END_OF_HEADER 26 | #================================================================ 27 | 28 | #SHORT USAGE RULES 29 | #LONG USAGE FUNCTION 30 | usage() { 31 | cat << EOF 32 | 33 | ncbi_database_fetcher is a script that extract sequences from NCBI by term 34 | 35 | usage : $0 <(-y term1 -y term2 | -y "term1 term2")> [(-n term1 -n term2 | -n "term1 term2")] [-O ][-d (nucleotide|protein)] [-f ] [-o ] [-q] [-v] [-h] 36 | 37 | -y list of key terms separated by space to be INCLUDED in sequences title 38 | -n list of key terms separated by space to be EXCLUDED in sequences title 39 | -O organism to filter 40 | -d database type, default nucleotide 41 | -o output directory (optional). By default the file is placed in cwd 42 | -f file name (optional).
By default is the first term used as query 43 | -q quiet 44 | -v version 45 | -h display usage message 46 | 47 | example: ./ncbi_database_fetcher.sh -y plasmid -n unnamed -n partial -O Archaea 48 | 49 | EOF 50 | } 51 | 52 | #================================================================ 53 | # OPTION_PROCESSING 54 | #================================================================ 55 | #Make sure the script is executed with arguments 56 | if [ $# = 0 ] ; then 57 | usage >&2 58 | exit 1 59 | fi 60 | 61 | #DECLARE FLAGS AND VARIABLES 62 | cwd="$(pwd)" 63 | use_term_and=false 64 | use_term_not=false 65 | use_term_org=false 66 | quiet=false 67 | database_type=nucleotide 68 | #PARSE VARIABLE ARGUMENTS WITH getops 69 | 70 | options=":y:n:o:f:d:O:qvh" 71 | while getopts $options opt; do 72 | case $opt in 73 | o ) 74 | output_dir=$OPTARG 75 | ;; 76 | O) 77 | terms_organism+=($OPTARG) 78 | use_term_org=true 79 | ;; 80 | f ) 81 | file_name=$OPTARG 82 | ;; 83 | d ) 84 | database_type=$OPTARG 85 | ;; 86 | y ) 87 | terms_and+=($OPTARG) 88 | use_term_and=true 89 | ;; 90 | n ) 91 | terms_not+=($OPTARG) 92 | use_term_not=true 93 | ;; 94 | q ) 95 | quiet=true 96 | ;; 97 | h ) 98 | usage 99 | exit 1 100 | ;; 101 | v ) 102 | echo $VERSION 103 | exit 1 104 | ;; 105 | \?) 106 | echo "Invalid Option: -$OPTARG" 1>&2 107 | usage 108 | exit 1 109 | ;; 110 | : ) 111 | echo "Option -$OPTARG requires an argument." 
>&2 112 | exit 1 113 | ;; 114 | * ) 115 | echo "Unimplemented option: -$OPTARG" >&2; 116 | exit 1 117 | ;; 118 | 119 | esac 120 | done 121 | shift $((OPTIND-1)) 122 | 123 | #================================================================ 124 | # MAIN_BODY 125 | #================================================================ 126 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS 127 | 128 | echo -e "\n#Executing" $0 "\n" 129 | 130 | if [ $use_term_and = false ]; then 131 | echo "Please, introduce at least one term to include search" 132 | usage 133 | exit 1 134 | fi 135 | 136 | #MANAGE OUTPUT DIRECTORY 137 | if [ ! $output_dir ]; then 138 | output_dir=$cwd 139 | echo "Default output_dir is" $output_dir 140 | mkdir -p $output_dir 141 | else 142 | echo "Output directory is" $output_dir 143 | mkdir -p $output_dir 144 | fi 145 | 146 | #MANAGE FILE NAME 147 | 148 | if [ ! $file_name ]; then 149 | 150 | if [ "${#terms_and[@]}" -gt 1 ]; then 151 | file_name_value_one=$(echo ${terms_and[0]}) 152 | file_name_value_two=$(echo ${terms_and[1]}) 153 | 154 | file_name=$file_name_value_one"_"$file_name_value_two 155 | echo "Default file name is" $file_name 156 | else 157 | file_name=$terms_and".database" 158 | echo "Default file name is" $file_name 159 | fi 160 | else 161 | echo "File name is" $file_name 162 | fi 163 | 164 | 165 | ##PROCESS REGULAR EXPRESSION TERMS 166 | 167 | list_terms_and=$(for term in "${terms_and[@]}"; do echo "$term"; done) 168 | list_terms_org=$(for organism in "${terms_organism[@]}"; do echo "$organism"; done) 169 | 170 | #echo "${#terms_and[@]}" "NUMBER OF TERMS" 171 | 172 | list_terms_regexp_and=$(printf "%s[Title] AND " $list_terms_and | sed 's/ AND $//g') 173 | list_terms_regexp_organism=$(printf "AND %s[organism] " $list_terms_org | sed 's/ $//g') 174 | 175 | if [ $use_term_not = true ]; then 176 | 177 | list_terms_not=$(for term in "${terms_not[@]}"; do echo "$term"; done) 178 | list_terms_regexp_not=$(printf "NOT %s[Title] " 
$list_terms_not | sed 's/ $//g') 179 | final_list_terms_regexp=$(echo $list_terms_regexp_and" "$list_terms_regexp_not" "$list_terms_regexp_organism) #concat all regexp into one 180 | 181 | else 182 | final_list_terms_regexp=$(echo $list_terms_regexp_and " "$list_terms_regexp_organism) 183 | fi 184 | 185 | echo $final_list_terms_regexp 186 | 187 | ########EUTILS COMMAND############ 188 | ################################## 189 | 190 | echo "$(date)" 191 | echo "Obtaining seuences with terms:" $list_terms_and 192 | echo "But not those terms:" $list_terms_not 193 | if [ $use_term_org = true ]; then 194 | echo "Filtering by organisms:" $list_terms_org 195 | fi 196 | echo "" 197 | 198 | base="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" 199 | 200 | ##DETERMINE RETMAX 201 | wget -q -O $output_dir/$file_name".count" $base"esearch.fcgi?db="$database_type"&term=""$final_list_terms_regexp" 202 | 203 | counter=$(cat $output_dir/$file_name".count" | awk '//' | head -n 1 | awk '// {split($0,counter_prev,"");split(counter_prev[1],counter,"")}END{print counter[length(counter)]}') 204 | echo -e "FOUND" $counter "RECORDS\n" 205 | 206 | if [ $counter -eq 0 ]; then 207 | echo "Try different terms" 208 | echo "EXIT" 209 | exit 1 210 | fi 211 | 212 | echo "Retrieving Id" 213 | 214 | ##OBTAIN TOTAL LIST OF ID 215 | wget -q -O $output_dir/$file_name".id" $base"esearch.fcgi?db="$database_type"&term=""$final_list_terms_regexp""&RetMax="$counter 216 | 217 | list_of_id=$(cat $output_dir/$file_name".id"| awk '{split($0,id_prev,"");split(id_prev[1],id,"")}//{print id[length(id)]}') 218 | array_of_id=($list_of_id) 219 | 220 | echo "And sequences" 221 | counter=1 222 | 223 | 224 | ##Checking previous DDBB 225 | if [ -s $output_dir/$file_name".fasta" ]; then 226 | echo -e "\nFound a ddbb with the same name, Removing it\n" 227 | rm $output_dir/$file_name".fasta" 228 | fi 229 | 230 | 231 | ##RETRIEVING FASTA SEQUENCE 232 | 233 | for i in $list_of_id 234 | do 235 | if [ $quiet = false ]; then 236 
| 237 | echo $counter"/""${#array_of_id[@]}" 238 | fi 239 | 240 | ((counter++)) 241 | 242 | curl -s $base"efetch.fcgi?db="$database_type"&id="$i"&retmode=text&rettype=fasta" >> $output_dir/$file_name".fasta" 243 | done 244 | 245 | 246 | echo "$(date)" 247 | echo "DONE obtaining seuences with terms supplied" 248 | 249 | seq_number_post=$(cat $output_dir/$file_name".fasta" | grep ">" | wc -l) 250 | echo "File with filtered sequences can be found in" $output_dir/$file_name".fasta" 251 | echo "with" $seq_number_post "sequences" 252 | 253 | rm $output_dir/$file_name".count" 254 | rm $output_dir/$file_name".id" 255 | -------------------------------------------------------------------------------- /bin/process_cluster_output.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list, 4 | #or a compound command returns a non-zero status: If errors are not handled by user 5 | set -e 6 | #set -x 7 | 8 | #============================================================= 9 | # HEADER 10 | #============================================================= 11 | 12 | #INSTITUTION:ISCIII 13 | #CENTRE:BU-ISCIII 14 | #AUTHOR: Pedro J. 
Sola 15 | VERSION=1.0 16 | #CREATED: 12 April 2018 17 | #REVISION: 18 | #DESCRIPTION:process_cluster_output script obtains a list of accession numbers from a fasta file and extracts their coverage values from a coverage file 19 | 20 | #================================================================ 21 | # END_OF_HEADER 22 | #================================================================ 23 | 24 | #SHORT USAGE RULES 25 | #LONG USAGE FUNCTION 26 | usage() { 27 | cat << EOF 28 | 29 | process_cluster_output script obtain a list of ac from fasta, and estract their coverage value from a coverage file 30 | 31 | usage : $0 <-i inputfile(.fasta)> <-b coverage_file> [-o ] [-c ] [-s ] [-v] [-h] 32 | 33 | -i input file 34 | -b file with coverage info 35 | -o output directory (optional). By default the file is replaced in the same location 36 | -c percentage value to filter >= values. If not supplied, all records will be outputted 37 | -s string to ad at the end of the outputted file (list of accession numbers) 38 | -v version 39 | -h display usage message 40 | 41 | example: process_cluster_output.sh -i ecoli_clustered.fasta_70 -b ecoli.coverage 42 | 43 | EOF 44 | } 45 | 46 | #================================================================ 47 | # OPTION_PROCESSING 48 | #================================================================ 49 | #Make sure the script is executed with arguments 50 | if [ $# = 0 ] ; then 51 | usage >&2 52 | exit 1 53 | fi 54 | 55 | # Error handling 56 | error(){ 57 | local parent_lineno="$1" 58 | local script="$2" 59 | local message="$3" 60 | local code="${4:-1}" 61 | 62 | RED='\033[0;31m' 63 | NC='\033[0m' 64 | 65 | if [[ -n "$message" ]] ; then 66 | echo -e "\n---------------------------------------\n" 67 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" 68 | echo -e "MESSAGE:\n" 69 | echo -e "$message" 70 | echo -e "\n---------------------------------------\n" 71 | else 72 | echo -e
"\n---------------------------------------\n" 73 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" 74 | echo -e "\n---------------------------------------\n" 75 | fi 76 | 77 | exit "${code}" 78 | } 79 | 80 | #DECLARE FLAGS AND VARIABLES 81 | cwd="$(pwd)" 82 | input_file="Input_file" 83 | coverage_cutoff_input=100 84 | 85 | #PARSE VARIABLE ARGUMENTS WITH getops 86 | #common example with letters, for long options check longopts2getopts.sh 87 | options=":i:b:o:c:s:vh" 88 | while getopts $options opt; do 89 | case $opt in 90 | i ) 91 | input_file=$OPTARG 92 | ;; 93 | b ) 94 | coverage_file=$OPTARG 95 | ;; 96 | o ) 97 | output_dir=$OPTARG 98 | ;; 99 | c ) 100 | if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then 101 | echo "please, provide a percentage between 0 and 100" 102 | usage 103 | exit 1 104 | else 105 | coverage_cutoff_input=$OPTARG 106 | fi 107 | ;; 108 | s ) 109 | suffix=$OPTARG 110 | ;; 111 | h ) 112 | usage 113 | exit 1 114 | ;; 115 | v ) 116 | echo $VERSION 117 | exit 1 118 | ;; 119 | \?) 120 | echo "Invalid Option: -$OPTARG" 1>&2 121 | usage 122 | exit 1 123 | ;; 124 | : ) 125 | echo "Option -$OPTARG requires an argument." >&2 126 | exit 1 127 | ;; 128 | * ) 129 | echo "Unimplemented option: -$OPTARG" >&2; 130 | exit 1 131 | ;; 132 | 133 | esac 134 | done 135 | shift $((OPTIND-1)) 136 | 137 | #================================================================ 138 | # MAIN_BODY 139 | #================================================================ 140 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS 141 | 142 | echo -e "\n#Executing" $0 "\n" 143 | 144 | check_mandatory_files.sh $input_file 145 | 146 | suffix="_clustered" 147 | coverage_cutoff=$(echo "(1 - ($coverage_cutoff_input/100))" | bc -l) 148 | 149 | if [ ! 
$output_dir ]; then 150 | output_dir=$(dirname $input_file) 151 | #echo "Default output directory is" $output_dir 152 | mkdir -p $output_dir 153 | else 154 | #echo "Output directory is" $output_dir 155 | mkdir -p $output_dir 156 | fi 157 | 158 | 159 | if [ ! $file_name ]; then 160 | file_name=$(basename $input_file) 161 | coverage_name=$(basename $coverage_file) 162 | fi 163 | 164 | echo "$(date)" 165 | echo "extracting coverage info from clustered sequences in" $file_name 166 | 167 | ac_input_file=$(cat $input_file | grep ">" | awk '{gsub(">","");print $1}') 168 | 169 | for i in $ac_input_file ;do 170 | awk ' 171 | /^'"$i"'/ 172 | ' $coverage_file 173 | done > $output_dir/$coverage_name$suffix || error ${LINENO} $(basename $0) "Awk command error in $coverage_name$suffix creation. See $output_dir/logs for more information." 174 | 175 | 176 | awk ' 177 | {if ($2 == 0 && $5 <= '"${coverage_cutoff}"') 178 | {print $1}} 179 | ' $output_dir/$coverage_name$suffix > $output_dir/$coverage_name$suffix"_ac" || error ${LINENO} $(basename $0) "Awk command error in $coverage_name$suffix\"_ac\" creation. See $output_dir/logs for more information." 180 | 181 | 182 | awk ' 183 | {if ($2 == 0 && $5 <= '"${coverage_cutoff}"') 184 | {print $1, ((1 - $5)*100)} 185 | } 186 | ' $output_dir/$coverage_name$suffix > $output_dir/$coverage_name$suffix"_percentage" || error ${LINENO} $(basename $0) "Awk command error in $coverage_name$suffix\"_percentage\" creation. See $output_dir/logs for more information." 
187 | 188 | echo "$(date)" 189 | echo "DONE extracting coverage info from clustered sequences in" $file_name 190 | echo -e "Info can be found at" $coverage_name$suffix"_ac and" "\n" $coverage_name$suffix"_percentage" "\n" 191 | -------------------------------------------------------------------------------- /bin/prokka_annotation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #set -e 4 | 5 | #============================================================= 6 | # HEADER 7 | #============================================================= 8 | 9 | #INSTITUTION:ISCIII 10 | #CENTRE:BU-ISCIII 11 | #AUTHOR: Pedro J. Sola 12 | VERSION=1.0 13 | #CREATED: 30 April 2018 14 | #REVISION: 15 | #12 June 2018: Handled cleaning process without hard coded paths 16 | # 17 | #DESCRIPTION:Script that uses prokka to annotate a FASTA file 18 | # 19 | #DOCUMENTATION 20 | # 21 | #Prokka outputs the fasta headers as: 22 | # gnl|center|locustag_01 23 | # gnl|center|locustag_02 24 | # 25 | #TO DO: 26 | #Handle cleaning [v] 27 | # 28 | #================================================================ 29 | # END_OF_HEADER 30 | #================================================================ 31 | 32 | #SHORT USAGE RULES 33 | #LONG USAGE FUNCTION 34 | usage() { 35 | cat << EOF 36 | 37 | Prokka_annotation is a script that uses prokka to annotate a FASTA file 38 | 39 | usage : $0 <-i inputfile(FASTA)> <-p prefix> [-o ] [-k ] 40 | [-T ] [-g group_name][-G genus] [-S species] [-c] [-v] [-h] 41 | 42 | -i input file in FASTA format 43 | -o output directory 44 | -p prefix for sample identification (mandatory) and output file name 45 | -k kingdom (Bacteria by default) 46 | -g group name (optional). 
If unset, samples will be gathered in NO_GROUP group 47 | -G sample genus in case is known by user 48 | -S sample species in case is known by user 49 | -c clean:remove files other than gff and renamed fasta 50 | -T number of threads 51 | -v version 52 | -h display usage message 53 | 54 | 55 | Output directory is the same as input directory by default 56 | 57 | example: prokka_annotation -i ecoli.fasta -p ECO -T 5 58 | 59 | 60 | EOF 61 | } 62 | 63 | 64 | #================================================================ 65 | # OPTION_PROCESSING 66 | #================================================================ 67 | #Make sure the script is executed with arguments 68 | if [ $# = 0 ] ; then 69 | usage >&2 70 | exit 1 71 | fi 72 | 73 | # Error handling 74 | error(){ 75 | local parent_lineno="$1" 76 | local script="$2" 77 | local message="$3" 78 | local code="${4:-1}" 79 | 80 | RED='\033[0;31m' 81 | NC='\033[0m' 82 | 83 | if [[ -n "$message" ]] ; then 84 | echo -e "\n---------------------------------------\n" 85 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" 86 | echo -e "MESSAGE:\n" 87 | echo -e "$message" 88 | echo -e "\n---------------------------------------\n" 89 | else 90 | echo -e "\n---------------------------------------\n" 91 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" 92 | echo -e "\n---------------------------------------\n" 93 | fi 94 | 95 | exit "${code}" 96 | } 97 | 98 | #DECLARE FLAGS AND VARIABLES 99 | cwd="$(pwd)" 100 | group="NO_GROUP" 101 | input_file="Input_file" 102 | kingdom="Bacteria" 103 | clean=false 104 | genus="" 105 | species="" 106 | threads=1 107 | 108 | #PARSE VARIABLE ARGUMENTS WITH getops 109 | #common example with letters, for long options check longopts2getopts.sh 110 | options=":i:o:p:k:g:G:S:T:cvh" 111 | while getopts $options opt; do 112 | case $opt in 113 | i ) 114 | input_file=$OPTARG 115 | ;; 116 | 
117 | o ) 118 | output_dir=$OPTARG 119 | ;; 120 | p) 121 | prefix=$OPTARG 122 | file_name=$OPTARG 123 | ;; 124 | k ) 125 | kingdom=$OPTARG 126 | ;; 127 | g ) 128 | group=$OPTARG 129 | ;; 130 | S ) 131 | species=$OPTARG 132 | ;; 133 | G) 134 | genus=$OPTARG 135 | ;; 136 | c ) 137 | clean=true 138 | ;; 139 | T) 140 | threads=$OPTARG 141 | ;; 142 | 143 | h ) 144 | usage 145 | exit 1 146 | ;; 147 | v ) 148 | echo $VERSION 149 | exit 1 150 | ;; 151 | \?) 152 | echo "Invalid Option: -$OPTARG" 1>&2 153 | usage 154 | exit 1 155 | ;; 156 | : ) 157 | echo "Option -$OPTARG requires an argument." >&2 158 | exit 1 159 | ;; 160 | * ) 161 | echo "Unimplemented option: -$OPTARG" >&2; 162 | exit 1 163 | ;; 164 | 165 | esac 166 | done 167 | shift $((OPTIND-1)) 168 | 169 | #================================================================ 170 | # MAIN_BODY 171 | #================================================================ 172 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS 173 | 174 | echo -e "\n#Executing" $0 "\n" 175 | 176 | check_mandatory_files.sh $input_file 177 | 178 | check_dependencies.sh prokka 179 | echo "PREFIX" $prefix 180 | 181 | if [ ! $prefix ]; then 182 | echo "please provide a prefix" 183 | exit 1 184 | fi 185 | 186 | if [ ! $output_dir ]; then 187 | output_dir=$(dirname $input_file) 188 | echo "Default output directory is" $output_dir 189 | mkdir -p $output_dir 190 | else 191 | echo "Output directory is" $output_dir 192 | mkdir -p $output_dir 193 | fi 194 | 195 | if [ ! 
$file_name ]; then 196 | file_name=$(basename $input_file) 197 | echo "filename is" $file_name 198 | fi 199 | 200 | 201 | ##PROKKA EXECUTION 202 | 203 | echo "$(date)" 204 | echo "Annotating $input_file with prokka" 205 | 206 | prokka --force --outdir $output_dir \ 207 | --prefix $prefix \ 208 | --addgenes \ 209 | --kingdom $kingdom \ 210 | --genus $genus \ 211 | --species $species \ 212 | --usegenus \ 213 | --centre BU-ISCIII \ 214 | --locustag $prefix \ 215 | --addgenes \ 216 | --cpus $threads \ 217 | $input_file #|| error ${LINENO} $(basename $0) "Prokka command failed. See $output_dir/logs for more information." 218 | 219 | echo "$(date)" 220 | echo "done annotating $input_file with prokka" 221 | 222 | ##CLEAN FILES THAT WILL NOT BE USED IN PLASMIDID 223 | 224 | if [ $clean = true ]; then 225 | 226 | echo "Removing unwanted files" 227 | for i in $(ls $output_dir/$prefix.??? | awk '!/fna|gff|log|err|gb/') 228 | do 229 | rm $i 230 | done 231 | fi 232 | 233 | echo -e "\n" 234 | -------------------------------------------------------------------------------- /bin/quality_trim.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | #============================================================= 6 | # HEADER 7 | #============================================================= 8 | 9 | #INSTITUTION:ISCIII 10 | #CENTRE:BU-ISCIII 11 | #AUTHOR: Pedro J. 
Sola 12 | VERSION=1.0 13 | #CREATED: 21 May 2018 14 | #REVISION: 15 | #DESCRIPTION:Script that execute trimmomatic to filter by quality 16 | # 17 | # 18 | #================================================================ 19 | # END_OF_HEADER 20 | #================================================================ 21 | 22 | 23 | usage() { 24 | cat << EOF 25 | 26 | quality_trim script execute trimmomatic to filter by quality 27 | 28 | usage : $0 <-1 R1 file> <-2 R2 file> [-o ] [-d ] <-s sample_name> 29 | [-a adapter_file] [-g group_name] [-f ] [-l ] [-M ] [-T ][-v] [-h] 30 | 31 | -1 R1 file (mandatory) 32 | -2 R2 file (mandatory) 33 | -d directory where trimmomatic is installed, default: /opt/Trimmomatic/ 34 | -a adapters to remove, default: TruSeq3-PE.fa 35 | -o output directory (optional) 36 | -f file name 37 | -l minimus length of trimmed reads (default 40) 38 | -s sample name (mandatory) 39 | -g group name (optional). If unset, samples will be gathered in NO_GROUP group 40 | -M RAM memmory (Gb), default 8 41 | -T threads, default 1 42 | -v version 43 | -h display usage message 44 | 45 | example: ./quality_trim.sh -1 ecoli_R1.fastq.gz -2 ecoli_R2.fastq.gz -s ECO232 -g ENTERO -T 8 46 | 47 | EOF 48 | } 49 | 50 | #================================================================ 51 | # OPTION_PROCESSING 52 | #================================================================ 53 | #Make sure the script is executed with arguments 54 | if [ $# = 0 ] ; then 55 | usage >&2 56 | exit 1 57 | fi 58 | 59 | # Error handling 60 | error(){ 61 | local parent_lineno="$1" 62 | local script="$2" 63 | local message="$3" 64 | local code="${4:-1}" 65 | 66 | RED='\033[0;31m' 67 | NC='\033[0m' 68 | 69 | if [[ -n "$message" ]] ; then 70 | echo -e "\n---------------------------------------\n" 71 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" 72 | echo -e "MESSAGE:\n" 73 | echo -e "$message" 74 | echo -e 
"\n---------------------------------------\n" 75 | else 76 | echo -e "\n---------------------------------------\n" 77 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" 78 | echo -e "\n---------------------------------------\n" 79 | fi 80 | 81 | exit "${code}" 82 | } 83 | 84 | #DECLARE FLAGS AND VARIABLES 85 | cwd="$(pwd)" 86 | group="NO_GROUP" 87 | r1_file="R1_file" 88 | r2_file="R2_file" 89 | trimmomatic_directory=/opt/Trimmomatic/ 90 | adapter_file="TruSeq3-PE.fa" 91 | minimus_length=40 92 | max_mem=8 93 | threads=1 94 | 95 | #PARSE VARIABLE ARGUMENTS WITH getops 96 | #common example with letters, for long options check longopts2getopts.sh 97 | options=":1:2:o:f:d:a:s:g:l:n:M:T:vh" 98 | while getopts $options opt; do 99 | case $opt in 100 | 1 ) 101 | r1_file=$OPTARG 102 | ;; 103 | 2 ) 104 | r2_file=$OPTARG 105 | ;; 106 | o ) 107 | output_dir=$OPTARG 108 | ;; 109 | f ) 110 | file_name=$OPTARG 111 | ;; 112 | s ) 113 | sample=$OPTARG 114 | ;; 115 | d) 116 | trimmomatic_directory=$OPTARG 117 | ;; 118 | a) 119 | adapter_file=$OPTARG 120 | ;; 121 | l) 122 | minimus_length=$OPTARG 123 | ;; 124 | g) 125 | group=$OPTARG 126 | ;; 127 | M ) 128 | max_mem=$OPTARG 129 | ;; 130 | T ) 131 | threads=$OPTARG 132 | ;; 133 | h ) 134 | usage 135 | exit 1 136 | ;; 137 | v ) 138 | echo $VERSION 139 | exit 1 140 | ;; 141 | \?) 142 | echo "Invalid Option: -$OPTARG" 1>&2 143 | usage 144 | exit 1 145 | ;; 146 | : ) 147 | echo "Option -$OPTARG requires an argument." 
>&2 148 | exit 1 149 | ;; 150 | * ) 151 | echo "Unimplemented option: -$OPTARG" >&2; 152 | exit 1 153 | ;; 154 | 155 | esac 156 | done 157 | shift $((OPTIND-1)) 158 | 159 | 160 | 161 | #================================================================ 162 | # MAIN_BODY 163 | #================================================================ 164 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS 165 | 166 | echo -e "\n#Executing" $0 "\n" 167 | 168 | check_mandatory_files.sh $r1_file $r2_file 169 | 170 | check_dependencies.sh trimmomatic 171 | 172 | if [ ! $sample ]; then 173 | echo "Please include a sample name" 174 | exit 1 175 | fi 176 | 177 | 178 | if [ ! $output_dir ]; then 179 | output_dir="$group/$sample/trimmed" 180 | echo "Default output directory is" $output_dir 181 | mkdir -p $output_dir 182 | else 183 | echo "Output directory is" $output_dir 184 | mkdir -p $output_dir 185 | fi 186 | 187 | if [ ! $filename ]; then 188 | filename=$sample 189 | fi 190 | 191 | 192 | #trimmomatic_executable=$(find $trimmomatic_directory -type f -name "trimmomatic*.jar" | awk 'NR==1') 193 | 194 | trimmomatic_path=$(whereis trimmomatic | cut -d " " -f 2 | cut -d "/" -f 1,2,3,4,5,6) 195 | trimmomatic_adapter=$(find $trimmomatic_path -type f -name $adapter_file | awk 'NR==1') 196 | 197 | echo "$(date)" 198 | echo "Quality trimming:" 199 | echo "R1 = " $r1_file 200 | echo "R2 = " $r2_file 201 | 202 | trimmomatic PE -threads $threads \ 203 | $r1_file \ 204 | $r2_file \ 205 | $output_dir/$sample"_1_paired.fastq.gz" \ 206 | $output_dir/$sample"_1_unpaired.fastq.gz" \ 207 | $output_dir/$sample"_2_paired.fastq.gz" \ 208 | $output_dir/$sample"_2_unpaired.fastq.gz" \ 209 | ILLUMINACLIP:$trimmomatic_adapter:2:30:10 SLIDINGWINDOW:4:20 MINLEN:$minimus_length || error ${LINENO} $(basename $0) "Trimmomatic command failed. See $output_dir/logs for more information." 
echo "$(date)"
echo "DONE quality trimming, file can be fount at:"
echo $output_dir/$sample"_1_paired.fastq.gz"
echo $output_dir/$sample"_1_unpaired.fastq.gz"
echo $output_dir/$sample"_2_paired.fastq.gz"
echo $output_dir/$sample"_2_unpaired.fastq.gz"
echo -e "\n"
--------------------------------------------------------------------------------
/bin/rename_from_fasta.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Exit immediately if a pipeline, which may consist of a single simple command, a list,
# or a compound command returns a non-zero status: If errors are not handled by user
set -e
#set -x

#=============================================================
# HEADER
#=============================================================

#INSTITUTION:ISCIII
#CENTRE:BU-ISCIII
#AUTHOR: Pedro J. Sola
VERSION=1.0
#CREATED: 06 June 2018
#REVISION:
#DESCRIPTION:rename_from_fasta script renames names found in a file. The Nth
#            header of the "new" fasta (-2) is replaced by the Nth header of
#            the "original" fasta (-1), so both files must list the same
#            sequences in the same order.

#================================================================
# END_OF_HEADER
#================================================================


# Print the help text to stdout.
usage() {
cat << EOF

rename_from_fasta script rename any field in a file by either providing two fasta files or a dictionary file

usage : $0 <-i file_to_rename> <-1 original_fasta> <-2 new_fasta> [-d dictionary_file] [-o output_dir] [-f file_name] [-v] [-h]

-i input file to rename
-1 original fasta file whose names will be finally printed (mandatory)
-2 new fasta file whose names will be replaced (mandatory)
-o output directory (optional). By default the file is replaced in the same location
-f output file name (".renamed" will be added at the end)
-d dictionary file (accepted but not implemented yet: -1 and -2 are required)
-v version
-h display usage message

example: rename_from_fasta.sh -i ecoli.coverage -1 ecoli_original.fasta -2 ecoli_renamed.fasta

EOF
}

#================================================================
# OPTION_PROCESSING
#================================================================
#Make sure the script is executed with arguments
if [ $# = 0 ] ; then
  usage >&2
  exit 1
fi

# Error handling: print a framed error report and exit.
#   $1 line number where the failure was detected
#   $2 script name
#   $3 optional message
#   $4 optional exit status (default 1)
error(){
  local parent_lineno="$1"
  local script="$2"
  local message="$3"
  local code="${4:-1}"

  RED='\033[0;31m'
  NC='\033[0m'

  if [[ -n "$message" ]] ; then
    echo -e "\n---------------------------------------\n"
    echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
    echo -e "MESSAGE:\n"
    echo -e "$message"
    echo -e "\n---------------------------------------\n"
  else
    echo -e "\n---------------------------------------\n"
    echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
    echo -e "\n---------------------------------------\n"
  fi

  exit "${code}"
}

#DECLARE FLAGS AND VARIABLES
cwd="$(pwd)"
input_file="Input_file"

#PARSE VARIABLE ARGUMENTS WITH getops
#common example with letters, for long options check longopts2getopts.sh
options=":i:1:2:f:o:d:vh"
while getopts "$options" opt; do
  case $opt in
    i )
      input_file=$OPTARG
      ;;
    1 )
      fasta_file_old=$OPTARG
      ;;
    2 )
      fasta_file_new=$OPTARG
      ;;
    d )
      dictionary_file_new=$OPTARG
      ;;
    o )
      output_dir=$OPTARG
      ;;
    f )
      file_name=$OPTARG
      ;;
    h )
      usage
      exit 1
      ;;
    v )
      echo $VERSION
      exit 1
      ;;
    \?)
      echo "Invalid Option: -$OPTARG" 1>&2
      usage
      exit 1
      ;;
    : )
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
    * )
      echo "Unimplemented option: -$OPTARG" >&2;
      exit 1
      ;;

  esac
done
shift $((OPTIND-1))

#================================================================
# MAIN_BODY
#================================================================
##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS

echo -e "\n#Executing" "$0" "\n"

check_mandatory_files.sh "$input_file"

# The dictionary is always built from the two fasta files. Without them the
# script used to die inside "basename" (set -e) with no useful message.
if [[ -z "$fasta_file_old" || -z "$fasta_file_new" ]]; then
  if [[ -n "$dictionary_file_new" ]]; then
    error "${LINENO}" "$(basename "$0")" "Dictionary mode (-d) is not implemented; please supply both fasta files with -1 and -2."
  fi
  error "${LINENO}" "$(basename "$0")" "Both -1 (original fasta) and -2 (new fasta) are required."
fi

for fasta_file in "$fasta_file_old" "$fasta_file_new"; do
  if [[ ! -r "$fasta_file" ]]; then
    error "${LINENO}" "$(basename "$0")" "Fasta file $fasta_file does not exist or is not readable."
  fi
done

if [[ -z "$output_dir" ]]; then
  output_dir=$(dirname "$input_file")
  echo "Default output directory is" "$output_dir"
  mkdir -p "$output_dir"
else
  echo "Output directory is" "$output_dir"
  mkdir -p "$output_dir"
fi


if [[ -z "$file_name" ]]; then
  file_name=$(basename "$input_file" | cut -d "." -f1,2)
fi

# Intermediate files are prefixed with the output file name. Naming them after
# the fasta basenames made two fastas with the same basename overwrite each
# other, and a fixed "dictionary.txt" could clobber (and then delete) a user file.
old_names_tmp="$output_dir/$file_name.old.ac.tmp"
new_names_tmp="$output_dir/$file_name.new.ac.tmp"
nopipe_tmp="$output_dir/$file_name.nopipe.tmp"
dictionary_tmp="$output_dir/$file_name.dictionary.tmp"

echo "$(date)"
echo "Renaming" "$file_name"

# Extract the first word of every header and turn "|" into "-", because "|"
# would act as alternation when the names are later used as regular expressions.
awk '/>/ {print $1}' "$fasta_file_old" | sed 's/>//g' | sed 's/|/-/g' > "$old_names_tmp"
awk '/>/ {print $1}' "$fasta_file_new" | sed 's/>//g' | sed 's/|/-/g' > "$new_names_tmp"
sed 's/|/-/g' "$input_file" > "$nopipe_tmp"

# Names are paired by position, so a different number of headers means that at
# least part of the dictionary is wrong. Warn, but keep the previous behaviour.
old_count=$(wc -l < "$old_names_tmp")
new_count=$(wc -l < "$new_names_tmp")
if [[ "$old_count" -ne "$new_count" ]]; then
  echo "WARNING: $fasta_file_old has $old_count sequences but $fasta_file_new has $new_count; names are paired by position" >&2
fi

#Paste colums to relate names in a dictionary
# Column 2 ends in a literal "\t": used as a dynamic regex below, it only
# matches the new name when it is followed by a tab (avoids prefix matches).
awk 'NR==FNR{ac[NR]=$0;next}{print ac[FNR], "\t", $0"\\t" }' "$old_names_tmp" "$new_names_tmp" > "$dictionary_tmp" \
  || error "${LINENO}" "$(basename "$0")" "AWK command failed in dictionary creation. See $output_dir/logs for more information."

#Rename fields
awk 'FNR==NR {dict[$2]=$1"\t"; next} {for (i in dict) gsub(i, dict[i])}1' "$dictionary_tmp" "$nopipe_tmp" > "$output_dir/$file_name.renamed" \
  || error "${LINENO}" "$(basename "$0")" "AWK command failed in $file_name\".renamed\" creation. See $output_dir/logs for more information."


rm -- "$old_names_tmp"
rm -- "$new_names_tmp"
rm -- "$nopipe_tmp"
rm -- "$dictionary_tmp"

echo "$(date)"
echo "DONE renaming" "$file_name"
echo -e "Renamed file can be found at" "$output_dir/$file_name.renamed"

--------------------------------------------------------------------------------
/bin/sam_to_bam.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Exit immediately if a pipeline, which may consist of a single simple command, a list,
#or a compound command returns a non-zero status: If errors are not handled by user
#set -e
# Treat unset variables and parameters other than the special parameters ‘@’ or ‘*’ as an error when performing parameter expansion.
# An error message will be written to the standard error, and a non-interactive shell will exit
#set -u
#Print everything as if it were executed, after substitution and expansion is applied: Debug|log option
#set -x

#=============================================================
# HEADER
#=============================================================

#INSTITUTION:ISCIII
#CENTRE:BU-ISCIII
#AUTHOR: Pedro J. Sola
VERSION=1.0
#CREATED: 19 March 2018
#REVISION:
#DESCRIPTION:Script that convert a supplied SAM file into compressed binary indexed BAM

#================================================================
# END_OF_HEADER
#================================================================

#SHORT USAGE RULES
#LONG USAGE FUNCTION
# Print the help text to stdout.
usage() {
cat << EOF

Sam_to_bam script converts a supplied SAM file into compressed binary indexed BAM

usage : $0 <-i inputfile(.sam)> [-o output_dir] [-s sample_name] [-g group_name] [-T threads] [-v] [-h]

-i input file
-o output directory (optional). By default the BAM file will replace SAM in the same location
-s sample name
-g group name (optional). If unset, samples will be gathered in NO_GROUP group
-T number of threads
-v version
-h display usage message

example: sam_to_bam.sh -i ecoli.sam

EOF
}

#================================================================
# OPTION_PROCESSING
#================================================================
#Make sure the script is executed with arguments
# ($# is the argument count; the previous "$?" test could never be true)
if [ $# = 0 ] ; then
  usage >&2
  exit 1
fi

# Error handling: print a framed error report and exit.
#   $1 line number where the failure was detected
#   $2 script name
#   $3 optional message
#   $4 optional exit status (default 1)
error(){
  local parent_lineno="$1"
  local script="$2"
  local message="$3"
  local code="${4:-1}"

  RED='\033[0;31m'
  NC='\033[0m'

  if [[ -n "$message" ]] ; then
    echo -e "\n---------------------------------------\n"
    echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
    echo -e "MESSAGE:\n"
    echo -e "$message"
    echo -e "\n---------------------------------------\n"
  else
    echo -e "\n---------------------------------------\n"
    echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
    echo -e "\n---------------------------------------\n"
  fi

  exit "${code}"
}

#DECLARE FLAGS AND VARIABLES
threads=1
cwd="$(pwd)"
group="NO_GROUP"
input_file="Input_file"

#PARSE VARIABLE ARGUMENTS WITH getops
#common example with letters, for long options check longopts2getopts.sh
# "T:" was missing, so the documented -T flag was rejected as an invalid option
options=":i:o:s:g:T:vh"
while getopts "$options" opt; do
  case $opt in
    i )
      input_file=$OPTARG
      ;;
    o )
      output_dir=$OPTARG
      ;;
    s )
      sample=$OPTARG
      ;;
    g)
      group=$OPTARG
      ;;
    T )
      threads=$OPTARG
      ;;
    h )
      usage
      exit 1
      ;;
    v )
      echo $VERSION
      exit 1
      ;;
    \?)
      echo "Invalid Option: -$OPTARG" 1>&2
      usage
      exit 1
      ;;
    : )
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
    * )
      echo "Unimplemented option: -$OPTARG" >&2;
      exit 1
      ;;

  esac
done
shift $((OPTIND-1))


#================================================================
# MAIN_BODY
#================================================================
##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS

echo -e "\n#Executing" "$0" "\n"

check_mandatory_files.sh "$input_file"

check_dependencies.sh samtools

if ! [[ "$threads" =~ ^[1-9][0-9]*$ ]]; then
  error "${LINENO}" "$(basename "$0")" "Number of threads (-T) must be a positive integer, got '$threads'."
fi
# samtools -@ counts threads in addition to the main one, so the default
# (-T 1) maps to "-@ 0", which is what samtools does when -@ is not given.
extra_threads=$(( threads - 1 ))

if [[ -z "$output_dir" ]]; then
  output_dir=$(dirname "$input_file")
  echo "Default output directory is" "$output_dir"
  mkdir -p "$output_dir"
else
  echo "Output directory is" "$output_dir"
  mkdir -p "$output_dir"
fi

if [[ -z "$sample" ]]; then
  sample=$(basename "$input_file" | cut -d. -f1)
fi

unsorted_bam="$output_dir/$sample.bam"
sorted_bam="$output_dir/$sample.sorted.bam"

########SAM_TO_BAM##########
############################

if [[ -f "$sorted_bam" && -f "$sorted_bam.bai" ]]; then
  echo "Found a sorted .BAM file for sample" "$sample"
  echo "Omitting BAM to SAM convertion"
else
  echo "$(date)"
  echo "Converting SAM to sorted indexed BAM in $sample"

  samtools view \
    -@ "$extra_threads" \
    -Sb \
    -o "$unsorted_bam" \
    "$input_file" || error "${LINENO}" "$(basename "$0")" "Samtools view command failed. See $output_dir/logs for more information."

  echo "$(date)"
  echo "Sorting BAM file in $sample"

  samtools sort \
    -@ "$extra_threads" \
    -T "$sorted_bam" \
    -o "$sorted_bam" \
    "$unsorted_bam" || error "${LINENO}" "$(basename "$0")" "Samtools sort command failed. See $output_dir/logs for more information."

  echo "$(date)"
  echo "Indexing BAM file in $sample"

  samtools index \
    "$sorted_bam" || error "${LINENO}" "$(basename "$0")" "Samtools index command failed. See $output_dir/logs for more information."

  echo "$(date)"
  echo "DONE Converting SAM to sorted indexed BAM in $sample"
fi

# Only the sorted BAM is kept: drop the SAM (when it lives in the output
# directory) and the unsorted intermediate BAM.
if [[ -f "$output_dir/$sample.sam" ]]; then
  echo "$sample.sam" "removed"
  rm -- "$output_dir/$sample.sam"
fi

if [[ -f "$unsorted_bam" ]]; then
  echo "$sample.bam" "removed"
  rm -- "$unsorted_bam"
fi

echo -e "\n"

--------------------------------------------------------------------------------
/bin/spades_assembly.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -e
#set -x

#=============================================================
# HEADER
#=============================================================

#INSTITUTION:ISCIII
#CENTRE:BU-ISCIII
#AUTHOR: Pedro J. Sola
VERSION=1.0
#CREATED: 21 May 2018
#REVISION:
#DESCRIPTION:Script that assemble illumina sequences using SPAdes
#
#
#================================================================
# END_OF_HEADER
#================================================================


# Print the help text to stdout.
usage() {
cat << EOF

spades_assembly script that assemble illumina sequences using SPAdes

usage : $0 <-p R1_paired file> <-P R2_paired file> [-o output_dir]
[-k kmers][-s sample_name] [-g group_name] [-f file_name] [-T threads] [-q reads_directory] [-c] [-v] [-h]

-p R1_paired file (mandatory)
-P R2_paired file (mandatory)
-k kmers, supplied as numbers separated by commas or one flag per number, default: 21,33,55,77,99,127
-o output directory (optional)
-f file name
-s sample name (mandatory)
-g group name (optional). If unset, samples will be gathered in NO_GROUP group
-q quick_mode: look for files in a folder SUPPLIED with "paired" term
-c clean mode: remove unnecesary temporary folders
-T threads, default 1
-v version
-h display usage message

example: ./spades_assembly.sh -p ecoli_R1_paired.fastq.gz -P ecoli_R2_paired.fastq.gz -c

EOF
}


#================================================================
# OPTION_PROCESSING
#================================================================
#Make sure the script is executed with arguments
if [ $# = 0 ] ; then
  usage >&2
  exit 1
fi

# Error handling: print a framed error report and exit.
#   $1 line number where the failure was detected
#   $2 script name
#   $3 optional message
#   $4 optional exit status (default 1)
error(){
  local parent_lineno="$1"
  local script="$2"
  local message="$3"
  local code="${4:-1}"

  RED='\033[0;31m'
  NC='\033[0m'

  if [[ -n "$message" ]] ; then
    echo -e "\n---------------------------------------\n"
    echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
    echo -e "MESSAGE:\n"
    echo -e "$message"
    echo -e "\n---------------------------------------\n"
  else
    echo -e "\n---------------------------------------\n"
    echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
    echo -e "\n---------------------------------------\n"
  fi

  exit "${code}"
}

#DECLARE FLAGS AND VARIABLES
cwd="$(pwd)"
group="NO_GROUP"
r1_paired_file="R1_paired_file"
r2_paired_file="R2_paired_file"
threads=1
kmer_values_command="21,33,55,77,99,127"
kmer_option=false
quick_mode=false
clean_mode=false

#PARSE VARIABLE ARGUMENTS WITH getops
#common example with letters, for long options check longopts2getopts.sh
# Letters listed here without a case arm (u, U, d, a) end in "Unimplemented option".
# The former "l" and "M" arms were unreachable (not in this string) and were removed.
options=":p:P:u:U:o:f:d:a:s:g:k:T:q:cvh"
while getopts "$options" opt; do
  case $opt in
    p )
      r1_paired_file=$OPTARG
      ;;
    P )
      r2_paired_file=$OPTARG
      ;;
    o )
      output_dir=$OPTARG
      ;;
    f )
      file_name=$OPTARG
      ;;
    s )
      sample=$OPTARG
      ;;
    k)
      kmer_value+=($OPTARG)
      kmer_option=true
      ;;
    q)
      directory_reads=$OPTARG
      quick_mode=true
      ;;
    g)
      group=$OPTARG
      ;;
    c)
      clean_mode=true
      ;;
    T )
      threads=$OPTARG
      ;;
    h )
      usage
      exit 1
      ;;
    v )
      echo $VERSION
      exit 1
      ;;
    \?)
      echo "Invalid Option: -$OPTARG" 1>&2
      usage
      exit 1
      ;;
    : )
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
    * )
      echo "Unimplemented option: -$OPTARG" >&2;
      exit 1
      ;;

  esac
done
shift $((OPTIND-1))



#================================================================
# MAIN_BODY
#================================================================
##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS

echo -e "\n#Executing" "$0" "\n"

check_dependencies.sh spades.py


if [[ -z "$directory_reads" ]]; then
  directory_reads=$(dirname "$r1_paired_file")
  echo "Reads directory is" "$directory_reads"
else
  # Quick mode: the output directory is always <parent of reads dir>/assembly,
  # overriding any -o value (behaviour kept from the original script).
  echo "Reads directory for quick mode is" "$directory_reads"
  sample_dir=$(dirname "$directory_reads")
  output_dir=$sample_dir"/assembly"
  mkdir -p "$output_dir"
fi


if [[ -z "$output_dir" ]]; then
  sample_dir=$(dirname "$directory_reads")
  output_dir=$sample_dir"/assembly"
  echo "Default output directory is" "$output_dir"
  mkdir -p "$output_dir"
else
  echo "Output directory is" "$output_dir"
  mkdir -p "$output_dir"
fi


if [[ "$quick_mode" = true ]]; then
  echo "Entering QUICK MODE"
  r1_paired_file=$(find "$directory_reads" -name "*1_paired.fastq.gz" -type f)
  r2_paired_file=$(find "$directory_reads" -name "*2_paired.fastq.gz" -type f)

  # Exactly one file per mate is expected; zero or several matches used to
  # reach spades.py as a malformed command line.
  for found_reads in "$r1_paired_file" "$r2_paired_file"; do
    if [[ -z "$found_reads" || "$found_reads" == *$'\n'* ]]; then
      error "${LINENO}" "$(basename "$0")" "Quick mode needs exactly one *1_paired.fastq.gz and one *2_paired.fastq.gz file in $directory_reads."
    fi
  done
fi


check_mandatory_files.sh "$r1_paired_file" "$r2_paired_file"

if [[ "$kmer_option" = true ]]; then
  # Intentionally unquoted: values given as "21 33" are split into separate
  # k-mers, exactly as before, and everything is joined with commas.
  # shellcheck disable=SC2068
  kmer_values_command=$(printf "%s," ${kmer_value[@]} | sed 's/,$//g')
fi


echo "$(date)"
echo "Assembly:"
echo "R1 paired file = " "$r1_paired_file"
echo "R2 paired file = " "$r2_paired_file"


spades.py \
  --careful \
  -t "$threads" \
  -k "$kmer_values_command" \
  --pe1-1 "$r1_paired_file" \
  --pe1-2 "$r2_paired_file" \
  -o "$output_dir" || error "${LINENO}" "$(basename "$0")" "Spades command failed. See $output_dir/logs for more information."



echo "$(date)"
echo "DONE. Assembled contigs can be found at $output_dir/contigs.fasta:"
echo "DONE. Assembled scaffolds can be found at $output_dir/scaffolds.fasta:"

if [[ "$clean_mode" = true ]]; then
  echo "Removing unnecesary folders"
  # Delete the first-level sub-directories only; -exec keeps paths with spaces
  # intact and ${output_dir:?} aborts instead of scanning "" if it is empty.
  find "${output_dir:?}" -maxdepth 1 -mindepth 1 -type d -exec rm -rf -- {} +
  echo "DONE removing unwanted folders"
fi

echo -e "\n"

--------------------------------------------------------------------------------
/config_files/annotation_config_file.txt:
--------------------------------------------------------------------------------
#1. Fasta file for annotation
#2. Name given to this annotation
#3. Alignment %Identity necessary to include the sequence
#4. Alignment %Length necessary to include the sequence
#5. Query divisor for the sequence name. (ie. For name Inc_NC_632542_protein-description)
#6. Query field to represent (l:left|r:right) (ie. with divisor "_", left would be "Inc" and right "protein-description")
#7. Unique. Each sequence will be allowed only once per plasmid
#8. Double Unique. This field uses a provided separator to extract only the best match. (ie within OXA-11 and OXA-48, using "-" as separator will retrieve only one). Use n if not used.
#9. Color. Color used to represent this database (blue, green, grey, orange, purple, red, yellow). The prefixes vvl, vl, l, d, vd and vvd stand for very (v), light (l) and dark (d)

#DDBBFILE,NANE,P_IDENTITY,P_ALIGNMENT,Q_DIVISOR,Q_SIDE_LR,IS_UNIQUE,DOBLE_UNIQUE,COLOR,

#DEFAULTEXAMPLE: Copy and paste next line, change the file name, name of database and color.
Remove "#" 14 | #PATH/TO/FILE,NAME,95,90,_,l,n,n,nucl,COLOR 15 | 16 | #ANTIBIOTIC_RESISTANCE_ANNOTATION 17 | databases/ARGannot.pID.fasta,abr,98,90, ,r,y,-,nucl,lred 18 | #REPLISOME_ANNOTATION 19 | databases/plasmidFinder_01_26_2018.fsa,inc,95,80,_,l,y,n,nucl,lyellow 20 | -------------------------------------------------------------------------------- /config_files/circos_summary_1_3_0.conf: -------------------------------------------------------------------------------- 1 | ######## CIRCOS.CONF 2 | #################### 3 | 4 | karyotype = PLASMID_KARYOTYPE 5 | 6 | chromosome_units = 1000000 7 | chromosomes_display_default = yes 8 | #chromosomes_display_default = no 9 | #chromosomes = /NZ/ 10 | chromosomes_color = /./ = lblue 11 | #chromosomes_scale = /./ = 1rn 12 | #chromosomes_scale = eval(var(size)) < 100000 = 0.5r 13 | z=100 14 | 15 | 16 | #############################HIGHLIGHTS 17 | 18 | 19 | <> 20 | 21 | 22 | 23 | 24 | 25 | #############################PLOTS 26 | 27 | 28 | ############### COVERAGE 29 | 30 | type = histogram 31 | file = PLASMID_COVERAGE_GRAPH 32 | 33 | color = black 34 | r1 = 0.99r 35 | r0 = 0.90r 36 | extend_bin = no 37 | min= 0 38 | max= 500 39 | thickness = 2 40 | orientation = out 41 | 42 | # 43 | #show = data 44 | # 45 | #color = vvlgrey 46 | # 47 | # 48 | 49 | 50 | 51 | 52 | thickness = 1 53 | color = lgrey 54 | spacing = 50 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | condition = var(value) < 20 65 | color = lorange 66 | thickness = 3 67 | flow = continue 68 | 69 | 70 | 71 | condition = var(value) == 0 72 | color = red 73 | thickness = 3 74 | flow = continue 75 | 76 | 77 | 78 | condition = var(value) > 200 79 | color = green 80 | thickness = 3 81 | 82 | 83 | 84 | 85 | 86 | 87 | ############### /COVERAGE 88 | 89 | 90 | ############### TEXT_ADITIONAL_ANNOTATION 91 | 92 | type = text 93 | color = black 94 | label_font = bold 95 | label_size = 10p 96 | file = PLASMID_SPECIFIC_TEXT 97 | r1 = 0.85r+200p 98 | r0 = 0.80r 99 | orientation = 
center 100 | show_links = no 101 | 102 | margin = 0u 103 | label_parallel = no 104 | padding = 1p 105 | rpadding = 2p 106 | label_snuggle = yes 107 | max_snuggle_distance = 5r 108 | snuggle_sampling = 2 109 | snuggle_tolerance = 1r 110 | snuggle_link_overlap_test = yes 111 | snuggle_link_overlap_tolerance = 20p 112 | 113 | 114 | ############### /TEXT_ADITIONAL_ANNOTATION 115 | 116 | ############### TEXT_CDS_CONTIG 117 | 118 | type = text 119 | color = black 120 | label_font = default 121 | label_size = 9p 122 | file = PLASMID_CDS_CONTIG 123 | r1 = 0.80r 124 | r0 = 0.75r 125 | orientation = center 126 | show_links = yes 127 | label_parallel = no 128 | padding = 0p 129 | label_snuggle = yes 130 | max_snuggle_distance = 6r 131 | snuggle_sampling = 10 132 | snuggle_tolerance = 1r 133 | snuggle_link_overlap_test = yes 134 | snuggle_link_overlap_tolerance = 10p 135 | #snuggle_refine = yes 136 | 137 | # 138 | # 139 | #condition = var(value) =~ /CDS/ 140 | #show = no 141 | #flow = continue 142 | # 143 | # 144 | 145 | 146 | 147 | 148 | ############### /TEXT_CDS_CONTIG 149 | 150 | ############### CDS_CONTIGS_PROKKA 151 | 152 | type = tile 153 | file = PLASMID_CDS_CONTIG 154 | r1 = 0.75r 155 | r0 = 0.70r 156 | layers = 3 157 | layers_overflow = collapse 158 | margin = 10u 159 | thickness = 20 160 | padding = 10 161 | orientation = in 162 | stroke_thickness = 1 163 | stroke_color = vdgrey 164 | color = purple 165 | #units_ok = bupr 166 | #units_nounit = n 167 | 168 | 169 | ############### /CDS_CONTIGS_PROKKA 170 | 171 | ############### TEXT_CONTIG 172 | 173 | type = text 174 | #color = black 175 | label_font = bold 176 | 177 | label_size = 10p 178 | file = PLASMID_CONTIGS 179 | r1 = 0.70r 180 | r0 = 0.64r 181 | orientation = out 182 | show_links = yes 183 | label_parallel = yes 184 | padding = 10p 185 | margin = 10p 186 | label_snuggle = yes 187 | max_snuggle_distance = 10r 188 | snuggle_sampling = 10 189 | snuggle_tolerance = 1r 190 | snuggle_link_overlap_test = yes 191 | 
snuggle_link_overlap_tolerance = 500p 192 | #snuggle_refine = yes 193 | 194 | 195 | 196 | 197 | condition = var(id) =~ /(\d+)(\d+)(\d*)/ 198 | color = eval(my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 ))) 199 | flow = continue 200 | 201 | 202 | 203 | condition = var(id) =~ /(\d+)(\d+)(\d*)/ 204 | link_color = eval(my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 ))) 205 | flow = continue 206 | 207 | 208 | 209 | condition = var(size) < 1kb 210 | show = no 211 | 212 | 213 | 214 | 215 | ############### /TEXT_CONTIG 216 | 217 | ############### CONTIGS SPADES ALL 218 | 219 | type = tile 220 | file = PLASMID_CONTIGS 221 | r1 = 0.65r 222 | r0 = 0.6r 223 | layers = 4 224 | margin = 5u 225 | thickness = 20 226 | padding = 5 227 | layers_overflow = collapse 228 | orientation = out 229 | stroke_thickness = 0 230 | stroke_color = grey 231 | color = grey 232 | 233 | 234 | 235 | 236 | 237 | condition = var(id) =~ /(\d+)(\d+)(\d*)/ 238 | color = eval( my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 ))) 239 | flow = continue 240 | 241 | #importance = 100 242 | #condition = 1 243 | #color = eval(sprintf("spectral-11-div-%d",remap_int(NODE_%d%d,0,10e6,1,11))) 244 | #color = eval((qw(vvvlgrey vvlgrey vlgrey lgrey grey dgrey vdgrey vvdgrey))[var(id) % 8]) 245 | 246 | 247 | 248 | condition = var(size) < 1kb 249 | show = no 250 | 251 | 252 | 253 | 254 | 255 | ############### /CONTIGS SPADES ALL 256 | 257 | 258 | 259 | 260 | ######## LINKS 261 | ############## 262 | 263 | 264 | 265 | 266 | 267 | file = PLASMID_LINKS 268 | r1 = 0.50r 269 | r0 = 0r 270 | ribbon = yes 271 | flat = yes 272 | radius = 0.6r 273 | bezier_radius = 0.1r 274 | crest = 0.2 275 | color = lgrey_a4 276 | 277 | 278 | 279 | 280 | condition = var(intrachr) 281 | 
show = no 282 | 283 | 284 | 285 | importance = 110 286 | condition = var(size1) < 2kb 287 | show = no 288 | flow = continue 289 | 290 | 291 | 292 | importance = 110 293 | condition = var(size2) < 2kb 294 | show = no 295 | flow = continue 296 | 297 | 298 | 299 | 300 | condition = var(id) =~ /(\d+)(\d+)(\d*)/ 301 | color = eval( my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 ))) 302 | #"paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12) 303 | #"set3-12-qual-%d_a%d" 304 | #"rev(set3-12-qual-%d_a%d)" 305 | flow = continue 306 | 307 | 308 | 309 | condition = 1 310 | z = eval(average(-1*(var(size1),var(size2)))) 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | ######## IDEOGRAM 321 | ################# 322 | 323 | 324 | show = yes 325 | 326 | 327 | default = 5000u 328 | #when representing witout scaling 329 | #default = 1000u 330 | break = 500u 331 | 332 | 333 | chromosomes_color = dblue 334 | stroke_color = blue 335 | 336 | radius = 0.93r 337 | thickness = 25p 338 | fill = yes 339 | 340 | show_label = yes 341 | 342 | label_font = bold 343 | label_radius = dims(ideogram,radius_inner) 344 | #(dims(ideogram,radius_inner) + dims(ideogram,radius_outer))/2 345 | 346 | label_size = 17 347 | label_parallel = yes 348 | 349 | 350 | 351 | ######## TICKS 352 | ############## 353 | 354 | show_ticks = yes 355 | show_tick_labels = yes 356 | 357 | 358 | radius = dims(ideogram,radius_outer) 359 | color = black 360 | thickness = 2p 361 | 362 | #multiplier = 0.001 363 | 364 | 365 | #spacing = 1000u 366 | rspacing = 0.025 367 | multiplier = 0.001 368 | spacing_type = relative 369 | skip_first_label = yes 370 | skip_last_label = no 371 | size = 5p 372 | show_label = yes 373 | label_size = 20p 374 | #label_relative = yes 375 | suffix = " kb" 376 | #rdivisor = ideogram 377 | format = %d 378 | rmultiplier = 1 379 | 380 | 381 | 382 | 383 | # 384 | #spacing = 2000u 385 | #size = 15p 386 | 
#show_label = yes 387 | #label_size = 20p 388 | #labe_offset = 10p 389 | #suffix = " kb" 390 | #format = %d 391 | # 392 | 393 | 394 | 395 | ########COLORS 396 | ############## 397 | <> 398 | 399 | 400 | ########HOUSEKEEPING 401 | #################### 402 | <> 403 | max_points_per_track* = 8000000 404 | 405 | ########IMAGE 406 | ############# 407 | 408 | dir = OUTPUTDIR 409 | #dir = conf(configdir) 410 | file = IMAGENAME 411 | png = yes 412 | svg = no 413 | # radius of inscribed circle in image 414 | radius = 1900p 415 | # by default angle=0 is at 3 o'clock position 416 | angle_offset = -90 417 | #angle_orientation = counterclockwise 418 | auto_alpha_colors = yes 419 | auto_alpha_steps = 5 420 | 421 | 422 | -------------------------------------------------------------------------------- /config_files/circos_summary_1_3_3.conf: -------------------------------------------------------------------------------- 1 | ######## CIRCOS.CONF 2 | #################### 3 | 4 | karyotype = PLASMID_KARYOTYPE 5 | 6 | chromosome_units = 1000000 7 | chromosomes_display_default = yes 8 | #chromosomes_display_default = no 9 | #chromosomes = /NZ/ 10 | chromosomes_color = /./ = lblue 11 | #chromosomes_scale = /./ = 1rn 12 | #chromosomes_scale = eval(var(size)) < 100000 = 0.5r 13 | z=100 14 | 15 | 16 | #############################HIGHLIGHTS 17 | 18 | 19 | <> 20 | r1 = 0.90r 21 | r0 = 0.75r 22 | 23 | 24 | 25 | 26 | #############################PLOTS 27 | 28 | 29 | ############### COVERAGE 30 | 31 | type = histogram 32 | file = PLASMID_COVERAGE_GRAPH 33 | 34 | color = black 35 | r1 = 0.99r 36 | r0 = 0.90r 37 | extend_bin = no 38 | min= 0 39 | max= 500 40 | thickness = 2 41 | orientation = out 42 | 43 | # 44 | #show = data 45 | # 46 | #color = vvlgrey 47 | # 48 | # 49 | 50 | 51 | 52 | 53 | thickness = 1 54 | color = lgrey 55 | spacing = 50 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | condition = var(value) < 20 66 | color = lorange 67 | thickness = 3 68 | flow = continue 69 | 70 | 71 | 
72 | condition = var(value) == 0 73 | color = red 74 | thickness = 3 75 | flow = continue 76 | 77 | 78 | 79 | condition = var(value) > 200 80 | color = green 81 | thickness = 3 82 | 83 | 84 | 85 | 86 | 87 | 88 | ############### /COVERAGE 89 | 90 | 91 | ############### TEXT_ADITIONAL_ANNOTATION 92 | 93 | type = text 94 | color = black 95 | label_font = bold 96 | label_size = 10p 97 | file = PLASMID_SPECIFIC_TEXT 98 | r1 = 0.85r+200p 99 | r0 = 0.82r 100 | orientation = center 101 | show_links = no 102 | 103 | margin = 0u 104 | label_parallel = no 105 | padding = 1p 106 | rpadding = 2p 107 | label_snuggle = yes 108 | max_snuggle_distance = 5r 109 | snuggle_sampling = 2 110 | snuggle_tolerance = 1r 111 | snuggle_link_overlap_test = yes 112 | snuggle_link_overlap_tolerance = 20p 113 | 114 | 115 | ############### /TEXT_ADITIONAL_ANNOTATION 116 | 117 | ############### TEXT_CDS_CONTIG 118 | 119 | type = text 120 | color = black 121 | label_font = default 122 | label_size = 9p 123 | file = PLASMID_CDS_CONTIG 124 | r1 = 0.80r 125 | r0 = 0.75r 126 | orientation = center 127 | show_links = yes 128 | link_dims = 8p,8p,10p,8p,8p 129 | link_color = purple 130 | label_parallel = no 131 | padding = 0p 132 | label_snuggle = yes 133 | max_snuggle_distance = 6r 134 | snuggle_sampling = 10 135 | snuggle_tolerance = 1r 136 | snuggle_link_overlap_test = yes 137 | snuggle_link_overlap_tolerance = 10p 138 | #snuggle_refine = yes 139 | 140 | # 141 | # 142 | #condition = var(value) =~ /CDS/ 143 | #show = no 144 | #flow = continue 145 | # 146 | # 147 | 148 | 149 | 150 | 151 | ############### /TEXT_CDS_CONTIG 152 | 153 | ############### CDS_CONTIGS_PROKKA 154 | 155 | type = tile 156 | file = PLASMID_CDS_FORWARD 157 | r1 = 0.80r 158 | r0 = 0.75r 159 | layers = 3 160 | layers_overflow = grow 161 | margin = 0.001u 162 | thickness = 20p 163 | padding = 0p 164 | rpadding = 0p 165 | orientation = out 166 | stroke_thickness = 1 167 | stroke_color = dgrey 168 | color = dpurple 169 | 170 | 171 | 172 | 
173 | r1 = 0.75r 174 | r0 = 0.75r 175 | 176 | 177 | position = 0.75r 178 | color = dgrey 179 | thickness = 2 180 | 181 | 182 | 183 | 184 | 185 | type = tile 186 | file = PLASMID_CDS_REVERSE 187 | r1 = 0.75r 188 | r0 = 0.70r 189 | layers = 3 190 | layers_overflow = grow 191 | margin = 0.001u 192 | thickness = 20p 193 | padding = 0p 194 | rpadding = 0p 195 | orientation = in 196 | stroke_thickness = 1 197 | stroke_color = dgrey 198 | color = lpurple 199 | 200 | 201 | ############### /CDS_CONTIGS_PROKKA 202 | 203 | ############### TEXT_CONTIG 204 | 205 | type = text 206 | #color = black 207 | label_font = bold 208 | 209 | label_size = 10p 210 | file = PLASMID_CONTIGS 211 | r1 = 0.70r 212 | r0 = 0.64r 213 | orientation = out 214 | show_links = yes 215 | label_parallel = yes 216 | padding = 10p 217 | margin = 10p 218 | label_snuggle = yes 219 | max_snuggle_distance = 10r 220 | snuggle_sampling = 10 221 | snuggle_tolerance = 1r 222 | snuggle_link_overlap_test = yes 223 | snuggle_link_overlap_tolerance = 500p 224 | #snuggle_refine = yes 225 | 226 | 227 | 228 | 229 | condition = var(id) =~ /(\d+)(\d+)(\d*)/ 230 | color = eval(my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 ))) 231 | flow = continue 232 | 233 | 234 | 235 | condition = var(id) =~ /(\d+)(\d+)(\d*)/ 236 | link_color = eval(my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 ))) 237 | flow = continue 238 | 239 | 240 | 241 | condition = var(size) < 1kb 242 | show = no 243 | 244 | 245 | 246 | 247 | ############### /TEXT_CONTIG 248 | 249 | ############### CONTIGS SPADES ALL 250 | 251 | type = tile 252 | file = PLASMID_CONTIGS 253 | r1 = 0.65r 254 | r0 = 0.60r 255 | layers = 4 256 | margin = 5u 257 | thickness = 20 258 | padding = 5 259 | layers_overflow = collapse 260 | orientation = out 261 | stroke_thickness = 0 262 | stroke_color = grey 263 | 
color = grey 264 | 265 | 266 | 267 | 268 | 269 | condition = var(id) =~ /(\d+)(\d+)(\d*)/ 270 | color = eval( my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 ))) 271 | flow = continue 272 | 273 | #importance = 100 274 | #condition = 1 275 | #color = eval(sprintf("spectral-11-div-%d",remap_int(NODE_%d%d,0,10e6,1,11))) 276 | #color = eval((qw(vvvlgrey vvlgrey vlgrey lgrey grey dgrey vdgrey vvdgrey))[var(id) % 8]) 277 | 278 | 279 | 280 | condition = var(size) < 1kb 281 | show = no 282 | 283 | 284 | 285 | 286 | 287 | ############### /CONTIGS SPADES ALL 288 | 289 | 290 | 291 | 292 | ######## LINKS 293 | ############## 294 | 295 | 296 | 297 | 298 | 299 | file = PLASMID_LINKS 300 | r1 = 0.50r 301 | r0 = 0r 302 | ribbon = yes 303 | flat = yes 304 | radius = 0.6r 305 | bezier_radius = 0.1r 306 | crest = 0.2 307 | color = lgrey_a4 308 | 309 | 310 | 311 | 312 | condition = var(intrachr) 313 | show = no 314 | 315 | 316 | 317 | importance = 110 318 | condition = var(size1) < 2kb 319 | show = no 320 | flow = continue 321 | 322 | 323 | 324 | importance = 110 325 | condition = var(size2) < 2kb 326 | show = no 327 | flow = continue 328 | 329 | 330 | 331 | 332 | condition = var(id) =~ /(\d+)(\d+)(\d*)/ 333 | color = eval( my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 ))) 334 | #"paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12) 335 | #"set3-12-qual-%d_a%d" 336 | #"rev(set3-12-qual-%d_a%d)" 337 | flow = continue 338 | 339 | 340 | 341 | condition = 1 342 | z = eval(average(-1*(var(size1),var(size2)))) 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | ######## IDEOGRAM 353 | ################# 354 | 355 | 356 | show = yes 357 | 358 | 359 | default = 5000u 360 | #when representing witout scaling 361 | #default = 1000u 362 | break = 500u 363 | 364 | 365 | chromosomes_color = dblue 366 | stroke_color = 
blue 367 | 368 | radius = 0.93r 369 | thickness = 25p 370 | fill = yes 371 | 372 | show_label = yes 373 | 374 | label_font = bold 375 | label_radius = dims(ideogram,radius_inner) 376 | #(dims(ideogram,radius_inner) + dims(ideogram,radius_outer))/2 377 | 378 | label_size = 17 379 | label_parallel = yes 380 | 381 | 382 | 383 | ######## TICKS 384 | ############## 385 | 386 | show_ticks = yes 387 | show_tick_labels = yes 388 | 389 | 390 | radius = dims(ideogram,radius_outer) 391 | color = black 392 | thickness = 2p 393 | 394 | #multiplier = 0.001 395 | 396 | 397 | #spacing = 1000u 398 | rspacing = 0.025 399 | multiplier = 0.001 400 | spacing_type = relative 401 | skip_first_label = yes 402 | skip_last_label = no 403 | size = 5p 404 | show_label = yes 405 | label_size = 20p 406 | #label_relative = yes 407 | suffix = " kb" 408 | #rdivisor = ideogram 409 | format = %d 410 | rmultiplier = 1 411 | 412 | 413 | 414 | 415 | # 416 | #spacing = 2000u 417 | #size = 15p 418 | #show_label = yes 419 | #label_size = 20p 420 | #labe_offset = 10p 421 | #suffix = " kb" 422 | #format = %d 423 | # 424 | 425 | 426 | 427 | ########COLORS 428 | ############## 429 | <> 430 | 431 | 432 | ########HOUSEKEEPING 433 | #################### 434 | <> 435 | max_points_per_track* = 8000000 436 | max_ideograms*=1000 437 | ########IMAGE 438 | ############# 439 | 440 | dir = OUTPUTDIR 441 | #dir = conf(configdir) 442 | file = IMAGENAME 443 | png = yes 444 | svg = no 445 | # radius of inscribed circle in image 446 | radius = 1900p 447 | # by default angle=0 is at 3 o'clock position 448 | angle_offset = -90 449 | #angle_orientation = counterclockwise 450 | auto_alpha_colors = yes 451 | auto_alpha_steps = 5 452 | 453 | 454 | -------------------------------------------------------------------------------- /config_files/simple.conf: -------------------------------------------------------------------------------- 1 | ######## CIRCOS.CONF 2 | #################### 3 | 4 | karyotype = PLASMID_KARYOTYPE 5 | 6 | 
chromosome_units = 1000000 7 | chromosomes_display_default = no 8 | chromosomes = SAMPLE_SHOWN 9 | chromosomes_color = /./ = lblue 10 | z=100 11 | 12 | # 13 | # 14 | #chr = NZ_CP018342.1 15 | #start = 30000u 16 | #end = 52000u 17 | #scale = 15 18 | 19 | #smooth_distance = 10r 20 | #smooth_steps = 5 21 | 22 | # 23 | # 24 | 25 | #############################HIGHLIGHTS 26 | 27 | 28 | <> 29 | 30 | 31 | 32 | ########################################PLOTS 33 | 34 | 35 | ############### COVERAGE 36 | 37 | type = histogram 38 | file = PLASMID_COVERAGE_GRAPH 39 | 40 | color = black 41 | r1 = 0.99r 42 | r0 = 0.90r 43 | extend_bin = no 44 | min= 0 45 | max= 500 46 | thickness = 2 47 | orientation = out 48 | 49 | 50 | 51 | 52 | thickness = 1 53 | color = lgrey 54 | spacing = 50 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | condition = var(value) < 20 63 | color = lorange 64 | thickness = 3 65 | flow = continue 66 | 67 | 68 | 69 | condition = var(value) == 0 70 | color = red 71 | thickness = 3 72 | flow = continue 73 | 74 | 75 | 76 | condition = var(value) > 200 77 | color = green 78 | thickness = 3 79 | 80 | 81 | 82 | 83 | 84 | ############### /COVERAGE 85 | 86 | ############### TEXT_ADITIONAL_ANNOTATION 87 | 88 | type = text 89 | color = black 90 | label_font = bold 91 | label_size = 30p 92 | file = PLASMID_SPECIFIC_TEXT 93 | r1 = 0.85r+200p 94 | r0 = 0.74r 95 | orientation = center 96 | show_links = no 97 | 98 | margin = 0u 99 | label_parallel = no 100 | padding = 1p 101 | rpadding = 2p 102 | label_snuggle = yes 103 | max_snuggle_distance = 5r 104 | snuggle_sampling = 2 105 | snuggle_tolerance = 1r 106 | snuggle_link_overlap_test = yes 107 | snuggle_link_overlap_tolerance = 20p 108 | 109 | 110 | ############### /TEXT_ADITIONAL_ANNOTATION 111 | 112 | 113 | ############### TEXT_CDS_CONTIG 114 | 115 | 116 | type = text 117 | color = black 118 | label_font = default 119 | label_size = 42p 120 | file = PLASMID_CDS_CONTIG 121 | r1 = 0.70r+200p 122 | r0 = 0.70r 123 | orientation = center 
124 | show_links = yes 125 | link_dims = 8p,8p,30p,8p,8p 126 | link_color = grey 127 | 128 | label_parallel = no 129 | padding = 0p 130 | label_snuggle = yes 131 | max_snuggle_distance = 6r 132 | snuggle_sampling = 10 133 | snuggle_tolerance = 1r 134 | snuggle_link_overlap_test = yes 135 | snuggle_link_overlap_tolerance = 10p 136 | 137 | # 138 | # 139 | #condition = var(value) eq "cds" 140 | #label_size = 7p 141 | #show = no 142 | #flow = continue 143 | # 144 | # 145 | 146 | 147 | ############### /TEXT_CDS_CONTIG 148 | 149 | ############### CDS_CONTIGS_PROKKA 150 | 151 | type = tile 152 | file = PLASMID_CDS_FORWARD 153 | r1 = 0.73r 154 | r0 = 0.70r 155 | layers = 2 156 | layers_overflow = grow 157 | margin = 0.001u 158 | thickness = 30p 159 | padding = 0p 160 | rpadding = 0p 161 | orientation = out 162 | stroke_thickness = 1 163 | stroke_color = vvdgrey 164 | color = dgrey 165 | 166 | 167 | 168 | r1 = 0.70r 169 | r0 = 0.70r 170 | 171 | 172 | position = 0.70r 173 | color = dgrey 174 | thickness = 2 175 | 176 | 177 | 178 | 179 | 180 | type = tile 181 | file = PLASMID_CDS_REVERSE 182 | r1 = 0.70r 183 | r0 = 0.67r 184 | layers = 2 185 | layers_overflow = grow 186 | margin = 0.001u 187 | thickness = 30p 188 | padding = 0p 189 | rpadding = 0p 190 | orientation = in 191 | stroke_thickness = 1 192 | stroke_color = dgrey 193 | color = lgrey 194 | 195 | 196 | ############### /CDS_CONTIGS_PROKKA 197 | 198 | 199 | ############### TEXT_CONTIG 200 | 201 | type = text 202 | label_font = bold 203 | label_size = 20p 204 | file = PLASMID_CONTIGS 205 | r1 = 0.60r+100p 206 | r0 = 0.60r 207 | orientation = out 208 | show_links = yes 209 | label_parallel = yes 210 | padding = 5p 211 | rpadding = 2p 212 | margin = 15p 213 | label_snuggle = yes 214 | max_snuggle_distance = 10r 215 | snuggle_sampling = 10 216 | snuggle_tolerance = 5r 217 | snuggle_link_overlap_test = yes 218 | snuggle_link_overlap_tolerance = 3p 219 | 220 | 221 | 222 | 223 | condition = var(value) =~ /(\d+)(\d+)(\d*)/ 224 
| color = eval(my @match = "var(value)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 ))) 225 | flow = continue 226 | 227 | 228 | 229 | condition = var(value) =~ /(\d+)(\d+)(\d*)/ 230 | link_color = eval(my @match = "var(value)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 ))) 231 | flow = continue 232 | 233 | 234 | 235 | condition = var(size) < 0.2kb 236 | show = no 237 | 238 | 239 | 240 | 241 | ############### /TEXT_CONTIG 242 | 243 | ############### CONTIGS SPADES ALL 244 | 245 | type = tile 246 | file = PLASMID_CONTIGS 247 | r1 = 0.60r 248 | r0 = 0.50r 249 | layers = 5 250 | margin = 5u 251 | thickness = 40 252 | padding = 5 253 | layers_overflow = collapse 254 | orientation = in 255 | stroke_thickness = 0 256 | stroke_color = grey 257 | color = grey 258 | 259 | 260 | 261 | 262 | condition = var(value) =~ /(\d+)(\d+)(\d*)/ 263 | color = eval( my @match = "var(value)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 ))) 264 | flow = continue 265 | 266 | 267 | 268 | 269 | condition = var(size) < 0.2kb 270 | show = no 271 | 272 | 273 | 274 | 275 | 276 | ############### /CONTIGS SPADES ALL 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | ######## IDEOGRAM 285 | ################# 286 | 287 | z=5000 288 | show = yes 289 | 290 | 291 | default = 10u 292 | #when representing witout scaling 293 | #default = 1000u 294 | break = 10u 295 | 296 | 297 | chromosomes_color = dblue 298 | stroke_color = blue 299 | 300 | radius = 0.93r 301 | thickness = 30p 302 | fill = yes 303 | 304 | show_label = yes 305 | label_color = dgrey 306 | label_center = yes 307 | label_font = bold 308 | label_radius = 0.1r 309 | #label_radius = dims(ideogram,radius_inner) 310 | #(dims(ideogram,radius_inner) + dims(ideogram,radius_outer))/2 311 | 312 | label_size = 50 313 | label_parallel = yes 314 | 315 | 
316 | 317 | ######## TICKS 318 | ############## 319 | 320 | show_ticks = yes 321 | show_tick_labels = yes 322 | 323 | 324 | radius = dims(ideogram,radius_outer) 325 | color = black 326 | thickness = 2p 327 | labe_offset = 0p 328 | 329 | #multiplier = 0.001 330 | 331 | 332 | #spacing = 1000u 333 | rspacing = 0.025 334 | multiplier = 0.001 335 | spacing_type = relative 336 | skip_first_label = yes 337 | skip_last_label = no 338 | size = 5p 339 | show_label = yes 340 | label_size = 20p 341 | #label_relative = yes 342 | suffix = " kb" 343 | #rdivisor = ideogram 344 | format = %d 345 | rmultiplier = 1 346 | 347 | 348 | 349 | # 350 | #spacing = 2000u 351 | #multiplier = 0.001 352 | #size = 5p 353 | #show_label = yes 354 | #skip_first_label = yes 355 | #label_size = 15p 356 | #labe_offset = 0p 357 | #suffix = " kb" 358 | #format = %d 359 | # 360 | 361 | 362 | 363 | ########COLORS 364 | ############## 365 | <> 366 | 367 | 368 | ########HOUSEKEEPING 369 | #################### 370 | <> 371 | max_points_per_track* = 8000000 372 | max_ideograms*=1000 373 | 374 | ########IMAGE 375 | ############# 376 | 377 | dir = OUTPUTDIR 378 | #dir = conf(configdir) 379 | file = IMAGENAME_SAMPLE_PLASMID 380 | png = yes 381 | svg = no 382 | # radius of inscribed circle in image 383 | radius = 1900p 384 | # by default angle=0 is at 3 o'clock position 385 | angle_offset = -90 386 | #angle_orientation = counterclockwise 387 | auto_alpha_colors = yes 388 | auto_alpha_steps = 5 389 | 390 | -------------------------------------------------------------------------------- /documents/ECCMID plasmidID 2018.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/documents/ECCMID plasmidID 2018.pdf -------------------------------------------------------------------------------- /documents/Istall_dependencies.md: 
-------------------------------------------------------------------------------- 1 | # Trimmomatic 2 | - wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.38.zip 3 | - unzip Trimmomatic-0.38.zip 4 | - copy to /opt/Trimmomatic or use trimmomatic-dir PATH/TO/Trimmomatic-0.38 5 | 6 | # SPAdes 7 | 8 | - wget http://cab.spbu.ru/files/release3.12.0/SPAdes-3.12.0-Linux.tar.gz 9 | - tar -xzf SPAdes-3.12.0-Linux.tar.gz 10 | - Add to PATH SPAdes-3.12.0-Linux/bin/ 11 | 12 | # Blast+ 13 | 14 | - sudo apt-get install ncbi-blast+ 15 | 16 | # Bowtie2 17 | 18 | - sudo apt install bowtie2 19 | 20 | # Cd-hit-est 21 | 22 | - sudo apt-get install cd-hit 23 | 24 | # Bedtools 25 | 26 | - sudo apt install bedtools 27 | 28 | # Prokka 29 | 30 | - sudo apt-get install libdatetime-perl libxml-simple-perl libdigest-md5-perl git default-jre bioperl 31 | - sudo cpan Bio::Perl 32 | - git clone https://github.com/tseemann/prokka.git $HOME/prokka 33 | - $HOME/prokka/bin/prokka --setupdb 34 | - Add $HOME/prokka/bin/ to PATH 35 | 36 | # Circos 37 | 38 | 39 | - wget http://www.circos.ca/distribution/circos-0.69-6.tgz 40 | - tar xvfz circos-0.69-6.tgz 41 | - sudo apt-get -y install libgd2-xpm-dev 42 | - Add circos-0.69-6/bin to PATH 43 | - sudo sed -i 's/max_points_per_track = 25000/max_points_per_track = 20000000/g' /opt/circos-0.69-6/etc/housekeeping.conf 44 | 45 | 46 | 47 | 48 | 49 | 50 | ##g++ 51 | - sudo apt-get install build-essential 52 | ##libz.h 53 | - sudo apt-get install libz-dev 54 | ##circos dependencies 55 | - sudo apt install circos 56 | -------------------------------------------------------------------------------- /documents/PlasmidID_IWBBIO.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/documents/PlasmidID_IWBBIO.pdf -------------------------------------------------------------------------------- /environment.yml:
-------------------------------------------------------------------------------- 1 | name: plasmidID 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python>=3.6 8 | - bioconda::perl-gd>=2.71 9 | - bioconda::bowtie2 10 | - bioconda::bedtools 11 | - bioconda::samtools 12 | - bioconda::mash>=2 13 | - bioconda::circos 14 | - bioconda::prokka>=1.14 15 | - bioconda::blast 16 | - bioconda::spades 17 | - bioconda::trimmomatic 18 | - tbb==2020.2 19 | - conda-forge::gawk 20 | - conda-forge::biopython 21 | - conda-forge::numpy 22 | - conda-forge::pandas 23 | - conda-forge::scikit-learn 24 | - conda-forge::scipy 25 | - conda-forge::tabulate 26 | - conda-forge::wget 27 | - conda-forge::bc 28 | -------------------------------------------------------------------------------- /img/01_plasmid_track.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/01_plasmid_track.png -------------------------------------------------------------------------------- /img/02_mapping_track.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/02_mapping_track.png -------------------------------------------------------------------------------- /img/03_annotation_track.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/03_annotation_track.png -------------------------------------------------------------------------------- /img/04_contig_track.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/04_contig_track.png 
-------------------------------------------------------------------------------- /img/05_01_complete_contig_track.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/05_01_complete_contig_track.png -------------------------------------------------------------------------------- /img/05_complete_contig_track.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/05_complete_contig_track.png -------------------------------------------------------------------------------- /img/Alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Alignment.png -------------------------------------------------------------------------------- /img/Annotation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Annotation.png -------------------------------------------------------------------------------- /img/Clustering_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Clustering_2.png -------------------------------------------------------------------------------- /img/Mapping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Mapping.png -------------------------------------------------------------------------------- /img/Overlap_examples.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Overlap_examples.png -------------------------------------------------------------------------------- /img/PIPELNE TFM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/PIPELNE TFM.png -------------------------------------------------------------------------------- /img/SEN30_000195995_K00826.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_K00826.1.png -------------------------------------------------------------------------------- /img/SEN30_000195995_NC_002305.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NC_002305.1.png -------------------------------------------------------------------------------- /img/SEN30_000195995_NC_003384.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NC_003384.1.png -------------------------------------------------------------------------------- /img/SEN30_000195995_NC_003385.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NC_003385.1.png -------------------------------------------------------------------------------- /img/SEN30_000195995_NC_009981.1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NC_009981.1.png -------------------------------------------------------------------------------- /img/SEN30_000195995_NC_013365.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NC_013365.1.png -------------------------------------------------------------------------------- /img/SEN30_000195995_NZ_LT883154.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NZ_LT883154.1.png -------------------------------------------------------------------------------- /img/SEN30_000195995_NZ_LT904853.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NZ_LT904853.1.png -------------------------------------------------------------------------------- /img/SEN30_000195995_NZ_LT904874.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NZ_LT904874.1.png -------------------------------------------------------------------------------- /img/SEN30_000195995_NZ_LT904880.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NZ_LT904880.1.png -------------------------------------------------------------------------------- 
/img/SEN30_000195995_NZ_LT904895.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NZ_LT904895.1.png -------------------------------------------------------------------------------- /img/SEN_summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN_summary.png -------------------------------------------------------------------------------- /img/SEN_summary_numbers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN_summary_numbers.png -------------------------------------------------------------------------------- /img/Short_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Short_pipeline.png -------------------------------------------------------------------------------- /img/Visualization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Visualization.png -------------------------------------------------------------------------------- /img/isciii_logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/isciii_logo.jpeg -------------------------------------------------------------------------------- /img/pipeline_pID.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/pipeline_pID.png -------------------------------------------------------------------------------- /img/plasmidID_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/plasmidID_logo.png -------------------------------------------------------------------------------- /img/summary_image_1_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/summary_image_1_3.png -------------------------------------------------------------------------------- /img/summary_image_2_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/summary_image_2_3.png -------------------------------------------------------------------------------- /img/summary_image_3_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/summary_image_3_3.png -------------------------------------------------------------------------------- /test/KPN_TEST_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/test/KPN_TEST_R1.fastq.gz -------------------------------------------------------------------------------- /test/KPN_TEST_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/test/KPN_TEST_R2.fastq.gz 
-------------------------------------------------------------------------------- /test/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list, 4 | #or a compound command returns a non-zero status: If errors are not handled by user 5 | set -e 6 | # Treat unset variables and parameters other than the special parameters ‘@’ or ‘*’ as an error when performing parameter expansion. 7 | 8 | #Print everything as if it were executed, after substitution and expansion is applied: Debug|log option 9 | #set -x 10 | 11 | #============================================================= 12 | # HEADER 13 | #============================================================= 14 | 15 | #INSTITUTION:ISCIII 16 | #CENTRE:BU-ISCIII 17 | #AUTHOR: Pedro J. Sola (pedroscampoy@gmail.com) 18 | VERSION=1.6.3 19 | #CREATED: 15 March 2018 20 | # 21 | #ACKNOLEDGE: longops2getops.sh: https://gist.github.com/adamhotep/895cebf290e95e613c006afbffef09d7 22 | # 23 | #DESCRIPTION: test.sh uses test data for testing plasmidID installation. 
#
#
#================================================================
# END_OF_HEADER
#================================================================

#SHORT USAGE RULES
#LONG USAGE FUNCTION

#######################################
# Print the help text of test.sh.
# Globals:   none ($0 is expanded into the text)
# Arguments: none
# Outputs:   usage message on stdout
#######################################
usage() {
  cat << EOF

plasmidID is a computational pipeline tha reconstruct and annotate the most likely plasmids present in one sample

usage : $0

  -v | --version    version
  -h | --help       display usage message

example: ./test.sh

EOF
}

#================================================================
# OPTION_PROCESSING
#================================================================

#######################################
# Error handling: print an error banner and terminate the script.
# Arguments:
#   $1 - line number on or near which the error happened
#   $2 - name of the script reporting the error
#   $3 - optional message shown below the banner
#   $4 - optional exit status (defaults to 1)
# Outputs:   banner (and message, if any) on stderr
# Returns:   never returns; exits with the requested status
#######################################
error() {
  local parent_lineno="${1:-}"
  local script="${2:-}"
  local message="${3:-}"
  local code="${4:-1}"

  # Colour codes and the separator stay local so they do not leak into the caller's scope.
  local red='\033[0;31m'
  local nc='\033[0m'
  local rule='\n---------------------------------------\n'

  # Diagnostics belong on stderr so they never mix with regular output.
  {
    echo -e "$rule"
    echo -e "${red}ERROR${nc} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
    if [[ -n "$message" ]]; then
      echo -e "MESSAGE:\n"
      echo -e "$message"
    fi
    echo -e "$rule"
  } >&2

  exit "${code}"
}

# translate long options to short
# The positional parameters are emptied on the first iteration and rebuilt
# one argument at a time; the for-loop keeps iterating over the original list.
reset=true
for arg in "$@"
do
  if [ -n "${reset:-}" ]; then
    unset reset
    set --      # this resets the "$@" array so we can rebuild it
  fi
  case "$arg" in
    --help)    set -- "$@" -h ;;
    --version) set -- "$@" -v ;;
    # pass through anything else
    *)         set -- "$@" "$arg" ;;
  esac
done

#DECLARE FLAGS AND VARIABLES
# Directory holding this script with symlinks resolved; every expansion is
# quoted so that a checkout path containing whitespace is handled correctly.
script_dir=$(dirname -- "$(readlink -f -- "$0")")
R1=KPN_TEST_R1.fastq.gz
R2=KPN_TEST_R2.fastq.gz
database=plasmids_TEST_database.fasta
contigs=contigs_KPN_TEST.fasta

#PARSE VARIABLE ARGUMENTS WITH getopts
#common example with letters, for long options check longopts2getopts.sh
options=":1:2:d:s:g:c:a:i:o:C:S:f:l:L:T:M:X:y:Y:RVtvh"

#######################################
# Parse the short options given on the command line.
# Only -h and -v are implemented; every other letter listed in $options is
# accepted by getopts but rejected here as unimplemented.
# Globals:   options (read), VERSION (read), OPTIND (reset, then advanced)
# Arguments: the command-line arguments, normally "$@"
# Outputs:   help/version on stdout, diagnostics on stderr
# Returns:   exits 0 after -h or -v, exits 1 on any option error
#######################################
parse_options() {
  local opt
  OPTIND=1
  while getopts "$options" opt; do
    case "$opt" in
      h )
        usage
        # Help was requested, so this is a successful exit.
        exit 0
        ;;
      v )
        echo "$VERSION"
        exit 0
        ;;
      \?)
        echo "Invalid Option: -$OPTARG" 1>&2
        usage
        exit 1
        ;;
      : )
        echo "Option -$OPTARG requires an argument." >&2
        exit 1
        ;;
      * )
        # For a recognised option OPTARG holds its argument (or nothing),
        # so the option letter itself has to come from $opt.
        echo "Unimplemented option: -$opt" >&2
        exit 1
        ;;
    esac
  done
}

parse_options "$@"
shift $((OPTIND-1))

## Execute plasmidID with test data.
echo "Executing:../plasmidID -1 $R1 -2 $R2 -d $database -c $contigs -s KPN --no-trim"
echo "Forward reads: $R1"
echo "Reverse reads: $R2"
echo "PlasmidDatabase: $database"
echo "Contigs: $contigs"
echo "Options: --no-trim"

# Make the helper scripts in bin/ reachable. Exporting directly avoids writing
# a scratch file into the current directory and copes with spaces in PATH.
export PATH="$PATH:$script_dir/../bin"
"$script_dir/../plasmidID" -1 "$script_dir/$R1" -2 "$script_dir/$R2" -d "$script_dir/$database" -c "$script_dir/$contigs" -s KPN --no-trim


echo "ALL DONE. TEST COMPLETED SUCCESSFULLY YOUR INSTALLATION SHOULD BE CORRECT."