├── .github
    └── workflows
    │   ├── dockerhub_push_release.yml
    │   └── tests.yml
├── .gitignore
├── CHANGELOG.md
├── Dockerfile
├── LICENSE
├── README.md
├── bin
    ├── adapt_filter_coverage.sh
    ├── blast_align.sh
    ├── blast_to_bed.sh
    ├── blast_to_complete.sh
    ├── blast_to_link.sh
    ├── bowtie_mapper.sh
    ├── build_karyotype.sh
    ├── calculate_seqlen.sh
    ├── cdhit_cluster.sh
    ├── check_dependencies.sh
    ├── check_mandatory_files.sh
    ├── coordinate_adapter.sh
    ├── download_plasmid_database.py
    ├── draw_circos_images.sh
    ├── filter_fasta.sh
    ├── get_coverage.sh
    ├── gff_to_bed.sh
    ├── mash_screener.sh
    ├── mashclust.py
    ├── ncbi_database_fetcher.sh
    ├── process_cluster_output.sh
    ├── prokka_annotation.sh
    ├── quality_trim.sh
    ├── rename_from_fasta.sh
    ├── sam_to_bam.sh
    ├── spades_assembly.sh
    ├── summary_report_pid.py
    └── summary_table.sh
├── config_files
    ├── OR.conf
    ├── annotation_config_file.txt
    ├── circos_individual_1_3_0.conf
    ├── circos_individual_1_3_3.conf
    ├── circos_summary_1_3_0.conf
    ├── circos_summary_1_3_3.conf
    └── simple.conf
├── databases
    ├── ARGannot.pID.fasta
    ├── card.fasta
    └── plasmidFinder_01_26_2018.fsa
├── documents
    ├── ECCMID plasmidID 2018.pdf
    ├── Istall_dependencies.md
    └── PlasmidID_IWBBIO.pdf
├── environment.yml
├── img
    ├── 01_plasmid_track.png
    ├── 02_mapping_track.png
    ├── 03_annotation_track.png
    ├── 04_contig_track.png
    ├── 05_01_complete_contig_track.png
    ├── 05_complete_contig_track.png
    ├── Alignment.png
    ├── Annotation.png
    ├── Clustering_2.png
    ├── Mapping.png
    ├── Overlap_examples.png
    ├── PIPELNE TFM.png
    ├── SEN30_000195995_K00826.1.png
    ├── SEN30_000195995_NC_002305.1.png
    ├── SEN30_000195995_NC_003384.1.png
    ├── SEN30_000195995_NC_003385.1.png
    ├── SEN30_000195995_NC_009981.1.png
    ├── SEN30_000195995_NC_013365.1.png
    ├── SEN30_000195995_NZ_LT883154.1.png
    ├── SEN30_000195995_NZ_LT904853.1.png
    ├── SEN30_000195995_NZ_LT904874.1.png
    ├── SEN30_000195995_NZ_LT904880.1.png
    ├── SEN30_000195995_NZ_LT904895.1.png
    ├── SEN_summary.png
    ├── SEN_summary_numbers.png
    ├── Short_pipeline.png
    ├── Visualization.png
    ├── isciii_logo.jpeg
    ├── pipeline_pID.png
    ├── plasmidID_logo.png
    ├── summary_image_1_3.png
    ├── summary_image_2_3.png
    └── summary_image_3_3.png
├── plasmidID
└── test
    ├── KPN_TEST_R1.fastq.gz
    ├── KPN_TEST_R2.fastq.gz
    ├── contigs_KPN_TEST.fasta
    ├── plasmids_TEST_database.fasta
    └── test.sh


/.github/workflows/dockerhub_push_release.yml:
--------------------------------------------------------------------------------
 1 | name: deploy release
 2 | # Builds the Docker image for a published release and pushes it to DockerHub
 3 | on:
 4 |   release:
 5 |     types: [published]
 6 | jobs:
 7 |   push_dockerhub:
 8 |     name: Push new Docker image to Docker Hub (release)
 9 |     runs-on: ubuntu-latest
10 |     # Only run for the official repo, so forks do not attempt a push
11 |     if: ${{ github.repository == 'BU-ISCIII/plasmidID' }}
12 |     env:
13 |       DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
14 |       DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASSWORD }}
15 |       # The tag reaches the shell as an environment variable instead of being
16 |       # pasted into the script text, so it can never be parsed as shell code
17 |       RELEASE_TAG: ${{ github.event.release.tag_name }}
18 |     steps:
19 |       - name: Check out pipeline code
20 |         uses: actions/checkout@v4
21 | 
22 |       - name: Build new docker image
23 |         run: docker build --no-cache . -t "buisciii/plasmidid:${RELEASE_TAG}"
24 | 
25 |       - name: Push Docker image to DockerHub (release)
26 |         run: |
27 |           echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
28 |           docker push "buisciii/plasmidid:${RELEASE_TAG}"
29 | 


--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
 1 | name: tests ci
 2 | # Builds and pushes the dev Docker image, then runs the pipeline with the
 3 | # minimal test dataset to check that it completes without any errors
 4 | on:
 5 |   push:
 6 |     branches: [develop]
 7 |   # NOTE(review): pull_request_target runs in the context of the base branch, so
 8 |   # the default checkout below builds develop rather than the pull request head.
 9 |   # Confirm that this is intended.
10 |   pull_request_target:
11 |     branches: [develop]
12 |   release:
13 |     types: [published]
14 | 
15 | jobs:
16 |   push_dockerhub:
17 |     name: Push new Docker image to Docker Hub (dev)
18 |     runs-on: ubuntu-latest
19 |     # Only run for the official repo, so forks do not attempt a push
20 |     if: ${{ github.repository == 'BU-ISCIII/plasmidID' }}
21 |     env:
22 |       DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
23 |       DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASSWORD }}
24 |     steps:
25 |       - name: Check out pipeline code
26 |         uses: actions/checkout@v4
27 | 
28 |       - name: Build new docker image
29 |         run: docker build --no-cache . -t buisciii/plasmidid:dev
30 | 
31 |       - name: Push Docker image to DockerHub (develop)
32 |         run: |
33 |           echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
34 |           docker push buisciii/plasmidid:dev
35 | 
36 |   run-tests:
37 |     name: Run tests
38 |     # Runs only after push_dockerhub has finished, using the dev image tag
39 |     needs: push_dockerhub
40 |     runs-on: ubuntu-latest
41 |     steps:
42 |       - name: Run pipeline with test data
43 |         run: |
44 |           docker run buisciii/plasmidid:dev bash /opt/plasmidID/test/test.sh
45 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | ## Custom
 2 | TEST/
 3 | TEST_DATA/*.bt2
 4 | TEST_DATA/*.length
 5 | TEST_DATA/*.blast*
 6 | NO_GROUP/
 7 | psi_cd_hit_may_2018_log.txt
 8 | plasmid.database*
 9 | .vscode/
10 | 
11 | ## Trash
12 | *~
13 | .fuse*
14 | 
15 | ## Container images
16 | *.img
17 | *.simg
18 | log_*
19 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | ## 1.6.4 - 2021-03-2020
 4 | ### Added
 5 | - Updated Dockerfile
 6 | - Migrated tests to github actions
 7 | ### Fixed
 8 | - Updated environment.yml for conda.
 9 | - Fixed issues #12,#14,#15,#17. Cases with no plasmids or too many. Relative paths in html images.
10 | 
11 | ## 1.4.2 - 2018-09-29
12 | ### Added
13 | - Specific config file for only reconstruct parameter
14 | 
15 | ### Fixed
16 | - Protein databases can be properly used
17 | 
18 | ## 1.4 - 2018-09-20
19 | ### Added
20 | - Automatically annotated genes/cds are displayed differently depending on whether they are located in forward or reverse
21 | - Psi-cd-hit and blast now handle threads
22 | - Improved error handling
23 | - Docker/Singularity compatibility
24 | - One multifasta file per reference plasmid is generated with all the similar contigs from the sample
25 | - Quick status of values applied to plasmid reconstruction
26 | 
27 | ### Fixed
28 | - Some plasmids from the database were not annotated
29 | - Limit sample name to 37 characters, capped by prokka
30 | - Bug in complete contig track generator that took the wrong value and couldn't draw sequences that matched the position 0 of plasmid
31 | 
32 | 
33 | 
34 | ## 1.3.0 - 2018-07-11
35 | ### New
36 | - Summary table can be generated with new utility
37 | - Several databases can be now annotated filling annotation_config_file.txt
38 | - --only-reconstruct is now implemented if user only needs to reconstruct and annotate contigs with small known databases
39 | ### Fixed
40 | - circos dependency is now checked
41 | - Output is now correctly redirected with -o
42 | ### Added
43 | - trimmomatic directory containing .jar can now be specified with --trimmomatic-directory
44 | - Verbose mode included. By default a log file will be created
45 | - Friendly terminal output
46 | 
47 | ## 1.2.2 - 2018-06-22
48 | ### Fixed
49 | - ***IMPORTANT***: PlasmidID maps with -a mode NOW, as it should have always been. A bug on mapping script is now solved
50 | - Number of threads are now implemented on mapping
51 | - Some cumulative clustering temporary files are now removed
52 | 
53 | ## 1.2.1 - 2018-06-14
54 | ### Fixed
55 | - All dependencies are now checked at the beginning
56 | - Path to scripts are no longer hard coded paths
57 | - Links should be now displayed on summary image
58 | 
59 | ### Added
60 | - Added first utility ***ncbi_database_fetcher.sh***, a script to download FASTA databases from terms
61 | - Short scripts have been moved to /bin, which has to be added to PATH
62 | 
63 | 
64 | ## 1.1.1 - 2018-06-11
65 | ### Fixed
66 | - Additional database will not be required for circos executions, even though the file will be created
67 | - Fixed an issue when no plasmid matches mapping requirements
68 | - Fixed an issue where circos would throw an error message when no plasmids met mapping requirements
69 | 
70 | 
71 | ## 1.1.0 - 2018-06-06
72 | ### Added
73 | - Database plasmids used as scaffold are annotated after filtering. User doesn't need to annotate the initial huge plasmid database.
74 | - User can add ONE nucleotide FASTA file that will be specifically annotated on final plasmids with a light blue color
75 | 
76 | ## Unreleased
77 | 
78 | - Create config files as required by user and include visual parameters
79 | - Test and adapt the --only-reconstruct option
80 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM continuumio/miniconda3:latest
 2 | 
 3 | # Install the pipeline and its bundled resources under /opt/plasmidID.
 4 | # COPY is used because every source is a plain local file or directory.
 5 | RUN mkdir /opt/plasmidID/
 6 | COPY bin /opt/plasmidID/bin
 7 | COPY config_files /opt/plasmidID/config_files
 8 | COPY databases /opt/plasmidID/databases
 9 | COPY documents /opt/plasmidID/documents
10 | COPY img /opt/plasmidID/img
11 | COPY test /opt/plasmidID/test
12 | COPY plasmidID environment.yml CHANGELOG.md LICENSE /opt/plasmidID/
13 | 
14 | # Create the conda environment and store its exported definition next to the
15 | # pipeline. Each RUN starts in a fresh shell, so a standalone "RUN cd" does not
16 | # carry over to later instructions; absolute paths are used instead.
17 | RUN /opt/conda/bin/conda env create -f /opt/plasmidID/environment.yml && /opt/conda/bin/conda clean -a -y
18 | RUN /opt/conda/bin/conda env export --name plasmidID > /opt/plasmidID/plasmidID.yml
19 | 
20 | # Pipeline scripts come first on PATH, followed by the environment binaries
21 | ENV PATH=/opt/plasmidID/bin:/opt/plasmidID:/opt/conda/envs/plasmidID/bin:$PATH
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/plasmidid/README.html)
  3 | [![CircleCI Build Status](https://circleci.com/gh/circleci/circleci-docs.svg?style=shield)](https://circleci.com/gh/BU-ISCIII/plasmidID) [![License: GPL v3](https://img.shields.io/badge/License-GPL%20v3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) [![Scif](https://img.shields.io/badge/Filesystem-Scientific-brightgreen.svg)](https://sci-f.github.io)
  4 | 
  5 | # plasmidID <img align="left" src="https://github.com/BU-ISCIII/plasmidID/blob/develop/img/plasmidID_logo.png" alt="Logo" width="100">
  6 | 
  7 | <br>
  8 | <br>
  9 | 
 10 | * [Introduction](#introduction)
 11 | * [Requirements](#requirements)
 12 |     * [Software](#software)
 13 |     * [Plasmid database](#plasmid-database)
 14 | * [Installation](#installation)
 15 |     * [Install from source](#install-from-source)
 16 |     * [Install using conda](#install-using-conda)
 17 | * [Quick usage](#quick-usage)
 18 | * [Usage](#usage)
 19 | * [Output](#output)
 20 | * [Annotation file](#annotation-file)
 21 | * [Illustrated pipeline](#illustrated-pipeline)
 22 | * [Docker](#docker)
 23 | 
 24 | ## Introduction
 25 | 
 26 | PlasmidID is a mapping-based, assembly-assisted plasmid identification tool that analyzes and gives graphic solution for plasmid identification.
 27 | 
 28 | PlasmidID is a **computational pipeline** implemented in **BASH** that maps Illumina reads over plasmid database sequences. The k-mer filtered, most covered sequences are clustered by identity to avoid redundancy and the longest are used as scaffold for plasmid reconstruction. Reads are assembled and annotated by automatic and specific annotation. All information generated from mapping, assembly, annotation and local alignment analyses is gathered and accurately represented in a **circular image** which allow user to determine plasmidic composition in any bacterial sample.
 29 | 
 30 | ## Requirements
 31 | 
 32 | #### Software
 33 | 
 34 | * [Python >=3.6](https://www.python.org/)
 35 | * [Trimmomatic v0.33](http://www.usadellab.org/cms/?page=trimmomatic)(Optional)
 36 | * [Spades v3.8](http://cab.spbu.ru/software/spades/) (Optional)
 37 | * [Perl v5.26.0](https://www.perl.org/get.html)
 38 | * [NCBI_blast + v2.2.3](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download)
 39 | * [Bedtools v2.25](http://bedtools.readthedocs.io/en/latest/)
 40 | * [Bowtie 2 v2.2.4](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)
 41 | * [SAMtools v1.2](http://samtools.sourceforge.net/)
 42 | * [prokka v1.12](http://www.vicbioinformatics.com/software.prokka.shtml)
 43 | * [cd-hit v4.6.6](http://weizhongli-lab.org/cd-hit/) (no longer needed since v1.6)
 44 | * [circos v0.69.3](http://circos.ca/software/download/circos/)
 45 | * [mash v2.2](https://github.com/marbl/Mash)
 46 | 
 47 | #### Plasmid database
 48 | 
 49 | Since version v1.5.1 plasmid database can be downloaded with the following command:
 50 | 
 51 | ```Bash
 52 |  download_plasmid_database.py -o FOLDER
 53 | ```
 54 | 
 55 | ## Installation
 56 | 
 57 | #### Install from source
 58 | 
 59 | Install all dependencies and add them to $PATH
 60 | 
 61 | git clone https://github.com/BU-ISCIII/plasmidID.git
 62 | 
 63 | Add plasmidID and ./bin to $PATH
 64 | 
 65 | #### Install using conda
 66 | 
 67 | This option is recommended.
 68 | 
 69 | Install [Anaconda3](https://www.anaconda.com/distribution/)
 70 | 
 71 | ```
 72 | conda install -c conda-forge -c bioconda plasmidid
 73 | ```
 74 | Wait for the environment to solve
 75 | 
 76 | Ignore warnings/errors
 77 | 
 78 | #### Use Docker
 79 | 
 80 | Example:
 81 | Clone the repo:
 82 | ```Bash
 83 | git clone git@github.com:BU-ISCIII/plasmidID.git
 84 | cd plasmidID
 85 | ```
 86 | Run it with the test data using docker:
 87 | 
 88 | **Notice that the input files MUST be in your present working directory or in any folder inside it. For example, if I execute this command in `/home/smonzon`, my folder with the files would be in `/home/smonzon/test`.**
 89 | 
 90 | ```Bash
 91 | docker run -v $PWD:$PWD -w $PWD buisciii/plasmidid plasmidID \
 92 |      -1 test/KPN_TEST_R1.fastq.gz  \
 93 |      -2 test/KPN_TEST_R2.fastq.gz \
 94 |      -d test/plasmids_TEST_database.fasta \
 95 |      -c test/contigs_KPN_TEST.fasta \
 96 |      --no-trim \
 97 |      -s KPN
 98 | ```
 99 | 
100 | ## Quick usage
101 | 
102 | Illumina paired-end
103 | ```
104 | plasmidID \
105 | -1 SAMPLE_R1.fastq.gz  \
106 | -2 SAMPLE_R2.fastq.gz \
107 | -d YYYY-MM-DD_plasmids.fasta \
108 | -c SAMPLE_assembled_contigs.fasta \
109 | --no-trim \
110 | -s SAMPLE
111 | ```
112 | 
113 | SMRT sequencing (only contigs)
114 | ```
115 | plasmidID \
116 | -d YYYY-MM-DD_plasmids.fasta \
117 | -c SAMPLE_contigs.fasta \
118 | -s SAMPLE
119 | ```
120 | 
121 | Annotate any fasta you want
122 | ```
123 | plasmidID \
124 | -d YYYY-MM-DD_plasmids.fasta \
125 | -c SAMPLE_assembled_contigs.fasta \
126 | -a annotation_file \
127 | -s SAMPLE
128 | ```
129 | More info about [annotation file](#annotation-file)
130 | 
131 | If there are several samples in the same GROUP folder
132 | ```
133 | summary_report_pid.py -i NO_GROUP/
134 | ```
135 | ## Usage
136 | 
137 | ```
138 | usage : plasmidID <-1 R1> <-2 R2> <-d database(fasta)> <-s sample_name> [-g group_name] [options]
139 | 
140 | 	Mandatory input data:
141 | 	-1 | --R1	<filename>	reads corresponding to paired-end R1 (mandatory)
142 | 	-2 | --R2	<filename>	reads corresponding to paired-end R2 (mandatory)
143 | 	-d | --database	<filename>	database to map and reconstruct (mandatory)
144 | 	-s | --sample	<string>	sample name (mandatory), less than 37 characters
145 | 
146 | 	Optional input data:
147 | 	-g | --group	<string>	group name (optional). If unset, samples will be gathered in NO_GROUP group
148 | 	-c | --contigs	<filename>	file with contigs. If supplied, plasmidID will not assembly reads
149 | 	-a | --annotate <filename>	file with configuration file for specific annotation
150 | 	-o 		<output_dir>	output directory, by default is the current directory
151 | 
152 | 	Pipeline options:
153 | 	--explore	Relaxes default parameters to find less reliable relationships within data supplied and database
154 | 	--only-reconstruct	Database supplied will not be filtered and all sequences will be used as scaffold
155 | 						This option does not require R1 and R2, instead a contig file can be supplied
156 | 	-w 			Undo winner takes it all algorithm when clustering by kmer - QUICKER MODE
157 | 	Trimming:
158 | 	--trimmomatic-directory Indicate directory holding trimmomatic .jar executable
159 | 	--no-trim	Reads supplied will not be quality trimmed
160 | 
161 | 	Coverage and Clustering:
162 | 	-C | --coverage-cutoff	<int>	minimun coverage percentage to select a plasmid as scafold (0-100), default 80
163 | 	-S | --coverage-summary	<int>	minimun coverage percentage to include plasmids in summary image (0-100), default 90
164 | 	-f | --cluster	<int>	kmer identity to cluster plasmids into the same representative sequence (0 means identical) 		(0-1), default 0.5
165 | 	-k | --kmer	<int>	identity to filter plasmids from the database with kmer approach (0-1), default 0.95
166 | 
167 | 	Contig local alignment
168 | 	-i | --alignment-identity <int>	minimun identity percentage aligned for a contig to annotate, default 90
169 | 	-l | --alignment-percentage <int>	minimun length percentage aligned for a contig to annotate, default 20
170 | 	-L | --length-total	<int>	minimun alignment length to filter blast analysis
171 | 	--extend-annotation <int>	look for annotation over regions with no homology found (base pairs), default 500bp
172 | 
173 | 	Draw images:
174 | 	--config-directory <dir>	directory holding config files, default config_files/
175 | 	--config-file-individual <file-name> file name of the individual file used to reconstruct
176 | 	Additional options:
177 | 
178 | 	-M | --memory	<int>	max memory allowed to use
179 | 	-T | --threads	<int>	number of threads
180 | 	-v | --version		version
181 | 	-h | --help		display usage message
182 | 
183 | example: ./plasmidID.sh -1 ecoli_R1.fastq.gz -2 ecoli_R2.fastq.gz -d database.fasta -s ECO_553 -G ENTERO
184 | 	./plasmidID.sh -1 ecoli_R1.fastq.gz -2 ecoli_R2.fastq.gz -d PacBio_sample.fasta -c scaffolds.fasta -C 60 -s ECO_60 -G ENTERO --no-trim
185 | ```
186 | 
187 | ## Examples
188 | 
189 | Under construction
190 | 
191 | ## Output
192 | 
193 | Since v1.6, the more relevant output is located in GROUP/SAMPLE folder:
194 | 
195 | - **SAMPLE_final_results.html(.tab)**
196 | 	- id: Name of the accession number of reference
197 | 	- length: length of the reference sequence
198 | 	- species: species of the reference sequence
199 | 	- description: rest of reference fasta header
200 | 	- contig_name: number of the contigs that align the minimum required for complete contig track
201 | 	- SAMPLE:
202 | 		- Image of the reconstructed plasmid (click to open in new tab)
203 | 		- MAPPING % (percentage): percentage of reference covered with reads
204 | 			- X for contig mode (gray colour)
205 | 			- Orientative colouring (the closer to 100% the better)
206 | 		- ALIGN FR (fraction_covered): total length of contigs aligned (complete) / reference sequence length
207 | 			- Orientative colouring (the closer to 1 the better)
208 | 
209 | 
210 | ## Annotation file
211 | 
212 | Under construction
213 | 
214 | ## Illustrated pipeline
215 | 
216 | This image summarizes the PlasmidID pipeline, including the most important steps.
217 | For further details, including:
218 | - [Results interpretation](https://github.com/BU-ISCIII/plasmidID/wiki/Understanding-the-image:-track-by-track)
219 | - and more, please visit: [**PLASMIDID WIKI**](https://github.com/BU-ISCIII/plasmidID/wiki)
220 | 
221 | <p align="center"><img src="https://github.com/BU-ISCIII/plasmidID/blob/master/img/pipeline_pID.png" alt="workflow_small"  width="500"></p>
222 | 
223 | 


--------------------------------------------------------------------------------
/bin/adapt_filter_coverage.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
  4 | #or a compound command returns a non-zero status: If errors are not handled by user
  5 | set -e
  6 | #~set -x
  7 | 
  8 | #=============================================================
  9 | # HEADER
 10 | #=============================================================
 11 | 
 12 | #INSTITUTION:ISCIII
 13 | #CENTRE:BU-ISCIII
 14 | #AUTHOR: Pedro J. Sola
 15 | VERSION=1.0
 16 | #CREATED: 21 March 2018
 17 | #REVISION:
 18 | #DESCRIPTION:adapt_filter_coverage script that adapt percentages and filter coverage info from bedtools genomecov output
 19 | 
 20 | #================================================================
 21 | # END_OF_HEADER
 22 | #================================================================
 23 | 
 24 | # usage: print the help text (description, synopsis, options, example) to stdout.
 25 | # The here-document is unquoted, so $0 expands to the name the script was invoked with.
 26 | usage() {
 27 | 	cat << EOF
 28 | 
 29 | adapt_filter_coverage script that adapt percentages and filter coverage info from bedtools genomecov output
 30 | 
 31 | usage : $0 <-i inputfile(.fasta)> [-o <directory>] [-c <int(0-100)>] [-s <suffix>] [-v] [-h]
 32 | 
 33 | 	-i input file
 34 | 	-o output directory (optional). By default the file is replaced in the same location
 35 | 	-c percentage value to filter >= values. If not supplied, all records will be outputted
 36 | 	-s string to ad at the end of the outputted file (list of accession numbers)
 37 | 	-v version
 38 | 	-h display usage message
 39 | 
 40 | example: adapt_filter_coverage.sh -i ecoli.coverage -c 70
 41 | 
 42 | EOF
 43 | }
 44 | 
 45 | #================================================================
 46 | # OPTION_PROCESSING
 47 | #================================================================
 48 | # Called with no arguments at all: show the help on stderr and stop with status 1
 49 | if [[ $# -eq 0 ]]; then
 50 | 	usage >&2
 51 | 	exit 1
 52 | fi
 53 | 
 54 | # Error handling
 55 | error(){
 56 |   local parent_lineno="$1"
 57 |   local script="$2"
 58 |   local message="$3"
 59 |   local code="${4:-1}"
 60 | 
 61 | 	RED='\033[0;31m'
 62 | 	NC='\033[0m'
 63 | 
 64 |   if [[ -n "$message" ]] ; then
 65 |     echo -e "\n---------------------------------------\n"
 66 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 67 |     echo -e "MESSAGE:\n"
 68 |     echo -e "$message"
 69 |     echo -e "\n---------------------------------------\n"
 70 |   else
 71 |     echo -e "\n---------------------------------------\n"
 72 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 73 |     echo -e "\n---------------------------------------\n"
 74 |   fi
 75 | 
 76 |   exit "${code}"
 77 | }
 78 | 
 79 | # Defaults; the matching command-line option overrides each one
 80 | cwd="$(pwd)"
 81 | input_file="Input_file"
 82 | coverage_cutoff_input=100
 83 | 
 84 | # Parse the short options. The leading ':' in the option string selects getopts'
 85 | # silent mode, which routes unknown options to \? and missing arguments to ':'
 86 | options=":i:o:c:s:vh"
 87 | while getopts $options opt; do
 88 | 	case $opt in
 89 | 		i )
 90 | 			input_file=$OPTARG
 91 | 			;;
 92 | 		o )
 93 | 			output_dir=$OPTARG
 94 | 			;;
 95 | 		c )
 96 | 			# Reject anything that is not a plain number within 0-100 (text, signs, values above 100)
 97 | 			if ! [[ $OPTARG =~ ^[0-9]+([.][0-9]+)?$ ]] || awk -v value="$OPTARG" 'BEGIN { exit !(value > 100) }'; then
 98 | 				echo "please, provide a percentage between 0 and 100"
 99 | 				usage
100 | 				exit 1
101 | 			fi
102 | 			coverage_cutoff_input=$OPTARG
103 | 			;;
104 | 		s )
105 | 			suffix=$OPTARG
106 | 			;;
107 | 		h )
108 | 			usage
109 | 			exit 1
110 | 			;;
111 | 		v )
112 | 			echo $VERSION
113 | 			exit 1
114 | 			;;
115 | 		\?)
116 | 			echo "Invalid Option: -$OPTARG" 1>&2
117 | 			usage
118 | 			exit 1
119 | 			;;
120 | 		: )
121 | 			echo "Option -$OPTARG requires an argument." >&2
122 | 			exit 1
123 | 			;;
124 | 		* )
125 | 			echo "Unimplemented option: -$OPTARG" >&2;
126 | 			exit 1
127 | 			;;
128 | 
129 | 	esac
130 | done
131 | shift $((OPTIND-1))
132 | 
133 | #================================================================
134 | # MAIN_BODY
135 | #================================================================
136 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
137 | 
138 | echo -e "\n#Executing" $0 "\n"
139 | 
140 | check_mandatory_files.sh $input_file
141 | 
142 | 
143 | suffix="_adapted_filtered_"$coverage_cutoff_input
144 | coverage_cutoff=$(echo "(1 - ($coverage_cutoff_input/100))" | bc -l)
145 | 
146 | #echo $coverage_cutoff
147 | 
148 | if [ ! $output_dir ]; then
149 | 	output_dir=$(dirname $input_file)
150 | 	#echo "Default output directory is" $output_dir
151 | 	mkdir -p $output_dir
152 | else
153 | 	#echo "Output directory is" $output_dir
154 | 	mkdir -p $output_dir
155 | fi
156 | 
157 | 
158 | if [ ! $filename ]; then
159 | 	filename=$(basename $input_file | cut -d. -f1)
160 | fi
161 | 
162 | 
163 | if [ -f $input_file"_adapted" ]; then
164 | 	echo "Found previous" $(basename $input_file"_adapted")", removing it"
165 | 	rm $input_file"_adapted"
166 | fi
167 | 
168 | ## Keep information about positions with 0 coverage. If no 0 coverage positions for a plasmid, create line including this info.
169 | awk '
170 | BEGIN{OFS="\t"}
171 | (!x[$1]++) {if ($1 != "genome")
172 | 				{if ($2 == 0)
173 | 					{print $0}
174 | 				else
175 | 					{print $1, 0, $4, $4, 0.0000000001}
176 | 				}
177 | 			}
178 | 	' $input_file > $input_file"_adapted" || error ${LINENO} $(basename $0) "Awk command for bedtools coverage output parsing in $input_file\"_adapted\" creation. See $output_dir/logs for more information"
179 | 
180 | ## Keep plasmids with coverage < 1-coverage_cutoff_input/100
181 | awk '
182 | {if ($2 == 0 && $5 < '"${coverage_cutoff}"')
183 | 	 {print $1}
184 | }
185 | 	' $input_file"_adapted" > $input_file$suffix || error ${LINENO} $(basename $0) "Awk command for coverage filtering in $input_file$suffix creation. See $output_dir/logs for more information."
186 | 
187 | echo "$(date)"
188 | echo "Done filtering sequences with" $coverage_cutoff_input"% and greater coverage"
189 | echo "Those sequences can be found at" $input_file$suffix
190 | echo -e $(cat $input_file$suffix | wc -l) mapped equals or more than $coverage_cutoff_input "\n"
191 | 


--------------------------------------------------------------------------------
/bin/blast_align.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | set -e
  4 | 
  5 | #=============================================================
  6 | # HEADER
  7 | #=============================================================
  8 | 
  9 | #INSTITUTION:ISCIII
 10 | #CENTRE:BU-ISCIII
 11 | #AUTHOR: Pedro J. Sola
 12 | VERSION=1.0
 13 | #CREATED: 1 May 2018
 14 | #REVISION:
 15 | #DESCRIPTION:Script that blast a query against a database
 16 | #
 17 | #DOCUMENTATION
 18 | #
 19 | #Blast output6 with additions:
 20 | #1	 	Query label.(qseqid)
 21 | #2	 	Target or subject(database sequence or cluster centroid) label. (sseqid)
 22 | #3	 	Percent identity. (pident)
 23 | #4	 	Alignment length. (length)
 24 | #5	 	Number of mismatches. (mismatch)
 25 | #6	 	Number of gap opens. (gapopen)
 26 | #7	 	Start position in query. Query coordinates start with 1 at the first base in the sequence as it appears in the input file. For translated searches (nucleotide queries, protein targets), query start<end for +ve frame and start>end for -ve frame. (qstart)
 27 | #8	 	End position in query. (qend)
 28 | #9	 	Start position in target. Target coordinates start with 1 at the first base in sequence as it appears in the database. For untranslated nucleotide searches, target start<end for plus strand, start>end for a reverse-complement alignment. (sstart)
 29 | #10	 	End position in target. (send)
 30 | #11	 	E-value calculated using Karlin-Altschul statistics. (evalue)
 31 | #12	 	Bit score calculated using Karlin-Altschul statistics. (bitscore)
 32 | #13		Length of query (qlen)
 33 | #14		Length of target (slen)
 34 | #
 35 | #
 36 | #TO DO:
 37 | #
 38 | #Handle all types of blast: blastn, blastp...
 39 | #
 40 | #================================================================
 41 | # END_OF_HEADER
 42 | #================================================================
 43 | 
 44 | # usage: print the help text (description, synopsis, options, example) to stdout.
 45 | # The here-document is unquoted, so $0 expands to the name the script was invoked with.
 46 | usage() {
 47 | 	cat << EOF
 48 | 
 49 | blast_align is a script that blast a query against a database
 50 | 
 51 | usage : $0 <-i inputfile(query)> <-d inputfile(database)> [-p <prefix>] [-o <directory>] [-t <nucl|prot>]
 52 | 		[-T <threads>] [-e <evalue>] [-v] [-h]
 53 | 
 54 | 	-i query file in FASTA format
 55 |     -d database to blast against
 56 | 	-o output directory, default same directory as query
 57 | 	-p prefix for blast identification (mandatory) and output file name
 58 | 	-q type of query, nucl by default
 59 | 	-t type of database, nucl by default
 60 |     -e evalue for blast analysis, default 0.0001
 61 | 	-T number of threads
 62 | 	-v version
 63 | 	-h display usage message
 64 | 
 65 | Output directory is the same as input directory by default
 66 | 
 67 | example: blast_align -i ecoli.fasta -d plasmid_ddbb.fasta -p plasmid
 68 | 
 69 | 
 70 | EOF
 71 | }
 72 | 
 73 | 
 74 | #================================================================
 75 | # OPTION_PROCESSING
 76 | #================================================================
 77 | # Called with no arguments at all: show the help on stderr and stop with status 1
 78 | if [[ $# -eq 0 ]]; then
 79 | 	usage >&2
 80 | 	exit 1
 81 | fi
 82 | 
 83 | # Error handling
 84 | error(){
 85 |   local parent_lineno="$1"
 86 |   local script="$2"
 87 |   local message="$3"
 88 |   local code="${4:-1}"
 89 | 
 90 | 	RED='\033[0;31m'
 91 | 	NC='\033[0m'
 92 | 
 93 |   if [[ -n "$message" ]] ; then
 94 |     echo -e "\n---------------------------------------\n"
 95 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 96 |     echo -e "MESSAGE:\n"
 97 |     echo -e "$message"
 98 |     echo -e "\n---------------------------------------\n"
 99 |   else
100 |     echo -e "\n---------------------------------------\n"
101 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
102 |     echo -e "\n---------------------------------------\n"
103 |   fi
104 | 
105 |   exit "${code}"
106 | }
107 | 
108 | # Defaults; the matching command-line option overrides each one
109 | cwd="$(pwd)"
110 | group="NO_GROUP"
111 | input_file="Input_file"
112 | database="Database"
113 | query_type="nucl"
114 | database_type="nucl"
115 | evalue=0.0001
116 | threads=1
117 | blast_command="blastn"
118 | 
119 | # Parse the short options (silent getopts mode, selected by the leading ':').
120 | # 'g:' is listed so that the g arm below can actually be reached.
121 | options=":i:o:p:f:d:q:t:g:e:T:vh"
122 | while getopts $options opt; do
123 | 	case $opt in
124 | 		i )
125 | 			input_file=$OPTARG
126 | 			;;
127 | 		d )
128 | 			database=$OPTARG
129 | 			;;
130 | 		o )
131 | 			output_dir=$OPTARG
132 | 			;;
133 | 		p )
134 | 			prefix=$OPTARG
135 | 			;;
136 | 		f )
137 | 			file_name=$OPTARG
138 | 			;;
139 | 		t )
140 | 			database_type=$OPTARG
141 | 			;;
142 | 		q )
143 | 			query_type=$OPTARG
144 | 			;;
145 | 		g )
146 | 			group=$OPTARG
147 | 			;;
148 | 		e )
149 | 			evalue=$OPTARG
150 | 			;;
151 | 		T )
152 | 			threads=$OPTARG
153 | 			;;
154 | 		h )
155 | 			usage
156 | 			exit 1
157 | 			;;
158 | 		v )
159 | 			echo $VERSION
160 | 			exit 1
161 | 			;;
162 | 		\?)
163 | 			echo "Invalid Option: -$OPTARG" 1>&2
164 | 			usage
165 | 			exit 1
166 | 			;;
167 | 		: )
168 | 			echo "Option -$OPTARG requires an argument." >&2
169 | 			exit 1
170 | 			;;
171 | 		* )
172 | 			echo "Unimplemented option: -$OPTARG" >&2;
173 | 			exit 1
174 | 			;;
175 | 
176 | 	esac
177 | done
178 | shift $((OPTIND-1))
179 | 
180 | #================================================================
181 | # MAIN_BODY
182 | #================================================================
183 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
184 | 
185 | echo -e "\n#Executing" $0 "\n"
186 | 
187 | check_mandatory_files.sh "$input_file" "$database"
188 | 
189 | #check_dependencies.sh blastn
190 | 
191 | # A prefix is mandatory because it becomes part of the output file name
192 | if [ -z "$prefix" ]; then
193 |   echo "please provide a prefix to identify this blast analysis"
194 |   exit 1
195 | fi
196 | # Only "prot" and "nucl" are accepted; the message reports the query type actually selected
197 | if [ "$query_type" == "prot" ] || [ "$query_type" == "nucl" ]; then
198 |   echo "query type selected as" $query_type
199 | else
200 |   echo "please provide a proper query type"
201 |   exit 1
202 | fi
203 | 
204 | if [ "$query_type" == "prot" ]; then
205 |   blast_command="tblastn"
206 | fi
207 | # Without -o the results are written next to the input file
208 | if [ -z "$output_dir" ]; then
209 |   output_dir=$(dirname "$input_file")
210 |   echo "Default output directory is" $output_dir
211 |   mkdir -p "$output_dir"
212 | else
213 |   echo "Output directory is" $output_dir
214 |   mkdir -p "$output_dir"
215 | fi
216 | # Without -f the name is the input file name up to its first dot
217 | if [ -z "$file_name" ]; then
218 |   file_name=$(basename "$input_file" | cut -d. -f1)
219 |   echo "filename is" $file_name
220 | fi
221 | 
222 | database_name=$(basename "$database")
223 | database_dir=$(dirname "$database")
224 | 
225 | ##BLAST EXECUTION
226 | # The BLAST database is built next to the source fasta as <database_name>.blast.tmp
227 | echo "$(date)"
228 | echo "Blasting" $file_name "agaist" $database_name
229 | 
230 | makeblastdb -in "$database" -out "$database_dir/$database_name.blast.tmp" -dbtype "$database_type" || error ${LINENO} $(basename $0) "Makeblastdb command failed. See $output_dir/logs for more information."
231 | 
232 | echo "BLAST command is" $blast_command
233 | 
234 | $blast_command -query "$input_file" \
235 |   -db "$database_dir/$database_name.blast.tmp" \
236 |   -out "$output_dir/$file_name.$prefix.blast" \
237 |   -evalue "$evalue" \
238 |   -num_threads "$threads" \
239 |   -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen" || error ${LINENO} $(basename $0) "Blastn command failed. See $output_dir/logs for more information"
240 | 
241 | 
242 | echo "$(date)"
243 | echo "Done blasting" $file_name "agaist" $database_name
244 | echo -e "blasted file can be found in" $output_dir/$file_name"."$prefix".blast" "\n"
245 | 


--------------------------------------------------------------------------------
/bin/blast_to_bed.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
  4 | #or a compound command returns a non-zero status: If errors are not handled by user
  5 | #set -e
  6 | #set -x
  7 | 
  8 | #=============================================================
  9 | # HEADER
 10 | #=============================================================
 11 | 
 12 | #INSTITUTION:ISCIII
 13 | #CENTRE:BU-ISCIII
 14 | #AUTHOR: Pedro J. Sola
 15 | VERSION=1.0
 16 | #CREATED: 4 May 2018
 17 | #REVISION:
 18 | #06 May 2018: add id option in bed output
 19 | #04 June 2018: add an option for an additional division, mostly for ABR sort
 20 | #
 21 | #DESCRIPTION:blast_to_bed script obtains a BED file with coordinates of local blast alignments matching some given conditions
 22 | #================================================================
 23 | # END_OF_HEADER
 24 | #================================================================
 25 | 
 26 | # usage: print the help text of blast_to_bed on stdout (callers redirect it when needed).
 27 | # The defaults quoted in the text mirror the variable defaults declared further down in the script.
 28 | usage() {
 29 | 	cat << EOF
 30 | 
 31 | blast_to_bed is a script that obtains a BED file with coordinates of local blast alignments matching some given conditions
 32 | 
 33 | usage : $0 <-i inputfile(.blast)> [-o <directory>] [-b <int(0-100)>] [-l <int(0-100)>] [-L <int>]
 34 | 		[-d <delimiter>] [-D (l|r)] [-q <delimiter>] [-Q (l|r)] [-U <delimiter>] [-I] [-u] [-v] [-h]
 35 | 
 36 | 	-i input file
 37 | 	-b blast identity cutoff (0 - 100), default 90
 38 | 	-l blast length percentage cutoff (0 - 100), default 10, use 90 for genes
 39 | 	-L blast length alignment cutoff, default 0, use 200 or 500 for contigs
 40 | 	-o output directory (optional). By default the directory of the input file is used
 41 | 	-q query character delimiter, default "_"
 42 | 	-Q query field to retrieve (l=left, r=right), default left
 43 | 	-d database character delimiter, default "_"
 44 | 	-D database field to retrieve (l=left, r=right), default right
 45 | 	-I contig mode
 46 | 	-u unique. Outputs only one query entry per database entry
 47 | 	-U unique mode with delimiter. Outputs only one delimited query per database entry
 48 | 	-v version
 49 | 	-h display usage message
 50 | 
 51 | example: blast_to_bed.sh -i ecoli_prefix.blast -b 80 -l 50 -q - -Q r
 52 | 
 53 | EOF
 54 | }
 55 | 
 56 | #================================================================
 57 | # OPTION_PROCESSING
 58 | #================================================================
 59 | # With no arguments at all, print the usage text on stderr and stop with status 1
 60 | if (( $# == 0 )); then
 61 |   usage 1>&2
 62 |   exit 1
 63 | fi
 64 | 
 65 | error(){
 66 |   local parent_lineno="$1"
 67 |   local script="$2"
 68 |   local message="$3"
 69 |   local code="${4:-1}"
 70 | 
 71 | 	RED='\033[0;31m'
 72 | 	NC='\033[0m'
 73 | 
 74 |   if [[ -n "$message" ]] ; then
 75 |     echo -e "\n---------------------------------------\n"
 76 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 77 |     echo -e "MESSAGE:\n"
 78 |     echo -e "$message"
 79 |     echo -e "\n---------------------------------------\n"
 80 |   else
 81 |     echo -e "\n---------------------------------------\n"
 82 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 83 |     echo -e "\n---------------------------------------\n"
 84 |   fi
 85 | 
 86 |   exit "${code}"
 87 | }
 88 | 
 89 | #DEFAULT VALUES (the option that overrides each one is named on its line)
 90 | cwd="$(pwd)"
 91 | input_file="Input_file"       # -i
 92 | blast_id_cutoff=90            # -b
 93 | blast_len_percentage=10       # -l
 94 | blast_len_alignment=0         # -L
 95 | database_delimiter="_"        # -d
 96 | database_field=r              # -D
 97 | query_delimiter="_"           # -q
 98 | query_field=l                 # -Q
 99 | unique=false                  # -u
100 | unique_divider=false          # -U
101 | divider_delimiter="-"         # -U <delimiter>
102 | suffix=""                     # temporary-file suffix, set by -u / -U
103 | id_circos=false               # -I
104 | id_output=""                  # extra awk print field, set by -I
105 | 
106 | #OPTION PARSING WITH getopts
107 | #the leading ":" selects silent mode: problems are reported through the "\?" and ":" arms
108 | options=":i:b:q:Q:d:D:o:l:L:U:Iuvh"
109 | while getopts "$options" opt; do
110 |   case "$opt" in
111 |     i )
112 |       input_file=$OPTARG
113 |       ;;
114 |     o )
115 |       output_dir=$OPTARG
116 |       ;;
117 |     L )
118 |       blast_len_alignment=$OPTARG
119 |       ;;
120 |     d )
121 |       database_delimiter=$OPTARG
122 |       ;;
123 |     D )
124 |       database_field=$OPTARG
125 |       ;;
126 |     q )
127 |       query_delimiter=$OPTARG
128 |       ;;
129 |     Q )
130 |       query_field=$OPTARG
131 |       ;;
132 |     b )  # identity cutoff, range-checked
133 |       if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
134 |         echo "please, provide a percentage between 0 and 100"
135 |         exit 1
136 |       else
137 |         blast_id_cutoff=$OPTARG
138 |       fi
139 |       ;;
140 |     l )  # length percentage, range-checked
141 |       if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
142 |         echo "please, provide a percentage between 0 and 100"
143 |         exit 1
144 |       else
145 |         blast_len_percentage=$OPTARG
146 |       fi
147 |       ;;
148 |     u )
149 |       unique=true
150 |       suffix=".unique.tmp"
151 |       ;;
152 |     U )
153 |       unique_divider=true
154 |       suffix=".unique.divider.tmp"
155 |       divider_delimiter=$OPTARG
156 |       ;;
157 |     I )
158 |       id_circos=true
159 |       id_output=",\"id=\"query_name[length(query_name)]"
160 |       ;;
161 |     h )
162 |       usage
163 |       exit 1
164 |       ;;
165 |     v )
166 |       echo $VERSION
167 |       exit 1
168 |       ;;
169 |     \? )
170 |       echo "Invalid Option: -$OPTARG" 1>&2
171 |       usage
172 |       exit 1
173 |       ;;
174 |     : )
175 |       echo "Option -$OPTARG requires an argument." >&2
176 |       exit 1
177 |       ;;
178 |     * )
179 |       echo "Unimplemented option: -$OPTARG" >&2;
180 |       exit 1
181 |       ;;
182 | 
183 |   esac
184 | done
185 | shift $((OPTIND-1))
186 | 
187 | #================================================================
188 | # MAIN_BODY
189 | #================================================================
190 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
191 | 
192 | echo -e "\n#Executing" $0 "\n"
193 | 
194 | check_mandatory_files.sh $input_file
195 | 
196 | 
197 | blast_len_percentage_value=$(echo "($blast_len_percentage/100)" | bc -l)
198 | #blast_len_percentage_decimal=$(echo $blast_len_percentage_value | sed 's/0\{1,\}$//')
199 | 
200 | 
201 | if [ ! $output_dir ]; then
202 | 	output_dir=$(dirname $input_file)
203 | 	#echo "Default output directory is" $output_dir
204 | 	mkdir -p $output_dir
205 | else
206 | 	#echo "Output directory is" $output_dir
207 | 	mkdir -p $output_dir
208 | fi
209 | 
210 | 
211 | if [ ! $file_name ]; then
212 | 	file_name=$(basename $input_file | cut -d. -f1,2)
213 | fi
214 | 
215 | ##CHECK FIELDS TO RETRIEVE
216 | 
217 | if [ "$database_field" == "l" ] || [ "$database_field" == "r" ]; then
218 | 
219 | 	if [ $database_field == l ]; then
220 | 		database_field="1"
221 | 	else
222 | 		database_field="length(database_name)"
223 | 	fi
224 | 
225 | else
226 | 	echo "Please introduce r or l for database"
227 | 	exit 1
228 | fi
229 | 
230 | if [ $query_field == "l" ] || [ $query_field == "r" ]; then
231 | 
232 | 	if [ $query_field == l ]; then
233 | 		query_field="1"
234 | 	else
235 | 		query_field="length(query_name)"
236 | 	fi
237 | 
238 | else
239 | 
240 | 	echo "Please introduce 0 or 1 for query"
241 | 	exit 1
242 | fi
243 | 
244 | echo "$(date)"
245 | echo "Adapting blast to bed using" $(basename $input_file) "with:"
246 | echo "Blast identity=" $blast_id_cutoff
247 | echo "Min length aligned=" $blast_len_alignment
248 | echo "Min len percentage=" $blast_len_percentage
249 | echo "database_delimiter=" $database_delimiter
250 | echo "database_field)=" $database_field
251 | echo "query_delimiter=" $query_delimiter
252 | echo "query_field=" $query_field
253 | # Sort the hits on column 3 (numeric, descending), keep rows with $3 >= identity cutoff, $4/$13 >= length fraction and
254 | # $4 >= minimum length, then print: database name piece, $9, $10, query name piece and, with -I, an extra "id=" field.
255 | cat $input_file | sort -k3 -nr | \
256 | awk '
257 | 	{OFS="\t"
258 | 	split($2, database_name, "'"${database_delimiter}"'")
259 | 	split($1, query_name, "'"${query_delimiter}"'")}
260 | 	(($3 >= '"${blast_id_cutoff}"')&&(($4/$13) >= '"${blast_len_percentage_value}"')&&($4 >= '"${blast_len_alignment}"')) \
261 | 	{print database_name['"$database_field"'], $9, $10, query_name['"$query_field"']'"$id_output"'}
262 | 	' \
263 | > $output_dir/$file_name".bed"$suffix || error ${LINENO} $(basename $0) "AWK command fail in $file_name\".bed\"$suffix. See $output_dir/logs for more information."
264 | # -u: keep only the first line seen for each combination of column 1 and column 4 of the temporary BED file,
265 | # write the result to the final .bed file and delete the temporary file.
266 | if [ "$unique" == "true" ]; then
267 | 	echo "unique option enabled"
268 |     awk '
269 |         (!x[$1$4]++)
270 |     	' $output_dir/$file_name".bed"$suffix \
271 | 	> $output_dir/$file_name".bed" || error ${LINENO} $(basename $0) "AWK command fail in $file_name\".bed\". See $output_dir/logs for more information."
272 | 	rm $output_dir/$file_name".bed"$suffix
273 | fi
274 | # -U: like -u, but column 4 is first split on the -U delimiter and only its first piece is used in the key.
275 | # NOTE(review): -u and -U share $suffix, so passing both at once looks unsupported - confirm.
276 | if [ "$unique_divider" == "true" ]; then
277 | 	echo "unique delimiter option enabled"
278 |     awk '
279 |     	{split($4,query,"'"${divider_delimiter}"'")}
280 |         (!x[query[1]$1]++)
281 |     	' $output_dir/$file_name".bed"$suffix \
282 | 	> $output_dir/$file_name".bed" || error ${LINENO} $(basename $0) "AWK command fail in $file_name\".bed\"$suffix. See $output_dir/logs for more information."
283 | 	rm $output_dir/$file_name".bed"$suffix
284 | fi
285 | 
286 | 
287 | 
288 | echo "$(date)"
289 | echo "DONE adapting blast to bed"
290 | echo -e "File can be found at" $output_dir/$file_name".bed" "\n"
291 | 


--------------------------------------------------------------------------------
/bin/blast_to_complete.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
  4 | #or a compound command returns a non-zero status: If errors are not handled by user
  5 | #set -e
  6 | #set -x
  7 | 
  8 | #=============================================================
  9 | # HEADER
 10 | #=============================================================
 11 | 
 12 | #INSTITUTION:ISCIII
 13 | #CENTRE:BU-ISCIII
 14 | #AUTHOR: Pedro J. Sola
 15 | VERSION=1.0
 16 | #CREATED: 13 May 2018
 17 | #
 18 | #DESCRIPTION:blast_to_complete script obtains the full length of sequences from blast output and adapts it to circos
 19 | #================================================================
 20 | # END_OF_HEADER
 21 | #================================================================
 22 | 
 23 | # usage: print the help text of blast_to_complete on stdout (callers redirect it when needed).
 24 | # The defaults quoted in the text mirror the variable defaults declared further down in the script.
 25 | usage() {
 26 | 	cat << EOF
 27 | 
 28 | blast_to_complete is a script that obtains the full length of sequences from blast and adapts it to circos
 29 | 
 30 | usage : $0 <-i inputfile(.blast)> [-o <directory>] [-b <int(0-100)>] [-l <int(0-100)>]
 31 | 		[-d <delimiter>] [-D (l|r)] [-q <delimiter>] [-Q (l|r)] [-I] [-u] [-v] [-h]
 32 | 
 33 | 	-i input file
 34 | 	-b blast identity cutoff (0 - 100), default 90
 35 | 	-l blast length percentage cutoff (0 - 100), default 15, use 90 for genes
 36 | 	-o output directory (optional). By default the directory of the input file is used
 37 | 	-q query character delimiter, default "_"
 38 | 	-Q query field to retrieve (l=left, r=right), default right
 39 | 	-d database character delimiter, default "-"
 40 | 	-D database field to retrieve (l=left, r=right), default right
 41 | 	-I contig mode
 42 | 	-u unique. Outputs only one query entry per database entry
 43 | 	-v version
 44 | 	-h display usage message
 45 | 
 46 | example: blast_to_complete.sh -i ecoli_prefix.blast
 47 | EOF
 48 | }
 49 | 
 50 | #================================================================
 51 | # OPTION_PROCESSING
 52 | #================================================================
 53 | # With no arguments at all, print the usage text on stderr and stop with status 1
 54 | if (( $# == 0 )); then
 55 |   usage 1>&2
 56 |   exit 1
 57 | fi
 58 | 
 59 | # Error handling
 60 | error(){
 61 |   local parent_lineno="$1"
 62 |   local script="$2"
 63 |   local message="$3"
 64 |   local code="${4:-1}"
 65 | 
 66 | 	RED='\033[0;31m'
 67 | 	NC='\033[0m'
 68 | 
 69 |   if [[ -n "$message" ]] ; then
 70 |     echo -e "\n---------------------------------------\n"
 71 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 72 |     echo -e "MESSAGE:\n"
 73 |     echo -e "$message"
 74 |     echo -e "\n---------------------------------------\n"
 75 |   else
 76 |     echo -e "\n---------------------------------------\n"
 77 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 78 |     echo -e "\n---------------------------------------\n"
 79 |   fi
 80 | 
 81 |   exit "${code}"
 82 | }
 83 | 
 84 | #DEFAULT VALUES (the option that overrides each one is named on its line)
 85 | cwd="$(pwd)"
 86 | input_file="Input_file"      # -i
 87 | blast_id_cutoff=90           # -b
 88 | blast_len_percentage=15      # -l
 89 | database_delimiter="-"       # -d
 90 | database_field=r             # -D
 91 | query_delimiter="_"          # -q
 92 | query_field=r                # -Q
 93 | unique=false                 # -u
 94 | suffix=""                    # set by -u
 95 | id_circos=false              # -I
 96 | id_output=""                 # set by -I
 97 | 
 98 | #OPTION PARSING WITH getopts
 99 | #the leading ":" selects silent mode: problems are reported through the "\?" and ":" arms
100 | options=":i:b:q:Q:d:D:o:l:Iuvh"
101 | while getopts "$options" opt; do
102 |   case "$opt" in
103 |     i )
104 |       input_file=$OPTARG
105 |       ;;
106 |     o )
107 |       output_dir=$OPTARG
108 |       ;;
109 |     d )
110 |       database_delimiter=$OPTARG
111 |       ;;
112 |     D )
113 |       database_field=$OPTARG
114 |       ;;
115 |     q )
116 |       query_delimiter=$OPTARG
117 |       ;;
118 |     Q )
119 |       query_field=$OPTARG
120 |       ;;
121 |     b )  # identity cutoff, range-checked
122 |       if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
123 |         echo "please, provide a percentage between 0 and 100"
124 |         exit 1
125 |       else
126 |         blast_id_cutoff=$OPTARG
127 |       fi
128 |       ;;
129 |     l )  # length percentage, range-checked
130 |       if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
131 |         echo "please, provide a percentage between 0 and 100"
132 |         exit 1
133 |       else
134 |         blast_len_percentage=$OPTARG
135 |       fi
136 |       ;;
137 |     u )
138 |       unique=true
139 |       suffix=".unique.tmp"
140 |       ;;
141 |     I )
142 |       id_circos=true
143 |       id_output=",\"id=\"database_name[length(database_name)]"
144 |       ;;
145 |     h )
146 |       usage
147 |       exit 1
148 |       ;;
149 |     v )
150 |       echo $VERSION
151 |       exit 1
152 |       ;;
153 |     \? )
154 |       echo "Invalid Option: -$OPTARG" 1>&2
155 |       usage
156 |       exit 1
157 |       ;;
158 |     : )
159 |       echo "Option -$OPTARG requires an argument." >&2
160 |       exit 1
161 |       ;;
162 |     * )
163 |       echo "Unimplemented option: -$OPTARG" >&2;
164 |       exit 1
165 |       ;;
166 | 
167 |   esac
168 | done
169 | shift $((OPTIND-1))
170 | 
171 | #================================================================
172 | # MAIN_BODY
173 | #================================================================
174 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
175 | 
176 | echo -e "\n#Executing" $0 "\n"
177 | 
178 | check_mandatory_files.sh $input_file
179 | 
180 | 
181 | blast_len_percentage_value=$(echo "($blast_len_percentage/100)" | bc -l)
182 | #blast_len_percentage_decimal=$(echo $blast_len_percentage_value | sed 's/0\{1,\}$//')
183 | 
184 | 
185 | if [ ! $output_dir ]; then
186 | 	output_dir=$(dirname $input_file)
187 | 	#echo "Default output directory is" $output_dir
188 | 	mkdir -p $output_dir
189 | else
190 | 	#echo "Output directory is" $output_dir
191 | 	mkdir -p $output_dir
192 | fi
193 | 
194 | 
195 | if [ ! $file_name ]; then
196 | 	file_name=$(basename $input_file | cut -d. -f1,2)
197 | fi
198 | 
199 | ##CHECK FIELDS TO RETRIEVE
200 | 
201 | if [ "$database_field" == "l" ] || [ "$database_field" == "r" ]; then
202 | 
203 | 	if [ $database_field == l ]; then
204 | 		database_field="1"
205 | 	else
206 | 		database_field="length(database_name)"
207 | 	fi
208 | 
209 | else
210 | 	echo "Please introduce r or l for database"
211 | 	exit 1
212 | fi
213 | 
214 | if [ $query_field == "l" ] || [ $query_field == "r" ]; then
215 | 
216 | 	if [ $query_field == l ]; then
217 | 		query_field="1"
218 | 	else
219 | 		query_field="length(query_name)"
220 | 	fi
221 | 
222 | else
223 | 
224 | 	echo "Please introduce 0 or 1 for query"
225 | 	exit 1
226 | fi
227 | 
228 | echo "$(date)"
229 | echo "Adapting blast to complete using" $(basename $input_file) "with:"
230 | echo "Blast identity=" $blast_id_cutoff
231 | echo "Min len percentage=" $blast_len_percentage
232 | # Pass 1 (creates .complete): first hit per $1$2 pair with $3 >= identity cutoff and $4/$13 >= length fraction. The hit ($9,$10 ordered as
233 | # pos1/pos2) is extended by $7 and by ext2=$13-$8, clamped to 1 and $14. NOTE(review): $9 == $10 matches no sign test, coords would be stale - confirm.
234 | cat $input_file |\
235 | awk '
236 | 	BEGIN{OFS="\t"}
237 | 	{split($1, query_name, "'"${query_delimiter}"'")
238 | 	split($2,database_name, "'"${database_delimiter}"'")}
239 | 	(($3 >= '"${blast_id_cutoff}"') && (($4/$13)>='"${blast_len_percentage_value}"') && (!x[$1$2]++)) \
240 | 	{{isInverted=($10-$9)
241 | 		ext2=($13-$8)}
242 | 		{if (isInverted < 0)
243 | 			{pos1 = $10
244 | 			pos2 = $9}
245 | 		else
246 | 			{pos1 =$9
247 | 			pos2 = $10}
248 | 		{if ((isInverted < 0) && (($14 - pos2) > $7))
249 | 			{coordChr2 = (pos2 + $7)}
250 | 		else if ((isInverted < 0) && (($14 - pos2) <= $7))
251 | 			{coordChr2=$14}
252 | 		{if ((isInverted < 0) && (ext2 <= pos1))
253 | 			{coordChr1= pos1 - ext2;}
254 | 		else if ((isInverted < 0) && (ext2 > pos1))
255 | 			{coordChr1= 1}
256 | 		{if ((isInverted > 0) && (pos1 > $7))
257 | 			{coordChr1=(pos1 - $7)}
258 | 		else if ((isInverted > 0) && (pos1 <= $7))
259 | 			{coordChr1=1}
260 | 		{if ((isInverted > 0) && (ext2 > ($14-pos2)))
261 | 			{coordChr2= $14;}
262 | 		else if ((isInverted > 0) && (ext2 <= ($14-pos2)))
263 | 			{coordChr2= (pos2 + ext2)}
264 | 	{print database_name['"$database_field"'], coordChr1, coordChr2, query_name['"$query_field"'], "id="$13} }}}}}}
265 | 	' \
266 | 	>$output_dir/$file_name".complete"|| error ${LINENO} $(basename $0) "Awk command parsing blast output for circos input in $file_name\".complete\" creation failed. See $output_dir/logs for more information"
267 | # Pass 2 (appends to .complete): same filter; for every extension that would run past position 1 or past $14 an extra line is printed covering
268 | # the overflowing length at the opposite end of the sequence (presumably the wrap-around on a circular sequence - confirm).
269 | cat $input_file |\
270 | awk '
271 | 	BEGIN{OFS="\t"}
272 | 	{split($1, query_name, "'"${query_delimiter}"'")
273 | 	split($2,database_name, "'"${database_delimiter}"'")}
274 | 	(($3 >= '"${blast_id_cutoff}"') && (($4/$13)>='"${blast_len_percentage_value}"') && (!x[$2$1]++)) \
275 | 	{{isInverted=($10-$9)
276 | 	ext2=($13-$8)}
277 | 	{if (isInverted < 0)
278 | 		{pos1=$10
279 | 		pos2=$9}
280 | 	else
281 | 		{pos1 =$9
282 | 		pos2=$10}; \
283 | 	{if ((isInverted < 0) && (($14 - pos2) < $7))
284 | 		{coordChr1=1
285 | 		coordChr2=($7-($14-pos2))
286 | 		{print database_name['"$database_field"'], coordChr1, coordChr2, query_name['"$query_field"'], "id="$13}}
287 | 	{if ((isInverted < 0) && (ext2 > pos1))
288 | 		{coordChr1=($14-(ext2-pos1))
289 | 		coordChr2=$14
290 | 		{print database_name['"$database_field"'], coordChr1, coordChr2, query_name['"$query_field"'], "id="$13}}
291 | 	{if ((isInverted > 0) && (pos1 < $7))
292 | 		{coordChr1=($14-($7-pos1))
293 | 		coordChr2=$14
294 | 		{print database_name['"$database_field"'], coordChr1, coordChr2, query_name['"$query_field"'], "id="$13}}
295 | 	{if ((isInverted > 0) && (ext2 > ($14-pos2)))
296 | 		{coordChr1=1
297 | 		coordChr2=(ext2-($14-pos2))
298 | 		{print database_name['"$database_field"'], coordChr1, coordChr2, query_name['"$query_field"'], "id="$13}
299 | 		}
300 | 	}}}}}}
301 | 	' \
302 | 	>>$output_dir/$file_name".complete" || error ${LINENO} $(basename $0) "Awk command parsing blast output for circos input in $file_name\".complete\" second step creation failed. See $output_dir/logs for more information"
303 | 
304 | # Closing report: timestamp, completion notice and the location of the generated file.
305 | # The last argument is the "\n" escape expanded by "echo -e" (it was "/n", which was printed literally).
306 | echo "$(date)"
307 | echo "DONE adapting blast to complete"
308 | echo -e "File can be found at" $output_dir/$file_name".complete" "\n"
309 | 


--------------------------------------------------------------------------------
/bin/blast_to_link.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
  4 | #or a compound command returns a non-zero status: If errors are not handled by user
  5 | #set -e
  6 | #set -x
  7 | 
  8 | #=============================================================
  9 | # HEADER
 10 | #=============================================================
 11 | 
 12 | #INSTITUTION:ISCIII
 13 | #CENTRE:BU-ISCIII
 14 | #AUTHOR: Pedro J. Sola
 15 | VERSION=1.0
 16 | #CREATED: 14 May 2018
 17 | #
 18 | #DESCRIPTION:blast_to_link script that obtains a link file representing duplications between all members of the query
 19 | #================================================================
 20 | # END_OF_HEADER
 21 | #================================================================
 22 | 
 23 | # usage: print the help text of blast_to_link on stdout (callers redirect it when needed).
 24 | # The defaults quoted in the text mirror the variable defaults declared further down in the script.
 25 | usage() {
 26 | 	cat << EOF
 27 | 
 28 | blast_to_link is a script that obtains a link file representing duplications between all members of the query
 29 | 
 30 | usage : $0 <-i inputfile(.blast)> [-o <directory>] [-b <int(0-100)>] [-l <int(0-100)>]
 31 | 		[-d <delimiter>] [-D (l|r)] [-q <delimiter>] [-Q (l|r)] [-I] [-u] [-v] [-h]
 32 | 
 33 | 	-i input file
 34 | 	-b blast identity cutoff (0 - 100), default 90
 35 | 	-l blast length percentage cutoff (0 - 100), default 50, use 90 for genes
 36 | 	-o output directory (optional). By default the directory of the input file is used
 37 | 	-q query character delimiter, default "_"
 38 | 	-Q query field to retrieve (l=left, r=right), default right
 39 | 	-d database character delimiter, default "-"
 40 | 	-D database field to retrieve (l=left, r=right), default left
 41 | 	-I contig mode
 42 | 	-v version
 43 | 	-h display usage message
 44 | 
 45 | example: blast_to_link.sh -i ecoli_prefix.blast -b 80 -l 50
 46 | EOF
 47 | }
 48 | 
 49 | #================================================================
 50 | # OPTION_PROCESSING
 51 | #================================================================
 52 | # With no arguments at all, print the usage text on stderr and stop with status 1
 53 | if (( $# == 0 )); then
 54 |   usage 1>&2
 55 |   exit 1
 56 | fi
 57 | 
 58 | # Error handling
 59 | error(){
 60 |   local parent_lineno="$1"
 61 |   local script="$2"
 62 |   local message="$3"
 63 |   local code="${4:-1}"
 64 | 
 65 | 	RED='\033[0;31m'
 66 | 	NC='\033[0m'
 67 | 
 68 |   if [[ -n "$message" ]] ; then
 69 |     echo -e "\n---------------------------------------\n"
 70 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 71 |     echo -e "MESSAGE:\n"
 72 |     echo -e "$message"
 73 |     echo -e "\n---------------------------------------\n"
 74 |   else
 75 |     echo -e "\n---------------------------------------\n"
 76 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 77 |     echo -e "\n---------------------------------------\n"
 78 |   fi
 79 | 
 80 |   exit "${code}"
 81 | }
 82 | 
 83 | #DECLARE FLAGS AND VARIABLES (the option that overrides each one is named on its line)
 84 | cwd="$(pwd)"
 85 | input_file="Input_file"      # -i
 86 | blast_id_cutoff=90           # -b
 87 | blast_len_percentage=50      # -l
 88 | blast_len_alignment=0
 89 | database_delimiter="-"       # -d
 90 | database_field=l             # -D
 91 | query_delimiter="_"          # -q
 92 | query_field=r                # -Q
 93 | unique=false                 # -u
 94 | suffix=""                    # set by -u
 95 | id_circos=false              # -I
 96 | id_output=""                 # extra awk print field, set by -I
 97 | 
 98 | #PARSE VARIABLE ARGUMENTS WITH getopts; the leading ":" selects silent error reporting
 99 | #"L:" is accepted by getopts but has no arm below, so -L ends up in the "* )" arm
100 | options=":i:b:q:Q:d:D:o:l:L:Iuvh"
101 | while getopts $options opt; do
102 |   case $opt in
103 |     i )
104 |       input_file=$OPTARG
105 |       ;;
106 |     b )
107 |       if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
108 |         echo "please, provide a percentage between 0 and 100"
109 |         exit 1
110 |       else
111 |         blast_id_cutoff=$OPTARG
112 |       fi
113 |       ;;
114 |     o )
115 |       output_dir=$OPTARG
116 |       ;;
117 |     l )
118 |       if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
119 |         echo "please, provide a percentage between 0 and 100"
120 |         exit 1
121 |       else
122 |         blast_len_percentage=$OPTARG
123 |       fi
124 |       ;;
125 |     d )
126 |       database_delimiter=$OPTARG
127 |       ;;
128 |     D )
129 |       database_field=$OPTARG
130 |       ;;
131 |     q )
132 |       query_delimiter=$OPTARG
133 |       ;;
134 |     Q )
135 |       query_field=$OPTARG
136 |       ;;
137 |     u )
138 |       unique=true
139 |       suffix=".unique.tmp"
140 |       ;;
141 |     I )
142 |       id_circos=true
143 |       id_output=",\"id=\"query_name[length(query_name)]"
144 |       ;;
145 |     h )
146 |       usage
147 |       exit 1
148 |       ;;
149 |     v )
150 |       echo $VERSION
151 |       exit 1
152 |       ;;
153 |     \? )  # unknown option letter (silent mode stores the letter in OPTARG)
154 |       echo "Invalid Option: -$OPTARG" 1>&2
155 |       usage
156 |       exit 1
157 |       ;;
158 |     : )   # option given without its mandatory argument
159 |       echo "Option -$OPTARG requires an argument." >&2
160 |       exit 1
161 |       ;;
162 |     * )   # a valid option without an arm: the letter is in $opt, OPTARG only holds its argument
163 |       echo "Unimplemented option: -$opt" >&2;
164 |       exit 1
165 |       ;;
166 | 
167 |   esac
168 | done
169 | shift $((OPTIND-1))
170 | 
171 | 
172 | #================================================================
173 | # MAIN_BODY
174 | #================================================================
175 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
176 | 
177 | echo -e "\n#Executing" $0 "\n"
178 | 
179 | check_mandatory_files.sh $input_file
180 | 
181 | 
182 | blast_len_percentage_value=$(echo "($blast_len_percentage/100)" | bc -l)
183 | #blast_len_percentage_decimal=$(echo $blast_len_percentage_value | sed 's/0\{1,\}$//')
184 | 
185 | 
186 | if [ ! $output_dir ]; then
187 | 	output_dir=$(dirname $input_file)
188 | 	#echo "Default output directory is" $output_dir
189 | 	mkdir -p $output_dir
190 | else
191 | 	#echo "Output directory is" $output_dir
192 | 	mkdir -p $output_dir
193 | fi
194 | 
195 | 
196 | if [ ! $file_name ]; then
197 | 	file_name=$(basename $input_file | cut -d. -f1,2)
198 | fi
199 | 
200 | ##CHECK FIELDS TO RETRIEVE
201 | 
202 | if [ "$database_field" == "l" ] || [ "$database_field" == "r" ]; then
203 | 
204 | 	if [ $database_field == l ]; then
205 | 		database_field="1"
206 | 	else
207 | 		database_field="length(database_name)"
208 | 	fi
209 | 
210 | else
211 | 	echo "Please introduce r or l for database"
212 | 	exit 1
213 | fi
214 | 
215 | if [ $query_field == "l" ] || [ $query_field == "r" ]; then
216 | 
217 | 	if [ $query_field == l ]; then
218 | 		query_field="1"
219 | 	else
220 | 		query_field="length(query_name)"
221 | 	fi
222 | 
223 | else
224 | 
225 | 	echo "Please introduce 0 or 1 for query"
226 | 	exit 1
227 | fi
228 | 
229 | echo "$(date)"
230 | echo "Adapting blast to links using" $(basename $input_file) "with:"
231 | echo "Blast identity=" $blast_id_cutoff
232 | echo "Min len percentage=" $blast_len_percentage
233 | 
234 | ##Only blast entries whose aligned length ($4) is at least the requested fraction of $13 are taken into account;
235 | ##this awk writes one "$1$2" key per accepted query/database pair into a temporary dictionary file
236 | awk '
237 | 	(($4/$13) >= '"${blast_len_percentage_value}"') && !contigPlasmid[$1$2]++ \
238 | 	{print $1$2}
239 | 	' $input_file \
240 | 	> $output_dir/$file_name".dict_length_percentage" || error ${LINENO} $(basename $0) "Awk command in $file_name\".dict_length_percentage\" creation failed. See $output_dir/logs for more information."
241 | 
242 | 
243 | ##Obtain coordinates query --> ddbb for rows whose key is in the dictionary, with $3 >= identity cutoff and $4/$13 >= 0.03
244 | ##Prints: query name piece, $7, $8, database name piece, $9, $10 (plus the -I id field); OFS is not set here, so fields are space separated
245 | awk '
246 | 	NR==FNR{contigPlasmid[$1]=$1;next}
247 | 	{split($2, database_name, "'"${database_delimiter}"'")
248 | 	split($1, query_name, "'"${query_delimiter}"'")
249 | 	header=$1$2}
250 | 	{if ((header in contigPlasmid) && ($3>='"${blast_id_cutoff}"') && (($4/$13)>=0.03))
251 | 		print query_name['"$query_field"'], $7,$8,database_name['"$database_field"'],$9,$10'"$id_output"'}' \
252 | 	$output_dir/$file_name".dict_length_percentage" $input_file \
253 | 	> $output_dir/$file_name."blast.links" || error ${LINENO} $(basename $0) "Awk command in $file_name\".blast.links\" creation failed. See $output_dir/logs for more information"
254 | 
255 | ##Change coordinates from query --> ddbb to ddbb --> ddbb in order to represent them in circos:
256 | ##consecutive lines sharing column 1 are paired up; each pair is printed as "$4 $5 $6 <earlier $4 $5 $6> id=<column 1>"
257 | awk '
258 | 	BEGIN{OFS="\t"}
259 | 	{
260 | 	if($1 != savedNode)
261 | 		{savedNode= $1; delete chr}
262 | 	else{for(i in chr)
263 | 		{print $4" "$5" "$6" "chr[i]" id="savedNode}
264 | 	}
265 | 	chr[$4$5$6] = $4" "$5" "$6}' \
266 | 	$output_dir/$file_name."blast.links" \
267 | 	> $output_dir/$file_name."links" || error ${LINENO} $(basename $0) "Awk command in $file_name\".links\" creation failed. See $output_dir/logs for more information"
268 | 
269 | ##Remove the temporary dictionary; the intermediate .blast.links file is left in place
270 | rm $output_dir/$file_name".dict_length_percentage"
271 | 
272 | echo "$(date)"
273 | echo "DONE adapting blast to link"
274 | echo -e "File can be found at" $output_dir/$file_name".links" "\n"
275 | 


--------------------------------------------------------------------------------
/bin/bowtie_mapper.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | set -e
  4 | 
  5 | #=============================================================
  6 | # HEADER
  7 | #=============================================================
  8 | 
  9 | #INSTITUTION:ISCIII
 10 | #CENTRE:BU-ISCIII
 11 | #AUTHOR: Pedro J. Sola
 12 | VERSION=1.0
 13 | #CREATED: 15 March 2018
 14 | #REVISION:
 15 | #		19 March 2018: Complete usage info
 16 | #		19 March 2018: Check mandatory files. folders and variables
 17 | #DESCRIPTION:Script that index a database and map a supplied pair-end sequences
 18 | #TODO
 19 | #	-Handle files extensions for bowtie, now is fastq by default
 20 | #================================================================
 21 | # END_OF_HEADER
 22 | #================================================================
 23 | 
 24 | #SHORT USAGE RULES
 25 | #LONG USAGE FUNCTION
 26 | usage() {
 27 | 	cat << EOF
 28 | 
 29 | Bowtie_mapper script index a database and map a supplied pair-end sequences
 30 | 
 31 | usage : $0 [-i <inputfile>] [-o <directory>] <-d database(fasta)> <-s sample_name> <-1 R1> <-2 R2>
 32 | 		[-g group_name] [-f <int>] [-T <int>] [-a] [-v] [-h]
 33 | 
 34 | 	-i input directory (optional)
 35 | 	-o output directory (optional)
 36 | 	-d database to map (.fasta)
 37 | 	-s sample name
 38 | 	-g group name (optional). If unset, samples will be gathered in NO_GROUP group
 39 | 	-1 reads corresponding to paired-end R1
 40 | 	-2 reads corresponding to paired-end R2
 41 | 	-f offrate index for bowtie_build (optional). Default value 1. for quicker indexing use higher number
 42 | 	-a use -a mapping (off by default)
 43 | 	-T number of threads
 44 | 	-v version
 45 | 	-h display usage message
 46 | 
 47 | example: bowtie_mapper.sh -d database.fasta -s COLI -1 ecoli_1.fastq -2 ecoli_2.fastq -a
 48 | 
 49 | EOF
 50 | }
 51 | 
 52 | #================================================================
 53 | # OPTION_PROCESSING
 54 | #================================================================
 55 | #Make sure the script is executed with arguments
 56 | if [ $# = 0 ] ; then
 57 |  usage >&2
 58 |  exit 1
 59 | fi
 60 | 
 61 | # Error handling
 62 | error(){
 63 |   local parent_lineno="$1"
 64 |   local script="$2"
 65 |   local message="$3"
 66 |   local code="${4:-1}"
 67 | 
 68 | 	RED='\033[0;31m'
 69 | 	NC='\033[0m'
 70 | 
 71 |   if [[ -n "$message" ]] ; then
 72 |     echo -e "\n---------------------------------------\n"
 73 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 74 |     echo -e "MESSAGE:\n"
 75 |     echo -e "$message"
 76 |     echo -e "\n---------------------------------------\n"
 77 |   else
 78 |     echo -e "\n---------------------------------------\n"
 79 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 80 |     echo -e "\n---------------------------------------\n"
 81 |   fi
 82 | 
 83 |   exit "${code}"
 84 | }
 85 | 
 86 | #DECLARE FLAGS AND VARIABLES
 87 | threads=1
 88 | offrate=1
 89 | cwd="$(pwd)"
 90 | a_mapping=""
 91 | group="NO_GROUP"
 92 | database="Database"
 93 | R1="R1"
 94 | R2="R2"
 95 | 
 96 | #PARSE VARIABLE ARGUMENTS WITH getops
 97 | #common example with letters, for long options check longopts2getopts.sh
 98 | options=":i:o:s:g:d:1:2:f:T:avh"
 99 | while getopts $options opt; do
100 | 	case $opt in
101 | 		i )
102 | 			input_dir=$OPTARG
103 | 			;;
104 | 		o )
105 | 			output_dir=$OPTARG
106 | 			;;
107 | 		s )
108 | 			sample=$OPTARG
109 | 			;;
110 | 		g)
111 | 			group=$OPTARG
112 | 			;;
113 | 		d )
114 | 			database=$OPTARG
115 | 			;;
116 | 		1 )
117 | 			R1=$OPTARG
118 | 			;;
119 | 		2 )
120 | 			R2=$OPTARG
121 | 			;;
122 | 		f )
123 |           	offrate=$OPTARG
124 |       		;;
125 |         T )
126 | 			threads=$OPTARG
127 |             ;;
128 |         a)
129 | 			a_mapping="-a"
130 | 			;;
131 |         h )
132 | 		  	usage
133 | 		  	exit 1
134 | 		  	;;
135 | 		v )
136 | 		  	echo $VERSION
137 | 		  	exit 1
138 | 		  	;;
139 | 		\?)
140 | 			echo "Invalid Option: -$OPTARG" 1>&2
141 | 			usage
142 | 			exit 1
143 | 			;;
144 | 		: )
145 |       		echo "Option -$OPTARG requires an argument." >&2
146 |       		exit 1
147 |       		;;
148 |       	* )
149 | 			echo "Unimplemented option: -$OPTARG" >&2;
150 | 			exit 1
151 | 			;;
152 | 
153 | 	esac
154 | done
155 | shift $((OPTIND-1))
156 | 
157 | 
158 | #================================================================
159 | # MAIN_BODY
160 | #================================================================
161 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
162 | 
163 | echo -e "\n#Executing" $0 "\n"
164 | 
165 | check_dependencies.sh bowtie2-build bowtie2
166 | 
167 | check_mandatory_files.sh $database $R1 $R2
168 | 
169 | if [ ! $sample ]; then
170 | 	echo "ERROR: please, provide a sample name"
171 | 	usage
172 | 	exit 1
173 | fi
174 | 
175 | if [ ! $output_dir ]; then
176 | 	output_dir=$cwd"/$group/$sample/mapping/"
177 | 	echo "Default output directory is" $output_dir
178 | 	mkdir -p $output_dir
179 | else
180 | 	echo "Output directory is" $output_dir
181 | 	mkdir -p $output_dir
182 | fi
183 | 
184 | 
185 | ########INDEXING############
186 | ############################
187 | 
188 | files_bt2=$(ls $database*bt2 2> /dev/null | wc -l)
189 | 
190 | 
191 | if [ "$files_bt2" = "6" ];then \
192 | 	echo "Found an indexed ddbb for" $(basename $database);
193 | 	echo "Omitting indexing"
194 | else
195 | 	echo "Building index of " $(basename $database);
196 | 	bowtie2-build \
197 | 	--offrate $offrate \
198 | 	$database $database || error ${LINENO} $(basename $0) "Bowtie2-build command failed. See $output_dir/logs for more information"
199 | fi
200 | 
201 | ########MAPPING#############
202 | ############################
203 | # Skip mapping when a sorted, indexed BAM already exists (in mappedDir if the caller exported it, else in output_dir)
204 | if [ -f "${mappedDir:-$output_dir}/$sample.sorted.bam" -a -f "${mappedDir:-$output_dir}/$sample.sorted.bam.bai" ];then \
205 | 	echo "Found a mapping file for sample" $sample;
206 | 	echo "Omitting mapping"
207 | else
208 | 	echo "$(date)"
209 | 	echo mapping $R1
210 | 	echo mapping $R2
211 | 
212 | 	bowtie2 \
213 | 	-1 $R1 \
214 | 	-2 $R2 \
215 | 	-S $output_dir/$sample.sam \
216 | 	-q \
217 | 	--very-sensitive-local \
218 | 	$a_mapping \
219 | 	-p $threads \
220 | 	-x $database || error ${LINENO} $(basename $0) "Bowtie2 command failed. See $output_dir/logs for more information"
221 | 
222 | 
223 | 	echo "$(date)"
224 | 	echo -e "DONE Mapping $sample of $group Group" "\n"
225 | fi
226 | 
227 | 


--------------------------------------------------------------------------------
/bin/build_karyotype.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
  4 | #or a compound command returns a non-zero status: If errors are not handled by user
  5 | set -e
  6 | #set -x
  7 | 
  8 | #=============================================================
  9 | # HEADER
 10 | #=============================================================
 11 | 
 12 | #INSTITUTION:ISCIII
 13 | #CENTRE:BU-ISCIII
 14 | #AUTHOR: Pedro J. Sola
 15 | VERSION=1.0
 16 | #CREATED: 13 April 2018
 17 | #REVISION:
 18 | #DESCRIPTION:build_karyotype script that creates karyotype file for CIRCOS either for summary and individual image
 19 | 
 20 | #================================================================
 21 | # END_OF_HEADER
 22 | #================================================================
 23 | 
 24 | #SHORT USAGE RULES
 25 | #LONG USAGE FUNCTION
 26 | usage() {
 27 | 	cat << EOF
 28 | 
 29 | build_karyotype script that creates karyotype file for CIRCOS either for summary and individual image
 30 | 
 31 | usage : $0 <-i inputfile(coverage)> [-o <directory>] [-f <file_name>] [-g <group_name>] <-k int(0-100)> <-K int(0-100)> [-v] [-h]
 32 | 
 33 | 	-i input file
 34 | 	-o output directory (optional). By default the file is replaced in the same location
 35 | 	-f file name for identification
 36 | 	-g group name for identification
 37 | 	-R Reconstruct
 38 | 	-K percentage value to display plasmids covered >= in summary image
 39 | 	-k percentage value to display plasmids covered >= in individual image
 40 | 	-v version
 41 | 	-h display usage message
 42 | 
 43 | example: build_karyotype.sh -i ecoli.coverage -K 80 -k 50
 44 | 
 45 | EOF
 46 | }
 47 | 
 48 | #================================================================
 49 | # OPTION_PROCESSING
 50 | #================================================================
 51 | #Make sure the script is executed with arguments
 52 | if [ $# = 0 ] ; then
 53 |  usage >&2
 54 |  exit 1
 55 | fi
 56 | 
 57 | 
 58 | # Error handling
 59 | error(){
 60 |   local parent_lineno="$1"
 61 |   local script="$2"
 62 |   local message="$3"
 63 |   local code="${4:-1}"
 64 | 
 65 | 	RED='\033[0;31m'
 66 | 	NC='\033[0m'
 67 | 
 68 |   if [[ -n "$message" ]] ; then
 69 |     echo -e "\n---------------------------------------\n"
 70 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 71 |     echo -e "MESSAGE:\n"
 72 |     echo -e "$message"
 73 |     echo -e "\n---------------------------------------\n"
 74 |   else
 75 |     echo -e "\n---------------------------------------\n"
 76 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 77 |     echo -e "\n---------------------------------------\n"
 78 |   fi
 79 | 
 80 |   exit "${code}"
 81 | }
 82 | 
 83 | #DECLARE FLAGS AND VARIABLES
 84 | cwd="$(pwd)"
 85 | input_file="Input_file"
 86 | coverage_cutoff_input=100
 87 | reconstruct=false
 88 | 
 89 | #PARSE VARIABLE ARGUMENTS WITH getops
 90 | #common example with letters, for long options check longopts2getopts.sh
 91 | options=":i:o:f:g:K:k:Rvh"
 92 | while getopts $options opt; do
 93 | 	case $opt in
 94 | 		i )
 95 | 			input_file=$OPTARG
 96 | 			;;
 97 | 		o )
 98 | 			output_dir=$OPTARG
 99 | 			;;
100 | 		f ) file_name=$OPTARG
101 | 			;;
102 | 		g ) group_name=$OPTARG
103 | 			;;
104 | 		R ) reconstruct=true
105 | 			;;
106 | 		K )
107 | 			if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
108 | 				echo "please, provide a summary percentage between 0 and 100"
109 | 				usage
110 | 				exit 1
111 | 			else
112 | 				coverage_cutoff_summary_percentage=$OPTARG
113 | 			fi
114 | 			;;
115 | 		k )
116 | 			if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
117 | 				echo "please, provide an individual percentage between 0 and 100"
118 | 				usage
119 | 				exit 1
120 | 			else
121 | 				coverage_cutoff_individual_percentage=$OPTARG
122 | 			fi
123 | 			;;
124 | 		h )
125 | 		  	usage
126 | 		  	exit 1
127 | 		  	;;
128 | 		v )
129 | 		  	echo $VERSION
130 | 		  	exit 1
131 | 		  	;;
132 | 		\?)
133 | 			echo "Invalid Option: -$OPTARG" 1>&2
134 | 			usage
135 | 			exit 1
136 | 			;;
137 | 		: )
138 |       		echo "Option -$OPTARG requires an argument." >&2
139 |       		exit 1
140 |       		;;
141 |       	* )
142 | 			echo "Unimplemented option: -$OPTARG" >&2;
143 | 			exit 1
144 | 			;;
145 | 
146 | 	esac
147 | done
148 | shift $((OPTIND-1))
149 | 
150 | #================================================================
151 | # MAIN_BODY
152 | #================================================================
153 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
154 | 
155 | echo -e "\n#Executing" $0 "\n"
156 | 
157 | check_mandatory_files.sh $input_file
158 | 
159 | coverage_cutoff_summary=$(echo "(1 - ($coverage_cutoff_summary_percentage/100))" | bc -l)
160 | coverage_cutoff_individual=$(echo "(1 - ($coverage_cutoff_individual_percentage/100))" | bc -l)
161 | 
162 | 
163 | if [ ! $output_dir ]; then
164 | 	output_dir=$(dirname $input_file)
165 | 	#echo "Default output directory is" $output_dir
166 | 	mkdir -p $output_dir
167 | else
168 | 	#echo "Output directory is" $output_dir
169 | 	mkdir -p $output_dir
170 | fi
171 | 
172 | 
173 | if [ ! $file_name ]; then
174 | 	file_name=$(basename $input_file | cut -d. -f1)
175 | fi
176 | 
177 | echo "FILE NAME" $file_name
178 | 
179 | echo "$(date)"
180 | echo "Obtain list of cromosomes (idiogram) for CIRCOS karyotype file"
181 | echo "Generating summary karyotype file with plasmids that mapped more than" $coverage_cutoff_summary_percentage"%"
182 | if [ $reconstruct = true ];then
183 | 
184 | 	awk '{print "chr -", $1, $1, "0", $2, "id="$1}' $input_file \
185 | 	>$output_dir/$file_name".karyotype_summary.txt" || error ${LINENO} $(basename $0) "Awk command for karyotype summary in $file_name\".karyotype_summary.txt\" creation. See $output_dir/logs for more information"
186 | 
187 | 	awk '{print "chr -", $1, $1, "0", $2, "id="$1}' $input_file \
188 | 	>$output_dir/$file_name".karyotype_individual.txt" || error ${LINENO} $(basename $0) "Awk command for karyotype individual in $file_name\".karyotype_individual.txt\" creation. See $output_dir/logs for more information."
189 | 
190 | else
191 | 	awk '
192 | 		{if ($2 == 0 && $5 < '"${coverage_cutoff_summary}"')
193 | 			{print "chr -", $1, $1, "0", $4, "id="$1}
194 | 		}
195 | 		' $input_file \
196 | 		> $output_dir/$file_name".karyotype_summary.txt" || error ${LINENO} $(basename $0) "Awk command for karyotype summary in $file_name\".karyotype_summary.txt\" creation. See $output_dir/logs for more information."
197 | 
198 | 
199 | 	echo "Generating individual karyotype file with plasmids that mapped more than" $coverage_cutoff_individual_percentage"%"
200 | 
201 | 	awk '
202 | 		{if ($2 == 0 && $5 < '"${coverage_cutoff_individual}"')
203 | 			{print "chr -", $1, $1, "0", $4, "id="$1}
204 | 		}
205 | 		' $input_file \
206 | 		> $output_dir/$file_name".karyotype_individual.txt" || error ${LINENO} $(basename $0) "Awk command for karyotype individual in $file_name\".karyotype_individual.txt\" creation. See $output_dir/logs for more information"
207 | 
208 | fi
209 | 
210 | echo "$(date)"
211 | echo "Done Obtain list of cromosomes (idiogram) for CIRCOS karyotype file"
212 | echo "Files can be found at" $output_dir
213 | echo $(cat $output_dir/$file_name".karyotype_summary.txt" | wc -l) "sequences will be displayed on summary image"
214 | echo -e $(cat $output_dir/$file_name".karyotype_individual.txt" | wc -l) "images will be created individually" "\n"
215 | 


--------------------------------------------------------------------------------
/bin/calculate_seqlen.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
  4 | #or a compound command returns a non-zero status: If errors are not handled by user
  5 | set -e
  6 | 
  7 | #=============================================================
  8 | # HEADER
  9 | #=============================================================
 10 | 
 11 | #INSTITUTION:ISCIII
 12 | #CENTRE:BU-ISCIII
 13 | #AUTHOR: Pedro J. Sola
 14 | VERSION=1.0
 15 | #CREATED: 20 March 2018
 16 | #REVISION:
 17 | #DESCRIPTION:Script that calculates the length of each sequence in a supplied FASTA file
 18 | #AKNOWLEDGE:
 19 | #		-Adapted from klashxx: https://stackoverflow.com/questions/23992646/sequence-length-of-fasta-file/23992773
 20 | #================================================================
 21 | # END_OF_HEADER
 22 | #================================================================
 23 | 
 24 | #SHORT USAGE RULES
 25 | #LONG USAGE FUNCTION
 26 | usage() {
 27 | 	cat << EOF
 28 | 
 29 | Calculate_sequlen script calculates a supplied FASTA length
 30 | 
 31 | usage : $0 <-i inputfile(.fasta)> [-o <directory>] [-n <string>] [-r] [-v] [-h]
 32 | 
 33 | 	-i input file
 34 | 	-o output directory (optional). By default the file is replaced in the same location
 35 | 	-n file name (optional). By default is the same name with .length extension
 36 | 	-r remove ">" (greater-than) symbol from fasta header
 37 | 	-v version
 38 | 	-h display usage message
 39 | 
 40 | example: calculate_sequlen.sh -i ecoli.fasta
 41 | 
 42 | EOF
 43 | }
 44 | 
 45 | #================================================================
 46 | # OPTION_PROCESSING
 47 | #================================================================
 48 | #Make sure the script is executed with arguments
 49 | if [ $# = 0 ] ; then
 50 |  usage >&2
 51 |  exit 1
 52 | fi
 53 | 
 54 | # Error handling
 55 | error(){
 56 |   local parent_lineno="$1"
 57 |   local script="$2"
 58 |   local message="$3"
 59 |   local code="${4:-1}"
 60 | 
 61 | 	RED='\033[0;31m'
 62 | 	NC='\033[0m'
 63 | 
 64 |   if [[ -n "$message" ]] ; then
 65 |     echo -e "\n---------------------------------------\n"
 66 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 67 |     echo -e "MESSAGE:\n"
 68 |     echo -e "$message"
 69 |     echo -e "\n---------------------------------------\n"
 70 |   else
 71 |     echo -e "\n---------------------------------------\n"
 72 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 73 |     echo -e "\n---------------------------------------\n"
 74 |   fi
 75 | 
 76 |   exit "${code}"
 77 | }
 78 | 
 79 | #DECLARE FLAGS AND VARIABLES
 80 | remove_head=remove_head_false
 81 | cwd="$(pwd)"
 82 | file_name="file_name"
 83 | input_file="Input_file"
 84 | 
 85 | #PARSE VARIABLE ARGUMENTS WITH getops
 86 | #common example with letters, for long options check longopts2getopts.sh
 87 | options=":i:o:n:rvh"
 88 | while getopts $options opt; do
 89 | 	case $opt in
 90 | 		i )
 91 | 			input_file=$OPTARG
 92 | 			;;
 93 | 		o )
 94 | 			output_dir=$OPTARG
 95 | 			;;
 96 | 		n )
 97 | 			filename=$OPTARG
 98 | 			;;
 99 | 		r )
100 | 			remove_head="^>"
101 | 			;;
102 |         h )
103 | 		  	usage
104 | 		  	exit 1
105 | 		  	;;
106 | 		v )
107 | 		  	echo $VERSION
108 | 		  	exit 1
109 | 		  	;;
110 | 		\?)
111 | 			echo "Invalid Option: -$OPTARG" 1>&2
112 | 			usage
113 | 			exit 1
114 | 			;;
115 | 		: )
116 |       		echo "Option -$OPTARG requires an argument." >&2
117 |       		exit 1
118 |       		;;
119 |       	* )
120 | 			echo "Unimplemented option: -$OPTARG" >&2;
121 | 			exit 1
122 | 			;;
123 | 
124 | 	esac
125 | done
126 | shift $((OPTIND-1))
127 | 
128 | #================================================================
129 | # MAIN_BODY
130 | #================================================================
131 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
132 | 
133 | echo -e "\n#Executing" $0 "\n"
134 | 
135 | check_mandatory_files.sh $input_file
136 | 
137 | if [ ! $output_dir ]; then
138 | 	output_dir=$(dirname $input_file)
139 | 	#echo "Default output directory is" $output_dir
140 | 	mkdir -p $output_dir
141 | else
142 | 	#echo "Output directory is" $output_dir
143 | 	mkdir -p $output_dir
144 | fi
145 | 
146 | if [ ! $filename ]; then
147 | 	filename=$(basename $input_file | cut -d. -f1)
148 | fi
149 | # Fields are split on blanks (awk default), so $1 is the ">" header up to the first whitespace
150 | awk '
151 | BEGIN {FS=" "}
152 | /^>/ {if (seqlen)
153 | 	print seqlen;printf "%s\t", $1; seqlen=0; next
154 | 	}
155 | {seqlen+=length($0)}
156 | END {print seqlen}' $input_file | sed 's/'$remove_head'//g' \
157 | >$output_dir/$filename".length" || error ${LINENO} $(basename $0) "Awk command for bedtools seqlen calculation in $filename\".length\" creation. See $output_dir/logs for more information."
158 | 
159 | echo "$(date)"
160 | echo "Done seqlen calculation"
161 | echo "Files can be found at" $output_dir
162 | 


--------------------------------------------------------------------------------
/bin/cdhit_cluster.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | set -e
  4 | 
  5 | #=============================================================
  6 | # HEADER
  7 | #=============================================================
  8 | 
  9 | #INSTITUTION:ISCIII
 10 | #CENTRE:BU-ISCIII
 11 | #AUTHOR: Pedro J. Sola
 12 | VERSION=1.0
 13 | #CREATED: 6 April 2018
 14 | #REVISION:
 15 | #DESCRIPTION:Script that uses cd-hit/psi-cd-hit to clusterize a FASTA file
 16 | #
 17 | #DOCUMENTATION
 18 | #
 19 | #
 20 | #Compare floats in BASH
 21 | #
 22 | #if [ $(echo "$cluster_cutoff > 0.7"|bc -l) -eq 1 ]; then
 23 | #	echo "YES"
 24 | #else
 25 | #	echo "NO"
 26 | #fi
 27 | #
 28 | #-d length of description in .clstr file, default 20. if set to 0,
 29 | #	it takes the fasta defline and stops at first space
 30 | #-s length difference cutoff, default 0.0
 31 | #	if set to 0.9, the shorter sequences need to be
 32 | #	at least 90% length of the representative of the cluster
 33 | #-B 1 or 0, default 0, by default, sequences are stored in RAM
 34 | #	if set to 1, sequence are stored on hard drive
 35 | #	it is recommended to use -B 1 for huge databases
 36 | #-g 1 or 0, default 0
 37 | #	By cd-hit’s default algorithm, a sequence is clustered to the first
 38 | #	cluster that meet the threshold (fast mode). If set to 1, the program
 39 | #	will cluster it into the most similar cluster that meet the threshold
 40 | #	(accurate but slow mode)
 41 | #
 42 | #	PSI-CD-HIT
 43 | #-G (1/0) use global identity? default 1, sequence identity
 44 | #	calculated as total identical residues of local alignments
 45 | #	length of shorter sequence
 46 | #
 47 | #-n 5 for thresholds 0.7 ~ 1.0
 48 | #-n 4 for thresholds 0.6 ~ 0.7
 49 | #-n 3 for thresholds 0.5 ~ 0.6
 50 | #-n 2 for thresholds 0.4 ~ 0.5
 51 | 
 52 | #================================================================
 53 | # END_OF_HEADER
 54 | #================================================================
 55 | 
 56 | #SHORT USAGE RULES
 57 | #LONG USAGE FUNCTION
 58 | usage() {
 59 | 	cat << EOF
 60 | 
 61 | Cdhit_cluster script uses cd-hit/psi-cd-hit to clusterize a FASTA file
 62 | 
 63 | usage : $0 <-i inputfile(FASTA)> [-o <directory>] [-n <filename>] [-c <percentage>] [-H <cd-hit_command>]
 64 | 		[-T <threads>] [-g group_name] [-s <int>] [-M <int>][-C <(0|1)>] [-G <(0|1)>] [-b <blast_prog>] [p] [-v] [-h]
 65 | 
 66 | 	-i input file in FASTA format
 67 | 	-c percentage threshold to cluster, default 80
 68 | 	-H cd-hit command, default cd-hit-est
 69 | 	-M max available memory (Mbyte), default 400
 70 | 	-n file name
 71 | 	-s length difference cutoff, default 0.8
 72 | 	-g group name (optional). If unset, samples will be gathered in NO_GROUP group
 73 | 	-p runs psi-cd-hit instead of cd-hit
 74 | 	-C psi-cd-hit only: circular sequences, default 1. If set to 0 sequence is assumed lineal
 75 | 	-G psi-cd-hit only: gobal identity, -G 0 only takes the first local alignment for clustering
 76 | 	-b psi-cd-hit only: choose blast program, default blastn
 77 | 	-T number of threads
 78 | 	-v version
 79 | 	-h display usage message
 80 | 
 81 | 
 82 | Output directory is the same as input directory
 83 | 
 84 | example: cdhit_cluster -i ecoli.fasta -c 90 -M 50000 -T 0
 85 | 		cdhit_cluster -H cd-hit -i ecoli.fasta -c 90 -M 50000 -T 0
 86 | 
 87 | 
 88 | EOF
 89 | }
 90 | 
 91 | #================================================================
 92 | # OPTION_PROCESSING
 93 | #================================================================
 94 | #Make sure the script is executed with arguments
 95 | if [ $# = 0 ] ; then
 96 |  usage >&2
 97 |  exit 1
 98 | fi
 99 | 
100 | # Error handling
101 | error(){
102 |   local parent_lineno="$1"
103 |   local script="$2"
104 |   local message="$3"
105 |   local code="${4:-1}"
106 | 
107 | 	RED='\033[0;31m'
108 | 	NC='\033[0m'
109 | 
110 |   if [[ -n "$message" ]] ; then
111 |     echo -e "\n---------------------------------------\n"
112 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
113 |     echo -e "MESSAGE:\n"
114 |     echo -e "$message"
115 |     echo -e "\n---------------------------------------\n"
116 |   else
117 |     echo -e "\n---------------------------------------\n"
118 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
119 |     echo -e "\n---------------------------------------\n"
120 |   fi
121 | 
122 |   exit "${code}"
123 | }
124 | 
125 | #DECLARE FLAGS AND VARIABLES
126 | cwd="$(pwd)"
127 | group="NO_GROUP"
128 | input_file="Input_file"
129 | cluster_cutoff_input=80 #default percentage for -c (see usage); cluster_cutoff is derived from it as a fraction
130 | max_memory=400
131 | length_cutoff=0.8
132 | cd_hit_command=cd-hit-est
133 | is_circle=1
134 | global_psi_cd_hit=1
135 | psi_cd_hit_program=blastn
136 | word_size=0
137 | threads=0
138 | 
139 | #PARSE VARIABLE ARGUMENTS WITH getops
140 | #common example with letters, for long options check longopts2getopts.sh
141 | options=":i:o:c:M:n:s:g:C:G:b:T:H:pvh"
142 | while getopts $options opt; do
143 | 	case $opt in
144 | 		i )
145 | 			input_file=$OPTARG
146 | 			;;
147 | 
148 | 		c )
149 | 			cluster_cutoff_input=$OPTARG
150 | 			;;
151 | 		g)
152 | 			group=$OPTARG
153 | 			;;
154 | 		H)
155 | 			cd_hit_command=$OPTARG
156 | 			;;
157 | 		M )
158 | 			max_memory=$OPTARG
159 | 			;;
160 | 		n )
161 | 			file_name=$OPTARG
162 | 			;;
163 | 		s )
164 |           	length_cutoff=$OPTARG
165 |           	;;
166 |         p )
167 |           	cd_hit_command=psi-cd-hit.pl
168 |           	;;
169 |         C )
170 |           	is_circle=$OPTARG
171 |           	;;
172 |         G)
173 |           	global_psi_cd_hit=$OPTARG
174 |           	;;
175 |         T)
176 |           	threads=$OPTARG
177 |           	;;
178 |         b)
179 |           	psi_cd_hit_program=$OPTARG
180 |           	;;
181 |         h )
182 | 		  	usage
183 | 		  	exit 1
184 | 		  	;;
185 | 		v )
186 | 		  	echo $VERSION
187 | 		  	exit 1
188 | 		  	;;
189 | 		\?)
190 | 			echo "Invalid Option: -$OPTARG" 1>&2
191 | 			usage
192 | 			exit 1
193 | 			;;
194 | 		: )
195 |       		echo "Option -$OPTARG requires an argument." >&2
196 |       		exit 1
197 |       		;;
198 |       	* )
199 | 			echo "Unimplemented option: -$OPTARG" >&2;
200 | 			exit 1
201 | 			;;
202 | 
203 | 	esac
204 | done
205 | shift $((OPTIND-1))
206 | 
207 | #================================================================
208 | # MAIN_BODY
209 | #================================================================
210 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
211 | 
212 | echo -e "\n#Executing" $0 "\n"
213 | 
214 | 
215 | check_mandatory_files.sh $input_file
216 | 
217 | check_dependencies.sh cd-hit-est
218 |  #psi-cd-hit.pl
219 | 
220 | 
221 | 
222 | # Set word size (parameter -n for cd-hit) as the author recommends
223 | #according to clustering percentage
224 | 
225 | 
226 | cluster_cutoff=$(echo "$cluster_cutoff_input / 100" | bc -l | sed 's/0\{1,\}$//')
227 | #cluster_cutoff=${cluster_cutoff%.*} #Remove float value
228 | 
229 | # Pick cd-hit word size (-n) from the percentage given with -c; values outside 40-100 are rejected
230 | if [[ "$cluster_cutoff_input" -gt 70  &&  "$cluster_cutoff_input" -le 100 ]]; then
231 | 	word_size=5
232 | elif [[ "$cluster_cutoff_input" -gt 60  &&  "$cluster_cutoff_input" -le 70 ]]; then
233 | 	word_size=4
234 | elif [[ "$cluster_cutoff_input" -gt 50  &&  "$cluster_cutoff_input" -le 60 ]]; then
235 | 	word_size=3
236 | elif [[ "$cluster_cutoff_input" -ge 40  &&  "$cluster_cutoff_input" -le 50 ]]; then
237 | 	word_size=2
238 | else
239 | 	echo "please introduce a valid cluster percentage value between 40 and 100"
240 | 	exit 1
241 | fi
242 | 
243 | 
244 | 
245 | if [ ! $output_dir ]; then
246 | 	output_dir=$(dirname $input_file)
247 | 	echo "Default output directory is" $output_dir
248 | 	mkdir -p $output_dir
249 | else
250 | 	echo "Output directory is" $output_dir
251 | 	mkdir -p $output_dir
252 | fi
253 | 
254 | if [ ! $file_name ]; then
255 | 	file_name=$(basename $input_file)
256 | 	echo "filename is" $file_name
257 | fi
258 | 
259 | ##CD-HIT EXECUTION
260 | 
261 | echo "$(date)"
262 | echo "Clustering sequences with identity" $cluster_cutoff_input"% or higher"
263 | echo "Using" $cd_hit_command "with file" $input_file
264 | seq_number_prev_clstr=$(cat $input_file | grep ">" | wc -l)
265 | 
266 | cd $(dirname $input_file)
267 | 
268 | if [ -f $output_dir/$file_name""_""$cluster_cutoff_input ]; then \
269 | 	echo "Found a clustered file for sample" $file_name;
270 | 	echo "Omitting clustering process calculation"
271 | 	exit 1
272 | else
273 | 	if [ $cd_hit_command  == "psi-cd-hit.pl" ]; then
274 | 
275 | 		check_dependencies.sh psi-cd-hit.pl
276 | 		$cd_hit_command -i $(basename $input_file) -o $file_name""_""$cluster_cutoff_input -c $cluster_cutoff -G $global_psi_cd_hit -g 1 -prog $psi_cd_hit_program -circle $is_circle -core $threads || error ${LINENO} $(basename $0) "PSI-CD-HIT command failed. See $output_dir/logs for more information."
277 | 
278 | 	else
279 | 
280 | 		$cd_hit_command -i $(basename $input_file) -o $file_name""_""$cluster_cutoff_input -c $cluster_cutoff -n $word_size -d 0 -s $length_cutoff -B 1 -M $max_memory -T $threads|| error ${LINENO} $(basename $0) "CD-HIT command failed. See $output_dir/logs for more information"
281 | 
282 | 	fi
283 | fi
284 | 
285 | seq_number_post_clstr=$(cat $file_name""_""$cluster_cutoff_input | grep ">" | wc -l)
286 | 
287 | echo "$(date)"
288 | echo "DONE Clustering sequences with identity" $cluster_cutoff_input"% or higher"
289 | echo "fasta file can be found in" $output_dir/$file_name""_""$cluster_cutoff_input
290 | echo "Previous number of sequences=" $seq_number_prev_clstr
291 | echo -e "Number of sequences after clustering=" $seq_number_post_clstr "\n"
292 | cd $cwd
293 | 


--------------------------------------------------------------------------------
/bin/check_dependencies.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #=============================================================
 4 | # HEADER
 5 | #=============================================================
 6 | 
 7 | #INSTITUTION:ISCIII
 8 | #CENTRE:BU-ISCIII
 9 | #AUTHOR: Pedro J. Sola
10 | VERSION=1.0 
11 | #CREATED: 19 March 2018
12 | #REVISION: 12 July 2018: add formated output and colors
13 | #AKNOWLEDGE: Colored text: https://stackoverflow.com/questions/5947742/how-to-change-the-output-color-of-echo-in-linux
14 | #DESCRIPTION:Short function to evaluate if programs are on path
15 | 
16 | #================================================================
17 | # END_OF_HEADER
18 | #================================================================
19 | 
20 | #SHORT USAGE RULES
21 | #LONG USAGE FUNCTION
22 | usage() { # Print the help text to stdout
23 | 	cat << EOF
24 | 
25 | Check_dependencies Short function to evaluate if programs are on path
26 | 
27 | usage : $0 <program_name1> [program_name2] ...
28 | 
29 | example: lib/check_dependencies.sh foo bar
30 | 
31 | EOF
32 | }
33 | 
34 | if [ $# = 0 ] ; then
35 |  usage >&2
36 |  exit 1
37 | fi
38 | 
39 | #DECLARE FLAGS AND VARIABLES
40 | missing_dependencies=0
41 | 
42 | #SET COLORS
43 | 
44 | RED='\033[0;31m'
45 | GREEN='\033[0;32m'
46 | NC='\033[0m'
47 | 
48 | printf '\n%s\t%20s\n' "DEPENDENCY" "STATUS"
49 | printf '%s\t%20s\n'   "----------" "------"
50 | 
51 | for command in "$@"; do
52 | 	#dependency_version=$($command --version)
53 | 	length_command=$(echo $command | wc -m)
54 | 	distance_table=$((30 - $length_command))
55 | 	distance_expression=$(echo "%${distance_table}s")
56 | 	
57 | 	printf '%s' $command
58 | 	if ! [ -x "$(which $command 2> /dev/null)" ]; then
59 | 		
60 | 		
61 | 		printf $distance_expression
62 | 		printf "${RED}NOT INSTALLED${NC} \n"
63 | 		let missing_dependencies++
64 | 	else
65 | 		printf $distance_expression
66 | 		printf "${GREEN}INSTALLED${NC} \n"
67 | 	fi
68 | done
69 | 
70 | if [ $missing_dependencies -gt 0 ]; then 
71 | 	printf "${RED}ERROR${NC}: $missing_dependencies missing dependencies, aborting execution\n" >&2
72 | 	exit 1
73 | fi
74 | 
75 | 


--------------------------------------------------------------------------------
/bin/check_mandatory_files.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #=============================================================
 4 | # HEADER
 5 | #=============================================================
 6 | 
 7 | #INSTITUTION:ISCIII
 8 | #CENTRE:BU-ISCIII
 9 | #AUTHOR: Pedro J. Sola
10 | VERSION=1.0 
11 | #CREATED: 19 March 2018
12 | #REVISION:
13 | #DESCRIPTION:Short function to evaluate if files exist
14 | 
15 | #================================================================
16 | # END_OF_HEADER
17 | #================================================================
18 | 
19 | #SHORT USAGE RULES
20 | #LONG USAGE FUNCTION
21 | usage() {
22 | 	cat << EOF
23 | 
24 | Check_mandatory_files Short function to evaluate if files exist
25 | 
26 | usage : $0 <file1> [file2] ...
27 | 
28 | example: lib/check_mandatory_files.sh foo.txt bar.fasta
29 | 
30 | EOF
31 | }
32 | 
33 | #No arguments is a usage error: exit 1 so that a caller whose variable expanded to nothing does not read it as success
34 | if [ $# = 0 ] ; then
35 |  usage >&2
36 |  exit 1
37 | fi
38 | 
39 | missing_files=0	#number of arguments that are not regular files
40 | #Quoted expansions keep an empty name or a name with spaces as one test operand
41 | for file in "$@"; do
42 | 	if [ ! -f "$file" ]; then
43 | 		echo "$(basename -- "$file")" "not supplied, please, introduce a valid file" >&2
44 | 		missing_files=$((missing_files + 1))
45 | 	fi
46 | done
47 | 
48 | if [ "$missing_files" -gt 0 ]; then
49 | 	echo "ERROR: $missing_files missing files, aborting execution" >&2
50 | 	exit 1
51 | fi


--------------------------------------------------------------------------------
/bin/coordinate_adapter.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
  4 | #or a compound command returns a non-zero status: If errors are not handled by user
  5 | #set -e
  6 | #set -x
  7 | 
  8 | #=============================================================
  9 | # HEADER
 10 | #=============================================================
 11 | 
 12 | #INSTITUTION:ISCIII
 13 | #CENTRE:BU-ISCIII
 14 | #AUTHOR: Pedro J. Sola
 15 | VERSION=1.0
 16 | #CREATED: 17 May 2018
 17 | #REVISION:
 18 | #DESCRIPTION:coordinate_adapter script adapt coordinates obtained with a bed file to a reference sequences in a link file
 19 | #
 20 | #
 21 | #================================================================
 22 | # END_OF_HEADER
 23 | #================================================================
 24 | 
 25 | #SHORT USAGE RULES
 26 | #LONG USAGE FUNCTION
 27 | usage() {	#print the help text on stdout
 28 | 	cat << EOF
 29 | 
 30 | coordinate_adapter script adapt coordinates obtained with a bed file to a reference sequences in a link file
 31 | 
 32 | usage : $0 <-i inputfile(.bed)> <-l link_file> [-o <directory>] [-n <number>] [-f <file_name>] [-u] [-p] [-v] [-h]
 33 | 
 34 | 	-i input file in bed format
 35 | 	-l link file with coordinates relationship within bed file ddbb and link reference
 36 | 	-o output directory (optional). By default the file is placed in the same location as input
 37 | 	-n length to extend annotation, default 2000
 38 | 	-f file name
 39 | 	-u uniq mode. Remove duplicates
 40 | 	-p prokka mode. Remove suffix of prokka
 41 | 	-v version
 42 | 	-h display usage message
 43 | 
 44 | example: ./coordinate_adapter.sh -i genes.bed -l ecoli.links -n 10000
 45 | 
 46 | EOF
 47 | }
 48 | 
 49 | #================================================================
 50 | # OPTION_PROCESSING
 51 | #================================================================
 52 | #Make sure the script is executed with arguments
 53 | if [ $# = 0 ] ; then
 54 |  usage >&2
 55 |  exit 1
 56 | fi
 57 | 
 58 | # Error handling
 59 | error(){
 60 |   local parent_lineno="$1"
 61 |   local script="$2"
 62 |   local message="$3"
 63 |   local code="${4:-1}"
 64 | 
 65 | 	RED='\033[0;31m'
 66 | 	NC='\033[0m'
 67 | 
 68 |   if [[ -n "$message" ]] ; then
 69 |     echo -e "\n---------------------------------------\n"
 70 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 71 |     echo -e "MESSAGE:\n"
 72 |     echo -e "$message"
 73 |     echo -e "\n---------------------------------------\n"
 74 |   else
 75 |     echo -e "\n---------------------------------------\n"
 76 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 77 |     echo -e "\n---------------------------------------\n"
 78 |   fi
 79 | 
 80 |   exit "${code}"
 81 | }
 82 | 
 83 | #DECLARE FLAGS AND VARIABLES
 84 | cwd="$(pwd)"
 85 | input_file="Bed_file"
 86 | link_file="Link_file"
 87 | number_extension=2000
 88 | unique=false
 89 | prokka_mode=false
 90 | suffix=""
 91 | 
 92 | #PARSE VARIABLE ARGUMENTS WITH getopts
 93 | #"o:" has to be part of the option string: without it getopts reports the documented -o as an invalid option
 94 | options=":i:l:o:n:f:puvh"
 95 | while getopts $options opt; do
 96 | 	case $opt in
 97 | 		i )
 98 | 			input_file=$OPTARG
 99 | 			;;
100 | 		l )
101 | 			link_file=$OPTARG
102 | 			;;
103 | 		o )
104 | 			output_dir=$OPTARG
105 | 			;;
106 | 		n )
107 | 			number_extension=$OPTARG
108 | 			;;
109 | 		f )
110 | 			file_name=$OPTARG
111 | 			;;
112 | 		u )
113 | 			unique=true
114 | 			suffix=".unique.tmp"	#suffix of the intermediate file this mode post-processes
115 | 			;;
116 | 		p )
117 | 			prokka_mode=true
118 | 			suffix=".prokka.tmp"	#replaces the -u suffix when both flags are given
119 | 			;;
120 | 		h )
121 | 			usage
122 | 			exit 1
123 | 			;;
124 | 		v )
125 | 			echo $VERSION
126 | 			exit 1
127 | 			;;
128 | 		\?)
129 | 			echo "Invalid Option: -$OPTARG" 1>&2
130 | 			usage
131 | 			exit 1
132 | 			;;
133 | 		: )
134 | 			echo "Option -$OPTARG requires an argument." >&2
135 | 			exit 1
136 | 			;;
137 | 		* )
138 | 			echo "Unimplemented option: -$OPTARG" >&2;
139 | 			exit 1
140 | 			;;
141 | 
142 | 	esac
143 | done
144 | shift $((OPTIND-1))
145 | 
146 | 
147 | #================================================================
148 | # MAIN_BODY
149 | #================================================================
150 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
151 | 
152 | echo -e "\n#Executing" $0 "\n"
153 | 
154 | check_mandatory_files.sh $input_file $link_file
155 | 
156 | if [ ! $output_dir ]; then
157 | 	output_dir=$(dirname $input_file)
158 | 	#echo "Default output directory is" $output_dir
159 | 	mkdir -p $output_dir
160 | else
161 | 	#echo "Output directory is" $output_dir
162 | 	mkdir -p $output_dir
163 | fi
164 | 
165 | 
166 | if [ ! $file_name ]; then
167 | 	file_name=$(basename $input_file | cut -d. -f1,2)
168 | fi
169 | 
170 | 
171 | echo "$(date)"
172 | echo "adapting coordinates from" $input_file and $link_file
173 | echo "file name is:" $file_name
174 | 
175 | #Create a dictionary file with all possibilities: every bed line is paired with each link line that has the same column 1
176 | awk 'NR==FNR{a[NR]=$1;b[NR]=$0;next}{for(i = 1; i <= NR; ++i){if (a[i] == $1) print b[i],"\t", $0}}' \
177 | $input_file $link_file > $output_dir/$file_name".coordinates.tmp" || error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates.tmp\" creation. See $output_dir/logs for more information."
178 | #Next step: keep pairs whose $2 lies in [$6 - number_extension, $7] or whose $3 lies in [$6, $7 + number_extension]
179 | awk '(($2 >= $6 - '"${number_extension}"' && $2 <= $7) || ($3 >= $6 && $3 <= $7 + '"${number_extension}"')) {{isInverted=($10-$9); \
180 | genelength=($3-$2)};{if (isInverted < 0) {coordChr1=(($7-$3)+$10);} else {coordChr1=(($2-$6)+$9)}}; \
181 | coordChr2=(coordChr1+genelength); {print $8, coordChr1, coordChr2, $4}}' $output_dir/$file_name".coordinates.tmp" > $output_dir/$file_name".coordinates.negatives"|| error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates.negatives\" creation. See $output_dir/logs for more information."
182 | #Printed fields: $8, the shifted start, that start plus ($3-$2), and the name in $4
183 | #NOTE(review): the field numbers suggest a 4 column bed file followed by the link columns - confirm against the callers
184 | #resulting in a bed file with coordinates of plasmid but referring to contig annotation:
185 | #NZ_CP010574.1 34820 33528 arsB_1
186 | #NZ_CP008930.1 90527 89235 arsB_1
187 | #NZ_CP006927.1 44969 43677 arsB_1
188 | #NZ_CP010574.1 81021 82508 ltrA_1
189 | #NZ_CP008930.1 144220 145707 ltrA_1
190 | 
191 | 
192 | #Keep only the lines whose two coordinates are positive (duplicates are handled further down)
193 | 
194 | awk '($2 > 0) && ($3 > 0)' $output_dir/$file_name".coordinates.negatives" \
195 | > $output_dir/$file_name".coordinates"$suffix || error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates$suffix\" creation. See $output_dir/logs for more information."
196 | #When $suffix is empty the file written above is already the final .coordinates file
197 | #-u: keep the first line seen for every combination of columns 1 and 4
198 | if [ "$unique" == "true" ]; then
199 |     awk '
200 |         (!x[$1$4]++)
201 |     	' $output_dir/$file_name".coordinates"$suffix \
202 | 		> $output_dir/$file_name".coordinates" || error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates\" creation. See $output_dir/logs for more information."
203 | 
204 | 		rm $output_dir/$file_name".coordinates"$suffix
205 | fi
206 | 
207 | if [ "$prokka_mode" == "true" ]; then
208 | 	#-p: the same de-duplication on columns 1 and 4, written to an intermediate file
209 | 	awk '
210 | 		(!uniq[$1$4]++)
211 | 		' $output_dir/$file_name".coordinates"$suffix \
212 | 		> $output_dir/$file_name".coordinates.prokka.unique.tmp"|| error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates.prokka.unique.tmp\" creation. See $output_dir/logs for more information."
213 | 
214 | 	#Names in column 4 that do not match CDS are cut at the first "_"; the output fields are tab separated
215 | 	awk '
216 | 		BEGIN{OFS="\t"}{split($4, namelowbar, "_")} {$4=($4 !~ /CDS/) ? namelowbar[1] : $4}1
217 | 		' $output_dir/$file_name".coordinates.prokka.unique.tmp" \
218 | 		> $output_dir/$file_name".coordinates" || error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates\" creation. See $output_dir/logs for more information."
219 | 
220 | 	rm $output_dir/$file_name".coordinates.prokka.unique.tmp"
221 | 	rm $output_dir/$file_name".coordinates"$suffix
222 | 
223 | fi
224 | #The two intermediate files of the first steps are no longer needed
225 | rm $output_dir/$file_name".coordinates.tmp"
226 | rm $output_dir/$file_name".coordinates.negatives"
227 | 
228 | 
229 | echo "$(date)"
230 | echo -e "Coordinates adapted to file" $output_dir/$file_name".coordinates" "\n"
231 | 


--------------------------------------------------------------------------------
/bin/download_plasmid_database.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Standard library imports
  4 | import os
  5 | import sys
  6 | import logging
  7 | 
  8 | # Third party imports
  9 | import argparse
 10 | import datetime
 11 | import pandas as pd
 12 | import Bio
 13 | from Bio import Entrez
 14 | from Bio import SeqIO
 15 | 
 16 | logger = logging.getLogger()
 17 | 
 18 | """
 19 | =============================================================
 20 | HEADER
 21 | =============================================================
 22 | FUNCTION: Download up to date plasmid database from https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/plasmids.txt.
 23 |             Remove those sequences with terms not related to complete plasmid such: gene, protein, partial, putative or hypothetical
 24 | 
 25 | INSTITUTION:CNM-ISCIII
 26 | AUTHOR: Pedro J. Sola (pedroscampoy@gmail.com)
 27 | d^v^b
 28 | VERSION=0.1
 29 | CREATED: 26 February 2020
 30 | REVISION: 
 31 | 
 32 | TODO:
 33 |     add user defined terms
 34 |     filter by record size (len(record))
 35 | ================================================================
 36 | END_OF_HEADER
 37 | ================================================================
 38 | """
 39 | 
 40 | 
 41 | def check_create_dir(path):
 42 |     if os.path.exists(path):
 43 |         pass
 44 |     else:
 45 |         os.mkdir(path)
 46 | 
 47 | 
 48 | def main():
 49 | 
 50 |     def get_arguments():
 51 | 
 52 |         parser = argparse.ArgumentParser(
 53 |             prog='download_plasmid_database.py', description='Download up to date plasmid database from ncbi ftp')
 54 | 
 55 |         parser.add_argument('-o', '--output', type=str, required=True,
 56 |                             help='REQUIRED. Output directory to extract plasmid database')
 57 | 
 58 |         arguments = parser.parse_args()
 59 | 
 60 |         return arguments
 61 | 
 62 |     args = get_arguments()
 63 | 
 64 |     output_dir = os.path.abspath(args.output)
 65 | 
 66 |     check_create_dir(output_dir)
 67 | 
 68 |     # LOGGING
 69 |     # Create log file with date and time
 70 |     today = str(datetime.date.today())
 71 |     right_now_full = "".join(today.split("-"))
 72 | 
 73 |     log_filename = 'plasmidID_database' + "_" + right_now_full + ".log"
 74 |     log_full_path = os.path.join(output_dir, log_filename)
 75 | 
 76 |     logger = logging.getLogger()
 77 |     logger.setLevel(logging.DEBUG)
 78 | 
 79 |     formatter = logging.Formatter('%(asctime)s:%(message)s')
 80 | 
 81 |     file_handler = logging.FileHandler(log_full_path)
 82 |     file_handler.setLevel(logging.DEBUG)
 83 |     file_handler.setFormatter(formatter)
 84 | 
 85 |     stream_handler = logging.StreamHandler()
 86 |     stream_handler.setLevel(logging.INFO)
 87 |     # stream_handler.setFormatter(formatter)
 88 | 
 89 |     logger.addHandler(stream_handler)
 90 |     logger.addHandler(file_handler)
 91 | 
 92 |     #####################START PIPELINE################
 93 | 
 94 |     logger.debug(args)
 95 | 
 96 |     plasmid_text_file = today + "_plasmids.txt"
 97 |     plasmid_text_path = os.path.join(output_dir, plasmid_text_file)
 98 | 
 99 |     plasmid_fasta_file = today + "_plasmids.fasta"
100 |     plasmid_fasta_path = os.path.join(output_dir, plasmid_fasta_file)
101 | 
102 |     plasmid_failed_file = today + "failed_plasmids.txt"
103 |     plasmid_failed_path = os.path.join(output_dir, plasmid_failed_file)
104 | 
105 |     try:
106 |         df = pd.read_csv(
107 |             'https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/plasmids.txt', sep='\t')
108 |     except:
109 |         logger.info('there was a problem accessing the ftp')
110 |         sys.exit(1)
111 | 
112 |     df.to_csv(plasmid_text_path, sep='\t', index=False)
113 | 
114 |     plasmid_reference = df['RefSeq'][df.RefSeq !=
115 |                                      "-"].tolist() + df['INSDC'][df.RefSeq == "-"].tolist()
116 | 
117 |     # remove duplicates
118 |     plasmid_reference = set(plasmid_reference)
119 |     # Set terms to exclude
120 |     terms_to_exclude = ['gene ', 'protein',
121 |                         'partial', 'putative', 'hypothetical']
122 |     # Dictionary with erroneous accession numbers to determine the reason
123 |     erroneous = {}
124 | 
125 |     Entrez.email = "A.N.Other@example.com"
126 | 
127 |     total_sequences = len(plasmid_reference)
128 |     current_record = 1
129 |     logger.info("")
130 |     logger.info("Starting plasmid database download script: " +
131 |                 str(total_sequences) + " will be downloaded")
132 |     logger.info("This will take a while.\nCheck progress in " + log_full_path)
133 | 
134 |     with open(plasmid_fasta_path, 'w+') as output_handle:
135 |         for plasmid_accnumber in plasmid_reference:
136 |             try:
137 |                 handle = Entrez.efetch(
138 |                     db="nucleotide", id=plasmid_accnumber, rettype="fasta", retmode="text")
139 |                 record = SeqIO.read(handle, "fasta")
140 |                 terms_present = [
141 |                     x in record.description for x in terms_to_exclude]
142 |                 handle.close()
143 |                 if sum(terms_present) > 0:
144 |                     terms_true = [terms_to_exclude[i]
145 |                                   for i, x in enumerate(terms_present) if x == True]
146 |                     erroneous[record.id] = "Include terms: " + \
147 |                         ', '.join(terms_true) + " => " + record.description
148 |                     logger.debug(" %s/%s Invalid terms in record %s" %
149 |                                  (current_record, total_sequences, record.id))
150 |                 else:
151 |                     logger.debug(" %s/%s Downloading record %s" %
152 |                                  (current_record, total_sequences, record.id))
153 |                     SeqIO.write(record, output_handle, "fasta")
154 |             except:
155 |                 logger.debug(" %s/%s Failed to download %s" %
156 |                              (current_record, total_sequences, record.id))
157 |                 erroneous[record.id] = "failed to download"
158 |             current_record = current_record + 1
159 | 
160 |     if len(erroneous) > 0:
161 |         with open(plasmid_failed_path, 'w+') as ferror:
162 |             for acc, reason in erroneous.items():
163 |                 ferror.write(acc + ": " + reason + "\n")
164 | 
165 |     logger.info("ALL DONE\nFASTA file is available in: " + plasmid_fasta_path)
166 | 
167 | 
168 | if __name__ == '__main__':  # run the download only when executed as a script
169 |     try:
170 |         main()
171 |     except Exception as e:
172 |         logger.exception(e)  # module-level root logger: emits the message with its traceback
173 |         raise  # re-raise so the failure still reaches the interpreter
174 | 


--------------------------------------------------------------------------------
/bin/draw_circos_images.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
  4 | #or a compound command returns a non-zero status: If errors are not handled by user
  5 | set -e
  6 | #set -x
  7 | #=============================================================
  8 | # HEADER
  9 | #=============================================================
 10 | 
 11 | #INSTITUTION:ISCIII
 12 | #CENTRE:BU-ISCIII
 13 | #AUTHOR: Pedro J. Sola
 14 | VERSION=1.0
 15 | #CREATED: 01 May 2018
 16 | #REVISION:
 17 | #		11 July 2018: Apply good practices bash
 18 | #						Include independent files
 19 | #						Include several databases
 20 | #		13 July 2018: Include log file
 21 | #						manage directories
 22 | #DESCRIPTION:Script that creates and executes a circos config file for plasmidID
 23 | #
 24 | #
 25 | #
 26 | #================================================================
 27 | # END_OF_HEADER
 28 | #================================================================
 29 | 
 30 | #SHORT USAGE RULES
 31 | #LONG USAGE FUNCTION
 32 | usage() {	#print the help text on stdout
 33 | 	cat << EOF
 34 | 
 35 | draw_circos_images script that creates and executes a circos config file for plasmidID
 36 | 
 37 | usage : $0 <-i input_directory> <-d config_files_directory> <-C config_file> <-s sample> <-g group> <-o output_directory> [-l <log_file>] [-V] [-c] [-v] [-h]
 38 | 
 39 | 	-i input directory containing files to represent
 40 | 	-d directory containing config files
 41 | 	-C config file selected to draw
 42 | 	-s sample
 43 | 	-g group
 44 | 	-l log file
 45 | 	-o output directory to create config and pictures
 46 | 	-c clean: remove config files
 47 | 	-v version
 48 | 	-V verbose
 49 | 	-h display usage message
 50 | 
 51 | EOF
 52 | }
 53 | 
 54 | #================================================================
 55 | # OPTION_PROCESSING
 56 | #================================================================
 57 | #Make sure the script is executed with arguments
 58 | if [ $# = 0 ] ; then
 59 |  usage >&2
 60 |  exit 1
 61 | fi
 62 | 
 63 | # Error handling
 64 | error(){
 65 |   local parent_lineno="$1"
 66 |   local script="$2"
 67 |   local message="$3"
 68 |   local code="${4:-1}"
 69 | 
 70 | 	RED='\033[0;31m'
 71 | 	NC='\033[0m'
 72 | 
 73 |   if [[ -n "$message" ]] ; then
 74 |     echo -e "\n---------------------------------------\n"
 75 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 76 |     echo -e "MESSAGE:\n"
 77 |     echo -e "$message"
 78 |     echo -e "\n---------------------------------------\n"
 79 |   else
 80 |     echo -e "\n---------------------------------------\n"
 81 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 82 |     echo -e "\n---------------------------------------\n"
 83 |   fi
 84 | 
 85 |   exit "${code}"
 86 | }
 87 | 
 88 | #DECLARE FLAGS AND VARIABLES
 89 | 
 90 | cwd="$(pwd)"
 91 | clean=false
 92 | verbose=false
 93 | 
 94 | 
 95 | #PARSE VARIABLE ARGUMENTS WITH getops
 96 | #common example with letters, for long options check longopts2getopts.sh
 97 | options=":i:m:o:g:l:s:d:C:cVvh"
 98 | while getopts $options opt; do
 99 | 	case $opt in
100 | 		i )
101 | 			input_dir=$OPTARG
102 | 			;;
103 | 		o )
104 | 			output_dir=$OPTARG
105 | 			;;
106 | 		d )
107 | 			config_dir=$OPTARG
108 | 			;;
109 | 		C )
110 | 			config_file_individual=$OPTARG
111 | 			;;
112 | 		l )
113 | 			log_file=$OPTARG
114 | 			;;
115 | 		g )
116 | 			group=$OPTARG
117 | 			;;
118 | 		s )
119 | 			sample=$OPTARG
120 | 			;;
121 | 		c )
122 | 			clean=true
123 | 			;;
124 |         h )
125 | 		  	usage
126 | 		  	exit 1
127 | 		  	;;
128 | 		V )
129 | 		  	verbose=true
130 | 		  	;;
131 | 		v )
132 | 		  	echo $VERSION
133 | 		  	exit 1
134 | 		  	;;
135 | 		\?)
136 | 			echo "Invalid Option: -$OPTARG" 1>&2
137 | 			usage
138 | 			exit 1
139 | 			;;
140 | 		: )
141 |       		echo "Option -$OPTARG requires an argument." >&2
142 |       		exit 1
143 |       		;;
144 |       	* )
145 | 			echo "Unimplemented option: -$OPTARG" >&2;
146 | 			exit 1
147 | 			;;
148 | 
149 | 	esac
150 | done
151 | shift $((OPTIND-1))
152 | 
153 | #================================================================
154 | # MAIN_BODY
155 | #================================================================
156 | 
157 | imageDir=$input_dir"/data"
158 | 
159 | #-l is optional: without it the log output is discarded instead of breaking
160 | #the "rm" and the "&>>" redirections, which need a file name
161 | log_file=${log_file:-/dev/null}
162 | 
163 | #Start from an empty log (the /dev/null default is not a regular file, so it is left alone)
164 | if [ -f "$log_file" ]; then
165 | 	rm "$log_file"
166 | fi
167 | echo -e "\n#Executing" $0 "\n" &>> "$log_file"
168 | 
169 | #Annotation of the database: complete, forward and reverse
170 | cdsDdbb_file=$input_dir/database/$sample".gff.bed"
171 | cdsDdbb_file_forward=$input_dir/database/$sample".gff.forward.bed"
172 | cdsDdbb_file_reverse=$input_dir/database/$sample".gff.reverse.bed"
173 | 
174 | #Circos templates and destination of the generated files
175 | circos_conf_summary="$config_dir/circos_summary_1_3_3.conf"
176 | circos_conf_individual="$config_dir/$config_file_individual"
177 | circosDir="$output_dir"
178 | 
179 | #Per sample files expected in the data directory
180 | plasmidMapped=$imageDir/$sample".coverage_adapted_clustered_ac"
181 | karyotype_file_individual=$imageDir/$sample".karyotype_individual.txt"
182 | karyotype_file_summary=$imageDir/$sample".karyotype_summary.txt"
183 | annotation_text_file=$imageDir/pID_text_annotation.coordinates
184 | annotation_highlights_file=$imageDir/pID_highlights.conf
185 | coverage_file=$imageDir/$sample".bedgraph_term"
186 | cds_contig_file=$imageDir/$sample".gff.coordinates"
187 | cds_contig_file_forward=$imageDir/$sample".gff.forward.coordinates"
188 | cds_contig_file_reverse=$imageDir/$sample".gff.reverse.coordinates"
189 | contig_file=$imageDir/$sample".plasmids.bed"
190 | contig_file_complete=$imageDir/$sample".plasmids.complete"
191 | links_file=$imageDir/$sample".plasmids.links"
192 | 
193 | imageName=$sample"_summary.png"
194 | mkdir -p "$circosDir"
195 | 
196 | 
197 | echo "Creating individual config file for SAMPLE $sample using FILE $circos_conf_individual" &>> $log_file
198 | #Copy the individual template, replacing every placeholder with the matching path of this sample
199 | awk '{gsub("PLASMID_KARYOTYPE","'$karyotype_file_individual'"); \
200 | gsub("PLASMID_SPECIFIC_TEXT","'$annotation_text_file'"); \
201 | gsub("PID_ALL_HIGHLIGHTS","'$annotation_highlights_file'"); \
202 | gsub("PLASMID_COVERAGE_GRAPH","'$coverage_file'"); \
203 | gsub("PLASMID_CDS_CONTIG","'$cds_contig_file'"); \
204 | gsub("PLASMID_CDS_FORWARD","'$cds_contig_file_forward'"); \
205 | gsub("PLASMID_CDS_REVERSE","'$cds_contig_file_reverse'"); \
206 | gsub("PLASMID_CDS_DDBB","'$cdsDdbb_file'"); \
207 | gsub("CDS_DDBB_FORWARD","'$cdsDdbb_file_forward'"); \
208 | gsub("CDS_DDBB_REVERSE","'$cdsDdbb_file_reverse'"); \
209 | gsub("PLASMID_CONTIGS_COMPLETE","'$contig_file_complete'"); \
210 | gsub("PLASMID_CONTIGS","'$contig_file'"); \
211 | gsub("OUTPUTDIR","'$circosDir'"); \
212 | print $0}' $circos_conf_individual > $circosDir/$sample"_individual.circos.conf"
213 | #PLASMID_CONTIGS_COMPLETE is substituted before PLASMID_CONTIGS because the shorter name is a prefix of the longer one
214 | echo "DONE Creating config file for circos in SAMPLE $sample" &>> $log_file
215 | 
216 | echo "Executing circos in SAMPLE $sample" &>> $log_file
217 | 
218 | 
219 | 
220 | for i in $(cat $plasmidMapped)
221 | do
222 | 	echo "Creating image for plasmid $i in sample $sample" &>> $log_file
223 | 	awk '{gsub("SAMPLE_SHOWN","'$i'"); \
224 | 	gsub("IMAGENAME_SAMPLE_PLASMID","'$sample'_'$i'.png"); \
225 | 	print $0}' $circosDir/$sample"_individual.circos.conf" > $circosDir/$sample"_"$i"_individual.circos.conf"
226 | 	if [ $verbose = true ];then
227 | 		$(circos -conf $circosDir/$sample"_"$i"_individual.circos.conf" |& tee -a $log_file) || error ${LINENO} $(basename $0) "Circos command for individual image has failed. See $output_dir/logs for more information"
228 | 	else
229 | 		$(circos -conf $circosDir/$sample"_"$i"_individual.circos.conf" &>> $log_file) || error ${LINENO} $(basename $0) "Circos command for individual image has failed. See $output_dir/logs for more information"
230 | 	fi
231 | done
232 | 
233 | 
234 | #The summary image is drawn only when the summary karyotype file exists and is not empty
235 | if [ -s $karyotype_file_summary ]; then
236 | 
237 | 	echo "Creating summary image for in sample" $sample "from FILE" $circos_conf_summary &>> $log_file
238 | 
239 | 	awk '{gsub("PLASMID_KARYOTYPE","'$karyotype_file_summary'"); \
240 | 	gsub("PLASMID_SPECIFIC_TEXT","'$annotation_text_file'"); \
241 | 	gsub("PID_ALL_HIGHLIGHTS","'$annotation_highlights_file'"); \
242 | 	gsub("PLASMID_COVERAGE_GRAPH","'$coverage_file'"); \
243 | 	gsub("PLASMID_CDS_CONTIG","'$cds_contig_file'"); \
244 | 	gsub("PLASMID_CDS_FORWARD","'$cds_contig_file_forward'"); \
245 | 	gsub("PLASMID_CDS_REVERSE","'$cds_contig_file_reverse'"); \
246 | 	gsub("PLASMID_CDS_DDBB","'$cdsDdbb_file'"); \
247 | 	gsub("PLASMID_CONTIGS","'$contig_file'"); \
248 | 	gsub("PLASMID_LINKS","'$links_file'"); \
249 | 	gsub("OUTPUTDIR","'$circosDir'"); \
250 | 	gsub("IMAGENAME","'$imageName'"); \
251 | 	print $0}' $circos_conf_summary > $circosDir/$sample"_summary.circos.conf"
252 | 
253 | 	if [ $verbose = true ]; then
254 | 		#"|| exit 1" after the pipeline only tested tee; PIPESTATUS[0] holds the status of circos
255 | 		circos -conf $circosDir/$sample"_summary.circos.conf" |& tee -a $log_file
256 | 		[ "${PIPESTATUS[0]}" -eq 0 ] || exit 1
257 | 	else
258 | 		circos -conf $circosDir/$sample"_summary.circos.conf" &>> $log_file || exit 1
259 | 	fi
260 | else
261 | 	echo "No plasmid matched requirements to draw the summary image"
262 | fi
263 | 
264 | 
265 | #Remove config files when -c was given
266 | #"clean" has to be expanded: the bare word never equals "true", so nothing was ever removed
267 | if [ "$clean" = true ];then
268 | 	for i in $(cat $plasmidMapped)
269 | 	do
270 | 		rm -f $circosDir/$sample"_"$i"_individual.circos.conf"
271 | 	done
272 | 
273 | 	#-f: the summary config is absent when no summary image was drawn, and a failing rm would abort under set -e
274 | 	rm -f $circosDir/$sample"_summary.circos.conf"
275 | 	rm -f $circosDir/$sample"_individual.circos.conf"
276 | fi
277 | 
278 | echo "DONE, files can be found at $circosDir"
279 | 


--------------------------------------------------------------------------------
/bin/filter_fasta.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
  4 | #or a compound command returns a non-zero status: If errors are not handled by user
  5 | set -e
  6 | #set -x
  7 | #=============================================================
  8 | # HEADER
  9 | #=============================================================
 10 | 
 11 | #INSTITUTION:ISCIII
 12 | #CENTRE:BU-ISCIII
 13 | #AUTHOR: Pedro J. Sola
 14 | VERSION=1.0
 15 | #CREATED: 21 March 2018
 16 | #REVISION:
 17 | #		22 March 2018: Handle output directory by default the same as -f file
 18 | #		13 April 2018: Include -G option to filter any file by term with both file or term
 19 | #DESCRIPTION:Script that extract sequences by term, either by key or file with a list
 20 | #ACKNOWLEDGEMENT:
 21 | #		-Multiple arguments in one flag: https://stackoverflow.com/questions/7529856/retrieving-multiple-arguments-for-a-single-option-using-getopts-in-bash
 22 | #TODO:
 23 | #		-Add and remove sequences in the same execution
 24 | #================================================================
 25 | # END_OF_HEADER
 26 | #================================================================
 27 | 
 28 | #SHORT USAGE RULES
 29 | #LONG USAGE FUNCTION
 30 | usage() {	#print the help text on stdout
 31 | 	cat << USAGE_TEXT
 32 | 
 33 | Filter_fasta script that extract sequences by term, either by key or file with a list
 34 | 
 35 | usage : $0 <-i file.fasta> <(-l term1 -l term2 -l term3 | -f file)> [-n <filename>] [-o <directory>] [-G] [-N] [-v] [-h]
 36 | 
 37 | 	-i fasta file to filter
 38 | 	-o output directory (optional). By default the file is replaced in the same location
 39 | 	-n file name (optional). By default is the same as -f file with .fasta extension
 40 | 	-l list of key terms separated by space
 41 | 	-N Use term to discard sequences with terms (Negative filter)
 42 | 	-G General filter: filter any file with a list of keys
 43 | 	-f file with a list of terms to filter
 44 | 	-v version
 45 | 	-h display usage message
 46 | 
 47 | example: filter_fasta.sh -i ecoli.fasta -l NC00012 -l WC52247 -l hypothetical -l partial -n NAME
 48 | 		filter_fasta.sh -i ecoli.fasta -l "NC00012 WC52247 hypothetical partial"
 49 | 		filter_fasta.sh -i ecoli.fasta -f list_with_terms.txt
 50 | 
 51 | USAGE_TEXT
 52 | }
 53 | 
 54 | #================================================================
 55 | # OPTION_PROCESSING
 56 | #================================================================
 57 | #Make sure the script is executed with arguments
 58 | if [ $# = 0 ] ; then
 59 |  usage >&2
 60 |  exit 1
 61 | fi
 62 | 
 63 | # Error handling
 64 | error(){
 65 |   local parent_lineno="$1"
 66 |   local script="$2"
 67 |   local message="$3"
 68 |   local code="${4:-1}"
 69 | 
 70 | 	RED='\033[0;31m'
 71 | 	NC='\033[0m'
 72 | 
 73 |   if [[ -n "$message" ]] ; then
 74 |     echo -e "\n---------------------------------------\n"
 75 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 76 |     echo -e "MESSAGE:\n"
 77 |     echo -e "$message"
 78 |     echo -e "\n---------------------------------------\n"
 79 |   else
 80 |     echo -e "\n---------------------------------------\n"
 81 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 82 |     echo -e "\n---------------------------------------\n"
 83 |   fi
 84 | 
 85 |   exit "${code}"
 86 | }
 87 | 
 88 | #DECLARE FLAGS AND VARIABLES
 89 | term_option=false
 90 | file_option=false
 91 | general_filter=false
 92 | negative_filter=""
 93 | cwd="$(pwd)"
 94 | input_file="Input_file"
 95 | 
 96 | #PARSE VARIABLE ARGUMENTS WITH getopts
 97 | #common example with letters, for long options check longopts2getopts.sh
 98 | options=":i:o:n:l:f:GNvh"
 99 | while getopts $options opt; do
100 | 	case $opt in
101 | 		i )
102 | 			input_file=$OPTARG
103 | 			;;
104 | 		o )
105 | 			output_dir=$OPTARG
106 | 			;;
107 | 		n )
108 | 			file_name=$OPTARG
109 | 			;;
110 | 		N )
111 | 			negative_filter="!"
112 | 			;;
113 | 		G )
114 | 			general_filter=true
115 | 			;;
116 | 		l )
117 | 			terms_for_filtering+=($OPTARG)	#unquoted on purpose: one -l may carry several space separated terms
118 | 			term_option=true
119 | 			;;
120 | 		f )
121 | 			file_for_filtering=$OPTARG
122 | 			#Validate the terms file itself; checking $input_file here made the result depend on -i preceding -f
123 | 			check_mandatory_files.sh "$file_for_filtering"
124 | 			file_option=true
125 | 			;;
126 | 		h )
127 | 			usage
128 | 			exit 1
129 | 			;;
130 | 		v )
131 | 			echo $VERSION
132 | 			exit 1
133 | 			;;
134 | 		\?)
135 | 			echo "Invalid Option: -$OPTARG" 1>&2
136 | 			usage
137 | 			exit 1
138 | 			;;
139 | 		: )
140 | 			echo "Option -$OPTARG requires an argument." >&2
141 | 			exit 1
142 | 			;;
143 | 		* )
144 | 			echo "Unimplemented option: -$OPTARG" >&2;
145 | 			exit 1
146 | 			;;
147 | 	esac
148 | done
149 | shift $((OPTIND-1))
150 | 
151 | #================================================================
152 | # MAIN_BODY
153 | #================================================================
154 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
155 | 
156 | echo -e "\n#Executing" $0 "\n"
157 | 
158 | check_mandatory_files.sh "$input_file"
159 | 
160 | #With -G the output is always written next to the input file and named after it
161 | if [ $general_filter = true ]; then
162 | 	file_name=$(basename "$input_file")
163 | 	output_dir=$(dirname "$input_file")
164 | fi
165 | 
166 | #MANAGE OUTPUT DIRECTORY
167 | if [ $file_option = true ] && [ -z "$output_dir" ]; then
168 | 	output_dir=$(dirname "$file_for_filtering")
169 | 	echo "Output directory is" $output_dir
170 | 	mkdir -p "$output_dir"
171 | elif [ $file_option = false ] && [ -z "$output_dir" ]; then
172 | 	echo "please, provide an output directory" $output_dir
173 | 	exit 1
174 | else
175 | 	echo "Output directory is=" $output_dir
176 | 	mkdir -p "$output_dir"
177 | fi
178 | 
179 | #MANAGE FILE NAME
180 | if [ $file_option = true ] && [ -z "$file_name" ]; then
181 | 	file_name=$(basename "$file_for_filtering")
182 | elif [ $file_option = false ] && [ -z "$file_name" ]; then
183 | 	file_name=$terms_for_filtering #First term supplied by -l
184 | else
185 | 	echo "File name is=" $file_name
186 | fi
187 | 
188 | #PROCESS REGULAR EXPRESSION TERMS
189 | #Guard: without -l or -f there is nothing to filter by, and the code below
190 | #would end up running `cat` without a file name (blocking on stdin).
191 | if [ $term_option = false ] && [ $file_option = false ]; then
192 | 	echo "ERROR: please, provide terms with -l and/or a file of terms with -f" >&2
193 | 	exit 1
194 | fi
195 | 
196 | list_terms_regexp_term=""
197 | list_terms_regexp_file=""
198 | 
199 | if [ $term_option = true ]; then
200 | 	#Unquoted expansion: every whitespace-separated term becomes one alternative
201 | 	list_terms_regexp_term=$(printf "%s|" ${terms_for_filtering[@]} | sed 's/|$//g')
202 | fi
203 | 
204 | if [ $file_option = true ]; then
205 | 	check_mandatory_files.sh "$file_for_filtering"
206 | 	if [ ! -s "$file_for_filtering" ]; then
207 | 		echo -e "ERROR: terms file empty!!"
208 | 		exit 1
209 | 	fi
210 | 	list_terms_regexp_file=$(printf "%s|" $(cat "$file_for_filtering") | sed 's/|$//g')
211 | fi
212 | 
213 | #Concatenate both alternations; the "|" separator is added only when both are non-empty
214 | final_list_terms_regexp="${list_terms_regexp_term}${list_terms_regexp_term:+${list_terms_regexp_file:+|}}${list_terms_regexp_file}"
215 | 
216 | #AWK SCRIPT THAT FILTER SEQUENCES#
217 | ##################################
218 | 
219 | if [ $general_filter = true ]; then
220 | 	#Line-oriented filter: keep every input line that matches any of the terms
221 | 	echo "$(date)"
222 | 	echo "General filtering terms on file" $(basename $input_file)
223 | 
224 | 	awk '
225 | 		/'"${final_list_terms_regexp}"'/ {print $0}
226 | 		' $input_file \
227 | 		> $output_dir/$file_name"_term" || error ${LINENO} $(basename $0) "Awk command for fasta filtering in $file_name\"_term\" creation. See $output_dir/logs for more information."
228 | 
229 | 	echo "$(date)"
230 | 	echo "Done general filtering terms on file" $(basename $input_file)
231 | 	echo "File with filtered lines can be found in" $output_dir/$file_name"_term"
232 | 
233 | else
234 | 	echo "$(date)"
235 | 	echo "Filtering terms on file" $(basename $input_file)
236 | 	seq_number_prev=$(grep ">" $input_file | wc -l)
237 | 	#RS=">" turns the text before the first ">" into record 1 (normally empty): skip it, otherwise -N prints a bare ">" line
238 | 	awk '
239 | 		BEGIN {RS=">"}
240 | 		NR > 1 && '"${negative_filter}"'/'"${final_list_terms_regexp}"'/ {print ">"$0}
241 | 		' $input_file \
242 | 		> $output_dir/$file_name"_term.fasta" || error ${LINENO} $(basename $0) "Awk command for fasta filtering in $file_name\"_term.fasta\" creation. See $output_dir/logs for more information."
243 | 
244 | 	echo "$(date)"
245 | 	echo "DONE Filtering terms on file" $(basename $input_file)
246 | 	seq_number_post=$(grep ">" $output_dir/$file_name"_term.fasta" | wc -l)
247 | 	echo "File with filtered sequences can be found in" $output_dir/$file_name"_term.fasta"
248 | 
249 | 	echo "Previous number of sequences=" $seq_number_prev
250 | 	echo "Post number of sequences=" $seq_number_post
251 | 	echo -e "\n"
252 | fi
253 | 


--------------------------------------------------------------------------------
/bin/get_coverage.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | set -e
  4 | 
  5 | #=============================================================
  6 | # HEADER
  7 | #=============================================================
  8 | 
  9 | #INSTITUTION:ISCIII
 10 | #CENTRE:BU-ISCIII
 11 | #AUTHOR: Pedro J. Sola
 12 | VERSION=1.0
 13 | #CREATED: 20 March 2018
 14 | #REVISION:
 15 | #DESCRIPTION:Script that uses bedtools to obtain coverage data from a BAM file
 16 | #The default output format is as follows:
 17 | #
 18 | #chromosome (or entire genome)
 19 | #depth of coverage from features in input file
 20 | #number of bases on chromosome (or genome) with depth equal to column 2.
 21 | #size of chromosome (or entire genome) in base pairs
 22 | #fraction of bases on chromosome (or entire genome) with depth equal to column 2.
 23 | #
 24 | #chr1   0  980  1000  0.98
 25 | #chr1   1  20   1000  0.02
 26 | #chr2   1  500  500   1
 27 | #genome 0  980  1500  0.653333
 28 | #genome 1  520  1500  0.346667
 29 | #
 30 | #-p option is equivalent to -bga BEDGRAPH output
 31 | #
 32 | #chr1  0       554304  0
 33 | #chr1  554304  554309  5
 34 | #chr1  554309  554313  6
 35 | #chr1  554313  554314  1
 36 | #chr1  554314  554315  0
 37 | #chr1  554315  554316  6
 38 | #chr1  554316  554317  5
 39 | #chr1  554317  554318  1
 40 | #chr1  554318  554319  2
 41 | #chr1  554319  554321  6
 42 | #================================================================
 43 | # END_OF_HEADER
 44 | #================================================================
 45 | 
 46 | #USAGE
 47 | #Prints the help text (purpose, synopsis, options and examples) on stdout
 48 | usage() {
 49 | 	cat << EOF
 50 | 
 51 | Get_coverage script uses bedtools to obtain coverage data from a BAM file
 52 | 
 53 | usage : $0 <-i inputfile(sorted.bam)> [-o <directory>] [-d <database(fasta)>] [-s sample_name]
 54 | 		 [-g group_name] [-m <int>] [-p] [-v] [-h]
 55 | 
 56 | 	-i input file in sorted BAM format
 57 | 	-o output directory (optional)
 58 | 	-d database to extract length. Fasta file used to map against
 59 | 	-m max depth reported (default 500)
 60 | 	-p reports genome coverage for all positions in BEDGRAPH format including 0 positions.
 61 | 		Default option is bedtools genomecov that needs the reference genome
 62 | 	-s sample name
 63 | 	-g group name (optional). If unset, samples will be gathered in NO_GROUP group
 64 | 	-v version
 65 | 	-h display usage message
 66 | 
 67 | example: get_coverage.sh -i ecoli.bam -d database.fasta
 68 | 		 get_coverage.sh -i ecoli.bam -p -m 100
 69 | 
 70 | EOF
 71 | }
 72 | 
 73 | #================================================================
 74 | # OPTION_PROCESSING
 75 | #================================================================
 76 | #No arguments at all: print the help on stderr and stop
 77 | if (( $# == 0 )); then
 78 | 	usage >&2
 79 | 	exit 1
 80 | fi
 81 | 
 82 | # Error reporting helper
 83 | error(){
 84 | 	#$1 caller line number, $2 script name,
 85 | 	#$3 optional message, $4 optional exit status (1 when omitted)
 86 | 	local parent_lineno="$1" script="$2" message="$3"
 87 | 	local code="${4:-1}"
 88 | 	local rule="\n---------------------------------------\n"
 89 | 
 90 | 	#The colour escape codes are assigned as globals, as before
 91 | 	RED='\033[0;31m'
 92 | 	NC='\033[0m'
 93 | 
 94 | 	#Banner layout: rule, headline, optional MESSAGE section, rule
 95 | 	echo -e "$rule"
 96 | 	echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 97 | 	if [[ -n "$message" ]] ; then
 98 | 		echo -e "MESSAGE:\n"
 99 | 		echo -e "$message"
100 | 	fi
101 | 	echo -e "$rule"
102 | 
103 | 	#Leave with the requested status
104 | 	exit "${code}"
105 | }
106 | 
107 | #DECLARE FLAGS AND VARIABLES
108 | cwd="$(pwd)"
109 | group="NO_GROUP"
110 | input_file="Input_file"
111 | database="Database"
112 | positional=false
113 | max_coverage=500
114 | 
115 | #PARSE VARIABLE ARGUMENTS WITH getopts (silent mode: see the "\?" and ":" branches)
116 | options=":i:o:d:s:g:m:n:pvh"
117 | while getopts $options opt; do
118 | 	case $opt in
119 | 		i )
120 | 			input_file=$OPTARG
121 | 			;;
122 | 		o )
123 | 			output_dir=$OPTARG
124 | 			;;
125 | 		n )
126 | 			filename=$OPTARG #"n:" was declared above but had no branch, so -n ended in "Unimplemented option"
127 | 			;;
128 | 		s )
129 | 			sample=$OPTARG
130 | 			;;
131 | 		g)
132 | 			group=$OPTARG
133 | 			;;
134 | 		d )
135 | 			database=$OPTARG
136 | 			;;
137 | 		m )
138 | 			max_coverage=$OPTARG
139 | 			;;
140 | 		p )
141 | 			positional=true
142 | 			;;
143 | 		h )
144 | 			usage; exit 1
145 | 			;;
146 | 		v )
147 | 			echo $VERSION
148 | 			exit 1
149 | 			;;
150 | 		\?)
151 | 			echo "Invalid Option: -$OPTARG" 1>&2
152 | 			usage
153 | 			exit 1
154 | 			;;
155 | 		: )
156 | 			echo "Option -$OPTARG requires an argument." >&2
157 | 			exit 1
158 | 			;;
159 | 		* )
160 | 			echo "Unimplemented option: -$OPTARG" >&2;
161 | 			exit 1
162 | 			;;
163 | 	esac
164 | done
165 | shift $((OPTIND-1))
166 | 
167 | #================================================================
168 | # MAIN_BODY
169 | #================================================================
170 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
171 | 
172 | echo -e "\n#Executing" $0 "\n"
173 | 
174 | check_mandatory_files.sh $input_file
175 | 
176 | check_dependencies.sh bedtools
177 | 
178 | if [ -z "$output_dir" ]; then
179 | 	output_dir=$(dirname $input_file)
180 | 	echo "Default output directory is" $output_dir
181 | else
182 | 	echo "Output directory is" $output_dir
183 | fi
184 | mkdir -p $output_dir
185 | 
186 | if [ -z "$filename" ]; then
187 | 	filename=$(basename $input_file | cut -d. -f1)
188 | fi
189 | 
190 | bedgraph_file=$output_dir/$filename".bedgraph"
191 | coverage_file=$output_dir/$filename".coverage"
192 | 
193 | if [ $positional = true ]; then
194 | 	#The step writes $bedgraph_file, so that is the file whose presence allows skipping it.
195 | 	#The former test on $imageDir/$sample.plasmid.bedgraph is only honoured when imageDir is actually set.
196 | 	if [ -f "$bedgraph_file" ] || { [ -n "$imageDir" ] && [ -f $imageDir/$sample".plasmid.bedgraph" ]; }; then
197 | 		echo "Found a bedgraph file for sample" $sample;
198 | 		echo "Omitting bedgraph step"
199 | 	else
200 | 		echo "$(date)"
201 | 		echo "Obtaining coverage coordinates from sequences"
202 | 		#On failure the partial output is removed so that a later run does not mistake it for a finished file
203 | 		bedtools genomecov -ibam $input_file -bga -max $max_coverage > "$bedgraph_file" || { rm -f "$bedgraph_file"; error ${LINENO} $(basename $0) "Bedtools genomecov command failed. See $output_dir/logs for more information."; }
204 | 
205 | 		echo "$(date)"
206 | 		echo "DONE obtaining coverage coordinates from sequences"
207 | 	fi
208 | else
209 | 	check_mandatory_files.sh $database
210 | 
211 | 	if [ -f $database".length" ]; then
212 | 		echo "Found length file for" $(basename $database)
213 | 		echo "Omitting length calculation"
214 | 	else
215 | 		echo "$(date)"
216 | 		echo "Creating a length file for" $(basename $database)
217 | 		calculate_seqlen.sh -r -i $database > $database".length" || { rm -f $database".length"; error ${LINENO} $(basename $0) "calculate_seqlen script failed. See $output_dir/logs for more information."; }
218 | 	fi
219 | 
220 | 	if [ -f "$coverage_file" ]; then
221 | 		echo "Found a coverage file for sample" $sample;
222 | 		echo "Omitting coverage calculation"
223 | 	else
224 | 		echo "$(date)"
225 | 		echo "Calculating coverage for every position that mapped $filename"
226 | 
227 | 		bedtools genomecov -ibam $input_file -g $database".length" > "$coverage_file" || { rm -f "$coverage_file"; error ${LINENO} $(basename $0) "Bedtools genomecov command failed. See $output_dir/logs for more information."; }
228 | 
229 | 		echo "$(date)"
230 | 		echo "DONE Calculating coverage for every plamid that mapped $sample"
231 | 	fi
232 | fi
233 | 
234 | echo -e "\n"
235 | 


--------------------------------------------------------------------------------
/bin/mash_screener.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | set -e
  4 | 
  5 | #=============================================================
  6 | # HEADER
  7 | #=============================================================
  8 | 
  9 | #INSTITUTION:ISCIII
 10 | #CENTRE:BU-ISCIII
 11 | #AUTHOR: Pedro J. Sola
 12 | VERSION=1.0
 13 | #CREATED: 27 November 2019
 14 | #REVISION:
 15 | 
 16 | #DESCRIPTION:Script that screens reads against a database using k-mers and extracts the ids of the sequences with the highest values
 17 | #TODO
 18 | #================================================================
 19 | # END_OF_HEADER
 20 | #================================================================
 21 | 
 22 | #USAGE
 23 | #Prints the help text (purpose, synopsis, options and example) on stdout
 24 | usage() {
 25 | 	cat << EOF
 26 | 
 27 | Mash_screener script screens reads against a database using k-mers and extracts the matching sequence ids
 28 | 
 29 | usage : $0 [-i <inputfile>] [-o <directory>] <-d database(fasta)> <-s sample_name> <-1 R1> <-2 R2>
 30 | 		[-g group_name] [-f <float>] [-T <int>] [-w] [-v] [-h]
 31 | 
 32 | 	-i input directory (optional)
 33 | 	-o output directory (optional)
 34 | 	-d database to screen (.fasta)
 35 | 	-s sample name
 36 | 	-g group name (optional). If unset, samples will be gathered in NO_GROUP group
 37 | 	-1 reads corresponding to paired-end R1
 38 | 	-2 reads corresponding to paired-end R2
 39 | 	-f threshold identity value to retrieve sequence ids with at least this value (default 0.9)
 40 | 	-w use winner takes it all
 41 | 	-T number of threads
 42 | 	-v version
 43 | 	-h display usage message
 44 | 
 45 | example: mash_screener.sh -d database.fasta -s COLI -1 ecoli_1.fastq -2 ecoli_2.fastq
 46 | 
 47 | EOF
 48 | }
 49 | 
 50 | #================================================================
 51 | # OPTION_PROCESSING
 52 | #================================================================
 53 | #No arguments at all: print the help on stderr and stop
 54 | if (( $# == 0 )); then
 55 | 	usage >&2
 56 | 	exit 1
 57 | fi
 58 | 
 59 | # Error reporting helper
 60 | error(){
 61 | 	#$1 caller line number, $2 script name,
 62 | 	#$3 optional message, $4 optional exit status (1 when omitted)
 63 | 	local parent_lineno="$1" script="$2" message="$3"
 64 | 	local code="${4:-1}"
 65 | 	local rule="\n---------------------------------------\n"
 66 | 
 67 | 	#The colour escape codes are assigned as globals, as before
 68 | 	RED='\033[0;31m'
 69 | 	NC='\033[0m'
 70 | 
 71 | 	#Banner layout: rule, headline, optional MESSAGE section, rule
 72 | 	echo -e "$rule"
 73 | 	echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 74 | 	if [[ -n "$message" ]] ; then
 75 | 		echo -e "MESSAGE:\n"
 76 | 		echo -e "$message"
 77 | 	fi
 78 | 	echo -e "$rule"
 79 | 
 80 | 	#Leave with the requested status
 81 | 	exit "${code}"
 82 | }
 83 | 
 84 | #DECLARE FLAGS AND VARIABLES
 85 | threads=1
 86 | #(offrate was assigned here but never read anywhere in this script; removed)
 87 | filter_identity=0.9
 88 | cwd="$(pwd)"
 89 | w_winner=""
 90 | group="NO_GROUP"
 91 | database="Database"
 92 | R1="R1"
 93 | R2="R2" #accepted by -2, but only R1 is passed to mash further down
 94 | 
 95 | #PARSE VARIABLE ARGUMENTS WITH getopts
 96 | #"a" was listed without a branch and has been dropped: -a is now reported as an invalid option
 97 | options=":i:o:s:g:d:1:2:f:T:vwh"
 98 | while getopts $options opt; do
 99 | 	case $opt in
100 | 		i )
101 | 			input_dir=$OPTARG
102 | 			;;
103 | 		o )
104 | 			output_dir=$OPTARG
105 | 			;;
106 | 		s )
107 | 			sample=$OPTARG
108 | 			;;
109 | 		g)
110 | 			group=$OPTARG
111 | 			;;
112 | 		d )
113 | 			database=$OPTARG
114 | 			;;
115 | 		1 )
116 | 			R1=$OPTARG
117 | 			;;
118 | 		2 )
119 | 			R2=$OPTARG
120 | 			;;
121 | 		f )
122 | 			filter_identity=$OPTARG
123 | 			;;
124 | 		w)
125 | 			w_winner="-w"
126 | 			;;
127 | 		T )
128 | 			threads=$OPTARG
129 | 			;;
130 | 
131 | 		h )
132 | 			usage
133 | 			exit 1
134 | 			;;
135 | 		v )
136 | 			echo $VERSION
137 | 			exit 1
138 | 			;;
139 | 		\?)
140 | 			echo "Invalid Option: -$OPTARG" 1>&2
141 | 			usage
142 | 			exit 1
143 | 			;;
144 | 		: )
145 | 			echo "Option -$OPTARG requires an argument." >&2
146 | 			exit 1
147 | 			;;
148 | 		* )
149 | 			echo "Unimplemented option: -$OPTARG" >&2;
150 | 			exit 1
151 | 			;;
152 | 
153 | 	esac
154 | done
155 | shift $((OPTIND-1))
156 | 
157 | 
158 | #================================================================
159 | # MAIN_BODY
160 | #================================================================
161 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
162 | 
163 | echo -e "\n#Executing" $0 "\n"
164 | 
165 | check_dependencies.sh bash mash
166 | 
167 | check_mandatory_files.sh $database $R1
168 | 
169 | if [ -z "$sample" ]; then
170 | 	echo "ERROR: please, provide a sample name"
171 | 	usage
172 | 	exit 1
173 | fi
174 | 
175 | if [ -z "$output_dir" ]; then
176 | 	output_dir=$cwd"/$group/$sample/kmer/"
177 | 	echo "Default output directory is" $output_dir
178 | else
179 | 	echo "Output directory is" $output_dir
180 | fi
181 | mkdir -p $output_dir
182 | 
183 | sketch_file=$output_dir/database.msh
184 | screen_file=$output_dir/database.screen.tab
185 | ########SKETCH##############
186 | ############################
187 | 
188 | if [ -f $sketch_file ]; then
189 | 	echo "Found a sketch ddbb for" $(basename $database);
190 | 	echo "Omitting sketching"
191 | else
192 | 	echo "creating sketch of " $(basename $database);
193 | 	mash sketch -i -k 32 -s 1000 -p $threads -o $output_dir/database $database || error ${LINENO} $(basename $0) "mash sketch command failed. See $output_dir/logs for more information"
194 | fi
195 | 
196 | ########SCREEN##############
197 | ############################
198 | 
199 | if [ -f $screen_file ]; then
200 | 	echo "Found a mash screen file for sample" $sample;
201 | 	echo "Omitting screening"
202 | else
203 | 	echo "$(date)"
204 | 	echo screening $R1
205 | 
206 | 	#On failure the partial table is removed: otherwise the next run would find it and skip the screening
207 | 	mash screen $w_winner -p $threads $sketch_file $R1 > $screen_file || { rm -f $screen_file; error ${LINENO} $(basename $0) "mash screen command failed. See $output_dir/logs for more information"; }
208 | 
209 | 	echo "$(date)"
210 | 	echo -e "DONE Screening $sample of $group Group" "\n"
211 | fi
212 | 
213 | ######PARSE_RESULT##########
214 | ############################
215 | 
216 | output_mash_id=$output_dir/database.filtered_$filter_identity
217 | 
218 | echo "Retrieving sequences matching more than $filter_identity identity"
219 | 
220 | awk -v min_identity="$filter_identity" '($1 >= min_identity) {print $5}' $screen_file > $output_mash_id
221 | 
222 | 
223 | #####FILTER SEQUENCES#######
224 | ############################
225 | if [ -s $output_mash_id ] #non-empty list of ids: at least one sequence passed the threshold
226 | then
227 | 	filter_fasta.sh -i $database -f $output_mash_id
228 | else
229 | 	echo "No plasmids have passed the mash identity filter!! Exiting!!"
230 | 	exit 0
231 | fi
232 | 


--------------------------------------------------------------------------------
/bin/ncbi_database_fetcher.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list, 
  4 | #or a compound command returns a non-zero status: If errors are not handled by user
  5 | set -e
  6 | #set -x
  7 | #=============================================================
  8 | # HEADER
  9 | #=============================================================
 10 | 
 11 | #INSTITUTION:ISCIII
 12 | #CENTRE:BU-ISCIII
 13 | #AUTHOR: Pedro J. Sola
 14 | VERSION=1.0 
 15 | #CREATED: 12 June 2018
 16 | #REVISION:
 17 | # 22 June 2018: include quite mode that avoid watching the progress
 18 | #		
 19 | #
 20 | #DESCRIPTION:Script that extract a database from ncbi database using terms
 21 | #AKNOWLEDGE: 
 22 | #		-Multiple arguments in one flag: https://stackoverflow.com/questions/7529856/retrieving-multiple-arguments-for-a-single-option-using-getopts-in-bash
 23 | #
 24 | #================================================================
 25 | # END_OF_HEADER
 26 | #================================================================
 27 | 
 28 | #USAGE
 29 | #usage: prints the help text below on stdout; $0 expands to the name the script was invoked with
 30 | usage() {
 31 | 	cat << EOF
 32 | 
 33 | ncbi_database_fetcher is a script that extract sequences from NCBI by term
 34 | 
 35 | usage : $0 <(-y term1 -y term2 | -y "term1 term2")> [(-n term1 -n term2 | -n "term1 term2")] [-O <organism>][-d (nucleotide|protein)] [-f <filename>] [-o <directory>] [-q] [-v] [-h]
 36 | 
 37 | 	-y list of key terms separated by space to be INCLUDED in sequences title
 38 | 	-n list of key terms separated by space to be EXCLUDED in sequences title
 39 | 	-O organism to filter
 40 | 	-d database type, default nucleotide
 41 | 	-o output directory (optional). By default the file is placed in cwd
 42 | 	-f file name (optional). By default is the first term used as query
 43 | 	-q quiet
 44 | 	-v version
 45 | 	-h display usage message
 46 | 
 47 | example: ./ncbi_database_fetcher.sh -y plasmid -n unnamed -n partial -O Archaea
 48 | 
 49 | EOF
 50 | }
 51 | 
 52 | #================================================================
 53 | # OPTION_PROCESSING
 54 | #================================================================
 55 | #No arguments at all: print the help on stderr and stop
 56 | if (( $# == 0 )); then
 57 | 	usage >&2
 58 | 	exit 1
 59 | fi
 60 | 
 61 | #DEFAULT VALUES OF FLAGS AND VARIABLES
 62 | cwd="$(pwd)"
 63 | quiet=false
 64 | database_type=nucleotide
 65 | use_term_and=false
 66 | use_term_not=false
 67 | use_term_org=false
 68 | #OPTION PARSING WITH getopts (silent mode, see the ":" and "\?" branches)
 69 | 
 70 | options=":y:n:o:f:d:O:qvh"
 71 | while getopts "$options" opt; do
 72 | 	case "$opt" in
 73 | 		y )
 74 | 			use_term_and=true
 75 | 			terms_and+=($OPTARG)
 76 | 			;;
 77 | 		n )
 78 | 			use_term_not=true
 79 | 			terms_not+=($OPTARG)
 80 | 			;;
 81 | 		O )
 82 | 			use_term_org=true
 83 | 			terms_organism+=($OPTARG)
 84 | 			;;
 85 | 		d )
 86 | 			database_type=$OPTARG
 87 | 			;;
 88 | 		f )
 89 | 			file_name=$OPTARG
 90 | 			;;
 91 | 		o )
 92 | 			output_dir=$OPTARG
 93 | 			;;
 94 | 		q )
 95 | 			quiet=true
 96 | 			;;
 97 | 		v )
 98 | 			echo $VERSION
 99 | 			exit 1
100 | 			;;
101 | 		h )
102 | 			usage
103 | 			exit 1
104 | 			;;
105 | 		\?)
106 | 			echo "Invalid Option: -$OPTARG" 1>&2
107 | 			usage
108 | 			exit 1
109 | 			;;
110 | 		: )
111 | 			echo "Option -$OPTARG requires an argument." >&2
112 | 			exit 1
113 | 			;;
114 | 		* )
115 | 			echo "Unimplemented option: -$OPTARG" >&2;
116 | 			exit 1
117 | 			;;
118 | 
119 | 	esac
120 | done
121 | shift $((OPTIND-1))
122 | 
123 | #================================================================
124 | # MAIN_BODY
125 | #================================================================
126 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
127 | 
128 | echo -e "\n#Executing" $0 "\n"
129 | 
130 | if [ $use_term_and = false ]; then
131 | 	echo "Please, introduce at least one term to include search"
132 | 	usage
133 | 	exit 1
134 | fi
135 | 
136 | #MANAGE OUTPUT DIRECTORY
137 | if [ -z "$output_dir" ]; then
138 | 	output_dir=$cwd
139 | 	echo "Default output_dir is" $output_dir
140 | 	mkdir -p $output_dir
141 | else
142 | 	echo "Output directory is" $output_dir
143 | 	mkdir -p $output_dir
144 | fi
145 | 
146 | #MANAGE FILE NAME
147 | 
148 | if [ -z "$file_name" ]; then
149 | 
150 | 	if [ "${#terms_and[@]}" -gt 1 ]; then
151 | 		#Two or more terms: the first two are joined with "_"
152 | 		file_name="${terms_and[0]}_${terms_and[1]}"
153 | 		echo "Default file name is" $file_name
154 | 	else
155 | 		#Single term
156 | 		file_name="${terms_and[0]}.database"
157 | 		echo "Default file name is" $file_name
158 | 	fi
159 | 
160 | else
161 | 	echo "File name is" $file_name
162 | fi
163 | 
164 | 
165 | ##PROCESS REGULAR EXPRESSION TERMS
166 | 
167 | list_terms_and=$(for term in "${terms_and[@]}"; do echo "$term"; done)
168 | list_terms_org=$(for organism in "${terms_organism[@]}"; do echo "$organism"; done)
169 | 
170 | list_terms_regexp_and=$(printf "%s[Title] AND " $list_terms_and | sed 's/ AND $//g')
171 | 
172 | #printf prints its format once even when it receives no arguments, so an unguarded call
173 | #appended a bare "AND [organism]" clause to every query made without -O
174 | list_terms_regexp_organism=""
175 | if [ $use_term_org = true ]; then
176 | 	list_terms_regexp_organism=$(printf "AND %s[organism] " $list_terms_org | sed 's/ $//g')
177 | fi
178 | 
179 | list_terms_regexp_not=""
180 | if [ $use_term_not = true ]; then
181 | 	list_terms_not=$(for term in "${terms_not[@]}"; do echo "$term"; done)
182 | 	list_terms_regexp_not=$(printf "NOT %s[Title] " $list_terms_not | sed 's/ $//g')
183 | fi
184 | final_list_terms_regexp="${list_terms_regexp_and}${list_terms_regexp_not:+ $list_terms_regexp_not}${list_terms_regexp_organism:+ $list_terms_regexp_organism}" #quoted: "[Title]" must not be glob-expanded
185 | echo "$final_list_terms_regexp"
186 | 
187 | ########EUTILS COMMAND############
188 | ##################################
189 | 
190 | echo "$(date)"
191 | echo "Obtaining seuences with terms:" $list_terms_and
192 | if [ $use_term_not = true ]; then
193 | 	echo "But not those terms:" $list_terms_not
194 | fi
195 | if [ $use_term_org = true ]; then
196 | 	echo "Filtering by organisms:" $list_terms_org
197 | fi
198 | echo ""
199 | 
200 | base="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
201 | count_file="$output_dir/$file_name.count"
202 | id_file="$output_dir/$file_name.id"
203 | fasta_file="$output_dir/$file_name.fasta"
204 | 
205 | ##DETERMINE RETMAX
206 | wget -q -O "$count_file" "${base}esearch.fcgi?db=${database_type}&term=${final_list_terms_regexp}"
207 | 
208 | counter=$(awk '/<Count>/' "$count_file" | head -n 1 | awk '/<Count>/ {split($0,counter_prev,"</Count>");split(counter_prev[1],counter,"<Count>")}END{print counter[length(counter)]}')
209 | echo -e "FOUND" $counter "RECORDS\n"
210 | 
211 | #A response without a numeric <Count> cannot be used as RetMax: stop here instead of going on
212 | if ! [[ $counter =~ ^[0-9]+$ ]]; then
213 | 	echo "ERROR: could not read the number of records from the NCBI response" >&2
214 | 	exit 1
215 | elif [ $counter -eq 0 ]; then
216 | 	echo "Try different terms"
217 | 	echo "EXIT"
218 | 	exit 1
219 | fi
220 | 
221 | echo "Retrieving Id"
222 | 
223 | ##OBTAIN TOTAL LIST OF ID
224 | wget -q -O "$id_file" "${base}esearch.fcgi?db=${database_type}&term=${final_list_terms_regexp}&RetMax=${counter}"
225 | 
226 | list_of_id=$(awk '{split($0,id_prev,"</Id>");split(id_prev[1],id,"<Id>")}/<Id>/{print id[length(id)]}' "$id_file")
227 | array_of_id=($list_of_id)
228 | 
229 | echo "And sequences"
230 | counter=1
231 | 
232 | ##Checking previous DDBB
233 | if [ -s "$fasta_file" ]; then
234 | 	echo -e "\nFound a ddbb with the same name, Removing it\n"
235 | 	rm "$fasta_file"
236 | fi
237 | 
238 | ##RETRIEVING FASTA SEQUENCE: one efetch request per id, appended to the same file
239 | for i in $list_of_id
240 | do
241 | 	if [ $quiet = false ]; then
242 | 		echo $counter"/""${#array_of_id[@]}"
243 | 	fi
244 | 	counter=$((counter + 1))
245 | 	curl -s "${base}efetch.fcgi?db=${database_type}&id=${i}&retmode=text&rettype=fasta" >> "$fasta_file"
246 | done
247 | 
248 | echo "$(date)"
249 | echo "DONE obtaining seuences with terms supplied"
250 | 
251 | seq_number_post=$(grep ">" "$fasta_file" | wc -l)
252 | echo "File with filtered sequences can be found in" $output_dir/$file_name".fasta"
253 | echo "with" $seq_number_post "sequences"
254 | rm "$count_file" "$id_file"
255 | 


--------------------------------------------------------------------------------
/bin/process_cluster_output.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
  4 | #or a compound command returns a non-zero status: If errors are not handled by user
  5 | set -e
  6 | #set -x
  7 | 
  8 | #=============================================================
  9 | # HEADER
 10 | #=============================================================
 11 | 
 12 | #INSTITUTION:ISCIII
 13 | #CENTRE:BU-ISCIII
 14 | #AUTHOR: Pedro J. Sola
 15 | VERSION=1.0
 16 | #CREATED: 12 April 2018
 17 | #REVISION:
 18 | #DESCRIPTION:process_cluster_output script obtains a list of accession numbers from a fasta file and extracts their coverage values from a coverage file
 19 | 
 20 | #================================================================
 21 | # END_OF_HEADER
 22 | #================================================================
 23 | 
 24 | #USAGE
 25 | #usage: prints the help text below on stdout; $0 expands to the name the script was invoked with
 26 | usage() {
 27 | 	cat << EOF
 28 | 
 29 | process_cluster_output script obtain a list of ac from fasta, and estract their coverage value from a coverage file
 30 | 
 31 | usage : $0 <-i inputfile(.fasta)> <-b coverage_file> [-o <directory>] [-c <int(0-100)>] [-s <suffix>] [-v] [-h]
 32 | 
 33 | 	-i input file
 34 | 	-b file with coverage info
 35 | 	-o output directory (optional). By default the file is replaced in the same location
 36 | 	-c percentage value to filter >= values. If not supplied, all records will be outputted
 37 | 	-s string to ad at the end of the outputted file (list of accession numbers)
 38 | 	-v version
 39 | 	-h display usage message
 40 | 
 41 | example: process_cluster_output.sh -i ecoli_clustered.fasta_70 -b ecoli.coverage
 42 | 
 43 | EOF
 44 | }
 45 | 
 46 | #================================================================
 47 | # OPTION_PROCESSING
 48 | #================================================================
 49 | #No arguments at all: print the help on stderr and stop
 50 | if (( $# == 0 )); then
 51 | 	usage >&2
 52 | 	exit 1
 53 | fi
 54 | 
 55 | # Error reporting helper
 56 | error(){
 57 | 	#$1 caller line number, $2 script name,
 58 | 	#$3 optional message, $4 optional exit status (1 when omitted)
 59 | 	local parent_lineno="$1" script="$2" message="$3"
 60 | 	local code="${4:-1}"
 61 | 	local rule="\n---------------------------------------\n"
 62 | 
 63 | 	#The colour escape codes are assigned as globals, as before
 64 | 	RED='\033[0;31m'
 65 | 	NC='\033[0m'
 66 | 
 67 | 	#Banner layout: rule, headline, optional MESSAGE section, rule
 68 | 	echo -e "$rule"
 69 | 	echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 70 | 	if [[ -n "$message" ]] ; then
 71 | 		echo -e "MESSAGE:\n"
 72 | 		echo -e "$message"
 73 | 	fi
 74 | 	echo -e "$rule"
 75 | 
 76 | 	#Leave with the requested status
 77 | 	exit "${code}"
 78 | }
 79 | 
 80 | #DECLARE FLAGS AND VARIABLES
 81 | cwd="$(pwd)"
 82 | input_file="Input_file"
 83 | coverage_cutoff_input=100 #NOTE(review): the usage text says that all records are output when -c is omitted, which would correspond to 0 - confirm
 84 | 
 85 | #PARSE VARIABLE ARGUMENTS WITH getopts
 86 | #The leading ":" of the option string enables silent mode: errors reach the "\?" and ":" branches
 87 | options=":i:b:o:c:s:vh"
 88 | while getopts $options opt; do
 89 | 	case $opt in
 90 | 		i )
 91 | 			input_file=$OPTARG
 92 | 			;;
 93 | 		b )
 94 | 			coverage_file=$OPTARG
 95 | 			;;
 96 | 		o )
 97 | 			output_dir=$OPTARG
 98 | 			;;
 99 | 		c )
100 | 			#Only plain non-negative integers are accepted; other values used to slip through the numeric test
101 | 			if ! [[ $OPTARG =~ ^[0-9]+$ ]] || [ "$OPTARG" -gt 100 ]; then
102 | 				echo "please, provide a percentage between 0 and 100"
103 | 				usage
104 | 				exit 1
105 | 			fi
106 | 			coverage_cutoff_input=$OPTARG
107 | 			;;
108 | 		s )
109 | 			suffix=$OPTARG
110 | 			;;
111 | 		h )
112 | 			usage
113 | 			exit 1
114 | 			;;
115 | 		v )
116 | 			echo $VERSION
117 | 			exit 1
118 | 			;;
119 | 		\?)
120 | 			echo "Invalid Option: -$OPTARG" 1>&2
121 | 			usage
122 | 			exit 1
123 | 			;;
124 | 		: )
125 | 			echo "Option -$OPTARG requires an argument." >&2
126 | 			exit 1
127 | 			;;
128 | 		* )
129 | 			echo "Unimplemented option: -$OPTARG" >&2;
130 | 			exit 1
131 | 			;;
132 | 
133 | 	esac
134 | done
135 | shift $((OPTIND-1))
136 | 
137 | #================================================================
138 | # MAIN_BODY
139 | #================================================================
140 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
141 | 
142 | echo -e "\n#Executing" $0 "\n"
143 | 
144 | check_mandatory_files.sh $input_file
145 | #Honour a suffix supplied with -s; "_clustered" is only the default
146 | suffix="${suffix:-_clustered}"
147 | coverage_cutoff=$(echo "(1 - ($coverage_cutoff_input/100))" | bc -l) #percentage turned into the fraction 1 - pct/100
148 | 
149 | if [ ! $output_dir ]; then
150 | 	output_dir=$(dirname $input_file)
151 | 	#echo "Default output directory is" $output_dir
152 | 	mkdir -p $output_dir
153 | else
154 | 	#echo "Output directory is" $output_dir
155 | 	mkdir -p $output_dir
156 | fi
157 | 
158 | #The option parser never assigns file_name, so both names are normally derived here
159 | if [ ! $file_name ]; then
160 | 	file_name=$(basename $input_file)
161 | 	coverage_name=$(basename $coverage_file)
162 | fi
163 | 
164 | echo "$(date)"
165 | echo "extracting coverage info from clustered sequences in" $file_name
166 | #Collect the first word of every line containing ">" in the input file, with ">" removed
167 | ac_input_file=$(cat $input_file | grep ">" | awk '{gsub(">","");print $1}')
168 | #For each identifier, copy the coverage lines starting with it (used as a regex prefix, not an exact match)
169 | for i in $ac_input_file ;do
170 | 	awk '
171 | 		/^'"$i"'/
172 | 		' $coverage_file
173 | done > $output_dir/$coverage_name$suffix || error ${LINENO} $(basename $0) "Awk command error in $coverage_name$suffix creation. See $output_dir/logs for more information."
174 | 
175 | #List column 1 of the rows where column 2 is 0 and column 5 does not exceed the cutoff
176 | awk '
177 | 	{if ($2 == 0 && $5 <= '"${coverage_cutoff}"')
178 | 		{print $1}}
179 | 	' $output_dir/$coverage_name$suffix > $output_dir/$coverage_name$suffix"_ac" || error ${LINENO} $(basename $0) "Awk command error in $coverage_name$suffix\"_ac\" creation. See $output_dir/logs for more information."
180 | 
181 | #Same rows, printing column 1 followed by (1 - column 5) expressed as a percentage
182 | awk '
183 | 	{if ($2 == 0 && $5 <= '"${coverage_cutoff}"')
184 | 	 	{print $1, ((1 - $5)*100)}
185 | 	}
186 | 	' $output_dir/$coverage_name$suffix > $output_dir/$coverage_name$suffix"_percentage" || error ${LINENO} $(basename $0) "Awk command error in $coverage_name$suffix\"_percentage\" creation. See $output_dir/logs for more information."
187 | 
188 | echo "$(date)"
189 | echo "DONE extracting coverage info from clustered sequences in" $file_name
190 | echo -e "Info can be found at" $coverage_name$suffix"_ac and" "\n" $coverage_name$suffix"_percentage" "\n"
191 | 


--------------------------------------------------------------------------------
/bin/prokka_annotation.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #set -e
  4 | 
  5 | #=============================================================
  6 | # HEADER
  7 | #=============================================================
  8 | 
  9 | #INSTITUTION:ISCIII
 10 | #CENTRE:BU-ISCIII
 11 | #AUTHOR: Pedro J. Sola
 12 | VERSION=1.0
 13 | #CREATED: 30 April 2018
 14 | #REVISION:
 15 | #12 June 2018: Handled cleaning process without hard coded paths
 16 | #
 17 | #DESCRIPTION:Script that uses prokka to annotate a FASTA file
 18 | #
 19 | #DOCUMENTATION
 20 | #
 21 | #Prokka outputs the fasta headers as:
 22 | # gnl|center|locustag_01
 23 | # gnl|center|locustag_02
 24 | #
 25 | #TO DO:
 26 | #Handle cleaning [v]
 27 | #
 28 | #================================================================
 29 | # END_OF_HEADER
 30 | #================================================================
 31 | 
 32 | #SHORT USAGE RULES
 33 | #LONG USAGE FUNCTION
 34 | usage() {
 35 | 	cat << EOF
 36 | 
 37 | Prokka_annotation is a script that uses prokka to annotate a FASTA file
 38 | 
 39 | usage : $0 <-i inputfile(FASTA)> <-p prefix> [-o <directory>] [-k <kingdom>]
 40 | 		[-T <threads>] [-g group_name][-G genus] [-S species] [-c] [-v] [-h]
 41 | 
 42 | 	-i input file in FASTA format
 43 | 	-o output directory
 44 | 	-p prefix for sample identification (mandatory) and output file name
 45 | 	-k kingdom (Bacteria by default)
 46 | 	-g group name (optional). If unset, samples will be gathered in NO_GROUP group
 47 | 	-G sample genus in case is known by user
 48 | 	-S sample species in case is known by user
 49 | 	-c clean:remove files other than gff and renamed fasta
 50 | 	-T number of threads
 51 | 	-v version
 52 | 	-h display usage message
 53 | 
 54 | 
 55 | Output directory is the same as input directory by default
 56 | 
 57 | example: prokka_annotation -i ecoli.fasta -p ECO -T 5
 58 | 
 59 | 
 60 | EOF
 61 | }
 62 | 
 63 | 
 64 | #================================================================
 65 | # OPTION_PROCESSING
 66 | #================================================================
 67 | #Make sure the script is executed with arguments
 68 | if [ $# = 0 ] ; then
 69 |  usage >&2
 70 |  exit 1
 71 | fi
 72 | 
 73 | # Error handling
 74 | error(){
 75 |   local parent_lineno="$1"
 76 |   local script="$2"
 77 |   local message="$3"
 78 |   local code="${4:-1}"
 79 | 
 80 | 	RED='\033[0;31m'
 81 | 	NC='\033[0m'
 82 | 
 83 |   if [[ -n "$message" ]] ; then
 84 |     echo -e "\n---------------------------------------\n"
 85 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 86 |     echo -e "MESSAGE:\n"
 87 |     echo -e "$message"
 88 |     echo -e "\n---------------------------------------\n"
 89 |   else
 90 |     echo -e "\n---------------------------------------\n"
 91 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 92 |     echo -e "\n---------------------------------------\n"
 93 |   fi
 94 | 
 95 |   exit "${code}"
 96 | }
 97 | 
 98 | #DECLARE FLAGS AND VARIABLES
 99 | cwd="$(pwd)"
100 | group="NO_GROUP"
101 | input_file="Input_file"
102 | kingdom="Bacteria"
103 | clean=false
104 | genus=""
105 | species=""
106 | threads=1
107 | 
108 | #PARSE VARIABLE ARGUMENTS WITH getops
109 | #common example with letters, for long options check longopts2getopts.sh
110 | options=":i:o:p:k:g:G:S:T:cvh"
111 | while getopts $options opt; do
112 | 	case $opt in
113 | 		i )
114 | 			input_file=$OPTARG
115 | 			;;
116 | 
117 | 		o )
118 | 			output_dir=$OPTARG
119 | 			;;
120 | 		p)
121 | 			prefix=$OPTARG
122 | 			file_name=$OPTARG
123 | 			;;
124 | 		k )
125 |           	kingdom=$OPTARG
126 |           	;;
127 |         g )
128 |           	group=$OPTARG
129 |           	;;
130 |         S )
131 |           	species=$OPTARG
132 |           	;;
133 |         G)
134 |           	genus=$OPTARG
135 |           	;;
136 | 		c )
137 |           	clean=true
138 |           	;;
139 |         T)
140 |           	threads=$OPTARG
141 |           	;;
142 | 
143 |         h )
144 | 		  	usage
145 | 		  	exit 1
146 | 		  	;;
147 | 		v )
148 | 		  	echo $VERSION
149 | 		  	exit 1
150 | 		  	;;
151 | 		\?)
152 | 			echo "Invalid Option: -$OPTARG" 1>&2
153 | 			usage
154 | 			exit 1
155 | 			;;
156 | 		: )
157 |       		echo "Option -$OPTARG requires an argument." >&2
158 |       		exit 1
159 |       		;;
160 |       	* )
161 | 			echo "Unimplemented option: -$OPTARG" >&2;
162 | 			exit 1
163 | 			;;
164 | 
165 | 	esac
166 | done
167 | shift $((OPTIND-1))
168 | 
169 | #================================================================
170 | # MAIN_BODY
171 | #================================================================
172 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
173 | 
174 | echo -e "\n#Executing" $0 "\n"
175 | 
176 | check_mandatory_files.sh $input_file
177 | 
178 | check_dependencies.sh prokka
179 | echo "PREFIX" $prefix
180 | 
181 | if [ ! $prefix ]; then
182 | 	echo "please provide a prefix"
183 | 	exit 1
184 | fi
185 | 
186 | if [ ! $output_dir ]; then
187 | 	output_dir=$(dirname $input_file)
188 | 	echo "Default output directory is" $output_dir
189 | 	mkdir -p $output_dir
190 | else
191 | 	echo "Output directory is" $output_dir
192 | 	mkdir -p $output_dir
193 | fi
194 | 
195 | if [ ! $file_name ]; then
196 | 	file_name=$(basename $input_file)
197 | 	echo "filename is" $file_name
198 | fi
199 | 
200 | 
201 | ##PROKKA EXECUTION
202 | 
203 | echo "$(date)"
204 | echo "Annotating $input_file with prokka"
205 | #--genus/--species are passed only when set: with an empty value the flag would
206 | #lose its argument and the following flag could be consumed as the genus/species name
207 | prokka --force --outdir $output_dir \
208 | --prefix $prefix \
209 | --addgenes \
210 | --kingdom $kingdom \
211 | ${genus:+--genus "$genus"} \
212 | ${species:+--species "$species"} \
213 | --usegenus \
214 | --centre BU-ISCIII \
215 | --locustag $prefix \
216 | --cpus $threads \
217 | $input_file #|| error ${LINENO} $(basename $0) "Prokka command failed. See $output_dir/logs for more information."
218 | 
219 | echo "$(date)"
220 | echo "done annotating $input_file with prokka"
221 | 
222 | ##CLEAN FILES THAT WILL NOT BE USED IN PLASMIDID
223 | #Only the three-character extension is tested, so a directory or prefix containing e.g. "log" or "gb" cannot disable the cleaning
224 | if [ $clean = true ]; then
225 | 
226 | 	echo "Removing unwanted files"
227 | 	for i in "$output_dir/$prefix".???
228 | 	do
229 | 		case "${i: -3}" in fna|gff|log|err|*gb*) ;; *) [ -f "$i" ] && rm "$i" ;; esac #keep fna, gff, log, err and gb* files
230 | 	done
231 | fi
232 | 
233 | echo -e "\n"
234 | 


--------------------------------------------------------------------------------
/bin/quality_trim.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | set -e
  4 | 
  5 | #=============================================================
  6 | # HEADER
  7 | #=============================================================
  8 | 
  9 | #INSTITUTION:ISCIII
 10 | #CENTRE:BU-ISCIII
 11 | #AUTHOR: Pedro J. Sola
 12 | VERSION=1.0
 13 | #CREATED: 21 May 2018
 14 | #REVISION:
 15 | #DESCRIPTION:Script that execute trimmomatic to filter by quality
 16 | #
 17 | #
 18 | #================================================================
 19 | # END_OF_HEADER
 20 | #================================================================
 21 | 
 22 | 
 23 | usage() {
 24 | 	cat << EOF
 25 | 
 26 | quality_trim script execute trimmomatic to filter by quality
 27 | 
 28 | usage : $0 <-1 R1 file> <-2 R2 file> [-o <directory>] [-d <trimmomatic_directory>] <-s sample_name>
 29 | 		[-a adapter_file] [-g group_name] [-f <file_name>] [-l <int>] [-M <int>] [-T <int>][-v] [-h]
 30 | 
 31 | 	-1 R1 file (mandatory)
 32 | 	-2 R2 file (mandatory)
 33 | 	-d directory where trimmomatic is installed, default: /opt/Trimmomatic/
 34 | 	-a adapters to remove, default: TruSeq3-PE.fa
 35 | 	-o output directory (optional)
 36 | 	-f file name
 37 | 	-l minimus length of trimmed reads (default 40)
 38 | 	-s sample name (mandatory)
 39 | 	-g group name (optional). If unset, samples will be gathered in NO_GROUP group
 40 | 	-M RAM memmory (Gb), default 8
 41 | 	-T threads, default 1
 42 | 	-v version
 43 | 	-h display usage message
 44 | 
 45 | example: ./quality_trim.sh -1 ecoli_R1.fastq.gz -2 ecoli_R2.fastq.gz -s ECO232 -g ENTERO -T 8
 46 | 
 47 | EOF
 48 | }
 49 | 
 50 | #================================================================
 51 | # OPTION_PROCESSING
 52 | #================================================================
 53 | #Make sure the script is executed with arguments
 54 | if [ $# = 0 ] ; then
 55 |  usage >&2
 56 |  exit 1
 57 | fi
 58 | 
 59 | # Error handling
 60 | error(){
 61 |   local parent_lineno="$1"
 62 |   local script="$2"
 63 |   local message="$3"
 64 |   local code="${4:-1}"
 65 | 
 66 | 	RED='\033[0;31m'
 67 | 	NC='\033[0m'
 68 | 
 69 |   if [[ -n "$message" ]] ; then
 70 |     echo -e "\n---------------------------------------\n"
 71 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 72 |     echo -e "MESSAGE:\n"
 73 |     echo -e "$message"
 74 |     echo -e "\n---------------------------------------\n"
 75 |   else
 76 |     echo -e "\n---------------------------------------\n"
 77 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 78 |     echo -e "\n---------------------------------------\n"
 79 |   fi
 80 | 
 81 |   exit "${code}"
 82 | }
 83 | 
 84 | #DECLARE FLAGS AND VARIABLES
 85 | cwd="$(pwd)"
 86 | group="NO_GROUP"
 87 | r1_file="R1_file"
 88 | r2_file="R2_file"
 89 | trimmomatic_directory=/opt/Trimmomatic/
 90 | adapter_file="TruSeq3-PE.fa"
 91 | minimus_length=40
 92 | max_mem=8
 93 | threads=1
 94 | 
 95 | #PARSE VARIABLE ARGUMENTS WITH getops
 96 | #common example with letters, for long options check longopts2getopts.sh
 97 | options=":1:2:o:f:d:a:s:g:l:n:M:T:vh"
 98 | while getopts $options opt; do
 99 | 	case $opt in
100 | 		1 )
101 | 			r1_file=$OPTARG
102 | 			;;
103 | 		2 )
104 | 			r2_file=$OPTARG
105 | 			;;
106 | 		o )
107 | 			output_dir=$OPTARG
108 | 			;;
109 | 		f )
110 | 			file_name=$OPTARG
111 | 			;;
112 | 		s )
113 | 			sample=$OPTARG
114 | 			;;
115 | 		d)
116 | 			trimmomatic_directory=$OPTARG
117 | 			;;
118 | 		a)
119 | 			adapter_file=$OPTARG
120 | 			;;
121 | 		l)
122 | 			minimus_length=$OPTARG
123 | 			;;
124 | 		g)
125 | 			group=$OPTARG
126 | 			;;
127 | 		M )
128 | 			max_mem=$OPTARG
129 | 			;;
130 | 		T )
131 | 			threads=$OPTARG
132 | 			;;
133 | 		h )
134 | 		  	usage
135 | 		  	exit 1
136 | 		  	;;
137 | 		v )
138 | 		  	echo $VERSION
139 | 		  	exit 1
140 | 		  	;;
141 | 		\?)
142 | 			echo "Invalid Option: -$OPTARG" 1>&2
143 | 			usage
144 | 			exit 1
145 | 			;;
146 | 		: )
147 |       		echo "Option -$OPTARG requires an argument." >&2
148 |       		exit 1
149 |       		;;
150 |       	* )
151 | 			echo "Unimplemented option: -$OPTARG" >&2;
152 | 			exit 1
153 | 			;;
154 | 
155 | 	esac
156 | done
157 | shift $((OPTIND-1))
158 | 
159 | 
160 | 
161 | #================================================================
162 | # MAIN_BODY
163 | #================================================================
164 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
165 | 
166 | echo -e "\n#Executing" $0 "\n"
167 | 
168 | check_mandatory_files.sh $r1_file $r2_file
169 | 
170 | check_dependencies.sh trimmomatic
171 | 
172 | if [ ! $sample ]; then
173 | 	echo "Please include a sample name"
174 | 	exit 1
175 | fi
176 | 
177 | 
178 | if [ ! $output_dir ]; then
179 | 	output_dir="$group/$sample/trimmed"
180 | 	echo "Default output directory is" $output_dir
181 | 	mkdir -p $output_dir
182 | else
183 | 	echo "Output directory is" $output_dir
184 | 	mkdir -p $output_dir
185 | fi
186 | 
187 | if [ ! $file_name ]; then
188 | 	file_name=$sample
189 | fi
190 | 
191 | #Locate the adapter file: search the whereis location of trimmomatic (cut to its first six
192 | #"/"-separated fields), then the -d directory; the first file found is used
193 | 
194 | trimmomatic_path=$(whereis trimmomatic | cut -d " " -f 2 | cut -d "/" -f 1,2,3,4,5,6)
195 | trimmomatic_adapter=$(find $trimmomatic_path $trimmomatic_directory -type f -name $adapter_file 2>/dev/null | awk 'NR==1')
196 | 
197 | echo "$(date)"
198 | echo "Quality trimming:"
199 | echo "R1 = " $r1_file
200 | echo "R2 = " $r2_file
201 | 
202 | trimmomatic PE -threads $threads \
203 | $r1_file \
204 | $r2_file \
205 | $output_dir/$sample"_1_paired.fastq.gz" \
206 | $output_dir/$sample"_1_unpaired.fastq.gz" \
207 | $output_dir/$sample"_2_paired.fastq.gz" \
208 | $output_dir/$sample"_2_unpaired.fastq.gz" \
209 | ILLUMINACLIP:$trimmomatic_adapter:2:30:10 SLIDINGWINDOW:4:20 MINLEN:$minimus_length || error ${LINENO} $(basename $0) "Trimmomatic command failed. See $output_dir/logs for more information."
210 | 
211 | echo "$(date)"
212 | echo "DONE quality trimming, file can be fount at:"
213 | echo $output_dir/$sample"_1_paired.fastq.gz"
214 | echo $output_dir/$sample"_1_unpaired.fastq.gz"
215 | echo $output_dir/$sample"_2_paired.fastq.gz"
216 | echo $output_dir/$sample"_2_unpaired.fastq.gz"
217 | echo -e "\n"


--------------------------------------------------------------------------------
/bin/rename_from_fasta.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
  4 | #or a compound command returns a non-zero status: If errors are not handled by user
  5 | set -e
  6 | #set -x
  7 | 
  8 | #=============================================================
  9 | # HEADER
 10 | #=============================================================
 11 | 
 12 | #INSTITUTION:ISCIII
 13 | #CENTRE:BU-ISCIII
 14 | #AUTHOR: Pedro J. Sola
 15 | VERSION=1.0
 16 | #CREATED: 06 June 2018
 17 | #REVISION:
 18 | #DESCRIPTION:rename_from_fasta script rename any field in a file by either providing two fasta files or a dictionary file
 19 | 
 20 | #================================================================
 21 | # END_OF_HEADER
 22 | #================================================================
 23 | 
 24 | 
 25 | usage() { #Print the help text on standard output
 26 | 	cat << EOF
 27 | 
 28 | rename_from_fasta script rename any field in a file by either providing two fasta files or a dictionary file
 29 | 
 30 | usage : $0 <-i file_to_rename> [-1 <inputfile1(.fasta)>] [-2 <inputfile2(.fasta)>] [-d <dictionary>] [-o <directory>] [-f <file_name>] [-v] [-h]
 31 | 
 32 | 	-i input file to rename
 33 | 	-1 original fata file whose names will be finally printed
 34 | 	-2 new fata file whose names will be replaced
 35 | 	-o output directory (optional). By default the file is replaced in the same location
 36 | 	-f output file name (".rename" will be added at the end)
 37 | 	-d dictionary file to be used if fasta files are not supplied
 38 | 	-v version
 39 | 	-h display usage message
 40 | 
 41 | example: rename_from_fasta.sh -i ecoli.bed -1 ecoli_original.fasta -2 ecoli_renamed.fasta
 42 | 
 43 | EOF
 44 | }
 45 | 
 46 | #================================================================
 47 | # OPTION_PROCESSING
 48 | #================================================================
 49 | #Make sure the script is executed with arguments
 50 | if [ $# = 0 ] ; then
 51 |  usage >&2
 52 |  exit 1
 53 | fi
 54 | 
 55 | # Error handling
 56 | error(){
 57 |   local parent_lineno="$1"
 58 |   local script="$2"
 59 |   local message="$3"
 60 |   local code="${4:-1}"
 61 | 
 62 | 	RED='\033[0;31m'
 63 | 	NC='\033[0m'
 64 | 
 65 |   if [[ -n "$message" ]] ; then
 66 |     echo -e "\n---------------------------------------\n"
 67 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 68 |     echo -e "MESSAGE:\n"
 69 |     echo -e "$message"
 70 |     echo -e "\n---------------------------------------\n"
 71 |   else
 72 |     echo -e "\n---------------------------------------\n"
 73 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 74 |     echo -e "\n---------------------------------------\n"
 75 |   fi
 76 | 
 77 |   exit "${code}"
 78 | }
 79 | 
 80 | #DECLARE FLAGS AND VARIABLES
 81 | cwd="$(pwd)"
 82 | input_file="Input_file"
 83 | 
 84 | #PARSE VARIABLE ARGUMENTS WITH getops
 85 | #common example with letters, for long options check longopts2getopts.sh
 86 | options=":i:1:2:f:o:d:vh"
 87 | while getopts $options opt; do
 88 | 	case $opt in
 89 | 		i )
 90 | 			input_file=$OPTARG
 91 | 			;;
 92 | 		1 )
 93 | 			fasta_file_old=$OPTARG
 94 | 			;;
 95 | 		2 )
 96 | 			fasta_file_new=$OPTARG
 97 | 			;;
 98 | 		d )
 99 | 			dictionary_file_new=$OPTARG
100 | 			;;
101 | 		o )
102 | 			output_dir=$OPTARG
103 | 			;;
104 | 		f )
105 | 			file_name=$OPTARG
106 | 			;;
107 |         h )
108 | 		  	usage
109 | 		  	exit 1
110 | 		  	;;
111 | 		v )
112 | 		  	echo $VERSION
113 | 		  	exit 1
114 | 		  	;;
115 | 		\?)
116 | 			echo "Invalid Option: -$OPTARG" 1>&2
117 | 			usage
118 | 			exit 1
119 | 			;;
120 | 		: )
121 |       		echo "Option -$OPTARG requires an argument." >&2
122 |       		exit 1
123 |       		;;
124 |       	* )
125 | 			echo "Unimplemented option: -$OPTARG" >&2;
126 | 			exit 1
127 | 			;;
128 | 
129 | 	esac
130 | done
131 | shift $((OPTIND-1))
132 | 
133 | #================================================================
134 | # MAIN_BODY
135 | #================================================================
136 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
137 | 
138 | echo -e "\n#Executing" $0 "\n"
139 | 
140 | check_mandatory_files.sh $input_file
141 | 
142 | if [ ! $output_dir ]; then
143 | 	output_dir=$(dirname $input_file)
144 | 	echo "Default output directory is" $output_dir
145 | 	mkdir -p $output_dir
146 | else
147 | 	echo "Output directory is" $output_dir
148 | 	mkdir -p $output_dir
149 | fi
150 | 
151 | 
152 | if [ ! $file_name ]; then
153 | 	file_name=$(basename $input_file | cut -d "." -f1,2)
154 | fi
155 | 
156 | fasta_file_old_name=$(basename $fasta_file_old)
157 | fasta_file_new_name=$(basename $fasta_file_new)
158 | #NOTE(review): only the two FASTA files are used from here on; the dictionary given with -d is never read
159 | echo "$(date)"
160 | echo "Renaming" $file_name
161 | #Take the first word of every ">" line, drop ">" and turn "|" into "-"; the input file gets the same "|" replacement
162 | cat $fasta_file_old | awk '/>/ {print $1}'| sed 's/>//g' | sed 's/|/-/g' > $output_dir/$fasta_file_old_name".ac"
163 | cat $fasta_file_new | awk '/>/ {print $1}'| sed 's/>//g' | sed 's/|/-/g' > $output_dir/$fasta_file_new_name".ac"
164 | cat $input_file | sed 's/|/-/g' > $output_dir/$file_name".nopipe.tmp"
165 | 
166 | 
167 | #Paste columns to relate names in a dictionary: line N of the old list is paired with line N of the new list
168 | awk 'NR==FNR{ac[NR]=$0;next}{print ac[FNR], "\t", $0"\\t" }' $output_dir/$fasta_file_old_name".ac" $output_dir/$fasta_file_new_name".ac" > $output_dir/dictionary.txt || error ${LINENO} $(basename $0) "AWK command failed in dictionary.txt creation. See $output_dir/logs for more information."
169 | 
170 | #Rename fields
171 | #Each new name (stored with a trailing "\t", presumably read as a tab by gsub) is replaced by the old name plus a tab
172 | #cat $output_dir/dictionary.txt | while read -r line; do word1=$(cut -f1); word2=$(cut -f2); echo "##########word 1="$word1;echo "###########word 2="$word2; sed 's/$word2/$word1/g' $input_file; done > $output_dir/$file_name".renamed"
173 | 
174 | 
175 | awk 'FNR==NR {dict[$2]=$1"\t"; next} {for (i in dict) gsub(i, dict[i])}1' $output_dir/dictionary.txt $output_dir/$file_name".nopipe.tmp" > $output_dir/$file_name".renamed" || error ${LINENO} $(basename $0) "AWK command failed in $file_name\".renamed\" creation. See $output_dir/logs for more information."
176 | 
177 | #awk 'NR==FNR{dict[$2]=$1;next}{$1=dict[$1]}1' $output_dir/dictionary.txt $input_file #> $output_dir/$file_name".renamed"
178 | 
179 | #Remove the intermediate files written above
180 | rm $output_dir/$fasta_file_old_name".ac"
181 | rm $output_dir/$fasta_file_new_name".ac"
182 | rm $output_dir/$file_name".nopipe.tmp"
183 | rm $output_dir/dictionary.txt
184 | 
185 | echo "$(date)"
186 | echo "DONE renaming" $file_name
187 | echo -e "Renamed file can be found at" $output_dir/$file_name".renamed"
188 | 


--------------------------------------------------------------------------------
/bin/sam_to_bam.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
  4 | #or a compound command returns a non-zero status: If errors are not handled by user
  5 | #set -e
  6 | # Treat unset variables and parameters other than the special parameters ‘@’ or ‘*’ as an error when performing parameter expansion.
  7 | # An error message will be written to the standard error, and a non-interactive shell will exit
  8 | #set -u
  9 | #Print everything as if it were executed, after substitution and expansion is applied: Debug|log option
 10 | #set -x
 11 | 
 12 | #=============================================================
 13 | # HEADER
 14 | #=============================================================
 15 | 
 16 | #INSTITUTION:ISCIII
 17 | #CENTRE:BU-ISCIII
 18 | #AUTHOR: Pedro J. Sola
 19 | VERSION=1.0
 20 | #CREATED: 19 March 2018
 21 | #REVISION:
 22 | #DESCRIPTION:Script that convert a supplied SAM file into compressed binary indexed BAM
 23 | 
 24 | #================================================================
 25 | # END_OF_HEADER
 26 | #================================================================
 27 | 
 28 | #SHORT USAGE RULES
 29 | #LONG USAGE FUNCTION
 30 | usage() {
 31 | 	cat << EOF
 32 | 
 33 | Sam_to_bam script converts a supplied SAM file into compressed binary indexed BAM
 34 | 
 35 | usage : $0 <-i inputfile(.sam)> [-o <directory>] [-s sample_name] [-g group_name] [-T <int>] [-v] [-h]
 36 | 
 37 | 	-i input file
 38 | 	-o output directory (optional). By default the BAM file will replace SAM in the same location
 39 | 	-s sample name
 40 | 	-g group name (optional). If unset, samples will be gathered in NO_GROUP group
 41 | 	-T number of threads
 42 | 	-v version
 43 | 	-h display usage message
 44 | 
 45 | example: sam_to_bam.sh -i ecoli.sam
 46 | 
 47 | EOF
 48 | }
 49 | 
 50 | #================================================================
 51 | # OPTION_PROCESSING
 52 | #================================================================
 53 | #Make sure the script is executed with arguments ($# is the argument count; $? was always 0 here)
 54 | if [ $# = 0 ] ; then
 55 |  usage >&2
 56 |  exit 1
 57 | fi
 58 | 
 59 | # Error handling
 60 | error(){
 61 |   local parent_lineno="$1"
 62 |   local script="$2"
 63 |   local message="$3"
 64 |   local code="${4:-1}"
 65 | 
 66 | 	RED='\033[0;31m'
 67 | 	NC='\033[0m'
 68 | 
 69 |   if [[ -n "$message" ]] ; then
 70 |     echo -e "\n---------------------------------------\n"
 71 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 72 |     echo -e "MESSAGE:\n"
 73 |     echo -e "$message"
 74 |     echo -e "\n---------------------------------------\n"
 75 |   else
 76 |     echo -e "\n---------------------------------------\n"
 77 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 78 |     echo -e "\n---------------------------------------\n"
 79 |   fi
 80 | 
 81 |   exit "${code}"
 82 | }
 83 | 
 84 | #DECLARE FLAGS AND VARIABLES
 85 | threads=1
 86 | cwd="$(pwd)"
 87 | group="NO_GROUP"
 88 | input_file="Input_file"
 89 | 
 90 | #PARSE VARIABLE ARGUMENTS WITH getops
 91 | #common example with letters, for long options check longopts2getopts.sh
 92 | options=":i:o:s:g:T:vh" #"T:" is required for the documented -T option to reach its case branch
 93 | while getopts $options opt; do
 94 | 	case $opt in
 95 | 		i )
 96 | 			input_file=$OPTARG
 97 | 			;;
 98 | 		o )
 99 | 			output_dir=$OPTARG
100 | 			;;
101 | 		s )
102 | 			sample=$OPTARG
103 | 			;;
104 | 		g)
105 | 			group=$OPTARG
106 | 			;;
107 | 
108 |         T )
109 | 			threads=$OPTARG
110 |             ;;
111 | 
112 |         h )
113 | 		  	usage
114 | 		  	exit 1
115 | 		  	;;
116 | 		v )
117 | 		  	echo $VERSION
118 | 		  	exit 1
119 | 		  	;;
120 | 		\?)
121 | 			echo "Invalid Option: -$OPTARG" 1>&2
122 | 			usage
123 | 			exit 1
124 | 			;;
125 | 		: )
126 |       		echo "Option -$OPTARG requires an argument." >&2
127 |       		exit 1
128 |       		;;
129 |       	* )
130 | 			echo "Unimplemented option: -$OPTARG" >&2;
131 | 			exit 1
132 | 			;;
133 | 
134 | 	esac
135 | done
136 | shift $((OPTIND-1))
137 | 
138 | 
139 | #================================================================
140 | # MAIN_BODY
141 | #================================================================
142 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
143 | 
144 | echo -e "\n#Executing" $0 "\n"
145 | 
146 | check_mandatory_files.sh $input_file
147 | 
148 | check_dependencies.sh samtools
149 | 
150 | 
151 | if [ ! $output_dir ]; then
152 | 	output_dir=$(dirname $input_file)
153 | 	echo "Default output directory is" $output_dir
154 | 	mkdir -p $output_dir
155 | else
156 | 	echo "Output directory is" $output_dir
157 | 	mkdir -p $output_dir
158 | fi
159 | 
160 | if [ ! $sample ]; then
161 | 	sample=$(basename $input_file | cut -d. -f1)
162 | fi
163 | 
164 | ########SAM_TO_BAM##########
165 | ############################
166 | 
167 | 
168 | if [ -f $output_dir/$sample.sorted.bam -a -f $output_dir/$sample.sorted.bam.bai  ];then \
169 | 	echo "Found a sorted .BAM file for sample" $sample;
170 | 	echo "Omitting BAM to SAM convertion"
171 | else
172 | 	echo "$(date)"
173 | 	echo "Converting SAM to sorted indexed BAM in $sample"
174 | 
175 | 	samtools view \
176 | 	-Sb $input_file \
177 | 	-o $output_dir/$sample.bam || error ${LINENO} $(basename $0) "Samtools view command failed. See $output_dir/logs for more information."
178 | 
179 | 
180 | 	echo "$(date)"
181 | 	echo "Sorting BAM file in $sample"
182 | 
183 | 	samtools sort \
184 | 	-T $output_dir/$sample".sorted.bam" \
185 | 	-o $output_dir/$sample".sorted.bam" \
186 | 	$output_dir/$sample.bam || error ${LINENO} $(basename $0) "Samtools sort command failed. See $output_dir/logs for more information."
187 | 
188 | 	echo "$(date)"
189 | 	echo "Indexing BAM file in $sample"
190 | 
191 | 	samtools index \
192 | 	$output_dir/$sample".sorted.bam" || error ${LINENO} $(basename $0) "Samtools index command failed. See $output_dir/logs for more information."
193 | 
194 | 
195 | 	echo "$(date)"
196 | 	echo "DONE Converting SAM to sorted indexed BAM in $sample"
197 | fi
198 | 
199 | if [ -f $output_dir/$sample.sam ];then \
200 | 
201 | 	echo $sample.sam "removed"
202 | 	rm $output_dir/$sample.sam
203 | 
204 | fi
205 | 
206 | if [ -f $output_dir/$sample.bam ];then \
207 | 
208 | 	echo $sample.bam "removed"
209 | 	rm $output_dir/$sample.bam
210 | 
211 | fi
212 | 
213 | echo -e "\n"
214 | 


--------------------------------------------------------------------------------
/bin/spades_assembly.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | set -e
  4 | #set -x
  5 | 
  6 | #=============================================================
  7 | # HEADER
  8 | #=============================================================
  9 | 
 10 | #INSTITUTION:ISCIII
 11 | #CENTRE:BU-ISCIII
 12 | #AUTHOR: Pedro J. Sola
 13 | VERSION=1.0
 14 | #CREATED: 21 May 2018
 15 | #REVISION:
 16 | #DESCRIPTION:Script that assemble illumina sequences using SPAdes
 17 | #
 18 | #
 19 | #================================================================
 20 | # END_OF_HEADER
 21 | #================================================================
 22 | 
 23 | 
 24 | usage() {
 25 | 	cat << EOF
 26 | 
 27 | spades_assembly script that assemble illumina sequences using SPAdes
 28 | 
 29 | usage : $0 <-p R1_paired file> <-P R2_paired file> [-o <directory>]
 30 | 		 [-k <int>][-s sample_name] [-g group_name] [-f <file_name>] [-T <int>] [q] [-c] [-v] [-h]
 31 | 
 32 | 	-p R1_paired file (mandatory)
 33 | 	-P R2_paired file (mandatory)
 34 | 	-k kmers, supplied as numbers sepparated by number or one flag per number, default: 21,33,55,77,99,127
 35 | 	-o output directory (optional)
 36 | 	-f file name
 37 | 	-s sample name (mandatory)
 38 | 	-g group name (optional). If unset, samples will be gathered in NO_GROUP group
 39 | 	-q quick_mode: look for files in a folder SUPPLIED with "paired" term
 40 | 	-c clean mode: remove unnecesary temporary folders
 41 | 	-T threads, default 1
 42 | 	-v version
 43 | 	-h display usage message
 44 | 
 45 | example: ./spades_assembly.sh -p ecoli_R1_paired.fastq.gz -P ecoli_R2_paired.fastq.gz -c
 46 | 
 47 | EOF
 48 | }
 49 | 
 50 | 
 51 | #================================================================
 52 | # OPTION_PROCESSING
 53 | #================================================================
 54 | #Make sure the script is executed with arguments
 55 | if [ $# = 0 ] ; then
 56 |  usage >&2
 57 |  exit 1
 58 | fi
 59 | 
 60 | # Error handling
 61 | error(){
 62 |   local parent_lineno="$1"
 63 |   local script="$2"
 64 |   local message="$3"
 65 |   local code="${4:-1}"
 66 | 
 67 | 	RED='\033[0;31m'
 68 | 	NC='\033[0m'
 69 | 
 70 |   if [[ -n "$message" ]] ; then
 71 |     echo -e "\n---------------------------------------\n"
 72 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 73 |     echo -e "MESSAGE:\n"
 74 |     echo -e "$message"
 75 |     echo -e "\n---------------------------------------\n"
 76 |   else
 77 |     echo -e "\n---------------------------------------\n"
 78 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 79 |     echo -e "\n---------------------------------------\n"
 80 |   fi
 81 | 
 82 |   exit "${code}"
 83 | }
 84 | 
 85 | #DECLARE FLAGS AND VARIABLES
 86 | cwd="$(pwd)"
 87 | group="NO_GROUP"
 88 | r1_paired_file="R1_paired_file"
 89 | r2_paired_file="R2_paired_file"
 90 | threads=1
 91 | kmer_values_command="21,33,55,77,99,127"
 92 | kmer_option=false
 93 | quick_mode=false
 94 | clean_mode=false
 95 | 
 96 | #PARSE VARIABLE ARGUMENTS WITH getops
 97 | #common example with letters, for long options check longopts2getopts.sh
 98 | options=":p:P:u:U:o:f:d:a:s:g:k:T:q:cvh" #u, U, d and a are accepted here but have no branch below, so they end in the "*" case
 99 | while getopts $options opt; do
100 | 	case $opt in
101 | 		p )
102 | 			r1_paired_file=$OPTARG
103 | 			;;
104 | 		P )
105 | 			r2_paired_file=$OPTARG
106 | 			;;
107 | 		o )
108 | 			output_dir=$OPTARG
109 | 			;;
110 | 		f )
111 | 			file_name=$OPTARG
112 | 			;;
113 | 		s )
114 | 			sample=$OPTARG
115 | 			;;
116 | 		k)
117 | 			kmer_value+=($OPTARG) #appended to an array, so -k may be given several times
118 | 			kmer_option=true
119 | 			;;
120 | 		q) #declared as "q:", so quick mode takes the reads directory as its argument
121 | 			directory_reads=$OPTARG
122 | 			quick_mode=true
123 | 			;;
124 | 		l) #NOTE(review): "l" is not in the options string, so getopts never yields it
125 | 			minimus_length=$OPTARG
126 | 			;;
127 | 		g)
128 | 			group=$OPTARG
129 | 			;;
130 | 		c)
131 | 			clean_mode=true
132 | 			;;
133 | 		M ) #NOTE(review): "M" is not in the options string, so getopts never yields it
134 | 			max_mem=$OPTARG
135 | 			;;
136 | 		T )
137 | 			threads=$OPTARG
138 | 			;;
139 | 		h )
140 | 		  	usage
141 | 		  	exit 1
142 | 		  	;;
143 | 		v )
144 | 		  	echo $VERSION
145 | 		  	exit 1
146 | 		  	;;
147 | 		\?) #unknown option letter; the leading ":" in the options string selects this silent reporting mode
148 | 			echo "Invalid Option: -$OPTARG" 1>&2
149 | 			usage
150 | 			exit 1
151 | 			;;
152 | 		: ) #an option that needs an argument was given without one
153 |       		echo "Option -$OPTARG requires an argument." >&2
154 |       		exit 1
155 |       		;;
156 |       	* )
157 | 			echo "Unimplemented option: -$OPTARG" >&2;
158 | 			exit 1
159 | 			;;
160 | 
161 | 	esac
162 | done
163 | shift $((OPTIND-1)) #drop the parsed options, leaving any positional arguments
164 | 
165 | 
166 | 
167 | #================================================================
168 | # MAIN_BODY
169 | #================================================================
170 | ## Check dependencies, then resolve the reads directory and the output directory
171 | 
172 | echo -e "\n#Executing" $0 "\n"
173 | 
174 | check_dependencies.sh spades.py
175 | # Expansions below are quoted so that paths containing spaces are not word-split
176 | 
177 | if [ -z "$directory_reads" ]; then
178 | 	directory_reads=$(dirname "$r1_paired_file")
179 | 	echo "Reads directory is" "$directory_reads"
180 | else
181 | 	echo "Reads directory for quick mode is" "$directory_reads"
182 | 	sample_dir=$(dirname "$directory_reads")
183 | 	output_dir="$sample_dir/assembly"  # quick mode replaces any -o value with <parent of reads dir>/assembly
184 | 	mkdir -p "$output_dir"
185 | fi
186 | 
187 | # Without -o (and outside quick mode) the output defaults to <parent of reads dir>/assembly
188 | if [ -z "$output_dir" ]; then
189 | 	sample_dir=$(dirname "$directory_reads")
190 | 	output_dir="$sample_dir/assembly"
191 | 	echo "Default output directory is" "$output_dir"
192 | 	mkdir -p "$output_dir"
193 | else
194 | 	echo "Output directory is" "$output_dir"
195 | 	mkdir -p "$output_dir"
196 | fi
197 | 
198 | # Quick mode (-q): look up both read files inside the reads directory by their "*1_paired"/"*2_paired" suffix
199 | if [ $quick_mode = true ]; then
200 | 	echo "Entering QUICK MODE"
201 | 	r1_paired_file=$(find $directory_reads -name  "*1_paired.fastq.gz" -type f)
202 | 	r2_paired_file=$(find $directory_reads -name  "*2_paired.fastq.gz" -type f)
203 | fi
204 | 
205 | # External helper invoked with both read files (presumably verifies that they exist; confirm in check_mandatory_files.sh)
206 | check_mandatory_files.sh $r1_paired_file $r2_paired_file
207 | # When -k was used, join the collected values with commas; sed removes the trailing comma left by printf
208 | if [ $kmer_option = true ]; then
209 | 	list_kmer_values=$(for value in "${kmer_value[@]}"; do echo "$value"; done)
210 | 	kmer_values_command=$(printf "%s," $list_kmer_values | sed 's/,$//g')
211 | fi
212 | 
213 | # Log the start time and the input files that will be assembled
214 | echo "$(date)"
215 | echo "Assembly:"
216 | echo "R1 paired file = " $r1_paired_file
217 | echo "R2 paired file = " $r2_paired_file
218 | 
219 | # Run spades.py in --careful mode; a non-zero exit status calls error() with the current line number and script name
220 | spades.py \
221 | --careful \
222 | -t $threads \
223 | -k $kmer_values_command \
224 | --pe1-1 $r1_paired_file \
225 | --pe1-2 $r2_paired_file \
226 | -o $output_dir || error ${LINENO} $(basename $0) "Spades command failed. See $output_dir/logs for more information."
227 | 
228 | 
229 | # Report where the results are; in clean mode (-c) delete every first-level subdirectory of the output directory
230 | echo "$(date)"
231 | echo "DONE. Assembled contigs can be found at $output_dir/contigs.fasta:"
232 | echo "DONE. Assembled scaffolds can be found at $output_dir/scaffolds.fasta:"
233 | 
234 | if [ "$clean_mode" = true ]; then
235 | 	echo "Removing unnecesary folders"
236 | 	find "$output_dir" -maxdepth 1 -mindepth 1 -type d -exec rm -rf {} +  # find hands the names straight to rm, so directories with spaces are not word-split
237 | 	echo "DONE removing unwanted folders"
238 | fi
239 | 
240 | echo -e "\n"
241 | 
242 | 
243 | 
244 | 
245 | 
246 | 
247 | 
248 | 
249 | 
250 | 
251 | 
252 | 
253 | 
254 | 
255 | 
256 | 
257 | 
258 | 
259 | 
260 | 
261 | 
262 | 
263 | 
264 | 
265 | 
266 | 


--------------------------------------------------------------------------------
/config_files/annotation_config_file.txt:
--------------------------------------------------------------------------------
 1 | #1. Fasta file for annotation
 2 | #2. Name given to this annotation
 3 | #3. Alignment %Identity necessary to include the sequence
 4 | #4. Alignment %Length necessary to include the sequence
 5 | #5. Query divisor for the sequence name. (ie. For name Inc_NC_632542_protein-description)
 6 | #6. Query field to represent (l:left|r:right) (ie. with divisor "_", left would be "Inc" and right "protein-description")
 7 | #7. Unique. Each sequence will be allowed only once per plasmid
 8 | #8. Double Unique. This field uses a provided separator to extract only the best match. (ie within OXA-11 and OXA-48, using "-" as separator will retrieve only one). Use n if not used.
 9 | #9. Color. Color used to represent this database (blue, green, grey, orange, purple, red, yellow; the prefixes vvl, vl, l, d, vd and vvd combine very (v), light (l) and dark (d), e.g. lred)
10 | 
11 | #DDBBFILE,NAME,P_IDENTITY,P_ALIGNMENT,Q_DIVISOR,Q_SIDE_LR,IS_UNIQUE,DOBLE_UNIQUE,COLOR,
12 | 
13 | #DEFAULTEXAMPLE: Copy and paste next line, change the file name, name of database and color. Remove "#"
14 | #PATH/TO/FILE,NAME,95,90,_,l,n,n,nucl,COLOR
15 | 
16 | #ANTIBIOTIC_RESISTANCE_ANNOTATION
17 | databases/ARGannot.pID.fasta,abr,98,90, ,r,y,-,nucl,lred
18 | #REPLISOME_ANNOTATION
19 | databases/plasmidFinder_01_26_2018.fsa,inc,95,80,_,l,y,n,nucl,lyellow
20 | 


--------------------------------------------------------------------------------
/config_files/circos_summary_1_3_0.conf:
--------------------------------------------------------------------------------
  1 | ######## CIRCOS.CONF
  2 | ####################
  3 | 
  4 | karyotype = PLASMID_KARYOTYPE
  5 | 
  6 | chromosome_units = 1000000
  7 | chromosomes_display_default = yes
  8 | #chromosomes_display_default = no
  9 | #chromosomes = /NZ/
 10 | chromosomes_color = /./ = lblue
 11 | #chromosomes_scale = /./ = 1rn
 12 | #chromosomes_scale = eval(var(size)) < 100000 = 0.5r
 13 | z=100
 14 | 
 15 | 
 16 | #############################HIGHLIGHTS
 17 | <highlights>
 18 | 
 19 | <<include PID_ALL_HIGHLIGHTS>>
 20 | 
 21 | </highlights>
 22 | 
 23 | 
 24 | 
 25 | #############################PLOTS
 26 | <plots>
 27 | 
 28 | ############### COVERAGE
 29 | <plot>
 30 | type = histogram
 31 | file = PLASMID_COVERAGE_GRAPH
 32 | 
 33 | color = black
 34 | r1 = 0.99r
 35 | r0 = 0.90r
 36 | extend_bin = no
 37 | min= 0
 38 | max= 500
 39 | thickness = 2
 40 | orientation = out
 41 | 
 42 | #<backgrounds>
 43 | #show  = data
 44 | #<background>
 45 | #color = vvlgrey
 46 | #</background>
 47 | #</backgrounds>
 48 | 
 49 | <axes>
 50 | 
 51 | <axis>
 52 | thickness = 1
 53 | color = lgrey
 54 | spacing = 50
 55 | </axis>
 56 | 
 57 | </axes>
 58 | 
 59 | <rules>
 60 | 
 61 | 
 62 | 
 63 | <rule>
 64 | condition = var(value) < 20
 65 | color     = lorange
 66 | thickness = 3
 67 | flow      = continue
 68 | </rule>
 69 | 
 70 | <rule>
 71 | condition = var(value) == 0
 72 | color     = red
 73 | thickness = 3
 74 | flow         = continue
 75 | </rule>
 76 | 
 77 | <rule>
 78 | condition = var(value) > 200
 79 | color     = green
 80 | thickness = 3
 81 | </rule>
 82 | 
 83 | </rules>
 84 | 
 85 | </plot>
 86 | 
 87 | ############### /COVERAGE
 88 | 
 89 | 
 90 | ############### TEXT_ADITIONAL_ANNOTATION
 91 | <plot>
 92 | type = text
 93 | color      = black
 94 | label_font = bold
 95 | label_size = 10p
 96 | file = PLASMID_SPECIFIC_TEXT
 97 | r1   = 0.85r+200p
 98 | r0   = 0.80r
 99 | orientation = center
100 | show_links     = no
101 | 
102 | margin = 0u
103 | label_parallel = no
104 | padding  = 1p
105 | rpadding = 2p
106 | label_snuggle             = yes
107 | max_snuggle_distance  = 5r
108 | snuggle_sampling                = 2
109 | snuggle_tolerance               = 1r
110 | snuggle_link_overlap_test      = yes 
111 | snuggle_link_overlap_tolerance = 20p
112 | 
113 | </plot>
114 | ############### /TEXT_ADITIONAL_ANNOTATION
115 | 
116 | ############### TEXT_CDS_CONTIG
117 | <plot>
118 | type = text
119 | color      = black
120 | label_font = default
121 | label_size = 9p
122 | file = PLASMID_CDS_CONTIG
123 | r1   = 0.80r
124 | r0   = 0.75r
125 | orientation = center
126 | show_links     = yes
127 | label_parallel = no
128 | padding  = 0p
129 | label_snuggle             = yes
130 | max_snuggle_distance  = 6r
131 | snuggle_sampling                = 10
132 | snuggle_tolerance               = 1r
133 | snuggle_link_overlap_test      = yes 
134 | snuggle_link_overlap_tolerance = 10p
135 | #snuggle_refine                 = yes
136 | 
137 | #<rules>
138 | #<rule>
139 | #condition  = var(value) =~ /CDS/
140 | #show       = no
141 | #flow = continue
142 | #</rule>
143 | #</rules>
144 | 
145 | 
146 | </plot>
147 | 
148 | ############### /TEXT_CDS_CONTIG
149 | 
150 | ############### CDS_CONTIGS_PROKKA
151 | <plot>
152 | type      = tile
153 | file      = PLASMID_CDS_CONTIG
154 | r1        = 0.75r
155 | r0        = 0.70r
156 | layers    = 3
157 | layers_overflow = collapse
158 | margin    = 10u
159 | thickness = 20
160 | padding   = 10
161 | orientation      = in
162 | stroke_thickness = 1
163 | stroke_color     = vdgrey
164 | color            = purple
165 | #units_ok = bupr
166 | #units_nounit = n
167 | 
168 | </plot>
169 | ############### /CDS_CONTIGS_PROKKA
170 | 
171 | ############### TEXT_CONTIG
172 | <plot>
173 | type = text
174 | #color      = black
175 | label_font = bold
176 | 
177 | label_size = 10p
178 | file = PLASMID_CONTIGS
179 | r1   = 0.70r
180 | r0   = 0.64r
181 | orientation = out
182 | show_links     = yes
183 | label_parallel = yes
184 | padding  = 10p
185 | margin = 10p
186 | label_snuggle             = yes
187 | max_snuggle_distance  = 10r
188 | snuggle_sampling                = 10
189 | snuggle_tolerance               = 1r
190 | snuggle_link_overlap_test      = yes 
191 | snuggle_link_overlap_tolerance = 500p
192 | #snuggle_refine                 = yes
193 | 
194 | <rules>
195 | <rule>
196 | 
197 | condition  = var(id) =~ /(\d+)(\d+)(\d*)/
198 | color      = eval(my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
199 | flow = continue
200 | </rule>
201 | 
202 | <rule>
203 | condition  = var(id) =~ /(\d+)(\d+)(\d*)/
204 | link_color      = eval(my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
205 | flow = continue
206 | </rule>
207 | 
208 | <rule>
209 | condition = var(size) < 1kb
210 | show     = no
211 | </rule>
212 | 
213 | </rules>
214 | </plot>
215 | ############### /TEXT_CONTIG
216 | 
217 | ############### CONTIGS SPADES ALL
218 | <plot>
219 | type      = tile
220 | file      = PLASMID_CONTIGS
221 | r1        = 0.65r
222 | r0        = 0.6r
223 | layers    = 4
224 | margin    = 5u
225 | thickness = 20
226 | padding   = 5
227 | layers_overflow = collapse
228 | orientation      = out
229 | stroke_thickness = 0
230 | stroke_color     = grey
231 | color            = grey
232 | 
233 | <rules>
234 | 
235 | <rule>
236 | 
237 | condition  = var(id) =~ /(\d+)(\d+)(\d*)/
238 | color      = eval( my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
239 | flow = continue
240 | 
241 | #importance = 100
242 | #condition = 1
243 | #color = eval(sprintf("spectral-11-div-%d",remap_int(NODE_%d%d,0,10e6,1,11)))
244 | #color = eval((qw(vvvlgrey vvlgrey vlgrey lgrey grey dgrey vdgrey vvdgrey))[var(id) % 8])
245 | </rule>
246 | 
247 | <rule>
248 | condition = var(size) < 1kb
249 | show     = no
250 | </rule>
251 | 
252 | 
253 | </rules>
254 | </plot>
255 | ############### /CONTIGS SPADES ALL
256 | 
257 | </plots>
258 | 
259 | 
260 | ######## LINKS
261 | ##############
262 | 
263 | <links>
264 | 
265 | <link>
266 | 
267 | file      = PLASMID_LINKS
268 | r1        = 0.50r
269 | r0        = 0r
270 | ribbon        = yes
271 | flat          = yes
272 | radius        = 0.6r
273 | bezier_radius = 0.1r
274 | crest         = 0.2
275 | color         = lgrey_a4
276 | 
277 | <rules>
278 | 
279 | <rule>
280 | condition     = var(intrachr)
281 | show          = no
282 | </rule>
283 | 
284 | <rule>
285 | importance = 110
286 | condition  = var(size1) < 2kb 
287 | show       = no
288 | flow = continue
289 | </rule>
290 | 
291 | <rule>
292 | importance = 110
293 | condition  = var(size2) < 2kb 
294 | show       = no
295 | flow = continue
296 | </rule>
297 | 
298 | <rule>
299 | 
300 | condition  = var(id) =~ /(\d+)(\d+)(\d*)/
301 | color      = eval( my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
302 | #"paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12)
303 | #"set3-12-qual-%d_a%d"
304 | #"rev(set3-12-qual-%d_a%d)"
305 | flow = continue
306 | </rule>
307 | 
308 | <rule>
309 | condition = 1
310 | z         = eval(average(-1*(var(size1),var(size2))))
311 | </rule>
312 | 
313 | </rules>
314 | 
315 | </link>
316 | 
317 | </links>
318 | 
319 | 
320 | ######## IDEOGRAM
321 | #################
322 | <ideogram>
323 | 
324 | show = yes
325 | 
326 | <spacing>
327 | default = 5000u
328 | #when representing without scaling
329 | #default = 1000u
330 | break = 500u
331 | </spacing>
332 | 
333 | chromosomes_color = dblue
334 | stroke_color = blue
335 | 
336 | radius    = 0.93r
337 | thickness = 25p
338 | fill      = yes
339 | 
340 | show_label = yes
341 | 
342 | label_font = bold
343 | label_radius = dims(ideogram,radius_inner)
344 | #(dims(ideogram,radius_inner) + dims(ideogram,radius_outer))/2
345 | 
346 | label_size = 17
347 | label_parallel = yes
348 | 
349 | </ideogram>
350 | 
351 | ######## TICKS
352 | ##############
353 | 
354 | show_ticks = yes
355 | show_tick_labels = yes
356 | 
357 | <ticks>
358 | radius = dims(ideogram,radius_outer)
359 | color = black
360 | thickness = 2p
361 | 
362 | #multiplier = 0.001
363 | 
364 | <tick>
365 | #spacing = 1000u
366 | rspacing       = 0.025
367 | multiplier = 0.001
368 | spacing_type   = relative
369 | skip_first_label = yes
370 | skip_last_label = no
371 | size = 5p
372 | show_label = yes
373 | label_size = 20p
374 | #label_relative = yes
375 | suffix = " kb"
376 | #rdivisor = ideogram
377 | format = %d
378 | rmultiplier    = 1
379 | </tick>
380 | 
381 | 
382 | 
383 | #<tick>
384 | #spacing = 2000u
385 | #size = 15p
386 | #show_label = yes
387 | #label_size = 20p
388 | #labe_offset = 10p
389 | #suffix = " kb"
390 | #format = %d
391 | #</tick>
392 | 
393 | </ticks>
394 | 
395 | ########COLORS
396 | ##############
397 | <<include etc/colors_fonts_patterns.conf>>
398 | 
399 | 
400 | ########HOUSEKEEPING
401 | ####################
402 | <<include etc/housekeeping.conf>>
403 | max_points_per_track* = 8000000
404 | 
405 | ########IMAGE
406 | #############
407 | <image>
408 | dir   = OUTPUTDIR
409 | #dir  = conf(configdir)
410 | file  = IMAGENAME
411 | png   = yes
412 | svg   = no
413 | # radius of inscribed circle in image
414 | radius         = 1900p
415 | # by default angle=0 is at 3 o'clock position
416 | angle_offset      = -90
417 | #angle_orientation = counterclockwise
418 | auto_alpha_colors = yes
419 | auto_alpha_steps  = 5
420 | </image>
421 | 
422 | 


--------------------------------------------------------------------------------
/config_files/circos_summary_1_3_3.conf:
--------------------------------------------------------------------------------
  1 | ######## CIRCOS.CONF
  2 | ####################
  3 | 
  4 | karyotype = PLASMID_KARYOTYPE
  5 | 
  6 | chromosome_units = 1000000
  7 | chromosomes_display_default = yes
  8 | #chromosomes_display_default = no
  9 | #chromosomes = /NZ/
 10 | chromosomes_color = /./ = lblue
 11 | #chromosomes_scale = /./ = 1rn
 12 | #chromosomes_scale = eval(var(size)) < 100000 = 0.5r
 13 | z=100
 14 | 
 15 | 
 16 | #############################HIGHLIGHTS
 17 | <highlights>
 18 | 
 19 | <<include PID_ALL_HIGHLIGHTS>>
 20 | r1        = 0.90r
 21 | r0        = 0.75r
 22 | </highlights>
 23 | 
 24 | 
 25 | 
 26 | #############################PLOTS
 27 | <plots>
 28 | 
 29 | ############### COVERAGE
 30 | <plot>
 31 | type = histogram
 32 | file = PLASMID_COVERAGE_GRAPH
 33 | 
 34 | color = black
 35 | r1 = 0.99r
 36 | r0 = 0.90r
 37 | extend_bin = no
 38 | min= 0
 39 | max= 500
 40 | thickness = 2
 41 | orientation = out
 42 | 
 43 | #<backgrounds>
 44 | #show  = data
 45 | #<background>
 46 | #color = vvlgrey
 47 | #</background>
 48 | #</backgrounds>
 49 | 
 50 | <axes>
 51 | 
 52 | <axis>
 53 | thickness = 1
 54 | color = lgrey
 55 | spacing = 50
 56 | </axis>
 57 | 
 58 | </axes>
 59 | 
 60 | <rules>
 61 | 
 62 | 
 63 | 
 64 | <rule>
 65 | condition = var(value) < 20
 66 | color     = lorange
 67 | thickness = 3
 68 | flow      = continue
 69 | </rule>
 70 | 
 71 | <rule>
 72 | condition = var(value) == 0
 73 | color     = red
 74 | thickness = 3
 75 | flow         = continue
 76 | </rule>
 77 | 
 78 | <rule>
 79 | condition = var(value) > 200
 80 | color     = green
 81 | thickness = 3
 82 | </rule>
 83 | 
 84 | </rules>
 85 | 
 86 | </plot>
 87 | 
 88 | ############### /COVERAGE
 89 | 
 90 | 
 91 | ############### TEXT_ADITIONAL_ANNOTATION
 92 | <plot>
 93 | type = text
 94 | color      = black
 95 | label_font = bold
 96 | label_size = 10p
 97 | file = PLASMID_SPECIFIC_TEXT
 98 | r1   = 0.85r+200p
 99 | r0   = 0.82r
100 | orientation = center
101 | show_links     = no
102 | 
103 | margin = 0u
104 | label_parallel = no
105 | padding  = 1p
106 | rpadding = 2p
107 | label_snuggle             = yes
108 | max_snuggle_distance  = 5r
109 | snuggle_sampling                = 2
110 | snuggle_tolerance               = 1r
111 | snuggle_link_overlap_test      = yes
112 | snuggle_link_overlap_tolerance = 20p
113 | 
114 | </plot>
115 | ############### /TEXT_ADITIONAL_ANNOTATION
116 | 
117 | ############### TEXT_CDS_CONTIG
118 | <plot>
119 | type = text
120 | color      = black
121 | label_font = default
122 | label_size = 9p
123 | file = PLASMID_CDS_CONTIG
124 | r1   = 0.80r
125 | r0   = 0.75r
126 | orientation = center
127 | show_links     = yes
128 | link_dims      = 8p,8p,10p,8p,8p
129 | link_color      = purple
130 | label_parallel = no
131 | padding  = 0p
132 | label_snuggle             = yes
133 | max_snuggle_distance  = 6r
134 | snuggle_sampling                = 10
135 | snuggle_tolerance               = 1r
136 | snuggle_link_overlap_test      = yes
137 | snuggle_link_overlap_tolerance = 10p
138 | #snuggle_refine                 = yes
139 | 
140 | #<rules>
141 | #<rule>
142 | #condition  = var(value) =~ /CDS/
143 | #show       = no
144 | #flow = continue
145 | #</rule>
146 | #</rules>
147 | 
148 | 
149 | </plot>
150 | 
151 | ############### /TEXT_CDS_CONTIG
152 | 
153 | ############### CDS_CONTIGS_PROKKA
154 | <plot>
155 | type      = tile
156 | file      = PLASMID_CDS_FORWARD
157 | r1        = 0.80r
158 | r0        = 0.75r
159 | layers    = 3
160 | layers_overflow = grow
161 | margin    = 0.001u
162 | thickness = 20p
163 | padding   = 0p
164 | rpadding   = 0p
165 | orientation = out
166 | stroke_thickness = 1
167 | stroke_color     = dgrey
168 | color            = dpurple
169 | </plot>
170 | 
171 | 
172 | <plot>
173 | r1        = 0.75r
174 | r0        = 0.75r
175 | <axes>
176 | <axis>
177 | position  = 0.75r
178 | color     = dgrey
179 | thickness = 2
180 | </axis>
181 | </axes>
182 | </plot>
183 | 
184 | <plot>
185 | type      = tile
186 | file      = PLASMID_CDS_REVERSE
187 | r1        = 0.75r
188 | r0        = 0.70r
189 | layers    = 3
190 | layers_overflow = grow
191 | margin    = 0.001u
192 | thickness = 20p
193 | padding   = 0p
194 | rpadding   = 0p
195 | orientation = in
196 | stroke_thickness = 1
197 | stroke_color     = dgrey
198 | color            = lpurple
199 | 
200 | </plot>
201 | ############### /CDS_CONTIGS_PROKKA
202 | 
203 | ############### TEXT_CONTIG
204 | <plot>
205 | type = text
206 | #color      = black
207 | label_font = bold
208 | 
209 | label_size = 10p
210 | file = PLASMID_CONTIGS
211 | r1   = 0.70r
212 | r0   = 0.64r
213 | orientation = out
214 | show_links     = yes
215 | label_parallel = yes
216 | padding  = 10p
217 | margin = 10p
218 | label_snuggle = yes
219 | max_snuggle_distance  = 10r
220 | snuggle_sampling                = 10
221 | snuggle_tolerance               = 1r
222 | snuggle_link_overlap_test      = yes
223 | snuggle_link_overlap_tolerance = 500p
224 | #snuggle_refine                 = yes
225 | 
226 | <rules>
227 | <rule>
228 | 
229 | condition  = var(id) =~ /(\d+)(\d+)(\d*)/
230 | color      = eval(my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
231 | flow = continue
232 | </rule>
233 | 
234 | <rule>
235 | condition  = var(id) =~ /(\d+)(\d+)(\d*)/
236 | link_color      = eval(my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
237 | flow = continue
238 | </rule>
239 | 
240 | <rule>
241 | condition = var(size) < 1kb
242 | show     = no
243 | </rule>
244 | 
245 | </rules>
246 | </plot>
247 | ############### /TEXT_CONTIG
248 | 
249 | ############### CONTIGS SPADES ALL
250 | <plot>
251 | type      = tile
252 | file      = PLASMID_CONTIGS
253 | r1        = 0.65r
254 | r0        = 0.60r
255 | layers    = 4
256 | margin    = 5u
257 | thickness = 20
258 | padding   = 5
259 | layers_overflow = collapse
260 | orientation      = out
261 | stroke_thickness = 0
262 | stroke_color     = grey
263 | color            = grey
264 | 
265 | <rules>
266 | 
267 | <rule>
268 | 
269 | condition  = var(id) =~ /(\d+)(\d+)(\d*)/
270 | color      = eval( my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
271 | flow = continue
272 | 
273 | #importance = 100
274 | #condition = 1
275 | #color = eval(sprintf("spectral-11-div-%d",remap_int(NODE_%d%d,0,10e6,1,11)))
276 | #color = eval((qw(vvvlgrey vvlgrey vlgrey lgrey grey dgrey vdgrey vvdgrey))[var(id) % 8])
277 | </rule>
278 | 
279 | <rule>
280 | condition = var(size) < 1kb
281 | show     = no
282 | </rule>
283 | 
284 | 
285 | </rules>
286 | </plot>
287 | ############### /CONTIGS SPADES ALL
288 | 
289 | </plots>
290 | 
291 | 
292 | ######## LINKS
293 | ##############
294 | 
295 | <links>
296 | 
297 | <link>
298 | 
299 | file      = PLASMID_LINKS
300 | r1        = 0.50r
301 | r0        = 0r
302 | ribbon        = yes
303 | flat          = yes
304 | radius        = 0.6r
305 | bezier_radius = 0.1r
306 | crest         = 0.2
307 | color         = lgrey_a4
308 | 
309 | <rules>
310 | 
311 | <rule>
312 | condition     = var(intrachr)
313 | show          = no
314 | </rule>
315 | 
316 | <rule>
317 | importance = 110
318 | condition  = var(size1) < 2kb
319 | show       = no
320 | flow = continue
321 | </rule>
322 | 
323 | <rule>
324 | importance = 110
325 | condition  = var(size2) < 2kb
326 | show       = no
327 | flow = continue
328 | </rule>
329 | 
330 | <rule>
331 | 
332 | condition  = var(id) =~ /(\d+)(\d+)(\d*)/
333 | color      = eval( my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
334 | #"paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12)
335 | #"set3-12-qual-%d_a%d"
336 | #"rev(set3-12-qual-%d_a%d)"
337 | flow = continue
338 | </rule>
339 | 
340 | <rule>
341 | condition = 1
342 | z         = eval(average(-1*(var(size1),var(size2))))
343 | </rule>
344 | 
345 | </rules>
346 | 
347 | </link>
348 | 
349 | </links>
350 | 
351 | 
352 | ######## IDEOGRAM
353 | #################
354 | <ideogram>
355 | 
356 | show = yes
357 | 
358 | <spacing>
359 | default = 5000u
360 | #when representing without scaling
361 | #default = 1000u
362 | break = 500u
363 | </spacing>
364 | 
365 | chromosomes_color = dblue
366 | stroke_color = blue
367 | 
368 | radius    = 0.93r
369 | thickness = 25p
370 | fill      = yes
371 | 
372 | show_label = yes
373 | 
374 | label_font = bold
375 | label_radius = dims(ideogram,radius_inner)
376 | #(dims(ideogram,radius_inner) + dims(ideogram,radius_outer))/2
377 | 
378 | label_size = 17
379 | label_parallel = yes
380 | 
381 | </ideogram>
382 | 
383 | ######## TICKS
384 | ##############
385 | 
386 | show_ticks = yes
387 | show_tick_labels = yes
388 | 
389 | <ticks>
390 | radius = dims(ideogram,radius_outer)
391 | color = black
392 | thickness = 2p
393 | 
394 | #multiplier = 0.001
395 | 
396 | <tick>
397 | #spacing = 1000u
398 | rspacing       = 0.025
399 | multiplier = 0.001
400 | spacing_type   = relative
401 | skip_first_label = yes
402 | skip_last_label = no
403 | size = 5p
404 | show_label = yes
405 | label_size = 20p
406 | #label_relative = yes
407 | suffix = " kb"
408 | #rdivisor = ideogram
409 | format = %d
410 | rmultiplier    = 1
411 | </tick>
412 | 
413 | 
414 | 
415 | #<tick>
416 | #spacing = 2000u
417 | #size = 15p
418 | #show_label = yes
419 | #label_size = 20p
420 | #labe_offset = 10p
421 | #suffix = " kb"
422 | #format = %d
423 | #</tick>
424 | 
425 | </ticks>
426 | 
427 | ########COLORS
428 | ##############
429 | <<include etc/colors_fonts_patterns.conf>>
430 | 
431 | 
432 | ########HOUSEKEEPING
433 | ####################
434 | <<include etc/housekeeping.conf>>
435 | max_points_per_track* = 8000000
436 | max_ideograms*=1000
437 | ########IMAGE
438 | #############
439 | <image>
440 | dir   = OUTPUTDIR
441 | #dir  = conf(configdir)
442 | file  = IMAGENAME
443 | png   = yes
444 | svg   = no
445 | # radius of inscribed circle in image
446 | radius         = 1900p
447 | # by default angle=0 is at 3 o'clock position
448 | angle_offset      = -90
449 | #angle_orientation = counterclockwise
450 | auto_alpha_colors = yes
451 | auto_alpha_steps  = 5
452 | </image>
453 | 
454 | 


--------------------------------------------------------------------------------
/config_files/simple.conf:
--------------------------------------------------------------------------------
  1 | ######## CIRCOS.CONF
  2 | ####################
  3 | 
  4 | karyotype = PLASMID_KARYOTYPE
  5 | 
  6 | chromosome_units = 1000000
  7 | chromosomes_display_default = no
  8 | chromosomes = SAMPLE_SHOWN
  9 | chromosomes_color = /./ = lblue
 10 | z=100
 11 | 
 12 | #<zooms>
 13 | #<zoom>
 14 | #chr    = NZ_CP018342.1
 15 | #start  = 30000u
 16 | #end    = 52000u
 17 | #scale  = 15
 18 | 
 19 | #smooth_distance = 10r
 20 | #smooth_steps    = 5
 21 | 
 22 | #</zoom>
 23 | #</zooms>
 24 | 
 25 | #############################HIGHLIGHTS
 26 | <highlights>
 27 | 
 28 | <<include PID_ALL_HIGHLIGHTS>>
 29 | 
 30 | </highlights>
 31 | 
 32 | ########################################PLOTS
 33 | <plots>
 34 | 
 35 | ############### COVERAGE
 36 | <plot>
 37 | type = histogram
 38 | file = PLASMID_COVERAGE_GRAPH
 39 | 
 40 | color = black
 41 | r1 = 0.99r
 42 | r0 = 0.90r
 43 | extend_bin = no
 44 | min= 0
 45 | max= 500
 46 | thickness = 2
 47 | orientation = out
 48 | 
 49 | <axes>
 50 | 
 51 | <axis>
 52 | thickness = 1
 53 | color = lgrey
 54 | spacing = 50
 55 | </axis>
 56 | 
 57 | </axes>
 58 | 
 59 | <rules>
 60 | 
 61 | <rule>
 62 | condition = var(value) < 20
 63 | color     = lorange
 64 | thickness = 3
 65 | flow      = continue
 66 | </rule>
 67 | 
 68 | <rule>
 69 | condition = var(value) == 0
 70 | color     = red
 71 | thickness = 3
 72 | flow         = continue
 73 | </rule>
 74 | 
 75 | <rule>
 76 | condition = var(value) > 200
 77 | color     = green
 78 | thickness = 3
 79 | </rule>
 80 | 
 81 | </rules>
 82 | 
 83 | </plot>
 84 | ############### /COVERAGE
 85 | 
 86 | ############### TEXT_ADITIONAL_ANNOTATION
 87 | <plot>
 88 | type = text
 89 | color      = black
 90 | label_font = bold
 91 | label_size = 30p
 92 | file = PLASMID_SPECIFIC_TEXT
 93 | r1   = 0.85r+200p
 94 | r0   = 0.74r
 95 | orientation = center
 96 | show_links     = no
 97 | 
 98 | margin = 0u
 99 | label_parallel = no
100 | padding  = 1p
101 | rpadding = 2p
102 | label_snuggle             = yes
103 | max_snuggle_distance  = 5r
104 | snuggle_sampling                = 2
105 | snuggle_tolerance               = 1r
106 | snuggle_link_overlap_test      = yes
107 | snuggle_link_overlap_tolerance = 20p
108 | 
109 | </plot>
110 | ############### /TEXT_ADITIONAL_ANNOTATION
111 | 
112 | 
113 | ############### TEXT_CDS_CONTIG
114 | <plot>
115 | 
116 | type = text
117 | color      = black
118 | label_font = default
119 | label_size = 42p
120 | file = PLASMID_CDS_CONTIG
121 | r1   = 0.70r+200p
122 | r0   = 0.70r
123 | orientation = center
124 | show_links     = yes
125 | link_dims      = 8p,8p,30p,8p,8p
126 | link_color     = grey
127 | 
128 | label_parallel = no
129 | padding  = 0p
130 | label_snuggle             = yes
131 | max_snuggle_distance  = 6r
132 | snuggle_sampling                = 10
133 | snuggle_tolerance               = 1r
134 | snuggle_link_overlap_test      = yes
135 | snuggle_link_overlap_tolerance = 10p
136 | 
137 | #<rules>
138 | #<rule>
139 | #condition  = var(value) eq "cds"
140 | #label_size = 7p
141 | #show = no
142 | #flow = continue
143 | #</rule>
144 | #</rules>
145 | 
146 | </plot>
147 | ############### /TEXT_CDS_CONTIG
148 | 
149 | ############### CDS_CONTIGS_PROKKA
150 | <plot>
151 | type      = tile
152 | file      = PLASMID_CDS_FORWARD
153 | r1        = 0.73r
154 | r0        = 0.70r
155 | layers    = 2
156 | layers_overflow = grow
157 | margin    = 0.001u
158 | thickness = 30p
159 | padding   = 0p
160 | rpadding   = 0p
161 | orientation = out
162 | stroke_thickness = 1
163 | stroke_color     = vvdgrey
164 | color            = dgrey
165 | </plot>
166 | 
167 | <plot>
168 | r1        = 0.70r
169 | r0        = 0.70r
170 | <axes>
171 | <axis>
172 | position  = 0.70r
173 | color     = dgrey
174 | thickness = 2
175 | </axis>
176 | </axes>
177 | </plot>
178 | 
179 | <plot>
180 | type      = tile
181 | file      = PLASMID_CDS_REVERSE
182 | r1        = 0.70r
183 | r0        = 0.67r
184 | layers    = 2
185 | layers_overflow = grow
186 | margin    = 0.001u
187 | thickness = 30p
188 | padding   = 0p
189 | rpadding   = 0p
190 | orientation = in
191 | stroke_thickness = 1
192 | stroke_color     = dgrey
193 | color            = lgrey
194 | 
195 | </plot>
196 | ############### /CDS_CONTIGS_PROKKA
197 | 
198 | 
199 | ############### TEXT_CONTIG
200 | <plot>
201 | type = text
202 | label_font = bold
203 | label_size = 20p
204 | file = PLASMID_CONTIGS
205 | r1   = 0.60r+100p
206 | r0   = 0.60r
207 | orientation = out
208 | show_links     = yes
209 | label_parallel = yes
210 | padding  = 5p
211 | rpadding = 2p
212 | margin = 15p
213 | label_snuggle             = yes
214 | max_snuggle_distance  = 10r
215 | snuggle_sampling                = 10
216 | snuggle_tolerance               = 5r
217 | snuggle_link_overlap_test      = yes
218 | snuggle_link_overlap_tolerance = 3p
219 | 
220 | <rules>
221 | 
222 | <rule>
223 | condition  = var(value) =~ /(\d+)(\d+)(\d*)/
224 | color      = eval(my @match = "var(value)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
225 | flow = continue
226 | </rule>
227 | 
228 | <rule>
229 | condition  = var(value) =~ /(\d+)(\d+)(\d*)/
230 | link_color      = eval(my @match = "var(value)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
231 | flow = continue
232 | </rule>
233 | 
234 | <rule>
235 | condition = var(size) < 0.2kb
236 | show     = no
237 | </rule>
238 | 
239 | </rules>
240 | </plot>
241 | ############### /TEXT_CONTIG
242 | 
243 | ############### CONTIGS SPADES ALL
244 | <plot>
245 | type      = tile
246 | file      = PLASMID_CONTIGS
247 | r1        = 0.60r
248 | r0        = 0.50r
249 | layers    = 5
250 | margin    = 5u
251 | thickness = 40
252 | padding   = 5
253 | layers_overflow = collapse
254 | orientation      = in
255 | stroke_thickness = 0
256 | stroke_color     = grey
257 | color            = grey
258 | 
259 | <rules>
260 | # Tile colour: builds the name paired-12-qual-N_aM from the digit groups captured in the value; grey (above) is the fallback.
261 | <rule>
262 | condition  = var(value) =~ /(\d+)(\d+)(\d*)/
263 | color      = eval( my @match = "var(value)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
264 | flow = continue
265 | </rule>
266 | 
267 | # Hide tiles whose size is below 0.2kb.
268 | <rule>
269 | condition = var(size) < 0.2kb
270 | show     = no
271 | </rule>
272 | 
273 | 
274 | </rules>
275 | </plot>
276 | ############### /CONTIGS SPADES ALL
277 | 
278 | 
279 | 
280 | </plots>
281 | 
282 | 
283 | 
284 | ######## IDEOGRAM
285 | #################
286 | <ideogram>
287 | z=5000
288 | show = yes
289 | 
290 | <spacing>
291 | default = 10u
292 | #when representing without scaling
293 | #default = 1000u
294 | break = 10u
295 | </spacing>
296 | 
297 | chromosomes_color = dblue
298 | stroke_color = blue
299 | 
300 | radius    = 0.93r
301 | thickness = 30p
302 | fill      = yes
303 | 
304 | show_label = yes
305 | label_color    = dgrey
306 | label_center   = yes
307 | label_font = bold
308 | label_radius = 0.1r
309 | #label_radius = dims(ideogram,radius_inner)
310 | #(dims(ideogram,radius_inner) + dims(ideogram,radius_outer))/2
311 | 
312 | label_size = 50
313 | label_parallel = yes
314 | 
315 | </ideogram>
316 | 
317 | ######## TICKS
318 | ##############
319 | 
320 | show_ticks = yes
321 | show_tick_labels = yes
322 | 
323 | <ticks>
324 | radius = dims(ideogram,radius_outer)
325 | color = black
326 | thickness = 2p
327 | label_offset = 0p
328 | # The settings above are shared defaults for the <tick> blocks below.
329 | #multiplier = 0.001
330 | # Active tick: relative spacing (rspacing), labels scaled by 0.001 and suffixed " kb".
331 | <tick>
332 | #spacing = 1000u
333 | rspacing       = 0.025
334 | multiplier = 0.001
335 | spacing_type   = relative
336 | skip_first_label = yes
337 | skip_last_label = no
338 | size = 5p
339 | show_label = yes
340 | label_size = 20p
341 | #label_relative = yes
342 | suffix = " kb"
343 | #rdivisor = ideogram
344 | format = %d
345 | rmultiplier    = 1
346 | </tick>
347 | 
348 | # Disabled alternative: absolute spacing every 2000u.
349 | #<tick>
350 | #spacing = 2000u
351 | #multiplier = 0.001
352 | #size = 5p
353 | #show_label = yes
354 | #skip_first_label = yes
355 | #label_size = 15p
356 | #label_offset = 0p
357 | #suffix = " kb"
358 | #format = %d
359 | #</tick>
360 | 
361 | </ticks>
362 | 
363 | ########COLORS
364 | ##############
365 | <<include etc/colors_fonts_patterns.conf>>
366 | 
367 | 
368 | ########HOUSEKEEPING
369 | ####################
370 | <<include etc/housekeeping.conf>>
371 | max_points_per_track* = 8000000
372 | max_ideograms*=1000
373 | 
374 | ########IMAGE
375 | #############
376 | <image>
377 | dir   = OUTPUTDIR
378 | #dir  = conf(configdir)
379 | file  = IMAGENAME_SAMPLE_PLASMID
380 | png   = yes
381 | svg   = no
382 | # radius of inscribed circle in image
383 | radius         = 1900p
384 | # by default angle=0 is at 3 o'clock position
385 | angle_offset      = -90
386 | #angle_orientation = counterclockwise
387 | auto_alpha_colors = yes
388 | auto_alpha_steps  = 5
389 | </image>
390 | 


--------------------------------------------------------------------------------
/documents/ECCMID plasmidID 2018.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/documents/ECCMID plasmidID 2018.pdf


--------------------------------------------------------------------------------
/documents/Istall_dependencies.md:
--------------------------------------------------------------------------------
 1 | # Trimmomatic
 2 | - wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.38.zip
 3 | - unzip Trimmomatic-0.38.zip
 4 | - copy to /opt/Trimmomatic or use trimmomatic-dir PATH/TO/Trimmomatic-0.38
 5 | 
 6 | # SPAdes
 7 | 
 8 | - wget http://cab.spbu.ru/files/release3.12.0/SPAdes-3.12.0-Linux.tar.gz
 9 | - tar -xzf SPAdes-3.12.0-Linux.tar.gz
10 | - Add to PATH SPAdes-3.12.0-Linux/bin/
11 | 
12 | # Blast+
13 | 
14 | - sudo apt-get install ncbi-blast+
15 | 
16 | # Bowtie2
17 | 
18 | - sudo apt install bowtie2
19 | 
20 | # Cd-hit-est
21 | 
22 | - sudo apt-get install cd-hit
23 | 
24 | # Bedtools
25 | 
26 | - sudo apt install bedtools
27 | 
28 | # Prokka
29 | 
30 | - sudo apt-get install libdatetime-perl libxml-simple-perl libdigest-md5-perl git default-jre bioperl
31 | - sudo cpan Bio::Perl
32 | - git clone https://github.com/tseemann/prokka.git $HOME/prokka
33 | - $HOME/prokka/bin/prokka --setupdb
34 | - Add $HOME/prokka/bin/ to PATH
35 | 
36 | # Circos
37 | 
38 | 
39 | - wget http://www.circos.ca/distribution/circos-0.69-6.tgz
40 | - tar xvfz circos-0.69-6.tgz
41 | - sudo apt-get -y install libgd2-xpm-dev
42 | - Add circos-0.69-6/bin to PATH
43 | - sudo sed -i 's/max_points_per_track = 25000/max_points_per_track = 20000000/g' /opt/circos-0.69-6/etc/housekeeping.conf
44 | 
45 | 
46 | 
47 | 
48 | 
49 | 
50 | ## g++
51 | - sudo apt-get install build-essential
52 | ## libz.h
53 | - sudo apt-get install libz-dev
54 | ## circos dependencies
55 | - sudo apt install circos
56 | 


--------------------------------------------------------------------------------
/documents/PlasmidID_IWBBIO.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/documents/PlasmidID_IWBBIO.pdf


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
 1 | name: plasmidID
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python>=3.6
 8 |   - bioconda::perl-gd>=2.71
 9 |   - bioconda::bowtie2
10 |   - bioconda::bedtools
11 |   - bioconda::samtools
12 |   - bioconda::mash>=2
13 |   - bioconda::circos
14 |   - bioconda::prokka>=1.14
15 |   - bioconda::blast
16 |   - bioconda::spades
17 |   - bioconda::trimmomatic
18 |   - tbb==2020.2
19 |   - conda-forge::gawk
20 |   - conda-forge::biopython
21 |   - conda-forge::numpy
22 |   - conda-forge::pandas
23 |   - conda-forge::scikit-learn
24 |   - conda-forge::scipy
25 |   - conda-forge::tabulate
26 |   - conda-forge::wget
27 |   - conda-forge::bc
28 | 


--------------------------------------------------------------------------------
/img/01_plasmid_track.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/01_plasmid_track.png


--------------------------------------------------------------------------------
/img/02_mapping_track.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/02_mapping_track.png


--------------------------------------------------------------------------------
/img/03_annotation_track.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/03_annotation_track.png


--------------------------------------------------------------------------------
/img/04_contig_track.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/04_contig_track.png


--------------------------------------------------------------------------------
/img/05_01_complete_contig_track.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/05_01_complete_contig_track.png


--------------------------------------------------------------------------------
/img/05_complete_contig_track.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/05_complete_contig_track.png


--------------------------------------------------------------------------------
/img/Alignment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Alignment.png


--------------------------------------------------------------------------------
/img/Annotation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Annotation.png


--------------------------------------------------------------------------------
/img/Clustering_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Clustering_2.png


--------------------------------------------------------------------------------
/img/Mapping.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Mapping.png


--------------------------------------------------------------------------------
/img/Overlap_examples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Overlap_examples.png


--------------------------------------------------------------------------------
/img/PIPELNE TFM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/PIPELNE TFM.png


--------------------------------------------------------------------------------
/img/SEN30_000195995_K00826.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_K00826.1.png


--------------------------------------------------------------------------------
/img/SEN30_000195995_NC_002305.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NC_002305.1.png


--------------------------------------------------------------------------------
/img/SEN30_000195995_NC_003384.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NC_003384.1.png


--------------------------------------------------------------------------------
/img/SEN30_000195995_NC_003385.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NC_003385.1.png


--------------------------------------------------------------------------------
/img/SEN30_000195995_NC_009981.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NC_009981.1.png


--------------------------------------------------------------------------------
/img/SEN30_000195995_NC_013365.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NC_013365.1.png


--------------------------------------------------------------------------------
/img/SEN30_000195995_NZ_LT883154.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NZ_LT883154.1.png


--------------------------------------------------------------------------------
/img/SEN30_000195995_NZ_LT904853.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NZ_LT904853.1.png


--------------------------------------------------------------------------------
/img/SEN30_000195995_NZ_LT904874.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NZ_LT904874.1.png


--------------------------------------------------------------------------------
/img/SEN30_000195995_NZ_LT904880.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NZ_LT904880.1.png


--------------------------------------------------------------------------------
/img/SEN30_000195995_NZ_LT904895.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NZ_LT904895.1.png


--------------------------------------------------------------------------------
/img/SEN_summary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN_summary.png


--------------------------------------------------------------------------------
/img/SEN_summary_numbers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN_summary_numbers.png


--------------------------------------------------------------------------------
/img/Short_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Short_pipeline.png


--------------------------------------------------------------------------------
/img/Visualization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Visualization.png


--------------------------------------------------------------------------------
/img/isciii_logo.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/isciii_logo.jpeg


--------------------------------------------------------------------------------
/img/pipeline_pID.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/pipeline_pID.png


--------------------------------------------------------------------------------
/img/plasmidID_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/plasmidID_logo.png


--------------------------------------------------------------------------------
/img/summary_image_1_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/summary_image_1_3.png


--------------------------------------------------------------------------------
/img/summary_image_2_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/summary_image_2_3.png


--------------------------------------------------------------------------------
/img/summary_image_3_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/summary_image_3_3.png


--------------------------------------------------------------------------------
/test/KPN_TEST_R1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/test/KPN_TEST_R1.fastq.gz


--------------------------------------------------------------------------------
/test/KPN_TEST_R2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/test/KPN_TEST_R2.fastq.gz


--------------------------------------------------------------------------------
/test/test.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
  4 | #or a compound command returns a non-zero status: If errors are not handled by user
  5 | set -e
  6 | # Treat unset variables and parameters other than the special parameters ‘@’ or ‘*’ as an error when performing parameter expansion (set -u; currently not enabled in this script).
  7 | 
  8 | #Print everything as if it were executed, after substitution and expansion is applied: Debug|log option
  9 | #set -x
 10 | 
 11 | #=============================================================
 12 | # HEADER
 13 | #=============================================================
 14 | 
 15 | #INSTITUTION:ISCIII
 16 | #CENTRE:BU-ISCIII
 17 | #AUTHOR: Pedro J. Sola (pedroscampoy@gmail.com)
 18 | VERSION=1.6.3
 19 | #CREATED: 15 March 2018
 20 | #
 21 | #ACKNOWLEDGE: longopts2getopts.sh: https://gist.github.com/adamhotep/895cebf290e95e613c006afbffef09d7
 22 | #
 23 | #DESCRIPTION: test.sh uses test data for testing plasmidID installation.
 24 | #
 25 | #
 26 | #================================================================
 27 | # END_OF_HEADER
 28 | #================================================================
 29 | 
 30 | # usage: print the help text for this test script to stdout.
 31 | # Takes no arguments; $0 is expanded inside the here-document.
 32 | usage() {
 33 | 	cat << EOF
 34 | 
 35 | plasmidID is a computational pipeline that reconstructs and annotates the most likely plasmids present in one sample
 36 | 
 37 | usage : $0
 38 | 
 39 | 	-v | --version		version
 40 | 	-h | --help		display usage message
 41 | 
 42 | example: ./test.sh
 43 | 
 44 | EOF
 45 | }
 46 | 
 47 | #================================================================
 48 | # OPTION_PROCESSING
 49 | #================================================================
 50 | # Error handling
 51 | error(){
 52 |   local parent_lineno="$1"
 53 |   local script="$2"
 54 |   local message="$3"
 55 |   local code="${4:-1}"
 56 | 
 57 | 	RED='\033[0;31m'
 58 | 	NC='\033[0m'
 59 | 
 60 |   if [[ -n "$message" ]] ; then
 61 |     echo -e "\n---------------------------------------\n"
 62 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 63 |     echo -e "MESSAGE:\n"
 64 |     echo -e "$message"
 65 |     echo -e "\n---------------------------------------\n"
 66 |   else
 67 |     echo -e "\n---------------------------------------\n"
 68 |     echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
 69 |     echo -e "\n---------------------------------------\n"
 70 |   fi
 71 | 
 72 |   exit "${code}"
 73 | }
 74 | 
 75 | # Translate long options to their short getopts equivalents (--help -> -h, --version -> -v).
 76 | reset=true
 77 | for arg in "$@"
 78 | do
 79 |     if [ -n "$reset" ]; then   # true on the first iteration only
 80 |       unset reset
 81 |       set --      # this resets the "$@" array so we can rebuild it
 82 |     fi
 83 |     case "$arg" in
 84 |        	--help)    	set -- "$@" -h ;;
 85 |        	--version) 	set -- "$@" -v ;;
 86 |        # pass through anything else unchanged
 87 |        *)         set -- "$@" "$arg" ;;
 88 |     esac
 89 | done
 90 | 
 91 | # Test inputs; file names are later prefixed with the directory that holds this script.
 92 | script_dir=$(dirname "$(readlink -f "$0")")   # quoted so paths containing whitespace survive
 93 | R1=KPN_TEST_R1.fastq.gz
 94 | R2=KPN_TEST_R2.fastq.gz
 95 | database=plasmids_TEST_database.fasta
 96 | contigs=contigs_KPN_TEST.fasta
 97 | 
 98 | # Parse options with getopts; long options were already mapped to short ones above.
 99 | # Only -h and -v are implemented: any other letter listed in $options is reported as unimplemented.
100 | options=":1:2:d:s:g:c:a:i:o:C:S:f:l:L:T:M:X:y:Y:RVtvh"
101 | while getopts "$options" opt; do
102 |   case "$opt" in
103 |     h )
104 |       usage
105 |       exit 0    # asking for help is not an error
106 |       ;;
107 |     v )
108 |       echo "$VERSION"
109 |       exit 0    # printing the version is not an error
110 |       ;;
111 |     \?)
112 |       echo "Invalid Option: -$OPTARG" 1>&2
113 |       usage
114 |       exit 1
115 |       ;;
116 |     : )
117 |       echo "Option -$OPTARG requires an argument." >&2
118 |       exit 1
119 |       ;;
120 |     * )
121 |       # For a recognised option, $opt holds the letter; $OPTARG would be its argument (or empty).
122 |       echo "Unimplemented option: -$opt" >&2
123 |       exit 1
124 |       ;;
125 |   esac
126 | done
127 | shift $((OPTIND-1))
128 | 
129 | ## Execute plasmidID with test data.
130 | echo "Executing: ../plasmidID -1 $R1 -2 $R2 -d $database -c $contigs -s KPN --no-trim"
131 | echo "Forward reads: $R1"
132 | echo "Reverse reads: $R2"
133 | echo "PlasmidDatabase: $database"
134 | echo "Contigs: $contigs"
135 | echo "Options: --no-trim"
136 | 
137 | # Put the pipeline's bin directory on PATH for this run; exported directly instead of
138 | # being written to a scratch file in the current directory and sourced.
139 | export PATH="$PATH:$script_dir/../bin"
140 | "$script_dir/../plasmidID" -1 "$script_dir/$R1" -2 "$script_dir/$R2" -d "$script_dir/$database" -c "$script_dir/$contigs" -s KPN --no-trim
141 | # Reached only on success: set -e aborts the script if plasmidID exits non-zero.
142 | echo "ALL DONE. TEST COMPLETED SUCCESSFULLY YOUR INSTALLATION SHOULD BE CORRECT."
143 | 


--------------------------------------------------------------------------------