number of threads
180 | -v | --version version
181 | -h | --help display usage message
182 |
183 | example: ./plasmidID.sh -1 ecoli_R1.fastq.gz -2 ecoli_R2.fastq.gz -d database.fasta -s ECO_553 -G ENTERO
184 | ./plasmidID.sh -1 ecoli_R1.fastq.gz -2 ecoli_R2.fastq.gz -d PacBio_sample.fasta -c scaffolds.fasta -C 60 -s ECO_60 -G ENTERO --no-trim
185 | ```
186 |
187 | ## Examples
188 |
189 | Under construction
190 |
191 | ## Output
192 |
193 | Since v1.6, the more relevant output is located in GROUP/SAMPLE folder:
194 |
195 | - **SAMPLE_final_results.html(.tab)**
196 | - id: Name of the accession number of reference
197 | - length: length of the reference sequence
198 | - species: species of the reference sequence
199 | - description: rest of reference fasta header
200 | - contig_name: number of the contigs that align the minimun required for complete contig track
201 | - SAMPLE:
202 | - Image of the reconstructed plasmid (click to open in new tab)
203 | - MAPPING % (percentage): percentage of reference covered with reads
204 | - X for contig mode (gray colour)
205 | - Orientative colouring (the closer to 100% the better)
206 | - ALIGN FR (fraction_covered): total length of contigs aligned (complete) / reference sequence length
207 | - Orientative colouring (the closer to 1 the better)
208 |
209 |
210 | ## Annotation file
211 |
212 | Under construction
213 |
214 | ## Illustrated pipeline
215 |
216 | This image sumarizes PlasmidID pipeline, including the most important steps.
217 | For furder details, including:
218 | - [Results interpretation](https://github.com/BU-ISCIII/plasmidID/wiki/Understanding-the-image:-track-by-track)
219 | - and more, please visit: [**PLASMIDID WIKI**](https://github.com/BU-ISCIII/plasmidID/wiki)
220 |
221 |
222 |
223 |
--------------------------------------------------------------------------------
/bin/adapt_filter_coverage.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
4 | #or a compound command returns a non-zero status: If errors are not handled by user
5 | set -e
6 | #~set -x
7 |
8 | #=============================================================
9 | # HEADER
10 | #=============================================================
11 |
12 | #INSTITUTION:ISCIII
13 | #CENTRE:BU-ISCIII
14 | #AUTHOR: Pedro J. Sola
15 | VERSION=1.0
16 | #CREATED: 21 March 2018
17 | #REVISION:
18 | #DESCRIPTION:adapt_filter_coverage script that adapt percentages and filter coverage info from bedtools genomecov output
19 |
20 | #================================================================
21 | # END_OF_HEADER
22 | #================================================================
23 |
24 | #SHORT USAGE RULES
25 | #LONG USAGE FUNCTION
26 | usage() {
27 | cat << EOF
28 |
29 | adapt_filter_coverage script that adapt percentages and filter coverage info from bedtools genomecov output
30 |
31 | usage : $0 <-i inputfile(.fasta)> [-o ] [-c ] [-s ] [-v] [-h]
32 |
33 | -i input file
34 | -o output directory (optional). By default the file is replaced in the same location
35 | -c percentage value to filter >= values. If not supplied, all records will be outputted
36 | -s string to ad at the end of the outputted file (list of accession numbers)
37 | -v version
38 | -h display usage message
39 |
40 | example: adapt_filter_coverage.sh -i ecoli.coverage -c 70
41 |
42 | EOF
43 | }
44 |
45 | #================================================================
46 | # OPTION_PROCESSING
47 | #================================================================
48 | #Make sure the script is executed with arguments
49 | if [ $# = 0 ] ; then
50 | usage >&2
51 | exit 1
52 | fi
53 |
54 | # Error handling
55 | error(){
56 | local parent_lineno="$1"
57 | local script="$2"
58 | local message="$3"
59 | local code="${4:-1}"
60 |
61 | RED='\033[0;31m'
62 | NC='\033[0m'
63 |
64 | if [[ -n "$message" ]] ; then
65 | echo -e "\n---------------------------------------\n"
66 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
67 | echo -e "MESSAGE:\n"
68 | echo -e "$message"
69 | echo -e "\n---------------------------------------\n"
70 | else
71 | echo -e "\n---------------------------------------\n"
72 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
73 | echo -e "\n---------------------------------------\n"
74 | fi
75 |
76 | exit "${code}"
77 | }
78 |
79 | #DECLARE FLAGS AND VARIABLES
80 | cwd="$(pwd)"
81 | input_file="Input_file"
82 | coverage_cutoff_input=100
83 |
84 | #PARSE VARIABLE ARGUMENTS WITH getops
85 | #common example with letters, for long options check longopts2getopts.sh
86 | options=":i:o:c:s:vh"
87 | while getopts $options opt; do
88 | case $opt in
89 | i )
90 | input_file=$OPTARG
91 | ;;
92 | o )
93 | output_dir=$OPTARG
94 | ;;
95 | c )
96 | if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
97 | echo "please, provide a percentage between 0 and 100"
98 | usage
99 | exit 1
100 | else
101 | coverage_cutoff_input=$OPTARG
102 | fi
103 | ;;
104 | s )
105 | suffix=$OPTARG
106 | ;;
107 | h )
108 | usage
109 | exit 1
110 | ;;
111 | v )
112 | echo $VERSION
113 | exit 1
114 | ;;
115 | \?)
116 | echo "Invalid Option: -$OPTARG" 1>&2
117 | usage
118 | exit 1
119 | ;;
120 | : )
121 | echo "Option -$OPTARG requires an argument." >&2
122 | exit 1
123 | ;;
124 | * )
125 | echo "Unimplemented option: -$OPTARG" >&2;
126 | exit 1
127 | ;;
128 |
129 | esac
130 | done
131 | shift $((OPTIND-1))
132 |
133 | #================================================================
134 | # MAIN_BODY
135 | #================================================================
136 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
137 |
138 | echo -e "\n#Executing" $0 "\n"
139 |
140 | check_mandatory_files.sh $input_file
141 |
142 |
143 | suffix="_adapted_filtered_"$coverage_cutoff_input
144 | coverage_cutoff=$(echo "(1 - ($coverage_cutoff_input/100))" | bc -l)
145 |
146 | #echo $coverage_cutoff
147 |
148 | if [ ! $output_dir ]; then
149 | output_dir=$(dirname $input_file)
150 | #echo "Default output directory is" $output_dir
151 | mkdir -p $output_dir
152 | else
153 | #echo "Output directory is" $output_dir
154 | mkdir -p $output_dir
155 | fi
156 |
157 |
158 | if [ ! $filename ]; then
159 | filename=$(basename $input_file | cut -d. -f1)
160 | fi
161 |
162 |
163 | if [ -f $input_file"_adapted" ]; then
164 | echo "Found previous" $(basename $input_file"_adapted")", removing it"
165 | rm $input_file"_adapted"
166 | fi
167 |
168 | ## Keep information about positions with 0 coverage. If no 0 coverage positions for a plasmid, create line including this info.
169 | awk '
170 | BEGIN{OFS="\t"}
171 | (!x[$1]++) {if ($1 != "genome")
172 | {if ($2 == 0)
173 | {print $0}
174 | else
175 | {print $1, 0, $4, $4, 0.0000000001}
176 | }
177 | }
178 | ' $input_file > $input_file"_adapted" || error ${LINENO} $(basename $0) "Awk command for bedtools coverage output parsing in $input_file\"_adapted\" creation. See $output_dir/logs for more information"
179 |
180 | ## Keep plasmids with coverage < 1-coverage_cutoff_input/100
181 | awk '
182 | {if ($2 == 0 && $5 < '"${coverage_cutoff}"')
183 | {print $1}
184 | }
185 | ' $input_file"_adapted" > $input_file$suffix || error ${LINENO} $(basename $0) "Awk command for coverage filtering in $input_file$suffix creation. See $output_dir/logs for more information."
186 |
187 | echo "$(date)"
188 | echo "Done filtering sequences with" $coverage_cutoff_input"% and greater coverage"
189 | echo "Those sequences can be found at" $input_file$suffix
190 | echo -e $(cat $input_file$suffix | wc -l) mapped equals or more than $coverage_cutoff_input "\n"
191 |
--------------------------------------------------------------------------------
/bin/blast_align.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | #=============================================================
6 | # HEADER
7 | #=============================================================
8 |
9 | #INSTITUTION:ISCIII
10 | #CENTRE:BU-ISCIII
11 | #AUTHOR: Pedro J. Sola
12 | VERSION=1.0
13 | #CREATED: 1 May 2018
14 | #REVISION:
15 | #DESCRIPTION:Script that blast a query against a database
16 | #
17 | #DOCUMENTATION
18 | #
19 | #Blast output6 with aditions:
20 | #1 Query label.(qseqid)
21 | #2 Target or subject(database sequence or cluster centroid) label. (sseqid)
22 | #3 Percent identity. (pident)
23 | #4 Alignment length. (length)
24 | #5 Number of mismatches. (mismatch)
25 | #6 Number of gap opens. (gapopen)
26 | #7 Start position in query. Query coordinates start with 1 at the first base in the sequence as it appears in the input file. For translated searches (nucleotide queries, protein targets), query startend for -ve frame. (qstart)
27 | #8 End position in query. (qend)
28 | #9 Start position in target. Target coordinates start with 1 at the first base in sequence as it appears in the database. For untranslated nucleotide searches, target startend for a reverse-complement alignment. (sstart)
29 | #10 End position in target. (send)
30 | #11 E-value calculated using Karlin-Altschul statistics. (evalue)
31 | #12 Bit score calculated using Karlin-Altschul statistics. (bitscore)
32 | #13 Lenght of query (qlen)
33 | #14 Length of target (slen)
34 | #
35 | #
36 | #TO DO:
37 | #
38 | #Handle all types of blast: blastn, blastp...
39 | #
40 | #================================================================
41 | # END_OF_HEADER
42 | #================================================================
43 |
44 | #SHORT USAGE RULES
45 | #LONG USAGE FUNCTION
46 | usage() {
47 | cat << EOF
48 |
49 | blast_align is a script that blast a query against a database
50 |
51 | usage : $0 <-i inputfile(query)> <-d inputfile(database)> [-p ] [-o ] [-t ]
52 | [-T ] [-e ] [-v] [-h]
53 |
54 | -i query file in FASTA format
55 | -d database to blast against
56 | -o output directory, default same directory as query
57 | -p prefix for blast identification (mandatory) and output file name
58 | -q type of query, nucl by default
59 | -t type of database, nucl by default
60 | -e evalue for blast analysis, default 0.0001
61 | -T number of threads
62 | -v version
63 | -h display usage message
64 |
65 | Output directory is the same as input directory by default
66 |
67 | example: blast_align -i ecoli.fasta -d plasmid_ddbb.fasta -p plasmid
68 |
69 |
70 | EOF
71 | }
72 |
73 |
74 | #================================================================
75 | # OPTION_PROCESSING
76 | #================================================================
77 | #Make sure the script is executed with arguments
78 | if [ $# = 0 ] ; then
79 | usage >&2
80 | exit 1
81 | fi
82 |
83 | # Error handling
84 | error(){
85 | local parent_lineno="$1"
86 | local script="$2"
87 | local message="$3"
88 | local code="${4:-1}"
89 |
90 | RED='\033[0;31m'
91 | NC='\033[0m'
92 |
93 | if [[ -n "$message" ]] ; then
94 | echo -e "\n---------------------------------------\n"
95 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
96 | echo -e "MESSAGE:\n"
97 | echo -e "$message"
98 | echo -e "\n---------------------------------------\n"
99 | else
100 | echo -e "\n---------------------------------------\n"
101 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
102 | echo -e "\n---------------------------------------\n"
103 | fi
104 |
105 | exit "${code}"
106 | }
107 |
108 | #DECLARE FLAGS AND VARIABLES
109 | cwd="$(pwd)"
110 | group="NO_GROUP"
111 | input_file="Input_file"
112 | database="Database"
113 | query_type="nucl"
114 | database_type="nucl"
115 | evalue=0.0001
116 | threads=1
117 | blast_command="blastn"
118 |
119 | #PARSE VARIABLE ARGUMENTS WITH getops
120 | #common example with letters, for long options check longopts2getopts.sh
121 | options=":i:o:p:f:d:q:t:e:T:vh"
122 | while getopts $options opt; do
123 | case $opt in
124 | i )
125 | input_file=$OPTARG
126 | ;;
127 | d )
128 | database=$OPTARG
129 | ;;
130 | o )
131 | output_dir=$OPTARG
132 | ;;
133 | p)
134 | prefix=$OPTARG
135 | ;;
136 | f)
137 | file_name=$OPTARG
138 | ;;
139 | t )
140 | database_type=$OPTARG
141 | ;;
142 | q )
143 | query_type=$OPTARG
144 | ;;
145 | g )
146 | group=$OPTARG
147 | ;;
148 | e )
149 | evalue=$OPTARG
150 | ;;
151 | T)
152 | threads=$OPTARG
153 | ;;
154 | h )
155 | usage
156 | exit 1
157 | ;;
158 | v )
159 | echo $VERSION
160 | exit 1
161 | ;;
162 | \?)
163 | echo "Invalid Option: -$OPTARG" 1>&2
164 | usage
165 | exit 1
166 | ;;
167 | : )
168 | echo "Option -$OPTARG requires an argument." >&2
169 | exit 1
170 | ;;
171 | * )
172 | echo "Unimplemented option: -$OPTARG" >&2;
173 | exit 1
174 | ;;
175 |
176 | esac
177 | done
178 | shift $((OPTIND-1))
179 |
180 | #================================================================
181 | # MAIN_BODY
182 | #================================================================
183 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
184 |
185 | echo -e "\n#Executing" $0 "\n"
186 |
187 | check_mandatory_files.sh $input_file $database
188 |
189 | #check_dependencies.sh blastn
190 |
191 |
192 | if [ ! $prefix ]; then
193 | echo "please provide a prefix to identify this blast analysis"
194 | exit 1
195 | fi
196 |
197 | if [ $query_type == "prot" ] || [ $query_type == "nucl" ]; then
198 | echo "query type selected as" $database_type
199 | else
200 | echo "please provide a proper query type"
201 | exit 1
202 | fi
203 |
204 | if [ $query_type == "prot" ]; then
205 | blast_command="tblastn"
206 | fi
207 |
208 | if [ ! $output_dir ]; then
209 | output_dir=$(dirname $input_file)
210 | echo "Default output directory is" $output_dir
211 | mkdir -p $output_dir
212 | else
213 | echo "Output directory is" $output_dir
214 | mkdir -p $output_dir
215 | fi
216 |
217 | if [ ! $file_name ]; then
218 | file_name=$(basename $input_file | cut -d. -f1)
219 | echo "filename is" $file_name
220 | fi
221 |
222 | database_name=$(basename $database)
223 | database_dir=$(dirname $database)
224 |
225 | ##BLAST EXECUTION
226 |
227 | echo "$(date)"
228 | echo "Blasting" $file_name "agaist" $database_name
229 |
230 | makeblastdb -in $database -out $database_dir/$database_name".blast.tmp" -dbtype $database_type || error ${LINENO} $(basename $0) "Makeblastdb command failed. See $output_dir/logs for more information."
231 |
232 | echo "BLAST command is" $blast_command
233 |
234 | $blast_command -query $input_file \
235 | -db $database_dir/$database_name".blast.tmp" \
236 | -out $output_dir/$file_name"."$prefix".blast" \
237 | -evalue $evalue \
238 | -num_threads $threads \
239 | -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen" || error ${LINENO} $(basename $0) "Blastn command failed. See $output_dir/logs for more information"
240 |
241 |
242 | echo "$(date)"
243 | echo "Done blasting" $file_name "agaist" $database_name
244 | echo -e "blasted file can be found in" $output_dir/$file_name"."$prefix".blast" "\n"
245 |
--------------------------------------------------------------------------------
/bin/blast_to_bed.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
4 | #or a compound command returns a non-zero status: If errors are not handled by user
5 | #set -e
6 | #set -x
7 |
8 | #=============================================================
9 | # HEADER
10 | #=============================================================
11 |
12 | #INSTITUTION:ISCIII
13 | #CENTRE:BU-ISCIII
14 | #AUTHOR: Pedro J. Sola
15 | VERSION=1.0
16 | #CREATED: 4 May 2018
17 | #REVISION:
18 | #06 May 2018: add id optiopn in bed output
19 | #04 June 2018: add an option for an aditional division mostly for ABR sort
20 | #
21 | #DESCRIPTION:blast_to_bed script obtain a BED file with coordinates of local blast alignments matching some given conditions
22 | #================================================================
23 | # END_OF_HEADER
24 | #================================================================
25 |
26 | #SHORT USAGE RULES
27 | #LONG USAGE FUNCTION
28 | usage() {
29 | cat << EOF
30 |
31 | blast_to_bed is a script than obtain a BED file with coordinates of local blast alignments matching some given conditions
32 |
33 | usage : $0 <-i inputfile(.blast)> <-b id cutoff> [-o ] [-b ] [-l ] [-L ]
34 | [-p ] [-d ] [-D (l|r)] [-q ] [-Q (l|r)] [-U ] [-I] [-u] [-v] [-h]
35 |
36 | -i input file
37 | -b blast identity cutoff (0 - 100), default 90
38 | -l blast length percentage cutoff (0 - 100), default 20, use 90 for genes
39 | -L blast length alignment cutoff, default 0, use 200 or 500 for contigs
40 | -o output directory (optional). By default the file is replaced in the same location
41 | -q database chraracter delimiter, default "_"
42 | -Q query field to retrieve (l=left, r=right), default left
43 | -d database chraracter delimiter, default "_"
44 | -D database field to retrieve (l=left, r=right), default right
45 | -I contig mode
46 | -u unique. Outputs only one query entry per database entry
47 | -U unique mode with delimiter. Outputs only one delimited query per database entry
48 | -v version
49 | -h display usage message
50 |
51 | example: blast_to_bed.sh -i ecoli_prefix.blast -b 80 -l 50 -q - -Q r
52 |
53 | EOF
54 | }
55 |
56 | #================================================================
57 | # OPTION_PROCESSING
58 | #================================================================
59 | #Make sure the script is executed with arguments
60 | if [ $# = 0 ] ; then
61 | usage >&2
62 | exit 1
63 | fi
64 |
65 | error(){
66 | local parent_lineno="$1"
67 | local script="$2"
68 | local message="$3"
69 | local code="${4:-1}"
70 |
71 | RED='\033[0;31m'
72 | NC='\033[0m'
73 |
74 | if [[ -n "$message" ]] ; then
75 | echo -e "\n---------------------------------------\n"
76 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
77 | echo -e "MESSAGE:\n"
78 | echo -e "$message"
79 | echo -e "\n---------------------------------------\n"
80 | else
81 | echo -e "\n---------------------------------------\n"
82 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
83 | echo -e "\n---------------------------------------\n"
84 | fi
85 |
86 | exit "${code}"
87 | }
88 |
89 | #DECLARE FLAGS AND VARIABLES
90 | cwd="$(pwd)"
91 | input_file="Input_file"
92 | blast_id_cutoff=90
93 | blast_len_percentage=10
94 | blast_len_alignment=0
95 | database_delimiter="_"
96 | database_field=r
97 | query_delimiter="_"
98 | query_field=l
99 | unique=false
100 | unique_divider=false
101 | divider_delimiter="-"
102 | suffix=""
103 | id_circos=false
104 | id_output=""
105 |
106 | #PARSE VARIABLE ARGUMENTS WITH getops
107 | #common example with letters, for long options check longopts2getopts.sh
108 | options=":i:b:q:Q:d:D:o:l:L:U:Iuvh"
109 | while getopts $options opt; do
110 | case $opt in
111 | i )
112 | input_file=$OPTARG
113 | ;;
114 | b )
115 | if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
116 | echo "please, provide a percentage between 0 and 100"
117 | exit 1
118 | else
119 | blast_id_cutoff=$OPTARG
120 | fi
121 | ;;
122 | o )
123 | output_dir=$OPTARG
124 | ;;
125 | l )
126 | if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
127 | echo "please, provide a percentage between 0 and 100"
128 | exit 1
129 | else
130 | blast_len_percentage=$OPTARG
131 | fi
132 | ;;
133 | L )
134 | blast_len_alignment=$OPTARG
135 | ;;
136 | d )
137 | database_delimiter=$OPTARG
138 | ;;
139 | D )
140 | database_field=$OPTARG
141 | ;;
142 | q )
143 | query_delimiter=$OPTARG
144 | ;;
145 | Q )
146 | query_field=$OPTARG
147 | ;;
148 | u )
149 | unique=true
150 | suffix=".unique.tmp"
151 | ;;
152 | U )
153 | unique_divider=true
154 | suffix=".unique.divider.tmp"
155 | divider_delimiter=$OPTARG
156 | ;;
157 | I)
158 | id_circos=true
159 | id_output=",\"id=\"query_name[length(query_name)]"
160 | ;;
161 | h )
162 | usage
163 | exit 1
164 | ;;
165 | v )
166 | echo $VERSION
167 | exit 1
168 | ;;
169 | \?)
170 | echo "Invalid Option: -$OPTARG" 1>&2
171 | usage
172 | exit 1
173 | ;;
174 | : )
175 | echo "Option -$OPTARG requires an argument." >&2
176 | exit 1
177 | ;;
178 | * )
179 | echo "Unimplemented option: -$OPTARG" >&2;
180 | exit 1
181 | ;;
182 |
183 | esac
184 | done
185 | shift $((OPTIND-1))
186 |
187 | #================================================================
188 | # MAIN_BODY
189 | #================================================================
190 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
191 |
192 | echo -e "\n#Executing" $0 "\n"
193 |
194 | check_mandatory_files.sh $input_file
195 |
196 |
197 | blast_len_percentage_value=$(echo "($blast_len_percentage/100)" | bc -l)
198 | #blast_len_percentage_decimal=$(echo $blast_len_percentage_value | sed 's/0\{1,\}$//')
199 |
200 |
201 | if [ ! $output_dir ]; then
202 | output_dir=$(dirname $input_file)
203 | #echo "Default output directory is" $output_dir
204 | mkdir -p $output_dir
205 | else
206 | #echo "Output directory is" $output_dir
207 | mkdir -p $output_dir
208 | fi
209 |
210 |
211 | if [ ! $file_name ]; then
212 | file_name=$(basename $input_file | cut -d. -f1,2)
213 | fi
214 |
215 | ##CHECK FIELDS TO RETRIEVE
216 |
217 | if [ "$database_field" == "l" ] || [ "$database_field" == "r" ]; then
218 |
219 | if [ $database_field == l ]; then
220 | database_field="1"
221 | else
222 | database_field="length(database_name)"
223 | fi
224 |
225 | else
226 | echo "Please introduce r or l for database"
227 | exit 1
228 | fi
229 |
230 | if [ $query_field == "l" ] || [ $query_field == "r" ]; then
231 |
232 | if [ $query_field == l ]; then
233 | query_field="1"
234 | else
235 | query_field="length(query_name)"
236 | fi
237 |
238 | else
239 |
240 | echo "Please introduce 0 or 1 for query"
241 | exit 1
242 | fi
243 |
244 | echo "$(date)"
245 | echo "Adapting blast to bed using" $(basename $input_file) "with:"
246 | echo "Blast identity=" $blast_id_cutoff
247 | echo "Min length aligned=" $blast_len_alignment
248 | echo "Min len percentage=" $blast_len_percentage
249 | echo "database_delimiter=" $database_delimiter
250 | echo "database_field)=" $database_field
251 | echo "query_delimiter=" $query_delimiter
252 | echo "query_field=" $query_field
253 |
254 |
255 | cat $input_file | sort -k3 -nr | \
256 | awk '
257 | {OFS="\t"
258 | split($2, database_name, "'"${database_delimiter}"'")
259 | split($1, query_name, "'"${query_delimiter}"'")}
260 | (($3 >= '"${blast_id_cutoff}"')&&(($4/$13) >= '"${blast_len_percentage_value}"')&&($4 >= '"${blast_len_alignment}"')) \
261 | {print database_name['"$database_field"'], $9, $10, query_name['"$query_field"']'"$id_output"'}
262 | ' \
263 | > $output_dir/$file_name".bed"$suffix || error ${LINENO} $(basename $0) "AWK command fail in $file_name\".bed\"$suffix. See $output_dir/logs for more information."
264 |
265 |
266 | if [ "$unique" == "true" ]; then
267 | echo "unique option enabled"
268 | awk '
269 | (!x[$1$4]++)
270 | ' $output_dir/$file_name".bed"$suffix \
271 | > $output_dir/$file_name".bed" || error ${LINENO} $(basename $0) "AWK command fail in $file_name\".bed\". See $output_dir/logs for more information."
272 | rm $output_dir/$file_name".bed"$suffix
273 | fi
274 |
275 |
276 | if [ "$unique_divider" == "true" ]; then
277 | echo "unique delimiter option enabled"
278 | awk '
279 | {split($4,query,"'"${divider_delimiter}"'")}
280 | (!x[query[1]$1]++)
281 | ' $output_dir/$file_name".bed"$suffix \
282 | > $output_dir/$file_name".bed" || error ${LINENO} $(basename $0) "AWK command fail in $file_name\".bed\"$suffix. See $output_dir/logs for more information."
283 | rm $output_dir/$file_name".bed"$suffix
284 | fi
285 |
286 |
287 |
288 | echo "$(date)"
289 | echo "DONE adapting blast to bed"
290 | echo -e "File can be found at" $output_dir/$file_name".bed" "\n"
291 |
--------------------------------------------------------------------------------
/bin/blast_to_complete.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
4 | #or a compound command returns a non-zero status: If errors are not handled by user
5 | #set -e
6 | #set -x
7 |
8 | #=============================================================
9 | # HEADER
10 | #=============================================================
11 |
12 | #INSTITUTION:ISCIII
13 | #CENTRE:BU-ISCIII
14 | #AUTHOR: Pedro J. Sola
15 | VERSION=1.0
16 | #CREATED: 13 May 2018
17 | #
18 | #DESCRIPTION:blast_to_complete script obtain full length of sequences from blast and adapt it to circos
19 | #================================================================
20 | # END_OF_HEADER
21 | #================================================================
22 |
23 | #SHORT USAGE RULES
24 | #LONG USAGE FUNCTION
25 | usage() {
26 | cat << EOF
27 |
28 | blast_to_complete is a script that obtain full length of sequences from blast and adapt it to circos
29 |
30 | usage : $0 <-i inputfile(.blast)> <-b id cutoff> [-o ] [-b ] [-l ]
31 | [-p ] [-d ] [-D (l|r)] [-q ] [-Q (l|r)] [-I] [-u] [-v] [-h]
32 |
33 | -i input file
34 | -b blast identity cutoff (0 - 100), default 90
35 | -l blast length percentage cutoff (0 - 100), default 50, use 90 for genes
36 | -o output directory (optional). By default the file is replaced in the same location
37 | -q database chraracter delimiter, default "_"
38 | -Q query field to retrieve (l=left, r=right), default left
39 | -d database chraracter delimiter, default "_"
40 | -D database field to retrieve (l=left, r=right), default right
41 | -I contig mode
42 | -u unique. Outputs only one query entry per database entry
43 | -v version
44 | -h display usage message
45 |
46 | example: blast_to_complete.sh -i ecoli_prefix.blast
47 | EOF
48 | }
49 |
50 | #================================================================
51 | # OPTION_PROCESSING
52 | #================================================================
53 | #Make sure the script is executed with arguments
54 | if [ $# = 0 ] ; then
55 | usage >&2
56 | exit 1
57 | fi
58 |
59 | # Error handling
60 | error(){
61 | local parent_lineno="$1"
62 | local script="$2"
63 | local message="$3"
64 | local code="${4:-1}"
65 |
66 | RED='\033[0;31m'
67 | NC='\033[0m'
68 |
69 | if [[ -n "$message" ]] ; then
70 | echo -e "\n---------------------------------------\n"
71 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
72 | echo -e "MESSAGE:\n"
73 | echo -e "$message"
74 | echo -e "\n---------------------------------------\n"
75 | else
76 | echo -e "\n---------------------------------------\n"
77 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
78 | echo -e "\n---------------------------------------\n"
79 | fi
80 |
81 | exit "${code}"
82 | }
83 |
84 | #DECLARE FLAGS AND VARIABLES
85 | cwd="$(pwd)"
86 | input_file="Input_file"
87 | blast_id_cutoff=90
88 | blast_len_percentage=15
89 | database_delimiter="-"
90 | database_field=r
91 | query_delimiter="_"
92 | query_field=r
93 | unique=false
94 | suffix=""
95 | id_circos=false
96 | id_output=""
97 |
98 | #PARSE VARIABLE ARGUMENTS WITH getops
99 | #common example with letters, for long options check longopts2getopts.sh
100 | options=":i:b:q:Q:d:D:o:l:Iuvh"
101 | while getopts $options opt; do
102 | case $opt in
103 | i )
104 | input_file=$OPTARG
105 | ;;
106 | b )
107 | if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
108 | echo "please, provide a percentage between 0 and 100"
109 | exit 1
110 | else
111 | blast_id_cutoff=$OPTARG
112 | fi
113 | ;;
114 | o )
115 | output_dir=$OPTARG
116 | ;;
117 | l )
118 | if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
119 | echo "please, provide a percentage between 0 and 100"
120 | exit 1
121 | else
122 | blast_len_percentage=$OPTARG
123 | fi
124 | ;;
125 | d )
126 | database_delimiter=$OPTARG
127 | ;;
128 | D )
129 | database_field=$OPTARG
130 | ;;
131 | q )
132 | query_delimiter=$OPTARG
133 | ;;
134 | Q )
135 | query_field=$OPTARG
136 | ;;
137 | u )
138 | unique=true
139 | suffix=".unique.tmp"
140 | ;;
141 | I)
142 | id_circos=true
143 | id_output=",\"id=\"database_name[length(database_name)]"
144 | ;;
145 | h )
146 | usage
147 | exit 1
148 | ;;
149 | v )
150 | echo $VERSION
151 | exit 1
152 | ;;
153 | \?)
154 | echo "Invalid Option: -$OPTARG" 1>&2
155 | usage
156 | exit 1
157 | ;;
158 | : )
159 | echo "Option -$OPTARG requires an argument." >&2
160 | exit 1
161 | ;;
162 | * )
163 | echo "Unimplemented option: -$OPTARG" >&2;
164 | exit 1
165 | ;;
166 |
167 | esac
168 | done
169 | shift $((OPTIND-1))
170 |
171 | #================================================================
172 | # MAIN_BODY
173 | #================================================================
174 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
175 |
176 | echo -e "\n#Executing" $0 "\n"
177 |
178 | check_mandatory_files.sh $input_file
179 |
180 |
181 | blast_len_percentage_value=$(echo "($blast_len_percentage/100)" | bc -l)
182 | #blast_len_percentage_decimal=$(echo $blast_len_percentage_value | sed 's/0\{1,\}$//')
183 |
184 |
185 | if [ ! $output_dir ]; then
186 | output_dir=$(dirname $input_file)
187 | #echo "Default output directory is" $output_dir
188 | mkdir -p $output_dir
189 | else
190 | #echo "Output directory is" $output_dir
191 | mkdir -p $output_dir
192 | fi
193 |
194 |
195 | if [ ! $file_name ]; then
196 | file_name=$(basename $input_file | cut -d. -f1,2)
197 | fi
198 |
199 | ##CHECK FIELDS TO RETRIEVE
200 |
201 | if [ "$database_field" == "l" ] || [ "$database_field" == "r" ]; then
202 |
203 | if [ $database_field == l ]; then
204 | database_field="1"
205 | else
206 | database_field="length(database_name)"
207 | fi
208 |
209 | else
210 | echo "Please introduce r or l for database"
211 | exit 1
212 | fi
213 |
214 | if [ $query_field == "l" ] || [ $query_field == "r" ]; then
215 |
216 | if [ $query_field == l ]; then
217 | query_field="1"
218 | else
219 | query_field="length(query_name)"
220 | fi
221 |
222 | else
223 |
224 | echo "Please introduce 0 or 1 for query"
225 | exit 1
226 | fi
227 |
228 | echo "$(date)"
229 | echo "Adapting blast to complete using" $(basename $input_file) "with:"
230 | echo "Blast identity=" $blast_id_cutoff
231 | echo "Min len percentage=" $blast_len_percentage
232 |
233 |
234 | cat $input_file |\
235 | awk '
236 | BEGIN{OFS="\t"}
237 | {split($1, query_name, "'"${query_delimiter}"'")
238 | split($2,database_name, "'"${database_delimiter}"'")}
239 | (($3 >= '"${blast_id_cutoff}"') && (($4/$13)>='"${blast_len_percentage_value}"') && (!x[$1$2]++)) \
240 | {{isInverted=($10-$9)
241 | ext2=($13-$8)}
242 | {if (isInverted < 0)
243 | {pos1 = $10
244 | pos2 = $9}
245 | else
246 | {pos1 =$9
247 | pos2 = $10}
248 | {if ((isInverted < 0) && (($14 - pos2) > $7))
249 | {coordChr2 = (pos2 + $7)}
250 | else if ((isInverted < 0) && (($14 - pos2) <= $7))
251 | {coordChr2=$14}
252 | {if ((isInverted < 0) && (ext2 <= pos1))
253 | {coordChr1= pos1 - ext2;}
254 | else if ((isInverted < 0) && (ext2 > pos1))
255 | {coordChr1= 1}
256 | {if ((isInverted > 0) && (pos1 > $7))
257 | {coordChr1=(pos1 - $7)}
258 | else if ((isInverted > 0) && (pos1 <= $7))
259 | {coordChr1=1}
260 | {if ((isInverted > 0) && (ext2 > ($14-pos2)))
261 | {coordChr2= $14;}
262 | else if ((isInverted > 0) && (ext2 <= ($14-pos2)))
263 | {coordChr2= (pos2 + ext2)}
264 | {print database_name['"$database_field"'], coordChr1, coordChr2, query_name['"$query_field"'], "id="$13} }}}}}}
265 | ' \
266 | >$output_dir/$file_name".complete"|| error ${LINENO} $(basename $0) "Awk command parsing blast output for circos input in $file_name\".complete\" creation failed. See $output_dir/logs for more information"
267 |
268 |
269 | cat $input_file |\
270 | awk '
271 | BEGIN{OFS="\t"}
272 | {split($1, query_name, "'"${query_delimiter}"'")
273 | split($2,database_name, "'"${database_delimiter}"'")}
274 | (($3 >= '"${blast_id_cutoff}"') && (($4/$13)>='"${blast_len_percentage_value}"') && (!x[$2$1]++)) \
275 | {{isInverted=($10-$9)
276 | ext2=($13-$8)}
277 | {if (isInverted < 0)
278 | {pos1=$10
279 | pos2=$9}
280 | else
281 | {pos1 =$9
282 | pos2=$10}; \
283 | {if ((isInverted < 0) && (($14 - pos2) < $7))
284 | {coordChr1=1
285 | coordChr2=($7-($14-pos2))
286 | {print database_name['"$database_field"'], coordChr1, coordChr2, query_name['"$query_field"'], "id="$13}}
287 | {if ((isInverted < 0) && (ext2 > pos1))
288 | {coordChr1=($14-(ext2-pos1))
289 | coordChr2=$14
290 | {print database_name['"$database_field"'], coordChr1, coordChr2, query_name['"$query_field"'], "id="$13}}
291 | {if ((isInverted > 0) && (pos1 < $7))
292 | {coordChr1=($14-($7-pos1))
293 | coordChr2=$14
294 | {print database_name['"$database_field"'], coordChr1, coordChr2, query_name['"$query_field"'], "id="$13}}
295 | {if ((isInverted > 0) && (ext2 > ($14-pos2)))
296 | {coordChr1=1
297 | coordChr2=(ext2-($14-pos2))
298 | {print database_name['"$database_field"'], coordChr1, coordChr2, query_name['"$query_field"'], "id="$13}
299 | }
300 | }}}}}}
301 | ' \
302 | >>$output_dir/$file_name".complete" || error ${LINENO} $(basename $0) "Awk command parsing blast output for circos input in $file_name\".complete\" second step creation failed. See $output_dir/logs for more information"
303 |
304 |
305 |
306 | echo "$(date)"
307 | echo "DONE adapting blast to complete"
308 | echo -e "File can be found at" $output_dir/$file_name".complete" "/n"
309 |
--------------------------------------------------------------------------------
/bin/blast_to_link.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
4 | #or a compound command returns a non-zero status: If errors are not handled by user
5 | #set -e
6 | #set -x
7 |
8 | #=============================================================
9 | # HEADER
10 | #=============================================================
11 |
12 | #INSTITUTION:ISCIII
13 | #CENTRE:BU-ISCIII
14 | #AUTHOR: Pedro J. Sola
15 | VERSION=1.0
16 | #CREATED: 14 May 2018
17 | #
18 | #DESCRIPTION:blast_to_link script to obtain a link file that represent duplications between all members of the query
19 | #================================================================
20 | # END_OF_HEADER
21 | #================================================================
22 |
23 | #SHORT USAGE RULES
24 | #LONG USAGE FUNCTION
25 | usage() {
26 | cat << EOF
27 |
28 | blast_to_bed is a script than obtain a BED file with coordinates of local blast alignments matching some given conditions
29 |
30 | usage : $0 <-i inputfile(.blast)> <-b id cutoff> [-o ] [-b ] [-l ] [-L ]
31 | [-p ] [-d ] [-D (l|r)] [-q ] [-Q (l|r)] [-I] [-u] [-v] [-h]
32 |
33 | -i input file
34 | -b blast identity cutoff (0 - 100), default 90
35 | -l blast length percentage cutoff (0 - 100), default 20, use 90 for genes
36 | -o output directory (optional). By default the file is replaced in the same location
37 | -q database chraracter delimiter, default "_"
38 | -Q query field to retrieve (l=left, r=right), default left
39 | -d database chraracter delimiter, default "_"
40 | -D database field to retrieve (l=left, r=right), default right
41 | -I contig mode
42 | -v version
43 | -h display usage message
44 |
45 | example: blast_to_link.sh -i ecoli_prefix.blast -b 80 -l 50
46 | EOF
47 | }
48 |
49 | #================================================================
50 | # OPTION_PROCESSING
51 | #================================================================
52 | #Make sure the script is executed with arguments
53 | if [ $# = 0 ] ; then
54 | usage >&2
55 | exit 1
56 | fi
57 |
58 | # Error handling
59 | error(){
60 | local parent_lineno="$1"
61 | local script="$2"
62 | local message="$3"
63 | local code="${4:-1}"
64 |
65 | RED='\033[0;31m'
66 | NC='\033[0m'
67 |
68 | if [[ -n "$message" ]] ; then
69 | echo -e "\n---------------------------------------\n"
70 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
71 | echo -e "MESSAGE:\n"
72 | echo -e "$message"
73 | echo -e "\n---------------------------------------\n"
74 | else
75 | echo -e "\n---------------------------------------\n"
76 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
77 | echo -e "\n---------------------------------------\n"
78 | fi
79 |
80 | exit "${code}"
81 | }
82 |
83 | #DECLARE FLAGS AND VARIABLES
84 | cwd="$(pwd)"
85 | input_file="Input_file"
86 | blast_id_cutoff=90
87 | blast_len_percentage=50
88 | blast_len_alignment=0
89 | database_delimiter="-"
90 | database_field=l
91 | query_delimiter="_"
92 | query_field=r
93 | unique=false
94 | suffix=""
95 | id_circos=false
96 | id_output=""
97 |
98 | #PARSE VARIABLE ARGUMENTS WITH getops
99 | #common example with letters, for long options check longopts2getopts.sh
100 | options=":i:b:q:Q:d:D:o:l:L:Iuvh"
101 | while getopts $options opt; do
102 | case $opt in
103 | i )
104 | input_file=$OPTARG
105 | ;;
106 | b )
107 | if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
108 | echo "please, provide a percentage between 0 and 100"
109 | exit 1
110 | else
111 | blast_id_cutoff=$OPTARG
112 | fi
113 | ;;
114 | o )
115 | output_dir=$OPTARG
116 | ;;
117 | l )
118 | if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
119 | echo "please, provide a percentage between 0 and 100"
120 | exit 1
121 | else
122 | blast_len_percentage=$OPTARG
123 | fi
124 | ;;
125 | d )
126 | database_delimiter=$OPTARG
127 | ;;
128 | D )
129 | database_field=$OPTARG
130 | ;;
131 | q )
132 | query_delimiter=$OPTARG
133 | ;;
134 | Q )
135 | query_field=$OPTARG
136 | ;;
137 | u )
138 | unique=true
139 | suffix=".unique.tmp"
140 | ;;
141 | I)
142 | id_circos=true
143 | id_output=",\"id=\"query_name[length(query_name)]"
144 | ;;
145 | h )
146 | usage
147 | exit 1
148 | ;;
149 | v )
150 | echo $VERSION
151 | exit 1
152 | ;;
153 | \?)
154 | echo "Invalid Option: -$OPTARG" 1>&2
155 | usage
156 | exit 1
157 | ;;
158 | : )
159 | echo "Option -$OPTARG requires an argument." >&2
160 | exit 1
161 | ;;
162 | * )
163 | echo "Unimplemented option: -$OPTARG" >&2;
164 | exit 1
165 | ;;
166 |
167 | esac
168 | done
169 | shift $((OPTIND-1))
170 |
171 |
172 | #================================================================
173 | # MAIN_BODY
174 | #================================================================
175 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
176 |
177 | echo -e "\n#Executing" $0 "\n"
178 |
179 | check_mandatory_files.sh $input_file
180 |
181 |
182 | blast_len_percentage_value=$(echo "($blast_len_percentage/100)" | bc -l)
183 | #blast_len_percentage_decimal=$(echo $blast_len_percentage_value | sed 's/0\{1,\}$//')
184 |
185 |
186 | if [ ! $output_dir ]; then
187 | output_dir=$(dirname $input_file)
188 | #echo "Default output directory is" $output_dir
189 | mkdir -p $output_dir
190 | else
191 | #echo "Output directory is" $output_dir
192 | mkdir -p $output_dir
193 | fi
194 |
195 |
196 | if [ ! $file_name ]; then
197 | file_name=$(basename $input_file | cut -d. -f1,2)
198 | fi
199 |
200 | ##CHECK FIELDS TO RETRIEVE
201 |
202 | if [ "$database_field" == "l" ] || [ "$database_field" == "r" ]; then
203 |
204 | if [ $database_field == l ]; then
205 | database_field="1"
206 | else
207 | database_field="length(database_name)"
208 | fi
209 |
210 | else
211 | echo "Please introduce r or l for database"
212 | exit 1
213 | fi
214 |
215 | if [ $query_field == "l" ] || [ $query_field == "r" ]; then
216 |
217 | if [ $query_field == l ]; then
218 | query_field="1"
219 | else
220 | query_field="length(query_name)"
221 | fi
222 |
223 | else
224 |
225 | echo "Please introduce 0 or 1 for query"
226 | exit 1
227 | fi
228 |
229 | echo "$(date)"
230 | echo "Adapting blast to links using" $(basename $input_file) "with:"
231 | echo "Blast identity=" $blast_id_cutoff
232 | echo "Min len percentage=" $blast_len_percentage
233 |
234 | ##Have only into account blast entries with a determine blast length
235 |
236 | awk '
237 | (($4/$13) >= '"${blast_len_percentage_value}"') && !contigPlasmid[$1$2]++ \
238 | {print $1$2}
239 | ' $input_file \
240 | > $output_dir/$file_name".dict_length_percentage" || error ${LINENO} $(basename $0) "Awk command in $file_name\".dict_length_percentage\" creation failed. See $output_dir/logs for more information."
241 |
242 |
243 | ##Obtain coordinates query --> ddbb
244 |
245 | awk '
246 | NR==FNR{contigPlasmid[$1]=$1;next}
247 | {split($2, database_name, "'"${database_delimiter}"'")
248 | split($1, query_name, "'"${query_delimiter}"'")
249 | header=$1$2}
250 | {if ((header in contigPlasmid) && ($3>='"${blast_id_cutoff}"') && (($4/$13)>=0.03))
251 | print query_name['"$query_field"'], $7,$8,database_name['"$database_field"'],$9,$10'"$id_output"'}' \
252 | $output_dir/$file_name".dict_length_percentage" $input_file \
253 | > $output_dir/$file_name."blast.links" || error ${LINENO} $(basename $0) "Awk command in $file_name\".blast.links\" creation failed. See $output_dir/logs for more information"
254 |
255 | ##Change coordinates from query --> ddbb to ddbb-->ddbb in order to represent them in CIRCOSS
256 |
257 | awk '
258 | BEGIN{OFS="\t"}
259 | {
260 | if($1 != savedNode)
261 | {savedNode= $1; delete chr}
262 | else{for(i in chr)
263 | {print $4" "$5" "$6" "chr[i]" id="savedNode}
264 | }
265 | chr[$4$5$6] = $4" "$5" "$6}' \
266 | $output_dir/$file_name."blast.links" \
267 | > $output_dir/$file_name."links" || error ${LINENO} $(basename $0) "Awk command in $file_name\".links\" creation failed. See $output_dir/logs for more information"
268 |
269 |
270 | rm $output_dir/$file_name".dict_length_percentage"
271 |
272 | echo "$(date)"
273 | echo "DONE adapting blast to link"
274 | echo -e "File can be found at" $output_dir/$file_name".links" "\n"
275 |
--------------------------------------------------------------------------------
/bin/bowtie_mapper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | #=============================================================
6 | # HEADER
7 | #=============================================================
8 |
9 | #INSTITUTION:ISCIII
10 | #CENTRE:BU-ISCIII
11 | #AUTHOR: Pedro J. Sola
12 | VERSION=1.0
13 | #CREATED: 15 March 2018
14 | #REVISION:
15 | # 19 March 2018: Complete usage info
16 | # 19 March 2018: Check mandatory files. folders and variables
17 | #DESCRIPTION:Script that index a database and map a supplied pair-end sequences
18 | #TODO
19 | # -Handle files extensions for bowtie, now is fastq by default
20 | #================================================================
21 | # END_OF_HEADER
22 | #================================================================
23 |
24 | #SHORT USAGE RULES
25 | #LONG USAGE FUNCTION
26 | usage() {
27 | cat << EOF
28 |
29 | Bowtie_mapper script index a database and map a supplied pair-end sequences
30 |
31 | usage : $0 [-i ] [-o ] <-d database(fasta)> <-s sample_name> <-1 R1> <-2 R2>
32 | [-g group_name] [-f ] [-T ] [-a] [-v] [-h]
33 |
34 | -i input directory (optional)
35 | -o output directory (optional)
36 | -d database to map (.fasta)
37 | -s sample name
38 | -g group name (optional). If unset, samples will be gathered in NO_GROUP group
39 | -1 reads corresponding to paired-end R1
40 | -2 reads corresponding to paired-end R2
41 | -f offrate index for bowtie_build (optional). Default value 1. for quicker indexing use higher number
42 | -a use -a mapping (off by default)
43 | -T number of threads
44 | -v version
45 | -h display usage message
46 |
47 | example: bowtie_mapper.sh -d database.fasta -s COLI -1 ecoli_1.fastq -2 ecoli_2.fastq -a
48 |
49 | EOF
50 | }
51 |
52 | #================================================================
53 | # OPTION_PROCESSING
54 | #================================================================
55 | #Make sure the script is executed with arguments
56 | if [ $# = 0 ] ; then
57 | usage >&2
58 | exit 1
59 | fi
60 |
61 | # Error handling
62 | error(){
63 | local parent_lineno="$1"
64 | local script="$2"
65 | local message="$3"
66 | local code="${4:-1}"
67 |
68 | RED='\033[0;31m'
69 | NC='\033[0m'
70 |
71 | if [[ -n "$message" ]] ; then
72 | echo -e "\n---------------------------------------\n"
73 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
74 | echo -e "MESSAGE:\n"
75 | echo -e "$message"
76 | echo -e "\n---------------------------------------\n"
77 | else
78 | echo -e "\n---------------------------------------\n"
79 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
80 | echo -e "\n---------------------------------------\n"
81 | fi
82 |
83 | exit "${code}"
84 | }
85 |
86 | #DECLARE FLAGS AND VARIABLES
87 | threads=1
88 | offrate=1
89 | cwd="$(pwd)"
90 | a_mapping=""
91 | group="NO_GROUP"
92 | database="Database"
93 | R1="R1"
94 | R2="R2"
95 |
96 | #PARSE VARIABLE ARGUMENTS WITH getops
97 | #common example with letters, for long options check longopts2getopts.sh
98 | options=":i:o:s:g:d:1:2:f:T:avh"
99 | while getopts $options opt; do
100 | case $opt in
101 | i )
102 | input_dir=$OPTARG
103 | ;;
104 | o )
105 | output_dir=$OPTARG
106 | ;;
107 | s )
108 | sample=$OPTARG
109 | ;;
110 | g)
111 | group=$OPTARG
112 | ;;
113 | d )
114 | database=$OPTARG
115 | ;;
116 | 1 )
117 | R1=$OPTARG
118 | ;;
119 | 2 )
120 | R2=$OPTARG
121 | ;;
122 | f )
123 | offrate=$OPTARG
124 | ;;
125 | T )
126 | threads=$OPTARG
127 | ;;
128 | a)
129 | a_mapping="-a"
130 | ;;
131 | h )
132 | usage
133 | exit 1
134 | ;;
135 | v )
136 | echo $VERSION
137 | exit 1
138 | ;;
139 | \?)
140 | echo "Invalid Option: -$OPTARG" 1>&2
141 | usage
142 | exit 1
143 | ;;
144 | : )
145 | echo "Option -$OPTARG requires an argument." >&2
146 | exit 1
147 | ;;
148 | * )
149 | echo "Unimplemented option: -$OPTARG" >&2;
150 | exit 1
151 | ;;
152 |
153 | esac
154 | done
155 | shift $((OPTIND-1))
156 |
157 |
158 | #================================================================
159 | # MAIN_BODY
160 | #================================================================
161 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
162 |
163 | echo -e "\n#Executing" $0 "\n"
164 |
165 | check_dependencies.sh bowtie2-build bowtie2
166 |
167 | check_mandatory_files.sh $database $R1 $R2
168 |
169 | if [ ! $sample ]; then
170 | echo "ERROR: please, provide a sample name"
171 | usage
172 | exit 1
173 | fi
174 |
175 | if [ ! $output_dir ]; then
176 | output_dir=$cwd"/$group/$sample/mapping/"
177 | echo "Default output directory is" $output_dir
178 | mkdir -p $output_dir
179 | else
180 | echo "Output directory is" $output_dir
181 | mkdir -p $output_dir
182 | fi
183 |
184 |
185 | ########INDEXING############
186 | ############################
187 |
188 | files_bt2=$(ls $database*bt2 2> /dev/null | wc -l)
189 |
190 |
191 | if [ "$files_bt2" = "6" ];then \
192 | echo "Found an indexed ddbb for" $(basename $database);
193 | echo "Omitting indexing"
194 | else
195 | echo "Building index of " $(basename $database);
196 | bowtie2-build \
197 | --offrate $offrate \
198 | $database $database || error ${LINENO} $(basename $0) "Bowtie2-build command failed. See $output_dir/logs for more information"
199 | fi
200 |
201 | ########MAPPING#############
202 | ############################
203 |
204 | if [ -f $mappedDir/$sample.sorted.bam -a -f $mappedDir/$sample.sorted.bam.bai ];then \
205 | echo "Found a mapping file for sample" $sample;
206 | echo "Omitting mapping"
207 | else
208 | echo "$(date)"
209 | echo mapping $R1
210 | echo mapping $R2
211 |
212 | bowtie2 \
213 | -1 $R1 \
214 | -2 $R2 \
215 | -S $output_dir/$sample.sam \
216 | -q \
217 | --very-sensitive-local \
218 | $a_mapping \
219 | -p $threads \
220 | -x $database || error ${LINENO} $(basename $0) "Bowtie2 command failed. See $output_dir/logs for more information"
221 |
222 |
223 | echo "$(date)"
224 | echo -e "DONE Mapping $sample of $group Group" "\n"
225 | fi
226 |
227 |
--------------------------------------------------------------------------------
/bin/build_karyotype.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
4 | #or a compound command returns a non-zero status: If errors are not handled by user
5 | set -e
6 | #set -x
7 |
8 | #=============================================================
9 | # HEADER
10 | #=============================================================
11 |
12 | #INSTITUTION:ISCIII
13 | #CENTRE:BU-ISCIII
14 | #AUTHOR: Pedro J. Sola
15 | VERSION=1.0
16 | #CREATED: 13 April 2018
17 | #REVISION:
18 | #DESCRIPTION:build_karyotype script that creates karyotype file for CIRCOS either for summary and individual image
19 |
20 | #================================================================
21 | # END_OF_HEADER
22 | #================================================================
23 |
24 | #SHORT USAGE RULES
25 | #LONG USAGE FUNCTION
26 | usage() {
27 | cat << EOF
28 |
29 | build_karyotype script that creates karyotype file for CIRCOS either for summary and individual image
30 |
31 | usage : $0 <-i inputfile(coverage)> [-o ] [-f ] [-g ] <-k int(0-100)> <-K int(0-100)> [-v] [-h]
32 |
33 | -i input file
34 | -o output directory (optional). By default the file is replaced in the same location
35 | -f file name for identification
36 | -g group name for identification
37 | -R Reconstruct
38 | -K percentage value to display plasmids covered >= in summary image
39 | -k percentage value to display plasmids covered >= in individual image
40 | -v version
41 | -h display usage message
42 |
43 | example: build_karyotype.sh -i ecoli.coverage -K 80 -k 50
44 |
45 | EOF
46 | }
47 |
48 | #================================================================
49 | # OPTION_PROCESSING
50 | #================================================================
51 | #Make sure the script is executed with arguments
52 | if [ $# = 0 ] ; then
53 | usage >&2
54 | exit 1
55 | fi
56 |
57 |
58 | # Error handling
59 | error(){
60 | local parent_lineno="$1"
61 | local script="$2"
62 | local message="$3"
63 | local code="${4:-1}"
64 |
65 | RED='\033[0;31m'
66 | NC='\033[0m'
67 |
68 | if [[ -n "$message" ]] ; then
69 | echo -e "\n---------------------------------------\n"
70 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
71 | echo -e "MESSAGE:\n"
72 | echo -e "$message"
73 | echo -e "\n---------------------------------------\n"
74 | else
75 | echo -e "\n---------------------------------------\n"
76 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
77 | echo -e "\n---------------------------------------\n"
78 | fi
79 |
80 | exit "${code}"
81 | }
82 |
83 | #DECLARE FLAGS AND VARIABLES
84 | cwd="$(pwd)"
85 | input_file="Input_file"
86 | coverage_cutoff_input=100
87 | reconstruct=false
88 |
89 | #PARSE VARIABLE ARGUMENTS WITH getops
90 | #common example with letters, for long options check longopts2getopts.sh
91 | options=":i:o:f:g:K:k:Rvh"
92 | while getopts $options opt; do
93 | case $opt in
94 | i )
95 | input_file=$OPTARG
96 | ;;
97 | o )
98 | output_dir=$OPTARG
99 | ;;
100 | f ) file_name=$OPTARG
101 | ;;
102 | g ) group_name=$OPTARG
103 | ;;
104 | R ) reconstruct=true
105 | ;;
106 | K )
107 | if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
108 | echo "please, provide a summary percentage between 0 and 100"
109 | usage
110 | exit 1
111 | else
112 | coverage_cutoff_summary_percentage=$OPTARG
113 | fi
114 | ;;
115 | k )
116 | if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
117 | echo "please, provide an individual percentage between 0 and 100"
118 | usage
119 | exit 1
120 | else
121 | coverage_cutoff_individual_percentage=$OPTARG
122 | fi
123 | ;;
124 | h )
125 | usage
126 | exit 1
127 | ;;
128 | v )
129 | echo $VERSION
130 | exit 1
131 | ;;
132 | \?)
133 | echo "Invalid Option: -$OPTARG" 1>&2
134 | usage
135 | exit 1
136 | ;;
137 | : )
138 | echo "Option -$OPTARG requires an argument." >&2
139 | exit 1
140 | ;;
141 | * )
142 | echo "Unimplemented option: -$OPTARG" >&2;
143 | exit 1
144 | ;;
145 |
146 | esac
147 | done
148 | shift $((OPTIND-1))
149 |
150 | #================================================================
151 | # MAIN_BODY
152 | #================================================================
153 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
154 |
155 | echo -e "\n#Executing" $0 "\n"
156 |
157 | check_mandatory_files.sh $input_file
158 |
159 | coverage_cutoff_summary=$(echo "(1 - ($coverage_cutoff_summary_percentage/100))" | bc -l)
160 | coverage_cutoff_individual=$(echo "(1 - ($coverage_cutoff_individual_percentage/100))" | bc -l)
161 |
162 |
163 | if [ ! $output_dir ]; then
164 | output_dir=$(dirname $input_file)
165 | #echo "Default output directory is" $output_dir
166 | mkdir -p $output_dir
167 | else
168 | #echo "Output directory is" $output_dir
169 | mkdir -p $output_dir
170 | fi
171 |
172 |
173 | if [ ! $file_name ]; then
174 | file_name=$(basename $input_file | cut -d. -f1)
175 | fi
176 |
177 | echo "FILE NAME" $file_name
178 |
179 | echo "$(date)"
180 | echo "Obtain list of cromosomes (idiogram) for CIRCOS karyotype file"
181 | echo "Generating summary karyotype file with plasmids that mapped more than" $coverage_cutoff_summary_percentage"%"
182 | if [ $reconstruct = true ];then
183 |
184 | awk '{print "chr -", $1, $1, "0", $2, "id="$1}' $input_file \
185 | >$output_dir/$file_name".karyotype_summary.txt" || error ${LINENO} $(basename $0) "Awk command for karyotype summary in $file_name\".karyotype_summary.txt\" creation. See $output_dir/logs for more information"
186 |
187 | awk '{print "chr -", $1, $1, "0", $2, "id="$1}' $input_file \
188 | >$output_dir/$file_name".karyotype_individual.txt" || error ${LINENO} $(basename $0) "Awk command for karyotype individual in $file_name\".karyotype_individual.txt\" creation. See $output_dir/logs for more information."
189 |
190 | else
191 | awk '
192 | {if ($2 == 0 && $5 < '"${coverage_cutoff_summary}"')
193 | {print "chr -", $1, $1, "0", $4, "id="$1}
194 | }
195 | ' $input_file \
196 | > $output_dir/$file_name".karyotype_summary.txt" || error ${LINENO} $(basename $0) "Awk command for karyotype summary in $file_name\".karyotype_summary.txt\" creation. See $output_dir/logs for more information."
197 |
198 |
199 | echo "Generating individual karyotype file with plasmids that mapped more than" $coverage_cutoff_individual_percentage"%"
200 |
201 | awk '
202 | {if ($2 == 0 && $5 < '"${coverage_cutoff_individual}"')
203 | {print "chr -", $1, $1, "0", $4, "id="$1}
204 | }
205 | ' $input_file \
206 | > $output_dir/$file_name".karyotype_individual.txt" || error ${LINENO} $(basename $0) "Awk command for karyotype individual in $file_name\".karyotype_individual.txt\" creation. See $output_dir/logs for more information"
207 |
208 | fi
209 |
210 | echo "$(date)"
211 | echo "Done Obtain list of cromosomes (idiogram) for CIRCOS karyotype file"
212 | echo "Files can be found at" $output_dir
213 | echo $(cat $output_dir/$file_name".karyotype_summary.txt" | wc -l) "sequences will be displayed on summary image"
214 | echo -e $(cat $output_dir/$file_name".karyotype_individual.txt" | wc -l) "images will be created individually" "\n"
215 |
--------------------------------------------------------------------------------
/bin/calculate_seqlen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
4 | #or a compound command returns a non-zero status: If errors are not handled by user
5 | set -e
6 |
7 | #=============================================================
8 | # HEADER
9 | #=============================================================
10 |
11 | #INSTITUTION:ISCIII
12 | #CENTRE:BU-ISCIII
13 | #AUTHOR: Pedro J. Sola
14 | VERSION=1.0
15 | #CREATED: 20 March 2018
16 | #REVISION:
17 | #DESCRIPTION:Script that convert a supplied SAM file into compressed binary indexed BAM
18 | #AKNOWLEDGE:
19 | # -Adapted from klashxx: https://stackoverflow.com/questions/23992646/sequence-length-of-fasta-file/23992773
20 | #================================================================
21 | # END_OF_HEADER
22 | #================================================================
23 |
24 | #SHORT USAGE RULES
25 | #LONG USAGE FUNCTION
26 | usage() {
27 | cat << EOF
28 |
29 | Calculate_sequlen script calculates a supplied FASTA length
30 |
31 | usage : $0 <-i inputfile(.fasta)> [-o ] [-n ] [-r] [-v] [-h]
32 |
33 | -i input file
34 | -o output directory (optional). By default the file is replaced in the same location
35 | -n file name (optional). By default is the same name with .length extension
36 | -r remove ">" (greater-than) symbol from fasta header
37 | -v version
38 | -h display usage message
39 |
40 | example: calculate_sequlen.sh -i ecoli.fasta
41 |
42 | EOF
43 | }
44 |
45 | #================================================================
46 | # OPTION_PROCESSING
47 | #================================================================
48 | #Make sure the script is executed with arguments
49 | if [ $# = 0 ] ; then
50 | usage >&2
51 | exit 1
52 | fi
53 |
54 | # Error handling
55 | error(){
56 | local parent_lineno="$1"
57 | local script="$2"
58 | local message="$3"
59 | local code="${4:-1}"
60 |
61 | RED='\033[0;31m'
62 | NC='\033[0m'
63 |
64 | if [[ -n "$message" ]] ; then
65 | echo -e "\n---------------------------------------\n"
66 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
67 | echo -e "MESSAGE:\n"
68 | echo -e "$message"
69 | echo -e "\n---------------------------------------\n"
70 | else
71 | echo -e "\n---------------------------------------\n"
72 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
73 | echo -e "\n---------------------------------------\n"
74 | fi
75 |
76 | exit "${code}"
77 | }
78 |
79 | #DECLARE FLAGS AND VARIABLES
80 | remove_head=remove_head_false
81 | cwd="$(pwd)"
82 | file_name="file_name"
83 | input_file="Input_file"
84 |
85 | #PARSE VARIABLE ARGUMENTS WITH getops
86 | #common example with letters, for long options check longopts2getopts.sh
87 | options=":i:o:n:rvh"
88 | while getopts $options opt; do
89 | case $opt in
90 | i )
91 | input_file=$OPTARG
92 | ;;
93 | o )
94 | output_dir=$OPTARG
95 | ;;
96 | n )
97 | filename=$OPTARG
98 | ;;
99 | r )
100 | remove_head="^>"
101 | ;;
102 | h )
103 | usage
104 | exit 1
105 | ;;
106 | v )
107 | echo $VERSION
108 | exit 1
109 | ;;
110 | \?)
111 | echo "Invalid Option: -$OPTARG" 1>&2
112 | usage
113 | exit 1
114 | ;;
115 | : )
116 | echo "Option -$OPTARG requires an argument." >&2
117 | exit 1
118 | ;;
119 | * )
120 | echo "Unimplemented option: -$OPTARG" >&2;
121 | exit 1
122 | ;;
123 |
124 | esac
125 | done
126 | shift $((OPTIND-1))
127 |
128 | #================================================================
129 | # MAIN_BODY
130 | #================================================================
131 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
132 |
133 | echo -e "\n#Executing" $0 "\n"
134 |
135 | check_mandatory_files.sh $input_file
136 |
137 | if [ ! $output_dir ]; then
138 | output_dir=$(dirname $input_file)
139 | #echo "Default output directory is" $output_dir
140 | mkdir -p $output_dir
141 | else
142 | #echo "Output directory is" $output_dir
143 | mkdir -p $output_dir
144 | fi
145 |
146 | if [ ! $filename ]; then
147 | filename=$(basename $input_file | cut -d. -f1)
148 | fi
149 |
150 | awk '
151 | BEGIN {FS=="| "}
152 | /^>/ {if (seqlen)
153 | print seqlen;printf "%s\t", $1; seqlen=0; next
154 | }
155 | {seqlen+=length($0)}
156 | END {print seqlen}' $input_file | sed 's/'$remove_head'//g' \
157 | >$output_dir/$filename".length" || error ${LINENO} $(basename $0) "Awk command for bedtools seqlen calculation in $filename\".length\" creation. See $output_dir/logs for more information."
158 |
159 | echo "$(date)"
160 | echo "Done seqlen calculation"
161 | echo "Files can be found at" $output_dir
162 |
--------------------------------------------------------------------------------
/bin/cdhit_cluster.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | #=============================================================
6 | # HEADER
7 | #=============================================================
8 |
9 | #INSTITUTION:ISCIII
10 | #CENTRE:BU-ISCIII
11 | #AUTHOR: Pedro J. Sola
12 | VERSION=1.0
13 | #CREATED: 6 April 2018
14 | #REVISION:
15 | #DESCRIPTION:Script that uses cd-hit/psi-cd-hit to clusterize a FASTA file
16 | #
17 | #DOCUMENTATION
18 | #
19 | #
20 | #Compare floats in BASH
21 | #
22 | #if [ $(echo "$cluster_cutoff > 0.7"|bc -l) -eq 1 ]; then
23 | # echo "YES"
24 | #else
25 | # echo "NO"
26 | #fi
27 | #
28 | #-d length of description in .clstr file, default 20. if set to 0,
29 | # it takes the fasta defline and stops at first space
30 | #-s length difference cutoff, default 0.0
31 | # if set to 0.9, the shorter sequences need to be
32 | # at least 90% length of the representative of the cluster
33 | #-B 1 or 0, default 0, by default, sequences are stored in RAM
34 | # if set to 1, sequence are stored on hard drive
35 | # it is recommended to use -B 1 for huge databases
36 | #-g 1 or 0, default 0
37 | # By cd-hit’s default algorithm, a sequence is clustered to the first
38 | # cluster that meet the threshold (fast mode). If set to 1, the program
39 | # will cluster it into the most similar cluster that meet the threshold
40 | # (accurate but slow mode)
41 | #
42 | # PSI-CD-HIT
43 | #-G (1/0) use global identity? default 1, sequence identity
44 | # calculated as total identical residues of local alignments
45 | # length of shorter sequence
46 | #
47 | #-n 5 for thresholds 0.7 ~ 1.0
48 | #-n 4 for thresholds 0.6 ~ 0.7
49 | #-n 3 for thresholds 0.5 ~ 0.6
50 | #-n 2 for thresholds 0.4 ~ 0.5
51 |
52 | #================================================================
53 | # END_OF_HEADER
54 | #================================================================
55 |
56 | #SHORT USAGE RULES
57 | #LONG USAGE FUNCTION
58 | usage() {
59 | cat << EOF
60 |
61 | Cdhit_cluster script uses cd-hit/psi-cd-hit to clusterize a FASTA file
62 |
63 | usage : $0 <-i inputfile(FASTA)> [-o ] [-n ] [-c ] [-H ]
64 | [-T ] [-g group_name] [-s ] [-M ][-C <(0|1)>] [-G <(0|1)>] [-b ] [p] [-v] [-h]
65 |
66 | -i input file in FASTA format
67 | -c percentage threshold to cluster, default 80
68 | -H cd-hit command, default cd-hit-est
69 | -M max available memory (Mbyte), default 400
70 | -n file name
71 | -s length difference cutoff, default 0.8
72 | -g group name (optional). If unset, samples will be gathered in NO_GROUP group
73 | -p runs psi-cd-hit instead of cd-hit
74 | -C psi-cd-hit only: circular sequences, default 1. If set to 0 sequence is assumed lineal
75 | -G psi-cd-hit only: gobal identity, -G 0 only takes the first local alignment for clustering
76 | -b psi-cd-hit only: choose blast program, default blastn
77 | -T number of threads
78 | -v version
79 | -h display usage message
80 |
81 |
82 | Output directory is the same as input directory
83 |
84 | example: cdhit_cluster -i ecoli.fasta -c 90 -M 50000 -T 0
85 | cdhit_cluster -H cd-hit -i ecoli.fasta -c 90 -M 50000 -T 0
86 |
87 |
88 | EOF
89 | }
90 |
91 | #================================================================
92 | # OPTION_PROCESSING
93 | #================================================================
94 | #Make sure the script is executed with arguments
95 | if [ $# = 0 ] ; then
96 | usage >&2
97 | exit 1
98 | fi
99 |
100 | # Error handling
101 | error(){
102 | local parent_lineno="$1"
103 | local script="$2"
104 | local message="$3"
105 | local code="${4:-1}"
106 |
107 | RED='\033[0;31m'
108 | NC='\033[0m'
109 |
110 | if [[ -n "$message" ]] ; then
111 | echo -e "\n---------------------------------------\n"
112 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
113 | echo -e "MESSAGE:\n"
114 | echo -e "$message"
115 | echo -e "\n---------------------------------------\n"
116 | else
117 | echo -e "\n---------------------------------------\n"
118 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
119 | echo -e "\n---------------------------------------\n"
120 | fi
121 |
122 | exit "${code}"
123 | }
124 |
125 | #DECLARE FLAGS AND VARIABLES
126 | cwd="$(pwd)"
127 | group="NO_GROUP"
128 | input_file="Input_file"
129 | cluster_cutoff=0.8
130 | max_memory=400
131 | length_cutoff=0.8
132 | cd_hit_command=cd-hit-est
133 | is_circle=1
134 | global_psi_cd_hit=1
135 | psi_cd_hit_program=blastn
136 | word_size=0
137 | threads=0
138 |
139 | #PARSE VARIABLE ARGUMENTS WITH getops
140 | #common example with letters, for long options check longopts2getopts.sh
141 | options=":i:o:c:M:n:s:g:C:G:b:T:H:pvh"
142 | while getopts $options opt; do
143 | case $opt in
144 | i )
145 | input_file=$OPTARG
146 | ;;
147 |
148 | c )
149 | cluster_cutoff_input=$OPTARG
150 | ;;
151 | g)
152 | group=$OPTARG
153 | ;;
154 | H)
155 | cd_hit_command=$OPTARG
156 | ;;
157 | M )
158 | max_memory=$OPTARG
159 | ;;
160 | n )
161 | file_name=$OPTARG
162 | ;;
163 | s )
164 | length_cutoff=$OPTARG
165 | ;;
166 | p )
167 | cd_hit_command=psi-cd-hit.pl
168 | ;;
169 | C )
170 | is_circle=$OPTARG
171 | ;;
172 | G)
173 | global_psi_cd_hit=$OPTARG
174 | ;;
175 | T)
176 | threads=$OPTARG
177 | ;;
178 | b)
179 | psi_cd_hit_program=$OPTARG
180 | ;;
181 | h )
182 | usage
183 | exit 1
184 | ;;
185 | v )
186 | echo $VERSION
187 | exit 1
188 | ;;
189 | \?)
190 | echo "Invalid Option: -$OPTARG" 1>&2
191 | usage
192 | exit 1
193 | ;;
194 | : )
195 | echo "Option -$OPTARG requires an argument." >&2
196 | exit 1
197 | ;;
198 | * )
199 | echo "Unimplemented option: -$OPTARG" >&2;
200 | exit 1
201 | ;;
202 |
203 | esac
204 | done
205 | shift $((OPTIND-1))
206 |
207 | #================================================================
208 | # MAIN_BODY
209 | #================================================================
210 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
211 |
212 | echo -e "\n#Executing" $0 "\n"
213 |
214 |
215 | check_mandatory_files.sh $input_file
216 |
217 | check_dependencies.sh cd-hit-est
218 | #psi-cd-hit.pl
219 |
220 |
221 |
222 | # Set word size (parameter -n for cd-hit) as author recomends
223 | #according to clustering percentage
224 |
225 |
226 | cluster_cutoff=$(echo "$cluster_cutoff_input / 100" | bc -l | sed 's/0\{1,\}$//')
227 | #cluster_cutoff=${cluster_cutoff%.*} #Remove float value
228 |
229 |
230 | if [[ "$cluster_cutoff_input" -gt 70 && "$cluster_cutoff_input" -le 100 ]]; then
231 | word_size=5
232 | elif [[ "$cluster_cutoff_input" -gt 60 && "$cluster_cutoff_input" -le 70 ]]; then
233 | word_size=4
234 | elif [[ "$cluster_cutoff_input" -gt 50 && "$cluster_cutoff_input" -le 60 ]]; then
235 | word_size=3
236 | elif [[ "$cluster_cutoff_input" -ge 40 && "$cluster_cutoff_input" -le 50 ]]; then
237 | word_size=2
238 | else
239 | echo "please introduce a valid cluster percentage value between 0.4 and 1"
240 | exit 1
241 | fi
242 |
243 |
244 |
245 | if [ ! $output_dir ]; then
246 | output_dir=$(dirname $input_file)
247 | echo "Default output directory is" $output_dir
248 | mkdir -p $output_dir
249 | else
250 | echo "Output directory is" $output_dir
251 | mkdir -p $output_dir
252 | fi
253 |
254 | if [ ! $file_name ]; then
255 | file_name=$(basename $input_file)
256 | echo "filename is" $file_name
257 | fi
258 |
259 | ##CD-HIT EXECUTION
260 |
261 | echo "$(date)"
262 | echo "Clustering sequences with identity" $cluster_cutoff_input"% or higher"
263 | echo "Using" $cd_hit_command "with file" $input_file
264 | seq_number_prev_clstr=$(cat $input_file | grep ">" | wc -l)
265 |
266 | cd $(dirname $input_file)
267 |
268 | if [ -f $output_dir/$file_name""_""$cluster_cutoff_input ]; then \
269 | echo "Found a clustered file for sample" $file_name;
270 | echo "Omitting clustering process calculation"
271 | exit 1
272 | else
273 | if [ $cd_hit_command == "psi-cd-hit.pl" ]; then
274 |
275 | check_dependencies.sh psi-cd-hit.pl
276 | $cd_hit_command -i $(basename $input_file) -o $file_name""_""$cluster_cutoff_input -c $cluster_cutoff -G $global_psi_cd_hit -g 1 -prog $psi_cd_hit_program -circle $is_circle -core $threads || error ${LINENO} $(basename $0) "PSI-CD-HIT command failed. See $output_dir/logs for more information."
277 |
278 | else
279 |
280 | $cd_hit_command -i $(basename $input_file) -o $file_name""_""$cluster_cutoff_input -c $cluster_cutoff -n $word_size -d 0 -s $length_cutoff -B 1 -M $max_memory -T $threads|| error ${LINENO} $(basename $0) "CD-HIT command failed. See $output_dir/logs for more information"
281 |
282 | fi
283 | fi
284 |
285 | seq_number_post_clstr=$(cat $file_name""_""$cluster_cutoff_input | grep ">" | wc -l)
286 |
287 | echo "$(date)"
288 | echo "DONE Clustering sequences with identity" $cluster_cutoff_input"% or higher"
289 | echo "fasta file can be found in" $output_dir/$file_name""_""$cluster_cutoff_input
290 | echo "Previous number of sequences=" $seq_number_prev_clstr
291 | echo -e "Number of sequences after clustering=" $seq_number_post_clstr "\n"
292 | cd $cwd
293 |
--------------------------------------------------------------------------------
/bin/check_dependencies.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #=============================================================
4 | # HEADER
5 | #=============================================================
6 |
7 | #INSTITUTION:ISCIII
8 | #CENTRE:BU-ISCIII
9 | #AUTHOR: Pedro J. Sola
10 | VERSION=1.0
11 | #CREATED: 19 March 2018
12 | #REVISION: 12 July 2018: add formated output and colors
13 | #AKNOWLEDGE: Colored text: https://stackoverflow.com/questions/5947742/how-to-change-the-output-color-of-echo-in-linux
14 | #DESCRIPTION:Short function to evaluate if programs are on path
15 |
16 | #================================================================
17 | # END_OF_HEADER
18 | #================================================================
19 |
20 | #SHORT USAGE RULES
21 | #LONG USAGE FUNCTION
22 | usage() {
23 | cat << EOF
24 |
25 | Check_dependencies Short function to evaluate if files exist
26 |
27 | usage : $0 [program_name2] ...
28 |
29 | example: lib/check_dependencies.sh foo bar
30 |
31 | EOF
32 | }
33 |
34 | if [ $# = 0 ] ; then
35 | usage >&2
36 | exit 1
37 | fi
38 |
39 | #DECLARE FLAGS AND VARIABLES
40 | missing_dependencies=0
41 |
42 | #SET COLORS
43 |
44 | RED='\033[0;31m'
45 | GREEN='\033[0;32m'
46 | NC='\033[0m'
47 |
48 | printf '\n%s\t%20s\n' "DEPENDENCY" "STATUS"
49 | printf '%s\t%20s\n' "----------" "------"
50 |
51 | for command in "$@"; do
52 | #dependency_version=$($command --version)
53 | length_command=$(echo $command | wc -m)
54 | distance_table=$((30 - $length_command))
55 | distance_expression=$(echo "%${distance_table}s")
56 |
57 | printf '%s' $command
58 | if ! [ -x "$(which $command 2> /dev/null)" ]; then
59 |
60 |
61 | printf $distance_expression
62 | printf "${RED}NOT INSTALLED${NC} \n"
63 | let missing_dependencies++
64 | else
65 | printf $distance_expression
66 | printf "${GREEN}INSTALLED${NC} \n"
67 | fi
68 | done
69 |
70 | if [ $missing_dependencies -gt 0 ]; then
71 | printf "${RED}ERROR${NC}: $missing_dependencies missing dependencies, aborting execution\n" >&2
72 | exit 1
73 | fi
74 |
75 |
--------------------------------------------------------------------------------
/bin/check_mandatory_files.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #=============================================================
4 | # HEADER
5 | #=============================================================
6 |
7 | #INSTITUTION:ISCIII
8 | #CENTRE:BU-ISCIII
9 | #AUTHOR: Pedro J. Sola
10 | VERSION=1.0
11 | #CREATED: 19 March 2018
12 | #REVISION:
13 | #DESCRIPTION:Short function to evaluate if files exist
14 |
15 | #================================================================
16 | # END_OF_HEADER
17 | #================================================================
18 |
19 | #SHORT USAGE RULES
20 | #LONG USAGE FUNCTION
21 | usage() {
22 | cat << EOF
23 |
24 | Check_mandatory_files Short function to evaluate if files exist
25 |
26 | usage : $0 [file2] ...
27 |
28 | example: lib/check_mandatory_files.sh foo.txt bar.fasta
29 |
30 | EOF
31 | }
32 |
33 | if [ $# = 0 ] ; then
34 | usage >&2
35 | exit
36 | fi
37 |
38 | #DECLARE FLAGS AND VARIABLES
39 | missing_files=0
40 |
41 | for file in "$@"; do
42 | if [ ! -f $file ]; then
43 | echo "$(basename $file)" "not supplied, please, introduce a valid file" >&2
44 | let missing_files++
45 | fi
46 | done
47 |
48 | if [ $missing_files -gt 0 ]; then
49 | echo "ERROR: $missing_files missing files, aborting execution" >&2
50 | exit 1
51 | fi
--------------------------------------------------------------------------------
/bin/coordinate_adapter.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
4 | #or a compound command returns a non-zero status: If errors are not handled by user
5 | #set -e
6 | #set -x
7 |
8 | #=============================================================
9 | # HEADER
10 | #=============================================================
11 |
12 | #INSTITUTION:ISCIII
13 | #CENTRE:BU-ISCIII
14 | #AUTHOR: Pedro J. Sola
15 | VERSION=1.0
16 | #CREATED: 17 May 2018
17 | #REVISION:
18 | #DESCRIPTION:coordinate_adapter script adapt coordinates obtained with a bed file to a reference sequences in a link file
19 | #
20 | #
21 | #================================================================
22 | # END_OF_HEADER
23 | #================================================================
24 |
25 | #SHORT USAGE RULES
26 | #LONG USAGE FUNCTION
27 | usage() {
28 | cat << EOF
29 |
30 | coordinate_adapter script adapt coordinates obtained with a bed file to a reference sequences in a link file
31 |
32 | usage : $0 <-i inputfile(.bed)> <-l link_file> [-o ] [-n ] [-f ] [-u] [-v] [-h]
33 |
34 | -i input file in bed format
35 | -l link file with coordinates relationship within bed file ddbb and link reference
36 | -o output directory (optional). By default the file is placed in the same location as input
37 | -n length to extend annotation, default 2000
38 | -f file name
39 | -u uniq mode. Remove duplicates
40 | -p prokka mode. Remove suffix of prokka
41 | -v version
42 | -h display usage message
43 |
44 | example: ./coordinate_adapter.sh -i genes.bed -l ecoli.links -n 10000
45 |
46 | EOF
47 | }
48 |
49 | #================================================================
50 | # OPTION_PROCESSING
51 | #================================================================
52 | #Make sure the script is executed with arguments
53 | if [ $# = 0 ] ; then
54 | usage >&2
55 | exit 1
56 | fi
57 |
58 | # Error handling
59 | error(){
60 | local parent_lineno="$1"
61 | local script="$2"
62 | local message="$3"
63 | local code="${4:-1}"
64 |
65 | RED='\033[0;31m'
66 | NC='\033[0m'
67 |
68 | if [[ -n "$message" ]] ; then
69 | echo -e "\n---------------------------------------\n"
70 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
71 | echo -e "MESSAGE:\n"
72 | echo -e "$message"
73 | echo -e "\n---------------------------------------\n"
74 | else
75 | echo -e "\n---------------------------------------\n"
76 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
77 | echo -e "\n---------------------------------------\n"
78 | fi
79 |
80 | exit "${code}"
81 | }
82 |
83 | #DECLARE FLAGS AND VARIABLES
84 | cwd="$(pwd)"
85 | input_file="Bed_file"
86 | link_file="Link_file"
87 | number_extension=2000
88 | unique=false
89 | prokka_mode=false
90 | suffix=""
91 |
92 | #PARSE VARIABLE ARGUMENTS WITH getops
93 | #common example with letters, for long options check longopts2getopts.sh
94 | options=":i:l:n:f:puvh"
95 | while getopts $options opt; do
96 | case $opt in
97 | i )
98 | input_file=$OPTARG
99 | ;;
100 | l )
101 | link_file=$OPTARG
102 | ;;
103 | o )
104 | output_dir=$OPTARG
105 | ;;
106 | n )
107 | number_extension=$OPTARG
108 | ;;
109 | f)
110 | file_name=$OPTARG
111 | ;;
112 | u )
113 | unique=true
114 | suffix=".unique.tmp"
115 | ;;
116 | p )
117 | prokka_mode=true
118 | suffix=".prokka.tmp"
119 | ;;
120 | h )
121 | usage
122 | exit 1
123 | ;;
124 | v )
125 | echo $VERSION
126 | exit 1
127 | ;;
128 | \?)
129 | echo "Invalid Option: -$OPTARG" 1>&2
130 | usage
131 | exit 1
132 | ;;
133 | : )
134 | echo "Option -$OPTARG requires an argument." >&2
135 | exit 1
136 | ;;
137 | * )
138 | echo "Unimplemented option: -$OPTARG" >&2;
139 | exit 1
140 | ;;
141 |
142 | esac
143 | done
144 | shift $((OPTIND-1))
145 |
146 |
147 | #================================================================
148 | # MAIN_BODY
149 | #================================================================
150 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
151 |
152 | echo -e "\n#Executing" $0 "\n"
153 |
154 | check_mandatory_files.sh $input_file $link_file
155 |
156 | if [ ! $output_dir ]; then
157 | output_dir=$(dirname $input_file)
158 | #echo "Default output directory is" $output_dir
159 | mkdir -p $output_dir
160 | else
161 | #echo "Output directory is" $output_dir
162 | mkdir -p $output_dir
163 | fi
164 |
165 |
166 | if [ ! $file_name ]; then
167 | file_name=$(basename $input_file | cut -d. -f1,2)
168 | fi
169 |
170 |
171 | echo "$(date)"
172 | echo "adapting coordinates from" $input_file and $link_file
173 | echo "file name is:" $file_name
174 |
175 | #Create a dictionary file with all posibilities: Column 1 and 5 must have some common terms
176 | awk 'NR==FNR{a[NR]=$1;b[NR]=$0;next}{for(i = 1; i <= NR; ++i){if (a[i] == $1) print b[i],"\t", $0}}' \
177 | $input_file $link_file > $output_dir/$file_name".coordinates.tmp" || error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates.tmp\" creation. See $output_dir/logs for more information."
178 |
179 | awk '(($2 >= $6 - '"${number_extension}"' && $2 <= $7) || ($3 >= $6 && $3 <= $7 + '"${number_extension}"')) {{isInverted=($10-$9); \
180 | genelength=($3-$2)};{if (isInverted < 0) {coordChr1=(($7-$3)+$10);} else {coordChr1=(($2-$6)+$9)}}; \
181 | coordChr2=(coordChr1+genelength); {print $8, coordChr1, coordChr2, $4}}' $output_dir/$file_name".coordinates.tmp" > $output_dir/$file_name".coordinates.negatives"|| error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates.negatives\" creation. See $output_dir/logs for more information."
182 |
183 |
184 | #resulting in a bed file with coordinated of plasmid bur refering to contig annotation:
185 | #NZ_CP010574.1 34820 33528 arsB_1
186 | #NZ_CP008930.1 90527 89235 arsB_1
187 | #NZ_CP006927.1 44969 43677 arsB_1
188 | #NZ_CP010574.1 81021 82508 ltrA_1
189 | #NZ_CP008930.1 144220 145707 ltrA_1
190 |
191 |
192 | #Remove duplicate of several matches
193 |
194 | awk '($2 > 0) && ($3 > 0)' $output_dir/$file_name".coordinates.negatives" \
195 | > $output_dir/$file_name".coordinates"$suffix || error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates$suffix\" creation. See $output_dir/logs for more information."
196 |
197 |
198 | if [ "$unique" == "true" ]; then
199 | awk '
200 | (!x[$1$4]++)
201 | ' $output_dir/$file_name".coordinates"$suffix \
202 | > $output_dir/$file_name".coordinates" || error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates\" creation. See $output_dir/logs for more information."
203 |
204 | rm $output_dir/$file_name".coordinates"$suffix
205 | fi
206 |
207 | if [ "$prokka_mode" == "true" ]; then
208 |
209 | awk '
210 | (!uniq[$1$4]++)
211 | ' $output_dir/$file_name".coordinates"$suffix \
212 | > $output_dir/$file_name".coordinates.prokka.unique.tmp"|| error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates.prokka.unique.tmp\" creation. See $output_dir/logs for more information."
213 |
214 |
215 | awk '
216 | BEGIN{OFS="\t"}{split($4, namelowbar, "_")} {$4=($4 !~ /CDS/) ? namelowbar[1] : $4}1
217 | ' $output_dir/$file_name".coordinates.prokka.unique.tmp" \
218 | > $output_dir/$file_name".coordinates" || error ${LINENO} $(basename $0) "Awk command in $file_name\".coordinates\" creation. See $output_dir/logs for more information."
219 |
220 | rm $output_dir/$file_name".coordinates.prokka.unique.tmp"
221 | rm $output_dir/$file_name".coordinates"$suffix
222 |
223 | fi
224 |
225 | rm $output_dir/$file_name".coordinates.tmp"
226 | rm $output_dir/$file_name".coordinates.negatives"
227 |
228 |
229 | echo "$(date)"
230 | echo -e "Coordinates adapted to file" $output_dir/$file_name".coordinates" "\n"
231 |
--------------------------------------------------------------------------------
/bin/download_plasmid_database.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Standard library imports
4 | import os
5 | import sys
6 | import logging
7 |
8 | # Third party imports
9 | import argparse
10 | import datetime
11 | import pandas as pd
12 | import Bio
13 | from Bio import Entrez
14 | from Bio import SeqIO
15 |
16 | logger = logging.getLogger()
17 |
18 | """
19 | =============================================================
20 | HEADER
21 | =============================================================
22 | FUNCTION: Download up to date plasmid database from https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/plasmids.txt.
23 | Remove those sequences with terms not related to complete plasmid such: gene, protein, partial, putative or hypothetical
24 |
25 | INSTITUTION:CNM-ISCIII
26 | AUTHOR: Pedro J. Sola (pedroscampoy@gmail.com)
27 | d^v^b
28 | VERSION=0.1
29 | CREATED: 26 February 2020
30 | REVISION:
31 |
32 | TODO:
33 | add user defined terms
34 | filter by record size (len(record))
35 | ================================================================
36 | END_OF_HEADER
37 | ================================================================
38 | """
39 |
40 |
41 | def check_create_dir(path):
42 | if os.path.exists(path):
43 | pass
44 | else:
45 | os.mkdir(path)
46 |
47 |
48 | def main():
49 |
50 | def get_arguments():
51 |
52 | parser = argparse.ArgumentParser(
53 | prog='download_plasmid_database.py', description='Download up to date plasmid database from ncbi ftp')
54 |
55 | parser.add_argument('-o', '--output', type=str, required=True,
56 | help='REQUIRED. Output directory to extract plasmid database')
57 |
58 | arguments = parser.parse_args()
59 |
60 | return arguments
61 |
62 | args = get_arguments()
63 |
64 | output_dir = os.path.abspath(args.output)
65 |
66 | check_create_dir(output_dir)
67 |
68 | # LOGGING
69 | # Create log file with date and time
70 | today = str(datetime.date.today())
71 | right_now_full = "".join(today.split("-"))
72 |
73 | log_filename = 'plasmidID_database' + "_" + right_now_full + ".log"
74 | log_full_path = os.path.join(output_dir, log_filename)
75 |
76 | logger = logging.getLogger()
77 | logger.setLevel(logging.DEBUG)
78 |
79 | formatter = logging.Formatter('%(asctime)s:%(message)s')
80 |
81 | file_handler = logging.FileHandler(log_full_path)
82 | file_handler.setLevel(logging.DEBUG)
83 | file_handler.setFormatter(formatter)
84 |
85 | stream_handler = logging.StreamHandler()
86 | stream_handler.setLevel(logging.INFO)
87 | # stream_handler.setFormatter(formatter)
88 |
89 | logger.addHandler(stream_handler)
90 | logger.addHandler(file_handler)
91 |
92 | #####################START PIPELINE################
93 |
94 | logger.debug(args)
95 |
96 | plasmid_text_file = today + "_plasmids.txt"
97 | plasmid_text_path = os.path.join(output_dir, plasmid_text_file)
98 |
99 | plasmid_fasta_file = today + "_plasmids.fasta"
100 | plasmid_fasta_path = os.path.join(output_dir, plasmid_fasta_file)
101 |
102 | plasmid_failed_file = today + "failed_plasmids.txt"
103 | plasmid_failed_path = os.path.join(output_dir, plasmid_failed_file)
104 |
105 | try:
106 | df = pd.read_csv(
107 | 'https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/plasmids.txt', sep='\t')
108 | except:
109 | logger.info('there was a problem accessing the ftp')
110 | sys.exit(1)
111 |
112 | df.to_csv(plasmid_text_path, sep='\t', index=False)
113 |
114 | plasmid_reference = df['RefSeq'][df.RefSeq !=
115 | "-"].tolist() + df['INSDC'][df.RefSeq == "-"].tolist()
116 |
117 | # remove duplicates
118 | plasmid_reference = set(plasmid_reference)
119 | # Set terms to exclude
120 | terms_to_exclude = ['gene ', 'protein',
121 | 'partial', 'putative', 'hypothetical']
122 | # Dictionary with erroneous accession numbers to determine the reason
123 | erroneous = {}
124 |
125 | Entrez.email = "A.N.Other@example.com"
126 |
127 | total_sequences = len(plasmid_reference)
128 | current_record = 1
129 | logger.info("")
130 | logger.info("Starting plasmid database download script: " +
131 | str(total_sequences) + " will be downloaded")
132 | logger.info("This will take a while.\nCheck progress in " + log_full_path)
133 |
134 | with open(plasmid_fasta_path, 'w+') as output_handle:
135 | for plasmid_accnumber in plasmid_reference:
136 | try:
137 | handle = Entrez.efetch(
138 | db="nucleotide", id=plasmid_accnumber, rettype="fasta", retmode="text")
139 | record = SeqIO.read(handle, "fasta")
140 | terms_present = [
141 | x in record.description for x in terms_to_exclude]
142 | handle.close()
143 | if sum(terms_present) > 0:
144 | terms_true = [terms_to_exclude[i]
145 | for i, x in enumerate(terms_present) if x == True]
146 | erroneous[record.id] = "Include terms: " + \
147 | ', '.join(terms_true) + " => " + record.description
148 | logger.debug(" %s/%s Invalid terms in record %s" %
149 | (current_record, total_sequences, record.id))
150 | else:
151 | logger.debug(" %s/%s Downloading record %s" %
152 | (current_record, total_sequences, record.id))
153 | SeqIO.write(record, output_handle, "fasta")
154 | except:
155 | logger.debug(" %s/%s Failed to download %s" %
156 | (current_record, total_sequences, record.id))
157 | erroneous[record.id] = "failed to download"
158 | current_record = current_record + 1
159 |
160 | if len(erroneous) > 0:
161 | with open(plasmid_failed_path, 'w+') as ferror:
162 | for acc, reason in erroneous.items():
163 | ferror.write(acc + ": " + reason + "\n")
164 |
165 | logger.info("ALL DONE\nFASTA file is available in: " + plasmid_fasta_path)
166 |
167 |
168 | if __name__ == '__main__':
169 | try:
170 | main()
171 | except Exception as e:
172 | logger.exception(e)
173 | raise
174 |
--------------------------------------------------------------------------------
/bin/draw_circos_images.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
4 | #or a compound command returns a non-zero status: If errors are not handled by user
5 | set -e
6 | #set -x
7 | #=============================================================
8 | # HEADER
9 | #=============================================================
10 |
11 | #INSTITUTION:ISCIII
12 | #CENTRE:BU-ISCIII
13 | #AUTHOR: Pedro J. Sola
14 | VERSION=1.0
15 | #CREATED: 01 May 2018
16 | #REVISION:
17 | # 11 July 2018: Apply good practices bash
18 | # Include independent files
19 | # Include several databases
20 | # 13 July 2018: Include log file
21 | # manage directories
22 | #DESCRIPTION:Script that creates and execute a cicos config file for plasmidID
23 | #
24 | #
25 | #
26 | #================================================================
27 | # END_OF_HEADER
28 | #================================================================
29 |
30 | #SHORT USAGE RULES
31 | #LONG USAGE FUNCTION
32 | usage() {
33 | cat << EOF
34 |
35 | draw_circos_image script that creates and execute a cicos config file for plasmidID
36 |
37 | usage : $0 <-i input_directory> <-d config_files_directory> <-C config_file> <-s sample> <-g <-o [-l ] [-V] [-c] [-v] [-h]
38 |
39 | -i input directory containing files to represent
40 | -d directory containing config files
41 | -C config file selected to draw
42 | -s sample
43 | -g group
44 | -l log file
45 | -o output directory to create config and pictures
46 | -c clean: remove config files
47 | -v version
48 | -V verbose
49 | -h display usage message
50 |
51 | EOF
52 | }
53 |
54 | #================================================================
55 | # OPTION_PROCESSING
56 | #================================================================
57 | #Make sure the script is executed with arguments
58 | if [ $# = 0 ] ; then
59 | usage >&2
60 | exit 1
61 | fi
62 |
63 | # Error handling
64 | error(){
65 | local parent_lineno="$1"
66 | local script="$2"
67 | local message="$3"
68 | local code="${4:-1}"
69 |
70 | RED='\033[0;31m'
71 | NC='\033[0m'
72 |
73 | if [[ -n "$message" ]] ; then
74 | echo -e "\n---------------------------------------\n"
75 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
76 | echo -e "MESSAGE:\n"
77 | echo -e "$message"
78 | echo -e "\n---------------------------------------\n"
79 | else
80 | echo -e "\n---------------------------------------\n"
81 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
82 | echo -e "\n---------------------------------------\n"
83 | fi
84 |
85 | exit "${code}"
86 | }
87 |
88 | #DECLARE FLAGS AND VARIABLES
89 |
90 | cwd="$(pwd)"
91 | clean=false
92 | verbose=false
93 |
94 |
95 | #PARSE VARIABLE ARGUMENTS WITH getops
96 | #common example with letters, for long options check longopts2getopts.sh
97 | options=":i:m:o:g:l:s:d:C:cVvh"
98 | while getopts $options opt; do
99 | case $opt in
100 | i )
101 | input_dir=$OPTARG
102 | ;;
103 | o )
104 | output_dir=$OPTARG
105 | ;;
106 | d )
107 | config_dir=$OPTARG
108 | ;;
109 | C )
110 | config_file_individual=$OPTARG
111 | ;;
112 | l )
113 | log_file=$OPTARG
114 | ;;
115 | g )
116 | group=$OPTARG
117 | ;;
118 | s )
119 | sample=$OPTARG
120 | ;;
121 | c )
122 | clean=true
123 | ;;
124 | h )
125 | usage
126 | exit 1
127 | ;;
128 | V )
129 | verbose=true
130 | ;;
131 | v )
132 | echo $VERSION
133 | exit 1
134 | ;;
135 | \?)
136 | echo "Invalid Option: -$OPTARG" 1>&2
137 | usage
138 | exit 1
139 | ;;
140 | : )
141 | echo "Option -$OPTARG requires an argument." >&2
142 | exit 1
143 | ;;
144 | * )
145 | echo "Unimplemented option: -$OPTARG" >&2;
146 | exit 1
147 | ;;
148 |
149 | esac
150 | done
151 | shift $((OPTIND-1))
152 |
153 | #================================================================
154 | # MAIN_BODY
155 | #================================================================
156 |
157 | imageDir=$input_dir"/data"
158 |
159 | if [ -f $log_file ]; then
160 | rm $log_file
161 | fi
162 |
163 | echo -e "\n#Executing" $0 "\n" &>> $log_file
164 |
165 | cdsDdbb_file=$input_dir/database/$sample".gff.bed"
166 | cdsDdbb_file_forward=$input_dir/database/$sample".gff.forward.bed"
167 | cdsDdbb_file_reverse=$input_dir/database/$sample".gff.reverse.bed"
168 |
169 |
170 | circos_conf_summary="$config_dir/circos_summary_1_3_3.conf"
171 | circos_conf_individual="$config_dir/$config_file_individual"
172 | circosDir="$output_dir"
173 |
174 |
175 | plasmidMapped=$imageDir/$sample".coverage_adapted_clustered_ac"
176 |
177 | karyotype_file_individual=$imageDir/$sample".karyotype_individual.txt"
178 | karyotype_file_summary=$imageDir/$sample".karyotype_summary.txt"
179 | annotation_text_file=$imageDir/pID_text_annotation.coordinates
180 | annotation_highlights_file=$imageDir/pID_highlights.conf
181 |
182 | coverage_file=$imageDir/$sample".bedgraph_term"
183 | cds_contig_file=$imageDir/$sample".gff.coordinates"
184 | cds_contig_file_forward=$imageDir/$sample".gff.forward.coordinates"
185 | cds_contig_file_reverse=$imageDir/$sample".gff.reverse.coordinates"
186 |
187 |
188 | contig_file=$imageDir/$sample".plasmids.bed"
189 | contig_file_complete=$imageDir/$sample".plasmids.complete"
190 | links_file=$imageDir/$sample".plasmids.links"
191 |
192 | imageName=$sample"_summary.png"
193 |
194 | mkdir -p $circosDir
195 |
196 |
197 | echo "Creating individual config file for SAMPLE $sample using FILE $circos_conf_individual" &>> $log_file
198 |
199 | awk '{gsub("PLASMID_KARYOTYPE","'$karyotype_file_individual'"); \
200 | gsub("PLASMID_SPECIFIC_TEXT","'$annotation_text_file'"); \
201 | gsub("PID_ALL_HIGHLIGHTS","'$annotation_highlights_file'"); \
202 | gsub("PLASMID_COVERAGE_GRAPH","'$coverage_file'"); \
203 | gsub("PLASMID_CDS_CONTIG","'$cds_contig_file'"); \
204 | gsub("PLASMID_CDS_FORWARD","'$cds_contig_file_forward'"); \
205 | gsub("PLASMID_CDS_REVERSE","'$cds_contig_file_reverse'"); \
206 | gsub("PLASMID_CDS_DDBB","'$cdsDdbb_file'"); \
207 | gsub("CDS_DDBB_FORWARD","'$cdsDdbb_file_forward'"); \
208 | gsub("CDS_DDBB_REVERSE","'$cdsDdbb_file_reverse'"); \
209 | gsub("PLASMID_CONTIGS_COMPLETE","'$contig_file_complete'"); \
210 | gsub("PLASMID_CONTIGS","'$contig_file'"); \
211 | gsub("OUTPUTDIR","'$circosDir'"); \
212 | print $0}' $circos_conf_individual > $circosDir/$sample"_individual.circos.conf"
213 |
214 | echo "DONE Creating config file for circos in SAMPLE $sample" &>> $log_file
215 |
216 | echo "Executing circos in SAMPLE $sample" &>> $log_file
217 |
218 |
219 |
220 | for i in $(cat $plasmidMapped)
221 | do
222 | echo "Creating image for plasmid $i in sample $sample" &>> $log_file
223 | awk '{gsub("SAMPLE_SHOWN","'$i'"); \
224 | gsub("IMAGENAME_SAMPLE_PLASMID","'$sample'_'$i'.png"); \
225 | print $0}' $circosDir/$sample"_individual.circos.conf" > $circosDir/$sample"_"$i"_individual.circos.conf"
226 | if [ $verbose = true ];then
227 | $(circos -conf $circosDir/$sample"_"$i"_individual.circos.conf" |& tee -a $log_file) || error ${LINENO} $(basename $0) "Circos command for individual image has failed. See $output_dir/logs for more information"
228 | else
229 | $(circos -conf $circosDir/$sample"_"$i"_individual.circos.conf" &>> $log_file) || error ${LINENO} $(basename $0) "Circos command for individual image has failed. See $output_dir/logs for more information"
230 | fi
231 | done
232 |
233 |
234 | if [ -s $karyotype_file_summary ]; then
235 |
236 | echo "Creating summary image for in sample" $sample "from FILE" $circos_conf_summary &>> $log_file
237 |
238 | awk '{gsub("PLASMID_KARYOTYPE","'$karyotype_file_summary'"); \
239 | gsub("PLASMID_SPECIFIC_TEXT","'$annotation_text_file'"); \
240 | gsub("PID_ALL_HIGHLIGHTS","'$annotation_highlights_file'"); \
241 | gsub("PLASMID_COVERAGE_GRAPH","'$coverage_file'"); \
242 | gsub("PLASMID_CDS_CONTIG","'$cds_contig_file'"); \
243 | gsub("PLASMID_CDS_FORWARD","'$cds_contig_file_forward'"); \
244 | gsub("PLASMID_CDS_REVERSE","'$cds_contig_file_reverse'"); \
245 | gsub("PLASMID_CDS_DDBB","'$cdsDdbb_file'"); \
246 | gsub("PLASMID_CONTIGS","'$contig_file'"); \
247 | gsub("PLASMID_LINKS","'$links_file'"); \
248 | gsub("OUTPUTDIR","'$circosDir'"); \
249 | gsub("IMAGENAME","'$imageName'"); \
250 | print $0}' $circos_conf_summary > $circosDir/$sample"_summary.circos.conf"
251 |
252 | if [ $verbose = true ]; then
253 | circos -conf $circosDir/$sample"_summary.circos.conf" |& tee -a $log_file || exit 1
254 | else
255 | circos -conf $circosDir/$sample"_summary.circos.conf" &>> $log_file || exit 1
256 | fi
257 |
258 | else
259 |
260 | echo "No plasmid matched requirements to draw the summary image"
261 |
262 | fi
263 |
264 |
265 | #Remove config files
266 | if [ clean = true ];then
267 | for i in $(cat $plasmidMapped)
268 | do
269 | if [ -f $circosDir/$sample"_"$i"_individual.circos.conf" ]; then
270 | rm $circosDir/$sample"_"$i"_individual.circos.conf"
271 | fi
272 | done
273 |
274 | rm $circosDir/$sample"_summary.circos.conf"
275 | rm $circosDir/$sample"_individual.circos.conf"
276 | fi
277 |
278 | echo "DONE, files can be found at $circosDir"
279 |
--------------------------------------------------------------------------------
/bin/filter_fasta.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
4 | #or a compound command returns a non-zero status: If errors are not handled by user
5 | set -e
6 | #set -x
7 | #=============================================================
8 | # HEADER
9 | #=============================================================
10 |
11 | #INSTITUTION:ISCIII
12 | #CENTRE:BU-ISCIII
13 | #AUTHOR: Pedro J. Sola
14 | VERSION=1.0
15 | #CREATED: 21 March 2018
16 | #REVISION:
17 | # 22 March 2018: Handle output directory by default the same as -f file
18 | # 13 April 2018: Include -G option to filter any file by term with both file or term
19 | #DESCRIPTION:Script that extract sequences by term, either by key or file with a list
20 | #AKNOWLEDGE:
21 | # -Multiple arguments in one flag: https://stackoverflow.com/questions/7529856/retrieving-multiple-arguments-for-a-single-option-using-getopts-in-bash
22 | #TODO:
23 | # -Add and remove sequences in the same execution
24 | #================================================================
25 | # END_OF_HEADER
26 | #================================================================
27 |
28 | #SHORT USAGE RULES
29 | #LONG USAGE FUNCTION
30 | usage() {
31 | cat << EOF
32 |
33 | Filter_fasta script that extract sequences by term, either by key or file with a list
34 |
35 | usage : $0 <-i file.fasta> <(-l term1 -l term2 -l term3 | -f file)> [-n ] [-o ] [-G] [-N] [-v] [-h]
36 |
37 | -i fasta file to filter
38 | -o output directory (optional). By default the file is replaced in the same location
39 | -n file name (optional). By default is the same as -f file with .fasta extension
40 | -l list of key terms separated by space
41 | -N Use term to discard sequences with terms (Negative filter)
42 | -G General filter: filter any file with a list of keys
43 | -f file with a list of terms to filter
44 | -v version
45 | -h display usage message
46 |
47 | example: filter_fasta.sh -i ecoli.fasta -l NC00012 -l WC52247 -l hypothetical -l partial -n NAME
48 | filter_fasta.sh -i ecoli.fasta -l "NC00012 WC52247 hypothetical partial"
49 | filter_fasta.sh -i ecoli.fasta -f list_with_terms.txt
50 |
51 | EOF
52 | }
53 |
54 | #================================================================
55 | # OPTION_PROCESSING
56 | #================================================================
57 | #Make sure the script is executed with arguments
58 | if [ $# = 0 ] ; then
59 | usage >&2
60 | exit 1
61 | fi
62 |
63 | # Error handling
64 | error(){
65 | local parent_lineno="$1"
66 | local script="$2"
67 | local message="$3"
68 | local code="${4:-1}"
69 |
70 | RED='\033[0;31m'
71 | NC='\033[0m'
72 |
73 | if [[ -n "$message" ]] ; then
74 | echo -e "\n---------------------------------------\n"
75 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
76 | echo -e "MESSAGE:\n"
77 | echo -e "$message"
78 | echo -e "\n---------------------------------------\n"
79 | else
80 | echo -e "\n---------------------------------------\n"
81 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
82 | echo -e "\n---------------------------------------\n"
83 | fi
84 |
85 | exit "${code}"
86 | }
87 |
88 | #DECLARE FLAGS AND VARIABLES
89 | term_option=false
90 | file_option=false
91 | general_filter=false
92 | negative_filter=""
93 | cwd="$(pwd)"
94 | input_file="Input_file"
95 |
96 | #PARSE VARIABLE ARGUMENTS WITH getops
97 | #common example with letters, for long options check longopts2getopts.sh
98 | options=":i:o:n:l:f:GNvh"
99 | while getopts $options opt; do
100 | case $opt in
101 | i )
102 | input_file=$OPTARG
103 | ;;
104 | o )
105 | output_dir=$OPTARG
106 | ;;
107 | n )
108 | file_name=$OPTARG
109 | ;;
110 | N )
111 | negative_filter="!"
112 | ;;
113 | G )
114 | general_filter=true
115 | ;;
116 | l )
117 | terms_for_filtering+=($OPTARG)
118 | term_option=true
119 | ;;
120 | f )
121 | file_for_filtering=$OPTARG
122 | check_mandatory_files.sh $input_file
123 | file_option=true
124 | ;;
125 | h )
126 | usage
127 | exit 1
128 | ;;
129 | v )
130 | echo $VERSION
131 | exit 1
132 | ;;
133 | \?)
134 | echo "Invalid Option: -$OPTARG" 1>&2
135 | usage
136 | exit 1
137 | ;;
138 | : )
139 | echo "Option -$OPTARG requires an argument." >&2
140 | exit 1
141 | ;;
142 | * )
143 | echo "Unimplemented option: -$OPTARG" >&2;
144 | exit 1
145 | ;;
146 |
147 | esac
148 | done
149 | shift $((OPTIND-1))
150 |
151 | #================================================================
152 | # MAIN_BODY
153 | #================================================================
154 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
155 |
156 | echo -e "\n#Executing" $0 "\n"
157 |
158 | check_mandatory_files.sh $input_file
159 |
160 |
161 | if [ $general_filter = true ]; then
162 | file_name=$(basename $input_file)
163 | output_dir=$(dirname $input_file)
164 | fi
165 |
166 | #MANAGE OUTPUT DIRECTORY
167 | if [ $file_option = true ] && [ ! $output_dir ]; then
168 | output_dir=$(dirname $file_for_filtering)
169 | echo "Output directory is" $output_dir
170 | mkdir -p $output_dir
171 | elif [ $file_option = false ] && [ ! $output_dir ]; then
172 | echo "please, provide an output directory" $output_dir
173 | exit 1
174 | else
175 | echo "Output directory is=" $output_dir
176 | mkdir -p $output_dir
177 | fi
178 |
179 | #MANAGE FILE NAME
180 | if [ $file_option = true ] && [ ! $file_name ]; then
181 | file_name=$(echo $(basename $file_for_filtering))
182 | elif [ $file_option = false ] && [ ! $file_name ]; then
183 | file_name=$terms_for_filtering #First term supplied by -l
184 | else
185 | echo "File name is=" $file_name
186 | fi
187 |
188 | #PROCESS REGULAR EXPRESSION TERMS
189 | if [ $term_option = true ] && [ $file_option = false ]; then
190 |
191 | list_terms_listed=$(for term in "${terms_for_filtering[@]}"; do echo "$term"; done) #process terms into list
192 | final_list_terms_regexp=$(printf "%s|" $list_terms_listed | sed 's/|$//g') #suitable for regexp
193 |
194 | elif [ $term_option = false ] && [ $file_option = true ]; then
195 |
196 | check_mandatory_files.sh $file_for_filtering
197 | if [ ! -s $file_for_filtering ];then
198 | echo -e "ERROR: terms file empty!!"
199 | exit 1
200 | fi
201 |
202 | final_list_terms_regexp=$(printf "%s|" $(cat $file_for_filtering) | sed 's/|$//g')
203 | else
204 |
205 | check_mandatory_files.sh $file_for_filtering
206 | if [ ! -s $file_for_filtering ];then
207 | echo -e "ERROR: terms file empty!!"
208 | exit 1
209 | fi
210 | list_terms_listed=$(for term in "${terms_for_filtering[@]}"; do echo "$term"; done)
211 | list_terms_regexp_term=$(printf "%s|" $list_terms_listed | sed 's/|$//g')
212 | list_terms_regexp_file=$(printf "%s|" $(cat $file_for_filtering) | sed 's/|$//g')
213 | final_list_terms_regexp=$(echo $list_terms_regexp_term"|"$list_terms_regexp_file) #concat all regexp into one
214 | fi
215 |
216 | #AWK SCRIPT THAT FILTER SEQUENCES#
217 | ##################################
218 |
219 | if [ $general_filter = true ]; then
220 |
221 | echo "$(date)"
222 | echo "General filtering terms on file" $(basename $input_file)
223 |
224 | awk '
225 | /'"${final_list_terms_regexp}"'/ {print $0}
226 | ' $input_file \
227 | > $output_dir/$file_name"_term" || error ${LINENO} $(basename $0) "Awk command for fasta filtering in $file_name\"_term\" creation. See $output_dir/logs for more information."
228 |
229 | echo "$(date)"
230 | echo "Done general filtering terms on file" $(basename $input_file)
231 | echo "File with filtered lines can be found in" $output_dir/$file_name"_term"
232 |
233 | else
234 | echo "$(date)"
235 | echo "Filtering terms on file" $(basename $input_file)
236 | seq_number_prev=$(cat $input_file | grep ">" | wc -l)
237 |
238 | awk '
239 | BEGIN {RS=">"}
240 | '"${negative_filter}"'/'"${final_list_terms_regexp}"'/ {print ">"$0}
241 | ' $input_file \
242 | > $output_dir/$file_name"_term.fasta" || error ${LINENO} $(basename $0) "Awk command for fasta filtering in $file_name\"_term.fata\" creation. See $output_dir/logs for more information."
243 |
244 | echo "$(date)"
245 | echo "DONE Filtering terms on file" $(basename $input_file)
246 | seq_number_post=$(cat $output_dir/$file_name"_term.fasta" | grep ">" | wc -l)
247 | echo "File with filtered sequences can be found in" $output_dir/$file_name"_term.fasta"
248 |
249 | echo "Previous number of sequences=" $seq_number_prev
250 | echo "Post number of sequences=" $seq_number_post
251 | echo -e "\n"
252 | fi
253 |
--------------------------------------------------------------------------------
/bin/get_coverage.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | #=============================================================
6 | # HEADER
7 | #=============================================================
8 |
9 | #INSTITUTION:ISCIII
10 | #CENTRE:BU-ISCIII
11 | #AUTHOR: Pedro J. Sola
12 | VERSION=1.0
13 | #CREATED: 20 March 2018
14 | #REVISION:
15 | #DESCRIPTION:Script that uses bedtool to obtain coverage data from a BAMm file
16 | #The default output format is as follows:
17 | #
18 | #chromosome (or entire genome)
19 | #depth of coverage from features in input file
20 | #number of bases on chromosome (or genome) with depth equal to column 2.
21 | #size of chromosome (or entire genome) in base pairs
22 | #fraction of bases on chromosome (or entire genome) with depth equal to column 2.
23 | #
24 | #chr1 0 980 1000 0.98
25 | #chr1 1 20 1000 0.02
26 | #chr2 1 500 500 1
27 | #genome 0 980 1500 0.653333
28 | #genome 1 520 1500 0.346667
29 | #
30 | #-p option is equivalent to -bga BEDGRAPH output
31 | #
32 | #chr1 0 554304 0
33 | #chr1 554304 554309 5
34 | #chr1 554309 554313 6
35 | #chr1 554313 554314 1
36 | #chr1 554314 554315 0
37 | #chr1 554315 554316 6
38 | #chr1 554316 554317 5
39 | #chr1 554317 554318 1
40 | #chr1 554318 554319 2
41 | #chr1 554319 554321 6
42 | #================================================================
43 | # END_OF_HEADER
44 | #================================================================
45 |
46 | #SHORT USAGE RULES
47 | #LONG USAGE FUNCTION
48 | usage() {
49 | cat << EOF
50 |
51 | Get_coverage script uses bedtool to obtain coverage data from a BAMm file
52 |
53 | usage : $0 <-i inputfile(sorted.bam)> [-o ] [-d ] [-s sample_name]
54 | [-g group_name] [-m ] [p] [-v] [-h]
55 |
56 | -i input file in sorted BAM format
57 | -o output directory (optional)
58 | -d database to extract length. Fasta file used to map against
59 | -m max depth reported (default 500)
60 | -p reports genome coverage for all positions in BEDGRAPH format includig 0 positions.
61 | Default option is bedtools genomecov that needs the reference genome
62 | -s sample name
63 | -g group name (optional). If unset, samples will be gathered in NO_GROUP group
64 | -v version
65 | -h display usage message
66 |
67 | example: get_coverage.sh -i ecoli.bam -d database.fasta
68 | get_coverage.sh -i ecoli.bam -p -m 100
69 |
70 | EOF
71 | }
72 |
73 | #================================================================
74 | # OPTION_PROCESSING
75 | #================================================================
76 | #Make sure the script is executed with arguments
77 | if [ $# = 0 ] ; then
78 | usage >&2
79 | exit 1
80 | fi
81 |
82 | # Error handling
83 | error(){
84 | local parent_lineno="$1"
85 | local script="$2"
86 | local message="$3"
87 | local code="${4:-1}"
88 |
89 | RED='\033[0;31m'
90 | NC='\033[0m'
91 |
92 | if [[ -n "$message" ]] ; then
93 | echo -e "\n---------------------------------------\n"
94 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
95 | echo -e "MESSAGE:\n"
96 | echo -e "$message"
97 | echo -e "\n---------------------------------------\n"
98 | else
99 | echo -e "\n---------------------------------------\n"
100 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
101 | echo -e "\n---------------------------------------\n"
102 | fi
103 |
104 | exit "${code}"
105 | }
106 |
107 | #DECLARE FLAGS AND VARIABLES
108 | cwd="$(pwd)"
109 | group="NO_GROUP"
110 | input_file="Input_file"
111 | database="Database"
112 | positional=false
113 | max_coverage=500
114 |
115 | #PARSE VARIABLE ARGUMENTS WITH getops
116 | #common example with letters, for long options check longopts2getopts.sh
117 | options=":i:o:d:s:g:m:n:pvh"
118 | while getopts $options opt; do
119 | case $opt in
120 | i )
121 | input_file=$OPTARG
122 | ;;
123 | o )
124 | output_dir=$OPTARG
125 | ;;
126 | s )
127 | sample=$OPTARG
128 | ;;
129 | g)
130 | group=$OPTARG
131 | ;;
132 | d )
133 | database=$OPTARG
134 | ;;
135 | m )
136 | max_coverage=$OPTARG
137 | ;;
138 | p )
139 | positional=true
140 | ;;
141 | h )
142 | usage
143 | exit 1
144 | ;;
145 | v )
146 | echo $VERSION
147 | exit 1
148 | ;;
149 | \?)
150 | echo "Invalid Option: -$OPTARG" 1>&2
151 | usage
152 | exit 1
153 | ;;
154 | : )
155 | echo "Option -$OPTARG requires an argument." >&2
156 | exit 1
157 | ;;
158 | * )
159 | echo "Unimplemented option: -$OPTARG" >&2;
160 | exit 1
161 | ;;
162 |
163 | esac
164 | done
165 | shift $((OPTIND-1))
166 |
167 | #================================================================
168 | # MAIN_BODY
169 | #================================================================
170 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
171 |
172 | echo -e "\n#Executing" $0 "\n"
173 |
174 | check_mandatory_files.sh $input_file
175 |
176 | check_dependencies.sh bedtools
177 |
178 | if [ ! $output_dir ]; then
179 | output_dir=$(dirname $input_file)
180 | echo "Default output directory is" $output_dir
181 | mkdir -p $output_dir
182 | else
183 | echo "Output directory is" $output_dir
184 | mkdir -p $output_dir
185 | fi
186 |
187 | if [ ! $filename ]; then
188 | filename=$(basename $input_file | cut -d. -f1)
189 | fi
190 |
191 |
192 |
193 | if [ $positional = true ]; then
194 | if [ -f $imageDir/$sample".plasmid.bedgraph" ];then \
195 | echo "Found a bedgraph file for sample" $sample;
196 | echo "Omitting bedgraph step"
197 | else
198 | echo "$(date)"
199 | echo "Obtaining coverage coordinates from sequences"
200 |
201 | bedtools genomecov -ibam $input_file -bga -max $max_coverage > $output_dir/$filename".bedgraph"|| error ${LINENO} $(basename $0) "Bedtools genomecov command failed. See $output_dir/logs for more information."
202 |
203 | echo "$(date)"
204 | echo "DONE obtaining coverage coordinates from sequences"
205 | fi
206 | else
207 |
208 |
209 | check_mandatory_files.sh $database
210 |
211 | if [ -f $database".length" ]; then
212 | echo "Found length file for" $(basename $database)
213 | echo "Omitting length calculation"
214 | else
215 | echo "$(date)"
216 | echo "Creating a length file for" $(basename $database)
217 | calculate_seqlen.sh -r -i $database > $database".length"|| error ${LINENO} $(basename $0) "calculate_seqlen script failed. See $output_dir/logs for more information."
218 | fi
219 |
220 | if [ -f $output_dir/$filename".coverage" ];then \
221 | echo "Found a coverage file for sample" $sample;
222 | echo "Omitting coverage calculation"
223 | else
224 | echo "$(date)"
225 | echo "Calculating coverage for every position that mapped $filename"
226 |
227 | bedtools genomecov -ibam $input_file -g $database".length" > $output_dir/$filename".coverage"|| error ${LINENO} $(basename $0) "Bedtools genomecov command failed. See $output_dir/logs for more information."
228 |
229 | echo "$(date)"
230 | echo "DONE Calculating coverage for every plamid that mapped $sample"
231 | fi
232 | fi
233 |
234 | echo -e "\n"
235 |
--------------------------------------------------------------------------------
/bin/mash_screener.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | #=============================================================
6 | # HEADER
7 | #=============================================================
8 |
9 | #INSTITUTION:ISCIII
10 | #CENTRE:BU-ISCIII
11 | #AUTHOR: Pedro J. Sola
12 | VERSION=1.0
13 | #CREATED: 27 November 2019
14 | #REVISION:
15 |
16 | #DESCRIPTION:Script that screen reads over a database using kmers and estract sequences ids with higher values
17 | #TODO
18 | #================================================================
19 | # END_OF_HEADER
20 | #================================================================
21 |
22 | #SHORT USAGE RULES
23 | #LONG USAGE FUNCTION
24 | usage() {
25 | cat << EOF
26 |
27 | Bowtie_mapper script index a database and map a supplied pair-end sequences
28 |
29 | usage : $0 [-i ] [-o ] <-d database(fasta)> <-s sample_name> <-1 R1> <-2 R2>
30 | [-g group_name] [-f ] [-T ] [-a] [-v] [-h]
31 |
32 | -i input directory (optional)
33 | -o output directory (optional)
34 | -d database to screen (.fasta)
35 | -s sample name
36 | -g group name (optional). If unset, samples will be gathered in NO_GROUP group
37 | -1 reads corresponding to paired-end R1
38 | -2 reads corresponding to paired-end R2
39 | -f threshold identity value to retieve sequence ids with at least this value (default 0.9)
40 | -w use winner takes it all
41 | -T number of threads
42 | -v version
43 | -h display usage message
44 |
45 | example: mash_screener.sh -d database.fasta -s COLI -1 ecoli_1.fastq -2 ecoli_2.fastq
46 |
47 | EOF
48 | }
49 |
50 | #================================================================
51 | # OPTION_PROCESSING
52 | #================================================================
53 | #Make sure the script is executed with arguments
54 | if [ $# = 0 ] ; then
55 | usage >&2
56 | exit 1
57 | fi
58 |
59 | # Error handling
60 | error(){
61 | local parent_lineno="$1"
62 | local script="$2"
63 | local message="$3"
64 | local code="${4:-1}"
65 |
66 | RED='\033[0;31m'
67 | NC='\033[0m'
68 |
69 | if [[ -n "$message" ]] ; then
70 | echo -e "\n---------------------------------------\n"
71 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
72 | echo -e "MESSAGE:\n"
73 | echo -e "$message"
74 | echo -e "\n---------------------------------------\n"
75 | else
76 | echo -e "\n---------------------------------------\n"
77 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
78 | echo -e "\n---------------------------------------\n"
79 | fi
80 |
81 | exit "${code}"
82 | }
83 |
84 | #DECLARE FLAGS AND VARIABLES
85 | threads=1
86 | offrate=1
87 | filter_identity=0.9
88 | cwd="$(pwd)"
89 | w_winner=""
90 | group="NO_GROUP"
91 | database="Database"
92 | R1="R1"
93 | R2="R2"
94 |
95 | #PARSE VARIABLE ARGUMENTS WITH getops
96 | #common example with letters, for long options check longopts2getopts.sh
97 | options=":i:o:s:g:d:1:2:f:T:avwh"
98 | while getopts $options opt; do
99 | case $opt in
100 | i )
101 | input_dir=$OPTARG
102 | ;;
103 | o )
104 | output_dir=$OPTARG
105 | ;;
106 | s )
107 | sample=$OPTARG
108 | ;;
109 | g)
110 | group=$OPTARG
111 | ;;
112 | d )
113 | database=$OPTARG
114 | ;;
115 | 1 )
116 | R1=$OPTARG
117 | ;;
118 | 2 )
119 | R2=$OPTARG
120 | ;;
121 | f )
122 | filter_identity=$OPTARG
123 | ;;
124 | w)
125 | w_winner="-w"
126 | ;;
127 | T )
128 | threads=$OPTARG
129 | ;;
130 |
131 | h )
132 | usage
133 | exit 1
134 | ;;
135 | v )
136 | echo $VERSION
137 | exit 1
138 | ;;
139 | \?)
140 | echo "Invalid Option: -$OPTARG" 1>&2
141 | usage
142 | exit 1
143 | ;;
144 | : )
145 | echo "Option -$OPTARG requires an argument." >&2
146 | exit 1
147 | ;;
148 | * )
149 | echo "Unimplemented option: -$OPTARG" >&2;
150 | exit 1
151 | ;;
152 |
153 | esac
154 | done
155 | shift $((OPTIND-1))
156 |
157 |
158 | #================================================================
159 | # MAIN_BODY
160 | #================================================================
161 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
162 |
163 | echo -e "\n#Executing" $0 "\n"
164 |
165 | check_dependencies.sh bash mash
166 |
167 | check_mandatory_files.sh $database $R1
168 |
169 | if [ ! $sample ]; then
170 | echo "ERROR: please, provide a sample name"
171 | usage
172 | exit 1
173 | fi
174 |
175 | if [ ! $output_dir ]; then
176 | output_dir=$cwd"/$group/$sample/kmer/"
177 | echo "Default output directory is" $output_dir
178 | mkdir -p $output_dir
179 | else
180 | echo "Output directory is" $output_dir
181 | mkdir -p $output_dir
182 | fi
183 |
184 |
185 | ########SKETCH##############
186 | ############################
187 |
188 | if [ -f $output_dir/database.msh ]; then \
189 | echo "Found a sketch ddbb for" $(basename $database);
190 | echo "Omitting sketching"
191 | else
192 | echo "creating sketch of " $(basename $database);
193 | mash sketch -i -k 32 -s 1000 -p $threads -o $output_dir/database $database || error ${LINENO} $(basename $0) "mash screen command failed. See $output_dir/logs for more information"
194 | fi
195 |
196 | ########SCREEN##############
197 | ############################
198 |
199 | if [ -f $output_dir/database.screen.tab ];then \
200 | echo "Found a mash screen file for sample" $sample;
201 | echo "Omitting screening"
202 | else
203 | echo "$(date)"
204 | echo screening $R1
205 |
206 | mash screen $w_winner -p $threads $output_dir/database.msh $R1 > $output_dir/database.screen.tab || error ${LINENO} $(basename $0) "Bowtie2 command failed. See $output_dir/logs for more information"
207 |
208 |
209 | echo "$(date)"
210 | echo -e "DONE Screening $sample of $group Group" "\n"
211 | fi
212 |
213 | ######PARSE_RESULT##########
214 | ############################
215 |
216 | output_mash_id=$output_dir/database.filtered_$filter_identity
217 |
218 | echo "Retrieving sequences matching more than $filter_identity identity"
219 |
220 | cat $output_dir/database.screen.tab | awk '($1 >= '"${filter_identity}"') {print $5}' > $output_mash_id
221 |
222 |
223 | #####FILTER SEQUENCES#######
224 | ############################
225 | if [ $(cat $output_mash_id | wc -l | cut -d " " -f 1) -gt 0 ]
226 | then
227 | filter_fasta.sh -i $database -f $output_mash_id
228 | else
229 | echo "No plasmids have passed the mash identity filter!! Exiting!!"
230 | exit 0
231 | fi
232 |
--------------------------------------------------------------------------------
/bin/ncbi_database_fetcher.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
4 | #or a compound command returns a non-zero status: If errors are not handled by user
5 | set -e
6 | #set -x
7 | #=============================================================
8 | # HEADER
9 | #=============================================================
10 |
11 | #INSTITUTION:ISCIII
12 | #CENTRE:BU-ISCIII
13 | #AUTHOR: Pedro J. Sola
14 | VERSION=1.0
15 | #CREATED: 12 June 2018
16 | #REVISION:
17 | # 22 June 2018: include quite mode that avoid watching the progress
18 | #
19 | #
20 | #DESCRIPTION:Script that extract a database from ncbi database using terms
21 | #AKNOWLEDGE:
22 | # -Multiple arguments in one flag: https://stackoverflow.com/questions/7529856/retrieving-multiple-arguments-for-a-single-option-using-getopts-in-bash
23 | #
24 | #================================================================
25 | # END_OF_HEADER
26 | #================================================================
27 |
28 | #SHORT USAGE RULES
29 | #LONG USAGE FUNCTION
30 | usage() {
31 | cat << EOF
32 |
33 | ncbi_database_fetcher is a script that extract sequences from NCBI by term
34 |
35 | usage : $0 <(-y term1 -y term2 | -y "term1 term2")> [(-n term1 -n term2 | -n "term1 term2")] [-O ][-d (nucleotide|protein)] [-f ] [-o ] [-q] [-v] [-h]
36 |
37 | -y list of key terms separated by space to be INCLUDED in sequences title
38 | -n list of key terms separated by space to be EXCLUDED in sequences title
39 | -O organism to filter
40 | -d database type, default nucleotide
41 | -o output directory (optional). By default the file is placed in cwd
42 | -f file name (optional). By default is the first term used as query
43 | -q quiet
44 | -v version
45 | -h display usage message
46 |
47 | example: ./ncbi_database_fetcher.sh -y plasmid -n unnamed -n partial -O Archaea
48 |
49 | EOF
50 | }
51 |
52 | #================================================================
53 | # OPTION_PROCESSING
54 | #================================================================
55 | #Make sure the script is executed with arguments
56 | if [ $# = 0 ] ; then
57 | usage >&2
58 | exit 1
59 | fi
60 |
61 | #DECLARE FLAGS AND VARIABLES
62 | cwd="$(pwd)"
63 | use_term_and=false
64 | use_term_not=false
65 | use_term_org=false
66 | quiet=false
67 | database_type=nucleotide
68 | #PARSE VARIABLE ARGUMENTS WITH getops
69 |
70 | options=":y:n:o:f:d:O:qvh"
71 | while getopts $options opt; do
72 | case $opt in
73 | o )
74 | output_dir=$OPTARG
75 | ;;
76 | O)
77 | terms_organism+=($OPTARG)
78 | use_term_org=true
79 | ;;
80 | f )
81 | file_name=$OPTARG
82 | ;;
83 | d )
84 | database_type=$OPTARG
85 | ;;
86 | y )
87 | terms_and+=($OPTARG)
88 | use_term_and=true
89 | ;;
90 | n )
91 | terms_not+=($OPTARG)
92 | use_term_not=true
93 | ;;
94 | q )
95 | quiet=true
96 | ;;
97 | h )
98 | usage
99 | exit 1
100 | ;;
101 | v )
102 | echo $VERSION
103 | exit 1
104 | ;;
105 | \?)
106 | echo "Invalid Option: -$OPTARG" 1>&2
107 | usage
108 | exit 1
109 | ;;
110 | : )
111 | echo "Option -$OPTARG requires an argument." >&2
112 | exit 1
113 | ;;
114 | * )
115 | echo "Unimplemented option: -$OPTARG" >&2;
116 | exit 1
117 | ;;
118 |
119 | esac
120 | done
121 | shift $((OPTIND-1))
122 |
123 | #================================================================
124 | # MAIN_BODY
125 | #================================================================
126 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
127 |
128 | echo -e "\n#Executing" $0 "\n"
129 |
130 | if [ $use_term_and = false ]; then
131 | echo "Please, introduce at least one term to include search"
132 | usage
133 | exit 1
134 | fi
135 |
136 | #MANAGE OUTPUT DIRECTORY
137 | if [ ! $output_dir ]; then
138 | output_dir=$cwd
139 | echo "Default output_dir is" $output_dir
140 | mkdir -p $output_dir
141 | else
142 | echo "Output directory is" $output_dir
143 | mkdir -p $output_dir
144 | fi
145 |
146 | #MANAGE FILE NAME
147 |
148 | if [ ! $file_name ]; then
149 |
150 | if [ "${#terms_and[@]}" -gt 1 ]; then
151 | file_name_value_one=$(echo ${terms_and[0]})
152 | file_name_value_two=$(echo ${terms_and[1]})
153 |
154 | file_name=$file_name_value_one"_"$file_name_value_two
155 | echo "Default file name is" $file_name
156 | else
157 | file_name=$terms_and".database"
158 | echo "Default file name is" $file_name
159 | fi
160 | else
161 | echo "File name is" $file_name
162 | fi
163 |
164 |
165 | ##PROCESS REGULAR EXPRESSION TERMS
166 |
167 | list_terms_and=$(for term in "${terms_and[@]}"; do echo "$term"; done)
168 | list_terms_org=$(for organism in "${terms_organism[@]}"; do echo "$organism"; done)
169 |
170 | #echo "${#terms_and[@]}" "NUMBER OF TERMS"
171 |
172 | list_terms_regexp_and=$(printf "%s[Title] AND " $list_terms_and | sed 's/ AND $//g')
173 | list_terms_regexp_organism=$(printf "AND %s[organism] " $list_terms_org | sed 's/ $//g')
174 |
175 | if [ $use_term_not = true ]; then
176 |
177 | list_terms_not=$(for term in "${terms_not[@]}"; do echo "$term"; done)
178 | list_terms_regexp_not=$(printf "NOT %s[Title] " $list_terms_not | sed 's/ $//g')
179 | final_list_terms_regexp=$(echo $list_terms_regexp_and" "$list_terms_regexp_not" "$list_terms_regexp_organism) #concat all regexp into one
180 |
181 | else
182 | final_list_terms_regexp=$(echo $list_terms_regexp_and " "$list_terms_regexp_organism)
183 | fi
184 |
185 | echo $final_list_terms_regexp
186 |
187 | ########EUTILS COMMAND############
188 | ##################################
189 |
190 | echo "$(date)"
191 | echo "Obtaining seuences with terms:" $list_terms_and
192 | echo "But not those terms:" $list_terms_not
193 | if [ $use_term_org = true ]; then
194 | echo "Filtering by organisms:" $list_terms_org
195 | fi
196 | echo ""
197 |
198 | base="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
199 |
200 | ##DETERMINE RETMAX
201 | wget -q -O $output_dir/$file_name".count" $base"esearch.fcgi?db="$database_type"&term=""$final_list_terms_regexp"
202 |
203 | counter=$(cat $output_dir/$file_name".count" | awk '//' | head -n 1 | awk '// {split($0,counter_prev,"");split(counter_prev[1],counter,"")}END{print counter[length(counter)]}')
204 | echo -e "FOUND" $counter "RECORDS\n"
205 |
206 | if [ $counter -eq 0 ]; then
207 | echo "Try different terms"
208 | echo "EXIT"
209 | exit 1
210 | fi
211 |
212 | echo "Retrieving Id"
213 |
214 | ##OBTAIN TOTAL LIST OF ID
215 | wget -q -O $output_dir/$file_name".id" $base"esearch.fcgi?db="$database_type"&term=""$final_list_terms_regexp""&RetMax="$counter
216 |
217 | list_of_id=$(cat $output_dir/$file_name".id"| awk '{split($0,id_prev,"");split(id_prev[1],id,"")}//{print id[length(id)]}')
218 | array_of_id=($list_of_id)
219 |
220 | echo "And sequences"
221 | counter=1
222 |
223 |
224 | ##Checking previous DDBB
225 | if [ -s $output_dir/$file_name".fasta" ]; then
226 | echo -e "\nFound a ddbb with the same name, Removing it\n"
227 | rm $output_dir/$file_name".fasta"
228 | fi
229 |
230 |
231 | ##RETRIEVING FASTA SEQUENCE
232 |
233 | for i in $list_of_id
234 | do
235 | if [ $quiet = false ]; then
236 |
237 | echo $counter"/""${#array_of_id[@]}"
238 | fi
239 |
240 | ((counter++))
241 |
242 | curl -s $base"efetch.fcgi?db="$database_type"&id="$i"&retmode=text&rettype=fasta" >> $output_dir/$file_name".fasta"
243 | done
244 |
245 |
246 | echo "$(date)"
247 | echo "DONE obtaining seuences with terms supplied"
248 |
249 | seq_number_post=$(cat $output_dir/$file_name".fasta" | grep ">" | wc -l)
250 | echo "File with filtered sequences can be found in" $output_dir/$file_name".fasta"
251 | echo "with" $seq_number_post "sequences"
252 |
253 | rm $output_dir/$file_name".count"
254 | rm $output_dir/$file_name".id"
255 |
--------------------------------------------------------------------------------
/bin/process_cluster_output.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
4 | #or a compound command returns a non-zero status: If errors are not handled by user
5 | set -e
6 | #set -x
7 |
8 | #=============================================================
9 | # HEADER
10 | #=============================================================
11 |
12 | #INSTITUTION:ISCIII
13 | #CENTRE:BU-ISCIII
14 | #AUTHOR: Pedro J. Sola
15 | VERSION=1.0
16 | #CREATED: 12 April 2018
17 | #REVISION:
18 | #DESCRIPTION:process_cluster_output script obtain a list of ac from fasta, and estract their coverage value from a coverage file
19 |
20 | #================================================================
21 | # END_OF_HEADER
22 | #================================================================
23 |
24 | #SHORT USAGE RULES
25 | #LONG USAGE FUNCTION
26 | usage() {
27 | cat << EOF
28 |
29 | process_cluster_output script obtain a list of ac from fasta, and estract their coverage value from a coverage file
30 |
31 | usage : $0 <-i inputfile(.fasta)> <-b coverage_file> [-o ] [-c ] [-s ] [-v] [-h]
32 |
33 | -i input file
34 | -b file with coverage info
35 | -o output directory (optional). By default the file is replaced in the same location
36 | -c percentage value to filter >= values. If not supplied, all records will be outputted
37 | -s string to ad at the end of the outputted file (list of accession numbers)
38 | -v version
39 | -h display usage message
40 |
41 | example: process_cluster_output.sh -i ecoli_clustered.fasta_70 -b ecoli.coverage
42 |
43 | EOF
44 | }
45 |
46 | #================================================================
47 | # OPTION_PROCESSING
48 | #================================================================
49 | #Make sure the script is executed with arguments
50 | if [ $# = 0 ] ; then
51 | usage >&2
52 | exit 1
53 | fi
54 |
55 | # Error handling
56 | error(){
57 | local parent_lineno="$1"
58 | local script="$2"
59 | local message="$3"
60 | local code="${4:-1}"
61 |
62 | RED='\033[0;31m'
63 | NC='\033[0m'
64 |
65 | if [[ -n "$message" ]] ; then
66 | echo -e "\n---------------------------------------\n"
67 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
68 | echo -e "MESSAGE:\n"
69 | echo -e "$message"
70 | echo -e "\n---------------------------------------\n"
71 | else
72 | echo -e "\n---------------------------------------\n"
73 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
74 | echo -e "\n---------------------------------------\n"
75 | fi
76 |
77 | exit "${code}"
78 | }
79 |
80 | #DECLARE FLAGS AND VARIABLES
81 | cwd="$(pwd)"
82 | input_file="Input_file"
83 | coverage_cutoff_input=100
84 |
85 | #PARSE VARIABLE ARGUMENTS WITH getops
86 | #common example with letters, for long options check longopts2getopts.sh
87 | options=":i:b:o:c:s:vh"
88 | while getopts $options opt; do
89 | case $opt in
90 | i )
91 | input_file=$OPTARG
92 | ;;
93 | b )
94 | coverage_file=$OPTARG
95 | ;;
96 | o )
97 | output_dir=$OPTARG
98 | ;;
99 | c )
100 | if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 100 ]; then
101 | echo "please, provide a percentage between 0 and 100"
102 | usage
103 | exit 1
104 | else
105 | coverage_cutoff_input=$OPTARG
106 | fi
107 | ;;
108 | s )
109 | suffix=$OPTARG
110 | ;;
111 | h )
112 | usage
113 | exit 1
114 | ;;
115 | v )
116 | echo $VERSION
117 | exit 1
118 | ;;
119 | \?)
120 | echo "Invalid Option: -$OPTARG" 1>&2
121 | usage
122 | exit 1
123 | ;;
124 | : )
125 | echo "Option -$OPTARG requires an argument." >&2
126 | exit 1
127 | ;;
128 | * )
129 | echo "Unimplemented option: -$OPTARG" >&2;
130 | exit 1
131 | ;;
132 |
133 | esac
134 | done
135 | shift $((OPTIND-1))
136 |
137 | #================================================================
138 | # MAIN_BODY
139 | #================================================================
140 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
141 |
142 | echo -e "\n#Executing" $0 "\n"
143 |
144 | check_mandatory_files.sh $input_file
145 |
146 | suffix="_clustered"
147 | coverage_cutoff=$(echo "(1 - ($coverage_cutoff_input/100))" | bc -l)
148 |
149 | if [ ! $output_dir ]; then
150 | output_dir=$(dirname $input_file)
151 | #echo "Default output directory is" $output_dir
152 | mkdir -p $output_dir
153 | else
154 | #echo "Output directory is" $output_dir
155 | mkdir -p $output_dir
156 | fi
157 |
158 |
159 | if [ ! $file_name ]; then
160 | file_name=$(basename $input_file)
161 | coverage_name=$(basename $coverage_file)
162 | fi
163 |
164 | echo "$(date)"
165 | echo "extracting coverage info from clustered sequences in" $file_name
166 |
167 | ac_input_file=$(cat $input_file | grep ">" | awk '{gsub(">","");print $1}')
168 |
169 | for i in $ac_input_file ;do
170 | awk '
171 | /^'"$i"'/
172 | ' $coverage_file
173 | done > $output_dir/$coverage_name$suffix || error ${LINENO} $(basename $0) "Awk command error in $coverage_name$suffix creation. See $output_dir/logs for more information."
174 |
175 |
176 | awk '
177 | {if ($2 == 0 && $5 <= '"${coverage_cutoff}"')
178 | {print $1}}
179 | ' $output_dir/$coverage_name$suffix > $output_dir/$coverage_name$suffix"_ac" || error ${LINENO} $(basename $0) "Awk command error in $coverage_name$suffix\"_ac\" creation. See $output_dir/logs for more information."
180 |
181 |
182 | awk '
183 | {if ($2 == 0 && $5 <= '"${coverage_cutoff}"')
184 | {print $1, ((1 - $5)*100)}
185 | }
186 | ' $output_dir/$coverage_name$suffix > $output_dir/$coverage_name$suffix"_percentage" || error ${LINENO} $(basename $0) "Awk command error in $coverage_name$suffix\"_percentage\" creation. See $output_dir/logs for more information."
187 |
188 | echo "$(date)"
189 | echo "DONE extracting coverage info from clustered sequences in" $file_name
190 | echo -e "Info can be found at" $coverage_name$suffix"_ac and" "\n" $coverage_name$suffix"_percentage" "\n"
191 |
--------------------------------------------------------------------------------
/bin/prokka_annotation.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #set -e
4 |
5 | #=============================================================
6 | # HEADER
7 | #=============================================================
8 |
9 | #INSTITUTION:ISCIII
10 | #CENTRE:BU-ISCIII
11 | #AUTHOR: Pedro J. Sola
12 | VERSION=1.0
13 | #CREATED: 30 April 2018
14 | #REVISION:
15 | #12 June 2018: Handled cleaning process without hard coded paths
16 | #
17 | #DESCRIPTION:Script that uses prokka to annotate a FASTA file
18 | #
19 | #DOCUMENTATION
20 | #
21 | #Prokka outputs the fasta headers as:
22 | # gnl|center|locustag_01
23 | # gnl|center|locustag_02
24 | #
25 | #TO DO:
26 | #Handle cleaning [v]
27 | #
28 | #================================================================
29 | # END_OF_HEADER
30 | #================================================================
31 |
32 | #SHORT USAGE RULES
33 | #LONG USAGE FUNCTION
34 | usage() {
35 | cat << EOF
36 |
37 | Prokka_annotation is a script that uses prokka to annotate a FASTA file
38 |
39 | usage : $0 <-i inputfile(FASTA)> <-p prefix> [-o ] [-k ]
40 | [-T ] [-g group_name][-G genus] [-S species] [-c] [-v] [-h]
41 |
42 | -i input file in FASTA format
43 | -o output directory
44 | -p prefix for sample identification (mandatory) and output file name
45 | -k kingdom (Bacteria by default)
46 | -g group name (optional). If unset, samples will be gathered in NO_GROUP group
47 | -G sample genus in case is known by user
48 | -S sample species in case is known by user
49 | -c clean:remove files other than gff and renamed fasta
50 | -T number of threads
51 | -v version
52 | -h display usage message
53 |
54 |
55 | Output directory is the same as input directory by default
56 |
57 | example: prokka_annotation -i ecoli.fasta -p ECO -T 5
58 |
59 |
60 | EOF
61 | }
62 |
63 |
64 | #================================================================
65 | # OPTION_PROCESSING
66 | #================================================================
67 | #Make sure the script is executed with arguments
68 | if [ $# = 0 ] ; then
69 | usage >&2
70 | exit 1
71 | fi
72 |
73 | # Error handling
74 | error(){
75 | local parent_lineno="$1"
76 | local script="$2"
77 | local message="$3"
78 | local code="${4:-1}"
79 |
80 | RED='\033[0;31m'
81 | NC='\033[0m'
82 |
83 | if [[ -n "$message" ]] ; then
84 | echo -e "\n---------------------------------------\n"
85 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
86 | echo -e "MESSAGE:\n"
87 | echo -e "$message"
88 | echo -e "\n---------------------------------------\n"
89 | else
90 | echo -e "\n---------------------------------------\n"
91 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
92 | echo -e "\n---------------------------------------\n"
93 | fi
94 |
95 | exit "${code}"
96 | }
97 |
98 | #DECLARE FLAGS AND VARIABLES
99 | cwd="$(pwd)"
100 | group="NO_GROUP"
101 | input_file="Input_file"
102 | kingdom="Bacteria"
103 | clean=false
104 | genus=""
105 | species=""
106 | threads=1
107 |
108 | #PARSE VARIABLE ARGUMENTS WITH getops
109 | #common example with letters, for long options check longopts2getopts.sh
110 | options=":i:o:p:k:g:G:S:T:cvh"
111 | while getopts $options opt; do
112 | case $opt in
113 | i )
114 | input_file=$OPTARG
115 | ;;
116 |
117 | o )
118 | output_dir=$OPTARG
119 | ;;
120 | p)
121 | prefix=$OPTARG
122 | file_name=$OPTARG
123 | ;;
124 | k )
125 | kingdom=$OPTARG
126 | ;;
127 | g )
128 | group=$OPTARG
129 | ;;
130 | S )
131 | species=$OPTARG
132 | ;;
133 | G)
134 | genus=$OPTARG
135 | ;;
136 | c )
137 | clean=true
138 | ;;
139 | T)
140 | threads=$OPTARG
141 | ;;
142 |
143 | h )
144 | usage
145 | exit 1
146 | ;;
147 | v )
148 | echo $VERSION
149 | exit 1
150 | ;;
151 | \?)
152 | echo "Invalid Option: -$OPTARG" 1>&2
153 | usage
154 | exit 1
155 | ;;
156 | : )
157 | echo "Option -$OPTARG requires an argument." >&2
158 | exit 1
159 | ;;
160 | * )
161 | echo "Unimplemented option: -$OPTARG" >&2;
162 | exit 1
163 | ;;
164 |
165 | esac
166 | done
167 | shift $((OPTIND-1))
168 |
169 | #================================================================
170 | # MAIN_BODY
171 | #================================================================
172 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
173 |
174 | echo -e "\n#Executing" $0 "\n"
175 |
176 | check_mandatory_files.sh $input_file
177 |
178 | check_dependencies.sh prokka
179 | echo "PREFIX" $prefix
180 |
181 | if [ ! $prefix ]; then
182 | echo "please provide a prefix"
183 | exit 1
184 | fi
185 |
186 | if [ ! $output_dir ]; then
187 | output_dir=$(dirname $input_file)
188 | echo "Default output directory is" $output_dir
189 | mkdir -p $output_dir
190 | else
191 | echo "Output directory is" $output_dir
192 | mkdir -p $output_dir
193 | fi
194 |
195 | if [ ! $file_name ]; then
196 | file_name=$(basename $input_file)
197 | echo "filename is" $file_name
198 | fi
199 |
200 |
201 | ##PROKKA EXECUTION
202 |
203 | echo "$(date)"
204 | echo "Annotating $input_file with prokka"
205 |
206 | prokka --force --outdir $output_dir \
207 | --prefix $prefix \
208 | --addgenes \
209 | --kingdom $kingdom \
210 | --genus $genus \
211 | --species $species \
212 | --usegenus \
213 | --centre BU-ISCIII \
214 | --locustag $prefix \
215 | --addgenes \
216 | --cpus $threads \
217 | $input_file #|| error ${LINENO} $(basename $0) "Prokka command failed. See $output_dir/logs for more information."
218 |
219 | echo "$(date)"
220 | echo "done annotating $input_file with prokka"
221 |
222 | ##CLEAN FILES THAT WILL NOT BE USED IN PLASMIDID
223 |
224 | if [ $clean = true ]; then
225 |
226 | echo "Removing unwanted files"
227 | for i in $(ls $output_dir/$prefix.??? | awk '!/fna|gff|log|err|gb/')
228 | do
229 | rm $i
230 | done
231 | fi
232 |
233 | echo -e "\n"
234 |
--------------------------------------------------------------------------------
/bin/quality_trim.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | #=============================================================
6 | # HEADER
7 | #=============================================================
8 |
9 | #INSTITUTION:ISCIII
10 | #CENTRE:BU-ISCIII
11 | #AUTHOR: Pedro J. Sola
12 | VERSION=1.0
13 | #CREATED: 21 May 2018
14 | #REVISION:
15 | #DESCRIPTION:Script that execute trimmomatic to filter by quality
16 | #
17 | #
18 | #================================================================
19 | # END_OF_HEADER
20 | #================================================================
21 |
22 |
23 | usage() {
24 | cat << EOF
25 |
26 | quality_trim script execute trimmomatic to filter by quality
27 |
28 | usage : $0 <-1 R1 file> <-2 R2 file> [-o ] [-d ] <-s sample_name>
29 | [-a adapter_file] [-g group_name] [-f ] [-l ] [-M ] [-T ][-v] [-h]
30 |
31 | -1 R1 file (mandatory)
32 | -2 R2 file (mandatory)
33 | -d directory where trimmomatic is installed, default: /opt/Trimmomatic/
34 | -a adapters to remove, default: TruSeq3-PE.fa
35 | -o output directory (optional)
36 | -f file name
37 | -l minimus length of trimmed reads (default 40)
38 | -s sample name (mandatory)
39 | -g group name (optional). If unset, samples will be gathered in NO_GROUP group
40 | -M RAM memmory (Gb), default 8
41 | -T threads, default 1
42 | -v version
43 | -h display usage message
44 |
45 | example: ./quality_trim.sh -1 ecoli_R1.fastq.gz -2 ecoli_R2.fastq.gz -s ECO232 -g ENTERO -T 8
46 |
47 | EOF
48 | }
49 |
50 | #================================================================
51 | # OPTION_PROCESSING
52 | #================================================================
53 | #Make sure the script is executed with arguments
54 | if [ $# = 0 ] ; then
55 | usage >&2
56 | exit 1
57 | fi
58 |
59 | # Error handling
60 | error(){
61 | local parent_lineno="$1"
62 | local script="$2"
63 | local message="$3"
64 | local code="${4:-1}"
65 |
66 | RED='\033[0;31m'
67 | NC='\033[0m'
68 |
69 | if [[ -n "$message" ]] ; then
70 | echo -e "\n---------------------------------------\n"
71 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
72 | echo -e "MESSAGE:\n"
73 | echo -e "$message"
74 | echo -e "\n---------------------------------------\n"
75 | else
76 | echo -e "\n---------------------------------------\n"
77 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
78 | echo -e "\n---------------------------------------\n"
79 | fi
80 |
81 | exit "${code}"
82 | }
83 |
84 | #DECLARE FLAGS AND VARIABLES
85 | cwd="$(pwd)"
86 | group="NO_GROUP"
87 | r1_file="R1_file"
88 | r2_file="R2_file"
89 | trimmomatic_directory=/opt/Trimmomatic/
90 | adapter_file="TruSeq3-PE.fa"
91 | minimus_length=40
92 | max_mem=8
93 | threads=1
94 |
95 | #PARSE VARIABLE ARGUMENTS WITH getops
96 | #common example with letters, for long options check longopts2getopts.sh
97 | options=":1:2:o:f:d:a:s:g:l:n:M:T:vh"
98 | while getopts $options opt; do
99 | case $opt in
100 | 1 )
101 | r1_file=$OPTARG
102 | ;;
103 | 2 )
104 | r2_file=$OPTARG
105 | ;;
106 | o )
107 | output_dir=$OPTARG
108 | ;;
109 | f )
110 | file_name=$OPTARG
111 | ;;
112 | s )
113 | sample=$OPTARG
114 | ;;
115 | d)
116 | trimmomatic_directory=$OPTARG
117 | ;;
118 | a)
119 | adapter_file=$OPTARG
120 | ;;
121 | l)
122 | minimus_length=$OPTARG
123 | ;;
124 | g)
125 | group=$OPTARG
126 | ;;
127 | M )
128 | max_mem=$OPTARG
129 | ;;
130 | T )
131 | threads=$OPTARG
132 | ;;
133 | h )
134 | usage
135 | exit 1
136 | ;;
137 | v )
138 | echo $VERSION
139 | exit 1
140 | ;;
141 | \?)
142 | echo "Invalid Option: -$OPTARG" 1>&2
143 | usage
144 | exit 1
145 | ;;
146 | : )
147 | echo "Option -$OPTARG requires an argument." >&2
148 | exit 1
149 | ;;
150 | * )
151 | echo "Unimplemented option: -$OPTARG" >&2;
152 | exit 1
153 | ;;
154 |
155 | esac
156 | done
157 | shift $((OPTIND-1))
158 |
159 |
160 |
161 | #================================================================
162 | # MAIN_BODY
163 | #================================================================
164 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
165 |
166 | echo -e "\n#Executing" $0 "\n"
167 |
168 | check_mandatory_files.sh $r1_file $r2_file
169 |
170 | check_dependencies.sh trimmomatic
171 |
172 | if [ ! $sample ]; then
173 | echo "Please include a sample name"
174 | exit 1
175 | fi
176 |
177 |
178 | if [ ! $output_dir ]; then
179 | output_dir="$group/$sample/trimmed"
180 | echo "Default output directory is" $output_dir
181 | mkdir -p $output_dir
182 | else
183 | echo "Output directory is" $output_dir
184 | mkdir -p $output_dir
185 | fi
186 |
187 | if [ ! $filename ]; then
188 | filename=$sample
189 | fi
190 |
191 |
192 | #trimmomatic_executable=$(find $trimmomatic_directory -type f -name "trimmomatic*.jar" | awk 'NR==1')
193 |
194 | trimmomatic_path=$(whereis trimmomatic | cut -d " " -f 2 | cut -d "/" -f 1,2,3,4,5,6)
195 | trimmomatic_adapter=$(find $trimmomatic_path -type f -name $adapter_file | awk 'NR==1')
196 |
197 | echo "$(date)"
198 | echo "Quality trimming:"
199 | echo "R1 = " $r1_file
200 | echo "R2 = " $r2_file
201 |
202 | trimmomatic PE -threads $threads \
203 | $r1_file \
204 | $r2_file \
205 | $output_dir/$sample"_1_paired.fastq.gz" \
206 | $output_dir/$sample"_1_unpaired.fastq.gz" \
207 | $output_dir/$sample"_2_paired.fastq.gz" \
208 | $output_dir/$sample"_2_unpaired.fastq.gz" \
209 | ILLUMINACLIP:$trimmomatic_adapter:2:30:10 SLIDINGWINDOW:4:20 MINLEN:$minimus_length || error ${LINENO} $(basename $0) "Trimmomatic command failed. See $output_dir/logs for more information."
210 |
211 | echo "$(date)"
212 | echo "DONE quality trimming, file can be fount at:"
213 | echo $output_dir/$sample"_1_paired.fastq.gz"
214 | echo $output_dir/$sample"_1_unpaired.fastq.gz"
215 | echo $output_dir/$sample"_2_paired.fastq.gz"
216 | echo $output_dir/$sample"_2_unpaired.fastq.gz"
217 | echo -e "\n"
--------------------------------------------------------------------------------
/bin/rename_from_fasta.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
4 | #or a compound command returns a non-zero status: If errors are not handled by user
5 | set -e
6 | #set -x
7 |
8 | #=============================================================
9 | # HEADER
10 | #=============================================================
11 |
12 | #INSTITUTION:ISCIII
13 | #CENTRE:BU-ISCIII
14 | #AUTHOR: Pedro J. Sola
15 | VERSION=1.0
16 | #CREATED: 06 June 2018
17 | #REVISION:
18 | #DESCRIPTION:rename_from_fasta script rename any field in a file by either providing two fasta files or a dictionary file
19 |
20 | #================================================================
21 | # END_OF_HEADER
22 | #================================================================
23 |
24 |
25 | usage() {
26 | cat << EOF
27 |
28 | rename_from_fasta script rename any field in a file by either providing two fasta files or a dictionary file
29 |
30 | usage : $0 <-i file_to_rename> [-1 ] [-2 ] [-d ] [-o ] [-f ] [-v] [-h]
31 |
32 | -i input file to rename
33 | -1 original fata file whose names will be finally printed
34 | -2 new fata file whose names will be replaced
35 | -o output directory (optional). By default the file is replaced in the same location
36 | -f output file name (".rename" will be added at the end)
37 | -d dictionary file to be used if fasta files are not supplied
38 | -v version
39 | -h display usage message
40 |
41 | example: process_cluster_output.sh -i ecoli_clustered.fasta_70 -b ecoli.coverage
42 |
43 | EOF
44 | }
45 |
46 | #================================================================
47 | # OPTION_PROCESSING
48 | #================================================================
49 | #Make sure the script is executed with arguments
50 | if [ $# = 0 ] ; then
51 | usage >&2
52 | exit 1
53 | fi
54 |
55 | # Error handling
56 | error(){
57 | local parent_lineno="$1"
58 | local script="$2"
59 | local message="$3"
60 | local code="${4:-1}"
61 |
62 | RED='\033[0;31m'
63 | NC='\033[0m'
64 |
65 | if [[ -n "$message" ]] ; then
66 | echo -e "\n---------------------------------------\n"
67 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
68 | echo -e "MESSAGE:\n"
69 | echo -e "$message"
70 | echo -e "\n---------------------------------------\n"
71 | else
72 | echo -e "\n---------------------------------------\n"
73 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
74 | echo -e "\n---------------------------------------\n"
75 | fi
76 |
77 | exit "${code}"
78 | }
79 |
80 | #DECLARE FLAGS AND VARIABLES
81 | cwd="$(pwd)"
82 | input_file="Input_file"
83 |
84 | #PARSE VARIABLE ARGUMENTS WITH getops
85 | #common example with letters, for long options check longopts2getopts.sh
86 | options=":i:1:2:f:o:d:vh"
87 | while getopts $options opt; do
88 | case $opt in
89 | i )
90 | input_file=$OPTARG
91 | ;;
92 | 1 )
93 | fasta_file_old=$OPTARG
94 | ;;
95 | 2 )
96 | fasta_file_new=$OPTARG
97 | ;;
98 | d )
99 | dictionary_file_new=$OPTARG
100 | ;;
101 | o )
102 | output_dir=$OPTARG
103 | ;;
104 | f )
105 | file_name=$OPTARG
106 | ;;
107 | h )
108 | usage
109 | exit 1
110 | ;;
111 | v )
112 | echo $VERSION
113 | exit 1
114 | ;;
115 | \?)
116 | echo "Invalid Option: -$OPTARG" 1>&2
117 | usage
118 | exit 1
119 | ;;
120 | : )
121 | echo "Option -$OPTARG requires an argument." >&2
122 | exit 1
123 | ;;
124 | * )
125 | echo "Unimplemented option: -$OPTARG" >&2;
126 | exit 1
127 | ;;
128 |
129 | esac
130 | done
131 | shift $((OPTIND-1))
132 |
133 | #================================================================
134 | # MAIN_BODY
135 | #================================================================
136 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
137 |
138 | echo -e "\n#Executing" $0 "\n"
139 |
140 | check_mandatory_files.sh $input_file
141 |
142 | if [ ! $output_dir ]; then
143 | output_dir=$(dirname $input_file)
144 | echo "Default output directory is" $output_dir
145 | mkdir -p $output_dir
146 | else
147 | echo "Output directory is" $output_dir
148 | mkdir -p $output_dir
149 | fi
150 |
151 |
152 | if [ ! $file_name ]; then
153 | file_name=$(basename $input_file | cut -d "." -f1,2)
154 | fi
155 |
156 | fasta_file_old_name=$(basename $fasta_file_old)
157 | fasta_file_new_name=$(basename $fasta_file_new)
158 |
159 | echo "$(date)"
160 | echo "Renaming" $file_name
161 |
162 | cat $fasta_file_old | awk '/>/ {print $1}'| sed 's/>//g' | sed 's/|/-/g' > $output_dir/$fasta_file_old_name".ac"
163 | cat $fasta_file_new | awk '/>/ {print $1}'| sed 's/>//g' | sed 's/|/-/g' > $output_dir/$fasta_file_new_name".ac"
164 | cat $input_file | sed 's/|/-/g' > $output_dir/$file_name".nopipe.tmp"
165 |
166 |
167 | #Paste colums to relate names in a dictionary
168 | awk 'NR==FNR{ac[NR]=$0;next}{print ac[FNR], "\t", $0"\\t" }' $output_dir/$fasta_file_old_name".ac" $output_dir/$fasta_file_new_name".ac" > $output_dir/dictionary.txt || error ${LINENO} $(basename $0) "AWK command failed in dictionary.txt creation. See $output_dir/logs for more information."
169 |
170 | #Rename fields
171 |
172 | #cat $output_dir/dictionary.txt | while read -r line; do word1=$(cut -f1); word2=$(cut -f2); echo "##########word 1="$word1;echo "###########word 2="$word2; sed 's/$word2/$word1/g' $input_file; done > $output_dir/$file_name".renamed"
173 |
174 |
175 | awk 'FNR==NR {dict[$2]=$1"\t"; next} {for (i in dict) gsub(i, dict[i])}1' $output_dir/dictionary.txt $output_dir/$file_name".nopipe.tmp" > $output_dir/$file_name".renamed" || error ${LINENO} $(basename $0) "AWK command failed in $file_name\".renamed\" creation. See $output_dir/logs for more information."
176 |
177 | #awk 'NR==FNR{dict[$2]=$1;next}{$1=dict[$1]}1' $output_dir/dictionary.txt $input_file #> $output_dir/$file_name".renamed"
178 |
179 |
180 | rm $output_dir/$fasta_file_old_name".ac"
181 | rm $output_dir/$fasta_file_new_name".ac"
182 | rm $output_dir/$file_name".nopipe.tmp"
183 | rm $output_dir/dictionary.txt
184 |
185 | echo "$(date)"
186 | echo "DONE renaming" $file_name
187 | echo -e "Renamed file can be found at" $output_dir/$file_name".renamed"
188 |
--------------------------------------------------------------------------------
/bin/sam_to_bam.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
4 | #or a compound command returns a non-zero status: If errors are not handled by user
5 | #set -e
6 | # Treat unset variables and parameters other than the special parameters ‘@’ or ‘*’ as an error when performing parameter expansion.
7 | # An error message will be written to the standard error, and a non-interactive shell will exit
8 | #set -u
9 | #Print everything as if it were executed, after substitution and expansion is applied: Debug|log option
10 | #set -x
11 |
12 | #=============================================================
13 | # HEADER
14 | #=============================================================
15 |
16 | #INSTITUTION:ISCIII
17 | #CENTRE:BU-ISCIII
18 | #AUTHOR: Pedro J. Sola
19 | VERSION=1.0
20 | #CREATED: 19 March 2018
21 | #REVISION:
22 | #DESCRIPTION:Script that convert a supplied SAM file into compressed binary indexed BAM
23 |
24 | #================================================================
25 | # END_OF_HEADER
26 | #================================================================
27 |
28 | #SHORT USAGE RULES
29 | #LONG USAGE FUNCTION
30 | usage() {
31 | cat << EOF
32 |
33 | Sam_to_bam script converts a supplied SAM file into compressed binary indexed BAM
34 |
35 | usage : $0 <-i inputfile(.sam)> [-o ] [-s sample_name] [-g group_name] [-T ] [-v] [-h]
36 |
37 | -i input file
38 | -o output directory (optional). By default the BAM file will replace SAM in the same location
39 | -s sample name
40 | -g group name (optional). If unset, samples will be gathered in NO_GROUP group
41 | -T number of threads
42 | -v version
43 | -h display usage message
44 |
45 | example: sam_to_bam.sh -i ecoli.sam
46 |
47 | EOF
48 | }
49 |
50 | #================================================================
51 | # OPTION_PROCESSING
52 | #================================================================
53 | #Make sure the script is executed with arguments
54 | if [ $? != 0 ] ; then
55 | usage >&2
56 | exit 1
57 | fi
58 |
59 | # Error handling
60 | error(){
61 | local parent_lineno="$1"
62 | local script="$2"
63 | local message="$3"
64 | local code="${4:-1}"
65 |
66 | RED='\033[0;31m'
67 | NC='\033[0m'
68 |
69 | if [[ -n "$message" ]] ; then
70 | echo -e "\n---------------------------------------\n"
71 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
72 | echo -e "MESSAGE:\n"
73 | echo -e "$message"
74 | echo -e "\n---------------------------------------\n"
75 | else
76 | echo -e "\n---------------------------------------\n"
77 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
78 | echo -e "\n---------------------------------------\n"
79 | fi
80 |
81 | exit "${code}"
82 | }
83 |
84 | #DECLARE FLAGS AND VARIABLES
85 | threads=1
86 | cwd="$(pwd)"
87 | group="NO_GROUP"
88 | input_file="Input_file"
89 |
90 | #PARSE VARIABLE ARGUMENTS WITH getops
91 | #common example with letters, for long options check longopts2getopts.sh
92 | options=":i:o:s:g:vh"
93 | while getopts $options opt; do
94 | case $opt in
95 | i )
96 | input_file=$OPTARG
97 | ;;
98 | o )
99 | output_dir=$OPTARG
100 | ;;
101 | s )
102 | sample=$OPTARG
103 | ;;
104 | g)
105 | group=$OPTARG
106 | ;;
107 |
108 | T )
109 | threads=$OPTARG
110 | ;;
111 |
112 | h )
113 | usage
114 | exit 1
115 | ;;
116 | v )
117 | echo $VERSION
118 | exit 1
119 | ;;
120 | \?)
121 | echo "Invalid Option: -$OPTARG" 1>&2
122 | usage
123 | exit 1
124 | ;;
125 | : )
126 | echo "Option -$OPTARG requires an argument." >&2
127 | exit 1
128 | ;;
129 | * )
130 | echo "Unimplemented option: -$OPTARG" >&2;
131 | exit 1
132 | ;;
133 |
134 | esac
135 | done
136 | shift $((OPTIND-1))
137 |
138 |
139 | #================================================================
140 | # MAIN_BODY
141 | #================================================================
142 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
143 |
144 | echo -e "\n#Executing" $0 "\n"
145 |
146 | check_mandatory_files.sh $input_file
147 |
148 | check_dependencies.sh samtools
149 |
150 |
151 | if [ ! $output_dir ]; then
152 | output_dir=$(dirname $input_file)
153 | echo "Default output directory is" $output_dir
154 | mkdir -p $output_dir
155 | else
156 | echo "Output directory is" $output_dir
157 | mkdir -p $output_dir
158 | fi
159 |
160 | if [ ! $sample ]; then
161 | sample=$(basename $input_file | cut -d. -f1)
162 | fi
163 |
164 | ########SAM_TO_BAM##########
165 | ############################
166 |
167 |
168 | if [ -f $output_dir/$sample.sorted.bam -a -f $output_dir/$sample.sorted.bam.bai ];then \
169 | echo "Found a sorted .BAM file for sample" $sample;
170 | echo "Omitting BAM to SAM convertion"
171 | else
172 | echo "$(date)"
173 | echo "Converting SAM to sorted indexed BAM in $sample"
174 |
175 | samtools view \
176 | -Sb $input_file \
177 | -o $output_dir/$sample.bam || error ${LINENO} $(basename $0) "Samtools view command failed. See $output_dir/logs for more information."
178 |
179 |
180 | echo "$(date)"
181 | echo "Sorting BAM file in $sample"
182 |
183 | samtools sort \
184 | -T $output_dir/$sample".sorted.bam" \
185 | -o $output_dir/$sample".sorted.bam" \
186 | $output_dir/$sample.bam || error ${LINENO} $(basename $0) "Samtools sort command failed. See $output_dir/logs for more information."
187 |
188 | echo "$(date)"
189 | echo "Indexing BAM file in $sample"
190 |
191 | samtools index \
192 | $output_dir/$sample".sorted.bam" || error ${LINENO} $(basename $0) "Samtools index command failed. See $output_dir/logs for more information."
193 |
194 |
195 | echo "$(date)"
196 | echo "DONE Converting SAM to sorted indexed BAM in $sample"
197 | fi
198 |
199 | if [ -f $output_dir/$sample.sam ];then \
200 |
201 | echo $sample.sam "removed"
202 | rm $output_dir/$sample.sam
203 |
204 | fi
205 |
206 | if [ -f $output_dir/$sample.bam ];then \
207 |
208 | echo $sample.bam "removed"
209 | rm $output_dir/$sample.bam
210 |
211 | fi
212 |
213 | echo -e "\n"
214 |
--------------------------------------------------------------------------------
/bin/spades_assembly.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 | #set -x
5 |
6 | #=============================================================
7 | # HEADER
8 | #=============================================================
9 |
10 | #INSTITUTION:ISCIII
11 | #CENTRE:BU-ISCIII
12 | #AUTHOR: Pedro J. Sola
13 | VERSION=1.0
14 | #CREATED: 21 May 2018
15 | #REVISION:
16 | #DESCRIPTION:Script that assemble illumina sequences using SPAdes
17 | #
18 | #
19 | #================================================================
20 | # END_OF_HEADER
21 | #================================================================
22 |
23 |
24 | usage() {
25 | cat << EOF
26 |
27 | spades_assembly script that assemble illumina sequences using SPAdes
28 |
29 | usage : $0 <-p R1_paired file> <-P R2_paired file> [-o ]
30 | [-k ][-s sample_name] [-g group_name] [-f ] [-T ] [q] [-c] [-v] [-h]
31 |
32 | -p R1_paired file (mandatory)
33 | -P R2_paired file (mandatory)
34 | -k kmers, supplied as numbers sepparated by number or one flag per number, default: 21,33,55,77,99,127
35 | -o output directory (optional)
36 | -f file name
37 | -s sample name (mandatory)
38 | -g group name (optional). If unset, samples will be gathered in NO_GROUP group
39 | -q quick_mode: look for files in a folder SUPPLIED with "paired" term
40 | -c clean mode: remove unnecesary temporary folders
41 | -T threads, default 1
42 | -v version
43 | -h display usage message
44 |
45 | example: ./spades_assembly.sh -p ecoli_R1_paired.fastq.gz -P ecoli_R2_paired.fastq.gz -c
46 |
47 | EOF
48 | }
49 |
50 |
51 | #================================================================
52 | # OPTION_PROCESSING
53 | #================================================================
54 | #Make sure the script is executed with arguments
55 | if [ $# = 0 ] ; then
56 | usage >&2
57 | exit 1
58 | fi
59 |
60 | # Error handling
61 | error(){
62 | local parent_lineno="$1"
63 | local script="$2"
64 | local message="$3"
65 | local code="${4:-1}"
66 |
67 | RED='\033[0;31m'
68 | NC='\033[0m'
69 |
70 | if [[ -n "$message" ]] ; then
71 | echo -e "\n---------------------------------------\n"
72 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
73 | echo -e "MESSAGE:\n"
74 | echo -e "$message"
75 | echo -e "\n---------------------------------------\n"
76 | else
77 | echo -e "\n---------------------------------------\n"
78 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
79 | echo -e "\n---------------------------------------\n"
80 | fi
81 |
82 | exit "${code}"
83 | }
84 |
85 | #DECLARE FLAGS AND VARIABLES
86 | cwd="$(pwd)"
87 | group="NO_GROUP"
88 | r1_paired_file="R1_paired_file"
89 | r2_paired_file="R2_paired_file"
90 | threads=1
91 | kmer_values_command="21,33,55,77,99,127"
92 | kmer_option=false
93 | quick_mode=false
94 | clean_mode=false
95 |
96 | #PARSE VARIABLE ARGUMENTS WITH getops
97 | #common example with letters, for long options check longopts2getopts.sh
98 | options=":p:P:u:U:o:f:d:a:s:g:k:T:q:cvh"
99 | while getopts $options opt; do
100 | case $opt in
101 | p )
102 | r1_paired_file=$OPTARG
103 | ;;
104 | P )
105 | r2_paired_file=$OPTARG
106 | ;;
107 | o )
108 | output_dir=$OPTARG
109 | ;;
110 | f )
111 | file_name=$OPTARG
112 | ;;
113 | s )
114 | sample=$OPTARG
115 | ;;
116 | k)
117 | kmer_value+=($OPTARG)
118 | kmer_option=true
119 | ;;
120 | q)
121 | directory_reads=$OPTARG
122 | quick_mode=true
123 | ;;
124 | l)
125 | minimus_length=$OPTARG
126 | ;;
127 | g)
128 | group=$OPTARG
129 | ;;
130 | c)
131 | clean_mode=true
132 | ;;
133 | M )
134 | max_mem=$OPTARG
135 | ;;
136 | T )
137 | threads=$OPTARG
138 | ;;
139 | h )
140 | usage
141 | exit 1
142 | ;;
143 | v )
144 | echo $VERSION
145 | exit 1
146 | ;;
147 | \?)
148 | echo "Invalid Option: -$OPTARG" 1>&2
149 | usage
150 | exit 1
151 | ;;
152 | : )
153 | echo "Option -$OPTARG requires an argument." >&2
154 | exit 1
155 | ;;
156 | * )
157 | echo "Unimplemented option: -$OPTARG" >&2;
158 | exit 1
159 | ;;
160 |
161 | esac
162 | done
163 | shift $((OPTIND-1))
164 |
165 |
166 |
167 | #================================================================
168 | # MAIN_BODY
169 | #================================================================
170 | ##CHECK DEPENDENCIES, MANDATORY FIELDS, FOLDERS AND ARGUMENTS
171 |
172 | echo -e "\n#Executing" $0 "\n"
173 |
174 | check_dependencies.sh spades.py
175 |
176 |
177 | if [ ! $directory_reads ]; then
178 | directory_reads=$(dirname $r1_paired_file)
179 | echo "Reads directory is" $directory_reads
180 | else
181 | echo "Reads directory for quick mode is" $directory_reads
182 | sample_dir=$(dirname $directory_reads)
183 | output_dir=$sample_dir"/assembly"
184 | mkdir -p $output_dir
185 | fi
186 |
187 |
188 | if [ ! $output_dir ]; then
189 | sample_dir=$(dirname $directory_reads)
190 | output_dir=$sample_dir"/assembly"
191 | echo "Default output directory is" $output_dir
192 | mkdir -p $output_dir
193 | else
194 | echo "Output directory is" $output_dir
195 | mkdir -p $output_dir
196 | fi
197 |
198 |
199 | if [ $quick_mode = true ]; then
200 | echo "Entering QUICK MODE"
201 | r1_paired_file=$(find $directory_reads -name "*1_paired.fastq.gz" -type f)
202 | r2_paired_file=$(find $directory_reads -name "*2_paired.fastq.gz" -type f)
203 | fi
204 |
205 |
206 | check_mandatory_files.sh $r1_paired_file $r2_paired_file
207 |
208 | if [ $kmer_option = true ]; then
209 | list_kmer_values=$(for value in "${kmer_value[@]}"; do echo "$value"; done)
210 | kmer_values_command=$(printf "%s," $list_kmer_values | sed 's/,$//g')
211 | fi
212 |
213 |
214 | echo "$(date)"
215 | echo "Assembly:"
216 | echo "R1 paired file = " $r1_paired_file
217 | echo "R2 paired file = " $r2_paired_file
218 |
219 |
220 | spades.py \
221 | --careful \
222 | -t $threads \
223 | -k $kmer_values_command \
224 | --pe1-1 $r1_paired_file \
225 | --pe1-2 $r2_paired_file \
226 | -o $output_dir || error ${LINENO} $(basename $0) "Spades command failed. See $output_dir/logs for more information."
227 |
228 |
229 |
230 | echo "$(date)"
231 | echo "DONE. Assembled contigs can be found at $output_dir/contigs.fasta:"
232 | echo "DONE. Assembled scaffolds can be found at $output_dir/scaffolds.fasta:"
233 |
234 | if [ $clean_mode = true ]; then
235 | echo "Removing unnecesary folders"
236 | rm -rf $(find $output_dir -maxdepth 1 -mindepth 1 -type d)
237 | echo "DONE removing unwanted folders"
238 | fi
239 |
240 | echo -e "\n"
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
--------------------------------------------------------------------------------
/config_files/annotation_config_file.txt:
--------------------------------------------------------------------------------
1 | #1. Fasta file for annotation
2 | #2. Name given to this annotation
3 | #3. Alignment %Identity necessary to include the sequence
4 | #4. Alignment %Length neccesary to include the sequence
5 | #5. Query divisor for the sequence name. (ie. For name Inc_NC_632542_protein-description)
6 | #6. Query field to represent (l:left|r:rigth) (ie. with divisor "_", left would be "Inc" and rigth "protein-description")
7 | #7. Unique. Each sequence will be allowed only once per plasmid
8 | #8. Doble Unique. This field uses a provided separator to extract only the best match. (ie within OXA-11 and OXA-48, using "-" as separator will retrieve only one). Use n if not used.
9 | #9. Color. Color used to represent this database (blue, green, grey, orange, purple, red, yellow. vvl,, vl, l, d, vd and vvd stands for very v), light(l) and dark(d))
10 |
11 | #DDBBFILE,NANE,P_IDENTITY,P_ALIGNMENT,Q_DIVISOR,Q_SIDE_LR,IS_UNIQUE,DOBLE_UNIQUE,COLOR,
12 |
13 | #DEFAULTEXAMPLE: Copy and paste next line, change the file name, name of database and color. Remove "#"
14 | #PATH/TO/FILE,NAME,95,90,_,l,n,n,nucl,COLOR
15 |
16 | #ANTIBIOTIC_RESISTANCE_ANNOTATION
17 | databases/ARGannot.pID.fasta,abr,98,90, ,r,y,-,nucl,lred
18 | #REPLISOME_ANNOTATION
19 | databases/plasmidFinder_01_26_2018.fsa,inc,95,80,_,l,y,n,nucl,lyellow
20 |
--------------------------------------------------------------------------------
/config_files/circos_summary_1_3_0.conf:
--------------------------------------------------------------------------------
1 | ######## CIRCOS.CONF
2 | ####################
3 |
4 | karyotype = PLASMID_KARYOTYPE
5 |
6 | chromosome_units = 1000000
7 | chromosomes_display_default = yes
8 | #chromosomes_display_default = no
9 | #chromosomes = /NZ/
10 | chromosomes_color = /./ = lblue
11 | #chromosomes_scale = /./ = 1rn
12 | #chromosomes_scale = eval(var(size)) < 100000 = 0.5r
13 | z=100
14 |
15 |
16 | #############################HIGHLIGHTS
17 |
18 |
19 | <>
20 |
21 |
22 |
23 |
24 |
25 | #############################PLOTS
26 |
27 |
28 | ############### COVERAGE
29 |
30 | type = histogram
31 | file = PLASMID_COVERAGE_GRAPH
32 |
33 | color = black
34 | r1 = 0.99r
35 | r0 = 0.90r
36 | extend_bin = no
37 | min= 0
38 | max= 500
39 | thickness = 2
40 | orientation = out
41 |
42 | #
43 | #show = data
44 | #
45 | #color = vvlgrey
46 | #
47 | #
48 |
49 |
50 |
51 |
52 | thickness = 1
53 | color = lgrey
54 | spacing = 50
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 | condition = var(value) < 20
65 | color = lorange
66 | thickness = 3
67 | flow = continue
68 |
69 |
70 |
71 | condition = var(value) == 0
72 | color = red
73 | thickness = 3
74 | flow = continue
75 |
76 |
77 |
78 | condition = var(value) > 200
79 | color = green
80 | thickness = 3
81 |
82 |
83 |
84 |
85 |
86 |
87 | ############### /COVERAGE
88 |
89 |
90 | ############### TEXT_ADITIONAL_ANNOTATION
91 |
92 | type = text
93 | color = black
94 | label_font = bold
95 | label_size = 10p
96 | file = PLASMID_SPECIFIC_TEXT
97 | r1 = 0.85r+200p
98 | r0 = 0.80r
99 | orientation = center
100 | show_links = no
101 |
102 | margin = 0u
103 | label_parallel = no
104 | padding = 1p
105 | rpadding = 2p
106 | label_snuggle = yes
107 | max_snuggle_distance = 5r
108 | snuggle_sampling = 2
109 | snuggle_tolerance = 1r
110 | snuggle_link_overlap_test = yes
111 | snuggle_link_overlap_tolerance = 20p
112 |
113 |
114 | ############### /TEXT_ADITIONAL_ANNOTATION
115 |
116 | ############### TEXT_CDS_CONTIG
117 |
118 | type = text
119 | color = black
120 | label_font = default
121 | label_size = 9p
122 | file = PLASMID_CDS_CONTIG
123 | r1 = 0.80r
124 | r0 = 0.75r
125 | orientation = center
126 | show_links = yes
127 | label_parallel = no
128 | padding = 0p
129 | label_snuggle = yes
130 | max_snuggle_distance = 6r
131 | snuggle_sampling = 10
132 | snuggle_tolerance = 1r
133 | snuggle_link_overlap_test = yes
134 | snuggle_link_overlap_tolerance = 10p
135 | #snuggle_refine = yes
136 |
137 | #
138 | #
139 | #condition = var(value) =~ /CDS/
140 | #show = no
141 | #flow = continue
142 | #
143 | #
144 |
145 |
146 |
147 |
148 | ############### /TEXT_CDS_CONTIG
149 |
150 | ############### CDS_CONTIGS_PROKKA
151 |
152 | type = tile
153 | file = PLASMID_CDS_CONTIG
154 | r1 = 0.75r
155 | r0 = 0.70r
156 | layers = 3
157 | layers_overflow = collapse
158 | margin = 10u
159 | thickness = 20
160 | padding = 10
161 | orientation = in
162 | stroke_thickness = 1
163 | stroke_color = vdgrey
164 | color = purple
165 | #units_ok = bupr
166 | #units_nounit = n
167 |
168 |
169 | ############### /CDS_CONTIGS_PROKKA
170 |
171 | ############### TEXT_CONTIG
172 |
173 | type = text
174 | #color = black
175 | label_font = bold
176 |
177 | label_size = 10p
178 | file = PLASMID_CONTIGS
179 | r1 = 0.70r
180 | r0 = 0.64r
181 | orientation = out
182 | show_links = yes
183 | label_parallel = yes
184 | padding = 10p
185 | margin = 10p
186 | label_snuggle = yes
187 | max_snuggle_distance = 10r
188 | snuggle_sampling = 10
189 | snuggle_tolerance = 1r
190 | snuggle_link_overlap_test = yes
191 | snuggle_link_overlap_tolerance = 500p
192 | #snuggle_refine = yes
193 |
194 |
195 |
196 |
197 | condition = var(id) =~ /(\d+)(\d+)(\d*)/
198 | color = eval(my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
199 | flow = continue
200 |
201 |
202 |
203 | condition = var(id) =~ /(\d+)(\d+)(\d*)/
204 | link_color = eval(my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
205 | flow = continue
206 |
207 |
208 |
209 | condition = var(size) < 1kb
210 | show = no
211 |
212 |
213 |
214 |
215 | ############### /TEXT_CONTIG
216 |
217 | ############### CONTIGS SPADES ALL
218 |
219 | type = tile
220 | file = PLASMID_CONTIGS
221 | r1 = 0.65r
222 | r0 = 0.6r
223 | layers = 4
224 | margin = 5u
225 | thickness = 20
226 | padding = 5
227 | layers_overflow = collapse
228 | orientation = out
229 | stroke_thickness = 0
230 | stroke_color = grey
231 | color = grey
232 |
233 |
234 |
235 |
236 |
237 | condition = var(id) =~ /(\d+)(\d+)(\d*)/
238 | color = eval( my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
239 | flow = continue
240 |
241 | #importance = 100
242 | #condition = 1
243 | #color = eval(sprintf("spectral-11-div-%d",remap_int(NODE_%d%d,0,10e6,1,11)))
244 | #color = eval((qw(vvvlgrey vvlgrey vlgrey lgrey grey dgrey vdgrey vvdgrey))[var(id) % 8])
245 |
246 |
247 |
248 | condition = var(size) < 1kb
249 | show = no
250 |
251 |
252 |
253 |
254 |
255 | ############### /CONTIGS SPADES ALL
256 |
257 |
258 |
259 |
260 | ######## LINKS
261 | ##############
262 |
263 |
264 |
265 |
266 |
267 | file = PLASMID_LINKS
268 | r1 = 0.50r
269 | r0 = 0r
270 | ribbon = yes
271 | flat = yes
272 | radius = 0.6r
273 | bezier_radius = 0.1r
274 | crest = 0.2
275 | color = lgrey_a4
276 |
277 |
278 |
279 |
280 | condition = var(intrachr)
281 | show = no
282 |
283 |
284 |
285 | importance = 110
286 | condition = var(size1) < 2kb
287 | show = no
288 | flow = continue
289 |
290 |
291 |
292 | importance = 110
293 | condition = var(size2) < 2kb
294 | show = no
295 | flow = continue
296 |
297 |
298 |
299 |
300 | condition = var(id) =~ /(\d+)(\d+)(\d*)/
301 | color = eval( my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
302 | #"paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12)
303 | #"set3-12-qual-%d_a%d"
304 | #"rev(set3-12-qual-%d_a%d)"
305 | flow = continue
306 |
307 |
308 |
309 | condition = 1
310 | z = eval(average(-1*(var(size1),var(size2))))
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 | ######## IDEOGRAM
321 | #################
322 |
323 |
324 | show = yes
325 |
326 |
327 | default = 5000u
328 | #when representing witout scaling
329 | #default = 1000u
330 | break = 500u
331 |
332 |
333 | chromosomes_color = dblue
334 | stroke_color = blue
335 |
336 | radius = 0.93r
337 | thickness = 25p
338 | fill = yes
339 |
340 | show_label = yes
341 |
342 | label_font = bold
343 | label_radius = dims(ideogram,radius_inner)
344 | #(dims(ideogram,radius_inner) + dims(ideogram,radius_outer))/2
345 |
346 | label_size = 17
347 | label_parallel = yes
348 |
349 |
350 |
351 | ######## TICKS
352 | ##############
353 |
354 | show_ticks = yes
355 | show_tick_labels = yes
356 |
357 |
358 | radius = dims(ideogram,radius_outer)
359 | color = black
360 | thickness = 2p
361 |
362 | #multiplier = 0.001
363 |
364 |
365 | #spacing = 1000u
366 | rspacing = 0.025
367 | multiplier = 0.001
368 | spacing_type = relative
369 | skip_first_label = yes
370 | skip_last_label = no
371 | size = 5p
372 | show_label = yes
373 | label_size = 20p
374 | #label_relative = yes
375 | suffix = " kb"
376 | #rdivisor = ideogram
377 | format = %d
378 | rmultiplier = 1
379 |
380 |
381 |
382 |
383 | #
384 | #spacing = 2000u
385 | #size = 15p
386 | #show_label = yes
387 | #label_size = 20p
388 | #labe_offset = 10p
389 | #suffix = " kb"
390 | #format = %d
391 | #
392 |
393 |
394 |
395 | ########COLORS
396 | ##############
397 | <>
398 |
399 |
400 | ########HOUSEKEEPING
401 | ####################
402 | <>
403 | max_points_per_track* = 8000000
404 |
405 | ########IMAGE
406 | #############
407 |
408 | dir = OUTPUTDIR
409 | #dir = conf(configdir)
410 | file = IMAGENAME
411 | png = yes
412 | svg = no
413 | # radius of inscribed circle in image
414 | radius = 1900p
415 | # by default angle=0 is at 3 o'clock position
416 | angle_offset = -90
417 | #angle_orientation = counterclockwise
418 | auto_alpha_colors = yes
419 | auto_alpha_steps = 5
420 |
421 |
422 |
--------------------------------------------------------------------------------
/config_files/circos_summary_1_3_3.conf:
--------------------------------------------------------------------------------
1 | ######## CIRCOS.CONF
2 | ####################
3 |
4 | karyotype = PLASMID_KARYOTYPE
5 |
6 | chromosome_units = 1000000
7 | chromosomes_display_default = yes
8 | #chromosomes_display_default = no
9 | #chromosomes = /NZ/
10 | chromosomes_color = /./ = lblue
11 | #chromosomes_scale = /./ = 1rn
12 | #chromosomes_scale = eval(var(size)) < 100000 = 0.5r
13 | z=100
14 |
15 |
16 | #############################HIGHLIGHTS
17 |
18 |
19 | <>
20 | r1 = 0.90r
21 | r0 = 0.75r
22 |
23 |
24 |
25 |
26 | #############################PLOTS
27 |
28 |
29 | ############### COVERAGE
30 |
31 | type = histogram
32 | file = PLASMID_COVERAGE_GRAPH
33 |
34 | color = black
35 | r1 = 0.99r
36 | r0 = 0.90r
37 | extend_bin = no
38 | min= 0
39 | max= 500
40 | thickness = 2
41 | orientation = out
42 |
43 | #
44 | #show = data
45 | #
46 | #color = vvlgrey
47 | #
48 | #
49 |
50 |
51 |
52 |
53 | thickness = 1
54 | color = lgrey
55 | spacing = 50
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 | condition = var(value) < 20
66 | color = lorange
67 | thickness = 3
68 | flow = continue
69 |
70 |
71 |
72 | condition = var(value) == 0
73 | color = red
74 | thickness = 3
75 | flow = continue
76 |
77 |
78 |
79 | condition = var(value) > 200
80 | color = green
81 | thickness = 3
82 |
83 |
84 |
85 |
86 |
87 |
88 | ############### /COVERAGE
89 |
90 |
91 | ############### TEXT_ADITIONAL_ANNOTATION
92 |
93 | type = text
94 | color = black
95 | label_font = bold
96 | label_size = 10p
97 | file = PLASMID_SPECIFIC_TEXT
98 | r1 = 0.85r+200p
99 | r0 = 0.82r
100 | orientation = center
101 | show_links = no
102 |
103 | margin = 0u
104 | label_parallel = no
105 | padding = 1p
106 | rpadding = 2p
107 | label_snuggle = yes
108 | max_snuggle_distance = 5r
109 | snuggle_sampling = 2
110 | snuggle_tolerance = 1r
111 | snuggle_link_overlap_test = yes
112 | snuggle_link_overlap_tolerance = 20p
113 |
114 |
115 | ############### /TEXT_ADITIONAL_ANNOTATION
116 |
117 | ############### TEXT_CDS_CONTIG
118 |
119 | type = text
120 | color = black
121 | label_font = default
122 | label_size = 9p
123 | file = PLASMID_CDS_CONTIG
124 | r1 = 0.80r
125 | r0 = 0.75r
126 | orientation = center
127 | show_links = yes
128 | link_dims = 8p,8p,10p,8p,8p
129 | link_color = purple
130 | label_parallel = no
131 | padding = 0p
132 | label_snuggle = yes
133 | max_snuggle_distance = 6r
134 | snuggle_sampling = 10
135 | snuggle_tolerance = 1r
136 | snuggle_link_overlap_test = yes
137 | snuggle_link_overlap_tolerance = 10p
138 | #snuggle_refine = yes
139 |
140 | #
141 | #
142 | #condition = var(value) =~ /CDS/
143 | #show = no
144 | #flow = continue
145 | #
146 | #
147 |
148 |
149 |
150 |
151 | ############### /TEXT_CDS_CONTIG
152 |
153 | ############### CDS_CONTIGS_PROKKA
154 |
155 | type = tile
156 | file = PLASMID_CDS_FORWARD
157 | r1 = 0.80r
158 | r0 = 0.75r
159 | layers = 3
160 | layers_overflow = grow
161 | margin = 0.001u
162 | thickness = 20p
163 | padding = 0p
164 | rpadding = 0p
165 | orientation = out
166 | stroke_thickness = 1
167 | stroke_color = dgrey
168 | color = dpurple
169 |
170 |
171 |
172 |
173 | r1 = 0.75r
174 | r0 = 0.75r
175 |
176 |
177 | position = 0.75r
178 | color = dgrey
179 | thickness = 2
180 |
181 |
182 |
183 |
184 |
185 | type = tile
186 | file = PLASMID_CDS_REVERSE
187 | r1 = 0.75r
188 | r0 = 0.70r
189 | layers = 3
190 | layers_overflow = grow
191 | margin = 0.001u
192 | thickness = 20p
193 | padding = 0p
194 | rpadding = 0p
195 | orientation = in
196 | stroke_thickness = 1
197 | stroke_color = dgrey
198 | color = lpurple
199 |
200 |
201 | ############### /CDS_CONTIGS_PROKKA
202 |
203 | ############### TEXT_CONTIG
204 |
205 | type = text
206 | #color = black
207 | label_font = bold
208 |
209 | label_size = 10p
210 | file = PLASMID_CONTIGS
211 | r1 = 0.70r
212 | r0 = 0.64r
213 | orientation = out
214 | show_links = yes
215 | label_parallel = yes
216 | padding = 10p
217 | margin = 10p
218 | label_snuggle = yes
219 | max_snuggle_distance = 10r
220 | snuggle_sampling = 10
221 | snuggle_tolerance = 1r
222 | snuggle_link_overlap_test = yes
223 | snuggle_link_overlap_tolerance = 500p
224 | #snuggle_refine = yes
225 |
226 |
227 |
228 |
229 | condition = var(id) =~ /(\d+)(\d+)(\d*)/
230 | color = eval(my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
231 | flow = continue
232 |
233 |
234 |
235 | condition = var(id) =~ /(\d+)(\d+)(\d*)/
236 | link_color = eval(my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
237 | flow = continue
238 |
239 |
240 |
241 | condition = var(size) < 1kb
242 | show = no
243 |
244 |
245 |
246 |
247 | ############### /TEXT_CONTIG
248 |
249 | ############### CONTIGS SPADES ALL
250 |
251 | type = tile
252 | file = PLASMID_CONTIGS
253 | r1 = 0.65r
254 | r0 = 0.60r
255 | layers = 4
256 | margin = 5u
257 | thickness = 20
258 | padding = 5
259 | layers_overflow = collapse
260 | orientation = out
261 | stroke_thickness = 0
262 | stroke_color = grey
263 | color = grey
264 |
265 |
266 |
267 |
268 |
269 | condition = var(id) =~ /(\d+)(\d+)(\d*)/
270 | color = eval( my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
271 | flow = continue
272 |
273 | #importance = 100
274 | #condition = 1
275 | #color = eval(sprintf("spectral-11-div-%d",remap_int(NODE_%d%d,0,10e6,1,11)))
276 | #color = eval((qw(vvvlgrey vvlgrey vlgrey lgrey grey dgrey vdgrey vvdgrey))[var(id) % 8])
277 |
278 |
279 |
280 | condition = var(size) < 1kb
281 | show = no
282 |
283 |
284 |
285 |
286 |
287 | ############### /CONTIGS SPADES ALL
288 |
289 |
290 |
291 |
292 | ######## LINKS
293 | ##############
294 |
295 |
296 |
297 |
298 |
299 | file = PLASMID_LINKS
300 | r1 = 0.50r
301 | r0 = 0r
302 | ribbon = yes
303 | flat = yes
304 | radius = 0.6r
305 | bezier_radius = 0.1r
306 | crest = 0.2
307 | color = lgrey_a4
308 |
309 |
310 |
311 |
312 | condition = var(intrachr)
313 | show = no
314 |
315 |
316 |
317 | importance = 110
318 | condition = var(size1) < 2kb
319 | show = no
320 | flow = continue
321 |
322 |
323 |
324 | importance = 110
325 | condition = var(size2) < 2kb
326 | show = no
327 | flow = continue
328 |
329 |
330 |
331 |
332 | condition = var(id) =~ /(\d+)(\d+)(\d*)/
333 | color = eval( my @match = "var(id)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
334 | #"paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12)
335 | #"set3-12-qual-%d_a%d"
336 | #"rev(set3-12-qual-%d_a%d)"
337 | flow = continue
338 |
339 |
340 |
341 | condition = 1
342 | z = eval(average(-1*(var(size1),var(size2))))
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 | ######## IDEOGRAM
353 | #################
354 |
355 |
356 | show = yes
357 |
358 |
359 | default = 5000u
360 | #when representing witout scaling
361 | #default = 1000u
362 | break = 500u
363 |
364 |
365 | chromosomes_color = dblue
366 | stroke_color = blue
367 |
368 | radius = 0.93r
369 | thickness = 25p
370 | fill = yes
371 |
372 | show_label = yes
373 |
374 | label_font = bold
375 | label_radius = dims(ideogram,radius_inner)
376 | #(dims(ideogram,radius_inner) + dims(ideogram,radius_outer))/2
377 |
378 | label_size = 17
379 | label_parallel = yes
380 |
381 |
382 |
383 | ######## TICKS
384 | ##############
385 |
386 | show_ticks = yes
387 | show_tick_labels = yes
388 |
389 |
390 | radius = dims(ideogram,radius_outer)
391 | color = black
392 | thickness = 2p
393 |
394 | #multiplier = 0.001
395 |
396 |
397 | #spacing = 1000u
398 | rspacing = 0.025
399 | multiplier = 0.001
400 | spacing_type = relative
401 | skip_first_label = yes
402 | skip_last_label = no
403 | size = 5p
404 | show_label = yes
405 | label_size = 20p
406 | #label_relative = yes
407 | suffix = " kb"
408 | #rdivisor = ideogram
409 | format = %d
410 | rmultiplier = 1
411 |
412 |
413 |
414 |
415 | #
416 | #spacing = 2000u
417 | #size = 15p
418 | #show_label = yes
419 | #label_size = 20p
420 | #labe_offset = 10p
421 | #suffix = " kb"
422 | #format = %d
423 | #
424 |
425 |
426 |
427 | ########COLORS
428 | ##############
429 | <>
430 |
431 |
432 | ########HOUSEKEEPING
433 | ####################
434 | <>
435 | max_points_per_track* = 8000000
436 | max_ideograms*=1000
437 | ########IMAGE
438 | #############
439 |
440 | dir = OUTPUTDIR
441 | #dir = conf(configdir)
442 | file = IMAGENAME
443 | png = yes
444 | svg = no
445 | # radius of inscribed circle in image
446 | radius = 1900p
447 | # by default angle=0 is at 3 o'clock position
448 | angle_offset = -90
449 | #angle_orientation = counterclockwise
450 | auto_alpha_colors = yes
451 | auto_alpha_steps = 5
452 |
453 |
454 |
--------------------------------------------------------------------------------
/config_files/simple.conf:
--------------------------------------------------------------------------------
1 | ######## CIRCOS.CONF
2 | ####################
3 |
4 | karyotype = PLASMID_KARYOTYPE
5 |
6 | chromosome_units = 1000000
7 | chromosomes_display_default = no
8 | chromosomes = SAMPLE_SHOWN
9 | chromosomes_color = /./ = lblue
10 | z=100
11 |
12 | #
13 | #
14 | #chr = NZ_CP018342.1
15 | #start = 30000u
16 | #end = 52000u
17 | #scale = 15
18 |
19 | #smooth_distance = 10r
20 | #smooth_steps = 5
21 |
22 | #
23 | #
24 |
25 | #############################HIGHLIGHTS
26 |
27 |
28 | <>
29 |
30 |
31 |
32 | ########################################PLOTS
33 |
34 |
35 | ############### COVERAGE
36 |
37 | type = histogram
38 | file = PLASMID_COVERAGE_GRAPH
39 |
40 | color = black
41 | r1 = 0.99r
42 | r0 = 0.90r
43 | extend_bin = no
44 | min= 0
45 | max= 500
46 | thickness = 2
47 | orientation = out
48 |
49 |
50 |
51 |
52 | thickness = 1
53 | color = lgrey
54 | spacing = 50
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 | condition = var(value) < 20
63 | color = lorange
64 | thickness = 3
65 | flow = continue
66 |
67 |
68 |
69 | condition = var(value) == 0
70 | color = red
71 | thickness = 3
72 | flow = continue
73 |
74 |
75 |
76 | condition = var(value) > 200
77 | color = green
78 | thickness = 3
79 |
80 |
81 |
82 |
83 |
84 | ############### /COVERAGE
85 |
86 | ############### TEXT_ADITIONAL_ANNOTATION
87 |
88 | type = text
89 | color = black
90 | label_font = bold
91 | label_size = 30p
92 | file = PLASMID_SPECIFIC_TEXT
93 | r1 = 0.85r+200p
94 | r0 = 0.74r
95 | orientation = center
96 | show_links = no
97 |
98 | margin = 0u
99 | label_parallel = no
100 | padding = 1p
101 | rpadding = 2p
102 | label_snuggle = yes
103 | max_snuggle_distance = 5r
104 | snuggle_sampling = 2
105 | snuggle_tolerance = 1r
106 | snuggle_link_overlap_test = yes
107 | snuggle_link_overlap_tolerance = 20p
108 |
109 |
110 | ############### /TEXT_ADITIONAL_ANNOTATION
111 |
112 |
113 | ############### TEXT_CDS_CONTIG
114 |
115 |
116 | type = text
117 | color = black
118 | label_font = default
119 | label_size = 42p
120 | file = PLASMID_CDS_CONTIG
121 | r1 = 0.70r+200p
122 | r0 = 0.70r
123 | orientation = center
124 | show_links = yes
125 | link_dims = 8p,8p,30p,8p,8p
126 | link_color = grey
127 |
128 | label_parallel = no
129 | padding = 0p
130 | label_snuggle = yes
131 | max_snuggle_distance = 6r
132 | snuggle_sampling = 10
133 | snuggle_tolerance = 1r
134 | snuggle_link_overlap_test = yes
135 | snuggle_link_overlap_tolerance = 10p
136 |
137 | #
138 | #
139 | #condition = var(value) eq "cds"
140 | #label_size = 7p
141 | #show = no
142 | #flow = continue
143 | #
144 | #
145 |
146 |
147 | ############### /TEXT_CDS_CONTIG
148 |
149 | ############### CDS_CONTIGS_PROKKA
150 |
151 | type = tile
152 | file = PLASMID_CDS_FORWARD
153 | r1 = 0.73r
154 | r0 = 0.70r
155 | layers = 2
156 | layers_overflow = grow
157 | margin = 0.001u
158 | thickness = 30p
159 | padding = 0p
160 | rpadding = 0p
161 | orientation = out
162 | stroke_thickness = 1
163 | stroke_color = vvdgrey
164 | color = dgrey
165 |
166 |
167 |
168 | r1 = 0.70r
169 | r0 = 0.70r
170 |
171 |
172 | position = 0.70r
173 | color = dgrey
174 | thickness = 2
175 |
176 |
177 |
178 |
179 |
180 | type = tile
181 | file = PLASMID_CDS_REVERSE
182 | r1 = 0.70r
183 | r0 = 0.67r
184 | layers = 2
185 | layers_overflow = grow
186 | margin = 0.001u
187 | thickness = 30p
188 | padding = 0p
189 | rpadding = 0p
190 | orientation = in
191 | stroke_thickness = 1
192 | stroke_color = dgrey
193 | color = lgrey
194 |
195 |
196 | ############### /CDS_CONTIGS_PROKKA
197 |
198 |
199 | ############### TEXT_CONTIG
200 |
201 | type = text
202 | label_font = bold
203 | label_size = 20p
204 | file = PLASMID_CONTIGS
205 | r1 = 0.60r+100p
206 | r0 = 0.60r
207 | orientation = out
208 | show_links = yes
209 | label_parallel = yes
210 | padding = 5p
211 | rpadding = 2p
212 | margin = 15p
213 | label_snuggle = yes
214 | max_snuggle_distance = 10r
215 | snuggle_sampling = 10
216 | snuggle_tolerance = 5r
217 | snuggle_link_overlap_test = yes
218 | snuggle_link_overlap_tolerance = 3p
219 |
220 |
221 |
222 |
223 | condition = var(value) =~ /(\d+)(\d+)(\d*)/
224 | color = eval(my @match = "var(value)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
225 | flow = continue
226 |
227 |
228 |
229 | condition = var(value) =~ /(\d+)(\d+)(\d*)/
230 | link_color = eval(my @match = "var(value)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
231 | flow = continue
232 |
233 |
234 |
235 | condition = var(size) < 0.2kb
236 | show = no
237 |
238 |
239 |
240 |
241 | ############### /TEXT_CONTIG
242 |
243 | ############### CONTIGS SPADES ALL
244 |
245 | type = tile
246 | file = PLASMID_CONTIGS
247 | r1 = 0.60r
248 | r0 = 0.50r
249 | layers = 5
250 | margin = 5u
251 | thickness = 40
252 | padding = 5
253 | layers_overflow = collapse
254 | orientation = in
255 | stroke_thickness = 0
256 | stroke_color = grey
257 | color = grey
258 |
259 |
260 |
261 |
262 | condition = var(value) =~ /(\d+)(\d+)(\d*)/
263 | color = eval( my @match = "var(value)" =~ /(\d+)(\d+)(\d*)/; sprintf("paired-12-qual-%d_a%d", remap_int($match[0],1,9,1,12),remap_int($match[1],1,9,4,2 )))
264 | flow = continue
265 |
266 |
267 |
268 |
269 | condition = var(size) < 0.2kb
270 | show = no
271 |
272 |
273 |
274 |
275 |
276 | ############### /CONTIGS SPADES ALL
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 | ######## IDEOGRAM
285 | #################
286 |
287 | z=5000
288 | show = yes
289 |
290 |
291 | default = 10u
292 | #when representing witout scaling
293 | #default = 1000u
294 | break = 10u
295 |
296 |
297 | chromosomes_color = dblue
298 | stroke_color = blue
299 |
300 | radius = 0.93r
301 | thickness = 30p
302 | fill = yes
303 |
304 | show_label = yes
305 | label_color = dgrey
306 | label_center = yes
307 | label_font = bold
308 | label_radius = 0.1r
309 | #label_radius = dims(ideogram,radius_inner)
310 | #(dims(ideogram,radius_inner) + dims(ideogram,radius_outer))/2
311 |
312 | label_size = 50
313 | label_parallel = yes
314 |
315 |
316 |
317 | ######## TICKS
318 | ##############
319 |
320 | show_ticks = yes
321 | show_tick_labels = yes
322 |
323 |
324 | radius = dims(ideogram,radius_outer)
325 | color = black
326 | thickness = 2p
327 | labe_offset = 0p
328 |
329 | #multiplier = 0.001
330 |
331 |
332 | #spacing = 1000u
333 | rspacing = 0.025
334 | multiplier = 0.001
335 | spacing_type = relative
336 | skip_first_label = yes
337 | skip_last_label = no
338 | size = 5p
339 | show_label = yes
340 | label_size = 20p
341 | #label_relative = yes
342 | suffix = " kb"
343 | #rdivisor = ideogram
344 | format = %d
345 | rmultiplier = 1
346 |
347 |
348 |
349 | #
350 | #spacing = 2000u
351 | #multiplier = 0.001
352 | #size = 5p
353 | #show_label = yes
354 | #skip_first_label = yes
355 | #label_size = 15p
356 | #labe_offset = 0p
357 | #suffix = " kb"
358 | #format = %d
359 | #
360 |
361 |
362 |
363 | ########COLORS
364 | ##############
365 | <>
366 |
367 |
368 | ########HOUSEKEEPING
369 | ####################
370 | <>
371 | max_points_per_track* = 8000000
372 | max_ideograms*=1000
373 |
374 | ########IMAGE
375 | #############
376 |
377 | dir = OUTPUTDIR
378 | #dir = conf(configdir)
379 | file = IMAGENAME_SAMPLE_PLASMID
380 | png = yes
381 | svg = no
382 | # radius of inscribed circle in image
383 | radius = 1900p
384 | # by default angle=0 is at 3 o'clock position
385 | angle_offset = -90
386 | #angle_orientation = counterclockwise
387 | auto_alpha_colors = yes
388 | auto_alpha_steps = 5
389 |
390 |
--------------------------------------------------------------------------------
/documents/ECCMID plasmidID 2018.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/documents/ECCMID plasmidID 2018.pdf
--------------------------------------------------------------------------------
/documents/Istall_dependencies.md:
--------------------------------------------------------------------------------
1 | # Trimmomatic
2 | - wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.38.zip
3 | - unzip Trimmomatic-0.38.zip
4 | - copy to /opt/Trimmomatic or use trimmomatic-dir PATH/TO/Trimmomatic-0.38
5 |
6 | # SPAdes
7 |
8 | - wget http://cab.spbu.ru/files/release3.12.0/SPAdes-3.12.0-Linux.tar.gz
9 | - tar -xzf SPAdes-3.12.0-Linux.tar.gz
10 | - Add to PATH SPAdes-3.12.0-Linux/bin/
11 |
12 | # Blast+
13 |
14 | - sudo apt-get install ncbi-blast+
15 |
16 | # Bowtie2
17 |
18 | - sudo apt install bowtie2
19 |
20 | # Cd-hit-est
21 |
22 | - sudo apt-get install cd-hit
23 |
24 | # Bedtools
25 |
26 | - sudo apt install bedtools
27 |
28 | # Prokka
29 |
30 | - sudo apt-get install libdatetime-perl libxml-simple-perl libdigest-md5-perl git default-jre bioperl
31 | - sudo cpan Bio::Perl
32 | - git clone https://github.com/tseemann/prokka.git $HOME/prokka
33 | - $HOME/prokka/bin/prokka --setupdb
34 | - Add $HOME/prokka/bin/ to PATH
35 |
36 | # Circos
37 |
38 |
39 | - wget http://www.circos.ca/distribution/circos-0.69-6.tgz
40 | - tar xvfz circos-0.69-6.tgz
41 | - sudo apt-get -y install libgd2-xpm-dev
42 | - Add circos-0.69-6.tgz/bin to PATH
43 | - sudo sed -i 's/max_points_per_track = 25000/max_points_per_track = 20000000/g' /opt/circos-0.69-6/etc/housekeeping.conf
44 |
45 |
46 |
47 |
48 |
49 |
50 | ##g++
51 | - sudo apt-get install build-essential
52 | ##libz.h
53 | - sudo apt-get install libz-dev
54 | ##circos dependencies
55 | - sudo apt install circos
56 |
--------------------------------------------------------------------------------
/documents/PlasmidID_IWBBIO.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/documents/PlasmidID_IWBBIO.pdf
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: plasmidID
2 | channels:
3 | - conda-forge
4 | - bioconda
5 | - defaults
6 | dependencies:
7 | - python>=3.6
8 | - bioconda::perl-gd>=2.71
9 | - bioconda::bowtie2
10 | - bioconda::bedtools
11 | - bioconda::samtools
12 | - bioconda::mash>=2
13 | - bioconda::circos
14 | - bioconda::prokka>=1.14
15 | - bioconda::blast
16 | - bioconda::spades
17 | - bioconda::trimmomatic
18 | - tbb==2020.2
19 | - conda-forge::gawk
20 | - conda-forge::biopython
21 | - conda-forge::numpy
22 | - conda-forge::pandas
23 | - conda-forge::scikit-learn
24 | - conda-forge::scipy
25 | - conda-forge::tabulate
26 | - conda-forge::wget
27 | - conda-forge::bc
28 |
--------------------------------------------------------------------------------
/img/01_plasmid_track.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/01_plasmid_track.png
--------------------------------------------------------------------------------
/img/02_mapping_track.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/02_mapping_track.png
--------------------------------------------------------------------------------
/img/03_annotation_track.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/03_annotation_track.png
--------------------------------------------------------------------------------
/img/04_contig_track.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/04_contig_track.png
--------------------------------------------------------------------------------
/img/05_01_complete_contig_track.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/05_01_complete_contig_track.png
--------------------------------------------------------------------------------
/img/05_complete_contig_track.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/05_complete_contig_track.png
--------------------------------------------------------------------------------
/img/Alignment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Alignment.png
--------------------------------------------------------------------------------
/img/Annotation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Annotation.png
--------------------------------------------------------------------------------
/img/Clustering_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Clustering_2.png
--------------------------------------------------------------------------------
/img/Mapping.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Mapping.png
--------------------------------------------------------------------------------
/img/Overlap_examples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Overlap_examples.png
--------------------------------------------------------------------------------
/img/PIPELNE TFM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/PIPELNE TFM.png
--------------------------------------------------------------------------------
/img/SEN30_000195995_K00826.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_K00826.1.png
--------------------------------------------------------------------------------
/img/SEN30_000195995_NC_002305.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NC_002305.1.png
--------------------------------------------------------------------------------
/img/SEN30_000195995_NC_003384.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NC_003384.1.png
--------------------------------------------------------------------------------
/img/SEN30_000195995_NC_003385.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NC_003385.1.png
--------------------------------------------------------------------------------
/img/SEN30_000195995_NC_009981.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NC_009981.1.png
--------------------------------------------------------------------------------
/img/SEN30_000195995_NC_013365.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NC_013365.1.png
--------------------------------------------------------------------------------
/img/SEN30_000195995_NZ_LT883154.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NZ_LT883154.1.png
--------------------------------------------------------------------------------
/img/SEN30_000195995_NZ_LT904853.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NZ_LT904853.1.png
--------------------------------------------------------------------------------
/img/SEN30_000195995_NZ_LT904874.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NZ_LT904874.1.png
--------------------------------------------------------------------------------
/img/SEN30_000195995_NZ_LT904880.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NZ_LT904880.1.png
--------------------------------------------------------------------------------
/img/SEN30_000195995_NZ_LT904895.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN30_000195995_NZ_LT904895.1.png
--------------------------------------------------------------------------------
/img/SEN_summary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN_summary.png
--------------------------------------------------------------------------------
/img/SEN_summary_numbers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/SEN_summary_numbers.png
--------------------------------------------------------------------------------
/img/Short_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Short_pipeline.png
--------------------------------------------------------------------------------
/img/Visualization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/Visualization.png
--------------------------------------------------------------------------------
/img/isciii_logo.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/isciii_logo.jpeg
--------------------------------------------------------------------------------
/img/pipeline_pID.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/pipeline_pID.png
--------------------------------------------------------------------------------
/img/plasmidID_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/plasmidID_logo.png
--------------------------------------------------------------------------------
/img/summary_image_1_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/summary_image_1_3.png
--------------------------------------------------------------------------------
/img/summary_image_2_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/summary_image_2_3.png
--------------------------------------------------------------------------------
/img/summary_image_3_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/img/summary_image_3_3.png
--------------------------------------------------------------------------------
/test/KPN_TEST_R1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/test/KPN_TEST_R1.fastq.gz
--------------------------------------------------------------------------------
/test/KPN_TEST_R2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BU-ISCIII/plasmidID/a07517f1fb20bdab3336d237c486537925374086/test/KPN_TEST_R2.fastq.gz
--------------------------------------------------------------------------------
/test/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit immediately if a pipeline, which may consist of a single simple command, a list,
4 | #or a compound command returns a non-zero status: If errors are not handled by user
5 | set -e
6 | # Treat unset variables and parameters other than the special parameters ‘@’ or ‘*’ as an error when performing parameter expansion.
7 |
8 | #Print everything as if it were executed, after substitution and expansion is applied: Debug|log option
9 | #set -x
10 |
11 | #=============================================================
12 | # HEADER
13 | #=============================================================
14 |
15 | #INSTITUTION:ISCIII
16 | #CENTRE:BU-ISCIII
17 | #AUTHOR: Pedro J. Sola (pedroscampoy@gmail.com)
18 | VERSION=1.6.3
19 | #CREATED: 15 March 2018
20 | #
21 | #ACKNOLEDGE: longops2getops.sh: https://gist.github.com/adamhotep/895cebf290e95e613c006afbffef09d7
22 | #
23 | #DESCRIPTION: test.sh uses test data for testing plasmidID installation.
24 | #
25 | #
26 | #================================================================
27 | # END_OF_HEADER
28 | #================================================================
29 |
30 | #SHORT USAGE RULES
31 | #LONG USAGE FUNCTION
32 | usage() {
33 | cat << EOF
34 |
35 | plasmidID is a computational pipeline tha reconstruct and annotate the most likely plasmids present in one sample
36 |
37 | usage : $0
38 |
39 | -v | --version version
40 | -h | --help display usage message
41 |
42 | example: ./test.sh
43 |
44 | EOF
45 | }
46 |
47 | #================================================================
48 | # OPTION_PROCESSING
49 | #================================================================
50 | # Error handling
51 | error(){
52 | local parent_lineno="$1"
53 | local script="$2"
54 | local message="$3"
55 | local code="${4:-1}"
56 |
57 | RED='\033[0;31m'
58 | NC='\033[0m'
59 |
60 | if [[ -n "$message" ]] ; then
61 | echo -e "\n---------------------------------------\n"
62 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
63 | echo -e "MESSAGE:\n"
64 | echo -e "$message"
65 | echo -e "\n---------------------------------------\n"
66 | else
67 | echo -e "\n---------------------------------------\n"
68 | echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
69 | echo -e "\n---------------------------------------\n"
70 | fi
71 |
72 | exit "${code}"
73 | }
74 |
75 | # translate long options to short
76 | reset=true
77 | for arg in "$@"
78 | do
79 | if [ -n "$reset" ]; then
80 | unset reset
81 | set -- # this resets the "$@" array so we can rebuild it
82 | fi
83 | case "$arg" in
84 | --help) set -- "$@" -h ;;
85 | --version) set -- "$@" -v ;;
86 | # pass through anything else
87 | *) set -- "$@" "$arg" ;;
88 | esac
89 | done
90 |
91 | #DECLARE FLAGS AND VARIABLES
92 | script_dir=$(dirname $(readlink -f $0))
93 | R1=KPN_TEST_R1.fastq.gz
94 | R2=KPN_TEST_R2.fastq.gz
95 | database=plasmids_TEST_database.fasta
96 | contigs=contigs_KPN_TEST.fasta
97 |
98 | #PARSE VARIABLE ARGUMENTS WITH getops
99 | #common example with letters, for long options check longopts2getopts.sh
100 | options=":1:2:d:s:g:c:a:i:o:C:S:f:l:L:T:M:X:y:Y:RVtvh"
101 | while getopts $options opt; do
102 | case $opt in
103 | h )
104 | usage
105 | exit 1
106 | ;;
107 | v )
108 | echo $VERSION
109 | exit 1
110 | ;;
111 | \?)
112 | echo "Invalid Option: -$OPTARG" 1>&2
113 | usage
114 | exit 1
115 | ;;
116 | : )
117 | echo "Option -$OPTARG requires an argument." >&2
118 | exit 1
119 | ;;
120 | * )
121 | echo "Unimplemented option: -$OPTARG" >&2;
122 | exit 1
123 | ;;
124 |
125 | esac
126 | done
127 | shift $((OPTIND-1))
128 |
129 | ## Execute plasmidID with test data.
130 | echo "Executing:../plasmidID.sh -1 $R1 -2 $R2 -d $database -c $contigs -s KPN --no-trim"
131 | echo "Forward reads: $R1"
132 | echo "Reverse reads: $R2"
133 | echo "PlasmidDatabase: $database"
134 | echo "Contigs: $contigs"
135 | echo "Options: --no-trim"
136 |
137 | echo "export PATH=$PATH:$script_dir/../bin" > path
138 | source path
139 | $script_dir/../plasmidID -1 $script_dir/$R1 -2 $script_dir/$R2 -d $script_dir/$database -c $script_dir/$contigs -s KPN --no-trim
140 |
141 |
142 | echo "ALL DONE. TEST COMPLETED SUCCESSFULLY YOUR INSTALLATION SHOULD BE CORRECT."
143 |
--------------------------------------------------------------------------------