├── ITS-Pipeline
├── README.md
├── cutadapt_trim_adapters_and_primers.sh
└── img
├── PCR-twostep.jpg
├── PCR.png
├── README
├── water-testing.jpg
└── youtube-video-sequencing.png
/ITS-Pipeline:
--------------------------------------------------------------------------------
1 | # ITS_metabarcoding_analyses
2 | ITS2 pipeline
3 | Sequences were amplified following the EMP, http://www.earthmicrobiome.org/protocols-and-standards/its/
4 | Illumina HiSeq 2500, 250 bp paired-end reads.
5 |
6 | ## Nextera Adapters
7 | ### Adapters in Forward Reads
8 | >Trans2_rc_in_20_sequences
9 | CTGTCTCTTATACACATCTCCGAGCCCACGAGAC
10 |
11 | ### Adapters in Reverse Reads
12 | >Trans1_rc
13 | CTGTCTCTTATACACATCTGACGCTGCCGACGA
14 |
15 |
16 | ## Primer Sequences
17 | ### EMP.ITSkabir reverse primer (ITS2), barcoded
18 | Reverse complement of 3′ Illumina adapter -> CAAGCAGAAGACGGCATACGAGAT
19 | Golay barcode -> NNNNNNNNNN
20 | Reverse primer linker -> CG
21 | Reverse primer (ITS2; Note: This is identical to ITS2 from White et al., 1990.) -> GCTGCGTTCTTCATCGATGC
22 |
23 | ## Reference Database : UNITE
24 | #### https://unite.ut.ee/repository.php
25 | #### https://plutof.ut.ee/#/datacite/10.15156%2FBIO%2F587481
26 |
27 | wget "https://files.plutof.ut.ee/doi/0A/0B/0A0B25526F599E87A1E8D7C612D23AF7205F0239978CBD9C491767A0C1D237CC.zip"
28 |
29 |
30 | qiime tools import --type 'FeatureData[Taxonomy]' --source-format HeaderlessTSVTaxonomyFormat --input-path sh_ --output-path qiime-ref-taxonomy_99.qza
31 |
34 |
35 |
36 | ## Step 1: Trim Sequences and Remove Primers with cutadapt
37 | sbatch ./cutadapt_trim_adapters_and_primers.sh
38 |
39 | qiime tools import --type 'SampleData[PairedEndSequencesWithQuality]' --input-path cutadapt_data/ --source-format CasavaOneEightSingleLanePerSampleDirFmt --output-path cutadapt-paired-end.qza
40 |
41 |
42 | ## Step 1 NEW! : Trim sequences and remove primers in qiime2 : Make sure there are no reverse primers on your forward reads; the cutadapt plugin provides the trim-paired option
43 |
44 | qiime tools import --type 'SampleData[PairedEndSequencesWithQuality]' --input-path raw_data/ --source-format CasavaOneEightSingleLanePerSampleDirFmt --output-path raw-paired-end.qza
45 |
46 | qiime cutadapt trim-paired --i-demultiplexed-sequences raw-paired-end.qza --o-trimmed-sequences trimmed-paired-end.qza --p-cores 60 --p-anywhere-f CTGTCTCTTATACACATCTCCGAGCCCACGAGAC --p-anywhere-r CTGTCTCTTATACACATCTGACGCTGCCGACGA --p-front-f CAAGCAGAAGACGGCATACGAGAT
47 |
48 | ## Step 2: DADA2 : after trimming primers, you may want to disable truncation filtering entirely by setting trunc_len to 0
49 |
50 | qiime dada2 denoise-paired --p-trim-left-f 0 --p-trim-left-r 0 --p-trunc-len-f 0 --p-trunc-len-r 0 --i-demultiplexed-seqs trimmed-paired-end.qza --o-table trimmed-0-0-table --o-representative-sequences trimmed-req-seqs --verbose --p-n-threads 60
51 |
52 | ## I also ran the single ends for comparison
53 | qiime dada2 denoise-single --p-trim-left 0 --p-trunc-len 0 --i-demultiplexed-seqs cutadapt-single-end.qza --o-table cutadapt-single-end-0-0-table --o-representative-sequences cutadapt-single-end-rep-seqs --verbose --p-n-threads 60
54 |
55 |
56 | ## Step3: Assign Taxonomy: here we do it with qiime BLAST
57 |
58 | qiime feature-classifier classify-consensus-blast --i-query trimmed-req-seqs.qza --i-reference-taxonomy reference_UNITE/qiime-ref-taxonomy_99.qza --i-reference-reads reference_UNITE/qiime_fasta_99.qza --o-classification unite_99_9_trimmed-paired-end-classification_blast.qza --p-perc-identity 0.9 --p-maxaccepts 1
59 |
60 | qiime feature-classifier classify-consensus-blast --i-query cutadapt-single-end-rep-seqs.qza --i-reference-taxonomy reference_UNITE/qiime-ref-taxonomy_99.qza --i-reference-reads reference_UNITE/qiime_fasta_99.qza --o-classification unite_99_9_cutadapt-single-end-classification_blast.qza --p-perc-identity 0.9 --p-maxaccepts 1
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 | ### Make blast database for taxonomy assignment (go into database rep set) only if using Joe's taxonomy assignment
94 | makeblastdb -dbtype nucl -in 99_otus.fasta -out 99_otus
95 | #
96 | ## Make fresh directory for raw reads, all reads should be gzipped at this point.
97 |
98 | mkdir raw_reads && cd raw_reads
99 |
100 | mv /path/to/project/dir/*/*R1* ./
101 | mv /path/to/project/dir/*/*R3* ./
102 | #
103 | ## rename files to this format --> SampleName_barcode_L001_R1_001.fastq.gz
104 | ##### for your data I simply needed to fix the ones with ITS in it. (remove the -n to actually make the change)
105 | rename -v -n 's/(.*)-(.*)__(.*)/$1$2_$3/' *
106 | #
107 | ## Merge reads, add forward reads that did not merge to merged reads
108 | cd raw_reads
109 | join_reads_and_prep_for_qiime.py ./
110 |
111 |
112 |
113 | ## Truncate primers
114 |
115 | truncate_reverse_primer.py -f seqs.fna -m mapping_file.txt -o truncate_reverse_primers
116 |
117 | ## Open Reference OTU Picking
118 |
119 | pick_open_reference_otus.py -i seqs.fna -o pick_references_output_open -r /home/genome/joseph7e/MEG_its_test/its_database/rep_set/97_otus.fasta -p parameters
120 |
121 | ### Run qiime2 environment
122 |
123 |
124 |
125 | ## IMPORT TO QIIME 2 table
126 | qiime tools import \
127 | --input-path ../otu_table_mc2.biom \
128 | --type "FeatureTable[Frequency]" \
129 | --source-format BIOMV210Format \
130 | --output-path feature-table-from_qiime1.qza
131 |
132 | ### Import to qiime 2 taxonomy
133 |
134 | qiime tools import \
135 | --type "FeatureData[Taxonomy]" \
136 | --input-path ../uclust_assigned_taxonomy/rep_set_tax_assignments.txt \
137 | --output-path taxonomy_from_qiime1
138 |
139 | ### Summarize data in table
140 | qiime feature-table summarize \
141 | --i-table feature-table-from_qiime1.qza \
142 | --o-visualization table.qzv \
143 | --m-sample-metadata-file ../../../Microbiome_ITS_Mapping_File.csv
144 |
145 |
146 | ### Summarize through barplots
147 | qiime taxa barplot \
148 | --i-table feature-table-from_qiime1.qza \
149 | --i-taxonomy taxonomy_from_qiime1.qza \
150 | --m-metadata-file Microbiome_ITS_Mapping_File.tsv \
151 | --o-visualization taxa-bar-plots.qzv
152 |
153 |
154 |
155 | ## Create a tree
156 |
157 | ## Import the rep set from qiime1
158 |
159 | qiime tools import \
160 | --type "FeatureData[Sequence]" \
161 | --input-path ../rep_set.fna \
162 | --output-path rep_set_from_qiime1
163 |
164 | qiime alignment mafft \
165 | --i-sequences rep_set_from_qiime1.qza \
166 | --o-alignment aligned-rep-seqs.qza
167 |
168 | qiime alignment mask \
169 | --i-alignment aligned-rep-seqs.qza \
170 | --o-masked-alignment masked-aligned-rep-seqs.qza
171 |
172 |
173 | qiime phylogeny fasttree \
174 | --i-alignment masked-aligned-rep-seqs.qza \
175 | --o-tree unrooted-tree.qza
176 |
177 |
178 | ### root the tree
179 |
180 | qiime phylogeny midpoint-root --i-tree tree_unrooted_from_qiime1.qza --o-rooted-tree rooted-tree.qza
181 |
182 | ### Rarefy the data
183 |
184 | my_numbers : 6092, 2130, 10065
185 |
186 |
187 | sampling_depth=10065
188 |
189 | qiime diversity core-metrics \
190 | --i-phylogeny rooted-tree.qza \
191 | --i-table feature-table-from_qiime1.qza \
192 | --p-sampling-depth $sampling_depth \
193 | --output-dir core-metric_$sampling_depth
194 |
195 |
196 | ### Alpha diversity
197 |
198 | qiime diversity alpha-group-significance \
199 | --i-alpha-diversity core-metric_$sampling_depth/faith_pd_vector.qza \
200 | --m-metadata-file Microbiome_ITS_Mapping_File.tsv \
201 | --o-visualization core-metric_$sampling_depth/faith-pd-group-significance.qzv
202 |
203 | qiime diversity alpha-group-significance \
204 | --i-alpha-diversity core-metric_$sampling_depth/evenness_vector.qza \
205 | --m-metadata-file Microbiome_ITS_Mapping_File.tsv \
206 | --o-visualization core-metric_$sampling_depth/evenness-group-significance.qzv
207 |
208 | qiime diversity alpha-correlation \
209 | --i-alpha-diversity core-metric_$sampling_depth/faith_pd_vector.qza \
210 | --m-metadata-file Microbiome_ITS_Mapping_File.tsv \
211 | --o-visualization core-metric_$sampling_depth/faith-pd-correlation.qzv
212 |
213 | qiime diversity alpha-correlation \
214 | --i-alpha-diversity core-metric_$sampling_depth/evenness_vector.qza \
215 | --m-metadata-file Microbiome_ITS_Mapping_File.tsv \
216 | --o-visualization core-metric_$sampling_depth/evenness-correlation.qzv
217 |
218 |
219 | ### Beta diversity
220 |
221 | categories: #SampleID,BarcodeSequence,LinkerPrimerSequence,SampleType,Year,State,WNSStatus,Species,Sex,Description
222 |
223 | category=Year
224 |
225 | qiime diversity beta-group-significance \
226 | --i-distance-matrix core-metric_$sampling_depth/unweighted_unifrac_distance_matrix.qza \
227 | --m-metadata-file Microbiome_ITS_Mapping_File.tsv \
228 | --m-metadata-category $category \
229 | --o-visualization core-metric_$sampling_depth/unweighted-unifrac-$category-significance.qzv
230 |
231 |
232 | ### Emperor PCOA plots (must be numerical)
233 |
234 | sampling_depth=2130
235 | #category=WNSStatus
236 |
237 | qiime emperor plot \
238 | --i-pcoa core-metric_$sampling_depth/unweighted_unifrac_pcoa_results.qza \
239 | --m-metadata-file ../../../../Microbiome_ITS_Mapping_File.txt \
240 | --o-visualization core-metric_$sampling_depth/unweighted-unifrac-emperor.qzv
241 |
242 | qiime emperor plot \
243 | --i-pcoa core-metric_$sampling_depth/bray_curtis_pcoa_results.qza \
244 | --m-metadata-file ../../../../Microbiome_ITS_Mapping_File.txt \
245 | --o-visualization core-metric_$sampling_depth/bray-curtis-emperor.qzv
246 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Overview - 16S Metabarcoding with Qiime 2
2 | In this tutorial we'll go over how to use QIIME 2 to analyze metabarcoding data. We'll start with an introduction about how metabarcoding (aka amplicon) data is produced and with a refresher for working in the BASH command-line environment.
3 |
4 | ## Some definitions
5 |
6 | DNA taxonomy in a broad sense, means any form of analysis that uses variation in DNA sequence data to inform species delimitation.
7 |
8 | Barcoding - identification (taxonomically/phylogenetically) of an organism by a short section of DNA/RNA, usually through PCR amplification with conserved DNA/RNA primers.
9 |
10 | Polymerase chain reaction (PCR) - an amplification technique for cloning the specific or targeted parts of a DNA sequence to generate thousands to millions of copies of DNA of interest.
11 |
12 | Metabarcoding - barcoding of DNA/RNA or eDNA/eRNA in a manner that allows for identification of many taxa within the same sample. Also known as "amplicon" sequencing, or "marker gene sequencing".
13 |
14 | eDNA - environmental DNA.
15 |
16 | Metagenomics - *en masse* sequencing of a community of organisms using whole-genome shotgun sequencing
17 |
18 |
19 |
20 |
21 | Collect Sample | Extract DNA
22 | :-------------------------:|:-------------------------:
23 |
|
24 |
25 | PCR Amplification | Repeat for x Cycles
26 | :-------------------------:|:-------------------------:
27 |
|
28 |
29 |
30 | Prepare Library | Sequence DNA
31 | :-------------------------:|:-------------------------:
32 |  |
33 |
34 |
35 |
36 | image references:
37 |
38 |
39 | ## How NGS sequencing works
40 | [](https://www.youtube.com/watch?v=p4vKJJlNTKA&t=9s "Sequencing")
41 |
42 |
43 |
44 | Selecting a locus for barcoding
45 |
46 | 1. The targeted region should have little intraspecific variation (< 2% sequence divergence) and enough interspecific variation (> 2% sequence divergence) to distinguish different species.
47 |
48 | 2. It should be phylogenetically informative to allow the placement of newly barcoded organisms to accurate lineages.
49 |
50 | 3. The primer binding sites should be highly conserved and specific, so DNA amplification is reliable across all taxa in question. This is especially important for *en masse* community analyses (metabarcoding) and will make the development of universal primers more efficient and help alleviate potential PCR bias.
51 |
52 | 4. For studies utilizing HTS the target region must be small enough to be recovered on one or two sequencing reads when using paired-end information (<600 bps). Shorter sequences are also preferred for recovering barcoding sequences from preserved or degraded samples.
53 |
54 | 5. Reference sequence databases with taxonomic information exists for the region in question.
55 |
56 |
57 |
58 | ## Common primers used at the HCGS
59 | | Target-Organisms | Gene | Region | Location/Name | Length (bp) | Forward-primer | Reverse-primer | F_length | R_length | Reference |
60 | | --------------------- | ---- | --------- | ------------- | ---------------- | ---------------------- | --------------------------- | -------- | -------- | -------------------------- |
61 | | Prokaryotes | 16S | V4 | 515F-806R | ~390 | GTGYCAGCMGCCGCGGTAA | GGACTACNVGGGTWTCTAAT | 19 | 20 | Walters et al. 2016 |
62 | | Prokaryotes           | 16S  | V4-V5     | 515-926R      | ~510             | GTGYCAGCMGCCGCGGTAA    | CCGYCAATTYMTTTRAGTTT        | 19       | 20       | Stoeck et al. 2010         |
63 | | Microbial Eukaryotes | 18S | V9 | 1391F-1510R | ~210 - 310 | GTACACACCGCCCGTC | TGATCCTTCTGCAGGTTCACCTAC | 16 | 24 | Amaral-Zettler et al. 2009 |
64 | | Fungal and micro euks | ITS | ITS1-ITS2 | ITS1F-ITS2 | ~250 - 600 | CTTGGTCATTTAGAGGAAGTAA | GCTGCGTTCTTCATCGATGC | 22 | 20 | White et al., 1990 |
65 | | Fish | 12S | V5 | MiFish | ~163 - 185 | GTCGGTAAAACTCGTGCCAGC | CATAGTGGGGTATCTAATCCCAGTTTG | 21 | 27 | Miya et al, 2015 |
66 |
67 |
68 |
69 |
70 | ## General Notes:
71 | **For each program that we run in this tutorial I have provided a link to the manual**. These manuals provide a thorough explanation of what exactly we are doing. Before running the workflow on your own data you should read the manual/publication for the program.
72 |
73 | Throughout this tutorial the commands you will type are formatted into the gray text boxes (don't do it when learning but they can be faithfully copied and pasted). The '#' symbol indicates a comment, BASH knows to ignore these lines.
74 |
75 | This tutorial assumes a general understanding of the BASH environment. **You should be familiar with moving around the directories and understand how to manipulate files**.
76 |
77 |
78 | **Remember to tab complete!** There is a reason the tab is my favorite key. It prevents spelling errors and allows you to work much faster. Remember if a filename isn't auto-completing you can hit tab twice to see your files while you continue typing your command. If a file doesn't auto-complete it means you either have a spelling mistake, are in a different directory than you originally thought, or that it doesn't exist.
79 |
80 |
81 |
82 | # Let's Begin!
83 |
84 | ## Connect to the server
85 | See the BASH tutorials to get started.
86 |
87 | [BASH Tutorials](https://github.com/Joseph7e/HCGS-BASH-tutorial)
88 |
89 | [INBRE BASH Tutorials](https://geiselmed.dartmouth.edu/nhinbre/bioinformatics-modules/)
90 |
91 | ## Activate the genomics environment
92 | This is important and ensures that all the programs we use are updated and in working order. You'll need to do this every time you login to the server and need general bioinformatic tools.
93 |
94 | ```
95 | conda activate genomics
96 |
97 | conda info --envs
98 | ```
99 |
100 | ## BASH practice
101 |
102 | ```
103 | # setup working directory
104 | mkdir ~/bash-practice
105 | cd ~/bash-practice
106 |
107 | # copy example reads
108 | cp -r /home/share/examples/example-reads/ ./
109 |
110 | ```
111 |
112 | [Link explaining the 'Read Name Format'](http://support.illumina.com/content/dam/illumina-support/help/BaseSpaceHelp_v2/Content/Vault/Informatics/Sequencing_Analysis/BS/swSEQ_mBS_FASTQFiles.htm): SampleName_Barcode_LaneNumber_001.fastq.gz
113 |
114 |
115 |
116 | ## Sequencing Read Assessment
117 |
118 | Note the file extension - fastq.**gz**. Since these files are usually pretty big it is standard to receive them compressed. To view these files ourselves (which you normally wouldn't do) you either have to decompress the data with gunzip or by using variations of the typical commands. Instead of 'cat' we use 'zcat', instead of grep we can use 'zgrep'. Or just use less which allows you to stream a zipped file for viewing.
119 |
120 | ```bash
121 | less -S example-reads/*_R1_*
122 | ```
123 |
124 | #### Fastq File Format
125 | Each sequencing read entry is four lines long:
126 |
127 | - Line 1. Always begins with an '@' symbol and denotes the header. This is unique to each sequence and has info about the sequencing run.
128 |
129 | - Line 2. The actual sequencing read for your organism, a 250 bp string of As, Ts, Cs, and Gs.
130 |
131 | - Line 3. Begins with a '+' symbol, this is the header for the read quality. Usually the same as the first line header.
132 |
133 | - Line 4. Next are ascii symbols representing the quality score (see table below) for each base in your sequence. This denotes how confident we are in the base call for each respective nucleotide. This line is the same length as the sequencing line since we have a quality score for each and every base of the sequence.
134 |
135 | 
136 |
137 | 
138 |
139 | * Count The Number of Raw Reads
140 |
141 | I always start by counting the number of reads I have for each sample. This is done to quickly assess whether we have enough data for a meaningful analysis. Usually these files contain millions of reads, good thing BASH is great for parsing large files! Note that the forward and reverse reads will have the same number of entries so you only need to count one.
142 |
143 | ```bash
144 | # using grep. Note that I don't count just '@', this is because that symbol may appear in the quality lines.
145 | zgrep -c '^@' Sample*/*R1*
146 | # counting the lines and dividing by 4. Remember each read entry is exactly four lines long. These numbers should match.
147 | echo "$(zcat Sample*/*_R1_* | wc -l) / 4" | bc
148 | ```
149 | * What's our total bp of data? This is what we call our sequencing throughput. We multiply the number of reads by the read length (ours is 250) and by 2 because it is paired-end data.
150 |
151 | (Read length x 2(paired-end) x Number of reads)
152 |
153 | ```
154 | # we can do this calculation from the terminal with echo and bc (bc is the terminal calculator)
155 | echo "Number_of_reads * 250 * 2" | bc
156 | ```
157 |
158 | ## Examine Read Quality
159 | program: FASTQC
160 | manual: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
161 |
162 | [FASTQC explained](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/)
163 |
164 | * Run Fastqc
165 |
166 | FastQC is a program to summarize read qualities and base composition. Since we have millions of reads there is no practical way to do this by hand. We call the program to parse through the fastq files and do the hard work for us. **The input to the program is one or more fastq file(s) and the output is an html file with several figures.** The link above describes what each of the output figures are describing. I mainly focus on the first graph which visualizes our average read qualities and the last figure which shows the adapter content. Note that this program does not do anything to your data, as with the majority of the assessment tools, it merely reads it.
167 |
168 | ```bash
169 | # make a directory to store the output
170 | mkdir fastqc_raw-reads
171 | # run the program
172 | fastqc example-reads*/*_R1_* example-reads*/*_R2_* -o fastqc_raw-reads
173 | ls fastqc_raw-reads
174 | # the resulting folder should contain a zipped archive and an html file, we can ignore the zipped archive which is redundant.
175 | ```
176 |
177 | * Transfer resulting HTML files to computer using filezilla or with the command line on OSX/Linux.
178 |
179 | On filezilla you will need to enter the same server information as when you log in from the terminal. Be sure to use port 22.
180 |
181 | ```bash
182 | # to get the absolute path to a file you can use the ‘readlink’ command.
183 | readlink -f fastqc_raw-reads/*.html
184 | # copy those paths, we will use them for the file transfer
185 | # In a fresh terminal on OSX, Linux, or BASH for windows
186 | scp USERNAME@ron.sr.unh.edu:/home/GROUP/USERNAME/mdibl-t3-2019-WGS/fastqc_raw-reads/*.html /path/to/put/files
187 | ```
188 |
189 | * Transfer resulting HTML files to computer using filezilla or with the command line on OSX/Linux.
190 |
191 | On filezilla you will need to enter the same server information as when you log in from the terminal. Be sure to use port 22.
192 |
193 | ```bash
194 | # to get the absolute path to a file you can use the ‘readlink’ command.
195 | readlink -f fastqc_raw-reads/*.html
196 | # copy those paths, we will use them for the file transfer
197 | # In a fresh terminal on OSX, Linux, or BASH for windows
198 | scp USERNAME@ron.sr.unh.edu:/home/GROUP/USERNAME/bash-practice/fastqc_raw-reads/*.html /path/to/put/files
199 | ```
200 |
201 | 
202 |
203 |
204 | ## Qiime2 documentation
205 |
206 | 
207 |
208 | "QIIME 2™ is a next-generation microbiome bioinformatics platform that is extensible, free, open source, and community developed."
209 |
210 | [Qiime2 user documentation](https://docs.qiime2.org/2022.2/)
211 |
212 | [Qiime2 visuals](https://view.qiime2.org/)
213 |
214 | [Moving Pictures Tutorial](https://docs.qiime2.org/2022.2/tutorials/moving-pictures/)
215 |
216 | [Getting Oriented](https://docs.qiime2.org/2022.2/tutorials/overview/#let-s-get-oriented-flowcharts)
217 |
218 |
219 |
220 | ## Example Data
221 |
222 | These data are from set of mouse fecal samples provided by [Jason Bubier from The Jackson Laboratory](https://www.jax.org/research-and-faculty/faculty/research-scientists/jason-bubier).
223 | The samples were run targeting the V1-V3 region of the 16S gene using the 27F - 534R primer pair on an Illumina MiSeq on a paired-end 300 bp run.
224 |
225 | ### Primers
226 | ~~~
227 | 27F [20 bp]
228 | 5'AGM GTT YGA TYM YGG CTC AG
229 | 534R [17 bp]
230 | 5'ATT ACC GCG GCT GCT GG
231 | ~~~
232 |
233 |
234 |
235 |
236 | **image source - https://help.ezbiocloud.net/16s-rrna-and-16s-rrna-gene/**
237 |
238 |
239 |
240 | For Metadata we have the sex, strain, and age (# days).
241 | Our goal is to examine the correlation of the fecal microbiome we observe with these metadata.
242 | We will primarily use the [Qiime 2](https://qiime2.org/) bioinformatics platform.
243 | Qiime 2 is free and open source and available from Linux and OSX.
244 | We will use the Qiime2 command line interface, there is also the ["Artifact" python API](https://docs.qiime2.org/2019.4/interfaces/artifact-api/) which can be more powerful.
245 |
246 | ### Copy starting data
247 | ~~~bash
248 | mkdir hcgs-qiime2-workshop
249 | cd hcgs-qiime2-workshop
250 | cp -r /home/share/examples/cocaine_mouse/* .
251 | ls
252 |
253 | # mdat.tsv
254 |
255 | less -S mdat.tsv
256 | #SampleID Sex Treatment Strain Date PrePost Dataset HaveBred PerformedPCR Description pptreatment Testing
257 | #q2:types categorical categorical categorical numeric categorical categorical categorical categorical categorical categorical
258 | JBCDJ00OLJ1STT0B00000191821C7M7FGT1904904 F Sham CC004 0 Pre Dataset1 Jax 19182_1 PreSham Train
259 | JBCDJ00OLK1STT0B00000191671C7M7FGT1904905 F Coc CC041 0 Pre Dataset1 Jax 19167_1 PreCoc Train
260 | JBCDJ00OLL1STT0B00000191771C7M7FGT1904906 M Sham CC004 0 Pre Dataset1 Jax 19177_1 PreSham Test
261 | JBCDJ00OLM1STT0B00000191861C7M7FGT1904907 M Coc CC004 0 Pre Dataset1 Jax 19186_1 PreCoc Test
262 | JBCDJ00OLN1STT0B00000191791C7M7FGT1904908 F Coc CC004 0 Pre Dataset1 Jax 19179_1 PreCoc Train
263 | JBCDJ00OLO1STT0B00000191691C7M7FGT1904909 F Sham CC041 0 Pre Dataset1 Jax 19169_1 PreSham Test
264 | JBCDJ00OLP1STT0B00000191731C7M7FGT1904910 M Coc CC041 0 Pre Dataset1 Jax 19173_1 PreCoc Test
265 | JBCDJ00OLQ1STT0B00000191641C7M7FGT1904911 M Coc CC041 0 Pre Dataset1 Jax 19164_1 PreCoc Train
266 | JBCDJ00OLR1STT0B00000191801C7M7FGT1904912 F Sham CC004 0 Pre Dataset1 Jax 19180_1 PreSham Train
267 | JBCDJ00OLS1STT0B00000191831C7M7FGT1904913 F Coc CC004 0 Pre Dataset1 Jax 19183_1 PreCoc Train
268 | JBCDJ00OLT1STT0B00000191841C7M7FGT1904914 M Sham CC004 0 Pre Dataset1 Jax 19184_1 PreSham Train
269 | JBCDJ00OLU1STT0B00000191711C7M7FGT1904915 M Coc CC041 0 Pre Dataset1 Jax 19171_1 PreCoc Train
270 | JBCDJ00OLV1STT0B00000191681C7M7FGT1904916 F Sham CC041 0 Pre Dataset1 Jax 19168_1 PreSham Train
271 |
272 | ~~~
273 | When we look at the metadata file we see the metadata that we will be able to use during our analysis
274 |
275 | ## Running Qiime2 commands
276 |
277 | ~~~bash
278 | ## Anatomy of a qiime command
279 | qiime plugin action\
280 | --i-inputs foo\ ## input arguments start with --i
281 | --p-parameters bar\ ## parameters start with --p
282 | --m-metadata mdat\ ## metadata options start with --m
283 | --o-outputs out ## and output starts with --o
284 | ~~~
285 | Qiime works on two types of files, Qiime Zipped Archives (.qza) and Qiime Zipped Visualizations (.qzv). Both are simply renamed .zip archives that hold the appropriate qiime data in a structured format. This includes a "provenance" for that object which tracks the history of commands that led to it. The qza files contain data, while the qzv files contain visualizations displaying some data. We'll look at a quality summary of our reads to help decide how to trim and truncate them.
286 |
287 | # Import data into Qiime2
288 | ~~~bash
289 | qiime tools import\
290 | --type 'SampleData[PairedEndSequencesWithQuality]'\
291 | --input-path manifest.csv\
292 | --output-path demux\
293 | --input-format PairedEndFastqManifestPhred33
294 | ## the correct extension is automatically added for the output by qiime.
295 | ~~~
296 |
297 | ## Quality Control
298 | Now we want to look at the quality profile of our reads. Our goal is to determine how much we should truncate the reads before the paired end reads are joined. This will depend on the length of our amplicon, and the quality of the reads.
299 | ~~~bash
300 | qiime demux summarize\
301 | --i-data demux.qza\
302 | --o-visualization demux
303 | ~~~
304 | When looking we want to answer these questions:
305 |
306 | How much total sequence do we need to preserve a sufficient overlap to merge the paired-end reads?
307 |
308 | How much poor quality sequence can we truncate before trying to merge?
309 |
310 | In this case we know our amplicons are about 390 bp long, and we want to preserve approximately 50 bp combined overlap. So our target is to retain ~450 bp of total sequence from the two reads. 450 bp/2 = 225 bp but looking at the demux.qzv, the forward reads seem to be higher quality than the reverse, so let's retain more of the forward and less of the reverse.
311 |
312 | ## Denoising
313 | We're now ready to denoise our data. Through qiime we will be using the program DADA2, the goal is to take our imperfectly sequenced reads, and recover the "real" sequence composition of the sample that went into the sequencer.
314 | DADA2 does this by learning the error rates for each transition between bases at each quality score. It then assumes that all of the sequences are errors off the same original sequence. Then using the error rates it calculates the likelihood of each sequence arising. Sequences with a likelihood falling below a threshold are split off into their own groups and the algorithm is iteratively applied. Because of the error model we should only run samples which were sequenced together through dada2 together, as different runs may have different error profiles. We can merge multiple runs together after dada2. During this process dada2 also merges paired end reads, and checks for chimeric sequences.
315 | ~~~bash
316 | qiime dada2 denoise-paired\
317 | --i-demultiplexed-seqs demux.qza\
318 | --p-trim-left-f 20 --p-trim-left-r 17\
319 | --p-trunc-len-f 295 --p-trunc-len-r 275\
320 | --p-n-threads 18\
321 | --o-denoising-stats dns\
322 | --o-table table\
323 | --o-representative-sequences rep-seqs
324 | ~~~
325 |
326 | Now lets visualize the results of Dada2.
327 | ~~~bash
328 | ## Metadata on denoising
329 | qiime metadata tabulate\
330 | --m-input-file dns.qza\
331 | --o-visualization dns
332 | ## Unique sequences across all samples
333 | qiime feature-table tabulate-seqs\
334 | --i-data rep-seqs.qza\
335 | --o-visualization rep-seqs
336 | ## Table of per-sample sequence counts
337 | qiime feature-table summarize\
338 | --i-table table.qza\
339 | --m-sample-metadata-file mdat.tsv\
340 | --o-visualization table
341 | ~~~
342 | Looking at dns.qzv first we can see how many sequences passed muster for each sample at each step performed by dada2. Here we are seeing great final sequence counts, and most of the sequences being filtered in the initial quality filtering stage. Relatively few are failing to merge, which suggests we did a good job selecting our truncation lengths.
343 |
344 | In the table.qzv we can see some stats on our samples. We have millions of counts spread across thousands of unique sequences and tens of samples. We'll come back to the table.qzv file when we want to select our rarefaction depth.
345 |
346 | In the rep-seqs.qzv we can see the sequences and the distribution of sequence lengths. Each sequence is a link to a web-blast against the ncbi nucleotide database.
347 | The majority of the sequences we observe are in our expected length range.
348 | Later on we can use this to blast specific sequences we are interested in against the whole nucleotide database.
349 |
350 |
351 | # Extract data from qiime2
352 | qiime tools extract --input-path table.qza --output-path extracted-table
353 | biom convert -i extracted-table/*/data/feature-table.biom -o feature-table.tsv --to-tsv
354 | qiime tools extract --input-path rep-seqs.qza --output-path extracted-seqs
355 |
356 |
357 |
358 |
359 | ## Taxonomic Assignment
360 | VSEARCH uses a fast heuristic based on words shared by the query and target sequences in order to quickly identify similar sequences.
361 |
362 | The main qiime2 tutorial utilizes a pre-trained Naive Bayes classifier (machine learning) and the q2-feature-classifier plugin. Here we will utilize Vsearch which works well out-of-the-box for most datasets. The output is similar to BLAST.
363 |
364 | ~~~bash
365 | qiime feature-classifier classify-consensus-vsearch\
366 | --i-query rep-seqs.qza\
367 | --i-reference-reads /home/share/databases/SILVA_databases/silva-138-99-seqs.qza\
368 | --i-reference-taxonomy /home/share/databases/SILVA_databases/silva-138-99-tax.qza\
369 | --p-maxaccepts 5 --p-query-cov 0.4\
370 | --p-perc-identity 0.7\
371 | --o-classification taxonomy\
372 | --p-threads 72
373 | ~~~
374 |
375 | ~~~bash
376 | qiime metadata tabulate\
377 | --m-input-file taxonomy.qza\
378 | --o-visualization taxonomy.qzv
379 |
380 | qiime taxa barplot --i-table table.qza\
381 | --i-taxonomy taxonomy.qza\
382 | --o-visualization taxa-barplot\
383 | --m-metadata-file mdat.tsv
384 | ~~~
385 |
386 | ## Diversity analysis
387 | Our next step is to look at the diversity in the sequences of these samples.
388 | Here we will use the differences between the sequences in the sample, and metrics to quantify those differences to tell us about the diversity, richness and evenness of the sequence variants found in the samples.
389 | In doing so we will construct a de novo phylogenetic tree, which works much better if we first remove any spurious sequences that are not actually the target region of our 16S gene.
390 | To do that we will use our taxonomic assignments to filter out sequences that remained Unassigned, are assigned only as Bacteria or are Eukaryotes. We should look at what we are filtering out and try and find out what it is.
391 |
392 | ~~~bash
393 | ## exact match to filter out unassigned and Bacteria
394 | ## exact match, because "Bacteria" is part of many other assignments that we want to keep.
395 | qiime taxa filter-table\
396 | --i-table table.qza\
397 | --i-taxonomy taxonomy.qza\
398 | --p-exclude "Unassigned,D_0__Bacteria"\
399 | --p-mode exact\
400 | --o-filtered-table bacteria-table
401 |
402 | ## Partial match to Eukaryota to filter out any Euks
403 | qiime taxa filter-table\
404 | --i-table bacteria-table.qza\
405 | --i-taxonomy taxonomy.qza\
406 | --p-exclude "Eukaryota"\
407 | --o-filtered-table bacteria-table2
408 |
409 | mv bacteria-table2.qza bacteria-table.qza
410 |
411 | ## Any additional sequences that we should exclude can be filtered on a "per feature basis"
412 | ## In this case we had some sequences that look like they were sequenced backwards!
413 |
414 | qiime feature-table filter-features\
415 | --i-table bacteria-table.qza\
416 | --m-metadata-file exclude.tsv\
417 | --p-exclude-ids\
418 | --o-filtered-table bact-table.qza
419 |
420 | ## How much did we filter out?
421 | qiime feature-table summarize\
422 | --i-table bacteria-table.qza\
423 | --m-sample-metadata-file mdat.tsv\
424 | --o-visualization bacteria-table
425 |
426 | ## Does it look very different?
427 | qiime taxa barplot --i-table bacteria-table.qza\
428 | --i-taxonomy taxonomy.qza\
429 | --o-visualization bacteria-taxa-barplot\
430 | --m-metadata-file mdat.tsv
431 |
432 | ## Filter the sequences to reflect the new table.
433 | qiime feature-table filter-seqs\
434 | --i-table bacteria-table.qza\
435 | --i-data rep-seqs.qza\
436 | --o-filtered-data bacteria-rep-seqs
437 |
438 | qiime feature-table tabulate-seqs\
439 | --i-data bacteria-rep-seqs.qza\
440 | --o-visualization bacteria-rep-seqs
441 | ~~~
442 |
443 | Now that we have only our target region we can create the de novo phylogenetic tree.
444 | We'll use the default qiime2 pipeline because it is quick and easy to run, while providing good results.
445 | This pipeline first performs a multiple sequence alignment with mafft; this alignment would be significantly worse if we had not removed the non-target sequences.
446 | It then masks highly variable parts of the sequence as they add noise to the tree.
447 | It then uses FastTree to create an unrooted phylogenetic tree which is then midpoint rooted.
448 |
449 | ~~~bash
450 | qiime phylogeny align-to-tree-mafft-fasttree\
451 | --i-sequences bacteria-rep-seqs.qza\
452 | --o-alignment aligned-rep-seqs.qza\
453 | --o-masked-alignment masked-aligned-rep-seqs.qza\
454 | --o-tree unrooted-tree.qza\
455 | --o-rooted-tree rooted-tree.qza\
456 | --p-n-threads 18
457 | ~~~
458 |
459 | Now we can look at the [tree we created on iToL](https://itol.embl.de/tree/20922221311082651562009639).
460 | And for reference here is [the tree if we had not filtered it](https://itol.embl.de/tree/209222213110293351562082149).
461 | We can see that the filtering upped the contrast between different groups.
462 |
463 | Now we are ready to run some diversity analysis!
464 | We are going to start by running qiime's core phylogenetic pipeline, which will take into account the relationships between sequences, as represented by our phylogenetic tree.
465 | It will calculate a few key metrics for us: faith's-pd, a measure of phylogenetic diversity; evenness; and several beta diversity statistics, like weighted and unweighted UniFrac.
466 | To do these comparisons we need to make our samples comparable to each other.
467 | The way this is generally done is to rarefy the samples to the same sampling depth.
468 | We can use the bacteria-table.qzv we made earlier to inform this decision.
469 | We want to balance setting as high as possible of a rarefaction depth to preserve as many reads as possible, while setting it low enough to preserve as many samples as possible.
470 |
471 |
472 | ## Create rarefaction plots
473 | https://docs.qiime2.org/2022.2/plugins/available/diversity/alpha-rarefaction/?highlight=rarefaction
474 |
475 | https://www.drive5.com/usearch/manual/rare.gif
476 |
477 |
478 | ```bash
479 | qiime diversity alpha-rarefaction --i-table bacteria-table.qza --i-phylogeny rooted-tree.qza --p-max-depth 5000 --p-steps 100 --m-metadata-file mdat.tsv --o-visualization alpha-rarefaction.qzv
480 |
481 |
482 | ```
483 |
484 | ~~~bash
485 | qiime diversity core-metrics-phylogenetic\
486 | --i-phylogeny rooted-tree.qza\
487 | --i-table bacteria-table.qza\
488 | --p-sampling-depth 16951\
489 | --m-metadata-file mdat.tsv\
490 | --output-dir core-metrics-results
491 | ~~~
492 | From this initial step we can start by looking at some PCoA plots, we'll augment the PCoA with some of the most predictive features.
493 |
494 | ~~~bash
495 | qiime feature-table relative-frequency\
496 | --i-table core-metrics-results/rarefied_table.qza\
497 | --o-relative-frequency-table core-metrics-results/relative_rarefied_table
498 |
499 | qiime diversity pcoa-biplot\
500 | --i-features core-metrics-results/relative_rarefied_table.qza\
501 | --i-pcoa core-metrics-results/unweighted_unifrac_pcoa_results.qza\
502 | --o-biplot core-metrics-results/unweighted_unifrac_pcoa_biplot
503 |
504 | qiime emperor biplot\
505 | --i-biplot core-metrics-results/unweighted_unifrac_pcoa_biplot.qza\
506 | --m-sample-metadata-file mdat.tsv\
507 | --o-visualization core-metrics-results/unweighted_unifrac_pcoa_biplot
508 | ~~~
509 | We can see that the strains separate well, which implies that we should be able to find some separating distances in our data.
510 |
511 | Lets start looking for those differences by looking at differences in diversity as a whole.
512 | For numeric metadata categories we can plot our favorite metrics with the value of that metadata.
513 |
514 | ~~~bash
515 | qiime diversity alpha-correlation\
516 | --i-alpha-diversity core-metrics-results/faith_pd_vector.qza\
517 | --m-metadata-file mdat.tsv\
518 | --o-visualization core-metrics-results/faith-alpha-correlation
519 | ~~~
520 |
521 | Then for the categorical metadata categories we can plot some box and whisker plots.
522 | ~~~bash
523 | qiime diversity alpha-group-significance\
524 | --i-alpha-diversity core-metrics-results/faith_pd_vector.qza\
525 | --m-metadata-file mdat.tsv\
526 | --o-visualization core-metrics-results/faith-group-significance
527 | ~~~
528 |
529 | ## Differential Abundance Analysis
530 | Now lets combine the taxonomy with the diversity analysis to see if there are related groups of organisms that are differentially abundant groups within the samples.
531 | We'll start by combining our table and tree into a hierarchy and set of balances.
532 | Balances are the weighted log ratios of sets of features for samples.
533 | And we will be looking for significant differences in the balances between groups of samples.
534 | ~~~bash
535 | qiime gneiss ilr-phylogenetic\
536 | --i-table bacteria-table.qza\
537 | --i-tree rooted-tree.qza\
538 | --o-balances balances --o-hierarchy hierarchy
539 | ~~~
540 | To view the sets used in the balances we can plot out a heatmap of feature abundance which highlights the ratios.
541 | ~~~bash
542 | qiime gneiss dendrogram-heatmap\
543 | --i-table bacteria-table.qza\
544 | --i-tree hierarchy.qza\
545 | --m-metadata-file mdat.tsv\
546 | --m-metadata-column Strain\
547 | --p-color-map seismic\
548 | --o-visualization heatmap.qzv
549 | ~~~
550 | With this we can begin to look at specific balances to see their composition.
551 | ~~~bash
552 | for ((i=0; i<10; i++)); do
553 | qiime gneiss balance-taxonomy\
554 | --i-table bacteria-table.qza\
555 | --i-tree hierarchy.qza\
556 | --i-taxonomy taxonomy.qza\
557 | --p-taxa-level 5\
558 | --p-balance-name y$i\
559 | --m-metadata-file mdat.tsv\
560 | --m-metadata-column Strain\
561 | --o-visualization y${i}_taxa_summary.qzv
562 | done
563 | ~~~
564 |
--------------------------------------------------------------------------------
/cutadapt_trim_adapters_and_primers.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
#SBATCH --ntasks=1
#SBATCH --job-name="CUTADAPT"
#SBATCH --output="CUTADAPT"
#
# Trim Nextera adapters and EMP ITS2 primers from paired-end reads with
# cutadapt. Reads sample path prefixes (one per line) from readnamelist.txt
# and writes trimmed FASTQs into ./trimmed/.
#
# Fixes vs. previous version: the script uses bash arrays, so the shebang is
# now bash (it was /usr/bin/sh); strict mode added; mkdir -p so resubmission
# does not fail; all expansions quoted; cutadapt binary and flags passed
# explicitly instead of via a word-split command string.

set -euo pipefail

module purge

# --- Trimming parameters -----------------------------------------------------
READ1="CTGTCTCTTATACACATCTCCGAGCCCACGAGAC"  # Nextera adapter found in FWD reads
READ2="CTGTCTCTTATACACATCTGACGCTGCCGACGA"   # Nextera adapter found in REV reads
# NOTE(review): the README lists the reverse complement of the 3' Illumina
# adapter as CAAGCAGAAGACGGCATACGAGAT (no leading T) -- confirm the extra
# leading T below is intentional before changing it.
PRIMER_F="TCAAGCAGAAGACGGCATACGAGAT"        # reverse complement of 3' Illumina adapter
PRIMER_R="GCTGCGTTCTTCATCGATGC"             # reverse primer (ITS2)
MIN_LENGTH=50          # discard trimmed reads shorter than this (--minimum-length)
OVERLAP_MIN_LENGTH=10  # minimum adapter/read overlap required to trim (-O)
MIN_QUALITY=10         # 3' quality-trimming cutoff (-q)
readonly CUTADAPT_BIN="/mnt/lustre/macmaneslab/maa1024/.local/bin/cutadapt"

# sample_basename PATH
# Echo the third '/'-separated field of PATH; reproduces the original
# `echo $i | cut -d'/' -f3` used to derive the trimmed output file names.
sample_basename() {
  cut -d'/' -f3 <<<"$1"
}

# mapfile avoids the word-splitting/globbing surprises of NAMES=(`cat ...`).
mapfile -t NAMES < readnamelist.txt

mkdir -p trimmed

for sample in "${NAMES[@]}"; do
  printf '%s\n' "$sample"
  forward_read="${sample}_R1_001.fastq"
  reverse_read="${sample}_R2_001.fastq"
  base=$(sample_basename "$sample")
  trimmed_r1="${base}_R1_001.fq"
  trimmed_r2="${base}_R2_001.fq"

  srun "${CUTADAPT_BIN}" \
    -q "${MIN_QUALITY}" --minimum-length "${MIN_LENGTH}" -O "${OVERLAP_MIN_LENGTH}" \
    -a "${READ1}" -A "${READ2}" \
    -g "${PRIMER_F}" -G "${PRIMER_R}" \
    -o "trimmed/${trimmed_r1}" -p "trimmed/${trimmed_r2}" \
    "${forward_read}" "${reverse_read}"
done
33 |
--------------------------------------------------------------------------------
/img/PCR-twostep.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Joseph7e/HCGS_Metabarcoding_Tutorials/d91e585cc204739fc9016a5ef833e11e2c33e1a4/img/PCR-twostep.jpg
--------------------------------------------------------------------------------
/img/PCR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Joseph7e/HCGS_Metabarcoding_Tutorials/d91e585cc204739fc9016a5ef833e11e2c33e1a4/img/PCR.png
--------------------------------------------------------------------------------
/img/README:
--------------------------------------------------------------------------------
1 | images for main workflow
2 |
--------------------------------------------------------------------------------
/img/water-testing.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Joseph7e/HCGS_Metabarcoding_Tutorials/d91e585cc204739fc9016a5ef833e11e2c33e1a4/img/water-testing.jpg
--------------------------------------------------------------------------------
/img/youtube-video-sequencing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Joseph7e/HCGS_Metabarcoding_Tutorials/d91e585cc204739fc9016a5ef833e11e2c33e1a4/img/youtube-video-sequencing.png
--------------------------------------------------------------------------------