├── .DS_Store
├── README.md
├── _config.yml
├── assets
    ├── css
    │   └── style.scss
    └── images
    │   └── dna-sequence-1600x800.jpg
├── fastqc
    ├── Icons
    │   ├── error.png
    │   ├── fastqc_icon.png
    │   ├── tick.png
    │   └── warning.png
    ├── Images
    │   ├── duplication_levels.png
    │   ├── kmer_profiles.png
    │   ├── per_base_gc_content.png
    │   ├── per_base_n_content.png
    │   ├── per_base_quality.png
    │   ├── per_base_sequence_content.png
    │   ├── per_sequence_gc_content.png
    │   ├── per_sequence_quality.png
    │   └── sequence_length_distribution.png
    └── Mov10oe_1-fastqc_report.html
├── img
    ├── .DS_Store
    ├── DNA_to_protein_or_ncRNA.svg.png
    ├── FastQC_contam.png
    ├── FastQC_seq_qual.png
    ├── Filezilla_step1.png
    ├── Filezilla_step2.png
    ├── Gene_products.png
    ├── Gene_structure.png
    ├── IGV_mov10.png
    ├── README.md
    ├── RNA-seq_library_prep.png
    ├── RNAseqWorkflow.png
    ├── R_screenshot.png
    ├── R_screenshot2.png
    ├── Rstudio_interface.png
    ├── SAM_file.png
    ├── Slide1.jpg
    ├── alignment_STAR_step1.ai
    ├── alignment_STAR_step1.png
    ├── alignment_STAR_step2.ai
    ├── alignment_STAR_step2.png
    ├── alignment_STAR_step3.ai
    ├── alignment_STAR_step3.png
    ├── alignment_STAR_step4.ai
    ├── alignment_STAR_step4.png
    ├── alignment_STAR_step5.ai
    ├── alignment_STAR_step5.png
    ├── alignmentfree_workflow_aug2017.png
    ├── alignmentfree_workflow_june2017.png
    ├── bad_quality.png
    ├── batch_effect.png
    ├── batch_effect_pca.png
    ├── bioconductor_logo.png
    ├── bitwiseflags.png
    ├── cigar_strings.png
    ├── complete_wd_setup.png
    ├── confounded_batch.png
    ├── confounded_design.png
    ├── console.png
    ├── corr_map.png
    ├── count-fig1.png
    ├── count-fig2.png
    ├── count-matrix.png
    ├── count_matrix.png
    ├── counts-workflow.jpg
    ├── counts-workflow.png
    ├── counts_view.png
    ├── cran_packages.png
    ├── data_life_cycle_gouldv2.png
    ├── de_norm_counts_var.png
    ├── de_replicates_img.png
    ├── de_variation.png
    ├── demultiplexing.png
    ├── drawings.pptx
    ├── environment.png
    ├── exp_design.png
    ├── factors.png
    ├── factors_both.png
    ├── factors_new.png
    ├── factors_sm.png
    ├── factors_sm_intact.png
    ├── fastqc_results.png
    ├── feature-overlap.png
    ├── flow_cells.png
    ├── gProfiler.png
    ├── gene_expression2.png
    ├── gene_expression_cells.png
    ├── genemania.png
    ├── getwd.png
    ├── good_quality.png
    ├── gvng.jpg
    ├── igv_screenshot.png
    ├── illumina_platforms.png
    ├── illumina_sequencing.png
    ├── long_read_tech.png
    ├── metadata_batch.png
    ├── non_confounded_design.png
    ├── paired-end_data.png
    ├── paired_end_reads.png
    ├── pca_plot.png
    ├── permission-directory.png
    ├── placeholder.png
    ├── pseudo_count_comparison-cufflinks.png
    ├── pseudo_count_comparison-sailfish.png
    ├── pseudo_count_comparison-sailfish_sm.png
    ├── pseudo_count_comparison-star.png
    ├── pseudo_count_comparison-star_sm.png
    ├── pseudo_count_comparison.gif
    ├── pseudo_count_comparison.png
    ├── r_starting_how_it_should_like.png
    ├── replicates.png
    ├── rnaseq_workflow.png
    ├── rnaseq_workflow_FASTQC.png
    ├── rnaseq_workflow_trimming.png
    ├── rrna.png
    ├── rstudio_logo.png
    ├── salmon_quasialignment.png
    ├── salmon_rstudio.png
    ├── salmon_workflow_updated.png
    ├── sam_bam.png
    ├── sam_bam2.png
    ├── sam_bam3.png
    ├── star.png
    ├── teachin-team.png
    ├── union.png
    ├── vim_insert.png
    ├── vim_postsave.png
    ├── vim_quit.png
    ├── vim_save.png
    ├── vim_spider.png
    ├── vim_spider_number.png
    ├── why_R.png
    ├── workflow_alignment.png
    └── wrap_option.png
├── lectures
    ├── 2_day
    │   ├── HPC_intro_O2.pdf
    │   ├── Intro_to_workshop.pdf
    │   ├── RNAseq-analysis-methods.pdf
    │   ├── Wrap_up.pdf
    │   └── rna-seq_design.pdf
    ├── Accessing_genomics_dataonline.pdf
    ├── HPC_intro_O2.pdf
    ├── Intro_to_workshop.pdf
    ├── NGS_workflows.pdf
    ├── RNA-seq_troubleshooting.pdf
    ├── RNAseq-analysis-methods.pdf
    ├── Sequence_alignment.pdf
    ├── error_profiles_mm.pdf
    ├── other rnaseq applications.pdf
    ├── rna-seq_design.pdf
    └── sequencing_technologies_mm.zip
├── lessons
    ├── .DS_Store
    ├── 01_data_organization.md
    ├── 02_assessing_quality.md
    ├── 03_alignment.md
    ├── 04_alignment_quality.md
    ├── 05_counting_reads.md
    ├── 06_multiQC.md
    ├── 07_automating_workflow.md
    ├── 08_salmon.md
    ├── 2day_rnaseq_workflow.md
    ├── DE_analysis.md
    ├── advanced_bash.md
    ├── experimental_planning_considerations.md
    └── rnaseq_workflow.md
├── sam.md
├── schedule
    ├── .DS_Store
    ├── 2-day
    │   ├── .DS_Store
    │   └── README.md
    └── README.md
└── scripts
    ├── mov10_fastqc.run
    ├── rnaseq_analysis_on_allfiles_for-slurm.sh
    ├── rnaseq_analysis_on_input_file.sh
    └── salmon_all_samples.sh


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/.DS_Store


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | **THIS REPO IS ARCHIVED, PLEASE GO TO [https://hbctraining.github.io/main](https://hbctraining.github.io/main) FOR CURRENT LESSONS.**
 2 | 
 3 | # Introduction to RNA-seq using high-performance computing (HPC)
 4 | 
 5 | | Audience | Computational skills required | Duration |
 6 | :----------|:-------------|:----------|
 7 | | Biologists | None | 2- or 3-day workshop (~13 - 19.5 hours of trainer-led time)|
 8 | 
 9 | ### Description
10 | 
11 | This repository has teaching materials for a 2-day Introduction to RNA-sequencing data analysis workshop. This workshop focuses on teaching basic computational skills to enable the effective use of an high-performance computing environment to implement an RNA-seq data analysis workflow. It includes an introduction to shell (bash) and shell scripting. In addition to running the RNA-seq workflow from FASTQ files to count data, the workshop covers best practice guidlelines for RNA-seq experimental design and data organization/management.
12 | 
13 | > These materials were developed for a trainer-led workshop, but are also amenable to self-guided learning.
14 | 
15 | ### Learning Objectives
16 | 
17 | 1.	Understand the necessity for, and use of, the command line interface (bash) and HPC for analyzing high-throughput sequencing data.
18 | 2.	Understand best practices for designing an RNA-seq experiment and analysis the resulting data.
19 | 
20 | ### Lessons
21 | 
22 | Below are links to the lessons and suggested schedules:
23 | 
24 | * 2 day schedule
25 | * [3 day schedule](https://hbctraining.github.io/Intro-to-rnaseq-hpc-O2/schedule/)  
26 | 
27 | ### Installation Requirements
28 | 
29 | ***All:***
30 | 
31 | * [FileZilla](https://filezilla-project.org/download.php?type=client) (make sure you get 'FileZilla Client')
32 | * [Integrative Genomics Viewer (IGV)](http://software.broadinstitute.org/software/igv/) (scroll down on the page for Download options). If you have trouble opening IGV after installing it, you may need to install [Java](https://www.java.com/en/download/).
33 | 
34 | ***Mac users:***
35 | 
36 | * Plain text editor like [Sublime text](http://www.sublimetext.com/) or similar
37 | 
38 | ***Windows users:***
39 | 
40 | * [GitBash](https://git-scm.com/download/win)
41 | * Plain text editor like [Notepad++](http://notepad-plus-plus.org/) or similar
42 | 
43 | ### Dataset
44 | 
45 | * Day 1 - [Introduction to Shell: Dataset](https://www.dropbox.com/s/3lua2h1oo18gbug/unix_lesson.tar.gz?dl=1)
46 | * Days 2 and 3 - RNA-seq analysis (coming soon)
47 | 
48 | ***
49 | *These materials have been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
50 | 
51 | * *Some materials used in these lessons were derived from work that is Copyright © Data Carpentry (http://datacarpentry.org/). 
52 | All Data Carpentry instructional material is made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).*
53 | 


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
2 | title: Introduction to RNA-Seq using high-performance computing - ARCHIVED
3 | 


--------------------------------------------------------------------------------
/assets/css/style.scss:
--------------------------------------------------------------------------------
1 | ---
2 | ---
3 | 
4 | @import "{{ site.theme }}";
5 | 
6 | .page-header { color: #fff; text-align: center; background-image: url("../images/dna-sequence-1600x800.jpg"); }
7 | 
8 | .main-content h1, .main-content h2, .main-content h3, .main-content h4, .main-content h5, .main-content h6 { margin-top: 2rem; margin-bottom: 1rem; font-weight: normal; color: #000000; }
9 | 


--------------------------------------------------------------------------------
/assets/images/dna-sequence-1600x800.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/assets/images/dna-sequence-1600x800.jpg


--------------------------------------------------------------------------------
/fastqc/Icons/error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/fastqc/Icons/error.png


--------------------------------------------------------------------------------
/fastqc/Icons/fastqc_icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/fastqc/Icons/fastqc_icon.png


--------------------------------------------------------------------------------
/fastqc/Icons/tick.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/fastqc/Icons/tick.png


--------------------------------------------------------------------------------
/fastqc/Icons/warning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/fastqc/Icons/warning.png


--------------------------------------------------------------------------------
/fastqc/Images/duplication_levels.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/fastqc/Images/duplication_levels.png


--------------------------------------------------------------------------------
/fastqc/Images/kmer_profiles.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/fastqc/Images/kmer_profiles.png


--------------------------------------------------------------------------------
/fastqc/Images/per_base_gc_content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/fastqc/Images/per_base_gc_content.png


--------------------------------------------------------------------------------
/fastqc/Images/per_base_n_content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/fastqc/Images/per_base_n_content.png


--------------------------------------------------------------------------------
/fastqc/Images/per_base_quality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/fastqc/Images/per_base_quality.png


--------------------------------------------------------------------------------
/fastqc/Images/per_base_sequence_content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/fastqc/Images/per_base_sequence_content.png


--------------------------------------------------------------------------------
/fastqc/Images/per_sequence_gc_content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/fastqc/Images/per_sequence_gc_content.png


--------------------------------------------------------------------------------
/fastqc/Images/per_sequence_quality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/fastqc/Images/per_sequence_quality.png


--------------------------------------------------------------------------------
/fastqc/Images/sequence_length_distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/fastqc/Images/sequence_length_distribution.png


--------------------------------------------------------------------------------
/fastqc/Mov10oe_1-fastqc_report.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Strict//EN">
  2 | <html>
  3 | <head><title>Mov10_oe_1.fastq FastQC Report</title>
  4 | 
  5 | <style type="text/css">
  6 | 
  7 |  @media screen {
  8 |   div.summary {
  9 |     width: 18em;
 10 |     position:fixed;
 11 |     top: 3em;
 12 |     margin:1em 0 0 1em;
 13 |   }
 14 |   
 15 |   div.main {
 16 |     display:block;
 17 |     position:absolute;
 18 |     overflow:auto;
 19 |     height:auto;
 20 |     width:auto;
 21 |     top:4.5em;
 22 |     bottom:2.3em;
 23 |     left:18em;
 24 |     right:0;
 25 |     border-left: 1px solid #CCC;
 26 |     padding:0 0 0 1em;
 27 |     background-color: white;
 28 |     z-index:1;
 29 |   }
 30 |   
 31 |   div.header {
 32 |     background-color: #EEE;
 33 |     border:0;
 34 |     margin:0;
 35 |     padding: 0.5em;
 36 |     font-size: 200%;
 37 |     font-weight: bold;
 38 |     position:fixed;
 39 |     width:100%;
 40 |     top:0;
 41 |     left:0;
 42 |     z-index:2;
 43 |   }
 44 | 
 45 |   div.footer {
 46 |     background-color: #EEE;
 47 |     border:0;
 48 |     margin:0;
 49 | 	padding:0.5em;
 50 |     height: 1.3em;
 51 | 	overflow:hidden;
 52 |     font-size: 100%;
 53 |     font-weight: bold;
 54 |     position:fixed;
 55 |     bottom:0;
 56 |     width:100%;
 57 |     z-index:2;
 58 |   }
 59 |   
 60 |   img.indented {
 61 |     margin-left: 3em;
 62 |   }
 63 |  }
 64 |  
 65 |  @media print {
 66 | 	img {
 67 | 		max-width:100% !important;
 68 | 		page-break-inside: avoid;
 69 | 	}
 70 | 	h2, h3 {
 71 | 		page-break-after: avoid;
 72 | 	}
 73 | 	div.header {
 74 |       background-color: #FFF;
 75 |     }
 76 | 	
 77 |  }
 78 |  
 79 |  body {    
 80 |   font-family: sans-serif;   
 81 |   color: #000;   
 82 |   background-color: #FFF;
 83 |   border: 0;
 84 |   margin: 0;
 85 |   padding: 0;
 86 |   }
 87 |   
 88 |   div.header {
 89 |   border:0;
 90 |   margin:0;
 91 |   padding: 0.5em;
 92 |   font-size: 200%;
 93 |   font-weight: bold;
 94 |   width:100%;
 95 |   }    
 96 |   
 97 |   #header_title {
 98 |   display:inline-block;
 99 |   float:left;
100 |   clear:left;
101 |   }
102 |   #header_filename {
103 |   display:inline-block;
104 |   float:right;
105 |   clear:right;
106 |   font-size: 50%;
107 |   margin-right:2em;
108 |   text-align: right;
109 |   }
110 | 
111 |   div.header h3 {
112 |   font-size: 50%;
113 |   margin-bottom: 0;
114 |   }
115 |   
116 |   div.summary ul {
117 |   padding-left:0;
118 |   list-style-type:none;
119 |   }
120 |   
121 |   div.summary ul li img {
122 |   margin-bottom:-0.5em;
123 |   margin-top:0.5em;
124 |   }
125 | 	  
126 |   div.main {
127 |   background-color: white;
128 |   }
129 |       
130 |   div.module {
131 |   padding-bottom:1.5em;
132 |   padding-top:1.5em;
133 |   }
134 | 	  
135 |   div.footer {
136 |   background-color: #EEE;
137 |   border:0;
138 |   margin:0;
139 |   padding: 0.5em;
140 |   font-size: 100%;
141 |   font-weight: bold;
142 |   width:100%;
143 |   }
144 | 
145 | 
146 |   a {
147 |   color: #000080;
148 |   }
149 | 
150 |   a:hover {
151 |   color: #800000;
152 |   }
153 |       
154 |   h2 {
155 |   color: #800000;
156 |   padding-bottom: 0;
157 |   margin-bottom: 0;
158 |   clear:left;
159 |   }
160 | 
161 |   table { 
162 |   margin-left: 3em;
163 |   text-align: center;
164 |   }
165 |   
166 |   th { 
167 |   text-align: center;
168 |   background-color: #000080;
169 |   color: #FFF;
170 |   padding: 0.4em;
171 |   }      
172 |   
173 |   td { 
174 |   font-family: monospace; 
175 |   text-align: left;
176 |   background-color: #EEE;
177 |   color: #000;
178 |   padding: 0.4em;
179 |   }
180 | 
181 |   img {
182 |   padding-top: 0;
183 |   margin-top: 0;
184 |   border-top: 0;
185 |   }
186 | 
187 |   
188 |   p {
189 |   padding-top: 0;
190 |   margin-top: 0;
191 |   }
192 |   
193 | </style>
194 | 
195 | </head>
196 | <body>
197 | <div class="header">
198 | <div id="header_title"><img src="Icons/fastqc_icon.png" alt="FastQC">FastQC Report</div>
199 | <div id="header_filename">
200 | Wed 30 Sep 2015<br />
201 | Mov10_oe_1.fastq
202 | </div>
203 | </div>
204 | <div class="summary">
205 | <h2>Summary</h2>
206 | <ul>
207 | <li><img src="Icons/tick.png" alt="[PASS]"> <a href="#M0">Basic Statistics</a></li>
208 | <li><img src="Icons/tick.png" alt="[PASS]"> <a href="#M1">Per base sequence quality</a></li>
209 | <li><img src="Icons/tick.png" alt="[PASS]"> <a href="#M2">Per sequence quality scores</a></li>
210 | <li><img src="Icons/error.png" alt="[FAIL]"> <a href="#M3">Per base sequence content</a></li>
211 | <li><img src="Icons/error.png" alt="[FAIL]"> <a href="#M4">Per base GC content</a></li>
212 | <li><img src="Icons/tick.png" alt="[PASS]"> <a href="#M5">Per sequence GC content</a></li>
213 | <li><img src="Icons/tick.png" alt="[PASS]"> <a href="#M6">Per base N content</a></li>
214 | <li><img src="Icons/tick.png" alt="[PASS]"> <a href="#M7">Sequence Length Distribution</a></li>
215 | <li><img src="Icons/error.png" alt="[FAIL]"> <a href="#M8">Sequence Duplication Levels</a></li>
216 | <li><img src="Icons/tick.png" alt="[PASS]"> <a href="#M9">Overrepresented sequences</a></li>
217 | <li><img src="Icons/warning.png" alt="[WARNING]"> <a href="#M10">Kmer Content</a></li>
218 | </ul>
219 | </div>
220 | <div class="main">
221 | <div class="module"><h2 id="M0"><img src="Icons/tick.png" alt="[OK]"> Basic Statistics</h2>
222 | <table>
223 | <tr>
224 | <th>Measure</th>
225 | <th>Value</th>
226 | </tr>
227 | <tr>
228 | <td>Filename</td>
229 | <td>Mov10_oe_1.fastq</td>
230 | </tr>
231 | <tr>
232 | <td>File type</td>
233 | <td>Conventional base calls</td>
234 | </tr>
235 | <tr>
236 | <td>Encoding</td>
237 | <td>Sanger / Illumina 1.9</td>
238 | </tr>
239 | <tr>
240 | <td>Total Sequences</td>
241 | <td>39971841</td>
242 | </tr>
243 | <tr>
244 | <td>Filtered Sequences</td>
245 | <td>0</td>
246 | </tr>
247 | <tr>
248 | <td>Sequence length</td>
249 | <td>100</td>
250 | </tr>
251 | <tr>
252 | <td>%GC</td>
253 | <td>47</td>
254 | </tr>
255 | </table>
256 | </div>
257 | <div class="module"><h2 id="M1"><img src="Icons/tick.png" alt="[OK]"> Per base sequence quality</h2>
258 | <p><img class="indented" src="Images/per_base_quality.png" alt="Per base quality graph"></p>
259 | </div>
260 | <div class="module"><h2 id="M2"><img src="Icons/tick.png" alt="[OK]"> Per sequence quality scores</h2>
261 | <p><img class="indented" src="Images/per_sequence_quality.png" alt="Per Sequence quality graph"></p>
262 | </div>
263 | <div class="module"><h2 id="M3"><img src="Icons/error.png" alt="[FAIL]"> Per base sequence content</h2>
264 | <p><img class="indented" src="Images/per_base_sequence_content.png" alt="Per base sequence content"></p>
265 | </div>
266 | <div class="module"><h2 id="M4"><img src="Icons/error.png" alt="[FAIL]"> Per base GC content</h2>
267 | <p><img class="indented" src="Images/per_base_gc_content.png" alt="Per base GC content graph"></p>
268 | </div>
269 | <div class="module"><h2 id="M5"><img src="Icons/tick.png" alt="[OK]"> Per sequence GC content</h2>
270 | <p><img class="indented" src="Images/per_sequence_gc_content.png" alt="Per sequence GC content graph"></p>
271 | </div>
272 | <div class="module"><h2 id="M6"><img src="Icons/tick.png" alt="[OK]"> Per base N content</h2>
273 | <p><img class="indented" src="Images/per_base_n_content.png" alt="N content graph"></p>
274 | </div>
275 | <div class="module"><h2 id="M7"><img src="Icons/tick.png" alt="[OK]"> Sequence Length Distribution</h2>
276 | <p><img class="indented" src="Images/sequence_length_distribution.png" alt="Sequence length distribution"></p>
277 | </div>
278 | <div class="module"><h2 id="M8"><img src="Icons/error.png" alt="[FAIL]"> Sequence Duplication Levels</h2>
279 | <p><img class="indented" src="Images/duplication_levels.png" alt="Duplication level graph"></p>
280 | </div>
281 | <div class="module"><h2 id="M9"><img src="Icons/tick.png" alt="[OK]"> Overrepresented sequences</h2>
282 | <p>No overrepresented sequences</p>
283 | </div>
284 | <div class="module"><h2 id="M10"><img src="Icons/warning.png" alt="[WARN]"> Kmer Content</h2>
285 | <p><img class="indented" src="Images/kmer_profiles.png" alt="Kmer graph"></p>
286 | <table>
287 | <tr>
288 | <th>Sequence</th>
289 | <th>Count</th>
290 | <th>Obs/Exp Overall</th>
291 | <th>Obs/Exp Max</th>
292 | <th>Max Obs/Exp Position</th>
293 | </tr>
294 | <tr>
295 | <td>AAAAA</td>
296 | <td>16795015</td>
297 | <td>4.1748657</td>
298 | <td>6.059911</td>
299 | <td>2</td>
300 | </tr>
301 | <tr>
302 | <td>CTGGG</td>
303 | <td>8376590</td>
304 | <td>2.5479658</td>
305 | <td>6.4841547</td>
306 | <td>1</td>
307 | </tr>
308 | <tr>
309 | <td>TTCTT</td>
310 | <td>12161990</td>
311 | <td>2.543816</td>
312 | <td>5.0711346</td>
313 | <td>6</td>
314 | </tr>
315 | <tr>
316 | <td>TCTTC</td>
317 | <td>10938540</td>
318 | <td>2.529536</td>
319 | <td>5.185696</td>
320 | <td>7</td>
321 | </tr>
322 | <tr>
323 | <td>CTTCT</td>
324 | <td>10885845</td>
325 | <td>2.5173504</td>
326 | <td>5.0255194</td>
327 | <td>1</td>
328 | </tr>
329 | <tr>
330 | <td>CTCCA</td>
331 | <td>8804215</td>
332 | <td>2.377327</td>
333 | <td>8.132946</td>
334 | <td>1</td>
335 | </tr>
336 | <tr>
337 | <td>GGCAG</td>
338 | <td>7360785</td>
339 | <td>2.3646495</td>
340 | <td>9.048611</td>
341 | <td>1</td>
342 | </tr>
343 | <tr>
344 | <td>TCCAG</td>
345 | <td>8433860</td>
346 | <td>2.3336692</td>
347 | <td>5.7122536</td>
348 | <td>7</td>
349 | </tr>
350 | <tr>
351 | <td>CTCCT</td>
352 | <td>8862290</td>
353 | <td>2.2658334</td>
354 | <td>6.7507205</td>
355 | <td>1</td>
356 | </tr>
357 | <tr>
358 | <td>CAGGA</td>
359 | <td>7528755</td>
360 | <td>2.2545867</td>
361 | <td>5.9135337</td>
362 | <td>1</td>
363 | </tr>
364 | <tr>
365 | <td>CTTCA</td>
366 | <td>9153500</td>
367 | <td>2.235553</td>
368 | <td>6.1804776</td>
369 | <td>1</td>
370 | </tr>
371 | <tr>
372 | <td>CCCAG</td>
373 | <td>7216920</td>
374 | <td>2.207828</td>
375 | <td>6.119858</td>
376 | <td>1</td>
377 | </tr>
378 | <tr>
379 | <td>GCCAG</td>
380 | <td>6455370</td>
381 | <td>2.023714</td>
382 | <td>6.2788043</td>
383 | <td>1</td>
384 | </tr>
385 | <tr>
386 | <td>CTGCA</td>
387 | <td>7241750</td>
388 | <td>2.0038092</td>
389 | <td>5.1436768</td>
390 | <td>1</td>
391 | </tr>
392 | <tr>
393 | <td>CTTGG</td>
394 | <td>6897085</td>
395 | <td>1.8517264</td>
396 | <td>5.505673</td>
397 | <td>1</td>
398 | </tr>
399 | <tr>
400 | <td>CTGGA</td>
401 | <td>6511845</td>
402 | <td>1.8464246</td>
403 | <td>6.7122235</td>
404 | <td>1</td>
405 | </tr>
406 | <tr>
407 | <td>CTCAG</td>
408 | <td>6449570</td>
409 | <td>1.7846115</td>
410 | <td>7.22948</td>
411 | <td>1</td>
412 | </tr>
413 | <tr>
414 | <td>CTTTT</td>
415 | <td>8479045</td>
416 | <td>1.7734871</td>
417 | <td>5.9101095</td>
418 | <td>1</td>
419 | </tr>
420 | <tr>
421 | <td>TTTCA</td>
422 | <td>7934210</td>
423 | <td>1.7526736</td>
424 | <td>5.187293</td>
425 | <td>6</td>
426 | </tr>
427 | <tr>
428 | <td>TTCAG</td>
429 | <td>6830700</td>
430 | <td>1.7095337</td>
431 | <td>5.021524</td>
432 | <td>7</td>
433 | </tr>
434 | <tr>
435 | <td>CTTGA</td>
436 | <td>5609765</td>
437 | <td>1.4039677</td>
438 | <td>5.2207584</td>
439 | <td>1</td>
440 | </tr>
441 | <tr>
442 | <td>CTCAT</td>
443 | <td>4925100</td>
444 | <td>1.2028538</td>
445 | <td>5.1273108</td>
446 | <td>1</td>
447 | </tr>
448 | <tr>
449 | <td>CTCAA</td>
450 | <td>4489260</td>
451 | <td>1.1579475</td>
452 | <td>5.334822</td>
453 | <td>1</td>
454 | </tr>
455 | </table>
456 | </div>
457 | </div><div class="footer">Produced by <a href="http://www.bioinformatics.babraham.ac.uk/projects/fastqc/">FastQC</a> (version 0.10.1)</div>
458 | </body></html>


--------------------------------------------------------------------------------
/img/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/.DS_Store


--------------------------------------------------------------------------------
/img/DNA_to_protein_or_ncRNA.svg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/DNA_to_protein_or_ncRNA.svg.png


--------------------------------------------------------------------------------
/img/FastQC_contam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/FastQC_contam.png


--------------------------------------------------------------------------------
/img/FastQC_seq_qual.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/FastQC_seq_qual.png


--------------------------------------------------------------------------------
/img/Filezilla_step1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/Filezilla_step1.png


--------------------------------------------------------------------------------
/img/Filezilla_step2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/Filezilla_step2.png


--------------------------------------------------------------------------------
/img/Gene_products.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/Gene_products.png


--------------------------------------------------------------------------------
/img/Gene_structure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/Gene_structure.png


--------------------------------------------------------------------------------
/img/IGV_mov10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/IGV_mov10.png


--------------------------------------------------------------------------------
/img/README.md:
--------------------------------------------------------------------------------
1 | ###All images for Session II of NGS Data Analysis Course
2 | 


--------------------------------------------------------------------------------
/img/RNA-seq_library_prep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/RNA-seq_library_prep.png


--------------------------------------------------------------------------------
/img/RNAseqWorkflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/RNAseqWorkflow.png


--------------------------------------------------------------------------------
/img/R_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/R_screenshot.png


--------------------------------------------------------------------------------
/img/R_screenshot2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/R_screenshot2.png


--------------------------------------------------------------------------------
/img/Rstudio_interface.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/Rstudio_interface.png


--------------------------------------------------------------------------------
/img/SAM_file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/SAM_file.png


--------------------------------------------------------------------------------
/img/Slide1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/Slide1.jpg


--------------------------------------------------------------------------------
/img/alignment_STAR_step1.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/alignment_STAR_step1.ai


--------------------------------------------------------------------------------
/img/alignment_STAR_step1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/alignment_STAR_step1.png


--------------------------------------------------------------------------------
/img/alignment_STAR_step2.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/alignment_STAR_step2.ai


--------------------------------------------------------------------------------
/img/alignment_STAR_step2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/alignment_STAR_step2.png


--------------------------------------------------------------------------------
/img/alignment_STAR_step3.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/alignment_STAR_step3.ai


--------------------------------------------------------------------------------
/img/alignment_STAR_step3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/alignment_STAR_step3.png


--------------------------------------------------------------------------------
/img/alignment_STAR_step4.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/alignment_STAR_step4.ai


--------------------------------------------------------------------------------
/img/alignment_STAR_step4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/alignment_STAR_step4.png


--------------------------------------------------------------------------------
/img/alignment_STAR_step5.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/alignment_STAR_step5.ai


--------------------------------------------------------------------------------
/img/alignment_STAR_step5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/alignment_STAR_step5.png


--------------------------------------------------------------------------------
/img/alignmentfree_workflow_aug2017.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/alignmentfree_workflow_aug2017.png


--------------------------------------------------------------------------------
/img/alignmentfree_workflow_june2017.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/alignmentfree_workflow_june2017.png


--------------------------------------------------------------------------------
/img/bad_quality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/bad_quality.png


--------------------------------------------------------------------------------
/img/batch_effect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/batch_effect.png


--------------------------------------------------------------------------------
/img/batch_effect_pca.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/batch_effect_pca.png


--------------------------------------------------------------------------------
/img/bioconductor_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/bioconductor_logo.png


--------------------------------------------------------------------------------
/img/bitwiseflags.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/bitwiseflags.png


--------------------------------------------------------------------------------
/img/cigar_strings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/cigar_strings.png


--------------------------------------------------------------------------------
/img/complete_wd_setup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/complete_wd_setup.png


--------------------------------------------------------------------------------
/img/confounded_batch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/confounded_batch.png


--------------------------------------------------------------------------------
/img/confounded_design.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/confounded_design.png


--------------------------------------------------------------------------------
/img/console.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/console.png


--------------------------------------------------------------------------------
/img/corr_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/corr_map.png


--------------------------------------------------------------------------------
/img/count-fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/count-fig1.png


--------------------------------------------------------------------------------
/img/count-fig2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/count-fig2.png


--------------------------------------------------------------------------------
/img/count-matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/count-matrix.png


--------------------------------------------------------------------------------
/img/count_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/count_matrix.png


--------------------------------------------------------------------------------
/img/counts-workflow.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/counts-workflow.jpg


--------------------------------------------------------------------------------
/img/counts-workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/counts-workflow.png


--------------------------------------------------------------------------------
/img/counts_view.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/counts_view.png


--------------------------------------------------------------------------------
/img/cran_packages.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/cran_packages.png


--------------------------------------------------------------------------------
/img/data_life_cycle_gouldv2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/data_life_cycle_gouldv2.png


--------------------------------------------------------------------------------
/img/de_norm_counts_var.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/de_norm_counts_var.png


--------------------------------------------------------------------------------
/img/de_replicates_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/de_replicates_img.png


--------------------------------------------------------------------------------
/img/de_variation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/de_variation.png


--------------------------------------------------------------------------------
/img/demultiplexing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/demultiplexing.png


--------------------------------------------------------------------------------
/img/drawings.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/drawings.pptx


--------------------------------------------------------------------------------
/img/environment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/environment.png


--------------------------------------------------------------------------------
/img/exp_design.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/exp_design.png


--------------------------------------------------------------------------------
/img/factors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/factors.png


--------------------------------------------------------------------------------
/img/factors_both.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/factors_both.png


--------------------------------------------------------------------------------
/img/factors_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/factors_new.png


--------------------------------------------------------------------------------
/img/factors_sm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/factors_sm.png


--------------------------------------------------------------------------------
/img/factors_sm_intact.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/factors_sm_intact.png


--------------------------------------------------------------------------------
/img/fastqc_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/fastqc_results.png


--------------------------------------------------------------------------------
/img/feature-overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/feature-overlap.png


--------------------------------------------------------------------------------
/img/flow_cells.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/flow_cells.png


--------------------------------------------------------------------------------
/img/gProfiler.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/gProfiler.png


--------------------------------------------------------------------------------
/img/gene_expression2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/gene_expression2.png


--------------------------------------------------------------------------------
/img/gene_expression_cells.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/gene_expression_cells.png


--------------------------------------------------------------------------------
/img/genemania.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/genemania.png


--------------------------------------------------------------------------------
/img/getwd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/getwd.png


--------------------------------------------------------------------------------
/img/good_quality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/good_quality.png


--------------------------------------------------------------------------------
/img/gvng.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/gvng.jpg


--------------------------------------------------------------------------------
/img/igv_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/igv_screenshot.png


--------------------------------------------------------------------------------
/img/illumina_platforms.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/illumina_platforms.png


--------------------------------------------------------------------------------
/img/illumina_sequencing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/illumina_sequencing.png


--------------------------------------------------------------------------------
/img/long_read_tech.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/long_read_tech.png


--------------------------------------------------------------------------------
/img/metadata_batch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/metadata_batch.png


--------------------------------------------------------------------------------
/img/non_confounded_design.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/non_confounded_design.png


--------------------------------------------------------------------------------
/img/paired-end_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/paired-end_data.png


--------------------------------------------------------------------------------
/img/paired_end_reads.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/paired_end_reads.png


--------------------------------------------------------------------------------
/img/pca_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/pca_plot.png


--------------------------------------------------------------------------------
/img/permission-directory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/permission-directory.png


--------------------------------------------------------------------------------
/img/placeholder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/placeholder.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison-cufflinks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/pseudo_count_comparison-cufflinks.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison-sailfish.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/pseudo_count_comparison-sailfish.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison-sailfish_sm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/pseudo_count_comparison-sailfish_sm.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison-star.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/pseudo_count_comparison-star.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison-star_sm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/pseudo_count_comparison-star_sm.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/pseudo_count_comparison.gif


--------------------------------------------------------------------------------
/img/pseudo_count_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/pseudo_count_comparison.png


--------------------------------------------------------------------------------
/img/r_starting_how_it_should_like.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/r_starting_how_it_should_like.png


--------------------------------------------------------------------------------
/img/replicates.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/replicates.png


--------------------------------------------------------------------------------
/img/rnaseq_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/rnaseq_workflow.png


--------------------------------------------------------------------------------
/img/rnaseq_workflow_FASTQC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/rnaseq_workflow_FASTQC.png


--------------------------------------------------------------------------------
/img/rnaseq_workflow_trimming.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/rnaseq_workflow_trimming.png


--------------------------------------------------------------------------------
/img/rrna.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/rrna.png


--------------------------------------------------------------------------------
/img/rstudio_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/rstudio_logo.png


--------------------------------------------------------------------------------
/img/salmon_quasialignment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/salmon_quasialignment.png


--------------------------------------------------------------------------------
/img/salmon_rstudio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/salmon_rstudio.png


--------------------------------------------------------------------------------
/img/salmon_workflow_updated.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/salmon_workflow_updated.png


--------------------------------------------------------------------------------
/img/sam_bam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/sam_bam.png


--------------------------------------------------------------------------------
/img/sam_bam2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/sam_bam2.png


--------------------------------------------------------------------------------
/img/sam_bam3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/sam_bam3.png


--------------------------------------------------------------------------------
/img/star.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/star.png


--------------------------------------------------------------------------------
/img/teachin-team.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/teachin-team.png


--------------------------------------------------------------------------------
/img/union.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/union.png


--------------------------------------------------------------------------------
/img/vim_insert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/vim_insert.png


--------------------------------------------------------------------------------
/img/vim_postsave.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/vim_postsave.png


--------------------------------------------------------------------------------
/img/vim_quit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/vim_quit.png


--------------------------------------------------------------------------------
/img/vim_save.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/vim_save.png


--------------------------------------------------------------------------------
/img/vim_spider.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/vim_spider.png


--------------------------------------------------------------------------------
/img/vim_spider_number.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/vim_spider_number.png


--------------------------------------------------------------------------------
/img/why_R.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/why_R.png


--------------------------------------------------------------------------------
/img/workflow_alignment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/workflow_alignment.png


--------------------------------------------------------------------------------
/img/wrap_option.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/img/wrap_option.png


--------------------------------------------------------------------------------
/lectures/2_day/HPC_intro_O2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/2_day/HPC_intro_O2.pdf


--------------------------------------------------------------------------------
/lectures/2_day/Intro_to_workshop.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/2_day/Intro_to_workshop.pdf


--------------------------------------------------------------------------------
/lectures/2_day/RNAseq-analysis-methods.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/2_day/RNAseq-analysis-methods.pdf


--------------------------------------------------------------------------------
/lectures/2_day/Wrap_up.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/2_day/Wrap_up.pdf


--------------------------------------------------------------------------------
/lectures/2_day/rna-seq_design.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/2_day/rna-seq_design.pdf


--------------------------------------------------------------------------------
/lectures/Accessing_genomics_dataonline.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/Accessing_genomics_dataonline.pdf


--------------------------------------------------------------------------------
/lectures/HPC_intro_O2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/HPC_intro_O2.pdf


--------------------------------------------------------------------------------
/lectures/Intro_to_workshop.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/Intro_to_workshop.pdf


--------------------------------------------------------------------------------
/lectures/NGS_workflows.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/NGS_workflows.pdf


--------------------------------------------------------------------------------
/lectures/RNA-seq_troubleshooting.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/RNA-seq_troubleshooting.pdf


--------------------------------------------------------------------------------
/lectures/RNAseq-analysis-methods.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/RNAseq-analysis-methods.pdf


--------------------------------------------------------------------------------
/lectures/Sequence_alignment.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/Sequence_alignment.pdf


--------------------------------------------------------------------------------
/lectures/error_profiles_mm.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/error_profiles_mm.pdf


--------------------------------------------------------------------------------
/lectures/other rnaseq applications.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/other rnaseq applications.pdf


--------------------------------------------------------------------------------
/lectures/rna-seq_design.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/rna-seq_design.pdf


--------------------------------------------------------------------------------
/lectures/sequencing_technologies_mm.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lectures/sequencing_technologies_mm.zip


--------------------------------------------------------------------------------
/lessons/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/lessons/.DS_Store


--------------------------------------------------------------------------------
/lessons/01_data_organization.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Data Management and Project Organization
  3 | author: Mary Piper, Meeta Mistry, Michael Steinbaugh, Radhika Khetani
  4 | date: Wednesday, September 20, 2017
  5 | duration: 35
  6 | ---
  7 | 
  8 | [Data management slides]: https://github.com/hbc/NGS-Data-Analysis-long-course/raw/Fall_2016/sessionI/slides/data_management.pdf
  9 | [SRA]: http://www.ncbi.nlm.nih.gov/sra  "Sequence Read Archive"
 10 | 
 11 | ## Learning Objectives
 12 | 
 13 | - Recognize the need for data management.
 14 | - Plan a good genomics experiment and getting started with project organization.
 15 | - Explain the RNA-seq experiment and its objectives.
 16 | - Define "metadata" and describe it for the example experiment.
 17 | 
 18 | ## Data Management
 19 | 
 20 | One of the most important parts of research that involves large amounts of data, is how best to manage it. We tend to prioritize the analysis, but there are many other important aspects that are  often overlooked in the excitement to get a first look at new data. 
 21 | 
 22 | The data management lifecycle displayed below, courtesy of the [HMS Data Management Working Group](https://datamanagement.hms.harvard.edu/hms-data-management-working-group), illustrates some things to consider beyond the data creation and analysis components:
 23 | 
 24 | <img src="../img/data_life_cycle_gouldv2.png" width="350">
 25 | 
 26 | _Image aquired from the [Harvard Biomedical Data Management Website](https://datamanagement.hms.harvard.edu/hms-data-lifecycle)_
 27 | 
 28 | We will cover some parts of this lifecycle by talking about best practices for the **Research** half of the above lifecycle. Later in this workshop we will talk a little more about the data storage. For more information about the full lifecycle and more guidelines for data management, please look at the resources linked below.
 29 | 
 30 | **Resources**
 31 | 
 32 | * The [HMS Data Management Working Group's website](https://datamanagement.hms.harvard.edu/)
 33 | * A guide from the [Harvard library](http://guides.library.harvard.edu/dmp).
 34 | 
 35 | ### Planning
 36 | 
 37 | You should approach your sequencing project in a very similar way to how you do a biological experiment, and ideally, begins with **experimental design**. We're going to assume that you've already designed a beautiful sequencing experiment to address your biological question, collected appropriate samples, and that you have enough statistical power.
 38 | 
 39 | During this stage it is important to keep track of how the experiment was performed and clearly tracking the source of starting materials and kits used. It is also best practice to include information about any small variations within the experiment or variation relative to standard experiments. 
 40 | 
 41 | ### Organization
 42 | 
 43 | Every computational analysis you do is going to spawn many files, and inevitability you'll want to run some of those analyses again. For each experiment you work on and analyze data for, it is considered best practice to get organized by creating a planned storage space (directory structure).
 44 | 
 45 | We will start by creating a directory that we can use for the rest of the RNA-seq session.
 46 | 
 47 | First, make sure that you are in your home directory.
 48 | 
 49 | ```bash
 50 | $ cd
 51 | $ pwd
 52 | ```
 53 | 
 54 | This should return `/home/username`.
 55 | 
 56 | We will change into the `unix_lesson` directory:
 57 | 
 58 | ```bash
 59 | cd unix_lesson
 60 | ```
 61 | 
 62 | Next, we will create a project directory and set up the following structure within it to keep files organized. 
 63 | 
 64 | ```bash
 65 | rnaseq
 66 |   ├── logs
 67 |   ├── meta
 68 |   ├── raw_data  
 69 |   ├── results
 70 |   └── scripts
 71 | ```
 72 | 
 73 | *This is a generic structure and can be tweaked based on personal preference and analysis workflow.*
 74 | 
 75 | - `logs`: to keep track of the commands run and the specific parameters used, but also to have a record of any standard output that is generated while running the command. 
 76 | - `meta`: for any information that describes the samples you are using, which we refer to as [metadata](https://datamanagement.hms.harvard.edu/metadata-overview). We will discuss this in more detail as it pertains to our example dataset, later in this lesson.
 77 | - `raw_data`: for any **unmodified** (raw) data obtained prior to computational analysis here, e.g. FASTQ files from the sequencing center. We strongly recommend leaving this directory unmodified through the analysis.
 78 | - `results`: for output from the different tools you implement in your workflow. Create sub-folders specific to each tool/step of the workflow within this folder. 
 79 | - `scripts`: for scripts that you write and use to run analyses/workflow.
 80 | 
 81 | Here, you can use the parents flag (`-p` or `--parents`) with `mkdir` to complete the file path by creating any parent directories that do not exist. In our case, we have not yet created the `rnaseq` directory and so since it does not exist it will be created. This flag can be very useful when scripting workflows. 
 82 | 
 83 | 
 84 | ```bash
 85 | 
 86 | $ mkdir -p rnaseq/logs rnaseq/meta rnaseq/raw_data rnaseq/results rnaseq/scripts
 87 | ``` 
 88 | 
 89 | Verify that the project directory and subdirectories now exist.
 90 | 
 91 | ```bash
 92 | 
 93 | $ cd rnaseq
 94 | $ ls -l
 95 | 
 96 | ```
 97 | 
 98 | Let's populate the `rnaseq/` project with our example RNA-seq FASTQ data.
 99 | 
100 | The FASTQ files are located inside `~/unix_lesson/raw_fastq/`, and we need to copy them to `raw_data/`. We can match them by file extension with `*.fq`.
101 | 
102 | ```bash
103 | $ cp ~/unix_lesson/raw_fastq/*.fq raw_data/
104 | ```
105 | 
106 | Perfect, now the structure of `rnaseq/` should look like this:
107 | 
108 | ```bash
109 | rnaseq
110 |   ├── logs
111 |   ├── meta
112 |   ├── raw_data
113 |   │   ├── Irrel_kd_1.subset.fq
114 |   │   ├── Irrel_kd_2.subset.fq
115 |   │   ├── Irrel_kd_3.subset.fq
116 |   │   ├── Mov10_oe_1.subset.fq
117 |   │   ├── Mov10_oe_2.subset.fq
118 |   │   └── Mov10_oe_3.subset.fq
119 |   ├── results
120 |   └── scripts
121 | ```
122 | 
123 | > #### File naming conventions
124 | > 
125 | > Another aspect of staying organized is making sure that all the filenames in an analysis are as consistent as possible, and are not things like `alignment1.bam`, but more like `20170823_kd_rep1_gmap-1.4.bam`. [This link](https://datamanagement.hms.harvard.edu/file-naming-conventions) and [this slideshow](http://www2.stat.duke.edu/~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf) have some good guidelines for file naming dos and don'ts.
126 | 
127 | 
128 | ### Documentation
129 | 
130 | **Documentation doesn't stop at the sequencer!** Keeping notes on what happened in what order, and what was done, is essential for reproducible research.
131 | 
132 | #### Log files
133 | 
134 | In your lab notebook, you likely keep track of the different reagents and kits used for a specific protocol. Similarly, recording information about the tools and parameters is important for documenting your computational experiments. 
135 | 
136 | - **Make note of the software you use.** Do your research and find out what tools are best for the data you are working with. Don't just work with tools that you are able to easily install.
137 | - **Keep track of software versions.** Keep up with the literature and make sure you are using the most up-to-date versions.
138 | - **Record information on parameters used and summary statistics** at every step (e.g., how many adapters were removed, how many reads did not align)
139 |     - A general rule of thumb is to test on a single sample or a subset of the data before running your entire dataset through. This will allow you to debug quicker and give you a chance to also get a feel for the tool and the different parameters.
140 |     - Different tools have different ways of reporting log messages and you might have to experiment a bit to figure out what output to capture. You can redirect standard output with the `>` symbol which is equivalent to `1> (standard out)`; other tools might require you to use `2>` to re-direct the `standard error` instead.
141 |  
142 | #### README files
143 | 
144 | After setting up the directory structure and when the analysis is running it is useful to have a **[README file](https://datamanagement.hms.harvard.edu/readme-files) within your project directory**. This file will usually contain a quick one line summary about the project and any other lines that follow will describe the files/directories found within it. An example README is shown below. Within each sub-directory you can also include README files to describe the analysis and the files that were generated.
145 | 
146 | ```
147 | ## README ##
148 | ## This directory contains data generated during the Intro to RNA-seq course
149 | ## Date: 
150 | 
151 | There are six subdirectories in this directory:
152 | 
153 | raw_data : contains raw data
154 | meta:  contains...
155 | logs:
156 | results:
157 | scripts:
158 | ```
159 | 
160 | *** 
161 | 
162 | ### Homework exercise
163 | 
164 | - Take a moment to create a README for the `rnaseq/` folder (hint: use `vim` to create the file). Give a short description of the project and brief descriptions of the types of files you will be storing within each of the sub-directories. 
165 | 
166 | ***
167 | 
168 | ## Exploring the example dataset
169 | 
170 | The dataset we are using is part of a larger study described in [Kenny PJ et al., *Cell Rep* 2014](http://www.ncbi.nlm.nih.gov/pubmed/25464849). The authors are investigating interactions between various genes involved in Fragile X syndrome, a disease of aberrant protein production, which results in cognitive impairment and autistic-like features. **The authors sought to show that RNA helicase MOV10 regulates the translation of RNAs involved in Fragile X syndrome.**
171 | 
172 | ### Raw data
173 | 
174 | From this study we are using the [RNA-seq](http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE50499) data which is publicly available in the [Sequence Read Archive (SRA)](https://www.ncbi.nlm.nih.gov/sra/?term=SRP029367). 
175 | 
176 | ### Metadata
177 | 
178 | In addition to the raw sequence data we also need to collect **information about the data**, also known as **metadata**.  We are usually quick to want to begin analysis of the sequence data (FASTQ files), but how useful is it if we know nothing about the samples that this sequence data originated from? Some relevant metadata for our dataset is provided below:
179 | 
180 | * The RNA was extracted from **HEK293F cells** that were transfected with a **MOV10 transgene**, **MOV10 siRNA**, or an **irrelevant siRNA**.  (*For this workshop we won't be using the MOV10 knock down samples.*)
181 | * The libraries for this dataset are **stranded** and were generated using the standard Tru-seq prep kit (using the dUTP method). 
182 | * Sequencing was carried out on the **Illumina HiSeq-2500** and **100bp single end** reads were generated. 
183 | * The full dataset was sequenced to **~40 million reads** per sample, but for this workshop we will be looking at a small subset on chr1 (~300,000 reads/sample).
184 | * For each group we have three replicates as described in the figure below.
185 | 
186 | ![Automation](../img/exp_design.png)
187 | 
188 | 
189 | 
190 | ---
191 | 
192 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
193 | 
194 | * *The materials used in this lesson were derived from work that is Copyright © Data Carpentry (http://datacarpentry.org/). 
195 | All Data Carpentry instructional material is made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).*
196 | * *Adapted from the lesson by Tracy Teal. Original contributors: Paul Wilson, Milad Fatenejad, Sasha Wood and Radhika Khetani for Software Carpentry (http://software-carpentry.org/)*
197 | 
198 | 


--------------------------------------------------------------------------------
/lessons/02_assessing_quality.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Quality control using FASTQC"
  3 | author: "Mary Piper, Radhika Khetani"
  4 | date: Wednesday, September 20, 2017
  5 | duration: 85 minutes
  6 | ---
  7 | 
  8 | ## Learning Objectives:
  9 | 
 10 | * Evaluate the quality of your NGS data using FastQC
 11 | * Create and run a job submission script to automate quality assessment
 12 | 
 13 | ## Quality Control of FASTQ files
 14 | 
 15 | <img src="../img/RNAseqWorkflow.png" width="400">
 16 | 
 17 | The first step in the RNA-Seq workflow is to take the FASTQ files received from the sequencing facility and assess the quality of the sequence reads. 
 18 | 
 19 | ### Unmapped read data (FASTQ)
 20 | 
 21 | The [FASTQ](https://en.wikipedia.org/wiki/FASTQ_format) file format is the defacto file format for sequence reads generated from next-generation sequencing technologies. This file format evolved from FASTA in that it contains sequence data, but also contains quality information. Similar to FASTA, the FASTQ file begins with a header line. The difference is that the FASTQ header is denoted by a `@` character. For a single record (sequence read) there are four lines, each of which are described below:
 22 | 
 23 | |Line|Description|
 24 | |----|-----------|
 25 | |1|Always begins with '@' and then information about the read|
 26 | |2|The actual DNA sequence|
 27 | |3|Always begins with a '+' and sometimes the same info in line 1|
 28 | |4|Has a string of characters which represent the quality scores; must have same number of characters as line 2|
 29 | 
 30 | Let's use the following read as an example:
 31 | 
 32 | ```
 33 | @HWI-ST330:304:H045HADXX:1:1101:1111:61397
 34 | CACTTGTAAGGGCAGGCCCCCTTCACCCTCCCGCTCCTGGGGGANNNNNNNNNNANNNCGAGGCCCTGGGGTAGAGGGNNNNNNNNNNNNNNGATCTTGG
 35 | +
 36 | @?@DDDDDDHHH?GH:?FCBGGB@C?DBEGIIIIAEF;FCGGI#########################################################
 37 | ```
 38 | 
 39 | As mentioned previously, line 4 has characters encoding the quality of each nucleotide in the read. The legend below provides the mapping of quality scores (Phred-33) to the quality encoding characters. *Different quality encoding scales exist (differing by offset in the ASCII table), but note the most commonly used one is fastqsanger.*
 40 | 
 41 |  ```
 42 |  Quality encoding: !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
 43 |                    |         |         |         |         |
 44 |     Quality score: 0........10........20........30........40                                
 45 | ```
 46 |  
 47 | Using the quality encoding character legend, the first nucelotide in the read (C) is called with a quality score of 31 and our Ns are called with a score of 2. **As you can tell by now, this is a bad read.** 
 48 | 
 49 | Each quality score represents the probability that the corresponding nucleotide call is incorrect. This quality score is logarithmically based and is calculated as:
 50 | 
 51 | 	Q = -10 x log10(P), where P is the probability that a base call is erroneous
 52 | 
 53 | These probabaility values are the results from the base calling algorithm and dependent on how much signal was captured for the base incorporation. The score values can be interpreted as follows:
 54 | 
 55 | |Phred Quality Score |Probability of incorrect base call |Base call accuracy|
 56 | |:-------------------|:---------------------------------:|-----------------:|
 57 | |10	|1 in 10 |	90%|
 58 | |20	|1 in 100|	99%|
 59 | |30	|1 in 1000|	99.9%|
 60 | |40	|1 in 10,000|	99.99%|
 61 | 
 62 | Therefore, for the first nucleotide in the read (C), there is less than a 1 in 1000 chance that the base was called incorrectly. Whereas, for the the end of the read there is greater than 50% probabaility that the base is called incorrectly.
 63 | 
 64 | ## Assessing quality with FastQC
 65 | 
 66 | Now we understand what information is stored in a FASTQ file, the next step is to examine quality metrics for our data.
 67 | 
 68 | [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) provides a simple way to do some quality control checks on raw sequence data coming from high throughput sequencing pipelines. It provides a modular set of analyses which you can use to give a quick impression of whether your data has any problems of which you should be aware before doing any further analysis.
 69 | 
 70 | The main functions of FastQC are:
 71 | 
 72 | * Import of data from BAM, SAM or FastQ files (any variant)
 73 | * Providing a quick overview to tell you in which areas there may be problems
 74 | * Summary graphs and tables to quickly assess your data
 75 | * Export of results to an HTML based permanent report
 76 | * Offline operation to allow automated generation of reports without running the interactive application
 77 | 
 78 | ### Run FastQC  
 79 | 
 80 | Before we run FastQC, let's start an interactive session on the cluster (if you don't already have one going):
 81 | 
 82 | ```bash
 83 | $ srun --pty -p short -t 0-12:00 --mem 8G --reservation=HBC /bin/bash
 84 | ```
 85 | 
 86 | ***An interactive session is very useful to test tools, workflows, run jobs that open new interactive windows (X11-forwarding) and so on.***
 87 | 
 88 | Once your interactive job starts, notice that the command prompt has changed; this is because we are working on a compute node now, not on a login node. Change directories to `raw_data`.
 89 | 
 90 | ```bash
 91 | $ cd ~/unix_lesson/rnaseq/raw_data
 92 | ```  
 93 | 
 94 | Before we start using software, we have to load the environments for each software package. On the O2 cluster, this is done using an **LMOD** system. 
 95 | 
 96 | If we check which modules we currently have loaded, we should not see FastQC.
 97 | 
 98 | ```bash
 99 | $ module list
100 | ```
101 | 
102 | This is because the FastQC program is not in our $PATH (i.e. its not in a directory that unix will automatically check to run commands/programs).
103 | 
104 | ```bash
105 | $ echo $PATH
106 | ```
107 | 
108 | To run the FastQC program, we first need to load the appropriate module, so it puts the program into our path. To find the FastQC module to load we need to search the versions available:
109 | 
110 | ```bash
111 | $ module spider
112 | ```
113 | 
114 | Then we can load the FastQC module:
115 | 
116 | ```bash
117 | $ module load fastqc/0.11.3
118 | ```
119 | 
120 | Once a module for a tool is loaded, you have essentially made it directly available to you like any other basic UNIX command.
121 | 
122 | ```bash
123 | $ module list
124 | 
125 | $ echo $PATH
126 | ```
127 | 
128 | FastQC will accept multiple file names as input, so we can use the `*.fq` wildcard.
129 | 
130 | ```bash
131 | $ fastqc *.fq
132 | ```
133 | 
134 | *Did you notice how each file was processed serially? How do we speed this up?*
135 | 
136 | Exit the interactive session and start a new one with 6 cores, and use the multi-threading functionality of FastQC to run 6 jobs at once.
137 | 
138 | ```bash
139 | $ exit  #exit the current interactive session
140 | 
141 | $ srun --pty -c 6 -p short -t 0-12:00 --mem 8G --reservation=HBC /bin/bash  #start a new one with 6 cpus (-n 6) and 8G RAM (--mem 8G)
142 | 
143 | $ module load fastqc/0.11.3  #reload the module for the new session
144 | 
145 | $ cd ~/unix_lesson/rnaseq/raw_data
146 | 
147 | $ fastqc -t 6 *.fq  #note the extra parameter we specified for 6 threads
148 | ```
149 | 
150 | How did I know about the -t argument for FastQC?
151 | 
152 | ```bash
153 | $ fastqc --help
154 | ```
155 | 
156 | 
157 | Now, let's create a home for our results
158 | 
159 | ```bash
160 | $ mkdir ~/unix_lesson/rnaseq/results/fastqc
161 | ```
162 | 
163 | ...and move them there (recall, we are still in `~/unix_lesson/rnaseq/raw_data/`)
164 | 
165 | ```bash
166 | $ mv *fastqc* ~/unix_lesson/rnaseq/results/fastqc/
167 | ```
168 | 
169 | ### Performing quality assessment using job submission scripts
170 | So far in our FASTQC analysis, we have been directly submitting commands to O2 using an interactive session (ie. `srun --pty -n 6 -p short -t 0-12:00 --mem 8G bash`). However, there are many more partitions available on O2 than just the interactive partition. We can submit commands or series of commands to these partitions using job submission scripts. 
171 | 
172 | **Job submission scripts** for O2 are just regular scripts, but contain the O2 **options/directives** for job submission, such as *number of cores, name of partition, runtime limit, etc*. We can submit these scripts to whichever partition we specify in the script using the `sbatch` command as follows:
173 | 
174 | ```bash
175 | # DO NOT RUN THIS
176 | $ sbatch job_submission_script.run
177 | ```
178 | 
179 | Submission of the script using the `sbatch` command allows SLURM to run your job when its your turn. Let's create a job submission script to load the FASTQC module, run FASTQC on all of our fastq files, and move the files to the appropriate directory.
180 | 
181 | Change directories to `~/unix_lesson/rnaseq/scripts`, and create a script named `mov10_fastqc.run` using `vim`.
182 | 
183 | ```bash
184 | $ cd ~/unix_lesson/rnaseq/scripts
185 | 
186 | $ vim mov10_fastqc.run
187 | ```
188 | 
189 | The first thing we need in our script is the **shebang line**:
190 | 
191 | ```bash
192 | #!/bin/bash
193 | ```
194 | 
195 | Following the shebang line are the O2 options. For the script to run, we need to include options for **queue/partition (-p) and runtime limit (-t)**. To specify our options, we precede the option with `#SBATCH`, which tells O2 that the line contains options for job submission to SLURM. 
196 | 
197 | ```bash
198 | #SBATCH -p short 		# partition name
199 | #SBATCH -t 0-2:00 		# hours:minutes runlimit after which job will be killed
200 | #SBATCH -c 6 		# number of cores requested -- this needs to be greater than or equal to the number of cores you plan to use to run your job
201 | #SBATCH --job-name rnaseq_mov10_fastqc 		# Job name
202 | #SBATCH -o %j.out			# File to which standard out will be written
203 | #SBATCH -e %j.err 		# File to which standard err will be written
204 | ```
205 | Now in the body of the script, we can include any commands we want run:
206 | 
207 | ```bash
208 | ## Changing directories to where the fastq files are located
209 | cd ~/unix_lesson/rnaseq/raw_data
210 | 
211 | ## Loading modules required for script commands
212 | module load fastqc/0.11.3
213 | 
214 | ## Running FASTQC
215 | fastqc -t 6 *.fq
216 | 
217 | ## Moving files to our results directory
218 | mv *fastqc* ../results/fastqc/
219 | ```
220 | 
221 | Save and quit the script. Now, let's submit the job to the SLURM:
222 | 
223 | ```bash
224 | $ sbatch mov10_fastqc.run
225 | ```
226 | 
227 | You can check on the status of your job with:
228 | 
229 | ```bash
230 | $ sacct
231 | ```
232 | 
233 | ```bash
234 | $ ls -lh ../results/fastqc/
235 | ```
236 | There should also be standard error (`.err`) and standard out (`.out`) files from the job listed in `~/unix_lesson/rnaseq/scripts`. You can move these over to your `logs` directory and give them more intuitive names:
237 | 
238 | ```bash
239 | $ mv *.err ../logs/fastqc.err
240 | $ mv *.out ../logs/fastqc.out
241 | ```
242 | 
243 | ***
244 | **Exercise**
245 | 
246 | How would you change the `mov10_fastqc.run` script if you had 9 fastq files you wanted to run in parallel.
247 | 
248 | ***
249 | 
250 | ### FastQC Results
251 |    
252 | Let's take a closer look at the files generated by FastQC:
253 |    
254 | `$ ls -lh ~/unix_lesson/rnaseq/results/fastqc/`
255 | 
256 | #### HTML reports
257 | The .html files contain the final reports generated by fastqc, let's take a closer look at them. Transfer the file for `Mov10_oe_1.subset.fq` over to your laptop via *FileZilla*.
258 | 
259 | ##### Filezilla - Step 1
260 | 
261 | Open *FileZilla*, and click on the File tab. Choose 'Site Manager'.
262 |  
263 | ![FileZilla_step1](../img/Filezilla_step1.png)
264 | 
265 | ##### Filezilla - Step 2
266 | 
267 | Within the 'Site Manager' window, do the following: 
268 | 
269 | 1. Click on 'New Site', and name it something intuitive (e.g. O2)
270 | 2. Host: transfer.rc.hms.harvard.edu 
271 | 3. Protocol: SFTP - SSH File Transfer Protocol
272 | 4. Logon Type: Normal
273 | 5. User: ECommons ID
274 | 6. Password: ECommons password
275 | 7. Click 'Connect'
276 | 
277 | <img src="../img/Filezilla_step2.png" width="500">	
278 | 	
279 | ***FastQC is just an indicator of what's going on with your data, don't take the "PASS"es and "FAIL"s too seriously.***
280 | 
281 | FastQC has a really well documented [manual page](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) with [more details](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/) about all the plots in the report. We recommend looking at [this post](http://bioinfo-core.org/index.php/9th_Discussion-28_October_2010) for more information on what bad plots look like and what they mean for your data.
282 | 
283 | > **We also have a [slidedeck](https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2/raw/master/lectures/error_profiles_mm.pdf) of error profiles for Illumina sequencing, where we discuss specific FASTQC plots and possible sources of these types of errors.**
284 | 
285 | Below are two of the most important analysis modules in FastQC, the **"Per base sequence quality"** plot and the **"Overrepresented sequences"** table. 
286 | 
287 | The **"Per base sequence quality"** plot provides the distribution of quality scores across all bases at each position in the reads.
288 | 
289 | ![FastQC_seq_qual](../img/FastQC_seq_qual.png)
290 | 
291 | The **"Overrepresented sequences"** table displays the sequences (at least 20 bp) that occur in more than 0.1% of the total number of sequences. This table aids in identifying contamination, such as vector or adapter sequences. 
292 | 
293 | ![FastQC_contam](../img/FastQC_contam.png)
294 | 
295 | We will go over the remaining plots in class. Remember, our report only represents a subset of reads (chromosome 1) for `Mov10_oe_1.subset.fq`, which can skew the QC results. We encourage you to look at the [full set of reads](../fastqc/Mov10oe_1-fastqc_report.html) and note how the QC results differ when using the entire dataset.
296 |    
297 | > **_NOTE:_** 
298 | >The other output of FastQC is a .zip file. These .zip files need to be unpacked with the `unzip` program. If we try to `unzip` them all at once:
299 | >
300 | >```bash
301 | >$ cd ~/unix_lesson/rnaseq/results/fastqc/    
302 | >$ unzip *.zip
303 | >```
304 | >
305 | >Did it work? 
306 | >
307 | >No, because `unzip` expects to get only one zip file. Welcome to the real world.
308 | >We *could* do each file, one by one, but what if we have 500 files? There is a smarter way.
309 | >We can save time by using a simple shell `for loop` to iterate through the list of files in *.zip.
310 | >
311 | >After you type the first line, you will get a special '>' prompt to type next lines.  
312 | >You start with 'do', then enter your commands, then end with 'done' to execute the loop.
313 | >
314 | >Note that in the first line, we create a variable named `zip`.  After that, we call that variable with the syntax `$zip`. `$zip` is assigned the value of each item (file) in the list *.zip, once for each iteration of the loop.
315 | >
316 | >This loop is basically a simple program. When it runs
317 | >
318 | >```bash
319 | >$ for zip in *.zip
320 | > do
321 | > unzip $zip
322 | > done
323 | >```
324 | >This will run unzip once for each file (whose name is stored in the $zip variable). The contents of each file will be unpacked into a separate directory by the unzip program.
325 | >
326 | >The 'for loop' is interpreted as a multipart command.  If you press the up arrow on your keyboard to recall the command, it will be shown like so:
327 | >
328 | >```bash
329 | >for zip in *.zip; do unzip $zip; done
330 | >```
331 | >
332 | >When you check your history later, it will help you remember what you did!
333 | >
334 | >What information is contained in the unzipped folder?
335 | >
336 | >```bash
337 | >$ ls -lh Mov10_oe_1.subset_fastqc
338 | >$ head Mov10_oe_1.subset_fastqc/summary.txt
339 | >```
340 | >
341 | >To save a record, let's `cat` all `fastqc summary.txt` files into one `full_report.txt` and move this to `~/unix_lesson/rnaseq/docs`. 
342 | >You can use wildcards in paths as well as file names.  Do you remember how we said `cat` is really meant for concatenating text files?
343 | >    
344 | >```bash
345 | >$ cat */summary.txt > ~/unix_lesson/rnaseq/logs/fastqc_summaries.txt
346 | >```
347 | 
348 | ## Quality Control (*Optional*) - Trimming 
349 | 
350 | We want to make sure that as many reads as possible map or align accurately to the genome. To ensure accuracy, only a small number of mismatches between the read sequence and the genome sequence are allowed, and any read with more than a few mismatches will be marked as being unaligned. 
351 | 
352 | Therefore, to make sure that all the reads in the dataset have a chance to map/align to the genome, unwanted information can be trimmed off from every read, one read at a time. The types of unwanted information can include one or more of the following:
353 | - leftover adapter sequences
354 | - known contaminants (strings of As/Ts, other sequences)
355 | - poor quality bases at read ends
356 | 
357 | **We will not be performing this step** because:
358 | * our data does not have an appreciable amount of leftover adapter sequences or other contaminating sequences based on FastQC.
359 | * the alignment tool we have picked (STAR) is able to account for low-quality bases at the ends of reads when matching them to the genome.
360 | 
361 | If you need to perform trimming on your fastq data to remove unwanted sequences/bases, the recommended tool is [cutadapt](http://cutadapt.readthedocs.io/en/stable/index.html). 
362 | 
363 | Example of cutadapt usage:
364 | 
365 | ```bash
366 | $ cutadapt --adapter=AGATCGGAAGAG --minimum-length=25  -o myfile_trimmed.fastq.gz myfile.fastq.gz 
367 | ```
368 | 
369 | After trimming, cutadapt can remove any reads that are too short to ensure that you do not get spurious mapping of very short sequences to multiple locations on the genome. In addition to adapter trimming, cutadapt can trim off any low-quality bases too, but **please note that quality-based trimming is not considered best practice, since majority of the newer, recommended alignment tools can account for this.**
370 | 
371 | ---
372 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
373 | 
374 | * *The materials used in this lesson was derived from work that is Copyright © Data Carpentry (http://datacarpentry.org/). 
375 | All Data Carpentry instructional material is made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).*
376 | 


--------------------------------------------------------------------------------
/lessons/03_alignment.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Alignment with STAR"
  3 | author: "Meeta Mistry, Bob Freeman, Mary Piper"
  4 | date: Wednesday, June 7, 2017
  5 | ---
  6 | 
  7 | Approximate time: 90 minutes
  8 | 
  9 | ## Learning Objectives:
 10 | 
 11 | * Understanding the alignment method STAR utilizes to align sequence reads to the reference genome
 12 | * Identifying the intricacies of alignment tools used in NGS analysis (parameters, usage, etc)
 13 | * Choosing appropriate STAR alignment parameters for our dataset
 14 | 
 15 | ## Read Alignment
 16 | 
 17 | <img src="../img/RNAseqWorkflow.png" width="400">
 18 | 
 19 | Now that we have explored the quality of our raw reads, we can move on to read alignment. We perform read alignment or mapping to determine where in the genome the reads originated from. The alignment process consists of choosing an appropriate reference genome to map our reads against and performing the read alignment using one of several splice-aware alignment tools such as [STAR](http://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635) or [HISAT2](http://ccb.jhu.edu/software/hisat2/index.shtml). The choice of aligner is often a personal preference and also dependent on the computational resources that are available to you.
 20 | 
 21 | ## STAR Aligner
 22 | 
 23 | To determine where on the human genome our reads originated from, we will align our reads to the reference genome using [STAR](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3530905/) (Spliced Transcripts Alignment to a Reference). STAR is an aligner designed to specifically address many of the challenges of RNA-seq data mapping using a strategy to account for spliced alignments. 
 24 | 
 25 | ### STAR Alignment Strategy
 26 | 
 27 | STAR is shown to have high accuracy and outperforms other aligners by more than a factor of 50 in mapping speed, but it is memory intensive. The algorithm achieves this highly efficient mapping by performing a two-step process:
 28 | 
 29 | 1. Seed searching
 30 | 2. Clustering, stitching, and scoring
 31 | 
 32 | #### Seed searching
 33 | 
 34 | For every read that STAR aligns, STAR will search for the longest sequence that exactly matches one or more locations on the reference genome. These longest matching sequences are called the Maximal Mappable Prefixes (MMPs):
 35 | 
 36 | 
 37 | ![STAR_step1](../img/alignment_STAR_step1.png)
 38 | 	
 39 | The different parts of the read that are mapped separately are called 'seeds'. So the first MMP that is mapped to the genome is called *seed1*.
 40 | 
 41 | STAR will then search again for only the unmapped portion of the read to find the next longest sequence that exactly matches the reference genome, or the next MMP, which will be *seed2*. 
 42 | 
 43 | ![STAR_step2](../img/alignment_STAR_step2.png)
 44 | 
 45 | This sequential searching of only the unmapped portions of reads underlies the efficiency of the STAR algorithm. STAR uses an uncompressed suffix array (SA) to efficiently search for the MMPs, this allows for quick searching against even the largest reference genomes. Other slower aligners use algorithms that often search for the entire read sequence before splitting reads and performing iterative rounds of mapping.
 46 | 
 47 | **If STAR does not find an exact matching sequence** for each part of the read due to mismatches or indels, the previous MMPs will be extended.
 48 | 
 49 | ![STAR_step3](../img/alignment_STAR_step3.png)
 50 | 
 51 | **If extension does not give a good alignment**, then the poor quality or adapter sequence (or other contaminating sequence) will be soft clipped.
 52 | 
 53 | ![STAR_step4](../img/alignment_STAR_step4.png)
 54 | 
 55 | 
 56 | #### Clustering, stitching, and scoring
 57 | 
 58 | The separate seeds are stitched together to create a complete read by first clustering the seeds together based on proximity to a set of 'anchor' seeds, or seeds that are not multi-mapping.
 59 | 
 60 | Then the seeds are stitched together based on the best alignment for the read (scoring based on mismatches, indels, gaps, etc.). 
 61 | 
 62 | ![STAR_step5](../img/alignment_STAR_step5.png)
 63 | 
 64 | ## Running STAR
 65 | 
 66 | ### Set-up
 67 | 
 68 | To get started with this lesson, start an interactive session with 6 cores:
 69 | 
 70 | ```bash
 71 | $ srun --pty -p short -t 0-12:00 -c 6 --mem 8G --reservation=HBC /bin/bash	
 72 | ```
 73 | 
 74 | You should have a directory tree setup similar to that shown below. it is best practice to have all files you intend on using for your workflow present within the same directory. In our case, we have our original FASTQ files generated in the previous section. 
 75 | 
 76 | ```bash
 77 | rnaseq
 78 | 	├── logs
 79 | 	├── meta
 80 | 	├── raw_data
 81 | 	│   ├── Irrel_kd_1.subset.fq
 82 | 	│   ├── Irrel_kd_2.subset.fq
 83 | 	│   ├── Irrel_kd_3.subset.fq
 84 | 	│   ├── Mov10_oe_1.subset.fq
 85 | 	│   ├── Mov10_oe_2.subset.fq
 86 | 	│   └── Mov10_oe_3.subset.fq
 87 | 	├── results
 88 | 	└── scripts
 89 | ```
 90 | 
 91 | To use the STAR aligner, load the module: 
 92 | 
 93 | ```bash
 94 | $ module load gcc/6.2.0 star/2.5.2b
 95 | ```
 96 | 
 97 | Aligning reads using STAR is a two step process:   
 98 | 
 99 | 1. Create a genome index 
100 | 2. Map reads to the genome
101 | 
102 | > A quick note on shared databases for human and other commonly used model organisms. The O2 cluster has a designated directory at `/n/groups/shared_databases/` in which there are files that can be accessed by any user. These files contain, but are not limited to, genome indices for various tools, reference sequences, tool specific data, and data from public databases, such as NCBI and PDB. So when using a tool that requires a reference of sorts, it is worth taking a quick look here because chances are it's already been taken care of for you. 
103 | >
104 | >```bash
105 | > $ ls -l /n/groups/shared_databases/igenome/
106 | >```
107 | 
108 | ### Creating a genome index
109 | 
110 | For this workshop we are using reads that originate from a small subsection of chromosome 1 (~300,000 reads) and so we are using only chr1 as the reference genome. 
111 | 
112 | To store our genome indices, we will use the `/n/scratch2/` space with large temporary storage capacity. We need to create a directory for the indices within this space:
113 | 
114 | ```bash
115 | $ mkdir -p /n/scratch2/username/chr1_hg38_index
116 | ```
117 | 
118 | The basic options to **generate genome indices** using STAR are as follows:
119 | 
120 | * `--runThreadN`: number of threads
121 | * `--runMode`: genomeGenerate mode
122 | * `--genomeDir`: /path/to/store/genome_indices
123 | * `--genomeFastaFiles`: /path/to/FASTA_file 
124 | * `--sjdbGTFfile`: /path/to/GTF_file
125 | * `--sjdbOverhang`: readlength -1
126 | 
127 | > *NOTE:* In case of reads of varying length, the ideal value for `--sjdbOverhang` is max(ReadLength)-1. In most cases, the default value of 100 will work similarly to the ideal value.
128 | 
129 | Now let's create a job submission script to generate the genome index:
130 | 
131 | ```bash
132 | $ vim ~/unix_lesson/rnaseq/scripts/genome_index.run
133 | ```
134 | Within `vim` we now add our shebang line, the SLURM directives, and our STAR command. 
135 | 
136 | ```bash
137 | #!/bin/bash
138 | 
139 | #SBATCH -p short 		# partition name
140 | #SBATCH -t 0-2:00 		# hours:minutes runlimit after which job will be killed
141 | #SBATCH -c 6 		# number of cores requested -- this needs to be greater than or equal to the number of cores you plan to use to run your job
142 | #SBATCH --mem 16G
143 | #SBATCH --job-name STAR_index 		# Job name
144 | #SBATCH -o %j.out			# File to which standard out will be written
145 | #SBATCH -e %j.err 		# File to which standard err will be written
146 | 
147 | cd /n/scratch2/username/
148 | 
149 | module load gcc/6.2.0 star/2.5.2b
150 | 
151 | STAR --runThreadN 6 \
152 | --runMode genomeGenerate \
153 | --genomeDir chr1_hg38_index \
154 | --genomeFastaFiles /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.dna.chromosome.1.fa \
155 | --sjdbGTFfile /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.gtf \
156 | --sjdbOverhang 99
157 | ```
158 | 
159 | ```bash
160 | $ sbatch ~/unix_lesson/rnaseq/scripts/genome_index.run
161 | ```
162 | 
163 | ### Aligning reads
164 | 
165 | After you have the genome indices generated, you can perform the read alignment. We previously generated the genome indices for you in `/n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/ensembl38_STAR_index/` directory so that we don't get held up waiting on the generation of the indices.
166 | 
167 | Create an output directory for our alignment files:
168 | 
169 | ```bash
170 | $ cd ~/unix_lesson/rnaseq/raw_data
171 | 
172 | $ mkdir ../results/STAR
173 | ```
174 | 
175 | ### STAR command in interactive bash
176 | 
177 | For now, we're going to work on just one sample to set up our workflow. To start we will use the first replicate in the Mov10 over-expression group, `Mov10_oe_1.subset.fq`. Details on STAR and its functionality can be found in the [user manual](https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf); we encourage you to peruse through to get familiar with all available options.
178 | 
179 | The basic options for aligning reads to the genome using STAR are:
180 | 
181 | * `--runThreadN`: number of threads / cores
182 | * `--readFilesIn`: /path/to/FASTQ_file
183 | * `--genomeDir`: /path/to/genome_indices_directory
184 | * `--outFileNamePrefix`: prefix for all output files
185 | 
186 | Listed below are additional parameters that we will use in our command:
187 | 
188 | * `--outSAMtype`: output filetype (SAM default)
189 | * `--outSAMunmapped`: what to do with unmapped reads
190 | 
191 | > **NOTE:** Default filtering is applied in which the maximum number of multiple alignments allowed for a read is set to 10. If a read exceeds this number there is no alignment output. To change the default you can use `--outFilterMultimapNmax`, but for this lesson we will leave it as default. Also, note that "**STAR’s default parameters are optimized for mammalian genomes.** Other species may require significant modifications of some alignment parameters; in particular, the maximum and minimum intron sizes have to be reduced for organisms with smaller introns" [[1](http://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635.full.pdf+html)].
192 | 
193 | We can access the software by simply using the STAR command followed by the basic parameters described above and any additional parameters. The full command is provided below for you to copy paste into your terminal. If you want to manually enter the command, it is advisable to first type out the full command in a text editor (i.e. [Sublime Text](http://www.sublimetext.com/) or [Notepad++](https://notepad-plus-plus.org/)) on your local machine and then copy paste into the terminal. This will make it easier to catch typos and make appropriate changes. 
194 | 
195 | ```bash
196 | 
197 | STAR --genomeDir /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/ensembl38_STAR_index/ \
198 | --runThreadN 6 \
199 | --readFilesIn Mov10_oe_1.subset.fq \
200 | --outFileNamePrefix ../results/STAR/Mov10_oe_1_ \
201 | --outSAMtype BAM SortedByCoordinate \
202 | --outSAMunmapped Within \
203 | --outSAMattributes Standard 
204 | 
205 | ```
206 | 
207 | ---
208 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
209 | 


--------------------------------------------------------------------------------
/lessons/04_alignment_quality.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "SAM/BAM file and assessing quality "
  3 | author: "Meeta Mistry, Bob Freeman"
  4 | date: "Tuesday, June 28, 2016"
  5 | ---
  6 | 
  7 | Approximate time: 120 minutes
  8 | 
  9 | ## Learning objectives
 10 | 
 11 | * Evaluating the STAR aligner output files
 12 | * Understanding the standard alignment file (SAM/BAM) structure
 13 | * Using `samtools` to evaluate alignment quality 
 14 | * Visualizing alignment quality using IGV (genome browser)  
 15 | 
 16 | 
 17 | ## Assessing alignment quality
 18 | 
 19 | After running our single FASTQ file through the STAR aligner, you should have noticed a number of output files in the `~/unix_workshop/rnaseq/results/STAR` directory. Let's take a quick look at some of the files that were generated and explore the content of some of them. 
 20 | 
 21 | 	$ cd ~/unix_lesson/rnaseq/results/STAR
 22 | 	
 23 | 	$ ls -lh
 24 | 
 25 | What you should see, is that for each FASTQ file you have **5 output files** and a single tmp directory. Briefly, these files are described below:
 26 | 
 27 | * `Log.final.out` - a summary of mapping statistics for the sample
 28 | * `Aligned.sortedByCoord.out.bam` - the aligned reads, sorted by coordinate, in BAM format
 29 | * `Log.out` - a running log from STAR, with information about the run 
 30 | * `Log.progress.out` -  job progress with the number of processed reads, % of mapped reads etc., updated every ~1 minute
 31 | * `SJ.out.tab` - high confidence collapsed splice junctions in tab-delimited format. Only junctions supported by uniquely mapping reads are reported
 32 | 
 33 | ## Mapping statistics
 34 | 
 35 | Having completed the alignment, the first thing we want to know is how well did our reads align to the reference. Rather than looking at each read alignment, it can be more useful to evaluate statistics that give a general overview for the sample. One of the output files from the STAR aligner contains mapping statistics, let's take a closer look at one of those files. We'll use the `less` command which allows us to scroll through it easily: 
 36 | 
 37 | 	$ less Mov10_oe_1_Log.final.out
 38 | 	
 39 | The log file provides information on reads that 1) mapped uniquely, 2) reads that mapped to mutliple locations and 3) reads that are unmapped. Additionally, we get details on splicing, insertion and deletion. From this file the most informative statistics include the **mapping rate and the number of multimappers**.
 40 | 
 41 | * As an example, a good quality sample will have **at least 75% of the reads uniquely mapped**. Once values start to drop lower than 60% it's advisable to start troubleshooting. The lower the number of uniquely mapping reads means the higher the number of reads that are mapping to multiple locations. It is best to keep this number low because multi-mappers are not included when we start counting reads
 42 | 
 43 | > NOTE: The thresholds suggested above will vary depending on the organism that you are working with. Much of what is discussed here is in the context of working with human or mouse data. For example, 75% of mapped reads holds true only if the genome is good or mature. For badly assembled genomes we may not observe a high mapping rate, even if the actual sequence sample is good.
 44 | 
 45 | *** 
 46 | 
 47 | **Exercise**
 48 | 
 49 | Using the less command take a look at `Mov10_oe_1_Log.final.out` and answer the following questions:
 50 | 
 51 | 1. How many reads map to more than 10 locations on the genome?
 52 | 2. How many reads are unmapped due to read length?
 53 | 3. What is the average mapped length per read?
 54 | 
 55 | ***
 56 | 
 57 | 
 58 | ## Other quality checks
 59 | 
 60 | In addition to the aligner-specific summary we can also obtain quality metrics using tools like [Qualimap](http://qualimap.bioinfo.cipf.es/doc_html/intro.html#what-is-qualimap) or [RNASeQC](https://www.broadinstitute.org/publications/broad4133). These tools examine sequencing alignment data according to the features of the mapped reads and their genomic properties and **provide an overall view of the data that helps to to the detect biases in the sequencing and/or mapping of the data**.The input can be one or more BAM files and the output consists of HTML or PDF reports with useful figures and tab delimited files of metrics data.
 61 | 
 62 | We will not be performing this step in the workshop, but we describe some of the features below to point out things to look for when assessing alignment quality of RNA-seq data:
 63 | 
 64 | * **Reads genomic origin**: Even if you have high genomic mapping rate for all samples, check to see where the reads are mapping. Ensure that there is not an unusually high number of **reads mapping to intronic regions** (~30% expected) and fewer than normally observed **mapping to exons** (~55%). A high intronic mapping suggests possible genomic DNA contamination and/or pre-mRNA. 
 65 | * **Ribosomal RNA (rRNA)** constitutes a large majority of the RNA species in any total RNA preparation. Despite depletion methods, you can never achieve complete rRNA removal. Even with Poly-A enrichment a small percentage of ribosomal RNA can stick to the enrichment beads non-specifically. **Excess ribosomal content (> 2%)** will normally have to be filtered out so that differences in rRNA mapped reads across samples do not affect alignment rates and skew subsequent normalization of the data.
 66 | * **Transcript coverage and 5'-3' bias**: assesing the affect on expression level and on levels of transcript GC content
 67 | * **Junction analysis**: analysis of junction positions in spliced alignments (i.e known, partly known, novel) 
 68 | * **Strand specificity:** assess the performance of strand-specific library construction methods. The percentage of sense-derived reads is given for each end of the read pair. A non-strand-specific protocol would give values of 50%/50%, whereas strand-specific protocols typically yield 99%/1% or 1%/99% for this metric.
 69 | 
 70 | >**NOTE:** If interested in using Qualimap, we have [materials available](https://hbctraining.github.io/Intro-to-rnaseq-hpc-salmon/lessons/03_QC_STAR_and_Qualimap_run.html#qualimap) that you can walk through.
 71 | 
 72 | ## Alignment file format: SAM/BAM
 73 | 
 74 | The output we requested from the STAR aligner (using the appropriate parameters) is a BAM file. By default STAR will return a file in SAM format. BAM is a binary, compressed version of the SAM file, also known as **Sequence Alignment Map format**. The SAM file, introduced is a tab-delimited text file that contains information for each individual read and its alignment to the genome. While we will go into some features of the SAM format, the paper by [Heng Li et al](http://bioinformatics.oxfordjournals.org/content/25/16/2078.full) provides a lot more detail on the specification.
 75 | 
 76 | The file begins with a **header**, which is optional. The header is used to describe source of data, reference sequence, method of alignment, etc., this will change depending on the aligner being used. Each section begins with character ‘@’ followed by a two-letter record type code.  These are followed by two-letter tags and values. Example of some common sections are provided below:
 77 | 
 78 | ```
 79 | @HD  The header line
 80 | VN: format version
 81 | SO: Sorting order of alignments
 82 | 
 83 | @SQ  Reference sequence dictionary
 84 | SN: reference sequence name
 85 | LN: reference sequence length
 86 | SP: species
 87 | 
 88 | @PG  Program
 89 | PN: program name
 90 | VN: program version
 91 | ```
 92 | 
 93 | Following the header is the **alignment section**. Each line that follows corresponds to alignment information for a single read. Each alignment line has **11 mandatory fields for essential mapping information** and a variable number of other fields for aligner specific information. 
 94 | 
 95 | ![SAM1](../img/sam_bam.png)
 96 | 
 97 | An example read mapping is displayed above. *Note that the example above spans two lines, but in the file it is a single line.* Let's go through the fields one at a time. 
 98 | 
 99 | We are going to go through this out of order. Let's start with `QNAME` which is the read name. `RNAME` is the reference sequence name, which is 'chr1' in this example. `POS` refers to the 1-based leftmost position of the alignment. `MAPQ` is giving us the alignment quality, the scale of which will depend on the aligner being used. 
100 | 
101 | Next, we the `FLAG`. The `FLAG` value represents various things about the alignment. that is displayed can be translated into information about the mapping. 
102 | 
103 | | Flag | Description |
104 | | ------:|:----------------------:|
105 | | 1 | read is mapped |
106 | | 2 | read is mapped as part of a pair |
107 | | 4 | read is unmapped |
108 | | 8 | mate is unmapped |
109 | | 16| read reverse strand|
110 | | 32 | mate reverse strand |
111 | | 64 | first in pair |
112 | | 128 | second in pair |
113 | | 256 | not primary alignment |
114 | | 512 | read fails platform/vendor quality checks |
115 | | 1024| read is PCR or optical duplicate |
116 | 
117 | * For a given alignment, each of these flags are either **on or off** indicating the condition is **true or false**. 
118 | * The `FLAG` is a combination (additive) of all of the individual flags (from the table above) that are true for the alignment 
119 | * The beauty of the flag values is that any combination of flags can only result in one sum.
120 | 
121 | In our example we have a number that exist in the table, making it relatively easy to translate. But suppose our read alignment has a flag of 163; this means it is some combination of the list above and you can find out what this mean using various online tools like [this one from Picard](https://broadinstitute.github.io/picard/explain-flags.html)**
122 | 
123 | `CIGAR` is a sequence of letters and numbers that represent the *edits or operations* required to match the read to the reference. The letters are operations that are used to indicate which bases align to the reference (i.e. match, mismatch, deletion, insertion), and the numbers indicate the associated base lengths for each 'operation'. This is used by some downstream tools to quickly assess the alignment.
124 | 
125 | Now to the remaning fields in our SAM file:
126 | 
127 | ![SAM1](../img/sam_bam3.png)
128 | 
129 | The next three fields are more pertinent to paired-end data. `MRNM` is the mate reference name. `MPOS` is the mate position (1-based, leftmost). `ISIZE` is the inferred insert size.
130 | 
131 | Finally, you have the data from the original FASTQ file stored for each read. That is the raw sequence (`SEQ`) and the associated quality values for each position in the read (`QUAL`).
132 | 
133 | 
134 | ## `samtools`
135 | 
136 | [SAMtools](http://www.htslib.org/) is a tool that provides alot of functionality in dealing with SAM files. SAMtools utilities include, but are not limited to, viewing, sorting, filtering, merging, and indexing alignments in the SAM format. In this lesson we will explore a few of these utilities on our alignment files. To use this we need to load the module.
137 | 
138 | ```
139 | $ module load samtools/1.3.1
140 | ```
141 | 
142 | ### Viewing the SAM file
143 | 
144 | Now that we have learned so much about the SAM file format, let's use `samtools` to take a quick peek at our own files. The output we had requested from STAR was a BAM file. The problem is the BAM file is binary and not human-readable. Using the `view` command within `samtools` we can easily convert the BAM into something that we can understand. You will be returned to screen the entire SAM file, and so we can either write to file, or pipe this to the `less` command so we can scroll through it.
145 | 
146 | We will do the latter (since we don't really need it for downstream analysis) and scroll through the SAM file (using the up and down arrows) to see how the fields correspond to what we expected. Adding the `-h` flag allows to also view the header.
147 | 
148 | ```
149 | $ samtools view -h Mov10_oe_1_Aligned.sortedByCoord.out.bam | less
150 | 
151 | ``` 
152 | 
153 | > ### Filtering the SAM file
154 | > 
155 | > Now we know that we have all of this information for each of the reads -- wouldn't it be useful to summarize and filter based on selected criteria? Suppose we wanted to set a **threshold on mapping quality**. For example, we want to know how many reads aligned with a quality score higher than 30. To do this, we can combine the `view` command with additional flags `q 30` and `-c` (to count):
156 | > 
157 | > ```
158 | > $ samtools view -q 30 -c Mov10_oe_1_Aligned.sortedByCoord.out.bam
159 | > 
160 | > ```
161 | > *How many of reads have a mapping quality of 30 or higher?*
162 | > 
163 | > We can also **apply filters to select reads based on where they fall within the `FLAG` categories**. Remember that the bitwise flags are like boolean values. If the flag exists, the statement is true. Similar to when filtering by quality we need to use the `samtools view` command, however this time use the `-F` or `-f` flags.
164 | > 
165 | > * `-f` - to find the reads that agree with the flag statement 
166 | > * `-F`  - to find the reads that do not agree with the flag statement
167 | > 
168 | > ```
169 | > ## This will tell us how many reads are unmapped
170 | > $ samtools view -f 4 -c Mov10_oe_1_Aligned.sortedByCoord.out.bam
171 | > 
172 | > ## This should give us the remaining reads that do not have this flag set (i.e reads that are mapped)
173 | > $ samtools view -F 4 -c Mov10_oe_1_Aligned.sortedByCoord.out.bam
174 | > ```
175 | 
176 | ### Indexing the BAM file
177 | 
178 | To perform some functions (i.e. subsetting, visualization) on the BAM file, an index is required, but this index is different from the genomic index we worked with in the last lesson. In order to index a BAM file, it must first be sorted in one way or another. `samtools` can perform this sorting, however in our case STAR performed a coordinate sort for us because of a parameter we had specified.
179 | 
180 | To index the BAM file we use the `index` command:
181 | 
182 | ```bash
183 | $ samtools index Mov10_oe_1_Aligned.sortedByCoord.out.bam
184 | ```
185 | 
186 | This will create an index in the same directory as the BAM file, which will be identical to the input file in name but with an added extension of `.bai`.
187 | 
188 | ****
189 | 
190 | ## Visualization
191 | 
192 | Another method for assessing the quality of your alignment is to visualize the alignment using a genome browser. For this course we will be using the [Integrative Genomics Viewer (IGV)](https://www.broadinstitute.org/igv/) from the Broad Institute. *You should already have this downloaded on your laptop.* IGV is an interactive tool which allows exploration of large, integrated genomic datasets. It supports a wide variety of data types, including array-based and next-generation sequence data, and genomic annotations, which facilitates invaluable comparisons.
193 | 
194 | ### Transfer files
195 | 
196 | In order to visualize our alignments we will first need to move over the relevant files. We previously used FileZilla to transfer files from O2 to your laptop. However, there is another way to do so using the command line interface. Similar to the `cp` command to copy there is a command that allows you to securely copy files between computers. **The command is called `scp` and allows files to be copied to, from, or between different remote computers.** 
197 | 
198 | First, identify the location of the _origin file_ you intend to copy, followed by the _destination_ of that file. Since the origin file is located on O2, this requires you to provide remote host and login information.
199 | 
200 | The following 2 files need to be moved from O2 to your local machine,
201 |  
202 | `Mov10_oe_1_Aligned.sortedByCoord.out.bam`,
203 | 
204 | `Mov10_oe_1_Aligned.sortedByCoord.out.bam.bai` 
205 | 
206 | ```
207 | $ scp user_name@o2.hms.harvard.edu:/home/$USER/unix_lesson/rnaseq/results/Mov10_oe_1_Aligned.sortedByCoord.out.bam* /path/to/directory_on_laptop
208 | ```
209 | 
210 | ### Visualize
211 | 
212 | * Start [IGV](https://www.broadinstitute.org/software/igv/download) _You should have this previously installed on your laptop_
213 | * Load the Human genome (hg19) into IGV using the dropdown menu at the top left of your screen. _Note: there is also an option to "Load Genomes from File..." under the "Genomes" pull-down menu - this is useful when working with non-model organisms_
214 | * Load the .bam file using the **"Load from File..."** option under the **"File"** pull-down menu. *IGV requires the .bai file to be in the same location as the .bam file that is loaded into IGV, but there is no direct use for that file.*
215 | * Type MOV10 into the search bar.
216 | 
217 | ![IGV screenshot](../img/IGV_mov10.png)
218 | 
219 | ***
220 | 
221 | **Exercise**
222 | 
223 | Take a look at a few genes by typing into the search bar. Anything interesting about PPM1J and PTPN22?
224 | 
225 | ***
226 | 
227 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
228 | 


--------------------------------------------------------------------------------
/lessons/05_counting_reads.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Counting reads
  3 | author: Meeta Mistry, Bob Freeman, Radhika Khetani
  4 | date: 06/06/2017
  5 | ---
  6 | 
  7 | Approximate time: 75 minutes
  8 | 
  9 | ## Learning Objectives:
 10 | 
 11 | * understand how counting tools work
 12 | * generate a count matrix using featureCounts
 13 | 
 14 | 
 15 | ## Counting reads as a measure of gene expression
 16 | <img src="../img/counts-workflow.png" width="400">
 17 | 
 18 | Once we have our reads aligned to the genome, the next step is to count how many reads have mapped to each gene. There are many tools that can use BAM files as input and output the number of reads (counts) associated with each feature of interest (genes, exons, transcripts, etc.). 2 commonly used counting tools are [featureCounts](http://bioinf.wehi.edu.au/featureCounts/) and [htseq-count](https://htseq.readthedocs.io/en/release_0.10.0/). 
 19 | 
 20 | * The above tools only report the "raw" counts of reads that **map to a single location** (uniquely mapping) and are best at counting at the **gene level**. Essentially, total read count associated with a gene (*meta-feature*) = the sum of reads associated with each of the exons (*feature*) that "belong" to that gene.
 21 | 
 22 | * There are **other tools** available that are able to account for **multiple transcripts** for a given gene. In this case the counts are not whole numbers, but have fractions. In the simplest example case, if 1 read is associated with 2 transcripts, it can get counted as 0.5 and 0.5 and the resulting count for that transcript is not a whole number.
 23 | 
 24 | * In addition there are **other tools that will count multimapping reads**, but this is a dangerous thing to do since you will be overcounting the total number of reads which can cause issues with normalization and eventually with accuracy of differential gene expression results. 
 25 | 
 26 | **Input for counting = multiple BAM files + 1 GTF file**
 27 | 
 28 | Simply speaking, the genomic coordinates of where the read is mapped (BAM) are cross-referenced with the genomic coordinates of whichever feature you are interested in counting expression of (GTF), it can be exons, genes or transcripts.
 29 | 
 30 | <img src="../img/count-fig2.png" width="600">
 31 | 
 32 | **Output of counting = A count matrix, with genes as rows and samples are columns**
 33 | 
 34 | These are the "raw" counts and will be used in statistical programs downstream for differential gene expression.
 35 | 
 36 | <img src="../img/count-matrix.png" width="600">
 37 | 
 38 | ### Counting using featureCounts
 39 | Today, we will be using the [featureCounts](http://bioinf.wehi.edu.au/featureCounts/) tool to get the *gene* counts. We picked this tool because it is accurate, fast and is relatively easy to use. It counts reads that map to a single location (uniquely mapping) and follows the scheme in the figure below for assigning reads to a gene/exon. 
 40 | 
 41 | <img src="../img/union.png" width="300">
 42 | 
 43 | featureCounts can also take into account whether your data are **stranded** or not. If strandedness is specified, then in addition to considering the genomic coordinates it will also take the strand into account for counting. If your data are stranded always specify it.
 44 | 
 45 | #### Setting up to run featureCounts
 46 | First things first, start an interactive session with 4 cores:
 47 | 	
 48 | ``` bash
 49 | $ srun --pty -p short -t 0-12:00 -c 4 --mem 8G --reservation=HBC /bin/bash
 50 | ```
 51 | 
 52 | Now, change directories to your rnaseq directory and start by creating 2 directories, (1) a directory for the output and (2) a directory for the bam files:
 53 | 
 54 | ``` bash
 55 | $ cd ~/unix_lesson/rnaseq/
 56 | $ mkdir results/counts results/STAR/bams
 57 | ```
 58 | 
 59 | Rather than using the BAM file we generated in the last lesson, let's copy over all of the BAM files that we have already generated for you:
 60 | 	
 61 | ``` bash
 62 | 
 63 | $ cp /n/groups/hbctraining/intro_rnaseq_hpc/bam_STAR38/*bam ~/unix_lesson/rnaseq/results/STAR/bams
 64 | ```
 65 | featureCounts is not available as a module on O2, but we have already added the path for it to our `$PATH` variable last time. 
 66 | 
 67 | ``` bash
 68 | $ echo $PATH  # You should see /n/app/bcbio/tools/bin/ among other paths
 69 | ```
 70 | 
 71 | > ** If you don't see `/n/app/bcbio/tools/bin/` in your `$PATH` variable, add the following `export` command to your `~/.bashrc` file using vim: `export PATH=/n/app/bcbio/tools/bin/:$PATH`.**
 72 | 
 73 | 
 74 | #### Running featureCounts
 75 | 
 76 | How do we use this tool, what is the command and what options/parameters are available to us?
 77 | 
 78 | ``` bash
 79 | $ featureCounts
 80 | ```
 81 | 
 82 | So, it looks like the usage is `featureCounts [options] -a <annotation_file> -o <output_file> input_file1 [input_file2] ... `, where `-a`, `-o` and input files are required.
 83 | 
 84 | We are going to use the following options:
 85 | 
 86 | `-T 4 # specify 4 cores`
 87 | 
 88 | `-s 2 # these data are "reverse"ly stranded`
 89 | 
 90 | and the following are the values for the required parameters:
 91 | 
 92 | `-a ~/unix_lesson/rnaseq/reference_data/chr1-hg19_genes.gtf # required option for specifying path to GTF`
 93 | 
 94 | `-o ~/unix_lesson/rnaseq/results/counts/Mov10_featurecounts.txt # required option for specifying path to, and name of the text output (count matrix)`
 95 | 
 96 | `~/unix_lesson/rnaseq/results/STAR/bams/*bam # the list of all the bam files we want to collect count information for`
 97 | 
 98 | Let's run this now:
 99 | 
100 | ``` bash
101 | $ featureCounts -T 4 -s 2 \
102 |   -a /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.gtf \
103 |   -o ~/unix_lesson/rnaseq/results/counts/Mov10_featurecounts.txt \
104 |   ~/unix_lesson/rnaseq/results/STAR/bams/*.out.bam
105 | ```
106 | 
107 | > If you wanted to collect the information that is on the screen as the job runs, you can modify the command and add the `2>` redirection at the end. This type of redirection will collect all the information from the terminal/screen into a file.
108 | 
109 | ``` bash
110 | # **DO NOT RUN THIS** 
111 | # note the last line of the command below
112 | 	
113 | $ featureCounts -T 4 -s 2 \
114 |   -a /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.gtf \
115 |   -o ~/unix_lesson/rnaseq/results/counts/Mov10_featurecounts.txt \
116 |   ~/unix_lesson/rnaseq/results/STAR/bams/*.out.bam \
117 |   2> ~/unix_lesson/rnaseq/results/counts/Mov10_featurecounts.screen-output.log
118 | ```
119 | #### featureCounts output
120 | 
121 | The output of this tool is 2 files, *a count matrix* and *a summary file* that tabulates how many the reads were "assigned" or counted and the reason they remained "unassigned". Let's take a look at the summary file:
122 | 	
123 | ``` bash
124 | $ less results/counts/Mov10_featurecounts.txt.summary
125 | ```
126 | Now let's look at the count matrix:
127 | 	
128 | ``` bash
129 | $ less results/counts/Mov10_featurecounts.txt
130 | ```	
131 | 
132 | ##### Cleaning up the featureCounts matrix
133 | There is information about the genomic coordinates and the length of the gene, we don't need this for the next step, so we are going to extract the columns that we are interested in.
134 | 	
135 | ``` bash
136 | $ cut -f1,7,8,9,10,11,12 results/counts/Mov10_featurecounts.txt > results/counts/Mov10_featurecounts.Rmatrix.txt
137 | ```
138 | The next step is to clean it up a little further by modifying the header line (we could also do this in R, or in a GUI text editor):
139 | 	
140 | ``` bash
141 | $ vim results/counts/Mov10_featurecounts.Rmatrix.txt
142 | ```
143 | 
144 | Vim has nice shortcuts for cleaning up the header of our file using the following steps: 
145 | 
146 | 1. Move the cursor to the beginning of the document by typing: `gg` (in command mode). 
147 | 2. Remove the first line by typing: `dd` (in command mode).
148 | 2. Remove the file name following the sample name by typing: `:%s/_Aligned.sortedByCoord.out.bam//g` (in command mode).
149 | 3. Remove the path leading up to the file name by typing: `:%s/\/home\/username\/unix_lesson\/rnaseq\/results\/STAR\/bams\///g` (in command mode).
150 | 	
151 | 	> Note that we have a `\` preceding each `/`, which tells vim that we are not using the `/` as part of our search and replace command, but instead the `/` is part of the pattern that we are replacing. This is called *escaping* the `/`.
152 | 
153 | ### Note on counting PE data
154 | 
155 | For paired-end (PE) data, the bam file contains information about whether both read1 and read2 mapped and if they were at roughly the correct distance from each other, that is to say if they were "properly" paired. For most counting tools, **only properly paired reads are considered by default, and each read pair is counted only once as a single "fragment"**. 
156 | 
157 | For counting PE fragments associated with genes, the input bam files need to be sorted by read name (i.e. alignment information about both read pairs in adjoining rows). The alignment tool might sort them for you, but watch out for how the sorting was done. If they are sorted by coordinates (like with STAR), you will need to use `samtools sort` to re-sort them by read name before using as input in featureCounts. If you do not sort you BAM file by read name before using as input, featureCounts assumes that almost all the reads are not properly paired.
158 | 
159 | 
160 | ---
161 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
162 | 


--------------------------------------------------------------------------------
/lessons/06_multiQC.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: MultiQC
 3 | author: Radhika Khetani
 4 | ---
 5 | 
 6 | Approximate time: 30 minutes
 7 | 
 8 | ## Documenting results and gathering QC metrics
 9 | 
10 | As you go through the RNA-seq workflow (or any data analysis workflow), it is important to document the parameters you are using to run the analysis, in addition it is very important to document the results at every step, or some metric that speaks to the results. Evaluating these is a form of QC and it will enable you to identify any issues with the data and/or the parameters you are using, as well as alert you to the presence of contamination or systematic biases, etc.  
11 | 
12 | With the RNA-seq workflow there are several metrics that you can evaluate, but below are 3 important one that you should keep track of for each sample:
13 | * number of raw reads
14 | * percentage of reads aligned to genome
15 | * percentage of reads associated with genes 
16 | 
17 | An important QC step is to make sure these metrics are consistent across the samples for a given experiment, and any outliers should be investigated further.
18 | 
19 | Manually tracking these metrics is tedious and prone to errors. Several tools exist that help you with the documentation and QC assessment. Some of these also have really nice visualizations to easily identify any issues, e.g. [Qualimap](http://qualimap.bioinfo.cipf.es/doc_html/index.html), [MultiQC](http://multiqc.info/). These tools have some overlap, but Qualimap is focused on count data whereas MultiQC is able to make a report from the output of many different tools (for RNA-seq analysis and other NGS workflows) and is a little simpler to use.
20 | 
21 | >**NOTE:** If interested in using Qualimap, we have [materials available](https://hbctraining.github.io/Intro-to-rnaseq-hpc-salmon/lessons/03_QC_STAR_and_Qualimap_run.html#qualimap) that you can walk through.
22 | 
23 | ### Tracking and aggregating results from workflow tools with *MultiQC*
24 | 
25 | Today we will be using [MultiQC](https://multiqc.info/), which aggregates results from several tools and generates a single HTML report with plots to visualize and compare various QC metrics between the samples.
26 | 
27 | MultiQC can generate this report from 36 different bioinformatics tools, and these tools span various NGS analyses, e.g., basic QC, RNA-seq, variant calling, genome annotation, etc. Today we are going to use it to aggregate information from the results of [FastQC](http://multiqc.info/docs/#fastqc), [STAR](http://multiqc.info/docs/#star), and [featureCounts](http://multiqc.info/docs/#featurecounts). MultiQC can parse the information from specific output files from each of these tools and the manual page specifies the required output from each of the tools that should be used as input to MultiQC.
28 | 
29 | We are going to start by creating a directory in the `~/unix_lesson/rnaseq/results/` directory called `multiqc_report`, and navigating to it. You should already be in the `~/unix_lesson/rnaseq/` directory.
30 | 
31 | ```bash
32 | mkdir results/multiqc_report
33 | 
34 | cd results/multiqc_report
35 | ```
36 | 
37 | Next, we are going to run multiQC on the following 3 outputs from our workflow:
38 | 
39 | * `.zip` files from FastQC
40 | * `.Log.final.out` files from STAR
41 | * `.summary` file from featurecounts
42 | 
43 | > Note that `multiqc` is not a module on O2 and we will be using the version in `/n/app/bcbio/tools/bin/`, which is in our `$PATH`.
44 | 
45 | Before we run it, we need to populate the `~/unix_lesson/rnaseq/logs` directory with the `*.Log.final.out` files for all 6 STAR runs:
46 | ```bash
47 | cp /n/groups/hbctraining/intro_rnaseq_hpc/bam_STAR/*Log.final.out ~/unix_lesson/rnaseq/logs/
48 | ```
49 | 
50 | Now, let's run multiQC!
51 | 
52 | ```bash
53 | multiqc -n multiqc_report_rnaseq \
54 | ~/unix_lesson/rnaseq/results/fastqc/*zip \
55 | ~/unix_lesson/rnaseq/logs/*Log.final.out \
56 | ~/unix_lesson/rnaseq/results/counts/Mov10_featurecounts.txt.summary
57 | ```
58 | 
59 | > If you want to save the output on the terminal into a log file, you can use `2>` operator to redirect it to a file.
60 | 
61 | The output of multiQC is 1 HTML file and a data folder. Let's transfer the interactive HTML report over to our laptops using FileZilla and visualize the outputs of the 3 tools we used to generate the report.
62 | 
63 | The multiQC report is relatively quick to generate and provides a really clear method for comparing the samples to determine consistency, and to identify problematic samples.
64 | 
65 | ---
66 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
67 | 


--------------------------------------------------------------------------------
/lessons/07_automating_workflow.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Automating an RNA-Seq workflow"
  3 | author: "Bob Freeman, Meeta Mistry, Radhika Khetani"
  4 | date: "Tuesday, August 22, 2017"
  5 | ---
  6 | 
  7 | ## Learning Objectives:
  8 | 
  9 | * Automate a workflow by grouping a series of sequential commands into a script
 10 | * Modify and submit the workflow script to the cluster
 11 | 
 12 | ## Automating the analysis path from Sequence reads to Count matrix
 13 | 
 14 | Once you have optimized all the tools and parameters using a single sample (likely using an interactive session), you can write a script to run the whole workflow on all the samples in parallel.
 15 | 
 16 | This will ensure that you run every sample with the exact same parameters, and will enable you to keep track of all the tools and their versions. In addition, the script is like a lab notebook; in the future, you (or your colleagues) can go back and check the workflow for methods, which enables efficiency and reproducibility.
 17 | 
 18 | Before we start with the script, let's check how many cores our interactive session has by using `squeue`. 
 19 | 
 20 | ```bash
 21 | $ sacct
 22 | ```
 23 | 
 24 | We need to have an interactive session with 6 cores, if you already have one you are set. If you have a session with fewer cores then `exit` out of your current interactive session and start a new one with `-n 6`.
 25 | 
 26 | ```bash
 27 | $ srun --pty -p short -t 0-12:00 -c 6 --mem 8G --reservation=HBC /bin/bash
 28 | ```
 29 | 
 30 | ### More Flexibility with variables
 31 | 
 32 | We can write a shell script that will run on a specific file, but to make it more flexible and efficient we would prefer that it lets us give it an input fastq file when we run the script. To be able to provide an input to any shell script, we need to use **Positional Parameters**.
 33 | 
 34 | For example, we can refer to the components of the following command as numbered variables **within** the actual script:
 35 | 
 36 | ```bash
 37 | # * DO NOT RUN *
 38 | sh  run_rnaseq.sh  input.fq  input.gtf  12
 39 | ```
 40 | 
 41 | `$0` => run_rnaseq.sh
 42 | 
 43 | `$1` => input.fq
 44 | 
 45 | `$2` => input.gtf
 46 | 
 47 | `$3` => 12
 48 | 
 49 | The variables $1, $2, $3,...$9 and so on are **positional parameters** in the context of the shell script, and can be used within the script to refer to the files/number specified on the command line. Basically, the script is written with the expectation that $1 will be a fastq file and $2 will be a GTF file, and so on.
 50 | 
 51 | *There can be virtually unlimited numbers of inputs to a shell script, but it is wise to only have a few inputs to avoid errors and confusion when running a script that used positional parameters.*
 52 | 
 53 | > [This is an example of a simple script that used the concept of positional parameters and the associated variables](http://steve-parker.org/sh/eg/var3.sh.txt). You should try this script out after the class to get a better handle on positional parameters for shell scripting.
 54 | 
 55 | Let's use this new concept in the script we are writing. We want the first positional parameter ($1) to be the name of our fastq file. We could just use the variable `$1` throughout the script to refer to the fastq file, but this variable name is not intuitive, so we want to create a new variable called `fq` and copy the contents of `$1` into it.
 56 | 
 57 | First, we need to start a new script called `rnaseq_analysis_on_input_file.sh` in the `~/unix_lesson/rnaseq/scripts/` directory:
 58 | 
 59 | ```bash
 60 | $ cd ~/unix_lesson/rnaseq/scripts/
 61 | 
 62 | $ vim rnaseq_analysis_on_input_file.sh
 63 | ```
 64 | 
 65 | ```bash
 66 | #!/bin/bash/
 67 | 
 68 | # initialize a variable with an intuitive name to store the name of the input fastq file
 69 | 
 70 | fq=$1
 71 | ```
 72 | 
 73 | > When we set up variables we do not use the `$` before it, but when we *use the variable*, we always have to have the `$` before it. >
 74 | >
 75 | > For example: 
 76 | >
 77 | > initializing the `fq` variable => `fq=$1`
 78 | >
 79 | > using the `fq` variable => `fastqc $fq`
 80 | 
 81 | To ensure that all the output files from the workflow are properly named with sample IDs we should extract the "base name" (or sample ID) from the name of the input file.
 82 | 
 83 | ```
 84 | # grab base of filename for naming outputs
 85 | 
 86 | base=`basename $fq .subset.fq`
 87 | echo "Sample name is $base"           
 88 | ```
 89 | 
 90 | > **Remember `basename`?**
 91 | >
 92 | > 1. the `basename` command: this command takes a path or a name and trims away all the information before the last `\` and if you specify the string to clear away at the end, it will do that as well. In this case, if the variable `$fq` contains the path *"~/unix_lesson/rnaseq/raw_data/Mov10_oe_1.subset.fq"*, `basename $fq .subset.fq` will output "Mov10_oe_1".
 93 | > 2. to assign the value of the `basename` command to the `base` variable, we encapsulate the `basename...` command in backticks. This syntax is necessary for assigning the output of a command to a variable.
 94 | 
 95 | Next we want to specify how many cores the script should use to run the analysis. This provides us with an easy way to modify the script to run with more or fewer cores without have to replace the number within all commands where cores are specified.
 96 | 
 97 | ```
 98 | # specify the number of cores to use
 99 | 
100 | cores=6
101 | ```
102 | Next we'll initialize 2 more variables named `genome` and `gtf`, these will contain the paths to where the reference files are stored. This makes it easier to modify the script for when you want to use a different genome, i.e. you'll just have to change the contents of these variable at the beginning of the script.
103 | 
104 | ```
105 | # directory with genome reference FASTA and index files + name of the gene annotation file
106 | 
107 | genome=/n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/ensembl38_STAR_index/
108 | gtf=/n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.gtf
109 | ```
110 | 
111 | We'll create output directories, but with the `-p` option. This will make sure that `mkdir` will create the directory only if it does not exist, and it won't throw an error if it does exist.
112 | 
113 | ```
114 | # make all of the output directories
115 | # The -p option means mkdir will create the whole path if it 
116 | # does not exist and refrain from complaining if it does exist
117 | 
118 | mkdir -p ~/unix_lesson/rnaseq/results/fastqc
119 | mkdir -p ~/unix_lesson/rnaseq/results/STAR
120 | mkdir -p ~/unix_lesson/rnaseq/results/counts
121 | ```
122 | 
123 | Now that we have already created our output directories, we can now specify variables with the path to those directories both for convenience but also to make it easier to see what is going on in a long command.
124 | 
125 | ```
126 | # set up output filenames and locations
127 | 
128 | fastqc_out=~/unix_lesson/rnaseq/results/fastqc
129 | align_out=~/unix_lesson/rnaseq/results/STAR/${base}_
130 | counts_input_bam=~/unix_lesson/rnaseq/results/STAR/${base}_Aligned.sortedByCoord.out.bam
131 | counts=~/unix_lesson/rnaseq/results/counts/${base}_featurecounts.txt
132 | ```
133 | 
134 | ### Keeping track of tool versions
135 | 
136 | All of our variables are now staged. Next, let's make sure all the modules are loaded. This is also a good way to keep track of the versions of tools that you are using in the script:
137 | 
138 | ```
139 | # set up the software environment
140 | 
141 | module load fastqc/0.11.3
142 | module load gcc/6.2.0  
143 | module load star/2.5.2b
144 | module load samtools/1.3.1
145 | export PATH=/n/app/bcbio/tools/bin:$PATH 	# for using featureCounts if not already in $PATH
146 | ```
147 | 
148 | ### Preparing for future debugging
149 | 
150 | In the script, it is a good idea to use `echo` for debugging. `echo` basically displays the string of characters specified within the quotations. When you have strategically place `echo` commands specifying what stage of the analysis is next, in case of failure you can determine the last `echo` statement displayed to troubleshoot the script.
151 | 
152 | ```
153 | echo "Processing file $fq"
154 | ```
155 | 
156 | > You can also use `set -x`:
157 | >
158 | > `set -x` is a debugging tool that will make bash display the command before executing it. In case of an issue with the commands in the shell script, this type of debugging lets you quickly pinpoint the step that is throwing an error. Often, tools will display the error that caused the program to stop running, so keep this in mind for times when you are running into issues where this is not available.
159 | > You can turn this functionality off by saying `set +x`
160 | 
161 | ### Running the tools
162 | 
163 | ```
164 | # Run FastQC and move output to the appropriate folder
165 | fastqc $fq
166 | 
167 | # Run STAR
168 | STAR --runThreadN $cores --genomeDir $genome --readFilesIn $fq --outFileNamePrefix $align_out --outSAMtype BAM SortedByCoordinate --outSAMunmapped Within --outSAMattributes Standard
169 | 
170 | # Create BAM index
171 | samtools index $counts_input_bam
172 | 
173 | # Count mapped reads
174 | featureCounts -T $cores -s 2 -a $gtf -o $counts $counts_input_bam
175 | ```
176 | 
177 | ### Last addition to the script
178 | 
179 | It is best practice to have the script **usage** specified at the top any script. This should have information such that when your future self, or a co-worker, uses the script they know what it will do and what input(s) are needed. For our script, we should have the following lines of comments right at the top after `#!/bin/bash/`:
180 | 
181 | ```
182 | # This script takes a fastq file of RNA-Seq data, runs FastQC and outputs a counts file for it.
183 | # USAGE: sh rnaseq_analysis_on_input_file.sh <name of fastq file>
184 | ```
185 | 
186 | It is okay to specify this after everything else is set up, since you will have most clarity about the script only once it is fully done.
187 | 
188 | ### Saving and running script
189 | 
190 | To transfer the contents of the script to O2, you can copy and paste the contents into a new file using `vim`. 
191 | 
192 | ```bash
193 | $ cd ~/unix_lesson/rnaseq/scripts/
194 | 
195 | $ vim rnaseq_analysis_on_input_file.sh 
196 | ```
197 | > *Alternatively, you can save the script on your computer and transfer it to `~/unix_lesson/rnaseq/scripts/` using FileZilla.*
198 | 
199 | 
200 | We should all have an interactive session with 6 cores, so we can run the script as follows:
201 | 
202 | ```bash
203 | $ sh rnaseq_analysis_on_input_file.sh ~/unix_lesson/rnaseq/raw_data/Mov10_oe_1.subset.fq
204 | ```
205 | 
206 | ## Running the script to submit jobs in parallel to the SLURM scheduler
207 | 
208 | The above script will run in an interactive session **one file at a time**. But the whole point of writing this script was to run it on all files at once. How do you think we can do this?
209 | 
210 | To run the above script **"in serial"** for all of the files on a worker node via the job scheduler, we can create a separate submission script that will need 2 components:
211 | 
212 | 1. **SLURM directives** at the **beginning** of the script. This is so that the scheduler knows what resources we need in order to run our job on the compute node(s).
213 | 2. a **`for`** loop that iterates through and runs the above script for all the fastq files.
214 | 
215 | Below is what this second script (`rnaseq_analysis_on_allfiles.slurm`) would look like **\[DO NOT RUN THIS\]**:
216 | 
217 | ```
218 | #!/bin/bash
219 | 
220 | #SBATCH -p medium 		# partition name
221 | #SBATCH -t 0-2:00 		# hours:minutes runlimit after which job will be killed
222 | #SBATCH -c 6 		# number of cores requested -- this needs to be greater than or equal to the number of cores you plan to use to run your job
223 | #SBATCH --job-name STAR_mov10 		# Job name
224 | #SBATCH -o %j.out			# File to which standard out will be written
225 | #SBATCH -e %j.err 		# File to which standard err will be written
226 | 
227 | # this `for` loop, will take the fastq files as input and run the script for all of them one after the other. 
228 | for fq in ~/unix_lesson/rnaseq/raw_data/*.fq
229 | do
230 |   echo "running analysis on $fq"
231 |   rnaseq_analysis_on_input_file.sh $fq
232 | done
233 | ```
234 | 
235 | **But we don't want to run the analysis on these 6 samples one after the other!** We want to run them "in parallel" as 6 separate jobs. 
236 | 
237 | **Note:** If you create and run the above script, or something similar to it, i.e. with SLURM directives at the top, you should give the script name `.run` or `.slurm` as the extension. This will make it obvious that it is meant to submit jobs to the SLURM scheduler. 
238 | 
239 | ***
240 | **Exercise**
241 | 
242 | How would you run `rnaseq_analysis_on_allfiles.slurm`, i.e. the above script?
243 | 
244 | ***
245 | 
246 | ## Parallelizing the analysis for efficiency
247 | 
248 | Parallelization will save you a lot of time with real (large) datasets. To parallelize our analysis, we will still need to write a second script that will call the script we just wrote that takes a fastq file as input (rnaseq_analysis_on_input_file.sh). We will still use a `for` loop, but we will be creating a regular shell script and we will be specifying the SLURM directives differently. 
249 | 
250 | Use `vim` to start a new shell script called `rnaseq_analysis_on_allfiles-for_slurm.sh`: 
251 | 
252 | ```bash
253 | $ vim rnaseq_analysis_on_allfiles_for-slurm.sh
254 | ```
255 | 
256 | This script loops through the same files as in the previous (demo) script, but the command being submitted within the `for` loop is `sbatch` with SLURM directives specified on the same line:
257 | 
258 | ```bash
259 | #! /bin/bash
260 | 
261 | for fq in ~/unix_lesson/rnaseq/raw_data/*.fq
262 | do
263 | 
264 | sbatch -p short -t 0-2:00 -c 6 --job-name rnaseq-workflow --wrap="sh ~/unix_lesson/rnaseq/scripts/rnaseq_analysis_on_input_file.sh $fq"
265 | sleep 1	# wait 1 second between each job submission
266 |   
267 | done
268 | ```
269 | > Please note that after the `sbatch` directives the command `sh ~/unix_lesson/rnaseq/scripts/rnaseq_analysis_on_input_file.sh $fq` is in quotes.
270 | 
271 | What you should see on the output of your screen would be the jobIDs that are returned from the scheduler for each of the jobs that your script submitted.
272 | 
273 | You can use `sacct` to check progress.
274 | 
275 | Don't forget about the `scancel` command, should something go wrong and you need to cancel your jobs.
276 | 
277 | > **NOTE:** All job schedulers are similar, but not the same. Once you understand how one works, you can transition to another one without too much trouble. They all have their pros and cons which are considered by the system administrators when picking one for a given HPC environment. 
278 | 
279 | #### Generating a Count Matrix
280 | 
281 | The above script will generate separate count files for each sample. Hence, after the script has run, you will have to merge them using `paste` and do some cleanup using `cut` to generate a full count matrix wherein the first column is gene names and the rest of the columns are gene counts in each sample (as shown below).
282 | 
283 | <img src="../img/count_matrix.png" width="500">
284 | 
285 | Alternatively, you could remove featureCounts from the original script, and run it after all the jobs finish generating BAM files.
286 | 
287 | ---
288 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
289 | 
290 | * *The materials used in this lesson was derived from work that is Copyright © Data Carpentry (http://datacarpentry.org/). 
291 | All Data Carpentry instructional material is made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).*
292 | 


--------------------------------------------------------------------------------
/lessons/08_salmon.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Quantification of transcript abundance using Salmon"
  3 | author: "Mary Piper and Meeta Mistry"
  4 | date: "Wednesday, September 20, 2017"
  5 | ---
  6 | 
  7 | Approximate time: 1.25 hours
  8 | 
  9 | ## Learning Objectives
 10 | 
 11 | * Explore using lightweight algorithms to quantify reads to pseudocounts
 12 | * Understand how Salmon performs quasi-mapping and transcript abundance estimation
 13 | 
 14 | 
 15 | ## Lightweight alignment and quantification of gene expression
 16 | 
 17 | In the standard RNA-seq pipeline that we have presented so far, we have taken our reads post-QC and aligned them to the genome using our transcriptome annotation (GTF) as guidance. The goal is to identify the genomic location where these reads originated from. **Another strategy for quantification which has more recently been introduced involves transcriptome mapping**. Tools that fall in this category include [Kallisto](https://pachterlab.github.io/kallisto/about), [Sailfish](http://www.nature.com/nbt/journal/v32/n5/full/nbt.2862.html) and [Salmon](https://combine-lab.github.io/salmon/); each working slightly different from one another. (For this workshop we will explore Salmon in more detail.) Common to all of these tools is that **base-to-base alignment of the reads is avoided**, which is a time-consuming step, and these tools **provide quantification estimates much faster than do standard approaches** (typically more than 20 times faster) with **improvements in accuracy** at **the transcript level**. 
 18 | 
 19 | These transcript expression estimates, often referred to as 'pseudocounts', can be converted for use with DGE tools like DESeq2 (using [tximport](https://bioconductor.org/packages/release/bioc/html/tximport.html)) or the estimates can be used directly for isoform-level differential expression using a tool like [Sleuth](http://www.biorxiv.org/content/biorxiv/early/2016/06/10/058164.full.pdf). 
 20 | 
 21 | <img src="../img/salmon_workflow_updated.png" width="400">
 22 | 
 23 | The improvement in accuracy for lightweight alignment tools in comparison with the standard alignment/counting methods primarily relate to the ability of the lightweight alignment tools to quantify multimapping reads. This has been shown by Robert et. al by comparing the accuracy of 12 different alignment/quantification methods using simulated data to estimate the gene expression of 1000 perfect RNA-Seq read pairs from each of of the genes [[1](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0734-x)]. As shown in the figures below taken from the paper, the **standard alignment and counting methods such as STAR/htseq or Tophat2/htseq result in underestimates of many genes - particularly those genes comprised of multimapping reads** [[1](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0734-x)]. 
 24 | 
 25 | <img src="../img/pseudo_count_comparison-star_sm.png" width="400">
 26 | 
 27 | _**NOTE:** Scatter plots are comparing FPKM for each of the 12 methods against the known FPKM from simulated data. The red line indicates the y = x line. For histograms of read counts, we expect a single peak at 1000 therefore tails represent over and under estimates._ 
 28 | 
 29 | While the STAR/htseq standard method of alignment and counting is a bit conservative and can result in false negatives, **Cufflinks tends to overestimate gene expression and results in many false positives**, which is why Cufflinks is generally not recommended for gene expression quantification.
 30 | 
 31 | <img src="../img/pseudo_count_comparison-cufflinks.png" width="750">
 32 | 
 33 | Finally, the most accurate quantification of gene expression was achieved using the lightweight alignment tool Sailfish (if used without bias correction).
 34 | 
 35 | <img src="../img/pseudo_count_comparison-sailfish_sm.png" width="400">
 36 | 
 37 | Lightweight alignment tools such as Sailfish, Kallisto, and Salmon have generally been found to yield the most accurate estimations of transcript/gene expression. Salmon is considered to have some improvements relative to Sailfish, and it is considered to give very similar results to Kallisto. 
 38 | 
 39 | 
 40 | ### What is Salmon?
 41 | 
 42 | [Salmon](http://salmon.readthedocs.io/en/latest/salmon.html#using-salmon) is based on the philosophy of lightweight algorithms, which use the reference transcriptome (in FASTA format) and raw sequencing reads (in FASTQ format) as input, but do not align the full reads. These tools perform both mapping and quantification. Unlike most lightweight and standard alignment/quantification tools, **Salmon utilizes sample-specific bias models for transcriptome-wide abundance estimation**. Sample-specific bias models are helpful when needing to account for known biases present in RNA-Seq data including:
 43 | 
 44 | - GC bias
 45 | - positional coverage biases
 46 | - sequence biases at 5' and 3' ends of the fragments
 47 | - fragment length distribution
 48 | - strand-specific methods
 49 | 
 50 | If not accounted for, these biases can lead to unacceptable false positive rates in differential expression studies [[2](http://salmon.readthedocs.io/en/latest/salmon.html#quasi-mapping-based-mode-including-lightweight-alignment)]. The **Salmon algorithm can learn these sample-specific biases and account for them in the transcript abundance estimates**. Salmon is extremely fast at "mapping" reads to the transcriptome and often more accurate than standard approaches [[2](http://salmon.readthedocs.io/en/latest/salmon.html#quasi-mapping-based-mode-including-lightweight-alignment)]. 
 51 | 
 52 | 
 53 | ### How does Salmon estimate transcript abundances?
 54 | 
 55 | Similar to standard base-to-base alignment approaches, the quasi-mapping approach utilized by Salmon requires a reference index to determine the position and orientation information for where the fragments best map prior to quantification [[3](https://academic.oup.com/bioinformatics/article/32/12/i192/2288985/RapMap-a-rapid-sensitive-and-accurate-tool-for)]. 
 56 | 
 57 | #### **Indexing** 
 58 | 
 59 | This step involves creating an index to evaluate the sequences for all possible unique sequences of length k (kmer) in the **transcriptome** (genes/transcripts).
 60 | 
 61 | **The index helps creates a signature for each transcript in our reference transcriptome.** The Salmon index has two components:
 62 | 
 63 | 1. a **suffix array** (SA) of the reference transcriptome sequence
 64 | 2. a **hash table** to map each k-mer occurring in the reference transcriptome to it's location in the SA (is not required, but improves the speed of mapping drastically)
 65 | 
 66 | #### **Quasi-mapping and quantification** 
 67 | 
 68 | The quasi-mapping approach estimates the numbers of reads mapping to each transcript, and is described below. We have details and a schematic provided by the Rapmap tool [[3](https://academic.oup.com/bioinformatics/article/32/12/i192/2288985/RapMap-a-rapid-sensitive-and-accurate-tool-for)], which provides the underlying algorithm for the quasi-mapping.
 69 | 
 70 | - **Step 1: Quasi mapping and abundance estimation**
 71 | 
 72 | 	<img src="../img/salmon_quasialignment.png" width="750">
 73 | 	
 74 | 	>RapMap: a rapid, sensitive and accurate tool for mapping RNA-seq reads to transcriptomes. A. Srivastava, H. Sarkar, N. Gupta, R. Patro. Bioinformatics (2016) 32 (12): i192-i200.
 75 | 	
 76 | 	The quasi-mapping procedure performs the following steps [[3](https://academic.oup.com/bioinformatics/article/32/12/i192/2288985/RapMap-a-rapid-sensitive-and-accurate-tool-for)]:
 77 | 
 78 | 	1. The read is scanned from left to right until a k-mer that appears in the hash table is discovered.
 79 | 	2. The k-mer is looked up in the hash table and the SA intervals are retrieved, giving all suffixes containing that k-mer
 80 | 	3. Similar to STAR, the maximal matching prefix (MMP) is identified by finding the longest read sequence that exactly matches the reference suffixes.
 81 | 	4. Instead of searching for the next MMP in the read as we do with STAR, Salmon uses a k-mer skipping approach to identify the next informative position (NIP). In this way the SA search is likely to return a different set of transcripts than those returned for the previous MMP. Often natural variation or a sequencing error in the read is the cause of the mismatch from the reference, so beginning the search at this position would likely return the same set of transcripts. 
 82 | 	5. This process is repeated until the end of the read.
 83 | 	6. The final set of mappings is determined by a consensus mechanism. Specifically, the algorithm reports the intersection of transcripts appearing in all MMPs for that read. The transcripts, orientation and transcript location are output for each read.
 84 | 	
 85 | >
 86 | > **NOTE:** If there are k-mers in the reads that are not in the index, they are not counted. As such, trimming is not required when using this method. Accordingly, if there are reads from transcripts not present in the reference transcriptome, they will not be quantified. Quantification of the reads is only as good as the quality of the reference transcriptome.
 87 | 	
 88 | 	
 89 | - **Step 2: Improving abundance estimates**
 90 | Using multiple complex modeling approaches, like Expectation Maximization (EM), Salmon can also correct the abundance estimates for any sample-specific biases/factors [[4](http://www.nature.com.ezp-prod1.hul.harvard.edu/nmeth/journal/v14/n4/full/nmeth.4197.html?WT.feed_name=subjects_software&foxtrotcallback=true)]. Generally, this step results in more accurate transcript abundance estimation.
 91 | 
 92 | ## Running Salmon on O2
 93 | 
 94 | First start an interactive session and create a new directory for our Salmon analysis:
 95 | 
 96 | ```bash
 97 | $ srun --pty -p short -t 0-12:00 --mem 8G --reservation=HBC /bin/bash
 98 | 
 99 | $ mkdir ~/unix_lesson/rnaseq/salmon
100 | 
101 | $ cd ~/unix_lesson/rnaseq/salmon
102 | ```   
103 | 
104 | > Salmon is not available as a module on O2, but it is installed as part of the bcbio pipeline. Since we already have the appropriate path (`/n/app/bcbio/tools/bin/`) in our `$PATH` variable we can use it by simply typing in `salmon`.     
105 | 
106 | As you can imagine from the description above, when running Salmon there are also two steps.
107 | 
108 | **Step 1: Indexing**
109 |  "Index" the transcriptome using the `index` command:
110 |     
111 | ```bash
112 | ## DO NOT RUN THIS CODE
113 | $ salmon index -t transcripts.fa -i transcripts_index --type quasi -k 31
114 | ```
115 | > **NOTE:** Default for salmon is --type quasi and -k 31, so we do not need to include these parameters in the index command. The kmer default of 31 is optimized for 75bp or longer reads, so if your reads are shorter, you may want a smaller kmer to use with shorter reads (kmer size needs to be an odd number).
116 | 
117 | > 
118 | **We are not going to run this in class, but it only takes a few minutes.** We will be using an index we have generated from transcript sequences (all known transcripts/ splice isoforms with multiples for some genes) for human. The transcriptome data (FASTA) was obtained from the [Ensembl ftp site](ftp://ftp.ensembl.org/pub/current_fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz).
119 | 
120 | 
121 | **Step 2: Quantification:**
122 | Get the transcript abundance estimates using the `quant` command and the parameters described below (more information on parameters can be found [here](http://salmon.readthedocs.io/en/latest/salmon.html#id5)):
123 | 
124 | * `-i`: specify the location of the index directory; for us it is `n/groups/hbctraining/ngs-data-analysis-longcourse/rnaseq/salmon.ensembl38.idx`
125 | * `-l A`: auto-detect library type. *You can also specify stranded single-end reads, more info available [here](http://salmon.readthedocs.io/en/latest/salmon.html#what-s-this-libtype)*
126 | * `-r`: list of files for sample
127 | * `-o`: output quantification file name
128 | * `--seqBias` will enable Salmon to learn and correct for sequence-specific biases in the input data
129 | * `--useVBOpt`: use variational Bayesian EM algorithm rather than the ‘standard EM’ to optimize abundance estimates (more accurate) 
130 | * `--writeMappings=salmon.out`: creates a SAM-like file of all of the mappings. **We won't add this parameter, as it creates large files that will take up too much space in your home directory.** 
131 |    
132 | To run the quantification step on a single sample we have the command provided below. Let's try running it on our subset sample for `Mov10_oe_1.subset.fq`:
133 | 
134 | ```bash
135 | $ salmon quant -i /n/groups/hbctraining/ngs-data-analysis-longcourse/rnaseq/salmon.ensembl38.idx \
136 |  -l A \
137 |  -r ~/unix_lesson/rnaseq/raw_data/Mov10_oe_1.subset.fq \
138 |  -o Mov10_oe_1.subset.salmon \
139 |  --seqBias \
140 |  --useVBOpt
141 | ```
142 | 
143 | >**NOTE 1:** Paired-end reads require both sets of reads to be given in addition to a [paired-end specific library type](http://salmon.readthedocs.io/en/latest/salmon.html#what-s-this-libtype):
144 | `salmon quant -i transcripts_index -l <LIBTYPE> -1 reads1.fq -2 reads2.fq -o transcripts_quant`
145 | > 
146 | 
147 | > **NOTE 2:** To have Salmon correct for other RNA-Seq biases you will need to specify the appropriate parameters when you run it. Before using these parameters it is advisable to assess your data using tools like [Qualimap](http://qualimap.bioinfo.cipf.es/) to look specifically for the presence of these biases in your data and decide on which parameters would be appropriate. 
148 | > 
149 | > To correct for the various sample-specific biases you could add the following parameters to the Salmon command:
150 | >
151 | > * `--gcBias` to learn and correct for fragment-level GC biases in the input data
152 | > * `--posBias` will enable modeling of a position-specific fragment start distribution
153 | 
154 | 
155 | ## Salmon output
156 | 
157 | You should see a new directory has been created that is named by the string value you provided in the `-o` command. Take a look at what is contained in this directory:
158 | 
159 |     $ ls -l Mov10_oe_1.subset.salmon/
160 |     
161 | There is a logs directory, which contains all of the text that was printed to screen as Sailfish was running. Additionally, there is a file called `quant.sf`. 
162 | 
163 | This is the **quantification file** in which each row corresponds to a transcript, listed by Ensembl ID, and the columns correspond to metrics for each transcript:
164 | 
165 | ```bash
166 | Name    Length  EffectiveLength TPM     NumReads
167 | ENST00000456328 1657    1407.000        0.000000        0.000
168 | ENST00000450305 632     382.000 0.000000        0.000
169 | ENST00000488147 1351    1101.000        0.000000        0.000
170 | ENST00000619216 68      3.000   0.000000        0.000
171 | ENST00000473358 712     462.000 0.000000        0.000
172 | ENST00000469289 535     285.000 0.000000        0.000
173 | ENST00000607096 138     5.000   0.000000        0.000
174 | ENST00000417324 1187    937.000 0.000000        0.000
175 | 
176 | ....
177 | 
178 | ```
179 | 
180 | *  The first two columns are self-explanatory, the **name** of the transcript and the **length of the transcript** in base pairs (bp). 
181 | *  The **effective length** represents the various factors that effect the length of transcript (i.e degradation, technical limitations of the sequencing platform)
182 | * Salmon outputs ‘pseudocounts’ which predict the relative abundance of different isoforms in the form of three possible metrics (KPKM, RPKM, and TPM). **TPM (transcripts per million)** is a commonly used normalization method as described in [[1]](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2820677/) and is computed based on the effective length of the transcript.
183 | * Estimated **number of reads** (an estimate of the number of reads drawn from this transcript given the transcript’s relative abundance and length)
184 | 
185 |  
186 | ## Running Salmon on multiple samples 
187 | 
188 | We just ran Salmon on a single sample (and keep in mind only on a subset of chr1 from the original data). To obtain meaningful results we need to run this on **all samples for the full dataset**. To do so, we will need to create a job submission script.
189 | 
190 | > *NOTE:* We are iterating over FASTQ files in the **full dataset directory**, located at `/n/groups/hbctraining/ngs-data-analysis-longcourse/rnaseq/full_dataset`
191 | 
192 | 
193 | ### Create a job submission script to run Salmon in serial
194 | 
195 | Since Salmon is only able to take a single file as input, one way in which we can do this is to use a for loop to run Salmon on all samples in serial. What this means is that Salmon will process the dataset one sample at a time.
196 | 
197 | Let's start by opening up a script in `vim`:
198 | 
199 | 	$ vim salmon_all_samples.sbatch
200 | 
201 | 
202 | Let's start our script with a **shebang line followed by SBATCH directives which describe the resources we are requesting from O2**. We will ask for 6 cores and take advantage of Salmon's multi-threading capabilities. Note that we also removed the `--reservation` from our SBATCH options.
203 | 
204 | Next we will do the following:
205 | 
206 | 1. **Create a for loop to iterate over all FASTQ samples**. 
207 | 2. Inside the loop we will create a variable that stores the prefix we will use for naming output files.
208 | 3. Then we run Salmon. 
209 | 
210 | > **NOTE:** We have **added a couple of new parameters**. First, since we are **multithreading** with 6 cores we will use `-p 6`. Another new parameter we have added is called `--numBootstraps`. Salmon has the ability to optionally compute bootstrapped abundance estimates. **Bootstraps are required for estimation of technical variance**. We will discuss this in more detail when we talk about transcript-level differential expression analysis.
211 | 
212 | The final script is shown below:
213 | 
214 | ```
215 | #!/bin/bash
216 | 
217 | #SBATCH -p short 
218 | #SBATCH -c 6 
219 | #SBATCH -t 0-12:00 
220 | #SBATCH --mem 8G 
221 | #SBATCH --job-name salmon_in_serial 
222 | #SBATCH -o %j.out 
223 | #SBATCH -e %j.err
224 | 
225 | cd ~/unix_lesson/rnaseq/salmon
226 | 
227 | for fq in /n/groups/hbctraining/ngs-data-analysis-longcourse/rnaseq/full_dataset/*.fastq 
228 | 
229 | do
230 | 
231 | # create a prefix
232 | base=`basename $fq .fastq`
233 | 
234 | # run salmon
235 | salmon quant -i /n/groups/hbctraining/ngs-data-analysis-longcourse/rnaseq/salmon.ensembl38.idx \
236 |  -l A \
237 |  -r $fq \
238 |  -p 6 \
239 |  -o $base.salmon \
240 |  --seqBias \
241 |  --useVBOpt \
242 |  --numBootstraps 30
243 | 
244 | done
245 | 
246 | ```
247 | 
248 | Save and close the script. This is now ready to run.
249 | 
250 | 	$ sbatch salmon_all_samples.sbatch
251 | 
252 | Once you have run Salmon on all of your samples, you will need to decide whether you would like to perform gene-level or isoform-level analysis. The **output directory from Salmon for each sample will be required as input for any of these downstream tools**. In our standard workflow we ended up with a count matrix where all expression data was nicely summarized into a single file, but with this alternative approach the downstream tools for differential expression will take care of compiling it for you.
253 | 
254 | ***
255 | 
256 | **Exercise**
257 | 
258 | We learned in the [Automation lesson](https://hbctraining.github.io/Intro-to-rnaseq-hpc-O2/lessons/07_automating_workflow.html) that we can be more efficient by running our jobs in parallel. In this way, each sample is run as an independent job and not waiting for the previous sample to finish. **Create a new script to run Salmon in parallel**.
259 | 
260 | ***
261 | 
262 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
263 | 
264 | 
265 | 
266 | 
267 | 
268 | 


--------------------------------------------------------------------------------
/lessons/DE_analysis.md:
--------------------------------------------------------------------------------
  1 | ## Learning Objectives:
  2 | -------------------
  3 | 
  4 | * Learning how to run R scripts from the command line
  5 | * Use the count matrix as input to an R script for differential expression analysis
  6 | * Apply Unix commands to look at the results that are generated and extract relevant information
  7 | * Familiarize yourself with various functional analysis tools for gene lists
  8 | 
  9 | 
 10 | ## Differential expression analysis
 11 | -------------------
 12 | 
 13 | At the end of the workflow from the last lesson, our final end product was a count matrix. This is a matrix in which each row represents a gene (or feature) and each column corresponds to a sample. In our dataset, we have two sample classes (control and Mov10oe) and we want to assess the difference in expression between these groups on a gene-by-gene basis.
 14 | 
 15 | <p align="center">
 16 | <img src="../img/de_variation.png" width="600">
 17 | </p>
 18 | 
 19 | _Illustration taken from slides courtesy of Dr. Paul Pavlidis, UBC_ 
 20 | 
 21 | Since we know which samples belong to which group, we could just compute a fold-change for each gene and then rank genes by that value. Easy, right? Not exactly. 
 22 | 
 23 | The problem is, the **gene expression changes** we observe are not just a result of the differences between the groups that we are investigating, rather it **is a measurement of the sum of many effects**. In a set of biological samples the transcriptional patterns can be associated not only with our experimetal variable(s) but also many extraneous factors; some that we are aware of (i.e demographic factors, batch information) and sources that are unknown. The goal of differential expression analysis to determine the relative role of these effects, and to **separate the “interesting” from the “uninteresting”.**
 24 | 
 25 | 
 26 | ### Statistical models in R
 27 | 
 28 | [R](https://www.r-project.org/) is a software environment for statistical computing and graphics. R is widely used in the field of bioinformatics, amongst various other disciplines. 
 29 | 
 30 | <p align="center">
 31 | <img src="../img/why_R.png" width="600">
 32 | </p>
 33 | 
 34 | It can be locally installed on almost all operating systems (and it's free!), with numerous packages available that help in increasing efficency of data handling, data manipulation and data analysis. Discussing the specifics about R is outside the scope of this course. However, we encourage you to take a look at some of the R resources listed below if you are interested in learning more. 
 35 | 
 36 | R is a powerful language that can be very useful for NGS data analysis, and there are many popular packages for working with RNA-Seq count data. Some of these packages include [edgeR](https://www.bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf), [DESeq2](http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf), and [limma-voom](http://www.genomebiology.com/2014/15/2/R29). All of these tools use statistical modeling of the count data to test each gene against the null hypothesis and evaluate whether or not it is significantly differentially expressed. 
 37 | 
 38 | <p align="center">
 39 | <img src="../img/de_norm_counts_var.png" width="400">
 40 | </p>
 41 | 
 42 | These methods determine, for each gene, whether the differences in expression (counts) **between groups** is significant given the amount of variation observed **within groups** (replicates). To test for significance, we need an appropriate statistical model that accurately performs normalization (to account for differences in sequencing depth, etc.) and variance modeling (to account for few numbers of replicates and large dynamic expression range). The details on how each package works is described thoroughly within each of the respective vignettes.
 43 | 
 44 | ### Running R scripts
 45 | 
 46 | In order to run R on O2, let's first **log on to the cluster and start an interactive session with a single core**.
 47 | 
 48 | Once you are in an interactive session, navigate to the `rnaseq` directory:
 49 | 
 50 | 	$ cd ~/unix_lesson/rnaseq
 51 | 
 52 | We will be running an R script that uses the R package [DESeq2](http://bioconductor.org/packages/release/bioc/html/DESeq2.html) to identify differentially expressed genes. This package is available from [Bioconductor](https://www.bioconductor.org/), which is a repository of packages for the analysis of high-throughput genomic data. There are also a few other packages that are required to generate some additional figures.
 53 | 
 54 | We first need to load the R module and the GCC compiler:
 55 | 
 56 | ```bash
 57 | $ module load gcc/6.2.0 R/3.4.1
 58 | ```
 59 | You can open R by simply typing `R` at the command prompt and pressing `Enter`. You are now in the R console (note that the command prompt has changed to a `>` instead of a `$`):
 60 | 
 61 | <p align="center">
 62 | <img src="../img/R_screenshot2.png")
 63 | </p>
 64 | 
 65 | Installing packages can be timely and particularly cumbersome when doing this on a cluster environment. So rather than installing packages we have instructions for you to use the libraries from our installation. 
 66 | 
 67 | > **NOTE:**  Packages are bundles of code that perform functions and include detailed documentation on how to use those functions. Once installed, they are referred to as _libraries_. 
 68 | 
 69 | **To use the libraries we have created for you first exit R with:**
 70 | 
 71 | ```R
 72 | q()
 73 | ```
 74 | You should find yourself back at the shell command prompt. The next few lines will set the environment variable `R_LIBS_USER` to let R know where the R libraries directory resides.
 75 | 
 76 | ```bash
 77 | # check if the variable is already set 
 78 | $ echo $R_LIBS_USER 
 79 | 
 80 | # If the above command returns nothing, then run the command below
 81 | $ export R_LIBS_USER="/n/groups/hbctraining/R/library/"
 82 | ```
 83 | 
 84 | To run differential expression analysis, we are going to run a script from the `results` directory, so let's navigate there and create a directory for the results of our analysis. We will call the directory `diffexpression`:
 85 | 
 86 | ```bash
 87 | $ cd ~/unix_lesson/rnaseq/results
 88 | $ mkdir diffexpression
 89 | ```
 90 | First, let's copy over the script file:
 91 | 
 92 | ```bash
 93 | $ cp /n/groups/hbctraining/intro_rnaseq_hpc/DESeq2_script.R diffexpression/
 94 | ```
 95 | 
 96 | The DE script will require as input **1) your count matrix file** and **2) a metadata file**. The count matrix we generated in the last lesson and is in the `counts` directory. The metadata file is a tab-delimited file which contains any information associated with our samples. Each row corresponds to a sample and each column contains some information about each sample.
 97 | 
 98 | ```bash
 99 | $ cp ~/unix_lesson/other/Mov10_rnaseq_metadata.txt diffexpression
100 | ```
101 | > **NOTE:** If you _didn't generate this file in class_ we have a pre-computed count matrix generated that you can use:
102 | >  
103 | >  `$ cp /groups/hbctraining/intro_rnaseq_hpc/counts_STAR/Mov10_rnaseq_counts_complete.txt diffexpression`
104 | > 
105 | 
106 | Once you have the files copied, take a quick look at the metadata using `less`.
107 | 
108 | Now we're all setup to run our R script! Let's run it from within our `diffexpression` directory,
109 | ```bash
110 | $ cd diffexpression
111 | $ Rscript DESeq2_script.R Mov10_rnaseq_counts_complete.txt Mov10_rnaseq_metadata.txt 
112 | ```
113 | 
114 | > **NOTE:** You will notice chunks of code in the text that correspond to plotting figures, and these chunks have been commented out. The reason for this is in order to generate figures on **O2 you require the X11 system**, which we are currently not setup to do with the training accounts. If you are interested in learning more about using X11 applications you can [find out more on the O2 wiki page](https://wiki.rc.hms.harvard.edu/display/O2/Using+X11+Applications+Remotely). 
115 | 
116 | ### Gene list exploration
117 | 
118 | There are two results files generated from `DE_script.R`, a full table and significant genes table (at FDR < 0.05). Take a look at the significant results file and see what values have been reported:
119 | 
120 | ```bash
121 | $ head DEresults_sig_table.txt
122 | ```
123 | You should have a table with 7 columns in it:
124 | 
125 | 1. `Gene symbols` (this will not have a column name, due to the nature of the `write` function)
126 | 2. `baseMean`: the average normalized counts across all samples
127 | 3. `log2FoldChange`
128 | 4. `lfcse`: the standard error of the log2 FC
129 | 5. `stat`: the Wald test statistic
130 | 6. `pvalue`
131 | 7. `padj`: p-value adjusted for multiple test correction using the BH method
132 | 
133 | Since we have the full table of results for all genes, we could apply a filter based on the `padj` column to keep only genes we consider significant. We could also increase the stringency by adding in a fold change criteria. Alternatively, the full table can be useful for investigating groups of interesting genes of that are co-regulated but did not appear in our significant list.
134 | 
135 | Using `wc -l` find out how many genes are identified in the significant table. Keep in mind this is generated using the truncated dataset.
136 | 
137 | ```bash
138 | $ wc -l DEresults_sig_table.txt
139 | ```
140 | 
141 | For downstream analysis, the relevant information that we will require from this results table is the gene names and the FDR value. We can cut the columns to a new file and and use that as input to some functional analaysis tools.
142 | 
143 | ```bash
144 | $ cut -f1,7 DEresults_sig_table.txt > Mov10_sig_genelist.txt
145 | ```
146 | 
147 | Since the list we have is generated from analaysis on a small subset of chromosome 1, using these genes as input to downstream tools will not provide any meaningful results. As such, **we have generated a list using the full dataset for these samples and can be downloaded to your laptop via [this link]().** 
148 | 
149 | 
150 | ## Differential expression analysis using pseudocounts 
151 | ----------------------
152 | 
153 | In the script described above, we used count data generated from the standard RNA-seq workflow as input. The instructions are below to **perform a similar analysis with the output from Salmon, but on your local laptop**. To perform this analysis, you will need to use R and Rstudio directly. We do not have a script available that works on O2. 
154 | 
155 | **The rest of this section assumes that you are comfortable with R and RStudio.**
156 | 
157 | The output from Salmon is transcript counts, but DESeq2 works well only with gene counts. To bridge this gap, the developers of DESeq2 have developed a package makes the output of Salmon compatible with DESeq2. This package is called [`tximport`](https://bioconductor.org/packages/release/bioc/html/tximport.html) and is also available through Bioconductor. `tximport` imports transcript-level abundance, estimated counts and transcript lengths, and summarizes this into matrices for use with downstream gene-level analysis packages. 
158 | 
159 | First, you have to download the directory with the quant.sf files for the 8 full datasets using the link below. Once you have them downloaded continue to follow the rest of instructions:
160 | 
161 | 1. [Download Salmon files](https://www.dropbox.com/s/aw170f8zge01jpq/salmon.zip?dl=0)
162 | 2. Decompress (unzip) the zip archive and move the folder to an appropriate location (i.e `~/Desktop`)
163 | 3. Open RStudio and select 'File' -> 'New Project'  -> 'Existing Directory' and navigate to the `salmon` directory 
164 | 4. Open up a new R script ('File' -> 'New File' -> 'Rscript'), and save it as `salmon_de.R`
165 | 
166 | Your Rstudio interface should look something like the screenshot below:
167 | 
168 | <img src="../img/salmon_rstudio.png" size="600">
169 | 
170 | To perform this analysis you will have to install the following libraries:
171 | 
172 | `tximport`
173 | 
174 | `readr`
175 | 
176 | `DESeq2`
177 | 
178 | `biomaRt`
179 | 
180 | **Step 1:** Load the required libraries:
181 | 
182 | ```R
183 | # Load libraries
184 | library(tximport)
185 | library(readr)
186 | library(DESeq2)
187 | library(biomaRt) # tximport requires gene symbols as row names
188 | ```
189 | 
190 | **Step 2:** Load the quantification data that was output from Salmon:
191 | 
192 | ```R
193 | ## List all directories containing data  
194 | samples <- list.files(path = ".", full.names = F, pattern="\\.salmon$")
195 |     
196 | ## Obtain a vector of all filenames including the path
197 | files <- file.path(samples, "quant.sf")
198 |     
199 | ## Since all quant files have the same name it is useful to have names for each element
200 | names(files) <-  samples
201 | ```
202 | 
203 | The **main objective here is to add names to our quant files which will allow us to easily discriminate between samples in the final output matrix**. 
204 | 
205 | **Step 3.** Create a dataframe containing Ensembl Transcript IDs and Gene symbols
206 | 
207 | Our Salmon index was generated with transcript sequences listed by Ensembl IDs, but `tximport` needs to know **which genes these transcripts came from**, so we need to use the `biomaRt` package to extract this information. 
208 | 
209 | > *NOTE:* Keep in mind that the Ensembl IDs listed in our Salmon output contained version numbers (i.e ENST00000632684.1). If we query Biomart with those IDs it will not return anything. Therefore, before querying Biomart in R do not forget to strip the version numbers from the Ensembl IDs.
210 | 
211 | ```R
212 | ## DO NOT RUN
213 | 
214 | # Create a character vector of Ensembl IDs		
215 | ids <- read.delim(files[1], sep="\t", header=T)    # extract the transcript ids from one of the files
216 | ids <- as.character(ids[,1])
217 | require(stringr)
218 | ids.strip <- str_replace(ids, "([.][0-9])", "")
219 | 
220 | # Create a mart object
221 | # Note that we are using an archived host, since "www.ensembl.org" gave us an error
222 | mart <- useDataset("hsapiens_gene_ensembl", useMart("ENSEMBL_MART_ENSEMBL", host="mar2016.archive.ensembl.org"))
223 | 
224 | # Get official gene symbol and Ensembl gene IDs
225 | tx2gene <- getBM(
226 |     filters= "ensembl_transcript_id", 
227 |      attributes= c("ensembl_transcript_id", "external_gene_name"),
228 |      values= ids.strip,
229 |      mart= mart)
230 |      
231 | ```
232 | 
233 | **We have already run the above code for you and saved the output in a text file which is in the salmon directory.** Load it in using: 
234 | 
235 | ```R
236 | tx2gene <- read.delim("tx2gene.txt",sep="\t")
237 | ```
238 |     
239 | **Step 4:** Run tximport to summarize gene-level information    
240 | ```R
241 | ?tximport   # let's take a look at the arguments for the tximport function
242 | 
243 | txi <- tximport(files, type="salmon", txIn = TRUE, txOut = FALSE, tx2gene=tx2gene, reader=read_tsv, ignoreTxVersion=TRUE)
244 | ```
245 | ### Output from `tximport`
246 | 
247 | The `txi` object is a simple list with three matrices: abundance, counts, length. 
248 | ```R
249 | attributes(txi)
250 | ```
251 | A final element 'countsFromAbundance' carries through the character argument used in the tximport call. The length matrix contains the average transcript length for each gene which can be used as an offset for gene-level analysis. 
252 | 
253 | ### Using DESeq2 for DE analysis with pseudocounts
254 | 
255 | ```R    
256 | ## Create a sampletable/metadata
257 | 
258 | # Before we create this metadata object, let's see what the sample (column) order of the counts matrix is:
259 | colnames(txi$counts)
260 | 
261 | condition=factor(c(rep("Ctl",3), rep("KD", 2), rep("OE", 3)))
262 | sampleTable <- data.frame(condition, row.names = colnames(txi$counts))
263 | 
264 | ## Create a DESeqDataSet object
265 | dds <- DESeqDataSetFromTximport(txi, sampleTable, ~ condition)
266 | ```
267 | 
268 | Now you have created a DESeq object to proceed with DE analysis you can now complete the DE analysis using methods in the script we ran
269 | for the counts from STAR. 
270 | 
271 | ### Resources for R
272 | 
273 | * https://www.datacamp.com/courses/free-introduction-to-r
274 | * Software Carpentry materials: http://swcarpentry.github.io/r-novice-inflammation/
275 | * Data Carpentry materials: http://tracykteal.github.io/R-genomics/
276 | * Materials from IQSS at Harvard: http://tutorials.iq.harvard.edu/R/Rintro/Rintro.html
277 | * [swirl](http://swirlstats.com/): learn R interactively from within the R console
278 | * The free "try R" class from [Code School](http://tryr.codeschool.com)
279 | * HarvardX course ["Statistics and R for the Life Sciences"](https://courses.edx.org/courses/HarvardX/PH525.1x/1T2015/info)
280 | 
281 | ***
282 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
283 | 
284 | 


--------------------------------------------------------------------------------
/lessons/advanced_bash.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Advanced Shell"
  3 | author: "Radhika Khetani, Meeta Mistry"
  4 | date: "May 9, 2018"
  5 | ---
  6 | 
  7 | ## Learning Objectives
  8 | 
  9 | * Increasing productivity when working on the command-line and in a cluster environment
 10 | * Becoming familiar with advanced bash commands and utilities 
 11 | 
 12 | ## Advanced Bash Commands and Utilities
 13 | 
 14 | As you begin working more with the Shell, you will discover that there are mountains of different utilities at your fingertips to help increase command-line productivity. So far we have introduced you to some of the basics to help get you started. In this lesson, we will touch on some more advanced topics that can be very useful as you carry out analyses in a cluster environment. 
 15 | 
 16 | ## O2-specific utilities
 17 | 
 18 | * [Configuring your shell](#config)
 19 |     * [`.bashrc` versus `.bash_profile`](#bashrc)
 20 |     * [Aliases](#alias) 
 21 | * [Symbolic links](#symlinks)
 22 | * [Transferring files with `rsync`](#rsync)
 23 |     * [Working on `/n/scratch2/`](#nscratch)
 24 | 
 25 | ***
 26 | 
 27 | ## Configuring your shell <a name="config"></a>
 28 | 
 29 | In your home directory there are two hidden files `.bashrc` and `.bash_profile`. These files contain all the startup configuration and preferences for your command line interface and are loaded before your Terminal loads the shell environment. Modifying these files allow you to change your preferences for features like your command prompt, the colors of text, and adding aliases for commands you use all the time. 
 30 | 
 31 | > **NOTE:** These files begin with a dot (`.`) which makes it a hidden file. To view all hidden files in your home directory you can use:
 32 | > 
 33 | > `$ ls -al ~/`
 34 | 
 35 | ### `.bashrc` versus `.bash_profile` <a name="bashrc"></a>
 36 | 
 37 | You can put configurations in either file, and you can create either if it doesn’t exist. **But why two different files? What is the difference?**
 38 | 
 39 | The difference is that **`.bash_profile` is executed for login shells, while `.bashrc` is executed for interactive non-login shells**. It is helpful to have these separate files when there are preferences you only want to see on the login and not every time you open a new terminal window. For example, suppose you would like to print some lengthy diagnostic information about your machine (load average, memory usage, current users, etc) - the `.bash_profile` would be a good place since you would only want in displayed once when starting out.   
 40 | 
 41 | Most of the time you don’t want to maintain two separate configuration files for login and non-login shells. For example, when you export a `$PATH` (as we had done previously), you want it to apply to both. You can do this by sourcing `.bashrc` from within your `.bash_profile` file. Take a look at your `.bash_profile` file, it has already been done for you:
 42 | 
 43 | ```bash
 44 | $ less ~/.bash_profile
 45 | ```
 46 | 
 47 | You should see the following lines:
 48 | 
 49 | ```bash
 50 | if [ -f ~/.bashrc ]; then
 51 |    source ~/.bashrc
 52 | fi
 53 | ```
 54 | 
 55 | What this means is that if a `.bashrc` files exist, all configuration settings will be sourced upon logging in. Any settings you would like applied to all shell windows (login and interactive) can simply be added directly to the `.bashrc` file rather than in two separate files.
 56 | 
 57 | 
 58 | ### Aliases <a name="alias"></a>
 59 | 
 60 | An alias is a short name that the shell translates into another (usually longer) name or command. They are typically placed in the `.bash_profile` or `.bashrc` startup files so that they are available to all subshells. You can use the `alias` built-in command without any arguments, and the **shell will display a list of all defined aliases**:
 61 | 
 62 | ```bash
 63 | $ alias
 64 | ```
 65 | 
 66 | This should return to you the list of aliases that have been set for you, and you can see **the syntax used for setting an alias is**:
 67 | 
 68 | ```bash
 69 | alias aliasname=value
 70 | ```
 71 | 
 72 | When setting an alias **no spaces are permitted around the equal sign**. If value contains spaces or tabs, you must enclose the value within quotation marks. If you look through the list of aliases that have been set for you, `ll` is a good example of this:
 73 | 
 74 | ```bash
 75 | alias ll='ls -l'
 76 | ```
 77 | 
 78 | Since we have a modifier `-l` and there is a space required, the quotations are necessary.
 79 | 
 80 | Let's **setup our own alias**! Every time we want to start an interactive session we have type out this lengthy command. Wouldn't it be great if we could type in a short name instead? Open up the `.bashrc` file using `vim`:
 81 | 
 82 | ```bash
 83 | $ vim ~/.bashrc
 84 | ```
 85 | 
 86 | Scroll down to the heading "`# User specific aliases and functions`" and on the next line you can set your alias:
 87 | 
 88 | ```bash
 89 | o2i='srun --pty -p short -t 0-12:00 --mem 8G /bin/bash'
 90 | ```
 91 | 
 92 | Save and quit. Now we can source the `.bash_profile` file and test it out. By typing `o2i` at the command prompt we will request an interactive session for 12 hours with 8G of memory. You can change the directives to those you use more often (i.e add more cores, increase memory).
 93 | 
 94 | 
 95 | ```bash
 96 | $ source ~/.bash_profile
 97 | 
 98 | $ o2i
 99 | ```
100 | 
101 | ## Symbolic links <a name="symlinks"></a>
102 | 
103 | The O2 cluster supports symbolic links also known as symlinks. This is a kind of “file” that is **essentially a pointer to another file name**. Symbolic links can be made to directories or across file systems with no restrictions. You can also make a symbolic link to a name which is not the name of any file. (Opening this link will fail until a file by that name is created.) Likewise, if the symbolic link points to an existing file which is later deleted, the symbolic link continues to point to the same file name even though the name no longer names any file.
104 | 
105 | Symlinks can be used in lieu of copying over large files. For example, when we began the RNA-seq part of this workshop we had copied over FASTQ files from `~/unix_lesson/raw_fastq` to `~/unix_lesson/rnaseq/raw_data`. But what we could have done instead is created symlinks to those files.
106 | 
107 | The basic syntax for creating a symlink is:
108 | 
109 | ```bash
110 | ln -s /path/to/file /path/to/symlink
111 | ```
112 | 
113 | So if we wanted to have symlinks to our FASTQ files instead of having duplicate copies, we can first remove the files that are currrently there:
114 | 
115 | ```bash
116 | $ rm ~/unix_lesson/rnaseq/raw_data/*
117 | ```
118 | 
119 | And then we can symlink the files:
120 | 
121 | ```bash
122 | $ ln -s ~/unix_lesson/raw_fastq/*.fq ~/unix_lesson/rnaseq/raw_data/
123 | ```
124 | 
125 | Now, if you check the directory where we created the symlinks you should see the filenames listed in cyan text followed by an arrow pointing the actual file location. (_NOTE: If your files are flashing red text, this is an indication your links are broken so you might want to double check the paths._)
126 | 
127 | ```bash
128 | $ ll ~/unix_lesson/rnaseq/raw_data
129 | ```
130 | 
131 | ## Transferring files with `rsync` <a name="rsync"></a>
132 | 
133 | During this workshop we have mostly used Filezilla to transfer files to and from your laptop to the O2 cluster. At the end of the Alignment/Counting lesson we also introduced how to do this on the command line using `scp`. The way `scp` works is it reads the source file and writes it to the destination. It performs a plain linear copy, locally, or over a network.
134 | 
135 | When **transferring large files or a large number of files `rsync` is a better command** to use. `rsync` employs a special delta transfer algorithm and a few optimizations to make the operation a lot faster. **It will check files sizes and modification timestamps** of both file(s) to be copied and the destination, and skip any further processing if they match. If the destination file(s) already exists, the delta transfer algorithm will **make sure only differences between the two are sent over.**
136 | 
137 | There are many modifiers for the `rsync` command, but in the examples below we only introduce a select few that we commonly use during a file transfer.
138 | 
139 | **Example 1:**
140 | 
141 | ```
142 | rsync -t --progress /path/to/transfer/files/*.c /path/to/destination
143 | ```
144 | 
145 | This command would transfer all files matching the pattern *.c from the transfer directory to the destination directory. If any of the files already exist at the destination then the rsync remote-update protocol is used to update the file by sending only the differences.
146 | 
147 | **Example 2:**
148 | 
149 | ```
150 | rsync -avr --progress /path/to/transfer/directory /path/to/destination
151 | ```
152 | 
153 | This command would recursively transfer all files from the transfer directory into the destination directory. The files are transferred in "archive" mode (`-a`), which ensures that symbolic links, devices, attributes, permissions, ownerships, etc. are preserved in the transfer. In both commands, we have additional modifiers for verbosity so we have an idea of how the transfer is progressing (`-v`, `--progress`)
154 | 
155 | > **NOTE:** A trailing slash on the transfer directory changes the behavior to avoid creating an additional directory level at the destination.  You can think of a trailing `/ ` as meaning "copy the contents of this directory" as opposed to "copy the  directory  by  name".
156 | 
157 | ### Working on `/n/scratch2` <a name="nscratch"></a>
158 | 
159 | Typically, the `rsync` command is used to move files between a remote computer and a local computer. But it can also be used to  to move files on the same computer. For example, we could use it to move files across filesystems on O2.
160 | 
161 | Most HPC environments have a "scratch space" available to use. This is **a temporary filesystem with larger amounts of storage space and resources, which is ideal for running analyses**.  On the O2 cluster, this is located at `/n/scratch2`. Each user is entitled to 10 TB of space in the `/n/scratch2` filesystem. You can create your own directories inside `/n/scratch2/` and put data in there. These files are not backed up and will be deleted if they are not accessed for 30 days.
162 | 
163 | Scratch will not work very well with workflows that write many thousands of small files. It is designed for workflows with medium and large files (> 100 MB), making it ideal for many next-gen sequencing analysis, image analysis, and other bioinformatics workflows that use large files.
164 | 
165 | When performing your analysis, you may want to take advantage of this space and will want to start by copying over your raw FASTQ files. Rather than using `cp`, the `rsync` command would be benefical since FASTQ files are large in size. As an example we will copy over our FASTQ files to `/n/scratch2`, but first we will need to create a directory to copy them to. You can name this directory with your user login name.
166 | 
167 | ```bash
168 | $ mkdir /n/scratch2/$USER
169 | ```
170 | 
171 | Now we can copy over the entire directory of FASTQ files:
172 | 
173 | ```bash
174 | $ rsync -avr --progress ~/unix_lesson/raw_fastq /n/scratch2/$USER
175 | ```
176 | 
177 | Take a look at the directory on scratch and see that the files transferred successfully.
178 | 
179 | > **NOTE:** If you are copying files from a remote resource to your local laptop (or vice versa), the syntax will change. You will need to add the host address before specifying the path. Below is an example of command you would **run in a Terminal on your local laptop**:
180 | > 
181 | 
182 | ```bash
183 | ## DO NOT RUN
184 | $ rysnc -avr --progress rc_training01@transfer.rc.hms.harvard.edu:/home/rc_traning01/unix_lesson/rnaseq/raw_data /path/on/local/machine
185 | ```
186 | 
187 | 
188 | ## General Bash commands
189 | 
190 | > *These materials are adapted from training materials generated by [FAS Research Computing at Harvard University](https://www.rc.fas.harvard.edu/training/training-materials/).*
191 | 
192 | * [Setting up](#setup)
193 | * [Regular expressions (regex) in `bash`](#regex)
194 | * [Reintroducing `grep`](#grep)
195 |     * [`grep` examples](#example1)
196 | * [Introducing `sed`](#sed)
197 |     * [`sed` examples](#example2)
198 | * [Reintroducing `awk`](#awk)
199 |     * [`awk` examples](#example3)
200 | 
201 | ***
202 | 
203 | ## Setting up <a name="setup"></a>
204 | 
205 | ```bash
206 | $ cd ~/unix_lesson
207 | 
208 | $ cp /n/groups/hbctraining/ngs-data-analysis-longcourse/unix_lesson/bicycle.txt .
209 | ```
210 | ***
211 | 
212 | ## Regular expressions (regex) in `bash` <a name="regex"></a>
213 | 
214 | "A regular expression, regex or regexp (sometimes called a rational expression) is a sequence of characters that define a search pattern. Usually this pattern is then used by string searching algorithms for "find" or "find and replace" operations on strings." -[Wikipedia](https://en.wikipedia.org/wiki/Regular_expression)
215 | 
216 | "The specific syntax rules vary depending on the specific implementation, programming language, or library in use. Additionally, the functionality of regex implementations can vary between versions of languages." -[Wikipedia](https://en.wikipedia.org/wiki/Regular_expression)
217 | 
218 | Below is a small subset of characters that can be used for pattern generation in `bash`.
219 | 
220 | **Special Characters:**
221 | 
222 | *  `.` : *match any character (except new line)
223 | *  `\` : *make next character literal*
224 | *  `^` : *matches at the start of the line*
225 | *  `$` : *matches at the end of line*
226 | *  `*` : *repeat match
227 | *  `?` : *preceding character is optional
228 | *  `[ ]` : *sequence of characters*
229 |       * `[a-z]` : any one from a through z 
230 |       * `[aei]` : either a, e, i
231 |       * `[0-9]` : any one from 1 through 9
232 | 
233 | **Examples:**
234 | 
235 | * `.at` == any three-character string ending with "at", including "hat", "cat", and "bat".
236 | * `ab*c` == "ac", "abc", "abbc", "abbbc", and so on*
237 | * `colou?r` == "color" or "colour"*
238 | * `[hc]at` == "hat" and "cat".
239 | * `[^b]at` == all strings matched by .at except "bat".
240 | * `[^hc]at` == all strings matched by .at other than "hat" and "cat".
241 | * `^[hc]at` == "hat" and "cat", but only at the beginning of the string or line.
242 | * `[hc]at$` == "hat" and "cat", but only at the end of the string or line.
243 | * `\[.\]` == any single character surrounded by "[" and "]" since the brackets are escaped, for example: "[a]" and "[b]".
244 | * `s.*` == "s" followed by zero or more characters, for example: "s" and "saw" and "seed" and "shawshank".
245 | 
246 | >  above examples excerpted from [Wikipedia](-[Wikipedia](https://en.wikipedia.org/wiki/Regular_expression))
247 | 
248 | **Non printable characters:**
249 | 
250 | * `\t` : tab
251 | * `\n` : new line (Unix)
252 | * `\s` : space
253 | 
254 | ***
255 | 
256 | ## Reintroducing `grep` (GNU regex parser) <a name="grep"></a>
257 | 
258 | As we saw yesterday, `grep` is a line by line parser that outputs lines matching a pattern of interest. In addition it also allows the use of regular expressions (regex) in the specified pattern, let's use some regular expressions with `grep`.
259 | 
260 | **`grep` usage:**
261 | 
262 | `cat file | grep pattern`
263 | 
264 | OR
265 | 
266 | `grep pattern file`
267 | 
268 | **`grep` common options:**
269 | 
270 | * `c` : count the number of occurrences
271 | * `v` : invert match, print non-matching lines
272 | * `R` : recursively through directories
273 | * `o` : only print matching part of line
274 | * `n` : print the line number
275 | 
276 | ***
277 | 
278 | ### Examples `grep` usage <a name="example1"></a>
279 | 
280 | ```bash
281 | $ grep -c bicycle bicycle.txt
282 | 
283 | $ grep "bicycle bicycle" bicycle.txt 
284 | 
285 | $ grep ^bicycle bicycle.txt
286 | $ grep ^Bicycle bicycle.txt 
287 | 
288 | $ grep yeah$ bicycle.txt
289 | 
290 | $ grep [SJ] bicycle.txt
291 | 
292 | $ grep ^[SJ] bicycle.txt 
293 | ```
294 | ***
295 | 
296 | ## Introducing `sed` <a name="sed"></a>
297 | 
298 | `sed` takes a stream of stdin and pattern matches and returns the replaced text to stdout ("Think amped-up Windows Find & Replace").
299 | 
300 | **`sed` usage:** 
301 | 
302 | `cat file | sed ‘command’`
303 | 
304 | OR
305 | 
306 | `sed ‘command’  file`
307 | 
308 | **`sed` common options:**
309 | 
310 | * `4d` : *delete line 4*
311 | * `2,4d` : *delete lines 2-4*
312 | * `/here/d` : *delete line matching here*
313 | * `/here/,/there/d` : *delete lines matching here to there*
314 | * `s/pattern/text/` : *switch text matching pattern*
315 | * `s/pattern/text/g` : *switch text matching pattern globally*
316 | * `/pattern/a\text` : *append line with text after matching pattern*
317 | * `/pattern/c\text` : *change line with text for matching pattern*
318 | 
319 | ### Examples `sed` usage <a name="example2"></a>
320 | 
321 | ```bash
322 | $ sed '1,2d' bicycle.txt
323 | 
324 | $ sed 's/Superman/Batman/' bicycle.txt 
325 | 
326 | $ sed 's/bicycle/car/' bicycle.txt 
327 | $ sed 's/bicycle/car/g' bicycle.txt 
328 | 
329 | $ sed 's/.icycle/car/g' bicycle.txt
330 | 
331 | $ sed 's/bi*/car/g' bicycle.txt
332 | 
333 | $ sed 's/bicycle/tri*cycle/g' bicycle.txt | sed 's/tri*cycle/tricycle/g'   ## does this work?
334 | $ sed 's/bicycle/tri*cycle/g' bicycle.txt | sed 's/tri\*cycle/tricycle/g'
335 | 
336 | $ sed 's/\s/\t/g' bicycle.txt
337 | $ sed 's/\s/\\t/g' bicycle.txt
338 | 
339 | $ sed 's/\s//g' bicycle.txt
340 | ```
341 | ***
342 | 
343 | ## Introducing `awk` <a name="awk"></a>
344 | 
345 | `awk` is command/script language that turns text into records and fields which can be selected to display as kind of an ad hoc database.  With awk you can perform many manipulations to these fields or records before they are displayed. 
346 | 
347 | **`awk` usage:** 
348 | 
349 | `cat file | awk ‘command’`
350 | 
351 | OR
352 | 
353 | `awk ‘command’  file`
354 | 
355 | **`awk` concepts:**
356 | 
357 | *Fields:*
358 | 
359 | Fields are separated by white space, or you can specifying a field separator (FS). The fields are denoted $1, $2, ..., while $0 refers to the entire line. If there is no FS, the input line is split into one field per character.
360 | 
361 | The `awk` program has some internal environment variables that are useful (more exist and change upon platform)
362 | 
363 | * `NF` – number of fields in the current record
364 | * `NR` – number of the current record (somewhat similar to row number)
365 | * `FS` – regular expression used to separate fields; also settable by option -Ffs (default whitespace)
366 | * `RS` – input record separator (default newline)
367 | * `OFS` – output field separator (default blank)
368 | * `ORS` – output record separator (default newline)
369 | 
370 | `awk` also supports more complex statements, some examples are below: 
371 | 
372 | * if (expression) statement [ else statement ]
373 | * while (expression) statement
374 | * for (expression ; expression ; expression) statement
375 | * for (var in array) statement
376 | * do statement while (expression)
377 | 
378 | Please note that awk is a language on it's own, and we will only be looking at some examples os its usage.
379 | 
380 | ### Examples `awk` usage <a name="example3"></a>
381 | 
382 | ```bash
383 | $ awk '{print $3}' reference_data/chr1-hg19_genes.gtf | head
384 | 
385 | $ awk '{print $3 | "sort -u"}' reference_data/chr1-hg19_genes.gtf 
386 | 
387 | $ awk '{OFS = "\t" ; if ($3 == "stop_codon") print $1,$4,$5,$3,$10}' reference_data/chr1-hg19_genes.gtf | head
388 | $ awk '{OFS = "\t" ; if ($3 == "stop_codon") print $1,$4,$5,$3,$10}' reference_data/chr1-hg19_genes.gtf | sed 's/"//g' | sed 's/;//g' | head
389 | 
390 | $ awk -F "\t" '{print $10}' reference_data/chr1-hg19_genes.gtf | head
391 | $ awk -F "\t" '{print $9}' reference_data/chr1-hg19_genes.gtf | head
392 | 
393 | # head other/bad-reads.count.summary
394 | $ awk -F ":" 'NR > 1 {sum += $2} END {print sum}' other/bad-reads.count.summary
395 | 
396 | # head ../rnaseq/results/counts/Mov10_featurecounts.Rmatrix.txt
397 | $ awk 'NR > 1 {sum += $2} END {print sum}' ../rnaseq/results/counts/Mov10_featurecounts.Rmatrix.txt
398 | ```
399 | 
400 | 
401 | ---
402 | 
403 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
404 | 
405 | * *The materials used in this lesson were derived from training materials generated by [FAS Reseach Computing at Harvard University](https://www.rc.fas.harvard.edu/training/training-materials/) and [HMS Research Computing](https://rc.hms.harvard.edu/)*
406 | 
407 | 
408 | 


--------------------------------------------------------------------------------
/lessons/experimental_planning_considerations.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Library preparation, sequencing and experimental considerations"
  3 | author: "Mary Piper, Meeta Mistry, Radhika Khetani"
  4 | date: "Monday, October 29, 2018"
  5 | ---
  6 | 
  7 | Approximate time: 90 minutes
  8 | 
  9 | ## Learning Objectives:
 10 | 
 11 | * Describe the process of RNA-seq library preparation
 12 | * Describe Illumina sequencing method
 13 | * Discuss special considerations for experimental design
 14 | 
 15 | # Experimental steps and considerations
 16 | 
 17 | ## Introduction to RNA-seq
 18 | 
 19 | RNA-seq is an exciting experimental technique that is utilized to explore and/or quantify gene expression within or between conditions. 
 20 | 
 21 | 
 22 | As we know, genes provide instructions to make proteins, which perform some function within the cell. Although all cells contain the same DNA sequence, muscle cells are different from nerve cells and other types of cells because of the different genes that are turned on in these cells and the different RNAs and proteins produced. 
 23 | 
 24 |  <img src="../img/gene_expression_cells.png" width="600">
 25 | 
 26 | Different biological processes, as well as mutations, can affect which genes are turned on and which are turned off, in addition to, *how much* specific genes are turned on/off.
 27 | 
 28 | To make proteins, the DNA is transcribed into messenger RNA, or mRNA, which is translated by the ribosome into protein. However, some genes encode RNA that does not get translated into protein; these RNAs are called non-coding RNAs, or ncRNAs. Often these RNAs have a function in and of themselves and include rRNAs, tRNAs, and siRNAs, among others. All RNAs transcribed from genes are called transcripts.
 29 | 
 30 |  <img src="../img/Gene_products.png" width="400">
 31 | 
 32 | To be translated into proteins, the RNA must undergo processing to generate the mRNA. In the figure below, the top strand in the image represents a gene in the DNA, comprised of the untranslated regions (UTRs) and the open read frame. Genes are transcribed into pre-mRNA, which still contains the intronic sequences.  After post-transciptional processing, the introns are spliced out and a polyA tail and 5' cap are added to yield mature mRNA transcripts, which can be translated into proteins.
 33 | 
 34 |  <img src="../img/Gene_structure.png" width="600">
 35 | 
 36 | **While mRNA transcripts have a polyA tail, many of the non-coding RNA transcripts do not as the post-transcriptional processing is different for these transcripts.**
 37 | 
 38 | RNA-seq data can be used to explore and/or quantify the RNA transcripts, which can be utilized for the following types of experiments:
 39 | 
 40 | - Differential Gene Expression: *quantitative* evaluation and comparison of transcript levels
 41 | - Transcriptome assembly: building the profile of transcribed regions of the genome, a *qualitative* evaluation. 
 42 | - Can be used to help build better gene models, and verify them using the assembly
 43 | - Metatranscriptomics or community transcriptome analysis
 44 | 
 45 | 
 46 | ## Illumina library preparation
 47 | 
 48 | When starting an RNA-seq experiment, for every sample the RNA needs to be isolated and turned into a cDNA library for sequencing. Generally, ribosomal RNA represents the majority of the RNAs present in a cell, while messenger RNAs represent a small percentage of total RNA, ~2% in humans.
 49 | 
 50 |  <img src="../img/rrna.png" width="400">
 51 | 
 52 | Therefore, if we want to study the protein-coding genes, we need to enrich for mRNA or deplete the rRNA. **For differential gene expression analysis, it is best to enrich for Poly(A)+, unless you are aiming to obtain information about long non-coding RNAs, then do a ribosomal RNA depletion.**
 53 | 
 54 | The workflow for library preparation is detailed in the image below:
 55 | 
 56 |  <img src="../img/RNA-seq_library_prep.png" width="800">
 57 |  
 58 |  *Image credit: [Martin J.A. and Wang Z., Nat. Rev. Genet. (2011) 12:671–682](https://www.nature.com/articles/nrg3068)*
 59 | 
 60 | Briefly, the RNA is isolated from the sample and contaminating DNA is removed, followed by either selection of the mRNA or depletion of the rRNA. The resulting RNA is fragmented then reverse transcribed into cDNA. Sequence adapters are added to the ends of the fragments and the fragments are PCR amplified if needed. Finally, the fragments are size selected (usually ~300-500bp) to finish the library.
 61 | 
 62 | The cDNA libraries can be generated in a way to retain information about which strand of DNA the RNA was transcribed from. Libraries that retain this information are called stranded libraries, which are now standard with Illumina’s TruSeq stranded RNA-Seq kits. Stranded libraries should not be any more expensive than unstranded, so there is not really any reason not to acquire this additional information. 
 63 | 
 64 | There are 3 types of cDNA libraries available:
 65 | 
 66 | - Forward (secondstrand) – reads resemble the gene sequence or the secondstrand cDNA sequence
 67 | - Reverse (firststrand) – reads resemble the complement of the gene sequence or firststrand cDNA sequence (TruSeq)
 68 | - Unstranded 
 69 | 
 70 | > **NOTE:** This workflow is specific to Illumina sequencing, which is currently the most utilized sequencing method. But there are other long-read methods worth noting, such as:
 71 | >
 72 | > - Pacific Biosciences: http://www.pacb.com/ 
 73 | > - Oxford Nanopore (MinION): https://nanoporetech.com/ 
 74 | > - 10X Genomics: https://www.10xgenomics.com/ 
 75 | > 
 76 | > Advantages and disadvantages of these technologies can be explored in the table below:
 77 | >
 78 | > <img src="../img/long_read_tech.png" width="700">
 79 | 
 80 | ## Illumina Sequencing
 81 | 
 82 | After preparation of the libraries, sequencing can be performed to generate the nucleotide sequences of the ends of the fragments, which are called **reads**. You will have the choice of sequencing a single end of the cDNA fragments (single-end reads) or both ends of the fragments (paired-end reads).
 83 | 
 84 | <img src="../img/paired_end_reads.png" width="500">
 85 | 
 86 | - SE - Single end dataset => Only Read1
 87 | - PE - Paired-end dataset => Read1 + Read2
 88 | 	- can be 2 separate FastQ files or just one with interleaved pairs
 89 | 
 90 | Generally single-end sequencing is sufficient unless it is expected that the reads will match multiple locations on the genome (e.g. organisms with many paralogous genes), assemblies are being performed, or for splice isoform differentiation. Be aware that paired-end reads are generally 2x more expensive.
 91 | 
 92 | There are a variety of Illumina platforms to choose from to sequence the cDNA libraries.
 93 | 
 94 | <img src="../img/illumina_platforms.png" width="800">
 95 | 
 96 |  *Image credit: Adapted from [Illumina](www.illumina.com)*
 97 | 
 98 | Differences in platform can alter the length of reads generated as well as the total number of reads sequenced per run and the amount of time required to sequence the libraries. The different platforms each use a different flow cell, which is a glass surface coated with an arrangement of paired oligos that are complementary to the adapters added to your template molecules. The flow cell is where the sequencing reactions take place.
 99 | 
100 | <img src="../img/flow_cells.png" width="400">
101 | 
102 |  *Image credit: Adapted from [Illumina](www.illumina.com)*
103 |  
104 | 
105 | Let's explore how Illumina sequencing is performed:
106 | 
107 | [<img src="../img/illumina_sequencing.png" width="400">](https://www.dropbox.com/s/f4t94tcw06f9stg/Illumina%20Sequencing%20by%20Synthesis-14840.mp4?dl=0)
108 | 
109 | - Number of clusters ~= Number of reads
110 | - Number of sequencing cycles = Length of reads
111 | 
112 | The number of cycles (length of the reads) will depend on sequencing platform used as well as your preferences.
113 | 
114 | Charges for sequencing are usually per lane of the flow cell, and usually you don’t need one lane per sample. Multiplexing allows you to sequence multiple samples per lane with addition of indices (within the Illumina adapter) or special barcodes (outside the Illumina adapter).
115 | 
116 | <img src="../img/demultiplexing.png" width="800">
117 | 
118 | ## Experimental planning considerations
119 | 
120 | Understanding the steps in the experimental process of RNA extraction and preparation of RNA-Seq libraries is helpful for designing an RNA-Seq experiment, but there are special considerations that should be highlighted that can greatly affect the quality of a differential expression analysis. 
121 | 
122 | These important considerations include:
123 | 
124 | 1. Number and type of **replicates**
125 | 2. Avoiding **confounding**
126 | 3. Addressing **batch effects**
127 | 
128 | We will go over each of these considerations in detail, discussing best practice and optimal design.
129 | 
130 | ## Replicates
131 | 
132 | Experimental replicates can be performed as **technical replicates** or **biological replicates**. 
133 | 
134 | <img src="../img/replicates.png" width="500">
135 | 
136 | *Image credit: [Klaus B., EMBO J (2015) **34**: 2727-2730](https://dx.doi.org/10.15252%2Fembj.201592958)*
137 | 
138 | - **Technical replicates:** use the same biological sample to repeat the technical or experimental steps in order to accurately measure technical variation and remove it during analysis. 
139 | 
140 | - **Biological replicates** use different biological samples of the same condition to measure the biological variation between samples. 
141 | 
142 | For mice or rats, this might be easy to determine what constitutes a different biological sample, but it's a bit more difficult to determine for cell lines. When using cell lines it's best to include as much variation between samples as possible, and [this article](http://paasp.net/accurate-design-of-in-vitro-experiments-why-does-it-matter/) gives some great recommendations for cell line replicates.
143 | 
144 | In the days of microarrays, technical replicates were considered a necessity; however, with the current RNA-Seq technologies, technical variation is much lower than biological variation and **technical replicates are unneccessary**.
145 | 
146 | In contrast, **biological replicates are absolutely essential**. For differential expression analysis, the more biological replicates, the better the estimates of biological variation and the more precise our estimates of the mean expression levels. This leads to more accurate modeling of our data and identification of more differentially expressed genes.
147 | 
148 | <img src="../img/de_replicates_img.png" width="500">
149 | 
150 | *Image credit: [Liu, Y., et al., Bioinformatics (2014) **30**(3): 301–304](https://doi.org/10.1093/bioinformatics/btt688)*
151 | 
152 | As the figure above illustrates, **biological replicates are of greater importance than sequencing depth**, which is the total number of reads sequenced per sample. The figure shows the relationship between sequencing depth and number of replicates on the number of differentially expressed genes identified [[1](https://academic.oup.com/bioinformatics/article/30/3/301/228651/RNA-seq-differential-expression-studies-more)]. Note that an **increase in the number of replicates tends to return more DE genes than increasing the sequencing depth**. Therefore, generally more replicates are better than higher sequencing depth, with the caveat that higher depth is required for detection of lowly expressed DE genes and for performing isoform-level differential expression. 
153 | 
154 | Replicates are almost always preferred to greater sequencing depth for bulk RNA-Seq. However, **guidelines depend on the experiment performed and the desired analysis**. Below we list some general guidelines for replicates and sequencing depth to help with experimental planning:
155 | 
156 | 
157 | - **General gene-level differential expression:**
158 | 
159 |   - ENCODE guidelines suggest 30 million SE reads per sample (stranded).
160 |   
161 |   - 15 million reads per sample is often sufficient, if there are a good number of replicates (>3). 
162 | 
163 |   - Spend money on more biological replicates, if possible.
164 | 
165 | - **Gene-level differential expression with detection of lowly-expressed genes:**
166 |   
167 |   - Similarly benefits from replicates more than sequencing depth.
168 | 
169 |   - Sequence deeper with at least 30-60 million reads depending on level of expression (start with 30 million with a good number of replicates). 
170 |   
171 | - **Isoform-level differential expression:**
172 | 
173 |   - Of known isoforms, suggested to have a depth of at least 30 million reads per sample and paired-end reads.
174 | 
175 |   - Of novel isoforms should have more depth (> 60 million reads per sample).
176 | 
177 |   - Choose biological replicates over paired/deeper sequencing.
178 | 
179 |   - Perform careful QC of RNA quality. Be careful to use high quality preparation methods and restrict analysis to high quality RIN # samples.  
180 |   
181 | - **Other types of RNA analyses (intron retention, small RNA-Seq, etc.):** 
182 |   
183 |   - Different recommendations depending on the analysis.
184 |   
185 |   - Almost always more biological replicates are better!
186 |   
187 | > **NOTE:** The factor used to estimate the depth of sequencing for genomes is "coverage" - how many times do the number nucleotides sequenced "cover" the genome. This metric is not exact for genomes, but it works okay. It **does not work for transcriptomes** because expression of the genes depend on the condition being studied.
188 | 
189 | ## Confounding
190 |   
191 | A confounded RNA-Seq experiment is one where you **cannot distinguish the separate effects of two different sources of variation** in the data. 
192 | 
193 | For example, we know that sex has large effects on gene expression, and if all of our *control* mice were female and all of the *treatment* mice were male, then our treatment effect would be confounded by sex. **We could not differentiate the effect of treatment from the effect of sex.**
194 | 
195 | <img src="../img/confounded_design.png" width="500">  
196 | 
197 | **To AVOID confounding:**
198 | 
199 | - Ensure animals in each condition are all the **same sex, age, litter, and batch**, if possible.
200 | 
201 | - If not possible, then ensure to split the animals equally between conditions
202 | 
203 |   <img src="../img/non_confounded_design.png" width="400">
204 | 
205 | ## Batch effects
206 | 
207 | Batch effects are a significant issue for RNA-Seq analyses, since you can see significant differences in expression due solely to the batch effect. 
208 | 
209 | <img src="../img/batch_effect_pca.png" width="600">
210 | 
211 | *Image credit: [Hicks SC, et al., bioRxiv (2015)](https://www.biorxiv.org/content/early/2015/08/25/025528)*
212 | 
213 | To explore the issues generated by poor batch study design, they are highlighted nicely in [this paper](https://f1000research.com/articles/4-121/v1).
214 | 
215 | ### How to know whether you have batches?
216 | 
217 | - Were all RNA isolations performed on the same day?
218 | 
219 | - Were all library preparations performed on the same day?
220 | 
221 | - Did the same person perform the RNA isolation/library preparation for all samples?
222 | 
223 | - Did you use the same reagents for all samples?
224 | 
225 | - Did you perform the RNA isolation/library preparation in the same location?
226 | 
227 | If *any* of the answers is **‘No’**, then you have batches.
228 | 
229 | ### Best practices regarding batches:
230 | 
231 | - Design the experiment in a way to **avoid batches**, if possible.
232 | 
233 | - If unable to avoid batches:
234 | 
235 |   - **Do NOT confound** your experiment by batch:
236 | 
237 |     <img src="../img/confounded_batch.png" width="300">
238 |     
239 |     *Image credit: [Hicks SC, et al., bioRxiv (2015)](https://www.biorxiv.org/content/early/2015/08/25/025528)*
240 |   
241 |   - **DO** split replicates of the different sample groups across batches. The more replicates the better (definitely more than 2).
242 |   
243 |     <img src="../img/batch_effect.png" width="300">
244 | 
245 |     *Image credit: [Hicks SC, et al., bioRxiv (2015)](https://www.biorxiv.org/content/early/2015/08/25/025528)*
246 |     
247 |   - **DO** include batch information in your **experimental metadata**. During the analysis, we can regress out the variation due to batch so it doesn’t affect our results if we have that information.
248 | 
249 |     <img src="../img/metadata_batch.png" width="300">
250 |     
251 |  ***
252 |  **Exercise**
253 |  
254 | Your experiment has three different treatment groups, A, B, and C. Due to the lengthy process of tissue extraction, you can only isolate the RNA from two samples at the same time. You plan to have 4 replicates per group.
255 | 
256 | 1. Fill in the `RNA isolation` column of the metadata table. Since we can only prepare 2 samples at a time and we have 12 samples total, you will need to isolate RNA in 6 batches. In the `RNA isolation` column, enter one of the following values for each sample: `group1`, `group2`, `group3`, `group4`, `group5`, `group6`. Make sure to fill in the table so as to avoid confounding by batch of `RNA isolation`. 
257 | 
258 | 2. **BONUS:** To perform the RNA isolations more quickly, you devote two researchers to perform the RNA isolations. Fill in their initials to the `researcher` column for the samples they will prepare: use initials `AB` or `CD`.
259 | 
260 |   | sample | treatment | sex | replicate | RNA isolation |
261 |   | --- | --- | --- | --- | --- |
262 |   | sample1 | A | F | 1 | 
263 |   | sample2 | A | F | 2 |
264 |   | sample3 | A | M | 3 |
265 |   | sample4 | A | M | 4 |
266 |   | sample5 | B | F | 1 |
267 |   | sample6 | B | F | 2 |
268 |   | sample7 | B | M | 3 |
269 |   | sample8 | B | M | 4 |
270 |   | sample9 | C | F | 1 |
271 |   | sample10 | C | F | 2 |
272 |   | sample11 | C | M | 3 |
273 |   | sample12 | C | M | 4 |
274 | 
275 | ***    
276 | 


--------------------------------------------------------------------------------
/sam.md:
--------------------------------------------------------------------------------
 1 | ## samtools  extras
 2 | 
 3 | To play around with a few `samtools` commands, first change directories into the directory containing all BAM files.
 4 | 
 5 | `$ cd ~/unix_workshop/rnaseq/results/STAR/bams`
 6 | 
 7 | ### Write only mapped reads to file (filter out unmapped reads)
 8 | 
 9 | `$ samtools view -b -h -F 4 Mov10_oe_1_Aligned.sortedByCoord.out.bam > Mov10_oe_1_Aligned.onlyAligned.bam`
10 | 
11 | ### Create a FASTQ file containing only mapped reads
12 | 
13 | `$ bamtofastq -o Mov10_oe_1_Mapped.fastq --no-unaligned Mov10_oe_1_Aligned.onlyMapped.bam`
14 | 
15 | ### Index BAM file
16 | 
17 | `$ samtools index Mov10_oe_1_Aligned.sortedByCoord.out.bam`
18 | 
19 | ### Extract reads from a specific region of the chromosome
20 | 
21 | `$samtools view  Mov10_oe_1_Aligned.sortedByCoord.out.bam chr1:200000-500000`
22 | 
23 | ### Randomly subsample half of the reads into a new BAM file
24 | 
25 | `$ samtools view -s 0.5 -b Mov10_oe_1_Aligned.sortedByCoord.out.bam > Mov10_oe_1_subsample.bam`
26 | 
27 | ### Simple stats for alignment file
28 | 
29 | `$ samtools flagstat Mov10_oe_1_Aligned.sortedByCoord.out.bam`
30 | 
31 | ### Visualizing mismatches
32 | 
33 | `$ samtools view -h Mov10_oe_1_Aligned.sortedByCoord.out.bam | head -n 5 | samtools fillmd -e - ~/unix_workshop/rnaseq/reference_data/chr1.fa`
34 | 
35 | 


--------------------------------------------------------------------------------
/schedule/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/schedule/.DS_Store


--------------------------------------------------------------------------------
/schedule/2-day/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-rnaseq-hpc-O2/0feca5559bcbde27cbc7634085f07b3624a36f2f/schedule/2-day/.DS_Store


--------------------------------------------------------------------------------
/schedule/2-day/README.md:
--------------------------------------------------------------------------------
 1 | # Workshop Schedule (2-day)
 2 | 
 3 | ## Day 1
 4 | 
 5 | | Time            |  Topic  | Instructor |
 6 | |:------------------------:|:------------------------------------------------:|:--------:|
 7 | |9:00 - 9:40 | [Workshop Introduction] | Radhika |
 8 | |9:40 - 10:40 | [Introduction to the Shell] | Radhika |
 9 | |10:40 - 10:50 | Break | |
10 | |10:50 - 11:35 | [Introduction to the Shell (cont.)] | Meeta |
11 | |11:35 - 12:15 | [Searching and Redirection] | Mary |
12 | |12:15 - 13:15 | Lunch | |
13 | |13:15 - 13:35 | [Introduction to the Vim Text Editor] | Mary |
14 | |13:35 - 14:50 | [Loops and Shell Scripts] | Meeta |
15 | |14:50 - 15:00 | Break | |
16 | |15:00 - 15:30 | [Permissions and Environment Variables] | Radhika |
17 | |15:30 - 16:00 | [Project Organization and Best Practices in Data Management] | Meeta |
18 | |16:00 - 17:00 | [Introduction to RNA-seq and Library Prep] | Radhika |
19 | 
20 | ## Day 2
21 | 
22 | | Time            |   Topic  | Instructor |
23 | |:------------------------:|:----------:|:--------:|
24 | |9:00 - 9:50 | [Introduction to High-Performance Computing] | Radhika |
25 | |9:50 - 10:30 | [RNA-seq analysis workshop - Quality Assessment] | Mary |
26 | |10:30 - 10:40 | Break | |
27 | |10:40 - 11:15 | [RNA-seq analysis workshop - Quality Assessment] | Mary |
28 | |11:15 - 12:00 | [RNA-seq analysis workshop - Alignment and Counting] | Meeta |
29 | |12:00 - 13:00 | Break | |
30 | |13:00 - 13:45 | [RNA-seq analysis workshop - Alignment and Counting] | Meeta |
31 | |13:45 - 14:45 | [Automating the RNA-seq workflow] | Radhika |
32 | |14:45 - 15:00 | Break | |
33 | |15:00 - 16:30 | [Advanced concepts in bash] | Meeta/Radhika |
34 | |16:30 - 17:00 | [Wrap up + Q & A] | Radhika |
35 | 


--------------------------------------------------------------------------------
/schedule/README.md:
--------------------------------------------------------------------------------
 1 | # Workshop Schedule
 2 | 
 3 | ## Day 1
 4 | 
 5 | | Time            |  Topic  | Instructor |
 6 | |:------------------------:|:------------------------------------------------:|:--------:|
 7 | |9:00 - 9:40 | [Workshop Introduction](https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2/raw/master/lectures/Intro_to_workshop.pdf) | Meeta |
 8 | |9:40 - 10:30 | [Introduction to the Shell](https://hbctraining.github.io/Intro-to-Shell/lessons/01_the_filesystem.html) | Mary |
 9 | |10:30 - 10:45 | Break | |
10 | |10:45 - 11:35 | [Introduction to the Shell (cont.)](https://hbctraining.github.io/Intro-to-Shell/lessons/01_the_filesystem.html) | Meeta |
11 | |11:35 - 12:15 | [Searching and Redirection](https://hbctraining.github.io/Intro-to-Shell/lessons/02_searching_files.html) | Mary |
12 | |12:15 - 13:15 | Lunch | |
13 | |13:15 - 13:45 | [Introduction to the Vim Text Editor](https://hbctraining.github.io/Intro-to-Shell/lessons/03_vim.html) | Mary |
14 | |13:45 - 15:00 | [Loops and Shell Scripts](https://hbctraining.github.io/Intro-to-Shell/lessons/04_loops_and_scripts.html) | Meeta |
15 | |15:00 - 15:15 | Break | |
16 | |15:15 - 15:45 | [Permissions and Environment Variables](https://hbctraining.github.io/Intro-to-Shell/lessons/05_permissions_and_environment_variables.html) | Mary |
17 | |15:45 - 17:00 | [Project Organization and Best Practices in Data Management](https://hbctraining.github.io/Intro-to-rnaseq-hpc-O2/lessons/01_data_organization.html) | Meeta |
18 | 
19 | ## Day 2
20 | 
21 | | Time            |   Topic  | Instructor |
22 | |:------------------------:|:----------:|:--------:|
23 | |9:00 - 9:45 | [Introduction to High-Performance Computing for HMS-RC's O2](https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2/raw/master/lectures/HPC_intro_O2.pdf) | Meeta |
24 | |9:45 - 10:00 | Break | |
25 | |10:00 - 11:15 | [Introduction to RNA-seq and Library Prep](https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2/blob/master/lectures/rna-seq_design.pdf) | Mary |
26 | |11:15 - 11:55 | [NGS workflows and data standards](https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2/blob/master/lectures/NGS_workflows.pdf) | Meeta |
27 | |11:55 - 12:55 | Lunch | |
28 | |12:55 - 13:50 | [Quality Assessment of Sequence Data](https://hbctraining.github.io/Intro-to-rnaseq-hpc-O2/lessons/02_assessing_quality.html) | Mary | 
29 | |13:50 - 14:30 | [Sequence Alignment Theory](https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2/blob/master/lectures/Sequence_alignment.pdf) | Meeta |
30 | |14:30 - 14:45 | Break | |
31 | |14:45 - 16:00 | [RNA-seq Alignment with STAR](https://hbctraining.github.io/Intro-to-rnaseq-hpc-O2/lessons/03_alignment.html) | Mary |
32 | |16:00 - 17:00 | [Assessing Alignment Quality](https://hbctraining.github.io/Intro-to-rnaseq-hpc-O2/lessons/04_alignment_quality.html) | Meeta |
33 | 
34 | ## Day 3
35 | 
36 | | Time            |  Topic  | Instructor |
37 | |:------------------------:|:----------:|:--------:|
38 | |9:00 - 10:15 | [Generating a Count Matrix](https://hbctraining.github.io/Intro-to-rnaseq-hpc-O2/lessons/05_counting_reads.html) | Meeta |
39 | |10:15 - 10:45 | [Documenting Steps in the Workflow with MultiQC](https://hbctraining.github.io/Intro-to-rnaseq-hpc-O2/lessons/06_multiQC.html) | Meeta |
40 | |10:45 - 11:00 | Break | |
41 | |11:00 - 12:35 | [Automating the RNA-seq workflow](https://hbctraining.github.io/Intro-to-rnaseq-hpc-O2/lessons/07_automating_workflow.html) | Mary |
42 | |12:35 - 13:35 | Lunch | |
43 | |13:35 - 13:45 | [Alternative workflows for analyzing RNA-seq data](https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2/blob/master/lectures/RNAseq-analysis-methods.pdf) | Mary |
44 | |13:45 - 15:20 | [Quantifying expression using alignment-free methods (Salmon)](https://hbctraining.github.io/Intro-to-rnaseq-hpc-O2/lessons/08_salmon.html) | Meeta |
45 | |15:20 - 15:35 | Break | |
46 | |15:35 - 15:45 | [Other Applications of RNA-seq](https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2/blob/master/lectures/other%20rnaseq%20applications.pdf) | Mary |
47 | |15:45 - 16:25 | [Troubleshooting RNA-seq Data Analysis](https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2/blob/master/lectures/RNA-seq_troubleshooting.pdf) | Mary |
48 | |16:25 - 17:00 | [Genome Builds and Accessing Data on GEO/SRA](https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2/blob/master/lectures/Accessing_genomics_dataonline.pdf) | Meeta |
49 | | | [Wrap-up](https://www.dropbox.com/s/6diqq661xn3wgko/Wrap-up.pdf?dl=0) | Mary |
50 | 
51 | 


--------------------------------------------------------------------------------
/scripts/mov10_fastqc.run:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH -p short 		# partition name
 4 | #SBATCH -t 0-2:00 		# hours:minutes runlimit after which job will be killed
 5 | #SBATCH -n 6 		# number of cores requested -- this needs to be greater than or equal to the number of cores you plan to use to run your job
 6 | #SBATCH --job-name rnaseq_mov10_fastqc 		# Job name
 7 | #SBATCH -o %j.out			# File to which standard out will be written
 8 | #SBATCH -e %j.err 		# File to which standard err will be written
 9 | 
10 | ## Changing directories to where the fastq files are located
11 | cd ~/unix_workshop/rnaseq/raw_data
12 | 
13 | ## Loading modules required for script commands
14 | module load seq/fastqc/0.11.3
15 | 
16 | ## Running FASTQC
17 | fastqc -t 6 *.fq
18 | 
19 | ## Moving files to our results directory
20 | mv *fastqc* ../results/fastqc/
21 | 


--------------------------------------------------------------------------------
/scripts/rnaseq_analysis_on_allfiles_for-slurm.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | for fq in ~/unix_lesson/rnaseq/raw_data/*.fq
 4 | do
 5 | 
 6 | sbatch -p short -t 0-2:00 -n 6 --job-name rnaseq-workflow --wrap="sh ~/unix_lesson/rnaseq/scripts/rnaseq_analysis_on_input_file.sh $fq"
 7 | sleep 1	# wait 1 second between each job submission
 8 |   
 9 | done
10 | 


--------------------------------------------------------------------------------
/scripts/rnaseq_analysis_on_input_file.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash/
 2 | 
 3 | # This script takes a fastq file of RNA-Seq data, runs FastQC and outputs a counts file for it.
 4 | # USAGE: sh rnaseq_analysis_on_allfiles.sh <name of fastq file>
 5 | 
 6 | # initialize a variable with an intuitive name to store the name of the input fastq file
 7 | 
 8 | fq=$1
 9 | 
10 | # grab base of filename for naming outputs
11 | 
12 | base=`basename $fq .subset.fq`
13 | echo "Sample name is $base"           
14 | 
15 | # specify the number of cores to use
16 | 
17 | cores=2
18 | 
19 | # directory with genome reference FASTA and index files + name of the gene annotation file
20 | 
21 | genome=/groups/hbctraining/unix_workshop_other/reference_STAR/
22 | gtf=~/unix_workshop/rnaseq/reference_data/chr1-hg19_genes.gtf
23 | 
24 | # make all of the output directories
25 | # The -p option means mkdir will create the whole path if it 
26 | # does not exist and refrain from complaining if it does exist
27 | 
28 | mkdir -p ~/unix_workshop/rnaseq/results/fastqc/
29 | mkdir -p ~/unix_workshop/rnaseq/results/STAR
30 | mkdir -p ~/unix_workshop/rnaseq/results/counts
31 | 
32 | # set up output filenames and locations
33 | 
34 | fastqc_out=~/unix_workshop/rnaseq/results/fastqc/
35 | align_out=~/unix_workshop/rnaseq/results/STAR/${base}_
36 | counts_input_bam=~/unix_workshop/rnaseq/results/STAR/${base}_Aligned.sortedByCoord.out.bam
37 | counts=~/unix_workshop/rnaseq/results/counts/${base}_featurecounts.txt
38 | 
39 | # set up the software environment
40 | 
41 | module load seq/fastqc/0.11.3
42 | module load seq/STAR/2.5.3a
43 | module load seq/samtools/1.3
44 | PATH=/opt/bcbio/centos/bin:$PATH 	# for using featureCounts if not already in $PATH
45 | 
46 | echo "Processing file $fq"
47 | 
48 | # Run FastQC and move output to the appropriate folder
49 | fastqc $fq
50 | 
51 | # Run STAR
52 | STAR --runThreadN $cores --genomeDir $genome --readFilesIn $fq --outFileNamePrefix $align_out --outFilterMultimapNmax 10 --outSAMstrandField intronMotif --outReadsUnmapped Fastx --outSAMtype BAM SortedByCoordinate --outSAMunmapped Within --outSAMattributes NH HI NM MD AS
53 | 
54 | # Create BAM index
55 | samtools index $counts_input_bam
56 | 
57 | # Count mapped reads
58 | featureCounts -T $cores -s 2 -a $gtf -o $counts $counts_input_bam
59 | 


--------------------------------------------------------------------------------
/scripts/salmon_all_samples.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash/
 2 | 
 3 | for fq in /groups/hbctraining/unix_workshop_other/full_dataset/*.fastq
 4 |  do 
 5 |    base=`basename $fq .fastq`
 6 |    bsub -q mcore -n 6 -W 1:30 -R "rusage[mem=4000]" -J $base.mov10_salmon -o %J.$base.out -e %J.$base.err \
 7 |    salmon quant -i /groups/hbctraining/unix_workshop_other/salmon.ensembl37.idx/ \
 8 |    -p 6 -l SR -r $fq --useVBOpt --numBootstraps 30 -o $base.salmon
 9 |  done
10 | 


--------------------------------------------------------------------------------