├── README.md ├── _config.yml ├── activities └── practice_exercise.md ├── answer_key ├── exp_design_table_answer_key.xlsx └── salmon_all_samples.sbatch ├── assets ├── css │ └── style.scss └── images │ └── dna-sequence-1600x800.jpg ├── data └── exp_design_table.xlsx ├── fastqc ├── Icons │ ├── error.png │ ├── fastqc_icon.png │ ├── tick.png │ └── warning.png ├── Images │ ├── duplication_levels copy.png │ ├── duplication_levels.png │ ├── kmer_profiles.png │ ├── per_base_gc_content.png │ ├── per_base_n_content.png │ ├── per_base_quality.png │ ├── per_base_sequence_content.png │ ├── per_sequence_gc_content.png │ ├── per_sequence_quality.png │ └── sequence_length_distribution.png └── Mov10oe_1-fastqc_report.html ├── img ├── FastQC_contam.png ├── FastQC_seq_qual.png ├── FileZilla_click.gif ├── FileZilla_information.gif ├── Filezilla_step1.png ├── Filezilla_step2.png ├── Gene_products.png ├── Gene_structure.png ├── IGV_mov10.png ├── O2_login.gif ├── O2_primary-storage.png ├── Overrepresented_sequences_adaptor_only.png ├── Overrepresented_sequences_homopolymers.png ├── Per_base_sequence_content_bad.png ├── Per_sequence_GC_content_bad.png ├── Per_sequence_GC_content_good.png ├── Per_sequence_quality_scores_bad.png ├── Per_sequence_quality_scores_good.png ├── QC_workflow_Sept2018.png ├── README.md ├── RNA-seq_library_prep.png ├── RNAseqWorkflow.png ├── R_screenshot.png ├── R_screenshot2.png ├── Rstudio_interface.png ├── SAM_file.png ├── Slide1.jpg ├── alignment_STAR_step1.ai ├── alignment_STAR_step1.png ├── alignment_STAR_step2.ai ├── alignment_STAR_step2.png ├── alignment_STAR_step3.ai ├── alignment_STAR_step3.png ├── alignment_STAR_step4.ai ├── alignment_STAR_step4.png ├── alignment_STAR_step5.ai ├── alignment_STAR_step5.png ├── alignmentfree_workflow_aug2017.png ├── alignmentfree_workflow_june2017.png ├── bad_quality.png ├── base_calling.png ├── batch_effect.png ├── batch_effect_pca.png ├── bioconductor_logo.png ├── bitwiseflags.png ├── cigar_strings.png ├── 
clonal_amplification.png ├── cluster_generation.png ├── complete_wd_setup.png ├── confounded_batch.png ├── confounded_design.png ├── console.png ├── corr_map.png ├── count-fig1.png ├── count-fig2.png ├── count-matrix.png ├── count_matrix.png ├── counts-workflow.jpg ├── counts-workflow.png ├── counts_view.png ├── cran_packages.png ├── data-lifecycle-base.png ├── data_life_cycle_gouldv2.png ├── de_norm_counts_var.png ├── de_replicates_img.png ├── de_replicates_img2.png ├── de_variation.png ├── demultiplexing.png ├── drawings.pptx ├── environment.png ├── exp_design.png ├── factors.png ├── factors_both.png ├── factors_new.png ├── factors_sm.png ├── factors_sm_intact.png ├── fastqc_GC.png ├── fastqc_basic_stats.png ├── fastqc_duplication.png ├── fastqc_over-represented_sequences.png ├── fastqc_per_base_sequence_content.png ├── fastqc_per_sequence_quality_scores.png ├── fastqc_results.png ├── fastqc_summary.png ├── feature-overlap.png ├── filezilla_login.png ├── filezilla_setup.png ├── flow_cell_oligos.png ├── flow_cells.png ├── full_workflow_2019.png ├── full_workflow_Sept2018.png ├── full_workflow_qualimap_2019.png ├── gProfiler.png ├── gene_expression2.png ├── gene_expression_cells.png ├── genemania.png ├── getwd.png ├── good_quality.png ├── gvng.jpg ├── igv_screenshot.png ├── illumina_platforms.png ├── illumina_sequencing.png ├── illumina_sequencing_process.png ├── libraryprep_step1-2.png ├── libraryprep_step3.png ├── libraryprep_step4-5.png ├── libraryprep_step6.png ├── long_read_tech.png ├── metadata_batch.png ├── multiqc_GC_content.png ├── multiqc_alignment_scores.png ├── multiqc_alignment_scores1.png ├── multiqc_columns.png ├── multiqc_coverage_profile.png ├── multiqc_coverage_profile1.png ├── multiqc_duplicates.png ├── multiqc_table.png ├── multiqc_table1.png ├── multithreaded_hpc_3samples.png ├── non_confounded_design.png ├── paired-end_data.png ├── paired_end_reads.png ├── pca_plot.png ├── permission-directory.png ├── placeholder.png ├── 
pseudo_count_comparison-cufflinks.png ├── pseudo_count_comparison-sailfish.png ├── pseudo_count_comparison-sailfish_sm.png ├── pseudo_count_comparison-star.png ├── pseudo_count_comparison-star_sm.png ├── pseudo_count_comparison.gif ├── pseudo_count_comparison.png ├── qc_cycles_lost.png ├── qc_manifold_burst.png ├── qc_overclustering.png ├── qc_phasing.png ├── qc_read2_failed.png ├── qc_signal_decay.png ├── qc_troubleshooting.png ├── qualimap_coverage_profile.png ├── qualimap_genomic _origin.png ├── qualimap_genomic_feature.png ├── qualimap_genomic_origin.png ├── qualimap_genomic_origin1.png ├── qualimap_junctions.png ├── qualimap_read_alignment.png ├── qualimap_transcript_coverage.png ├── r_starting_how_it_should_like.png ├── replicates.png ├── rnaseq_salmon_workflow.png ├── rnaseq_workflow.png ├── rnaseq_workflow_FASTQC.png ├── rnaseq_workflow_trimming.png ├── rrna.png ├── rstudio_logo.png ├── salmon_plot_multiqc.png ├── salmon_plot_multiqc1.png ├── salmon_quasialignment.png ├── salmon_rstudio.png ├── salmon_workflow.png ├── salmon_workflow_subset.png ├── sam_bam.png ├── sam_bam2.png ├── sam_bam3.png ├── sbs_image.png ├── scratch3_best-practice.png ├── scratch_recommended_practice.png ├── serial_hpc_3samples.png ├── star.png ├── teachin-team.png ├── union.png ├── vim_insert.png ├── vim_postsave.png ├── vim_quit.png ├── vim_save.png ├── vim_spider.png ├── vim_spider_number.png ├── why_R.png ├── workflow_align_qualimap.png ├── workflow_alignment.png ├── workflow_salmon.png └── wrap_option.png ├── lectures ├── 2_day │ ├── HPC_intro_O2.pdf │ ├── Intro_to_workshop.pdf │ ├── RNAseq-analysis-methods.pdf │ ├── Wrap_up.pdf │ └── rna-seq_design.pdf ├── Intro_to_workshop.pdf ├── RNA-seq_troubleshooting.pdf ├── alignment_quantification.pdf ├── expression_quantification.pdf ├── workflow_overview.pdf ├── workshop_intro_slides.pdf ├── workshop_wrapup.pdf └── workshop_wrapup_slides.pdf ├── lessons ├── 01_intro-to-RNAseq.md ├── 02_experimental_planning_considerations.md ├── 
03_working_on_HPC.md ├── 04a_data_organization.md ├── 04b_data_organization.md ├── 05_qc_running_fastqc_interactively.md ├── 06_qc_running_fastqc_sbatch.md ├── 07_qc_fastqc_assessment.md ├── 08_quasi_alignment_salmon.md ├── 09_quasi_alignment_salmon_sbatch.md ├── 10_QC_Qualimap.md ├── 11_multiQC.md ├── 12_automating_workflow.md ├── 2day_rnaseq_workflow.md ├── DE_analysis.md ├── QC_STAR_and_Qualimap_run.md ├── STAR Alignment Strategy.md ├── STAR_alignment.md ├── STAR_alignment_strategy.md ├── alignment_quality.md ├── counting_reads.md ├── fastqc-troubleshooting.md ├── more_bash_cluster.md ├── rnaseq_workflow.md ├── sam.md ├── shell_review.md ├── shell_review_answer_key.md ├── test.md └── working_on_HPC_noExercises.md ├── multiqc └── multiqc_report_rnaseq.html ├── schedule ├── README.md └── links-to-lessons.md └── scripts ├── PE-rnaseq_analysis_on_allfiles_for-slurm.sh ├── PE-rnaseq_analysis_on_input_file.sh ├── mov10_fastqc.run ├── rnaseq_analysis_on_allfiles_for-slurm.sh ├── rnaseq_analysis_on_input_file.sh ├── salmon_all_files_PE.sh └── star_genome_index.run /README.md: -------------------------------------------------------------------------------- 1 | # Introduction to bulk RNA-seq: From reads to count matrix 2 | 3 | | Audience | Computational skills required | Duration | 4 | :----------|:----------|:----------| 5 | | Biologists | [Shell for Bioinformatics](https://hbctraining.github.io/Shell-for-bioinformatics/) | 3-session online workshop (~7.5 hours of trainer-led time) | 6 | 7 | ### Description 8 | 9 | This repository has teaching materials for a 3-day **Introduction to bulk RNA-seq: From reads to count matrix** workshop. This workshop focuses on teaching basic computational skills to enable the effective use of a high-performance computing environment to implement an RNA-seq data analysis workflow. 
In addition to running the RNA-seq workflow from FASTQ files to count data using Salmon, the workshop covers best practice guidelines for RNA-seq experimental design and data organization/management. 10 | 11 | > **Pre-requisite for this workshop:** The *Basic Data Skills* [Shell for Bioinformatics](https://hbctraining.github.io/Shell-for-bioinformatics/) workshop or a working knowledge of the command line and cluster computing. 12 | 13 | **Note for Trainers:** Please note that the schedule linked below assumes that learners will spend between 3 and 4 hours reading through, and completing exercises from, selected lessons between classes. 14 | 15 | > These materials were developed for a trainer-led workshop, but are also amenable to self-guided learning. 16 | 17 | ### Learning Objectives 18 | 19 | 1. Utilize the command line interface (bash) and HPC for analyzing high-throughput sequencing data. 20 | 2. Understand best practices for designing an RNA-seq experiment. 21 | 3. Perform read-level QC on bulk RNA-seq data. 22 | 4. Quantify reads from bulk RNA-seq to generate a counts matrix. 23 | 24 | ### Lessons 25 | 26 | * [Workshop schedule (trainer-led learning)](schedule/) 27 | * [Self-learning](schedule/links-to-lessons.md) 28 | 29 | ### Installation Requirements 30 | 31 | ***All:*** 32 | 33 | * [FileZilla Client](https://filezilla-project.org/download.php?type=client) (make sure you get 'FileZilla Client') 34 | 35 | ***Mac users:*** 36 | 37 | * Plain text editor like [Sublime text](http://www.sublimetext.com/) or similar 38 | 39 | ***Windows users:*** 40 | 41 | * [GitBash](https://git-scm.com/download/win) 42 | * Plain text editor like [Notepad++](http://notepad-plus-plus.org/) or similar 43 | 44 | 45 | --- 46 | 47 | ### Citation 48 | 49 | To cite material from this course in your publications, please use: 50 | 51 | > Mary E. Piper, Meeta Mistry, Jihe Liu, William J. Gammerdinger, & Radhika S. Khetani. (2022, January 10). 
hbctraining/Intro-to-rnaseq-hpc-salmon-flipped: Introduction to RNA-seq using Salmon Lessons from HCBC (first release). Zenodo. https://doi.org/10.5281/zenodo.5833880. RRID:SCR_025373. 52 | 53 | 54 | A lot of time and effort went into the preparation of these materials. Citations help us understand the needs of the community, gain recognition for our work, and attract further funding to support our teaching activities. Thank you for citing this material if it helped you in your data analysis. 55 | 56 | --- 57 | 58 | *These materials have been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.* 59 | 60 | * *Some materials used in these lessons were derived from work that is Copyright © Data Carpentry (http://datacarpentry.org/). 61 | All Data Carpentry instructional material is made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).* 62 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman 2 | title: Introduction to RNA-Seq using high-performance computing 3 | google_analytics: UA-150953419-1 4 | -------------------------------------------------------------------------------- /activities/practice_exercise.md: -------------------------------------------------------------------------------- 1 | _**To perform this exercise you will need an O2 account. 
You can request an account by following the instructions on [O2's account request page](https://harvardmed.service-now.com/stat?id=service_catalog_cards&sys_id=5165e1dbdb209050b642f27139961979&sysparm_category=991a7f2edb890c10b642f2713996196a).**_ 2 | 3 | ## Running the RNA-seq workflow 4 | 5 | We have downloaded the raw FASTQ files from the SRA for the sequencing data used in the paper: [Silencing SMOC2 ameliorates kidney fibrosis by inhibiting fibroblast to myofibroblast transformation](https://pubmed.ncbi.nlm.nih.gov/28422762/). The paper explores kidney fibrosis in wildtype and SMOC2-overexpressing mice.  6 | 7 | >_**NOTE:** If you are interested in downloading other datasets from the SRA, we have [materials](https://hbctraining.github.io/Accessing_public_genomic_data/lessons/downloading_from_SRA.html) available detailing how to do this._ 8 | 9 | ### Set-up 10 | 1. Copy the compressed experimental data folder from `/n/groups/hbctraining/kidney_fibrosis_rnaseq.tar.gz` to your own `/n/scratch3/users/ecommonsID` directory. 11 | 2. Extract the directory using the command `tar -xzvf kidney_fibrosis_rnaseq.tar.gz`. This command may take a while to run. 12 | 3. Look inside the directory; you should find the following: 13 | 14 | - a `raw_fastq` folder containing the raw fastq files 15 | - a `meta` folder with a metadata file containing information about each of the samples 16 | 4. Create a `reference_data` folder and download the transcriptome FASTA file for mouse to the folder. 17 | 18 | - For Ensembl references, go to [http://useast.ensembl.org/info/data/ftp/index.html](http://useast.ensembl.org/info/data/ftp/index.html) 19 | - Find the mouse species row and click on the *FASTA* link in the **cDNA (FASTA)** column. 20 | - Right-click on the link for the `*cdna.all.fa.gz` file to copy it. 21 | - Navigate to the `reference_data` folder and run the command `wget` followed by the link you just copied. This should download the transcriptome FASTA file to the directory. 
22 | - Extract the `*cdna.all.fa.gz` file by running the code: `gzip -d *cdna.all.fa.gz`. 23 | 5. Set-up additional expected folders (e.g. results, etc.) for your project (i.e. create subdirectories and additional directories where you feel is necessary).  24 | 25 | ### Analysis 26 | Using the workflow and submission scripts we generated in class, parallelize the RNA-Seq analysis of all files in this dataset. For each FASTQ file you will need to perform the following: 27 | 28 | - Run FastQC 29 | - Generate abundance estimates with Salmon 30 | - Evaluate the MultiQC report 31 | 32 | **HINT: You will need to create a mouse index for Salmon.**  33 | -------------------------------------------------------------------------------- /answer_key/exp_design_table_answer_key.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/answer_key/exp_design_table_answer_key.xlsx -------------------------------------------------------------------------------- /answer_key/salmon_all_samples.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -p short 4 | #SBATCH -c 6 5 | #SBATCH -t 0-12:00 6 | #SBATCH --mem 8G 7 | #SBATCH --job-name salmon_in_serial 8 | #SBATCH -o %j.out 9 | #SBATCH -e %j.err 10 | #SBATCH --mail-type=END 11 | #SBATCH --mail-user=xyz10@harvard.edu 12 | 13 | # Load Salmon module 14 | module load salmon/1.8.0 15 | 16 | # Change to the Salmon results directory; stop here if it does not exist so output is not written elsewhere 17 | cd ~/rnaseq/results/salmon || exit 1 18 | 19 | # Run salmon quant on each .fq file in raw_data, one sample per loop iteration 20 | 21 | for fq in ~/rnaseq/raw_data/*.fq 22 | 23 | do 24 | 25 | # create a prefix for the output: the file name without its directory or .fq extension 26 | samplename=$(basename "$fq" .fq) 27 | 28 | # run salmon 29 | salmon quant -i /n/groups/hbctraining/rna-seq_2019_02/reference_data/salmon_index \ 30 | -l A \ 31 | -r "$fq" \ 32 | -o 
"${samplename}_salmon" \ 33 | --seqBias \ 34 | --useVBOpt \ 35 | --validateMappings \ 36 | -p 6 37 | 38 | done 39 | 40 | -------------------------------------------------------------------------------- /assets/css/style.scss: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | 4 | @import "{{ site.theme }}"; 5 | 6 | .page-header { color: #fff; text-align: center; background-image: url("../images/dna-sequence-1600x800.jpg"); } 7 | 8 | .main-content h1, .main-content h2, .main-content h3, .main-content h4, .main-content h5, .main-content h6 { margin-top: 2rem; margin-bottom: 1rem; font-weight: normal; color: #000000; } 9 | -------------------------------------------------------------------------------- /assets/images/dna-sequence-1600x800.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/assets/images/dna-sequence-1600x800.jpg -------------------------------------------------------------------------------- /data/exp_design_table.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/data/exp_design_table.xlsx -------------------------------------------------------------------------------- /fastqc/Icons/error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Icons/error.png -------------------------------------------------------------------------------- /fastqc/Icons/fastqc_icon.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Icons/fastqc_icon.png -------------------------------------------------------------------------------- /fastqc/Icons/tick.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Icons/tick.png -------------------------------------------------------------------------------- /fastqc/Icons/warning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Icons/warning.png -------------------------------------------------------------------------------- /fastqc/Images/duplication_levels copy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/duplication_levels copy.png -------------------------------------------------------------------------------- /fastqc/Images/duplication_levels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/duplication_levels.png -------------------------------------------------------------------------------- /fastqc/Images/kmer_profiles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/kmer_profiles.png -------------------------------------------------------------------------------- /fastqc/Images/per_base_gc_content.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/per_base_gc_content.png -------------------------------------------------------------------------------- /fastqc/Images/per_base_n_content.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/per_base_n_content.png -------------------------------------------------------------------------------- /fastqc/Images/per_base_quality.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/per_base_quality.png -------------------------------------------------------------------------------- /fastqc/Images/per_base_sequence_content.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/per_base_sequence_content.png -------------------------------------------------------------------------------- /fastqc/Images/per_sequence_gc_content.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/per_sequence_gc_content.png -------------------------------------------------------------------------------- /fastqc/Images/per_sequence_quality.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/per_sequence_quality.png 
-------------------------------------------------------------------------------- /fastqc/Images/sequence_length_distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/sequence_length_distribution.png -------------------------------------------------------------------------------- /fastqc/Mov10oe_1-fastqc_report.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Mov10_oe_1.fastq FastQC Report 4 | 5 | 194 | 195 | 196 | 197 |
198 |
FastQCFastQC Report
199 |
200 | Wed 30 Sep 2015
201 | Mov10_oe_1.fastq 202 |
203 |
204 |
205 |

Summary

206 | 219 |
220 |
221 |

[OK] Basic Statistics

222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 |
MeasureValue
FilenameMov10_oe_1.fastq
File typeConventional base calls
EncodingSanger / Illumina 1.9
Total Sequences39971841
Filtered Sequences0
Sequence length100
%GC47
256 |
257 |

[OK] Per base sequence quality

258 |

Per base quality graph

259 |
260 |

[OK] Per sequence quality scores

261 |

Per Sequence quality graph

262 |
263 |

[FAIL] Per base sequence content

264 |

Per base sequence content

265 |
266 |

[FAIL] Per base GC content

267 |

Per base GC content graph

268 |
269 |

[OK] Per sequence GC content

270 |

Per sequence GC content graph

271 |
272 |

[OK] Per base N content

273 |

N content graph

274 |
275 |

[OK] Sequence Length Distribution

276 |

Sequence length distribution

277 |
278 |

[FAIL] Sequence Duplication Levels

279 |

Duplication level graph

280 |
281 |

[OK] Overrepresented sequences

282 |

No overrepresented sequences

283 |
284 |

[WARN] Kmer Content

285 |

Kmer graph

286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 |
SequenceCountObs/Exp OverallObs/Exp MaxMax Obs/Exp Position
AAAAA167950154.17486576.0599112
CTGGG83765902.54796586.48415471
TTCTT121619902.5438165.07113466
TCTTC109385402.5295365.1856967
CTTCT108858452.51735045.02551941
CTCCA88042152.3773278.1329461
GGCAG73607852.36464959.0486111
TCCAG84338602.33366925.71225367
CTCCT88622902.26583346.75072051
CAGGA75287552.25458675.91353371
CTTCA91535002.2355536.18047761
CCCAG72169202.2078286.1198581
GCCAG64553702.0237146.27880431
CTGCA72417502.00380925.14367681
CTTGG68970851.85172645.5056731
CTGGA65118451.84642466.71222351
CTCAG64495701.78461157.229481
CTTTT84790451.77348715.91010951
TTTCA79342101.75267365.1872936
TTCAG68307001.70953375.0215247
CTTGA56097651.40396775.22075841
CTCAT49251001.20285385.12731081
CTCAA44892601.15794755.3348221
456 |
457 |
458 | -------------------------------------------------------------------------------- /img/FastQC_contam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/FastQC_contam.png -------------------------------------------------------------------------------- /img/FastQC_seq_qual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/FastQC_seq_qual.png -------------------------------------------------------------------------------- /img/FileZilla_click.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/FileZilla_click.gif -------------------------------------------------------------------------------- /img/FileZilla_information.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/FileZilla_information.gif -------------------------------------------------------------------------------- /img/Filezilla_step1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Filezilla_step1.png -------------------------------------------------------------------------------- /img/Filezilla_step2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Filezilla_step2.png 
-------------------------------------------------------------------------------- /img/Gene_products.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Gene_products.png -------------------------------------------------------------------------------- /img/Gene_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Gene_structure.png -------------------------------------------------------------------------------- /img/IGV_mov10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/IGV_mov10.png -------------------------------------------------------------------------------- /img/O2_login.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/O2_login.gif -------------------------------------------------------------------------------- /img/O2_primary-storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/O2_primary-storage.png -------------------------------------------------------------------------------- /img/Overrepresented_sequences_adaptor_only.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Overrepresented_sequences_adaptor_only.png 
-------------------------------------------------------------------------------- /img/Overrepresented_sequences_homopolymers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Overrepresented_sequences_homopolymers.png -------------------------------------------------------------------------------- /img/Per_base_sequence_content_bad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Per_base_sequence_content_bad.png -------------------------------------------------------------------------------- /img/Per_sequence_GC_content_bad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Per_sequence_GC_content_bad.png -------------------------------------------------------------------------------- /img/Per_sequence_GC_content_good.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Per_sequence_GC_content_good.png -------------------------------------------------------------------------------- /img/Per_sequence_quality_scores_bad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Per_sequence_quality_scores_bad.png -------------------------------------------------------------------------------- /img/Per_sequence_quality_scores_good.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Per_sequence_quality_scores_good.png -------------------------------------------------------------------------------- /img/QC_workflow_Sept2018.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/QC_workflow_Sept2018.png -------------------------------------------------------------------------------- /img/README.md: -------------------------------------------------------------------------------- 1 | ###All images for Session II of NGS Data Analysis Course 2 | -------------------------------------------------------------------------------- /img/RNA-seq_library_prep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/RNA-seq_library_prep.png -------------------------------------------------------------------------------- /img/RNAseqWorkflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/RNAseqWorkflow.png -------------------------------------------------------------------------------- /img/R_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/R_screenshot.png -------------------------------------------------------------------------------- /img/R_screenshot2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/R_screenshot2.png 
-------------------------------------------------------------------------------- /img/Rstudio_interface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Rstudio_interface.png -------------------------------------------------------------------------------- /img/SAM_file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/SAM_file.png -------------------------------------------------------------------------------- /img/Slide1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Slide1.jpg -------------------------------------------------------------------------------- /img/alignment_STAR_step1.ai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step1.ai -------------------------------------------------------------------------------- /img/alignment_STAR_step1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step1.png -------------------------------------------------------------------------------- /img/alignment_STAR_step2.ai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step2.ai -------------------------------------------------------------------------------- 
/img/alignment_STAR_step2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step2.png -------------------------------------------------------------------------------- /img/alignment_STAR_step3.ai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step3.ai -------------------------------------------------------------------------------- /img/alignment_STAR_step3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step3.png -------------------------------------------------------------------------------- /img/alignment_STAR_step4.ai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step4.ai -------------------------------------------------------------------------------- /img/alignment_STAR_step4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step4.png -------------------------------------------------------------------------------- /img/alignment_STAR_step5.ai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step5.ai -------------------------------------------------------------------------------- 
/img/alignment_STAR_step5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step5.png -------------------------------------------------------------------------------- /img/alignmentfree_workflow_aug2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignmentfree_workflow_aug2017.png -------------------------------------------------------------------------------- /img/alignmentfree_workflow_june2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignmentfree_workflow_june2017.png -------------------------------------------------------------------------------- /img/bad_quality.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/bad_quality.png -------------------------------------------------------------------------------- /img/base_calling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/base_calling.png -------------------------------------------------------------------------------- /img/batch_effect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/batch_effect.png -------------------------------------------------------------------------------- /img/batch_effect_pca.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/batch_effect_pca.png -------------------------------------------------------------------------------- /img/bioconductor_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/bioconductor_logo.png -------------------------------------------------------------------------------- /img/bitwiseflags.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/bitwiseflags.png -------------------------------------------------------------------------------- /img/cigar_strings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/cigar_strings.png -------------------------------------------------------------------------------- /img/clonal_amplification.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/clonal_amplification.png -------------------------------------------------------------------------------- /img/cluster_generation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/cluster_generation.png -------------------------------------------------------------------------------- /img/complete_wd_setup.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/complete_wd_setup.png -------------------------------------------------------------------------------- /img/confounded_batch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/confounded_batch.png -------------------------------------------------------------------------------- /img/confounded_design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/confounded_design.png -------------------------------------------------------------------------------- /img/console.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/console.png -------------------------------------------------------------------------------- /img/corr_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/corr_map.png -------------------------------------------------------------------------------- /img/count-fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/count-fig1.png -------------------------------------------------------------------------------- /img/count-fig2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/count-fig2.png -------------------------------------------------------------------------------- /img/count-matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/count-matrix.png -------------------------------------------------------------------------------- /img/count_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/count_matrix.png -------------------------------------------------------------------------------- /img/counts-workflow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/counts-workflow.jpg -------------------------------------------------------------------------------- /img/counts-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/counts-workflow.png -------------------------------------------------------------------------------- /img/counts_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/counts_view.png -------------------------------------------------------------------------------- /img/cran_packages.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/cran_packages.png -------------------------------------------------------------------------------- /img/data-lifecycle-base.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/data-lifecycle-base.png -------------------------------------------------------------------------------- /img/data_life_cycle_gouldv2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/data_life_cycle_gouldv2.png -------------------------------------------------------------------------------- /img/de_norm_counts_var.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/de_norm_counts_var.png -------------------------------------------------------------------------------- /img/de_replicates_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/de_replicates_img.png -------------------------------------------------------------------------------- /img/de_replicates_img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/de_replicates_img2.png -------------------------------------------------------------------------------- /img/de_variation.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/de_variation.png -------------------------------------------------------------------------------- /img/demultiplexing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/demultiplexing.png -------------------------------------------------------------------------------- /img/drawings.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/drawings.pptx -------------------------------------------------------------------------------- /img/environment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/environment.png -------------------------------------------------------------------------------- /img/exp_design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/exp_design.png -------------------------------------------------------------------------------- /img/factors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/factors.png -------------------------------------------------------------------------------- /img/factors_both.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/factors_both.png 
-------------------------------------------------------------------------------- /img/factors_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/factors_new.png -------------------------------------------------------------------------------- /img/factors_sm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/factors_sm.png -------------------------------------------------------------------------------- /img/factors_sm_intact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/factors_sm_intact.png -------------------------------------------------------------------------------- /img/fastqc_GC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_GC.png -------------------------------------------------------------------------------- /img/fastqc_basic_stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_basic_stats.png -------------------------------------------------------------------------------- /img/fastqc_duplication.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_duplication.png -------------------------------------------------------------------------------- 
/img/fastqc_over-represented_sequences.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_over-represented_sequences.png -------------------------------------------------------------------------------- /img/fastqc_per_base_sequence_content.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_per_base_sequence_content.png -------------------------------------------------------------------------------- /img/fastqc_per_sequence_quality_scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_per_sequence_quality_scores.png -------------------------------------------------------------------------------- /img/fastqc_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_results.png -------------------------------------------------------------------------------- /img/fastqc_summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_summary.png -------------------------------------------------------------------------------- /img/feature-overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/feature-overlap.png 
-------------------------------------------------------------------------------- /img/filezilla_login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/filezilla_login.png -------------------------------------------------------------------------------- /img/filezilla_setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/filezilla_setup.png -------------------------------------------------------------------------------- /img/flow_cell_oligos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/flow_cell_oligos.png -------------------------------------------------------------------------------- /img/flow_cells.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/flow_cells.png -------------------------------------------------------------------------------- /img/full_workflow_2019.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/full_workflow_2019.png -------------------------------------------------------------------------------- /img/full_workflow_Sept2018.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/full_workflow_Sept2018.png 
-------------------------------------------------------------------------------- /img/full_workflow_qualimap_2019.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/full_workflow_qualimap_2019.png -------------------------------------------------------------------------------- /img/gProfiler.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/gProfiler.png -------------------------------------------------------------------------------- /img/gene_expression2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/gene_expression2.png -------------------------------------------------------------------------------- /img/gene_expression_cells.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/gene_expression_cells.png -------------------------------------------------------------------------------- /img/genemania.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/genemania.png -------------------------------------------------------------------------------- /img/getwd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/getwd.png -------------------------------------------------------------------------------- 
/img/good_quality.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/good_quality.png -------------------------------------------------------------------------------- /img/gvng.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/gvng.jpg -------------------------------------------------------------------------------- /img/igv_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/igv_screenshot.png -------------------------------------------------------------------------------- /img/illumina_platforms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/illumina_platforms.png -------------------------------------------------------------------------------- /img/illumina_sequencing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/illumina_sequencing.png -------------------------------------------------------------------------------- /img/illumina_sequencing_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/illumina_sequencing_process.png -------------------------------------------------------------------------------- /img/libraryprep_step1-2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/libraryprep_step1-2.png -------------------------------------------------------------------------------- /img/libraryprep_step3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/libraryprep_step3.png -------------------------------------------------------------------------------- /img/libraryprep_step4-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/libraryprep_step4-5.png -------------------------------------------------------------------------------- /img/libraryprep_step6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/libraryprep_step6.png -------------------------------------------------------------------------------- /img/long_read_tech.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/long_read_tech.png -------------------------------------------------------------------------------- /img/metadata_batch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/metadata_batch.png -------------------------------------------------------------------------------- /img/multiqc_GC_content.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_GC_content.png -------------------------------------------------------------------------------- /img/multiqc_alignment_scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_alignment_scores.png -------------------------------------------------------------------------------- /img/multiqc_alignment_scores1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_alignment_scores1.png -------------------------------------------------------------------------------- /img/multiqc_columns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_columns.png -------------------------------------------------------------------------------- /img/multiqc_coverage_profile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_coverage_profile.png -------------------------------------------------------------------------------- /img/multiqc_coverage_profile1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_coverage_profile1.png -------------------------------------------------------------------------------- 
/img/multiqc_duplicates.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_duplicates.png -------------------------------------------------------------------------------- /img/multiqc_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_table.png -------------------------------------------------------------------------------- /img/multiqc_table1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_table1.png -------------------------------------------------------------------------------- /img/multithreaded_hpc_3samples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multithreaded_hpc_3samples.png -------------------------------------------------------------------------------- /img/non_confounded_design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/non_confounded_design.png -------------------------------------------------------------------------------- /img/paired-end_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/paired-end_data.png -------------------------------------------------------------------------------- /img/paired_end_reads.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/paired_end_reads.png -------------------------------------------------------------------------------- /img/pca_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pca_plot.png -------------------------------------------------------------------------------- /img/permission-directory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/permission-directory.png -------------------------------------------------------------------------------- /img/placeholder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/placeholder.png -------------------------------------------------------------------------------- /img/pseudo_count_comparison-cufflinks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pseudo_count_comparison-cufflinks.png -------------------------------------------------------------------------------- /img/pseudo_count_comparison-sailfish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pseudo_count_comparison-sailfish.png -------------------------------------------------------------------------------- /img/pseudo_count_comparison-sailfish_sm.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pseudo_count_comparison-sailfish_sm.png -------------------------------------------------------------------------------- /img/pseudo_count_comparison-star.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pseudo_count_comparison-star.png -------------------------------------------------------------------------------- /img/pseudo_count_comparison-star_sm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pseudo_count_comparison-star_sm.png -------------------------------------------------------------------------------- /img/pseudo_count_comparison.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pseudo_count_comparison.gif -------------------------------------------------------------------------------- /img/pseudo_count_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pseudo_count_comparison.png -------------------------------------------------------------------------------- /img/qc_cycles_lost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qc_cycles_lost.png 
-------------------------------------------------------------------------------- /img/qc_manifold_burst.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qc_manifold_burst.png -------------------------------------------------------------------------------- /img/qc_overclustering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qc_overclustering.png -------------------------------------------------------------------------------- /img/qc_phasing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qc_phasing.png -------------------------------------------------------------------------------- /img/qc_read2_failed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qc_read2_failed.png -------------------------------------------------------------------------------- /img/qc_signal_decay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qc_signal_decay.png -------------------------------------------------------------------------------- /img/qc_troubleshooting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qc_troubleshooting.png 
-------------------------------------------------------------------------------- /img/qualimap_coverage_profile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_coverage_profile.png -------------------------------------------------------------------------------- /img/qualimap_genomic _origin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_genomic _origin.png -------------------------------------------------------------------------------- /img/qualimap_genomic_feature.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_genomic_feature.png -------------------------------------------------------------------------------- /img/qualimap_genomic_origin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_genomic_origin.png -------------------------------------------------------------------------------- /img/qualimap_genomic_origin1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_genomic_origin1.png -------------------------------------------------------------------------------- /img/qualimap_junctions.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_junctions.png -------------------------------------------------------------------------------- /img/qualimap_read_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_read_alignment.png -------------------------------------------------------------------------------- /img/qualimap_transcript_coverage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_transcript_coverage.png -------------------------------------------------------------------------------- /img/r_starting_how_it_should_like.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/r_starting_how_it_should_like.png -------------------------------------------------------------------------------- /img/replicates.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/replicates.png -------------------------------------------------------------------------------- /img/rnaseq_salmon_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/rnaseq_salmon_workflow.png -------------------------------------------------------------------------------- /img/rnaseq_workflow.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/rnaseq_workflow.png -------------------------------------------------------------------------------- /img/rnaseq_workflow_FASTQC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/rnaseq_workflow_FASTQC.png -------------------------------------------------------------------------------- /img/rnaseq_workflow_trimming.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/rnaseq_workflow_trimming.png -------------------------------------------------------------------------------- /img/rrna.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/rrna.png -------------------------------------------------------------------------------- /img/rstudio_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/rstudio_logo.png -------------------------------------------------------------------------------- /img/salmon_plot_multiqc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/salmon_plot_multiqc.png -------------------------------------------------------------------------------- /img/salmon_plot_multiqc1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/salmon_plot_multiqc1.png -------------------------------------------------------------------------------- /img/salmon_quasialignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/salmon_quasialignment.png -------------------------------------------------------------------------------- /img/salmon_rstudio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/salmon_rstudio.png -------------------------------------------------------------------------------- /img/salmon_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/salmon_workflow.png -------------------------------------------------------------------------------- /img/salmon_workflow_subset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/salmon_workflow_subset.png -------------------------------------------------------------------------------- /img/sam_bam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/sam_bam.png -------------------------------------------------------------------------------- /img/sam_bam2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/sam_bam2.png -------------------------------------------------------------------------------- /img/sam_bam3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/sam_bam3.png -------------------------------------------------------------------------------- /img/sbs_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/sbs_image.png -------------------------------------------------------------------------------- /img/scratch3_best-practice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/scratch3_best-practice.png -------------------------------------------------------------------------------- /img/scratch_recommended_practice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/scratch_recommended_practice.png -------------------------------------------------------------------------------- /img/serial_hpc_3samples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/serial_hpc_3samples.png -------------------------------------------------------------------------------- /img/star.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/star.png -------------------------------------------------------------------------------- /img/teachin-team.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/teachin-team.png -------------------------------------------------------------------------------- /img/union.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/union.png -------------------------------------------------------------------------------- /img/vim_insert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/vim_insert.png -------------------------------------------------------------------------------- /img/vim_postsave.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/vim_postsave.png -------------------------------------------------------------------------------- /img/vim_quit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/vim_quit.png -------------------------------------------------------------------------------- /img/vim_save.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/vim_save.png 
-------------------------------------------------------------------------------- /img/vim_spider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/vim_spider.png -------------------------------------------------------------------------------- /img/vim_spider_number.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/vim_spider_number.png -------------------------------------------------------------------------------- /img/why_R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/why_R.png -------------------------------------------------------------------------------- /img/workflow_align_qualimap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/workflow_align_qualimap.png -------------------------------------------------------------------------------- /img/workflow_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/workflow_alignment.png -------------------------------------------------------------------------------- /img/workflow_salmon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/workflow_salmon.png -------------------------------------------------------------------------------- 
/img/wrap_option.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/wrap_option.png -------------------------------------------------------------------------------- /lectures/2_day/HPC_intro_O2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/2_day/HPC_intro_O2.pdf -------------------------------------------------------------------------------- /lectures/2_day/Intro_to_workshop.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/2_day/Intro_to_workshop.pdf -------------------------------------------------------------------------------- /lectures/2_day/RNAseq-analysis-methods.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/2_day/RNAseq-analysis-methods.pdf -------------------------------------------------------------------------------- /lectures/2_day/Wrap_up.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/2_day/Wrap_up.pdf -------------------------------------------------------------------------------- /lectures/2_day/rna-seq_design.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/2_day/rna-seq_design.pdf 
-------------------------------------------------------------------------------- /lectures/Intro_to_workshop.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/Intro_to_workshop.pdf -------------------------------------------------------------------------------- /lectures/RNA-seq_troubleshooting.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/RNA-seq_troubleshooting.pdf -------------------------------------------------------------------------------- /lectures/alignment_quantification.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/alignment_quantification.pdf -------------------------------------------------------------------------------- /lectures/expression_quantification.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/expression_quantification.pdf -------------------------------------------------------------------------------- /lectures/workflow_overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/workflow_overview.pdf -------------------------------------------------------------------------------- /lectures/workshop_intro_slides.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/workshop_intro_slides.pdf -------------------------------------------------------------------------------- /lectures/workshop_wrapup.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/workshop_wrapup.pdf -------------------------------------------------------------------------------- /lectures/workshop_wrapup_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/workshop_wrapup_slides.pdf -------------------------------------------------------------------------------- /lessons/04a_data_organization.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Data Management and Project Organization 3 | author: Mary Piper, Meeta Mistry, Michael Steinbaugh, Radhika Khetani, Jihe Liu 4 | date: November 9, 2020 5 | duration: 35 6 | --- 7 | 8 | [Data management slides]: https://github.com/hbc/NGS-Data-Analysis-long-course/raw/Fall_2016/sessionI/slides/data_management.pdf 9 | 10 | ## Learning Objectives 11 | 12 | - Explain the need for data management. 13 | - Describe strategies for managing your own projects. 14 | 15 | 16 | ## What is data management? 17 | 18 | In this course we teach you how to independently analyze your own sequencing data, so naturally we should begin with what to do once you get the raw data back from the sequencing facility. Obviously, we begin with the analysis! Right? 19 | 20 | Wrong. 21 | 22 | One of the most important parts of research that involves large amounts of data is how best to manage it. Once data is generated we tend to prioritize the analysis. 
**In the excitement to get a first look at new data, there are many important aspects that are often overlooked.** 23 | 24 | Wait, don't leave this page just yet! 25 | 26 | We know that data management can be hard to get excited about. However, **ignoring it can be detrimental to your research.** Here are just a few reasons **why data management should matter to you**: 27 | 28 | * It will make your life easier. It's easier to analyze organized and well-documented data. 29 | * Your future self will thank you. Managing your data well from the get-go means it's easier to retrieve at a later date. 30 | * Data is precious. Thinking ahead about things like storage means you reduce the risk of losing it. 31 | * Funding agencies are increasingly mandating that research projects are developed with a data management plan. 32 | 33 | ### Data Lifecycle 34 | The data lifecycle displayed below, courtesy of the [HMS Data Management Working Group](https://datamanagement.hms.harvard.edu/), illustrates some things to consider beyond data creation and analysis. Below, we discuss components of the lifecycle and how they apply to any NGS experiment. 35 | 36 |

37 | 38 |

39 | 40 | _Image acquired from the [Harvard Biomedical Data Management Website](https://datamanagement.hms.harvard.edu/data-lifecycle)_ 41 | 42 | ### Plan and Design 43 | You should approach your sequencing project in much the same way as you would any biological experiment; ideally, it begins with a good **experimental design**. You want to think about the experiment at the outset and collect appropriate samples such that you have enough statistical power to make the comparisons you need. In a later lesson, we delve more into the details of planning and the experimental design considerations. Planning for your computational work is just as important as planning when working on the bench. Every computational analysis you do is going to spawn many files and you will want to think about short-term storage options for your data and computational resources for analyzing it. 44 | 45 | ### Collect and Create 46 | The next step is preparing samples as required. During this stage it is important to keep track of how the experiment was performed, making sure to clearly document the source of starting materials and kits used. It is also best practice to include information about any small variations within the experiment (across the different samples being prepared) or any changes relative to standard experiment protocols. This collection of information serves as the **metadata of the experiment** which will prove to be very useful during the analysis stage. 47 | 48 | ### Analyze and Collaborate 49 | Once you have the sequencing data back from the sequencing facility, it's time to analyze it. The process of data analysis should be well documented to ensure reproducibility and also for ease of collaboration. We will spend some more time on this component of the lifecycle later in class, as it applies to our dataset. 50 | 51 | ### Evaluate and Archive 52 | When the analysis is complete you will want to think about which files are most pertinent to keep. 
Consider long-term storage options for your data that meet requirements of NIH, other funding agencies, and any guidelines from your institution. 53 | 54 | ### Disseminate and share 55 | The results of your analysis will hopefully generate some exciting findings that will be beneficial to the scientific community. At this stage in the lifecycle you rely on your previous steps of documentation to turn those notes into a clear and concise methods section of your manuscript. 56 | 57 | ### Access and Reuse 58 | In addition to sharing information on the analysis, you should plan for sharing the data. It has become increasingly common for researchers to make their data available to others when they complete a study. While a major reason for sharing is compliance (with journals or research funding organizations), there are also important research benefits including reproducibility and data sharing and reuse. 59 | 60 | 61 | **Resources** 62 | 63 | * The [HMS Data Management Working Group (DMWG)'s website](https://datamanagement.hms.harvard.edu/) 64 | * A guide from the [Harvard library](http://guides.library.harvard.edu/dmp). 65 | * **Sign-up** for the [DMWG quarterly newsletter](https://datamanagement.hms.harvard.edu/dmwg-newsletter) for helpful tips, classes and events related to data management 66 | 67 | 68 | --- 69 | 70 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.* 71 | 72 | * *The materials used in this lesson were derived from work that is Copyright © Data Carpentry (http://datacarpentry.org/). 
73 | All Data Carpentry instructional material is made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).* 74 | * *Adapted from the lesson by Tracy Teal. Original contributors: Paul Wilson, Milad Fatenejad, Sasha Wood and Radhika Khetani for Software Carpentry (http://software-carpentry.org/)* 75 | 76 | -------------------------------------------------------------------------------- /lessons/04b_data_organization.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Data Management and Project Organization - In-class 3 | author: Mary Piper, Meeta Mistry, Michael Steinbaugh, Radhika Khetani, Jihe Liu 4 | date: November 9, 2020 5 | duration: 35 6 | --- 7 | 8 | [Data management slides]: https://github.com/hbc/NGS-Data-Analysis-long-course/raw/Fall_2016/sessionI/slides/data_management.pdf 9 | [SRA]: http://www.ncbi.nlm.nih.gov/sra "Sequence Read Archive" 10 | 11 | ## Learning Objectives 12 | 13 | - Describe the example RNA-seq experiment and its objectives. 14 | - Demonstrate strategies for good data management and project organization. 15 | 16 | ## The Dataset 17 | 18 | The dataset we are using for this workshop is part of a larger study described in [Kenny PJ et al., *Cell Rep* 2014](http://www.ncbi.nlm.nih.gov/pubmed/25464849). The authors are investigating interactions between various genes involved in Fragile X syndrome, a disease of aberrant protein production, which results in cognitive impairment and autistic-like features. **The authors sought to show that RNA helicase MOV10 regulates the translation of RNAs involved in Fragile X syndrome.** 19 | 20 | ### Raw data 21 | 22 | From this study we are using the [RNA-seq](http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE50499) data which is publicly available in the [Sequence Read Archive (SRA)](https://www.ncbi.nlm.nih.gov/sra/?term=SRP029367). 
23 | 24 | > **NOTE:** If you are interested in how to obtain publicly available sequence data from the SRA we have some materials on this [linked here](https://hbctraining.github.io/Accessing_public_genomic_data/lessons/downloading_from_SRA.html). 25 | 26 | ### Metadata 27 | 28 | In addition to the raw sequence data we also need to collect **information about the data**, also known as **metadata**. We are usually quick to want to begin analysis of the sequence data (FASTQ files), but how useful is it if we know nothing about the samples that this sequence data originated from? 29 | 30 | Some relevant metadata for our dataset is provided below: 31 | 32 | * The RNA was extracted from **HEK293F cells** that were transfected with a **MOV10 transgene**, **MOV10 siRNA**, or an **irrelevant siRNA**. (*For this workshop we won't be using the MOV10 knockdown samples.*) 33 | * The libraries for this dataset are **stranded** and were generated using the standard TruSeq prep kit (using the dUTP method). 34 | * Sequencing was carried out on the **Illumina HiSeq-2500** and **100 bp single-end** reads were generated. 35 | * The full dataset was sequenced to **~40 million reads** per sample, but for this workshop we will be looking at a small subset on chr1 (~300,000 reads/sample). 36 | * For each group we have three replicates as described in the figure below. 37 | 38 |

39 | 40 |

41 | 42 | 43 | ## Implementing data management best practices 44 | 45 | In a [previous lesson](04a_data_organization.md) we described the data lifecycle and the **different aspects to consider when working on your own projects**. Here, we implement some of those strategies to get ourselves set up before we begin with any analysis. 46 | 47 |

48 | 49 |

50 | 51 | _Image acquired from the [Harvard Biomedical Data Management Website](https://datamanagement.hms.harvard.edu/data-lifecycle)_ 52 | 53 | ### Planning and organization 54 | 55 | For each experiment you work on and analyze data for, it is considered best practice to get organized by creating a planned storage space (directory structure). We will start by creating a directory that we can use for the rest of the workshop. First, make sure that you are in your home directory. 56 | 57 | ```bash 58 | $ cd 59 | $ pwd 60 | ``` 61 | 62 | This should return `/home/rc_training`. Create the directory `rnaseq` and move into it. 63 | 64 | ```bash 65 | $ mkdir rnaseq 66 | $ cd rnaseq 67 | ``` 68 | 69 | Next, we will create a project directory and set up the following structure to keep our files organized. 70 | 71 | ```bash 72 | rnaseq 73 | ├── logs 74 | ├── meta 75 | ├── raw_data 76 | ├── results 77 | └── scripts 78 | ``` 79 | 80 | *This is a generic structure and can be tweaked based on personal preference and the analysis workflow.* 81 | 82 | - `logs`: to keep track of the commands run and the specific parameters used, but also to have a record of any standard output that is generated while running the command. 83 | - `meta`: for any information that describes the samples you are using, which we refer to as [metadata](https://datamanagement.hms.harvard.edu/metadata-overview). 84 | - `raw_data`: for any **unmodified** (raw) data obtained prior to computational analysis here, e.g. FASTQ files from the sequencing center. We strongly recommend leaving this directory unmodified through the analysis. 85 | - `results`: for output from the different tools you implement in your workflow. Create sub-folders specific to each tool/step of the workflow within this folder. 86 | - `scripts`: for scripts that you write and use to run analyses/workflow. 
87 | 88 | 89 | ```bash 90 | $ mkdir logs meta raw_data results scripts 91 | ``` 92 | 93 | > #### File naming conventions 94 | > 95 | > Another aspect of staying organized is making sure that all the directories and filenames for an analysis are as consistent as possible. You want to avoid names like `alignment1.bam`, and rather have names like `20170823_kd_rep1_gmap-1.4.bam` which provide a basic level of information about the file. [This link](https://datamanagement.hms.harvard.edu/file-naming-conventions) and [this slideshow](http://www2.stat.duke.edu/~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf) have some good guidelines for file naming dos and don'ts. 96 | 97 | 98 | ### Documentation 99 | 100 | In your lab notebook, you likely keep track of the different reagents and kits used for a specific protocol. Similarly, recording information about the tools used in the workflow is important for documenting your computational experiments. 101 | 102 | - **Make note of the software you use.** Do your research and find out what tools are best for the data you are working with. Don't just work with tools that you are able to easily install. 103 | - **Keep track of software versions.** Keep up with the literature and make sure you are using the most up-to-date versions. 104 | - **Record information on parameters used and summary statistics** at every step (e.g., how many adapters were removed, how many reads did not align) 105 | - A general rule of thumb is to test on a single sample or a subset of the data before running your entire dataset through. This will allow you to debug quicker and give you a chance to also get a feel for the tool and the different parameters. 106 | - Different tools have different ways of reporting log messages to the terminal. You might have to experiment a bit to figure out what output to capture. 
You can redirect standard output with the `>` symbol which is equivalent to `1> (standard out)`; other tools might require you to use `2>` to re-direct the `standard error` instead. 107 | 108 | #### README files 109 | 110 | After setting up the directory structure it is useful to have a **[README file](https://datamanagement.hms.harvard.edu/readme-files) within your project directory**. This is a plain text file containing a short summary about the project and a description of the files/directories found within it. An example README is shown below. It can also be helpful to include a README within each sub-directory with any information pertaining to the analysis. 111 | 112 | ``` 113 | ## README ## 114 | ## This directory contains data generated during the Introduction to RNA-seq workshop 115 | ## Date: 116 | 117 | There are five subdirectories in this directory: 118 | 119 | raw_data : contains raw data 120 | meta: contains... 121 | logs: 122 | results: 123 | scripts: 124 | ``` 125 | 126 | *** 127 | 128 | **Exercise** 129 | 130 | 1. Take a moment to create a README for the `rnaseq/` folder (hint: use `vim` to create the file). Give a short description of the project and brief descriptions of the types of files you will be storing within each of the sub-directories. 131 | 132 | *** 133 | 134 | 135 | ### Obtaining data 136 | 137 | Let's populate the `rnaseq/` project with some data. The FASTQ files are located on the O2 cluster in the `/n/groups` space. Copy them over from the path shown below, into your `raw_data` directory: 138 | 139 | ```bash 140 | $ cp /n/groups/hbctraining/unix_lesson/raw_fastq/*.fq ~/rnaseq/raw_data/ 141 | ``` 142 | 143 | > **NOTE**: When obtaining data from your sequencing facility, the data will not be stored on O2 and so a simple copy command (`cp`) will not suffice. The raw sequence data will likely be located on another remote computer/server that is hosted by the sequencing facility and you will be given login credentials to access it. 
To copy it over you can use commands like `rsync`, `wget` or `scp`. These are all commands that can help securely copy the data over to the appropriate location on O2. We have some information [linked here](more_bash_cluster.md#copying-files-to-and-from-the-cluster-) if you would like to learn more. 144 | 145 | Now the structure of `rnaseq/` should look like this: 146 | 147 | ```bash 148 | rnaseq 149 | ├── logs 150 | ├── meta 151 | ├── raw_data 152 | │   ├── Irrel_kd_1.subset.fq 153 | │   ├── Irrel_kd_2.subset.fq 154 | │   ├── Irrel_kd_3.subset.fq 155 | │   ├── Mov10_oe_1.subset.fq 156 | │   ├── Mov10_oe_2.subset.fq 157 | │   └── Mov10_oe_3.subset.fq 158 | ├── README.txt 159 | ├── results 160 | └── scripts 161 | ``` 162 | 163 | Okay, we are all set to begin the analysis! 164 | 165 | 166 | --- 167 | 168 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.* 169 | 170 | * *The materials used in this lesson were derived from work that is Copyright © Data Carpentry (http://datacarpentry.org/). 171 | All Data Carpentry instructional material is made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).* 172 | * *Adapted from the lesson by Tracy Teal. 
Original contributors: Paul Wilson, Milad Fatenejad, Sasha Wood and Radhika Khetani for Software Carpentry (http://software-carpentry.org/)* 173 | 174 | -------------------------------------------------------------------------------- /lessons/05_qc_running_fastqc_interactively.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Quality control using FASTQC" 3 | author: "Mary Piper, Radhika Khetani, Meeta Mistry, Jihe Liu" 4 | date: Friday, October 30, 2020 5 | duration: 45 minutes 6 | --- 7 | 8 | ## Learning Objectives: 9 | 10 | * Describe the contents and format of a FASTQ file 11 | * Create a quality report using FASTQC 12 | 13 | ## Quality Control of FASTQ files 14 | 15 | 16 | The first step in the RNA-Seq workflow is to take the FASTQ files received from the sequencing facility and assess the quality of the sequence reads. 17 | 18 |

19 | 20 |

21 | 22 | ### Unmapped read data (FASTQ) 23 | 24 | The [FASTQ](https://en.wikipedia.org/wiki/FASTQ_format) file format is the de facto file format for sequence reads generated from next-generation sequencing technologies. This file format evolved from FASTA in that it contains sequence data, but also contains quality information. Similar to FASTA, the FASTQ file begins with a header line. The difference is that the FASTQ header is denoted by a `@` character. For a single record (sequence read), there are four lines, each of which is described below: 25 | 26 | |Line|Description| 27 | |----|-----------| 28 | |1|Always begins with '@', followed by information about the read| 29 | |2|The actual DNA sequence| 30 | |3|Always begins with a '+', and sometimes followed by the same info as in line 1| 31 | |4|Has a string of characters representing the quality scores; must have same number of characters as line 2| 32 | 33 | Let's use the following read as an example: 34 | 35 | ``` 36 | @HWI-ST330:304:H045HADXX:1:1101:1111:61397 37 | CACTTGTAAGGGCAGGCCCCCTTCACCCTCCCGCTCCTGGGGGANNNNNNNNNNANNNCGAGGCCCTGGGGTAGAGGGNNNNNNNNNNNNNNGATCTTGG 38 | + 39 | @?@DDDDDDHHH?GH:?FCBGGB@C?DBEGIIIIAEF;FCGGI######################################################### 40 | ``` 41 | 42 | Line 4 has characters encoding the quality of each nucleotide in the read. The legend below provides the mapping of quality scores (Phred-33) to the quality encoding characters. 
*Different quality encoding scales exist (differing by offset in the ASCII table), but note the most commonly used one is fastqsanger, which is the scale output by Illumina since mid-2011.* 43 | ``` 44 | Quality encoding: !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI 45 | | | | | | 46 | Quality score: 0........10........20........30........40 47 | ``` 48 | 49 | Using the quality encoding character legend, the first nucleotide in the read (C) is called with a quality score of 31 (corresponding to encoding character `@`), and our Ns are called with a score of 2 (corresponding to encoding character `#`). **As you can tell by now, this is a bad read.** 50 | 51 | Each quality score represents the probability that the corresponding nucleotide call is incorrect. This quality score is logarithmically based and is calculated as: 52 | 53 | Q = -10 x log10(P), where P is the probability that a base call is erroneous 54 | 55 | These probability values are the results from the base calling algorithm and are dependent on how much signal was captured for the base incorporation. The score values can be interpreted as follows: 56 | 57 | |Phred Quality Score |Probability of incorrect base call |Base call accuracy| 58 | |:-------------------:|:---------------------------------:|:-----------------:| 59 | |10 |1 in 10 | 90%| 60 | |20 |1 in 100| 99%| 61 | |30 |1 in 1000| 99.9%| 62 | |40 |1 in 10,000| 99.99%| 63 | 64 | Therefore, for the first nucleotide in the read (C), there is less than a 1 in 1000 chance that the base was called incorrectly. In contrast, for the end of the read, there is a greater than 50% probability that the base is called incorrectly. 
69 | 70 | [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) provides a simple way to do some quality checks on raw sequence data coming from high throughput sequencing pipelines. It provides a modular set of analyses, which you can use to obtain an impression of whether your data has any problems that you should be aware of before moving on to the next analysis. 71 | 72 | FastQC does the following: 73 | * accepts FASTQ files (or BAM files) as input 74 | * generates summary graphs and tables to help assess your data 75 | * generates an easy-to-view HTML-based report with the graphs and tables 76 | 77 | *** 78 | 79 | > NOTE: Before we run FastQC, **you should be on a compute node** in an interactive session. Please run the following `srun` command if you are not on a compute node. 80 | > 81 | > ```bash 82 | > $ srun --pty -p interactive -t 0-3:00 --mem 1G /bin/bash 83 | > ``` 84 | > 85 | > ***An interactive session is very useful to test tools and workflows.*** 86 | 87 | ### Run FastQC 88 | 89 | Change directories to `raw_data`. 90 | 91 | ```bash 92 | $ cd ~/rnaseq/raw_data 93 | ``` 94 | 95 | Before we start using software, we have to load the module for each tool. On O2, this is done using an **LMOD** system. 96 | 97 | If we check which modules we currently have loaded, we should not see FastQC. 98 | 99 | ```bash 100 | $ module list 101 | ``` 102 | 103 | This is because the FastQC program is not in our $PATH (i.e. it's not in a directory that shell will automatically check to run commands/programs). 104 | 105 | ```bash 106 | $ echo $PATH 107 | ``` 108 | 109 | To run the FastQC program, we first need to load the appropriate module, so it puts the program into our path. 
To find the FastQC module to load we need to search the versions available: 110 | 111 | ```bash 112 | $ module spider fastqc 113 | ``` 114 | 115 | Once we know which version we want to use (0.12.1), we can load the FastQC module: 116 | 117 | ```bash 118 | $ module load fastqc/0.12.1 119 | ``` 120 | 121 | Once a module for a tool is loaded, you have essentially made it directly available to you like any other basic shell command. 122 | 123 | ```bash 124 | $ module list 125 | 126 | $ echo $PATH 127 | ``` 128 | 129 | Now, let's create a directory to store the output of FastQC: 130 | 131 | ```bash 132 | $ mkdir ~/rnaseq/results/fastqc 133 | ``` 134 | 135 | We will need to specify this directory in the command to run FastQC. How do we know which argument to use? 136 | 137 | ```bash 138 | $ fastqc --help 139 | ``` 140 | 141 | > **NOTE:** From the help manual, we know that `-o` (or `--outdir`) will create all output files in the specified output directory. Note that another argument, `-t`, specifies the number of files which can be processed simultaneously. We will use `-t` argument later. You may explore other arguments as well based on your needs. 142 | 143 | FastQC will accept multiple file names as input, so we can use the `*.fq` wildcard. 144 | 145 | ```bash 146 | $ fastqc -o ~/rnaseq/results/fastqc/ *.fq 147 | ``` 148 | 149 | *Did you notice how each file was processed serially? How do we speed this up?* 150 | 151 | FastQC has the capability of splitting up a single process to run on multiple cores! To do this, we will need to specify an additional argument `-t` indicating number of cores. We will also need to exit the current interactive session, since we started this interactive session with only 1 core. We cannot have a tool to use more cores than requested on a compute node. 
152 | 153 | Exit the interactive session and start a new one with 6 cores: 154 | 155 | ```bash 156 | $ exit #exit the current interactive session (you will be back on a login node) 157 | 158 | $ srun --pty -c 6 -p interactive -t 0-3:00 --mem 2G /bin/bash #start a new one with 6 cores (-c 6) and 2GB RAM (--mem 2G) 159 | ``` 160 | 161 | Once you are on the compute node, check what job(s) you have running and what resources you are using. 162 | 163 | ```bash 164 | $ O2squeue 165 | ``` 166 | 167 | Now that we are in a new interactive session with the appropriate resources, we will need to load the module again for this new session. 168 | 169 | ```bash 170 | $ module load fastqc/0.12.1 #reload the module for the new (6-core) interactive session 171 | ``` 172 | 173 | We will also move into the `raw_data` directory (remember we are on a new compute node now): 174 | 175 | ```bash 176 | $ cd ~/rnaseq/raw_data 177 | ``` 178 | 179 | Run FastQC and use the multi-threading functionality of FastQC to run 6 jobs at once (with an additional argument `-t`). 180 | 181 | ```bash 182 | $ fastqc -o ~/rnaseq/results/fastqc/ -t 6 *.fq #note the extra parameter we specified for 6 threads 183 | ``` 184 | 185 | *Do you notice a difference? Is there anything in the output that suggests this is no longer running serially?* 186 | 187 | --- 188 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.* 189 | 190 | * *The materials used in this lesson were derived from work that is Copyright © Data Carpentry (http://datacarpentry.org/). 
191 | All Data Carpentry instructional material is made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).* 192 | -------------------------------------------------------------------------------- /lessons/06_qc_running_fastqc_sbatch.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Quality control using FASTQC - script running" 3 | author: "Mary Piper, Radhika Khetani, Meeta Mistry, Jihe Liu" 4 | date: Friday, October 30, 2020 5 | duration: 45 minutes 6 | --- 7 | 8 | ## Learning Objectives: 9 | 10 | * Create and run a SLURM job submission script to automate quality assessment 11 | 12 | ## Quality Control of FASTQ files 13 | 14 | 15 | ### Performing quality assessment using job submission scripts 16 | So far in our FASTQC analysis, we have been directly submitting commands to O2 using an interactive session (i.e., `srun --pty -c 6 -p interactive -t 0-12:00 --mem 6G --reservation=HBC /bin/bash`). However, there are many [more partitions available on O2](https://wiki.rc.hms.harvard.edu/display/O2/Using+Slurm+Basic#UsingSlurmBasic-Partitions(akaQueues)) than just the interactive partition. We can submit a command or series of commands to these partitions using job submission scripts. 17 | 18 | **Job submission scripts** for O2 are just regular shell scripts, but contain the Slurm **options/directives** for our job submission. These directives define the various resources we are requesting for our job (i.e., *number of cores, name of partition, runtime limit*). 19 | 20 | Submission of the script using the `sbatch` command allows Slurm to run your job when it's your turn. Let's create a job submission script to automate what we have done in the [previous lesson](05_qc_running_fastqc_interactively.md). 21 | 22 | Our script will do the following: 23 | 24 | 1. Change directories to where the FASTQ files are located 25 | 2. Load the FastQC module 26 | 3. 
Run FastQC on all of our FASTQ files 27 | 28 | Let's first change the directory to `~/rnaseq/scripts`, and create a script named `mov10_fastqc.run` using `vim`. 29 | 30 | ```bash 31 | $ cd ~/rnaseq/scripts 32 | 33 | $ vim mov10_fastqc.run 34 | ``` 35 | 36 | Once in the vim editor, click `i` to enter INSERT mode. The first thing we need in our script is the **shebang line**: 37 | 38 | ```bash 39 | #!/bin/bash 40 | ``` 41 | 42 | Following the shebang line are the Slurm directives. For the script to run, we need to include options for **queue/partition (-p) and runtime limit (-t)**. To specify our options, we precede the option with `#SBATCH`. Some key resources to specify are: 43 | 44 | |Resource|Flag|Description| 45 | |:----:|:----:|:----:| 46 | |partition|-p|partition name| 47 | |time|-t|hours:minutes run limit, after which the job will be killed| 48 | |core|-c|number of cores requested -- this needs to be greater than or equal to the number of cores you plan to use to run your job| 49 | |memory|--mem|memory limit per compute node for the job| 50 | 51 | Let's specify those options as follows: 52 | 53 | ```bash 54 | #SBATCH -p short # partition name 55 | #SBATCH -t 0-2:00 # time limit 56 | #SBATCH -c 6 # number of cores 57 | #SBATCH --mem 6G # requested memory 58 | #SBATCH --job-name rnaseq_mov10_fastqc # Job name 59 | #SBATCH -o %j.out # File to which standard output will be written 60 | #SBATCH -e %j.err # File to which standard error will be written 61 | ``` 62 | 63 | Now in the body of the script, we can include any commands we want to run. In this case, it will be the following: 64 | 65 | ```bash 66 | ## Change directories to where the fastq files are located 67 | cd ~/rnaseq/raw_data 68 | 69 | ## Load modules required for script commands 70 | module load fastqc/0.12.1 71 | 72 | ## Run FASTQC 73 | fastqc -o ~/rnaseq/results/fastqc/ -t 6 *.fq 74 | ``` 75 | 76 | > **NOTE:** These are the same commands we used when running FASTQC in the interactive session. 
Since we are writing them in a script, the `tab` completion function will **not work**, so please make sure you don't have any typos when writing the script! 77 | 78 | Once done with your script, click `esc` to exit the INSERT mode. Then save and quit the script by typing `:wq`. You may double check your script by typing `less mov10_fastqc.run`. If everything looks good submit the job! 79 | 80 | ```bash 81 | $ sbatch mov10_fastqc.run 82 | ``` 83 | 84 | You should immediately see a prompt saying `Submitted batch job JobID`. Your job is assigned with that unique identifier `JobID`. You can check on the status of your job with: 85 | 86 | ```bash 87 | $ O2sacct 88 | ``` 89 | 90 | Look for the row that corresponds to your `JobID`. The third column indicates the state of your job. Possible states include `PENDING`, `RUNNING`, `COMPLETED`. Once your job state is `RUNNING`, you should expect it to finish in less than two minutes. When the state is `COMPLETED`, that means your job is finished. 91 | 92 | > **NOTE:** Other helpful options for checking/managing jobs are available as a [cheatsheet](https://wiki.rc.hms.harvard.edu/display/O2/O2+Command+CheatSheet) from HMS-RC. 93 | 94 | Check out the output files in your directory: 95 | ```bash 96 | $ ls -lh ../results/fastqc/ 97 | ``` 98 | There should also be one standard error (`.err`) and one standard out (`.out`) files from the job listed in `~/rnaseq/scripts`. You can move these over to your `logs` directory and give them more intuitive names: 99 | 100 | ```bash 101 | $ mv *.err ../logs/fastqc.err 102 | $ mv *.out ../logs/fastqc.out 103 | ``` 104 | > **NOTE:** The `.err` and `.out` files store log information during the script running. They are helpful resources, especially when your script does not run as expected and you need to troubleshoot the script. 105 | 106 | *** 107 | **Exercise** 108 | 1. Take a look at what's inside the `.err` and `.out` files. What do you observe? 
Do you remember where you saw this information when using the interactive session? 109 | 2. How would you change the `mov10_fastqc.run` script if you had 9 fastq files you wanted to run in parallel? 110 | 111 | --- 112 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.* 113 | 114 | * *The materials used in this lesson were derived from work that is Copyright © Data Carpentry (http://datacarpentry.org/). 115 | All Data Carpentry instructional material is made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).* 116 | -------------------------------------------------------------------------------- /lessons/09_quasi_alignment_salmon_sbatch.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Quantification of transcript abundance using Salmon" 3 | author: "Mary Piper, Meeta Mistry, Radhika Khetani, Jihe Liu" 4 | date: "November 16, 2020" 5 | --- 6 | 7 | Approximate time: 30 minutes 8 | 9 | ## Learning Objectives 10 | 11 | * Create a job submission script to run Salmon on all samples in the dataset 12 | 13 | 14 | ## Running Salmon on multiple samples 15 | 16 | In class we talked in depth about how the Salmon algorithm works, and provided the command required to run Salmon on a single sample. In this lesson we walk through the steps required to **efficiently run Salmon on all samples** in the dataset. 
Unlike our experience with FastQC, where we could use one command and simply provide all files with the use of a wildcard (`*`), Salmon is only able to take a single file as input. 17 | 18 | Rather than typing out the Salmon command six times, we will use **a for loop to iterate over all FASTQ files in our dataset** (inside the `raw_data` directory). Furthermore, rather than running this `for` loop interactively, we will put it inside a text file and create a **job submission script**. 19 | 20 | ### Create a job submission script to run Salmon in serial 21 | 22 | Let's start by opening up a text file in `vim`: 23 | 24 | ``` 25 | $ vim salmon_all_samples.sbatch 26 | ``` 27 | 28 | Begin the script starting with the **shebang line**. 29 | 30 | ```bash 31 | #!/bin/bash 32 | 33 | ``` 34 | *** 35 | 36 | **Exercise 1** 37 | 38 | 1. Add the Slurm directives (i.e., `#SBATCH`) to request specific resources for our job. The resources we need are listed below. 39 | 40 | > **NOTE:** Helpful resources include: 41 | > * This [linked lesson](03_working_on_HPC.md#requesting-resources-from-slurm) 42 | > * [HMS-RC's O2 Wiki](https://wiki.rc.hms.harvard.edu/display/O2/Using+Slurm+Basic) 43 | 44 | * Your job will use the `short` partition 45 | * Request 6 cores to take advantage of Salmon's multi-threading capabilities 46 | * Request 12 hours of runtime 47 | * Request 8G of memory 48 | * Give your job the name `salmon_in_serial` 49 | * Add an email and request to be notified when the job is complete 50 | 51 | *** 52 | 53 | Now that we have the resources requested, we can begin to **add the commands into our shell script**. 54 | 55 | 56 | *** 57 | 58 | **Exercise 2** 59 | 60 | 1. Add a line of code required to load the Salmon module 61 | 2. Add a line of code to change directories to where the Salmon results will be output (be sure to use a full path here). 
62 | 63 | > *Add comments to your script liberally, wherever you feel it's needed.* 64 | 65 | *** 66 | 67 | The last piece of the shell script is the **for loop** code provided below. **Copy and paste this into your script**. 68 | 69 | ```bash 70 | for fq in ~/rnaseq/raw_data/*.fq 71 | 72 | do 73 | 74 | # create a prefix for the output file 75 | samplename=`basename $fq .fq` 76 | 77 | # run salmon 78 | salmon quant -i /n/groups/hbctraining/RNA_seq_part_1/reference_data/salmon/ref-transcripts \ 79 | -l A \ 80 | -r $fq \ 81 | -o ${samplename}_salmon \ 82 | --seqBias \ 83 | --useVBOpt \ 84 | --validateMappings 85 | 86 | done 87 | ``` 88 | 89 | Note that our for loop is iterating over all FASTQ files in the `raw_data` directory. For each file, a prefix is generated to name the output file and then the Salmon command is run with the same parameters as used in the single sample run. 90 | 91 | *** 92 | 93 | **Exercise 3** 94 | 95 | 1. Add two additional parameters (as described below) to the current Salmon command (*remember to use "`\`" if splitting one command across multiple lines*): 96 | 97 | 1. `-p`: specifies the number of processors or cores we would like to use for **multi-threading**. What value will you provide here, knowing what we asked for in our Slurm directives? 98 | 1. `--numBootstraps`: specifies computation of bootstrapped abundance estimates. **Bootstraps are required for isoform level differential expression analysis for estimation of technical variance**. Here, you can set the value to 30. 99 | 100 | > _**NOTE:** `--numBootstraps` is necessary if performing **isoform-level differential expression analysis** with Sleuth, but not for gene-level differential expression analysis. 
Due to the statistical procedure required to assign reads to gene isoforms, in addition to the random processes underlying RNA-Seq, there will be **technical variability in the abundance estimates** output from the pseudo-alignment tool [[2](https://rawgit.com/pachterlab/sleuth/master/inst/doc/intro.html), [3](https://www.nature.com/articles/nmeth.4324)] for the isoform level abundance estimates (not necessary for gene-level estimates). Therefore, **we would need technical replicates to distinguish technical variability from the biological variability** for gene isoforms._ 101 | > 102 | > _The bootstraps estimate technical variation per gene by calculating the abundance estimates for all genes using a different sub-sample of reads during each round of bootstrapping. The variation in the abundance estimates output from each round of bootstrapping is used for the estimation of the technical variance for each gene._ 103 | 104 | 2. Save and close the script. This script is now ready to run. 105 | 106 | ``` 107 | $ sbatch salmon_all_samples.sbatch 108 | ``` 109 | 110 | 3. **After you confirmed that the script runs as expected, copy and paste your final script to [Google forms](https://docs.google.com/forms/d/e/1FAIpQLScxaj3IIO4Bx7FCRw87cCeuTPQyhD_7WR2QU638y8IZDv5r1A/viewform?usp=sf_link).** 111 | 112 | --- 113 | 114 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). 
These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.* 115 | -------------------------------------------------------------------------------- /lessons/11_multiQC.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: MultiQC 3 | authors: Radhika Khetani, Mary Piper, Jihe Liu, Meeta Mistry 4 | --- 5 | 6 | Approximate time: 30 minutes 7 | 8 | ## Learning Objectives 9 | * Run the multiQC tool to gather QC metrics from multiple tools for all samples 10 | * Assess and compare QC metrics among samples 11 | 12 | ## Documenting results and gathering QC metrics 13 | 14 | As you go through the RNA-seq workflow (or any data analysis workflow), it is important to document the parameters you used for running the analysis. In addition, it is also very important to document the metrics/results at every step. Careful evaluation of metrics is a form of QC, and it will enable you to identify any issues with the data and/or the parameters you are using, as well as alert you to the presence of contamination or systematic biases, etc. 15 | 16 | There are several metrics you can evaluate in the RNA-seq workflow. Below are 3 important ones that you should keep track of for each sample: 17 | 18 | * number of raw reads 19 | * percentage of reads aligned to genome 20 | * percentage of reads associated with genes 21 | 22 | An important QC step is to make sure that these metrics are consistent across the samples for a given experiment, and any outliers should be investigated further. 23 | 24 | Manually tracking these metrics is tedious and error-prone. Many tools can help you with the documentation and QC assessment, some of which also have really nice visualizations to easily identify any issues, e.g. 
[FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), [Qualimap](http://qualimap.bioinfo.cipf.es/doc_html/index.html), [MultiQC](http://multiqc.info/). Some of these tools tend to focus on a single sample at a time, and on QC for a specific step in the workflow. MultiQC, on the other hand, is able to make a report from the output of many different tools (for RNA-seq analysis and other NGS workflows) and it is able to combine the information for multiple samples. 25 | 26 | ### Tracking and aggregating results from workflow tools with *MultiQC* 27 | 28 | In this lesson, we will be using MultiQC to aggregate results from several tools and generate a single HTML report with plots to visualize and compare QC metrics between the samples. 29 | 30 | MultiQC can generate this report from 96 different bioinformatics tools, and these tools span various NGS analyses, e.g., basic QC, RNA-seq, ChIP-seq, variant calling, genome annotation, etc. We are going to use it to aggregate information from the results of [FastQC](http://multiqc.info/docs/#fastqc), [STAR](http://multiqc.info/docs/#star), [Qualimap](http://multiqc.info/docs/#qualimap), and [salmon](http://multiqc.info/docs/#salmon). MultiQC can parse the information from **specific output files** of these tools. 31 | 32 | Start by creating a directory for our output called `multiqc_report`: 33 | 34 | ```bash 35 | $ cd ~/rnaseq/ 36 | 37 | $ mkdir results/multiqc_report 38 | ``` 39 | 40 | Then navigate into that directory: 41 | 42 | ```bash 43 | $ cd results/multiqc_report 44 | ``` 45 | 46 | Next, load the three modules needed to run MultiQC: `gcc`, `python`, `multiqc`. 47 | 48 | ```bash 49 | $ cd results/multiqc_report 50 | 51 | $ module load gcc/9.2.0 python/2.7.12 multiqc/1.21 52 | ``` 53 | *** 54 | 55 | **Exercise** 56 | 57 | How did we know which modules to load in addition to multiqc? 
58 | 59 | *** 60 | 61 | We are going to run MultiQC on the following 4 outputs from our workflow: 62 | 63 | * `.zip` files from FastQC 64 | * `.Log.final.out` files from STAR 65 | * `qualimap/*` directories from Qualimap 66 | * `salmon/*` directories from salmon 67 | 68 | To create a more meaningful report to look at we thought it best to run MultiQC on the full dataset instead of the subset we have been working with so far. We have run each of the tools mentioned above on the full dataset and stored the result in the directory `/n/groups/hbctraining/intro_rnaseq_hpc/full_dataset`. We will point to these files as input for our MultiQC analysis. 69 | 70 | To run MultiQC, we can provide it two inputs at a minimum: 71 | 72 | 1. a name for our output report and folder 73 | 2. the paths to our results files 74 | 75 | > **NOTE:** MultiQC has additional parameters we could include; use `multiqc -h` to find out more. 76 | 77 | ```bash 78 | $ multiqc -n multiqc_report_rnaseq \ 79 | /n/groups/hbctraining/RNA_seq_part_1/full_dataset_results/fastqc/*zip \ 80 | /n/groups/hbctraining/RNA_seq_part_1/full_dataset_results/STAR/*Log.final.out \ 81 | /n/groups/hbctraining/RNA_seq_part_1/full_dataset_results/qualimap/* \ 82 | /n/groups/hbctraining/RNA_seq_part_1/full_dataset_results/salmon/* 83 | ``` 84 | 85 | > **NOTE**: You will see the progress of analysis printed out on the terminal as the tool runs. If you want to save this output into a log file (for future reference), you can use the `2>` operator to redirect it to a file. For example, at the end of the command, add `2> log.txt`. `2>` redirects the output of so-called standard error. 86 | 87 | It takes a couple of minutes to generate the MultiQC report. The report provides nice visualizations across samples, which is very useful to determine consistency and to identify problematic samples. 88 | 89 | The output of MultiQC is one HTML file (`multiqc_report_rnaseq.html`) and a data folder. 
Transfer the interactive HTML report over to your laptop using **FileZilla**, and visualize the outputs of the four tools we used to generate the report. 90 | 91 | > *For a refresher on using Filezilla, please refer back to our [FastQC assessment lesson](07_qc_fastqc_assessment.md).* 92 | 93 | ## Assessing the quality control metrics 94 | 95 | The main metrics to explore first are: 96 | 97 | * number of raw reads or total reads 98 | * percentage of reads aligned to genome 99 | * percentage of reads associated with genes 100 | 101 | > Note: If you don't see exact columns as ours, you may need to configure the columns, which is a button just underneath the 'General Statistics' heading. 102 | 103 |

104 | 105 |

106 | 107 | Using `Configure Columns` button, we are going to choose the following columns: 108 | 109 |

110 | 111 |

112 | 113 | In the above image, the description column is helpful in interpreting the table. Upon perusal of the table, we can see input from FastQC, STAR, Qualimap and salmon. For example, the total number of raw reads is given in the `M Seqs` column on the far right of the table. 114 | 115 | STAR provides information about *uniquely mapping reads* in the `%Aligned` column. A good quality sample will have **at least 75% of the reads uniquely mapped**. Once the value starts to drop below 60%, it's advisable to start troubleshooting. A low number of uniquely mapping reads means that more reads are mapped to multiple locations. 116 | 117 | The 'STAR: Alignment Scores' plot visually represents this mapping information. The % uniquely mapped, multimapped, and unmapped reads can be easily compared between samples to get a nice overview of the quality of the samples. 118 | 119 |

120 | 121 |

122 | 123 | > NOTE: The thresholds suggested above will vary depending on the organism that you are working with. Much of what is discussed here is in the context of working with human or mouse data. For example, 75% of mapped reads holds true only if the genome is good or mature. For badly assembled genomes, we may not observe a high mapping rate, even if the actual sequences from the sample are good. 124 | 125 | Salmon also provides a `%Aligned` column representing the percent of mapped reads. The percentage from Salmon is different from that of STAR, because STAR is based on the alignment to genome reference, while Salmon is based on the alignment to transcriptome reference. Since we will be using the salmon abundance estimates for downstream analysis, these numbers are particularly important for our analysis. 126 | 127 | 128 | ### Complexity 129 | 130 | The complexity of the RNA-seq library can be explored with the `%Dups` column. If a large percentage of the library is duplicated, then this could indicate a library of either low complexity or over-amplification. If huge differences of `%Dups` exist between samples, this may lead to biases in the data, such as different %GC content. 131 | 132 | ### Exploring biases 133 | 134 | Within this report, we can also explore the bias metrics output by Qualimap and FastQC. The `5'-3' bias` column denotes whether our data has any 5' or 3' biases. These biases could be due to RNA degradation or different sample preparation techniques. Generally, we should explore our data more if we have biases approaching 0.5 or 2. 135 | 136 | The transcript position plot can also help identify 5' or 3' bias, in addition to other coverage issues. We generally expect roughly even coverage. 137 | 138 |

139 | 140 |

141 | 142 | In addition, we can see whether our different samples have differences in the `%GC` column. GC bias could be caused by low-complexity libraries, differences in amplification, or library-specific issues. We expect to observe similar GC content across samples. 143 | 144 | ### Contamination 145 | 146 | We can also identify possible contamination of our samples by inspecting the percentage of reads that are exonic, intronic or intergenic. High levels of intergenic reads (>30%) are indicative of DNA contamination. Also, if polyA selection of messenger RNAs was performed in library preparation, then high percentages of intronic reads would also be concerning. 147 | 148 |

149 | 150 |

151 | 152 | Generally speaking, in a good library, we expect over 60% of reads to be mapped to exons for mouse or human organisms. For other organisms, the percentage depends on how well the genome is annotated. 153 | 154 | ### Fragment length distribution 155 | 156 | The auxiliary directory generated from Salmon will contain a file called `fld.gz`. This file contains an approximation of the observed fragment length distribution. This is more meaningful for paired-end data, where the length can be estimated based on the location from both ends of the fragment. These plots can be compared to our expectations based on our knowledge of the size selection step performed during the library preparation stage. 157 | 158 | > **NOTE:** For single end data (which is what we have), Salmon reports a fixed insert length distribution. Therefore, the values are identical for all samples, and we only observe one distribution curve in the plot. 159 | 160 |

161 | 162 |

163 | 164 | --- 165 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.* 166 | -------------------------------------------------------------------------------- /lessons/STAR Alignment Strategy.md: -------------------------------------------------------------------------------- 1 | title: "STAR alignment strategy" 2 | author: "Meeta Mistry, Mary Piper" 3 | date: Monday September 10, 2018 4 | --- 5 | 6 | Approximate time: 20 minutes 7 | 8 | ### STAR Alignment Strategy 9 | 10 | STAR is shown to have high accuracy and outperforms other aligners by more than a factor of 50 in mapping speed, but it is memory intensive. The algorithm achieves this highly efficient mapping by performing a two-step process: 11 | 12 | 1. Seed searching 13 | 2. Clustering, stitching, and scoring 14 | 15 | #### Seed searching 16 | 17 | For every read that STAR aligns, STAR will search for the longest sequence that exactly matches one or more locations on the reference genome. These longest matching sequences are called the Maximal Mappable Prefixes (MMPs): 18 | 19 | ![STAR_step1](../img/alignment_STAR_step1.png) 20 | 21 | The different parts of the read that are mapped separately are called 'seeds'. So the first MMP that is mapped to the genome is called *seed1*. 22 | 23 | STAR will then search again for only the unmapped portion of the read to find the next longest sequence that exactly matches the reference genome, or the next MMP, which will be *seed2*. 
24 | 25 | ![STAR_step2](../img/alignment_STAR_step2.png) 26 | 27 | This sequential searching of only the unmapped portions of reads underlies the efficiency of the STAR algorithm. STAR uses an uncompressed suffix array (SA) to efficiently search for the MMPs, this allows for quick searching against even the largest reference genomes. Other slower aligners use algorithms that often search for the entire read sequence before splitting reads and performing iterative rounds of mapping. 28 | 29 | **If STAR does not find an exact matching sequence** for each part of the read due to mismatches or indels, the previous MMPs will be extended. 30 | 31 | ![STAR_step3](../img/alignment_STAR_step3.png) 32 | 33 | **If extension does not give a good alignment**, then the poor quality or adapter sequence (or other contaminating sequence) will be soft clipped. 34 | 35 | ![STAR_step4](../img/alignment_STAR_step4.png) 36 | 37 | 38 | #### Clustering, stitching, and scoring 39 | 40 | The separate seeds are stitched together to create a complete read by first clustering the seeds together based on proximity to a set of 'anchor' seeds, or seeds that are not multi-mapping. 41 | 42 | Then the seeds are stitched together based on the best alignment for the read (scoring based on mismatches, indels, gaps, etc.). 
43 | 44 | ![STAR_step5](../img/alignment_STAR_step5.png) 45 | -------------------------------------------------------------------------------- /lessons/STAR_alignment.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Alignment with STAR" 3 | author: "Meeta Mistry, Bob Freeman, Mary Piper" 4 | date: Wednesday, June 7, 2017 5 | --- 6 | 7 | Approximate time: 90 minutes 8 | 9 | ## Learning Objectives: 10 | 11 | * Understanding the alignment method STAR utilizes to align sequence reads to the reference genome 12 | * Identifying the intricacies of alignment tools used in NGS analysis (parameters, usage, etc) 13 | * Choosing appropriate STAR alignment parameters for our dataset 14 | 15 | ## Read Alignment 16 | 17 | 18 | 19 | Now that we have explored the quality of our raw reads, we can move on to read alignment. We perform read alignment or mapping to determine where in the genome the reads originated from. The alignment process consists of choosing an appropriate reference genome to map our reads against and performing the read alignment using one of several splice-aware alignment tools such as [STAR](http://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635) or [HISAT2](http://ccb.jhu.edu/software/hisat2/index.shtml). The choice of aligner is often a personal preference and also dependent on the computational resources that are available to you. 20 | 21 | ## STAR Aligner 22 | 23 | To determine where on the human genome our reads originated from, we will align our reads to the reference genome using [STAR](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3530905/) (Spliced Transcripts Alignment to a Reference). STAR is an aligner designed to specifically address many of the challenges of RNA-seq data mapping using a strategy to account for spliced alignments. 
24 | 25 | ### STAR Alignment Strategy 26 | 27 | STAR is shown to have high accuracy and outperforms other aligners by more than a factor of 50 in mapping speed, but it is memory intensive. The algorithm achieves this highly efficient mapping by performing a two-step process: 28 | 29 | 1. Seed searching 30 | 2. Clustering, stitching, and scoring 31 | 32 | #### Seed searching 33 | 34 | For every read that STAR aligns, STAR will search for the longest sequence that exactly matches one or more locations on the reference genome. These longest matching sequences are called the Maximal Mappable Prefixes (MMPs): 35 | 36 | 37 | ![STAR_step1](../img/alignment_STAR_step1.png) 38 | 39 | The different parts of the read that are mapped separately are called 'seeds'. So the first MMP that is mapped to the genome is called *seed1*. 40 | 41 | STAR will then search again for only the unmapped portion of the read to find the next longest sequence that exactly matches the reference genome, or the next MMP, which will be *seed2*. 42 | 43 | ![STAR_step2](../img/alignment_STAR_step2.png) 44 | 45 | This sequential searching of only the unmapped portions of reads underlies the efficiency of the STAR algorithm. STAR uses an uncompressed suffix array (SA) to efficiently search for the MMPs, this allows for quick searching against even the largest reference genomes. Other slower aligners use algorithms that often search for the entire read sequence before splitting reads and performing iterative rounds of mapping. 46 | 47 | **If STAR does not find an exact matching sequence** for each part of the read due to mismatches or indels, the previous MMPs will be extended. 48 | 49 | ![STAR_step3](../img/alignment_STAR_step3.png) 50 | 51 | **If extension does not give a good alignment**, then the poor quality or adapter sequence (or other contaminating sequence) will be soft clipped. 
52 | 53 | ![STAR_step4](../img/alignment_STAR_step4.png) 54 | 55 | 56 | #### Clustering, stitching, and scoring 57 | 58 | The separate seeds are stitched together to create a complete read by first clustering the seeds together based on proximity to a set of 'anchor' seeds, or seeds that are not multi-mapping. 59 | 60 | Then the seeds are stitched together based on the best alignment for the read (scoring based on mismatches, indels, gaps, etc.). 61 | 62 | ![STAR_step5](../img/alignment_STAR_step5.png) 63 | 64 | ## Running STAR 65 | 66 | ### Set-up 67 | 68 | To get started with this lesson, start an interactive session with 6 cores: 69 | 70 | ```bash 71 | $ srun --pty -p interactive -t 0-12:00 -n 6 --mem 8G --reservation=HBC1 /bin/bash 72 | ``` 73 | 74 | You should have a directory tree setup similar to that shown below. it is best practice to have all files you intend on using for your workflow present within the same directory. In our case, we have our original FASTQ files generated in the previous section. 75 | 76 | ```bash 77 | rnaseq 78 | ├── logs 79 | ├── meta 80 | ├── raw_data 81 | │ ├── Irrel_kd_1.subset.fq 82 | │ ├── Irrel_kd_2.subset.fq 83 | │ ├── Irrel_kd_3.subset.fq 84 | │ ├── Mov10_oe_1.subset.fq 85 | │ ├── Mov10_oe_2.subset.fq 86 | │ └── Mov10_oe_3.subset.fq 87 | ├── results 88 | └── scripts 89 | ``` 90 | 91 | To use the STAR aligner, load the module: 92 | 93 | ```bash 94 | $ module load gcc/6.2.0 star/2.5.2b 95 | ``` 96 | 97 | Aligning reads using STAR is a two step process: 98 | 99 | 1. Create a genome index 100 | 2. Map reads to the genome 101 | 102 | > A quick note on shared databases for human and other commonly used model organisms. The O2 cluster has a designated directory at `/n/groups/shared_databases/` in which there are files that can be accessed by any user. These files contain, but are not limited to, genome indices for various tools, reference sequences, tool specific data, and data from public databases, such as NCBI and PDB. 
So when using a tool that requires a reference of sorts, it is worth taking a quick look here because chances are it's already been taken care of for you. 103 | > 104 | >```bash 105 | > $ ls -l /n/groups/shared_databases/igenome/ 106 | >``` 107 | 108 | ### Creating a genome index 109 | 110 | For this workshop we are using reads that originate from a small subsection of chromosome 1 (~300,000 reads) and so we are using only chr1 as the reference genome. 111 | 112 | To store our genome indices, we will use the `/n/scratch2/` space with large temporary storage capacity. We need to create a directory for the indices within this space: 113 | 114 | ```bash 115 | $ mkdir -p /n/scratch2/username/chr1_hg38_index 116 | ``` 117 | 118 | The basic options to **generate genome indices** using STAR are as follows: 119 | 120 | * `--runThreadN`: number of threads 121 | * `--runMode`: genomeGenerate mode 122 | * `--genomeDir`: /path/to/store/genome_indices 123 | * `--genomeFastaFiles`: /path/to/FASTA_file 124 | * `--sjdbGTFfile`: /path/to/GTF_file 125 | * `--sjdbOverhang`: readlength -1 126 | 127 | > *NOTE:* In case of reads of varying length, the ideal value for `--sjdbOverhang` is max(ReadLength)-1. In most cases, the default value of 100 will work similarly to the ideal value. 128 | 129 | Now let's create a job submission script to generate the genome index: 130 | 131 | ```bash 132 | $ vim ~/rnaseq/scripts/genome_index.run 133 | ``` 134 | Within `vim` we now add our shebang line, the SLURM directives, and our STAR command. 
135 | 136 | ```bash 137 | #!/bin/bash 138 | 139 | #SBATCH -p short # partition name 140 | #SBATCH -t 0-2:00 # hours:minutes runlimit after which job will be killed 141 | #SBATCH -n 6 # number of cores requested -- this needs to be greater than or equal to the number of cores you plan to use to run your job 142 | #SBATCH --mem 16G 143 | #SBATCH --job-name STAR_index # Job name 144 | #SBATCH -o %j.out # File to which standard out will be written 145 | #SBATCH -e %j.err # File to which standard err will be written 146 | 147 | cd /n/scratch2/username/ 148 | 149 | module load gcc/6.2.0 star/2.5.2b 150 | 151 | STAR --runThreadN 6 \ 152 | --runMode genomeGenerate \ 153 | --genomeDir chr1_hg38_index \ 154 | --genomeFastaFiles /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.dna.chromosome.1.fa \ 155 | --sjdbGTFfile /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.gtf \ 156 | --sjdbOverhang 99 157 | ``` 158 | 159 | ```bash 160 | $ sbatch ~/rnaseq/scripts/genome_index.run 161 | ``` 162 | 163 | ### Aligning reads 164 | 165 | After you have the genome indices generated, you can perform the read alignment. We previously generated the genome indices for you in `/n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/ensembl38_STAR_index/` directory so that we don't get held up waiting on the generation of the indices. 166 | 167 | Create an output directory for our alignment files: 168 | 169 | ```bash 170 | $ cd ~/rnaseq/raw_data 171 | 172 | $ mkdir ../results/STAR 173 | ``` 174 | 175 | ### STAR command in interactive bash 176 | 177 | For now, we're going to work on just one sample to set up our workflow. To start we will use the first replicate in the Mov10 over-expression group, `Mov10_oe_1.subset.fq`. 
Details on STAR and its functionality can be found in the [user manual](https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf); we encourage you to peruse through to get familiar with all available options. 178 | 179 | The basic options for aligning reads to the genome using STAR are: 180 | 181 | * `--runThreadN`: number of threads / cores 182 | * `--readFilesIn`: /path/to/FASTQ_file 183 | * `--genomeDir`: /path/to/genome_indices_directory 184 | * `--outFileNamePrefix`: prefix for all output files 185 | 186 | Listed below are additional parameters that we will use in our command: 187 | 188 | * `--outSAMtype`: output filetype (SAM default) 189 | * `--outSAMunmapped`: what to do with unmapped reads 190 | 191 | > **NOTE:** Default filtering is applied in which the maximum number of multiple alignments allowed for a read is set to 10. If a read exceeds this number there is no alignment output. To change the default you can use `--outFilterMultimapNmax`, but for this lesson we will leave it as default. Also, note that "**STAR’s default parameters are optimized for mammalian genomes.** Other species may require significant modifications of some alignment parameters; in particular, the maximum and minimum intron sizes have to be reduced for organisms with smaller introns" [[1](http://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635.full.pdf+html)]. 192 | 193 | We can access the software by simply using the STAR command followed by the basic parameters described above and any additional parameters. The full command is provided below for you to copy paste into your terminal. If you want to manually enter the command, it is advisable to first type out the full command in a text editor (i.e. [Sublime Text](http://www.sublimetext.com/) or [Notepad++](https://notepad-plus-plus.org/)) on your local machine and then copy paste into the terminal. This will make it easier to catch typos and make appropriate changes. 
194 | 195 | ```bash 196 | 197 | STAR --genomeDir /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/ensembl38_STAR_index/ \ 198 | --runThreadN 6 \ 199 | --readFilesIn Mov10_oe_1.subset.fq \ 200 | --outFileNamePrefix ../results/STAR/Mov10_oe_1_ \ 201 | --outSAMtype BAM SortedByCoordinate \ 202 | --outSAMunmapped Within \ 203 | --outSAMattributes Standard 204 | 205 | ``` 206 | 207 | --- 208 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.* 209 | -------------------------------------------------------------------------------- /lessons/STAR_alignment_strategy.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "STAR alignment strategy" 3 | author: "Meeta Mistry, Mary Piper" 4 | date: Monday September 10, 2018 5 | --- 6 | 7 | Approximate time: 20 minutes 8 | 9 | ### STAR Alignment Strategy 10 | 11 | STAR is shown to have high accuracy and outperforms other aligners by more than a factor of 50 in mapping speed, but it is memory intensive. The algorithm achieves this highly efficient mapping by performing a two-step process: 12 | 13 | 1. Seed searching 14 | 2. Clustering, stitching, and scoring 15 | 16 | #### Seed searching 17 | 18 | For every read that STAR aligns, STAR will search for the longest sequence that exactly matches one or more locations on the reference genome. These longest matching sequences are called the Maximal Mappable Prefixes (MMPs): 19 | 20 | ![STAR_step1](../img/alignment_STAR_step1.png) 21 | 22 | The different parts of the read that are mapped separately are called 'seeds'. 
So the first MMP that is mapped to the genome is called *seed1*. 23 | 24 | STAR will then search again for only the unmapped portion of the read to find the next longest sequence that exactly matches the reference genome, or the next MMP, which will be *seed2*. 25 | 26 | ![STAR_step2](../img/alignment_STAR_step2.png) 27 | 28 | This sequential searching of only the unmapped portions of reads underlies the efficiency of the STAR algorithm. STAR uses an uncompressed suffix array (SA) to efficiently search for the MMPs, this allows for quick searching against even the largest reference genomes. Other slower aligners use algorithms that often search for the entire read sequence before splitting reads and performing iterative rounds of mapping. 29 | 30 | **If STAR does not find an exact matching sequence** for each part of the read due to mismatches or indels, the previous MMPs will be extended. 31 | 32 | ![STAR_step3](../img/alignment_STAR_step3.png) 33 | 34 | **If extension does not give a good alignment**, then the poor quality or adapter sequence (or other contaminating sequence) will be soft clipped. 35 | 36 | ![STAR_step4](../img/alignment_STAR_step4.png) 37 | 38 | 39 | #### Clustering, stitching, and scoring 40 | 41 | The separate seeds are stitched together to create a complete read by first clustering the seeds together based on proximity to a set of 'anchor' seeds, or seeds that are not multi-mapping. 42 | 43 | Then the seeds are stitched together based on the best alignment for the read (scoring based on mismatches, indels, gaps, etc.). 44 | 45 | ![STAR_step5](../img/alignment_STAR_step5.png) 46 | 47 | --- 48 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). 
These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.* 49 | -------------------------------------------------------------------------------- /lessons/counting_reads.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Counting reads 3 | author: Meeta Mistry, Bob Freeman, Radhika Khetani 4 | date: 06/06/2017 5 | --- 6 | 7 | Approximate time: 75 minutes 8 | 9 | ## Learning Objectives: 10 | 11 | * understand how counting tools work 12 | * generate a count matrix using featureCounts 13 | 14 | 15 | ## Counting reads as a measure of gene expression 16 | 17 | 18 | Once we have our reads aligned to the genome, the next step is to count how many reads have mapped to each gene. There are many tools that can use BAM files as input and output the number of reads (counts) associated with each feature of interest (genes, exons, transcripts, etc.). 2 commonly used counting tools are [featureCounts](http://bioinf.wehi.edu.au/featureCounts/) and [htseq-count](http://www-huber.embl.de/users/anders/HTSeq/doc/count.html). 19 | 20 | * The above tools only report the "raw" counts of reads that **map to a single location** (uniquely mapping) and are best at counting at the **gene level**. Essentially, total read count associated with a gene (*meta-feature*) = the sum of reads associated with each of the exons (*feature*) that "belong" to that gene. 21 | 22 | * There are **other tools** available that are able to account for **multiple transcripts** for a given gene. In this case the counts are not whole numbers, but have fractions. 
In the simplest example case, if 1 read is associated with 2 transcripts, it can get counted as 0.5 and 0.5 and the resulting count for that transcript is not a whole number. 23 | 24 | * In addition there are **other tools that will count multimapping reads**, but this is a dangerous thing to do since you will be overcounting the total number of reads which can cause issues with normalization and eventually with accuracy of differential gene expression results. 25 | 26 | **Input for counting = multiple BAM files + 1 GTF file** 27 | 28 | Simply speaking, the genomic coordinates of where the read is mapped (BAM) are cross-referenced with the genomic coordinates of whichever feature you are interested in counting expression of (GTF); the feature can be exons, genes or transcripts. 29 | 30 | 31 | 32 | **Output of counting = A count matrix, with genes as rows and samples as columns** 33 | 34 | These are the "raw" counts and will be used in statistical programs downstream for differential gene expression. 35 | 36 | 37 | 38 | ### Counting using featureCounts 39 | Today, we will be using the [featureCounts](http://bioinf.wehi.edu.au/featureCounts/) tool to get the *gene* counts. We picked this tool because it is accurate, fast and is relatively easy to use. It counts reads that map to a single location (uniquely mapping) and follows the scheme in the figure below for assigning reads to a gene/exon. 40 | 41 | 42 | 43 | featureCounts can also take into account whether your data are **stranded** or not. If strandedness is specified, then in addition to considering the genomic coordinates it will also take the strand into account for counting. If your data are stranded, always specify it. 
44 | 45 | #### Setting up to run featureCounts 46 | First things first, start an interactive session with 4 cores: 47 | 48 | ``` bash 49 | $ srun --pty -p interactive -t 0-12:00 -n 4 --mem 8G --reservation=HBC1 /bin/bash 50 | ``` 51 | 52 | Now, change directories to your rnaseq directory and start by creating 2 directories, (1) a directory for the output and (2) a directory for the bam files: 53 | 54 | ``` bash 55 | $ cd ~/rnaseq/ 56 | $ mkdir results/counts results/STAR/bams 57 | ``` 58 | 59 | Rather than using the BAM file we generated in the last lesson, let's copy over all of the BAM files that we have already generated for you: 60 | 61 | ``` bash 62 | 63 | $ cp /n/groups/hbctraining/intro_rnaseq_hpc/bam_STAR/*bam ~/rnaseq/results/STAR/bams 64 | ``` 65 | featureCounts is not available as a module on O2, but we have already added the path for it to our `$PATH` variable last time. 66 | 67 | ``` bash 68 | $ echo $PATH # You should see /n/app/bcbio/tools/bin/ among other paths 69 | ``` 70 | 71 | > ** If you don't see `/n/app/bcbio/tools/bin/` in your `$PATH` variable, add the following `export` command to your `~/.bashrc` file using vim: `export PATH=/n/app/bcbio/tools/bin/:$PATH`.** 72 | 73 | 74 | #### Running featureCounts 75 | 76 | How do we use this tool, what is the command and what options/parameters are available to us? 77 | 78 | ``` bash 79 | $ featureCounts 80 | ``` 81 | 82 | So, it looks like the usage is `featureCounts [options] -a -o input_file1 [input_file2] ... `, where `-a`, `-o` and input files are required. 
83 | 84 | We are going to use the following options: 85 | 86 | `-T 4 # specify 4 cores` 87 | 88 | `-s 2 # these data are "reverse"ly stranded` 89 | 90 | and the following are the values for the required parameters: 91 | 92 | `-a ~/rnaseq/reference_data/chr1-hg19_genes.gtf # required option for specifying path to GTF` 93 | 94 | `-o ~/rnaseq/results/counts/Mov10_featurecounts.txt # required option for specifying path to, and name of the text output (count matrix)` 95 | 96 | `~/rnaseq/results/STAR/bams/*bam # the list of all the bam files we want to collect count information for` 97 | 98 | Let's run this now: 99 | 100 | ``` bash 101 | $ featureCounts -T 4 -s 2 \ 102 | -a /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.gtf \ 103 | -o ~/rnaseq/results/counts/Mov10_featurecounts.txt \ 104 | ~/rnaseq/results/STAR/bams/*.out.bam 105 | ``` 106 | 107 | > If you wanted to collect the information that is on the screen as the job runs, you can modify the command and add the `2>` redirection at the end. This type of redirection will collect all the information from the terminal/screen into a file. 108 | 109 | ``` bash 110 | # **DO NOT RUN THIS** 111 | # note the last line of the command below 112 | 113 | $ featureCounts -T 4 -s 2 \ 114 | -a /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.gtf \ 115 | -o ~/rnaseq/results/counts/Mov10_featurecounts.txt \ 116 | ~/rnaseq/results/STAR/bams/*.out.bam \ 117 | 2> /unix_lesson/rnaseq/results/counts/Mov10_featurecounts.screen-output 118 | ``` 119 | #### featureCounts output 120 | 121 | The output of this tool is 2 files, *a count matrix* and *a summary file* that tabulates how many of the reads were "assigned" or counted and the reasons they remained "unassigned". 
Let's take a look at the summary file: 122 | 123 | ``` bash 124 | $ less results/counts/Mov10_featurecounts.txt.summary 125 | ``` 126 | Now let's look at the count matrix: 127 | 128 | ``` bash 129 | $ less results/counts/Mov10_featurecounts.txt 130 | ``` 131 | 132 | ##### Cleaning up the featureCounts matrix 133 | There is information about the genomic coordinates and the length of the gene, we don't need this for the next step, so we are going to extract the columns that we are interested in. 134 | 135 | ``` bash 136 | $ cut -f1,7,8,9,10,11,12 results/counts/Mov10_featurecounts.txt > results/counts/Mov10_featurecounts.Rmatrix.txt 137 | ``` 138 | The next step is to clean it up a little further by modifying the header line (we could also do this in R, or in a GUI text editor): 139 | 140 | ``` bash 141 | $ vim results/counts/Mov10_featurecounts.Rmatrix.txt 142 | ``` 143 | 144 | Vim has nice shortcuts for cleaning up the header of our file using the following steps: 145 | 146 | 1. Move the cursor to the beginning of the document by typing: `gg` (in command mode). 147 | 2. Remove the first line by typing: `dd` (in command mode). 148 | 2. Remove the file name following the sample name by typing: `:%s/_Aligned.sortedByCoord.out.bam//g` (in command mode). 149 | 3. Remove the path leading up to the file name by typing: `:%s/\/home\/username\/unix_lesson\/rnaseq\/results\/STAR\/bams\///g` (in command mode). 150 | 151 | > Note that we have a `\` preceding each `/`, which tells vim that we are not using the `/` as part of our search and replace command, but instead the `/` is part of the pattern that we are replacing. This is called *escaping* the `/`. 152 | 153 | ### Note on counting PE data 154 | 155 | For paired-end (PE) data, the bam file contains information about whether both read1 and read2 mapped and if they were at roughly the correct distance from each other, that is to say if they were "properly" paired. 
For most counting tools, **only properly paired reads are considered by default, and each read pair is counted only once as a single "fragment"**. 156 | 157 | For counting PE fragments associated with genes, the input bam files need to be sorted by read name (i.e. the alignment information for both reads of a pair is in adjoining rows). The alignment tool might sort them for you, but watch out for how the sorting was done. If they are sorted by coordinates (like with STAR), you will need to use `samtools sort` to re-sort them by read name before using as input in featureCounts. If you do not sort your BAM file by read name before using as input, featureCounts assumes that almost all the reads are not properly paired. 158 | 159 | 160 | --- 161 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.* 162 | -------------------------------------------------------------------------------- /lessons/fastqc-troubleshooting.md: -------------------------------------------------------------------------------- 1 | ## Troubleshooting quality issues of raw data 2 | 3 | While the data for this analysis is quite good, unfortunately that is not always the case. So now that we know a bit about the types of quality issues to check for in the raw RNA-seq data, how do we troubleshoot them?
4 | 5 | 6 | 7 | To help think through the troubleshooting, we can arrange the data by the main problems encountered: 8 | 9 | - **Poor quality data** 10 | - Poor quality at 3' end of sequence 11 | - **Probable cause(s):** Fluorescent signal decay or phasing issues - expected for Illumina data, but take note of the decrease in quality. 12 | - Poor quality across sequence 13 | - **Probable cause(s):** Problems at the sequencing facility - contact them 14 | - Drop in quality in the middle 15 | - **Probable cause(s):** Problems at the sequencing facility - contact them 16 | - Large percentage of sequences with low mean quality scores 17 | - **Probable cause(s):** Problems at the sequencing facility - contact them 18 | 19 | - **Issues based on read sequence expectations** 20 | 21 | - Unexpected %GC for organism and/or % of each nucleotide does not remain similar across the read (except for first 10-12 bases for RNA-Seq) 22 | - **Probable cause(s):** Contaminating sequences: different species, adapters, vector, mitochondrial/rRNA 23 | - High level of sequence duplications 24 | - **Probable cause(s):** Low complexity library, too many cycles of PCR amplification / too little starting material 25 | - Over-represented sequences more than 1-2%, unless expected based on experimental design 26 | - **Probable cause(s):** Contaminating sequences: adapters, vector, mitochondrial/rRNA 27 | -------------------------------------------------------------------------------- /lessons/more_bash_cluster.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Bash_extras" 3 | author: "Radhika Khetani", "Meeta Mistry" 4 | date: 2019-10-29 5 | duration: 30 6 | --- 7 | 8 | ## Overview 9 | 10 | * [Creating shortcuts or `alias`](#alias) 11 | * [Copying files using `scp` and `rsync`](#rsync) 12 | * [Symbolic Links or "sym links"](#symlink) 13 | 14 | *** 15 | 16 | ## Setting up some `alias`es 17 | 18 | In your terminal, do the following: 19 | 20 | 
```bash 21 | $ cd 22 | 23 | $ ls -l 24 | 25 | $ ll 26 | ``` 27 | 28 | `ll` should have output the same thing as `ls -l`. Why does it work this way? This is because the HMS-RC folks have internally set up what is called an **alias**. 29 | 30 | A **shell alias is a shortcut to reference a command**. It can be used to avoid typing long commands. For common patterns it can reduce keystrokes and improve efficiency. A simple example is setting default options on commands to avoid having to type them each time a command is run. 31 | 32 | For example, suppose that because you are just starting out on the cluster, you prefer to be asked for confirmation before a file is deleted with the `rm` command. Remember that the `rm` command supports this with the `-i` option. To avoid forgetting to use the `-i` option each time, an alias can be created so that each time `rm` is run it will use the `-i` option and prompt the user to confirm. 33 | 34 | 35 | ```bash 36 | $ alias rm='rm -i' 37 | ``` 38 | 39 | However, this alias is only going to be available to you while that Terminal window is open. If you wanted to **use that alias all the time, what would you do?** 40 | 41 | You would add it to `~/.bashrc`! Let's open `~/.bashrc` and add a few commands to it. At the bottom of the file you should see a header titled "User specific aliases". Under that header go ahead and add the alias. 42 | 43 | ```bash 44 | $ vim ~/.bashrc 45 | ``` 46 | 47 | Add in a line at the end of your `.bashrc` file: 48 | 49 | ``` 50 | alias rm='rm -i' 51 | ``` 52 | 53 | 54 | Now, we can source the `.bashrc` file for the alias to take effect and we can try it out. You should see the question ` 55 | remove draft.txt?` and here you can answer `n` for No. 56 | 57 | ```bash 58 | $ source ~/.bashrc 59 | 60 | $ rm ~/unix_lesson/other/draft.txt 61 | ``` 62 | 63 | As we mentioned, aliases are super helpful for long commands that we are repeatedly having to type out.
A good example of this is the `srun` command for starting an interactive session. **First exit the interactive session and get on a login node, if you are not there already.** 64 | 65 | ```bash 66 | $ alias o2i='srun --pty -p interactive -t 0-12:00 --mem 2G --reservation=HBC /bin/bash' 67 | ``` 68 | 69 | Now you can test it out! 70 | 71 | ```bash 72 | $ o2i 73 | ``` 74 | 75 | Similar to what we did above, you can put this (or a similar) command in the `.bash_profile` file so it is available when you log on next time. 76 | 77 | > ### `.bashrc` versus `.bash_profile` 78 | > `.bash_profile` is executed for login shells, while `.bashrc` is executed for interactive non-login shells. When you log in (type username and password) to O2 the `.bash_profile` is executed. So if you want the alias available **only** when you log in, you will want to put it in your `.bash_profile`. 79 | 80 | ## Copying files to and from the cluster 81 | 82 | So far we have used FileZilla to copy files over from O2, but there are other ways to do so using the command line interface. When you obtain your data from the sequencing facility, it will likely be stored on some remote computer and they will give you login credentials which will allow you to access it. There are various commands that can be used to help you copy those files from the remote computer over to 1) your local computer, 2) O2, or 3) whatever cluster environment you plan to work on. We present a few options here. 83 | 84 | ### `scp` 85 | 86 | Similar to the `cp` command to copy there is a command that allows you to **securely copy files between computers**. The command is called `scp` and allows files to be copied to, from, or between different hosts. It uses ssh for data transfer and provides the same authentication and same level of security as ssh. 87 | 88 | In the example below, the first argument is the **location on the remote server** and the second argument is the **destination on your local machine**.
89 | 90 | > *You can also do this in the opposite direction by swapping the arguments.* 91 | 92 | ```bash 93 | $ scp username@transfer.rc.hms.harvard.edu:/path/to/file_on_O2 Path/to/directory/local_machine 94 | ``` 95 | 96 | Let's try copying over the `draft.txt` file from the `other` directory inside your `unix_lesson` folder on O2. **First open up a new terminal window.** Look and see where you currently are: 97 | 98 | ```bash 99 | $ pwd 100 | ``` 101 | 102 | Then type in: 103 | 104 | ```bash 105 | $ scp rc_trainingXX@transfer.rc.hms.harvard.edu:~/unix_lesson/other/draft.txt . 106 | ``` 107 | 108 | Now see that the file has transferred over: 109 | 110 | ```bash 111 | $ less draft.txt 112 | ``` 113 | 114 | > **NOTE:** Windows users may encounter a permissions error when using `scp` to copy over locally. We are not sure how to troubleshoot this, but will update materials as we obtain more information. 115 | 116 | ### `rsync` 117 | 118 | `rsync` is used to copy or synchronize data between directories. It has many advantages over `cp`, `scp` etc. It works in a specific direction, i.e. from the first directory **to** the second directory, similar to `cp`. 119 | 120 | **Salient Features of `rsync`** 121 | 122 | * If the command (or transfer) is interrupted, you can start it again and *it will restart from where it was interrupted*. 123 | * Once a folder has been synced between 2 locations, the next time you run `rsync` it will *only update and not copy everything over again*. 124 | * It runs a check to ensure that every file it is "syncing" over is the exact same in both locations. This check is run using a version of ["checksum"](https://en.wikipedia.org/wiki/Checksum) which ensures the data integrity during the data transfer process. 125 | 126 | > You can run the checksum function yourself when transferring large datasets without `rsync` using one of the following commands (or similar): `md5`, `md5sum`.
127 | 128 | 129 | ### Between directories on the same machine 130 | 131 | ```bash 132 | #DO NOT RUN 133 | $ rsync -av ~/large_dataset/. /n/groups/dir/groupdata/ 134 | ``` 135 | 136 | ### Between different machines 137 | 138 | When copying over large datasets to or from a remote machine, `rsync` works similarly to `scp`. 139 | 140 | ```bash 141 | #DO NOT RUN 142 | $ rsync -av -e ssh testfile username@transfer.rc.hms.harvard.edu:~/large_files/ 143 | ``` 144 | 145 | * `a` is for archive - means it preserves permissions (owners, groups), times, symbolic links, and devices. 146 | * `v` is for verbosity - means that it prints on the screen what is being copied 147 | * `-e ssh` is for encryption - means that we want to use the ssh protocol for encryption of the file transfer 148 | 149 | *More helpful information and examples using rsync can be found [at this link](https://www.comentum.com/rsync.html)* 150 | 151 | > Please do not use O2’s login nodes for transferring large datasets (like fastq files) between your computer and O2 with `rsync` or `scp`. Instead, use the transfer nodes `ssh eCommons@transfer.rc.hms.harvard.edu`. 152 | 153 | 154 | ## Symbolic Links or "sym links" 155 | 156 | Symbolic links are like shortcuts you may create on your laptop. A sym link makes it appear as if the linked object is actually there. It can be useful to access a file from multiple locations without creating copies and without using much disk space. (Symlinks are only a few bytes in size.) 157 | 158 | Let's check out an example of a folder with lots of symlinks. 159 | 160 | 161 | ```bash 162 | ls -l /n/app/bcbio/tools/bin/ 163 | ``` 164 | 165 | Now, let's create a sym link in our home directory for the same `unix_lesson` folder we had originally copied over. 
166 | 167 | ```bash 168 | $ cd 169 | 170 | $ ln -s /n/groups/hbctraining/unix_lesson/ unix_lesson_sym 171 | 172 | $ ls -l 173 | ``` 174 | 175 | We recommend that you create something like this for your raw data so it does not accidentally get corrupted or overwritten. 176 | 177 | > Note: a “hard” link (just `ln` without the `-s` option) is very different. Always use “ln -s” unless you really know what you’re doing! 178 | 179 | ## Additional topics 180 | 181 | If you are interested in learning more about regular expressions (regex) and the tools `awk` and `sed`, you can find more information in the ["extra_bash_tools"](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/extra_bash_tools.html) lesson. 182 | 183 | 184 | *** 185 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.* 186 | -------------------------------------------------------------------------------- /lessons/sam.md: -------------------------------------------------------------------------------- 1 | ## samtools extras 2 | 3 | To play around with a few `samtools` commands, first change directories into the directory containing all BAM files.
4 | 5 | `$ cd ~/unix_workshop/rnaseq/results/STAR/bams` 6 | 7 | ### Write only mapped reads to file (filter out unmapped reads) 8 | 9 | `$ samtools view -b -h -F 4 Mov10_oe_1_Aligned.sortedByCoord.out.bam > Mov10_oe_1_Aligned.onlyAligned.bam` 10 | 11 | ### Create a FASTQ file containing only mapped reads 12 | 13 | `$ bamtofastq -o Mov10_oe_1_Mapped.fastq --no-unaligned Mov10_oe_1_Aligned.onlyAligned.bam` 14 | 15 | ### Index BAM file 16 | 17 | `$ samtools index Mov10_oe_1_Aligned.sortedByCoord.out.bam` 18 | 19 | ### Extract reads from a specific region of the chromosome 20 | 21 | `$ samtools view Mov10_oe_1_Aligned.sortedByCoord.out.bam chr1:200000-500000` 22 | 23 | ### Randomly subsample half of the reads into a new BAM file 24 | 25 | `$ samtools view -s 0.5 -b Mov10_oe_1_Aligned.sortedByCoord.out.bam > Mov10_oe_1_subsample.bam` 26 | 27 | ### Simple stats for alignment file 28 | 29 | `$ samtools flagstat Mov10_oe_1_Aligned.sortedByCoord.out.bam` 30 | 31 | ### Visualizing mismatches 32 | 33 | `$ samtools view -h Mov10_oe_1_Aligned.sortedByCoord.out.bam | head -n 5 | samtools fillmd -e - ~/unix_workshop/rnaseq/reference_data/chr1.fa` 34 | 35 | -------------------------------------------------------------------------------- /lessons/shell_review.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The Shell" 3 | author: "Mary Piper, Radhika Khetani, Meeta Mistry, Jihe Liu" 4 | date: "October 26, 2020" 5 | --- 6 | 7 | ## Learning Objectives 8 | - Review shell commands and concepts 9 | 10 | 11 | ## Setting up 12 | 13 | This workshop assumes that you have either a) taken our [Introduction to command-line interface workshop](https://hbctraining.github.io/Shell-for-bioinformatics/schedule/) or b) been working on the command-line and are already fluent with shell/bash. **We ask that you complete the exercises below**, to refresh some basic commands that you will be using over the course of the workshop.
For each section we have relevant materials linked as a helpful reference. 14 | 15 | ### Opening up a terminal window 16 | 17 | > *NOTE: This mandatory pre-work does not require you to login to the O2 cluster.* 18 | 19 | On your local laptop, you will need to open up your terminal window. This will be different depending on what kind of operating system (OS) you are working on. 20 | 21 | **With Mac OS** 22 | 23 | Macs have a utility application called "**Terminal**" for performing tasks on the command line (shell), both locally and on remote machines. 24 | 25 | Please find and open the Terminal utility on your computers using the *Spotlight Search* at the top right hand corner of your screen. 26 | 27 | **With Windows OS** 28 | 29 | By default, there is no built-in Terminal that uses the bash shell on the Windows OS. So, we will be using a downloaded program called "**Git BASH**" which is part of the [Git for Windows](https://git-for-windows.github.io/) tool set. **Git BASH is a shell/bash emulator.** What this means is that it shows you a very similar interface to, and provides you the functionality of, the Terminal utility found on the Mac and Linux Operating systems. 30 | 31 | Please find and open Git BASH. 32 | 33 | > **Tip** - Windows users can use another program called [Putty](http://www.chiark.greenend.org.uk/~sgtatham/putty/download.html) instead of a *bash emulator* to log in to remote machines, but it is a little more involved and has different capabilities. We encourage you to take a look at it, but we will not be covering it in this workshop. 34 | 35 | ### Downloading the example data folder 36 | 37 | We will be exploring the capabilities of the shell by working with some RNA-Seq data. We need to **download the data to our current folder** using the link below. To do so, follow the step-by-step instructions below. 38 | 39 | **1. Find out what folder we are currently inside**. 
To do this, we can use the 'print working directory' command: 40 | 41 | ```bash 42 | $ pwd 43 | ``` 44 | 45 | > On a **Mac** your current folder should be something starting with `/Users/`, like `/Users/marypiper/`. 46 | > 47 | > On a **Windows** machine your current folder should be something starting with `/c/Users/marypiper`. To find this in your File explorer try clicking on PC and navigating to that path. 48 | 49 | _Once you have identified which folder you are in, this is where we will be downloading your data._ 50 | 51 | **2. Click on the link below, then go to File > Download to download the data**. This will automatically download the folder to your downloads folder. If you downloaded the data previously as a part of the Basic Shell workshop, you do not need to download it again unless you have deleted it. 52 | 53 | * Download data by [clicking here](https://www.dropbox.com/s/x66jksdd4jklpdw/unix_lesson.zip?dl=0). 54 | 55 | **3.** Once you have downloaded the file to the correct location, go back to your **terminal window and type the 'list' command**: 56 | 57 | ```bash 58 | $ ls 59 | ``` 60 | 61 | > `ls` stands for 'list' and it lists the contents of a directory. 62 | 63 | _You should see `unix_lesson.zip` as part of the output to the screen._ 64 | 65 | **4.** Finally, to **decompress the folder**: 66 | 67 | * Double click on unix_lesson.zip on a mac. This will automatically inflate the folder. 68 | * If you are on windows, press and hold (or right-click) the folder, select Extract All..., and then follow the instructions. 69 | 70 | 71 | **5.** Now when you **run the `ls` command** again you should see a folder called `unix_lesson`, which means you are all set with the data download!
72 | 73 | ```bash 74 | $ ls 75 | ``` 76 | 77 | **6.** Go into the folder for the lesson 78 | 79 | on mac type: 80 | ```bash 81 | $ cd unix_lesson 82 | ``` 83 | 84 | on windows type: 85 | 86 | ```bash 87 | $ cd unix_lesson/unix_lesson 88 | ``` 89 | 90 | *** 91 | 92 | 93 | ## Reviewing shell commands 94 | 95 | ### Shell basics 96 | We are going to start this review with some basic commands pertaining to navigating around the filesystem. Helpful reference materials are listed below: 97 | 98 | * [Introduction to Shell](https://hbctraining.github.io/Shell-for-bioinformatics//lessons/01_the_filesystem.html) 99 | * [Wildcards and shortcuts in Shell](https://hbctraining.github.io/Shell-for-bioinformatics/lessons/02_wildcards_shortcuts.html) 100 | * [Examining and creating files](https://hbctraining.github.io/Shell-for-bioinformatics/lessons/03_working_with_files.html) 101 | 102 | 1. Change directory into the `unix_lesson/` directory. 103 | 2. Take a quick look at the `Mov10_oe_1.subset.fq` file (located in `raw_fastq` directory) using `less` from `unix_lesson/`, without changing directories. 104 | 3. Use a shortcut to move out of the directory to the parent of `unix_lesson/`. 105 | 4. Change directories into the `raw_fastq/` folder with a single command. 106 | 5. What does the `~` in the command prompt mean? 107 | 6. What is the full path to the `unix_lesson` directory? 108 | 8. List all the files in the `raw_fastq` directory. 109 | 8. Modify the above command using the `*` wildcard to only list those files that have "oe" in their names. 110 | 10. How many and which commands have you run so far? 111 | 112 | ### Searching and redirection 113 | Next, we will search our files for specific patterns and redirect the results to file. Helpful reference materials are listed below: 114 | 115 | * [Searching and redirection](https://hbctraining.github.io/Shell-for-bioinformatics/lessons/04_searching_files.html) 116 | 117 | 12. 
Create a new directory called `shell_review/` within the `unix_lesson/` directory. 118 | 13. Search the file `unix_lesson/reference_data/chr1-hg19_genes.gtf` for lines containing the string "MOV10". Save the output in the `shell_review/` directory with a new name - "Mov10_hg19.gtf". 119 | 14. Use `vim` to open the newly created file `unix_lesson/shell_review/Mov10_hg19.gtf` and add a comment at the top specifying how this file was created and the source of the content. Save the modified file and quit `vim`. 120 | 15. In the new file "Mov10_hg19.gtf", how many lines contain the word "exon"? 121 | 122 | ### Loops and shell scripts 123 | 124 | * [Shell scripts and variables in Shell](https://hbctraining.github.io/Shell-for-bioinformatics/lessons/05_shell-scripts_variable.html) 125 | * [Loops and automation](https://hbctraining.github.io/Shell-for-bioinformatics/lessons/06_loops_and_automation.html) 126 | 127 | 16. Use the `for` loop to iterate over each FASTQ file in `raw_fastq` and do the following: 128 | * Print the name of the current file 129 | * Generate a prefix to use for naming our output files, and store it inside a variable called `sample`. 130 | * Dump out the first 40 lines into a new file that will be saved in `shell_review` 131 | 17. Place the above `for` loop into a shell script using `vim` and run it. 132 | 133 | ### Permissions 134 | 135 | * [Interpreting the permissions string](https://hbctraining.github.io/Shell-for-bioinformatics/lessons/07_permissions_and_environment_variables.html#permissions) 136 | 137 | There is a folder in the HBC training shared space on the O2 cluster called `intro_rnaseq_hpc`. Below we have displayed a long listing of its contents. 
138 | 139 | ``` bash 140 | total 714 141 | drwxrwsr-x 3 mm573 hbctraining 1111 Aug 22 2017 bam_STAR 142 | drwxrwsr-x 8 mp298 hbctraining 1914 May 21 2018 bam_STAR38 143 | drwxrwsr-x 2 mm573 hbctraining 522 Oct 6 2015 bam_tophat 144 | drwxrwsr-x 2 mm573 hbctraining 240 Oct 19 2015 counts 145 | drwxrwsr-x 2 mm573 hbctraining 260 Oct 19 2015 counts_STAR 146 | -rw-rw-r-- 1 mm573 hbctraining 2416 Aug 22 2017 DE_script.R 147 | -rw-rw-r-- 1 mm573 hbctraining 2064 Mar 28 2018 DESeq2_script.R 148 | drwxrwsr-x 2 mm573 hbctraining 705 Oct 6 2015 fastqc 149 | drwxrwsr-x 2 mm573 hbctraining 272 Jan 31 2018 full_dataset 150 | -rw-rw-r-- 1 mm573 hbctraining 216 Nov 10 2015 install_libraries.R 151 | -rw-rw-r-- 1 mm573 hbctraining 117 Oct 19 2015 install_libraries.sh 152 | drwxrwsr-x 78 mm573 hbctraining 1969 Aug 22 2017 R-3.3.1 153 | drwxrwsr-x 3 mp298 hbctraining 234 Feb 27 2019 reference_data_ensembl38 154 | drwxrwsr-x 2 mm573 hbctraining 555 Oct 5 2015 reference_STAR 155 | drwxrwsr-x 2 rsk27 hbctraining 260 Aug 22 2017 salmon.ensembl37.idx 156 | drwxrwsr-x 2 mm573 hbctraining 306 Oct 6 2015 trimmed_fastq 157 | 158 | ``` 159 | 160 | 18. How many owners have files in this folder? 161 | 19. How many groups? 162 | 20. Are there any executable *files* in this folder? 163 | 21. What kind of access does the user `mm573` have to the `full_dataset/` directory? 164 | 22. You are considered as "other" or everyone else on this system (i.e. you are not part of the group `hbctraining`). What command would allow the user `mm573` to take away your ability to look inside the `full_dataset/` directory? 165 | 166 | 167 | ### Environment variables 168 | 169 | * [Understanding environment variables](https://hbctraining.github.io/Shell-for-bioinformatics/lessons/07_permissions_and_environment_variables.html#environment-variables) 170 | 171 | 23. Display the contents of the `$HOME` variable on your computer. 172 | 24.
Use the `which` command to check where the executable file for the `pwd` command lives in the directory structure. 173 | 25. How does shell know where to find the executable file for the `pwd` command? 174 | 26. Display the contents of the variable that stores the various paths to folders containing executable command files. 175 | 176 | 177 | 178 | ### Review your answers 179 | * [Answer key](shell_review_answer_key.md) 180 | 181 | **** 182 | 183 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.* 184 | -------------------------------------------------------------------------------- /lessons/shell_review_answer_key.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The Shell Review Answer Key" 3 | author: "Mary Piper, Radhika Khetani, Meeta Mistry, Jihe Liu" 4 | date: "October 26, 2020" 5 | --- 6 | 7 | ## Learning Objectives 8 | - Review shell commands and concepts 9 | 10 | 11 | ## Setting up 12 | 13 | The Introduction to RNA-seq workshop assumes that you have either a) taken our [Introduction to command-line interface workshop](https://hbctraining.github.io/Intro-to-shell-flipped/schedule/) or b) been working on the command-line and are already fluent with shell/bash. **We ask that you complete the exercises below**, to refresh some basic commands that you will be using over the course of the workshop. For each section we have relevant materials linked as a helpful reference. 
14 | 15 | ### Opening up a terminal window 16 | 17 | > *NOTE: This mandatory pre-work does not require you to login to the O2 cluster.* 18 | 19 | On your local laptop, you will need to open up your terminal window. This will be different depending on what kind of operating system (OS) you are working on. 20 | 21 | **With Mac OS** 22 | 23 | Macs have a utility application called "**Terminal**" for performing tasks on the command line (shell), both locally and on remote machines. 24 | 25 | Please find and open the Terminal utility on your computers using the *Spotlight Search* at the top right hand corner of your screen. 26 | 27 | **With Windows OS** 28 | 29 | By default, there is no built-in Terminal that uses the bash shell on the Windows OS. So, we will be using a downloaded program called "**Git BASH**" which is part of the [Git for Windows](https://git-for-windows.github.io/) tool set. **Git BASH is a shell/bash emulator.** What this means is that it shows you a very similar interface to, and provides you the functionality of, the Terminal utility found on the Mac and Linux Operating systems. 30 | 31 | Please find and open Git BASH. 32 | 33 | > **Tip** - Windows users can use another program called [Putty](http://www.chiark.greenend.org.uk/~sgtatham/putty/download.html) instead of a *bash emulator* to log in to remote machines, but it is a little more involved and has different capabilities. We encourage you to take a look at it, but we will not be covering it in this workshop. 34 | 35 | ### Downloading the example data folder 36 | 37 | The data you will be working with can be downloaded using the link below. Clicking on the link will automatically place a file called `unix_lesson.zip` to your `Downloads` folder on your computer. 
39 | 40 | - [Introduction to Shell: Dataset](https://github.com/hbctraining/Training-modules/blob/master/Intro_shell/data/unix_lesson.zip?raw=true) 41 | 42 | Now, in your terminal window change directories into your `Downloads` folder and check that the file is listed there: 43 | 44 | ```bash 45 | $ cd ~/Downloads 46 | $ ls -l unix_lesson.zip 47 | ``` 48 | 49 | To decompress the file into a folder called `unix_lesson` we use the `unzip` command: 50 | 51 | ```bash 52 | $ unzip unix_lesson.zip 53 | ``` 54 | 55 | Check to see that you have the folder `unix_lesson` before proceeding. 56 | 57 | ```bash 58 | $ ls -l unix_lesson 59 | ``` 60 | 61 | ## Reviewing shell commands 62 | 63 | ### Shell basics 64 | We are going to start this review with some basic commands pertaining to navigating around the filesystem. Helpful reference materials are listed below: 65 | 66 | * [Introduction to Shell](https://hbctraining.github.io/Intro-to-shell-flipped//lessons/01_the_filesystem.html) 67 | * [Wildcards and shortcuts in Shell](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/02_wildcards_shortcuts.html) 68 | * [Examining and creating files](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/03_working_with_files.html) 69 | 70 | 1. Change directory into the `unix_lesson/` directory. 71 | ```bash 72 | $ cd unix_lesson 73 | ``` 74 | 75 | 2. Take a quick look at the `Mov10_oe_1.subset.fq` file (located in `raw_fastq` directory) using `less` from `unix_lesson/`, without changing directories. 76 | ```bash 77 | $ less raw_fastq/Mov10_oe_1.subset.fq 78 | ``` 79 | 80 | 3. Use a shortcut to move out of the directory to the parent of `unix_lesson/`. 81 | ```bash 82 | $ cd .. 83 | ``` 84 | 85 | 4. Change directories into the `raw_fastq/` folder with a single command. 86 | ```bash 87 | $ cd unix_lesson/raw_fastq/ 88 | ``` 89 | 90 | 5. What does the `~` in the command prompt mean? 91 | Answer: `~` means home directory. 92 | 93 | 6.
What is the full path to the `unix_lesson` directory? 93 | Answer: `/Users/your_username/Downloads/unix_lesson` (**the result will vary based on your computer's file system**) 94 | 95 | 8. List all the files in the `raw_fastq` directory. 96 | ```bash 97 | # (option 1) You can navigate to the `raw_fastq` directory and say 98 | $ ls -l 99 | 100 | # (option 2) You can identify your location and give the full or relative path to raw_fastq 101 | ``` 102 | 103 | 8. Modify the above command using the `*` wildcard to only list those files that have "oe" in their names. 104 | ```bash 105 | # (option 1) You can navigate to the `raw_fastq` directory and say 106 | $ ls -l *oe* 107 | 108 | # (option 2) You can identify your location and give the full or relative path to raw_fastq 109 | ``` 110 | 111 | 10. How many and which commands have you run so far? 112 | ```bash 113 | $ history 114 | ``` 115 | Answer: Result will vary based on your activity. 116 | 117 | ### Searching and redirection 118 | Next, we will search our files for specific patterns and redirect the results to file. Helpful reference materials are listed below: 119 | 120 | * [Searching and redirection](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/04_searching_files.html) 121 | 122 | 12. Create a new directory called `shell_review/` within the `unix_lesson/` directory. 123 | ```bash 124 | # First make sure that you navigate to the `unix_lesson/` directory 125 | $ mkdir shell_review 126 | ``` 127 | 13. Search the file `unix_lesson/reference_data/chr1-hg19_genes.gtf` for lines containing the string "MOV10". Save the output in the `shell_review/` directory with a new name - "Mov10_hg19.gtf". 128 | ```bash 129 | # First make sure that you navigate to the `unix_lesson/` directory 130 | $ grep MOV10 reference_data/chr1-hg19_genes.gtf > shell_review/Mov10_hg19.gtf 131 | ``` 132 | 14. 
Use `vim` to open the newly created file `unix_lesson/shell_review/Mov10_hg19.gtf` and add a comment at the top specifying how this file was created and the source of the content. Save the modified file and quit `vim`. 133 | 134 | Answer: 135 | * Open file - `vim shell_review/Mov10_hg19.gtf` 136 | * Edit file - `i`- Add text using `#` to indicate comment 137 | * Exit edit mode - `esc` 138 | * Save and quit - `:wq`. 139 | 140 | 15. In the new file "Mov10_hg19.gtf", how many lines contain the word "exon"? 141 | ```bash 142 | # First make sure that you navigate to the `unix_lesson/` directory 143 | $ grep exon shell_review/Mov10_hg19.gtf | wc -l 144 | ``` 145 | Answer: 42 146 | 147 | ### Loops and shell scripts 148 | 149 | * [Shell scripts and variables in Shell](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/05_shell-scripts_variable.html) 150 | * [Loops and automation](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/06_loops_and_automation.html) 151 | 152 | 16. Use the `for` loop to iterate over each FASTQ file in `raw_fastq` and do the following: 153 | * Print the name of the current file 154 | * Generate a prefix to use for naming our output files, and store it inside a variable called `sample`. 155 | * Dump out the first 40 lines into a new file that will be saved in `shell_review` 156 | 157 | ```bash 158 | # First make sure that you navigate to the `raw_fastq/` directory 159 | $ for file in *fq 160 | > do 161 | > echo $file 162 | > sample=`basename $file .subset.fq` 163 | > head -n 40 $file > ../shell_review/${sample}_first40.fq 164 | > done 165 | ``` 166 | 17. Place the above `for` loop into a shell script using `vim` and run it. 
167 | 168 | Answer: Navigate to the `raw_fastq/` directory, and create a script `vim generate_first40.sh` 169 | 170 | ```bash 171 | #!/bin/bash 172 | for file in *fq 173 | do 174 | echo $file 175 | sample=`basename $file .subset.fq` 176 | head -n 40 $file > ../shell_review/${sample}_first40.fq 177 | done 178 | ``` 179 | Run the script with the following command: 180 | 181 | ```bash 182 | $ sh generate_first40.sh 183 | ``` 184 | 185 | ### Permissions 186 | 187 | * [Interpreting the permissions string](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/07_permissions_and_environment_variables.html#permissions) 188 | 189 | There is a folder in the HBC training shared space on the O2 cluster called `intro_rnaseq_hpc`. Below we have displayed a long listing of its contents. 190 | 191 | ``` bash 192 | total 714 193 | drwxrwsr-x 3 mm573 hbctraining 1111 Aug 22 2017 bam_STAR 194 | drwxrwsr-x 8 mp298 hbctraining 1914 May 21 2018 bam_STAR38 195 | drwxrwsr-x 2 mm573 hbctraining 522 Oct 6 2015 bam_tophat 196 | drwxrwsr-x 2 mm573 hbctraining 240 Oct 19 2015 counts 197 | drwxrwsr-x 2 mm573 hbctraining 260 Oct 19 2015 counts_STAR 198 | -rw-rw-r-- 1 mm573 hbctraining 2416 Aug 22 2017 DE_script.R 199 | -rw-rw-r-- 1 mm573 hbctraining 2064 Mar 28 2018 DESeq2_script.R 200 | drwxrwsr-x 2 mm573 hbctraining 705 Oct 6 2015 fastqc 201 | drwxrwsr-x 2 mm573 hbctraining 272 Jan 31 2018 full_dataset 202 | -rw-rw-r-- 1 mm573 hbctraining 216 Nov 10 2015 install_libraries.R 203 | -rw-rw-r-- 1 mm573 hbctraining 117 Oct 19 2015 install_libraries.sh 204 | drwxrwsr-x 78 mm573 hbctraining 1969 Aug 22 2017 R-3.3.1 205 | drwxrwsr-x 3 mp298 hbctraining 234 Feb 27 2019 reference_data_ensembl38 206 | drwxrwsr-x 2 mm573 hbctraining 555 Oct 5 2015 reference_STAR 207 | drwxrwsr-x 2 rsk27 hbctraining 260 Aug 22 2017 salmon.ensembl37.idx 208 | drwxrwsr-x 2 mm573 hbctraining 306 Oct 6 2015 trimmed_fastq 209 | 210 | ``` 211 | 212 | 18. How many owners have files in this folder? 
213 | 214 | Answer: 3 215 | 216 | 19. How many groups? 217 | 218 | Answer: 1 219 | 220 | 20. Are there any executable *files* in this folder? 221 | 222 | Answer: No 223 | 224 | 21. What kind of access does the user `mm573` have to the `full_dataset/` directory? 225 | 226 | Answer: r(read), w(write/edit), and x(execute). 227 | 228 | 22. You are considered as "other" or everyone else on this system (i.e. you are not part of the group `hbctraining`). What command would the user `mm573` run to take away your ability to look inside the `full_dataset/` directory? 229 | 230 | Answer: chmod o-r full_dataset/ 231 | 232 | ### Environment variables 233 | 234 | * [Understanding environment variables](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/07_permissions_and_environment_variables.html#environment-variables) 235 | 236 | 23. Display the contents of the `$HOME` variable on your computer. 237 | ```bash 238 | $ echo $HOME 239 | ``` 240 | 24. Use the `which` command to check where the executable file for the `pwd` command lives in the directory structure. 241 | ```bash 242 | $ which pwd 243 | ``` 244 | 25. How does the shell know where to find the executable file for the `pwd` command? 245 | 246 | Answer: the shell searches through each path in $PATH until it finds an executable file for the `pwd` command. 247 | 248 | 26. Display the contents of the variable that stores the various paths to folders containing executable command files. 249 | 250 | ```bash 251 | $ echo $PATH 252 | ``` 253 | 254 | **** 255 | 256 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). 
These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.* 257 | -------------------------------------------------------------------------------- /schedule/README.md: -------------------------------------------------------------------------------- 1 | # Introduction to bulk RNA-seq: From reads to count matrix 2 | 3 | > **Pre-requisite for this workshop:** The *Basic Data Skills* [Shell for Bioinformatics](https://hbctraining.github.io/Shell-for-bioinformatics/) workshop or a working knowledge of the command line and cluster computing. 4 | 5 | ## Pre-reading 6 | 7 | * [Shell basics review](../lessons/shell_review.md) 8 | * [Best Practices in Research Data Management (RDM)](../lessons/04a_data_organization.md) 9 | * [Introduction to RNA-seq](../lessons/01_intro-to-RNAseq.md) 10 | 11 | ## Day 1 12 | 13 | | Time | Topic | Instructor | 14 | |:-----------:|:----------:|:--------:| 15 | | 09:30 - 09:45 | [Workshop Introduction](../lectures/workshop_intro_slides.pdf) | Will | 16 | | 09:45 - 10:25 | [Working in an HPC environment - Review](../lessons/03_working_on_HPC.md) | Upen | 17 | | 10:25 - 11:05 | [Project Organization (using Data Management best practices)](../lessons/04b_data_organization.md) | Will | 18 | | 11:05 - 11:45 | [Quality Control of Sequence Data: Running FASTQC](../lessons/05_qc_running_fastqc_interactively.md) | Upen | 19 | | 11:45 - 12:00 | Overview of self-learning materials and homework submission | Will | 20 | 21 | ### Before the next class: 22 | 23 | 1. 
Please **study the contents** and **work through all the code** within the following lessons: 24 | 25 | * [Experimental design considerations](../lessons/02_experimental_planning_considerations.md) 26 | * [Quality Control of Sequence Data: Running FASTQC on multiple samples](../lessons/06_qc_running_fastqc_sbatch.md) 27 | * [Quality Control of Sequence Data: Evaluating FASTQC reports](../lessons/07_qc_fastqc_assessment.md) 28 | 29 | > **NOTE:** To run through the code above, you will need to be **logged into O2** and **working on a compute node** (i.e. your command prompt should have the word `compute` in it). 30 | > 1. Log in using `ssh rc_trainingXX@o2.hms.harvard.edu` and enter your password (replace the "XX" in the username with the number you were [assigned in class](https://docs.google.com/spreadsheets/d/1kBlYowhjjHJC9ZovmbBULmbqozKpprM17vZ2wPlhNg0/edit#gid=0)). 31 | > 2. Once you are on the login node, use `srun --pty -p interactive -t 0-2:30 --mem 1G /bin/bash` to get on a compute node or as specified in the lesson. 32 | > 3. Proceed only once your command prompt has the word `compute` in it. 33 | > 4. If you log out between lessons (using the `exit` command twice), please follow points 1. and 2. above to log back in and get on a compute node when you restart with the self learning. 34 | 35 | 2. **Complete the exercises**: 36 | * Each lesson above contains exercises; please go through each of them. 37 | * Add your answers to the questions to [Google forms](https://docs.google.com/forms/d/e/1FAIpQLSdxdSM4528uYTWT7k5c8gYAuCUaTqRkUSI88eUmKg7qyQZZAQ/viewform?usp=sf_link) by the **day before the next class**. 38 | 39 | ### Questions? 
40 | * ***If you get stuck due to an error*** while running code in the lesson, [email us](mailto:hbctraining@hsph.harvard.edu) 41 | 42 | *** 43 | 44 | ## Day 2 45 | 46 | | Time | Topic | Instructor | 47 | |:-----------:|:----------:|:--------:| 48 | | 09:30 - 10:30 | Self-learning lessons review | All | 49 | | 10:30 - 11:10 | [Expression quantification: Theory and Tools](../lectures/expression_quantification.pdf) | Will | 50 | | 11:10 - 11:50 | [Quantifying expression using alignment-free methods (Salmon)](../lessons/08_quasi_alignment_salmon.md) | Upen | 51 | | 11:50 - 12:00 | [Review of workflow](../lectures/workflow_overview.pdf) | Upen | 52 | 53 | ### Before the next class: 54 | 55 | 1. Please **study the contents** and **work through all the code** within the following lessons: 56 | 57 | * [Quantifying expression using alignment-free methods (Salmon on multiple samples)](../lessons/09_quasi_alignment_salmon_sbatch.md) 58 | 
59 | Click here for a preview of this lesson 60 |
Now that we know how to run the quantification of one sample with Salmon, this lesson will guide you to run multiple samples by creating a job submission script.

61 |
62 | * [QC with Alignment Data](../lessons/10_QC_Qualimap.md) 63 |
64 | Click here for a preview of this lesson 65 |
Besides transcript-level quantification, we also want to understand the quality of the mapping, which is not provided in Salmon output.

This lesson will cover:
66 | - Aligning the reads with an aligner, STAR
67 | - Assessing QC metrics among samples

68 |
69 | * [Documenting Steps in the Workflow with MultiQC](../lessons/11_multiQC.md) 70 |
71 | Click here for a preview of this lesson 72 |
It would be great to have a summary document of all QC results from the previous analysis.

This lesson will cover:
73 | - Generating such a summary report with multiQC
74 | - Generating alignment metric with Qualimap

75 |
76 | 77 | > **NOTE:** To run through the code above, you will need to be **logged into O2** and **working on a compute node** (i.e. your command prompt should have the word `compute` in it). 78 | > 1. Log in using `ssh rc_trainingXX@o2.hms.harvard.edu` and enter your password (replace the "XX" in the username with the number you were assigned in class). 79 | > 2. Once you are on the login node, use `srun --pty -p interactive -t 0-2:30 --mem 8G /bin/bash` to get on a compute node or as specified in the lesson. 80 | > 3. Proceed only once your command prompt has the word `compute` in it. 81 | > 4. If you log out between lessons (using the `exit` command twice), please follow points 1. and 2. above to log back in and get on a compute node when you restart with the self learning. 82 | 83 | 2. **Complete the exercises**: 84 | * Each lesson above contains exercises; please go through each of them. 85 | * Add your answers to the questions to [Google forms](https://docs.google.com/forms/d/e/1FAIpQLScxaj3IIO4Bx7FCRw87cCeuTPQyhD_7WR2QU638y8IZDv5r1A/viewform?usp=sf_link) by the **day before the next class**. 86 | 87 | ### Questions? 
88 | * ***If you get stuck due to an error*** while running code in the lesson, [email us](mailto:hbctraining@hsph.harvard.edu) 89 | 90 | *** 91 | 92 | ## Day 3 93 | 94 | | Time | Topic | Instructor | 95 | |:-----------:|:----------:|:--------:| 96 | | 09:30 - 10:10 | Self-learning lessons review | All | 97 | | 10:10 - 11:10 | [Automating the RNA-seq workflow](../lessons/12_automating_workflow.md) | Will | 98 | | 11:10 - 11:45 | [Troubleshooting RNA-seq Data Analysis](../lectures/RNA-seq_troubleshooting.pdf) | Upen | 99 | | 11:45 - 12:00 | [Wrap up](../lectures/workshop_wrapup_slides.pdf) | Will | 100 | 101 | *** 102 | 103 | * Downloadable Answer Keys (Day 2 exercises): 104 | * [Experimental design (one possible solution)](https://www.dropbox.com/scl/fi/vk6g9qvvosgmjjonoqint/exp_design_table.xlsx?rlkey=rbxkeln9mm0lxf4kdjbrqsidt&st=6sf562u3&dl=0) 105 | * [sbatch script](https://www.dropbox.com/scl/fi/3y7oa5i1eub7dzajfpko7/mov10_fastqc.run?rlkey=4eii6tc6nrludbjagcdgs1qxi&st=ipmbrx9y&dl=0) 106 | * [.out file](https://www.dropbox.com/scl/fi/m0f1ux4522sw2flt73aje/22914006.out?rlkey=sizy0vkm0r5fz14uyswrtdeew&st=sk7gh4i5&dl=0) 107 | * [.err file](https://www.dropbox.com/scl/fi/iye10ysh780danfo6r6v6/22914006.err?rlkey=maeny1p52dmio5ovli8c5ipss&st=lu0iuvcw&dl=0) 108 | 109 | * Downloadable Answer Keys (Day 3 exercises): 110 | * [sbatch script to run salmon for all samples](../answer_key/salmon_all_samples.sbatch) 111 | 112 | * [Automation Script](../scripts/rnaseq_analysis_on_input_file.sh) 113 | 114 | *** 115 | 116 | ## Resources 117 | * [Getting an O2 account](https://harvardmed.atlassian.net/wiki/spaces/O2/pages/1918304257/How+to+request+an+O2+account) 118 | * [Video about statistics behind salmon quantification](https://www.youtube.com/watch?v=TMLIxwDP7sk) 119 | * Advanced bash for working on O2: 120 | * [Creating shortcuts or aliases](https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionVI/lessons/more_bash.html#alias) 121 | * [Copying files from other 
remote locations to O2](https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionVI/lessons/more_bash.html#rsync) 122 | * [Creating symbolic links](https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionVI/lessons/more_bash.html#symlink) 123 | * [Obtaining reference genomes or transcriptomes](https://hbctraining.github.io/Accessing_public_genomic_data/lessons/accessing_genome_reference_data.html) 124 | * YouTube videos 125 | * [Hash tables - Paul Programming](https://www.youtube.com/watch?v=MfhjkfocRR0&ab_channel=PaulProgramming) 126 | * [Suffix arrays - William Fiset](https://www.youtube.com/watch?v=zqKlL3ZpTqs) 127 | *** 128 | 129 | ## Building on this workshop 130 | * [Introduction to R workshop materials](https://hbctraining.github.io/Intro-to-R-flipped/#lessons) 131 | * [Introduction to Differential Gene Expression analysis workshop materials](https://hbctraining.github.io/Intro-to-DGE/#lessons) 132 | 133 | *** 134 | *These materials have been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.* 135 | -------------------------------------------------------------------------------- /schedule/links-to-lessons.md: -------------------------------------------------------------------------------- 1 | # Introduction to bulk RNA-seq: From reads to count matrix 2 | 3 | ## Learning Objectives 4 | 5 | - Understand the necessity for, and use of, the command line interface (bash) and HPC for analyzing high-throughput sequencing data. 6 | - Understand best practices for designing an RNA-seq experiment and analyzing the resulting data. 
7 | 8 | ## Installations 9 | 10 | ***All:*** 11 | 12 | * [FileZilla Client](https://filezilla-project.org/download.php?type=client) (make sure you get ‘FileZilla Client') 13 | 14 | ***Mac users:*** 15 | 16 | * Plain text editor like [Sublime text](http://www.sublimetext.com/) or similar 17 | 18 | ***Windows users:*** 19 | 20 | * [GitBash](https://git-scm.com/download/win) 21 | * Plain text editor like [Notepad++](http://notepad-plus-plus.org/) or similar 22 | 23 | ## Notes 24 | * These materials focus on the use of local computational resources at Harvard, which are **only accessible to Harvard affiliates** 25 | * Non-Harvard folks can [download the data](https://www.dropbox.com/s/t3lkyz1pz021222/unix_lesson.tar.gz?dl=0) and set up to work on their local clusters (with the help of local system administrators) 26 | 27 | ### Instructions for Harvard researchers with access to HMS-RC's O2 cluster 28 | 29 | To run through the code in the lessons below, you will need to be **logged into O2** and **working on a compute node** (i.e. your command prompt should have the word `compute` in it). 30 | 31 | 1. Log in using `ssh ecommonsID@o2.hms.harvard.edu` and enter your password. 32 | 2. Once you are on the login node, use `srun --pty -p interactive -t 0-2:30 --mem 1G /bin/bash` to get on a compute node or as specified in the lesson. 33 | 3. Proceed only once your command prompt has the word `compute` in it. 34 | 4. If you log out between lessons (using the `exit` command twice), please follow points 1. and 2. above to log back in and get on a compute node when you restart with the self learning. 35 | 36 | ## Lessons 37 | 38 | ### Part 1 39 | 1. [Introduction to RNA-seq](../lessons/01_intro-to-RNAseq.md) 40 | 1. [Shell basics review](../lessons/shell_review.md) 41 | 1. [Working in an HPC environment - Review](../lessons/03_working_on_HPC.md) 42 | 1. [Best Practices in Research Data Management (RDM)](../lessons/04a_data_organization.md) 43 | 1. 
[Project Organization (using Data Management best practices)](../lessons/04b_data_organization.md) 44 | 45 | *** 46 | 47 | ### Part II 48 | 1. [Quality Control of Sequence Data: Running FASTQC](../lessons/05_qc_running_fastqc_interactively.md) 49 | 1. [Experimental design considerations](../lessons/02_experimental_planning_considerations.md) 50 | 1. [Quality Control of Sequence Data: Running FASTQC on multiple samples](../lessons/06_qc_running_fastqc_sbatch.md) 51 | 1. [Quality Control of Sequence Data: Evaluating FASTQC reports](../lessons/07_qc_fastqc_assessment.md) 52 | 53 | *** 54 | 55 | ### Part III 56 | 1. [Sequence Alignment Theory](../lectures/alignment_quantification.pdf) 57 | 1. [Quantifying expression using alignment-free methods (Salmon on multiple samples)](../lessons/09_quasi_alignment_salmon_sbatch.md) 58 | 59 | *** 60 | 61 | ### Part IV 62 | 63 | 1. [QC with Alignment Data](../lessons/10_QC_Qualimap.md) 64 | 1. [Documenting Steps in the Workflow with MultiQC](../lessons/11_multiQC.md) 65 | 1. [Troubleshooting RNA-seq Data Analysis](../lectures/RNA-seq_troubleshooting.pdf) 66 | 67 | *** 68 | 69 | ### Part V 70 | 71 | 1. [Automating the RNA-seq workflow](../lessons/12_automating_workflow.md) 72 | 73 | *** 74 | 75 | ### Answer Keys 76 | 77 | * [Experimental design (one possible solution)](https://www.dropbox.com/s/524mevuyba34l5b/exp_design_table.xlsx?dl=1) 78 | * [FASTQC sbatch script](https://www.dropbox.com/s/9wdyhfqpic05l6p/mov10_fastqc.run?dl=1) 79 | * [FASTQC sbatch script .out file](https://www.dropbox.com/s/l7puf8oahtbwmpk/22914006.out?dl=1) 80 | * [FASTQC sbatch script .err file](https://www.dropbox.com/s/8a1g6o9t2kxit30/22914006.err?dl=1). 
81 | * [sbatch script to run salmon for all samples](../answer_key/salmon_all_samples.sbatch) 82 | * [Automation Script](../scripts/rnaseq_analysis_on_input_file.sh) 83 | 84 | *** 85 | 86 | ## Building on this workshop 87 | * [Introduction to R workshop materials](https://hbctraining.github.io/Intro-to-R-flipped/schedule/links-to-lessons.html) 88 | * [Bulk RNA-seq Part II (differential gene expression analysis) materials](https://hbctraining.github.io/DGE_workshop_salmon_online/schedule/links-to-lessons.html) 89 | 90 | *** 91 | 92 | ## Resources 93 | * [Video about statistics behind salmon quantification](https://www.youtube.com/watch?v=TMLIxwDP7sk) 94 | * Advanced bash for working on O2: 95 | * [Creating shortcuts or aliases](https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionVI/lessons/more_bash.html#alias) 96 | * [Copying files from other remote locations to O2](https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionVI/lessons/more_bash.html#rsync) 97 | * [Creating symbolic links](https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionVI/lessons/more_bash.html#symlink) 98 | * [Obtaining reference genomes or transcriptomes](https://hbctraining.github.io/Accessing_public_genomic_data/lessons/accessing_genome_reference_data.html) 99 | 100 | *** 101 | *These materials have been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). 
These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.* 102 | -------------------------------------------------------------------------------- /scripts/PE-rnaseq_analysis_on_allfiles_for-slurm.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | 4 | # This script is for PAIRED-END data which typically have the following naming convention: 5 | # - {sample}_R1.fastq and {sample}_R2.fastq 6 | # OR 7 | # - {sample}_1.fastq and {sample}_2.fastq 8 | 9 | # NOTE: change the extension in the glob below to match the naming convention of your PE files, and keep it in sync with the suffixes used in PE-rnaseq_analysis_on_input_file.sh. 10 | 11 | for fq in ~/unix_lesson/rnaseq/raw_data/*_R1.fq 12 | do 13 | 14 | sbatch -p short -t 0-2:00 -c 6 --job-name rnaseq-workflow --wrap="sh ~/unix_lesson/rnaseq/scripts/PE-rnaseq_analysis_on_input_file.sh $fq" 15 | sleep 1 # wait 1 second between each job submission 16 | 17 | done 18 | -------------------------------------------------------------------------------- /scripts/PE-rnaseq_analysis_on_input_file.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script is for PAIRED-END data which typically have the following naming convention: 4 | # - {sample}_R1.fastq and {sample}_R2.fastq 5 | # OR 6 | # - {sample}_1.fastq and {sample}_2.fastq 7 | 8 | # You will need to identify which format matches your data (i.e .fastq, .fq, .fq.gz) and **modify line 22 and line 26 accordingly!** 9 | 10 | 11 | # USAGE: sh PE-rnaseq_analysis_on_input_file.sh <path to R1 fastq file> 12 | # The script takes as input the R1 fastq file ONLY. It runs FastQC, STAR, Qualimap and Salmon. 
13 | 14 | 15 | # initialize a variable with an intuitive name to store the name of the input fastq file for Read1 16 | fq1=$1 17 | 18 | # grab the path information to use for loading the Read2 fastq file 19 | path=`temp=$( realpath "$fq1" ) && dirname "$temp"` 20 | 21 | # grab base of filename (derived from the Read1 file, with its _R1 suffix stripped) 22 | samplename=`basename $fq1 _R1.fastq` 23 | echo "Starting analysis of sample $samplename" 24 | 25 | # create a variable to store the read 2 file 26 | fq2=${path}/${samplename}_R2.fastq 27 | 28 | # change directories to /n/scratch3/ so that all the analysis is stored there. 29 | cd /n/scratch3/users/r/$USER/rnaseq_hbc-workshop/ 30 | 31 | # specify the number of cores to use 32 | cores=6 33 | 34 | # directory with the genome and transcriptome index files + name of the gene annotation file 35 | genome=/n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/ensembl38_STAR_index 36 | transcriptome=/n/groups/hbctraining/rna-seq_2019_02/reference_data/salmon_index 37 | gtf=/n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.1.gtf 38 | 39 | # make all of the output directories 40 | # The -p option means mkdir will create the whole path if it 41 | # does not exist and refrain from complaining if it does exist 42 | mkdir -p results/fastqc/ 43 | mkdir -p results/STAR/ 44 | mkdir -p results/qualimap/ 45 | mkdir -p results/salmon/ 46 | 47 | # set up output filenames and locations 48 | fastqc_out=results/fastqc/ 49 | align_out=results/STAR/${samplename} 50 | align_out_bam=results/STAR/${samplename}_Aligned.sortedByCoord.out.bam 51 | qualimap_out=results/qualimap/${samplename}.qualimap 52 | salmon_out=results/salmon/${samplename}.salmon 53 | salmon_mappings=results/salmon/${samplename}_salmon.out 54 | 55 | # set up the software environment (use version numbers) 56 | module load fastqc/0.11.3 57 | module load gcc/6.2.0 58 | module load star/2.7.0a 59 | module load samtools/1.3.1 60 | module load java/jdk-1.8u112 61 | module load qualimap/2.2.1 62
| module load salmon/1.4.0 63 | unset DISPLAY 64 | 65 | echo "Processing file $fq1" 66 | 67 | echo "Starting QC for $samplename" 68 | 69 | # Run FastQC and move output to the appropriate folder 70 | fastqc -o $fastqc_out $fq1 $fq2 71 | 72 | 73 | # Run STAR 74 | STAR --runThreadN $cores --genomeDir $genome --readFilesIn $fq1 $fq2 --outFileNamePrefix $align_out --outSAMtype BAM SortedByCoordinate --outSAMunmapped Within --outSAMattributes Standard 75 | 76 | # Run Qualimap 77 | qualimap rnaseq \ 78 | -outdir $qualimap_out \ 79 | -a proportional \ 80 | -bam $align_out_bam \ 81 | -p strand-specific-reverse \ 82 | -gtf $gtf \ 83 | --java-mem-size=8G 84 | 85 | # Run salmon 86 | 87 | echo "Starting Salmon run for $samplename" 88 | 89 | salmon quant -i $transcriptome \ 90 | -p $cores \ 91 | -l A \ 92 | -1 $fq1 -2 $fq2 \ 93 | -o $salmon_out \ 94 | --seqBias \ 95 | --useVBOpt 96 | -------------------------------------------------------------------------------- /scripts/mov10_fastqc.run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -p short # partition name 4 | #SBATCH -t 0-2:00 # days-hours:minutes runlimit after which job will be killed 5 | #SBATCH -c 6 # number of cores requested -- this needs to be greater than or equal to the number of cores you plan to use to run your job 6 | #SBATCH --job-name rnaseq_mov10_fastqc # Job name 7 | #SBATCH -o %j.out # File to which standard out will be written 8 | #SBATCH -e %j.err # File to which standard err will be written 9 | 10 | ## Changing directories to where the fastq files are located 11 | cd ~/unix_workshop/rnaseq/raw_data 12 | 13 | ## Loading modules required for script commands 14 | module load seq/fastqc/0.11.3 15 | 16 | ## Running FASTQC 17 | fastqc -t 6 *.fq 18 | 19 | ## Moving files to our results directory 20 | mv *fastqc* ../results/fastqc/ 21 | -------------------------------------------------------------------------------- 
/scripts/rnaseq_analysis_on_allfiles_for-slurm.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | for fq in ~/unix_lesson/rnaseq/raw_data/*.fq 4 | do 5 | 6 | sbatch -p short -t 0-2:00 -c 6 --job-name rnaseq-workflow --wrap="sh ~/unix_lesson/rnaseq/scripts/rnaseq_analysis_on_input_file.sh $fq" 7 | sleep 1 # wait 1 second between each job submission 8 | 9 | done 10 | -------------------------------------------------------------------------------- /scripts/rnaseq_analysis_on_input_file.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script takes a fastq file of RNA-seq data, runs FastQC, STAR, Qualimap and Salmon. 4 | # USAGE: sh rnaseq_analysis_on_input_file.sh <path to fastq file> 5 | 6 | # change directories to /n/scratch3/ so that all the analysis is stored there. 7 | cd /n/scratch3/users/r/$USER/rnaseq_hbc-workshop/ 8 | 9 | # initialize a variable with an intuitive name to store the name of the input fastq file 10 | fq=$1 11 | 12 | # grab base of filename for naming outputs 13 | samplename=`basename $fq .subset.fq` 14 | echo "Sample name is $samplename" 15 | 16 | # specify the number of cores to use 17 | cores=6 18 | 19 | # directory with the genome and transcriptome index files + name of the gene annotation file 20 | genome=/n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/ensembl38_STAR_index 21 | transcriptome=/n/groups/hbctraining/rna-seq_2019_02/reference_data/salmon_index 22 | gtf=/n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.1.gtf 23 | 24 | # make all of the output directories 25 | # The -p option means mkdir will create the whole path if it 26 | # does not exist and refrain from complaining if it does exist 27 | mkdir -p results/fastqc/ 28 | mkdir -p results/STAR/ 29 | mkdir -p results/qualimap/ 30 | mkdir -p results/salmon/ 31 | 32 | # set up output filenames and locations 33 | 
fastqc_out=results/fastqc/ 34 | align_out=results/STAR/${samplename} 35 | align_out_bam=results/STAR/${samplename}_Aligned.sortedByCoord.out.bam 36 | qualimap_out=results/qualimap/${samplename}.qualimap 37 | salmon_out=results/salmon/${samplename}.salmon 38 | salmon_mappings=results/salmon/${samplename}_salmon.out 39 | 40 | # set up the software environment (use version numbers) 41 | module load fastqc/0.11.3 42 | module load gcc/6.2.0 43 | module load star/2.7.0a 44 | module load samtools/1.3.1 45 | module load java/jdk-1.8u112 46 | module load qualimap/2.2.1 47 | module load salmon/1.4.0 48 | unset DISPLAY 49 | 50 | echo "Processing file $fq" 51 | 52 | echo "Starting QC for $samplename" 53 | 54 | # Run FastQC and move output to the appropriate folder 55 | fastqc -o $fastqc_out $fq 56 | 57 | 58 | # Run STAR 59 | STAR --runThreadN $cores --genomeDir $genome --readFilesIn $fq --outFileNamePrefix $align_out --outSAMtype BAM SortedByCoordinate --outSAMunmapped Within --outSAMattributes Standard 60 | 61 | # Run Qualimap 62 | qualimap rnaseq \ 63 | -outdir $qualimap_out \ 64 | -a proportional \ 65 | -bam $align_out_bam \ 66 | -p strand-specific-reverse \ 67 | -gtf $gtf \ 68 | --java-mem-size=8G 69 | 70 | # Run salmon 71 | 72 | echo "Starting Salmon run for $samplename" 73 | 74 | salmon quant -i $transcriptome \ 75 | -p $cores \ 76 | -l A \ 77 | -r $fq \ 78 | -o $salmon_out \ 79 | --seqBias \ 80 | --useVBOpt 81 | -------------------------------------------------------------------------------- /scripts/salmon_all_files_PE.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -p priority # partition name 4 | #SBATCH -t 0-12:00 # days-hours:minutes runlimit after which job will be killed 5 | #SBATCH -c 6 # number of cores requested - what you plan to use to run your job 6 | #SBATCH --mem 8G 7 | #SBATCH --job-name salmon_mapping_PE # Job name 8 | #SBATCH -o salmon-mapping.out # File to which standard out will be 
written 9 | #SBATCH -e salmon_mapping.err # File to which standard err will be written 10 | 11 | 12 | # Change directories 13 | cd ~/rnaseq/raw_data 14 | 15 | # Get all sample names from a file that contains the prefix 16 | files=`cut -f 1 samples.csv` 17 | 18 | for sample in $files 19 | 20 | do 21 | 22 | salmon quant -i /n/groups/hbctraining/rna-seq_2019_02/reference_data/salmon.ensembl38.idx \ 23 | -l A \ 24 | -1 ${sample}_R1.fastq -2 ${sample}_R2.fastq \ 25 | -o ../results/salmon/${sample} \ 26 | -p 6 \ 27 | --seqBias \ 28 | --useVBOpt \ 29 | --numBootstraps 30 30 | 31 | done 32 | -------------------------------------------------------------------------------- /scripts/star_genome_index.run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -p short # partition name 4 | #SBATCH -t 0-2:00 # days-hours:minutes runlimit after which job will be killed 5 | #SBATCH -c 6 # number of cores requested -- this needs to be greater than or equal to the number of cores you plan to use to run your job 6 | #SBATCH --mem 16G 7 | #SBATCH --job-name STAR_index # Job name 8 | #SBATCH -o %j.out # File to which standard out will be written 9 | #SBATCH -e %j.err # File to which standard err will be written 10 | 11 | cd /n/scratch2/username/ 12 | 13 | module load gcc/6.2.0 star/2.5.4a 14 | 15 | STAR --runThreadN 6 \ 16 | --runMode genomeGenerate \ 17 | --genomeDir chr1_hg38_index \ 18 | --genomeFastaFiles /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.dna.chromosome.1.fa \ 19 | --sjdbGTFfile /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.gtf \ 20 | --sjdbOverhang 99 21 | --------------------------------------------------------------------------------