├── secondary_analysis_scripts ├── README.txt ├── src │ ├── ChainedNotebookSupport.R │ ├── CountsPcaPlotter.R │ └── PcaPlotter.R └── 1_RNASeq_Count_Metadata_Annotation_Load_and_Integration.ipynb ├── reference ├── README.txt ├── Homo_sapiens_GRCh38p13_gencodev38_ANNOT.Rdata └── KavitaSamplesMetadata.tsv ├── README.md └── primary_analysis_scripts ├── ReadMe.md ├── submit_dl_counts.sh ├── submit_dl_fastqs.sh ├── calculate_counts ├── calculate_counts.sh ├── RSEM_gene_parser.py ├── RSEM_isoform_parser.py └── RSEM_count_parser.py ├── submit_humanPE.sh ├── dl_counts.sh └── run_human_PE_aws.sh /secondary_analysis_scripts/README.txt: -------------------------------------------------------------------------------- 1 | This directory holds jupyter notebooks and associated source scripts used to perform the analysis. -------------------------------------------------------------------------------- /reference/README.txt: -------------------------------------------------------------------------------- 1 | This directory holds files referenced by the analysis, such as papers, metadata files, Rdata annotation files, etc. 
2 | -------------------------------------------------------------------------------- /reference/Homo_sapiens_GRCh38p13_gencodev38_ANNOT.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucsd-ccbb/VK2-vaginal-epithelial-cell-RNA-seq-analysis/main/reference/Homo_sapiens_GRCh38p13_gencodev38_ANNOT.Rdata -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VK2 vaginal epithelial cell RNA seq analysis 2 | 3 | Comprehensive Jupyter notebooks and supporting code to reproduce the analysis from the paper 'Resident microbes shape the vaginal epithelial glycan landscape' (link coming soon) 4 | -------------------------------------------------------------------------------- /primary_analysis_scripts/ReadMe.md: -------------------------------------------------------------------------------- 1 | This folder contains scripts that were used for the following: 2 | 3 | 1. Downloading fastq files (*submit_dl_fastqs.sh*) 4 | 2. Running QC, trim, and alignment (*submit_humanPE.sh*) 5 | 3. Download results to create a counts matrix file. 
#!/bin/bash
# Merge the per-sample RSEM outputs found under a data directory into the
# combined tables by running the three parser scripts that sit next to this
# script (RSEM_count_parser.py, RSEM_gene_parser.py, RSEM_isoform_parser.py).
#
# Usage: calculate_counts.sh [data_dir]
#   data_dir  directory handed to each parser as its only argument; defaults
#             to the original project location below.
#
# Exit status: 0 if every parser succeeded, 1 if any of them failed.

data_dir="${1:-/Volumes/TOSHIBA-EXT/UCSD-CCBB/2023/20230207_Agarwal_Lewis_Human_Bulk_RNA-Seq_Differential_Expression/primary_analysis}"

# Resolve the parsers relative to this script rather than through a
# machine-specific absolute path, so the script works from any checkout.
script_dir="$(cd "$(dirname "$0")" && pwd)"

# Keep going when one parser fails (each writes an independent table), but
# remember the failure so the caller still sees a non-zero exit status.
status=0
for parser in RSEM_count_parser.py RSEM_gene_parser.py RSEM_isoform_parser.py; do
    python "$script_dir/$parser" "$data_dir" || status=1
done

exit "$status"
hour 5 | VK2_NanH2_PolyB25_2h_4 K004 Human Female NanH2 treated 2 hour 6 | VK2_NanH2_PolyB25_1h_5 K005 Human Female NanH2 treated 1 hour 7 | VK2_NanH2_PolyB25_1h_6 K006 Human Female NanH2 treated 1 hour 8 | VK2_NanH2_PolyB25_1h_7 K007 Human Female NanH2 treated 1 hour 9 | VK2_NanH2_PolyB25_1h_8 K008 Human Female NanH2 treated 1 hour 10 | VK2_Pet28a_PolyB25_2h_9 K009 Human Female Vector treated 2 hour 11 | VK2_Pet28a_PolyB25_2h_10 K010 Human Female Vector treated 2 hour 12 | VK2_Pet28a_PolyB25_2h_11 K011 Human Female Vector treated 2 hour 13 | VK2_Pet28a_PolyB25_2h_12 K012 Human Female Vector treated 2 hour -------------------------------------------------------------------------------- /secondary_analysis_scripts/src/ChainedNotebookSupport.R: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------------------------- 2 | # Copyright (c) 2018 UC San Diego Center for Computational Biology & Bioinformatics 3 | # 4 | # Distributed under the terms of the MIT License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 
# Build a run name of the form <gProjectName>_<gStepName>_<YYYYmmddHHMMSS>,
# where the timestamp is the current local time.
#
# gProjectName: project identifier placed first in the name
# gStepName:    analysis step identifier placed second in the name
#
# Returns a single character string.
makeRunName = function(gProjectName, gStepName){
    # format() produces the 14-digit stamp directly; the previous
    # strptime() -> character -> gsub() round trip depended on how a
    # date-time happens to be printed.
    timestamp = format(Sys.time(), "%Y%m%d%H%M%S")
    return(paste0(gProjectName, "_", gStepName, "_", timestamp))
}
__author__ = 'Guorong Xu'

import re
import os
import sys

# A merged row is built from the first seven tab-separated fields of each
# input row: the two identifier fields (written once, from the first sample)
# followed by the five per-sample value fields named in the output header.
N_FIELDS = 7
N_ID_FIELDS = 2


def read_data_file(input_file):
    """Return the lines of ``input_file`` that do not start with "gene_id".

    Lines are returned unmodified (trailing newline included), in file order.
    """
    with open(input_file) as f:
        return [line for line in f if not line.startswith("gene_id")]


def _find_samples(workspace):
    """Map each ``*.genes.results`` file name under ``workspace`` to its rows.

    The tree is walked in sorted order so the column order of the merged
    table is reproducible instead of depending on the filesystem.
    """
    sample_list = {}
    for dirpath, directories, filenames in os.walk(workspace):
        directories.sort()  # in-place sort steers os.walk's descent order
        for filename in sorted(filenames):
            if filename.endswith(".genes.results"):  # output of calculate expression
                sample_list[filename] = read_data_file(os.path.join(dirpath, filename))
    return sample_list


def _split_row(line, sample):
    """Split one input row into its first seven fields, last one right-stripped."""
    fields = re.split(r'\t+', line)
    if len(fields) < N_FIELDS:
        raise ValueError("ERROR! Row in \"%s\" has fewer than %d fields: %r"
                         % (sample, N_FIELDS, line))
    fields = fields[:N_FIELDS]
    fields[-1] = fields[-1].rstrip()
    return fields


def _write_table(sample_list, output_file):
    """Write the merged table and return the number of data rows written.

    Rows are paired across samples by position, so every sample must
    contribute the same number of rows; a mismatch raises ``ValueError``
    rather than failing midway or silently truncating the table.
    """
    row_counts = {sample: len(rows) for sample, rows in sample_list.items()}
    if len(set(row_counts.values())) > 1:
        raise ValueError("ERROR! \"genes.results\" files differ in row count: %s"
                         % row_counts)
    n_rows = next(iter(row_counts.values()))

    with open(output_file, "w") as filewriter:
        filewriter.write("gene_id\ttranscript_id(s)")
        for sample in sample_list:
            filewriter.write("\t" + sample + "_length\t" + sample + "_effective_length\t"
                             + sample + "_expected_count\t" + sample + "_TPM\t"
                             + sample + "_FPKM")
        filewriter.write("\n")

        for line_num in range(n_rows):
            merged = []
            for index, (sample, rows) in enumerate(sample_list.items()):
                fields = _split_row(rows[line_num], sample)
                # identifier fields are emitted once, taken from the first sample
                merged.extend(fields if index == 0 else fields[N_ID_FIELDS:])
            filewriter.write("\t".join(merged) + "\n")
    return n_rows


def main(workspace):
    """Merge every ``*.genes.results`` file under ``workspace``.

    Writes ``all_genes_results.txt`` into ``workspace`` and returns the
    process exit code: 0 when the written table has as many data rows as the
    inputs, 1 otherwise.

    Raises:
        FileNotFoundError: no ``*.genes.results`` file exists under
            ``workspace`` (checked before the output file is created).
        ValueError: the inputs differ in row count or a row is too short.
    """
    sample_list = _find_samples(workspace)
    # no files are found
    if len(sample_list) == 0:
        raise FileNotFoundError("ERROR! No \"genes.results\" files are found.")

    output_file = os.path.join(workspace, "all_genes_results.txt")
    n_rows = _write_table(sample_list, output_file)

    # re-read the table as a sanity check that every row reached the disk
    return 0 if len(read_data_file(output_file)) == n_rows else 1


if __name__ == "__main__":
    sys.exit(main(sys.argv[1]))
expression 23 | input_file = os.path.join(dirpath, filename) 24 | expression_list = read_data_file(input_file) 25 | sample_list.update({filename:expression_list}) 26 | # no files are found 27 | if len(sample_list) == 0: 28 | raise FileNotFoundError("ERROR! No \"isoforms.results\" files are found.") 29 | 30 | filewriter.write("transcript_id\tgene_id") 31 | for sample in sample_list: 32 | filewriter.write("\t" + sample + "_length\t" + sample + "_effective_length\t" + sample + "_expected_count\t" + sample + "_TPM\t" + sample + "_FPKM\t" + sample + "_IsoPct") 33 | filewriter.write("\n") 34 | 35 | for line_num in range(0, len(expression_list)): 36 | for index, sample in enumerate(sample_list): 37 | expression_list = sample_list.get(sample) 38 | expression_values = expression_list[line_num] 39 | fields = re.split(r'\t+', expression_values) 40 | if index == 0: 41 | filewriter.write(fields[0] + "\t" + fields[1] + "\t" + fields[2] + "\t" + fields[3] + "\t" + fields[4] + "\t" + fields[5] + "\t" + fields[6] + "\t" + fields[7].rstrip()) 42 | else: 43 | filewriter.write("\t" + fields[2] + "\t" + fields[3] + "\t" + fields[4] + "\t" + fields[5] + "\t" + fields[6] + "\t" + fields[7].rstrip()) 44 | 45 | filewriter.write("\n") 46 | 47 | filewriter.close() 48 | 49 | all_genes_expression_list = read_data_file(workspace + "/all_isoforms_results.txt") 50 | 51 | if len(expression_list) == len(all_genes_expression_list): 52 | exit(0) 53 | else: 54 | exit(1) -------------------------------------------------------------------------------- /primary_analysis_scripts/run_human_PE_aws.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | yes | sudo yum install perl-Env 4 | 5 | filename=$1 6 | sample_num=$2 7 | workspace=/scratch/workspace/$filename 8 | 9 | STAR=/shared/software/STAR/2.5.1a/bin/Linux_x86_64 10 | 11 | star_ref=/shared/software/STAR_index/Hsapiens_h38p13_v38/Hsapiens_h38p13_v38 #gencode v38 12 | 
fastqc=/shared/software/FastQC/fastqc #v0.11.8 13 | trimmomatic=/shared/software/Trimmomatic-0.38/trimmomatic-0.38.jar 14 | rsem=/shared/software/RSEM-1.3.0/rsem-calculate-expression 15 | aws_addr=s3://ccbb-data-upload/2023/20230207_Agarwal_Lewis_Human_Bulk_RNA-Seq_Differential_Expression 16 | 17 | mkdir -p $workspace 18 | 19 | cd $workspace 20 | echo $PWD 21 | 22 | ## Download data ## 23 | aws s3 cp $aws_addr"/fastq/"$filename/$filename"_"$sample_num"_L004_R1_001.fastq.gz" $workspace/$filename"_R1.fastq.gz" 24 | aws s3 cp $aws_addr"/fastq/"$filename/$filename"_"$sample_num"_L004_R2_001.fastq.gz" $workspace/$filename"_R2.fastq.gz" 25 | 26 | export _JAVA_OPTIONS=-Djavax.accessibility.assistive_technologies= 27 | $fastqc $workspace/$filename"_R1.fastq.gz" -o $workspace/ 28 | $fastqc $workspace/$filename"_R2.fastq.gz" -o $workspace/ 29 | 30 | ## Trim ## 31 | java -jar $trimmomatic PE -threads 5 -phred33 -trimlog $workspace/trimlog.log $workspace/$filename"_R1.fastq.gz" $workspace/$filename"_R2.fastq.gz" $workspace/$filename"_R1.trim.fastq.gz" $workspace/$filename"_R1.unpaired.fastq.gz" $workspace/$filename"_R2.trim.fastq.gz" $workspace/$filename"_R2.unpaired.fastq.gz" LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:27 ILLUMINACLIP:/shared/software/Trimmomatic-0.38/adapters/NexteraPE-PE.fa:2:30:10 32 | 33 | ## Check for adapters ## 34 | $fastqc $workspace/$filename"_R1.trim.fastq.gz" -o $workspace/ 35 | $fastqc $workspace/$filename"_R2.trim.fastq.gz" -o $workspace/ 36 | 37 | ## Default rsem STAR wrapper ## 38 | $rsem --paired-end --star --star-path $STAR --star-gzipped-read-file -p 8 $workspace/$filename"_R1.trim.fastq.gz" $workspace/$filename"_R2.trim.fastq.gz" $star_ref $workspace/$filename 39 | 40 | rm $workspace/$filename"_R1.fastq.gz" 41 | rm $workspace/$filename"_R2.fastq.gz" 42 | rm $workspace/$filename"_R1.trim.fastq.gz" 43 | rm $workspace/$filename"_R2.trim.fastq.gz" 44 | rm $workspace/$filename"_R1.unpaired.fastq.gz" 45 | rm 
__author__ = 'Guorong Xu'

import re
import os
import sys

# Keys under which the first three lines of each input file are stored.
ALIGNMENT_KEYS = ("alignment_statistics", "alignment_certainty", "alignment_Hits")
# Rows "0" .. str(HIT_BINS - 1) are always written, followed by the "Inf" row.
HIT_BINS = 100


def read_data_file(input_file):
    """Parse one ``.cnt`` file into a dict of strings.

    The first three lines are stored whole (right-stripped) under
    ``ALIGNMENT_KEYS``. Every later line is split on tabs and stored as
    ``{first_field: second_field}``; lines without a second field (for
    example a trailing blank line) are skipped.

    NOTE(review): presumably these are the ``<sample>.cnt`` files RSEM writes
    into its ``.stat`` folder; confirm against the RSEM documentation.
    """
    expression_list = {}
    with open(input_file) as f:
        for line_num, line in enumerate(f):
            if line_num < len(ALIGNMENT_KEYS):
                expression_list[ALIGNMENT_KEYS[line_num]] = line.rstrip()
                continue
            # split the string based on tab
            fields = re.split(r'\t+', line)
            if len(fields) < 2:
                continue
            expression_list[fields[0]] = fields[1].rstrip()

    return expression_list


def _find_samples(workspace):
    """Map each ``*.cnt`` file name under ``workspace`` to its parsed content.

    The tree is walked in sorted order so the column order of the merged
    table is reproducible instead of depending on the filesystem.
    """
    sample_list = {}
    for dirpath, directories, filenames in os.walk(workspace):
        directories.sort()  # in-place sort steers os.walk's descent order
        for filename in sorted(filenames):
            if filename.endswith(".cnt"):  # output of calculate expression
                sample_list[filename] = read_data_file(os.path.join(dirpath, filename))
    return sample_list


def _write_table(sample_list, output_file):
    """Write one tab-separated row per item, one column per sample.

    Every row has exactly ``1 + len(sample_list)`` fields. Missing bins and a
    missing "Inf" entry are written as "0"; a sample lacking one of the three
    leading lines raises ``ValueError``.
    """
    with open(output_file, "w") as filewriter:
        # sample is filename.cnt
        filewriter.write("\t".join(["item"] + [sample + "_counts" for sample in sample_list]) + "\n")

        for header in ALIGNMENT_KEYS:
            row = [header]
            for sample, values in sample_list.items():
                if header not in values:
                    raise ValueError("ERROR! \"%s\" has no %s line." % (sample, header))
                row.append(values[header])
            filewriter.write("\t".join(row) + "\n")

        for item in [str(line_num) for line_num in range(HIT_BINS)] + ["Inf"]:
            row = [item] + [values.get(item, "0") for values in sample_list.values()]
            filewriter.write("\t".join(row) + "\n")


def main(workspace):
    """Merge every ``*.cnt`` file under ``workspace`` into ``all_counts_results.txt``.

    Raises:
        FileNotFoundError: no ``*.cnt`` file exists under ``workspace``
            (checked before the output file is created).
        ValueError: an input file has fewer than three lines.
    """
    sample_list = _find_samples(workspace)
    # no files are found
    if len(sample_list) == 0:
        raise FileNotFoundError("ERROR! No \"cnt\" files are found.")

    _write_table(sample_list, os.path.join(workspace, "all_counts_results.txt"))


if __name__ == "__main__":
    main(sys.argv[1])
# Reorder the sample columns of a counts table to match the design table.
#
# counts_df:     data frame or matrix with one column per sample
# designDf:      design table with one row per sample
# sampleColName: name of the designDf column that holds the sample names
#
# Returns counts_df restricted to, and ordered by, the samples listed in
# designDf. Stops if designDf names a sample that has no counts column.
syncCountSampleOrderToDesignDf<-function(counts_df, designDf,
                                         sampleColName="sample_name"){

    # Coerce to character so that a factor column selects columns by name
    # rather than by its integer level codes.
    sampleNamesInOrder = as.character(designDf[[sampleColName]])

    # check for samples in the design file that aren't in the counts file
    missingSamples = setdiff(sampleNamesInOrder, colnames(counts_df))
    if (length(missingSamples)>0){
        print(missingSamples)
        stop("Above samples are in design file but missing from counts file")
    }

    # ensure that the order of the samples in the counts table is
    # the same as the order of the samples in the design table;
    # explicit column indexing also works for matrices, and drop=FALSE
    # keeps a single-sample result two-dimensional
    reordered_counts_df = counts_df[, sampleNamesInOrder, drop=FALSE]
    return(reordered_counts_df)
}
# Run and print a PCA of counts-per-million values derived from raw counts.
#
# countsDf:                counts table with one column per sample
# designDf:                design table with one row per sample
# pointShapeColName:       designDf column used for point shapes
# designSampleNameColName: designDf column holding the sample names
# libSizeColName:          designDf column used for point colors
# designColNameForLabels:  designDf column used for point labels (or NULL)
# labelOnlyOutliers:       passed through to make2dPcaPlot
#
# Returns list(rawcounts, design, cpms, cpmsPca); design is the (possibly
# expanded) design table actually used for plotting.
makeAndPrintCpmsPca<-function(countsDf, designDf,
                              pointShapeColName,
                              designSampleNameColName="sample_name",
                              libSizeColName = "lib.size",
                              designColNameForLabels=NULL, labelOnlyOutliers=TRUE){

    # merge in the DGEList per-sample table only when the design table
    # does not already carry the requested library-size column
    if (!libSizeColName %in% colnames(designDf)){
        designDf = expandDesignDf(countsDf, designDf,
                                  designSampleNameColName)
    }

    cpmsDf = getCpmsDf(countsDf)

    # NB: the former call makeAndPrintPca(normTitle) was removed here: it
    # supplied only the title, so it failed on the missing arguments, and
    # its result was never used.
    normTitle = "PCA of Normalized Counts"
    display_markdown(normTitle)
    cpmsPca = doPcaFromSamplesAsColsDf(cpmsDf, designDf,
                                       designSampleNameColName)
    normPlot = make2dPcaPlot(cpmsPca, designDf, pointShapeColName,
                             libSizeColName, designColNameForLabels,
                             labelOnlyOutliers)
    print(normPlot + ggtitle(normTitle))

    return(list(rawcounts=countsDf, design=designDf, cpms=cpmsDf,
                cpmsPca=cpmsPca))
}
# Convert a table of raw counts to counts per million.
#
# counts_df: counts table with one column per sample, as accepted by
#            edgeR's DGEList(counts=...)
#
# Returns a data frame holding the cpm() values, with the column names of
# the cpm matrix left exactly as they are.
getCpmsDf<-function(counts_df){
    y <- DGEList(counts=counts_df)
    cpm_matrix = cpm(y)
    # check.names=FALSE stops data.frame() from rewriting sample names that
    # are not syntactic R names (e.g. containing "-" or starting with a
    # digit), which would break later matching against the design table
    cpmDf = data.frame(cpm_matrix, check.names=FALSE)
    return(cpmDf)
}
shrink_viewport=FALSE){ 39 | # NB that this method does NOT set the canvas back to the 40 | # default size after being called--when I try to do that, 41 | # the reset happens before the plot is rendered, thus 42 | # nullifying my attempts to resize the canvas to fit the 43 | # image (even if I try using Sys.sleep, etc). 44 | # Until I can spend more time exploring how to prevent that, 45 | # it is necessary to call resetPlotSize() after any run 46 | # of this method. 47 | 48 | viewport_val = NULL 49 | if (shrink_viewport==TRUE) { 50 | viewport_val = viewport(width=unit(0.8, "npc")) 51 | } 52 | 53 | startingWidth = getOption("repr.plot.width") 54 | startingHeight = getOption("repr.plot.height") 55 | 56 | # resize image to max width, appropriate height 57 | # to remove excessive whitespace in default square 58 | # image canvas if real image is not square 59 | aspectRatio = findPlotAspectRatio(pcaPlot) 60 | plotHeight = startingWidth/aspectRatio 61 | options(repr.plot.width=startingWidth, repr.plot.height=plotHeight) 62 | 63 | suppressWarnings(print(pcaPlot, vp=viewport_val)) 64 | } 65 | 66 | make2dPcaPlot<-function(pca_result, design_df=NULL, 67 | design_col_name_for_shapes=NULL, design_col_name_for_colors=NULL, 68 | design_col_name_for_labels=NULL, label_only_outliers=TRUE, 69 | add_hotelling_ellipse=TRUE){ 70 | 71 | scores = data.frame(pca_result$x[,1:2]) 72 | 73 | if (add_hotelling_ellipse){ 74 | hotelling_ellipse = data.frame(getHotellingT2Ellipse( 75 | pca_result$x[,1], pca_result$x[,2])) 76 | colnames(hotelling_ellipse) = c("PC1", "PC2") 77 | } 78 | 79 | shape_values = rep("",nrow(pca_result$x)) 80 | color_values = rep("",nrow(pca_result$x)) 81 | label_values = rep("",nrow(pca_result$x)) 82 | if (!is.null(design_col_name_for_shapes)){ 83 | design_df[[design_col_name_for_shapes]] = factor( 84 | design_df[[design_col_name_for_shapes]]) 85 | shape_values = design_df[[design_col_name_for_shapes]] 86 | scores = cbind(scores, shape_values) 87 | } 88 | if 
# Compute the width/height ratio of the data ranges actually drawn in the
# first panel of a ggplot.
#
# aGgplot: a ggplot object
#
# Returns (x range extent) / (y range extent) as a single number. Stops if
# the built plot exposes the panel ranges in none of the known locations.
findPlotAspectRatio<-function(aGgplot){
    # get the x- and y-axis ranges actually used in the graph
    builtPlot = ggplot_build(aGgplot)

    # The ranges have moved between ggplot2 releases; try each location in
    # turn (indexing NULL yields NULL, so absent locations fall through).
    # pre-ggplot2 version 2.2
    panelInfo = builtPlot$panel$ranges[[1]]
    # ggplot2 version 2.2.x
    if (is.null(panelInfo$y.range)){
        panelInfo = builtPlot$layout$panel_ranges[[1]]
    }
    # ggplot2 version 3.0 and later
    if (is.null(panelInfo$y.range)){
        panelInfo = builtPlot$layout$panel_params[[1]]
    }

    xRange = panelInfo$x.range
    yRange = panelInfo$y.range
    if (is.null(xRange) || is.null(yRange)){
        # fail loudly instead of returning NaN from max(NULL) - min(NULL)
        stop("Unable to find panel x/y ranges in the built ggplot object")
    }

    aspectRatio <- (max(xRange)-min(xRange))/(max(yRange)-min(yRange))
    return(aspectRatio)
}
# Run a scaled PCA (prcomp with scale. = TRUE) on a samples-as-rows table.
#
# data_df: numeric data frame or matrix with one row per sample and one
#          column per variable
#
# Returns the prcomp result computed on the variable columns only.
doScaledPcaOnSamplesAsRowsDf<-function(data_df){
    # remove any columns that are constant in order to allow scaling;
    # columns whose variance cannot be computed (NA) are removed as well
    # because an NA in the logical index would otherwise break subsetting
    columnVariances = apply(data_df, 2, var, na.rm=TRUE)
    isVariable = !is.na(columnVariances) & columnVariances != 0

    # drop=FALSE keeps the table (and its column name) two-dimensional even
    # when a single variable column survives
    variable_df = data_df[, isVariable, drop=FALSE]

    pca_result = prcomp(variable_df, scale. = TRUE)
    # return explicitly: the value of a trailing assignment is invisible
    return(pca_result)
}
# Widen the x axis of a plot and print it via printPlotInViewport.
#
# aPlot:               a ggplot object
# additiveExpandValue: second element of the expand vector given to
#                      scale_x_continuous (the first is fixed at 0.05)
# shrink_viewport:     passed through to printPlotInViewport
expandPlot<-function(aPlot, additiveExpandValue=25,
                     shrink_viewport=FALSE){
    aPlot = aPlot + scale_x_continuous(
        expand =(c(0.05,additiveExpandValue)))
    # honor the caller's choice; this was previously hard-coded to FALSE,
    # so the shrink_viewport argument had no effect
    printPlotInViewport(aPlot, shrink_viewport=shrink_viewport)
}
Status](#Gene-Separation-By-Coding-Status)\n", 41 | "* [Data Integration](#Data-Integration)\n", 42 | "* [Annotation Integration](#Annotation-Integration)\n", 43 | "* [Summary](#Summary)\n", 44 | "* [Citations](#Citations)\n", 45 | "* [Appendix: R Session Info](#Appendix:-R-Session-Info)\n", 46 | "\n", 47 | "\n", 48 | "## Background" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "The count data analyzed in this notebook were produced by the upstream analysis of Daisy Chilin-Fuentes of CCBB, who received raw sequencing data and performed quality control, trimming, alignment, and quantification of reads.\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "[Table of Contents](#Table-of-Contents)\n", 63 | "\n", 64 | "## Introduction\n", 65 | "\n", 66 | "This notebook takes in per-gene-per-sample count data (prepared either externally or by the \"RNASeq_RSEM_QC_and_Counts_Preparation\" notebook) and per-sample metadata, and uses the edgeR ([2](#Citations)) Bioconductor ([3](#Citations)) package written in R ([4](#Citations)) to integrate and annotate these inputs in preparation for data exploration and preprocessing." 
67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "[Table of Contents](#Table-of-Contents)\n", 74 | "\n", 75 | "## Parameter Input" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 1, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "gProjectName = \"Agarwal-Lewis_bulkRNAseq\"\n", 85 | "gGeneCountsFilename = \"counts.txt\"\n", 86 | "gMetadataFilename = \"KavitaSamplesMetadata.tsv\"\n", 87 | "\n", 88 | "\n", 89 | "gAnnotationsRdataFilename = \"Homo_sapiens_GRCh38p13_gencodev38_ANNOT.Rdata\"" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 3, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stderr", 99 | "output_type": "stream", 100 | "text": [ 101 | "Warning message in dir.create(paste0(gOutputDir, \"/data_integration\")):\n", 102 | "“'../secondary_analysis_results/data_integration' already exists”\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "gSourceDir = \"./src/\" # note trailing slash here but not below\n", 108 | "gOutputDir = \"../secondary_analysis_results\"\n", 109 | "gReferenceDir = \"../reference\"\n", 110 | "gInterimDir = \"../interim\"\n", 111 | "gGeneCountsFp = file.path(gOutputDir, \"RSEM_QC_and_Counts_Preparation\", gGeneCountsFilename)\n", 112 | "gMetadataFp = file.path(gReferenceDir, gMetadataFilename)\n", 113 | "\n", 114 | "gOutputDir_nb1 <- paste0(gOutputDir, \"/data_integration\")\n", 115 | "dir.create(paste0(gOutputDir, \"/data_integration\")) \n" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 4, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# Import shared source code to load and save previous notebooks' environments:\n", 125 | "source(paste0(gSourceDir, \"ChainedNotebookSupport.R\"))" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "Populate the run name parameter automatically to ensure that outputs from different 
runs do not overwrite each other:" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 5, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/html": [ 143 | "'20230302161642'" 144 | ], 145 | "text/latex": [ 146 | "'20230302161642'" 147 | ], 148 | "text/markdown": [ 149 | "'20230302161642'" 150 | ], 151 | "text/plain": [ 152 | "[1] \"20230302161642\"" 153 | ] 154 | }, 155 | "metadata": {}, 156 | "output_type": "display_data" 157 | } 158 | ], 159 | "source": [ 160 | "gRunName = format(Sys.time(), \"%Y%m%d%H%M%S\")\n", 161 | "gRunName" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 48, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/html": [ 172 | "'../secondary_analysis_results/data_integration/20230302161642/20230302161642'" 173 | ], 174 | "text/latex": [ 175 | "'../secondary\\_analysis\\_results/data\\_integration/20230302161642/20230302161642'" 176 | ], 177 | "text/markdown": [ 178 | "'../secondary_analysis_results/data_integration/20230302161642/20230302161642'" 179 | ], 180 | "text/plain": [ 181 | "[1] \"../secondary_analysis_results/data_integration/20230302161642/20230302161642\"" 182 | ] 183 | }, 184 | "metadata": {}, 185 | "output_type": "display_data" 186 | } 187 | ], 188 | "source": [ 189 | "# Create a timestamped subdirectory per run; the path is rebuilt from gOutputDir so re-running this cell does not nest run names\n", 190 | "gOutputDir_nb1 <- file.path(gOutputDir, \"data_integration\", gRunName)\n", 191 | "dir.create(gOutputDir_nb1) \n", 192 | "gOutputDir_nb1" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "[Table of Contents](#Table-of-Contents)\n", 207 | "\n", 208 | "## Library Import\n", 209 | "\n", 210 | "Import the necessary R, Bioconductor, and CCBB libraries for the analysis:" 211 | ] 212 | }, 213 | { 214 | 
"cell_type": "code", 215 | "execution_count": 7, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "#if (!requireNamespace(\"BiocManager\", quietly = TRUE))\n", 220 | "# install.packages(\"BiocManager\")" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 8, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "#BiocManager::install(\"edgeR\", version = \"3.8\")" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 9, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "#BiocManager::install(\"Homo.sapiens\", version = \"3.8\")" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 49, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "library(Homo.sapiens)\n", 248 | "gOrganismPackage = Homo.sapiens" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 50, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "library(edgeR)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "[Table of Contents](#Table-of-Contents)\n", 265 | "\n", 266 | "\n", 267 | "## Data Import\n", 268 | "\n", 269 | "### Count Data\n", 270 | "\n", 271 | "Import the count data file in which rows are gene identifiers, columns are sample identifiers, and row/column intersections contain the number of counts for the relevant gene in the relevant sample:" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 51, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/html": [ 282 | "\n", 287 | "
  1. 60605
  2. 12
\n" 288 | ], 289 | "text/latex": [ 290 | "\\begin{enumerate*}\n", 291 | "\\item 60605\n", 292 | "\\item 12\n", 293 | "\\end{enumerate*}\n" 294 | ], 295 | "text/markdown": [ 296 | "1. 60605\n", 297 | "2. 12\n", 298 | "\n", 299 | "\n" 300 | ], 301 | "text/plain": [ 302 | "[1] 60605 12" 303 | ] 304 | }, 305 | "metadata": {}, 306 | "output_type": "display_data" 307 | } 308 | ], 309 | "source": [ 310 | "# Read in counts file containing info on all samples and genes\n", 311 | "gUnorderedGeneCountsDf <- read.csv(gGeneCountsFp, sep=\"\\t\", stringsAsFactors=FALSE, row.names=1)\n", 312 | "dim(gUnorderedGeneCountsDf)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 52, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/html": [ 323 | "\n", 324 | "\n", 325 | "\n", 326 | "\t\n", 327 | "\t\n", 328 | "\n", 329 | "\n", 330 | "\t\n", 331 | "\t\n", 332 | "\t\n", 333 | "\t\n", 334 | "\t\n", 335 | "\t\n", 336 | "\n", 337 | "
A data.frame: 6 × 12
VK2_NanH2_PolyB25_1h_5VK2_NanH2_PolyB25_1h_6VK2_NanH2_PolyB25_1h_7VK2_NanH2_PolyB25_1h_8VK2_NanH2_PolyB25_2h_1VK2_NanH2_PolyB25_2h_2VK2_NanH2_PolyB25_2h_3VK2_NanH2_PolyB25_2h_4VK2_Pet28a_PolyB25_2h_10VK2_Pet28a_PolyB25_2h_11VK2_Pet28a_PolyB25_2h_12VK2_Pet28a_PolyB25_2h_9
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
ENSG00000000003.15 502.00 416.00 424.00 497.00 356.00 542.00 364.00 299.00 387.00 465.00 366.00 323.00
ENSG00000000005.6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
ENSG00000000419.142340.002232.042253.002573.362080.113080.002206.801721.002084.002412.422170.071904.05
ENSG00000000457.14 153.65 94.84 112.33 130.23 113.88 119.35 85.84 102.65 98.34 120.36 91.56 119.77
ENSG00000000460.17 254.35 200.16 242.67 292.77 181.12 285.65 181.16 209.35 189.66 190.64 204.44 234.23
ENSG00000000938.13 10.00 6.00 6.00 9.00 4.00 21.00 10.00 18.00 13.00 17.00 13.00 10.00
\n" 338 | ], 339 | "text/latex": [ 340 | "A data.frame: 6 × 12\n", 341 | "\\begin{tabular}{r|llllllllllll}\n", 342 | " & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & VK2\\_NanH2\\_PolyB25\\_1h\\_7 & VK2\\_NanH2\\_PolyB25\\_1h\\_8 & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & VK2\\_Pet28a\\_PolyB25\\_2h\\_10 & VK2\\_Pet28a\\_PolyB25\\_2h\\_11 & VK2\\_Pet28a\\_PolyB25\\_2h\\_12 & VK2\\_Pet28a\\_PolyB25\\_2h\\_9\\\\\n", 343 | " & & & & & & & & & & & & \\\\\n", 344 | "\\hline\n", 345 | "\tENSG00000000003.15 & 502.00 & 416.00 & 424.00 & 497.00 & 356.00 & 542.00 & 364.00 & 299.00 & 387.00 & 465.00 & 366.00 & 323.00\\\\\n", 346 | "\tENSG00000000005.6 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00\\\\\n", 347 | "\tENSG00000000419.14 & 2340.00 & 2232.04 & 2253.00 & 2573.36 & 2080.11 & 3080.00 & 2206.80 & 1721.00 & 2084.00 & 2412.42 & 2170.07 & 1904.05\\\\\n", 348 | "\tENSG00000000457.14 & 153.65 & 94.84 & 112.33 & 130.23 & 113.88 & 119.35 & 85.84 & 102.65 & 98.34 & 120.36 & 91.56 & 119.77\\\\\n", 349 | "\tENSG00000000460.17 & 254.35 & 200.16 & 242.67 & 292.77 & 181.12 & 285.65 & 181.16 & 209.35 & 189.66 & 190.64 & 204.44 & 234.23\\\\\n", 350 | "\tENSG00000000938.13 & 10.00 & 6.00 & 6.00 & 9.00 & 4.00 & 21.00 & 10.00 & 18.00 & 13.00 & 17.00 & 13.00 & 10.00\\\\\n", 351 | "\\end{tabular}\n" 352 | ], 353 | "text/markdown": [ 354 | "\n", 355 | "A data.frame: 6 × 12\n", 356 | "\n", 357 | "| | VK2_NanH2_PolyB25_1h_5 <dbl> | VK2_NanH2_PolyB25_1h_6 <dbl> | VK2_NanH2_PolyB25_1h_7 <dbl> | VK2_NanH2_PolyB25_1h_8 <dbl> | VK2_NanH2_PolyB25_2h_1 <dbl> | VK2_NanH2_PolyB25_2h_2 <dbl> | VK2_NanH2_PolyB25_2h_3 <dbl> | VK2_NanH2_PolyB25_2h_4 <dbl> | VK2_Pet28a_PolyB25_2h_10 <dbl> | VK2_Pet28a_PolyB25_2h_11 <dbl> | VK2_Pet28a_PolyB25_2h_12 <dbl> | VK2_Pet28a_PolyB25_2h_9 <dbl> |\n", 358 | "|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", 359 
| "| ENSG00000000003.15 | 502.00 | 416.00 | 424.00 | 497.00 | 356.00 | 542.00 | 364.00 | 299.00 | 387.00 | 465.00 | 366.00 | 323.00 |\n", 360 | "| ENSG00000000005.6 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |\n", 361 | "| ENSG00000000419.14 | 2340.00 | 2232.04 | 2253.00 | 2573.36 | 2080.11 | 3080.00 | 2206.80 | 1721.00 | 2084.00 | 2412.42 | 2170.07 | 1904.05 |\n", 362 | "| ENSG00000000457.14 | 153.65 | 94.84 | 112.33 | 130.23 | 113.88 | 119.35 | 85.84 | 102.65 | 98.34 | 120.36 | 91.56 | 119.77 |\n", 363 | "| ENSG00000000460.17 | 254.35 | 200.16 | 242.67 | 292.77 | 181.12 | 285.65 | 181.16 | 209.35 | 189.66 | 190.64 | 204.44 | 234.23 |\n", 364 | "| ENSG00000000938.13 | 10.00 | 6.00 | 6.00 | 9.00 | 4.00 | 21.00 | 10.00 | 18.00 | 13.00 | 17.00 | 13.00 | 10.00 |\n", 365 | "\n" 366 | ], 367 | "text/plain": [ 368 | " VK2_NanH2_PolyB25_1h_5 VK2_NanH2_PolyB25_1h_6\n", 369 | "ENSG00000000003.15 502.00 416.00 \n", 370 | "ENSG00000000005.6 0.00 0.00 \n", 371 | "ENSG00000000419.14 2340.00 2232.04 \n", 372 | "ENSG00000000457.14 153.65 94.84 \n", 373 | "ENSG00000000460.17 254.35 200.16 \n", 374 | "ENSG00000000938.13 10.00 6.00 \n", 375 | " VK2_NanH2_PolyB25_1h_7 VK2_NanH2_PolyB25_1h_8\n", 376 | "ENSG00000000003.15 424.00 497.00 \n", 377 | "ENSG00000000005.6 0.00 0.00 \n", 378 | "ENSG00000000419.14 2253.00 2573.36 \n", 379 | "ENSG00000000457.14 112.33 130.23 \n", 380 | "ENSG00000000460.17 242.67 292.77 \n", 381 | "ENSG00000000938.13 6.00 9.00 \n", 382 | " VK2_NanH2_PolyB25_2h_1 VK2_NanH2_PolyB25_2h_2\n", 383 | "ENSG00000000003.15 356.00 542.00 \n", 384 | "ENSG00000000005.6 0.00 0.00 \n", 385 | "ENSG00000000419.14 2080.11 3080.00 \n", 386 | "ENSG00000000457.14 113.88 119.35 \n", 387 | "ENSG00000000460.17 181.12 285.65 \n", 388 | "ENSG00000000938.13 4.00 21.00 \n", 389 | " VK2_NanH2_PolyB25_2h_3 VK2_NanH2_PolyB25_2h_4\n", 390 | "ENSG00000000003.15 364.00 299.00 \n", 391 | "ENSG00000000005.6 0.00 0.00 \n", 392 | "ENSG00000000419.14 2206.80 
1721.00 \n", 393 | "ENSG00000000457.14 85.84 102.65 \n", 394 | "ENSG00000000460.17 181.16 209.35 \n", 395 | "ENSG00000000938.13 10.00 18.00 \n", 396 | " VK2_Pet28a_PolyB25_2h_10 VK2_Pet28a_PolyB25_2h_11\n", 397 | "ENSG00000000003.15 387.00 465.00 \n", 398 | "ENSG00000000005.6 0.00 0.00 \n", 399 | "ENSG00000000419.14 2084.00 2412.42 \n", 400 | "ENSG00000000457.14 98.34 120.36 \n", 401 | "ENSG00000000460.17 189.66 190.64 \n", 402 | "ENSG00000000938.13 13.00 17.00 \n", 403 | " VK2_Pet28a_PolyB25_2h_12 VK2_Pet28a_PolyB25_2h_9\n", 404 | "ENSG00000000003.15 366.00 323.00 \n", 405 | "ENSG00000000005.6 0.00 0.00 \n", 406 | "ENSG00000000419.14 2170.07 1904.05 \n", 407 | "ENSG00000000457.14 91.56 119.77 \n", 408 | "ENSG00000000460.17 204.44 234.23 \n", 409 | "ENSG00000000938.13 13.00 10.00 " 410 | ] 411 | }, 412 | "metadata": {}, 413 | "output_type": "display_data" 414 | } 415 | ], 416 | "source": [ 417 | "head(gUnorderedGeneCountsDf)" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 53, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "detectParRecords = function(geneCountsDf){\n", 427 | " gene_names <- rownames(geneCountsDf)\n", 428 | " PAR_genes <- gene_names[grep(\"_PAR_\", gene_names)] \n", 429 | " if (length(PAR_genes) == 0){\n", 430 | " print(\"No PAR genes detected; analysis can proceed.\")\n", 431 | " } else {\n", 432 | " print(\"ERROR: PAR genes found. 
These must be removed before continuing analysis.\")\n", 433 | " }\n", 434 | " return(PAR_genes)\n", 435 | "}" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 54, 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "name": "stdout", 445 | "output_type": "stream", 446 | "text": [ 447 | "[1] \"No PAR genes detected; analysis can proceed.\"\n" 448 | ] 449 | }, 450 | { 451 | "data": { 452 | "text/html": [], 453 | "text/latex": [], 454 | "text/markdown": [], 455 | "text/plain": [ 456 | "character(0)" 457 | ] 458 | }, 459 | "metadata": {}, 460 | "output_type": "display_data" 461 | } 462 | ], 463 | "source": [ 464 | "detectParRecords(gUnorderedGeneCountsDf)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "No assumption is made that the columns (samples) of the gene count file are currently ordered in the order desirable for the differential expression analysis." 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "[Table of Contents](#Table-of-Contents)\n", 479 | "\n", 480 | "### Metadata\n", 481 | "\n", 482 | "> For downstream analysis, sample-level information related to the experimental design needs to be associated with the columns of the counts matrix. This should include experimental variables, both biological and technical, that could have an effect on expression levels. Examples [could] include cell type (basal, LP and ML in this experiment), genotype (wild-type, knock-out), phenotype (disease status, sex, age), sample treatment (drug, control) and batch information (date experiment was performed if samples were collected and analysed at distinct time points) to name just a few. 
([1](#Citations))\n", 483 | "\n", 484 | "Import a metadata file in which rows are sample identifiers, columns are metadata features (e.g., subject id, time point, etc) and row/column intersections contain the value of the relevant feature for the relevant sample:" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 55, 490 | "metadata": {}, 491 | "outputs": [ 492 | { 493 | "data": { 494 | "text/html": [ 495 | "\n", 500 | "
  1. 12
  2. 6
\n" 501 | ], 502 | "text/latex": [ 503 | "\\begin{enumerate*}\n", 504 | "\\item 12\n", 505 | "\\item 6\n", 506 | "\\end{enumerate*}\n" 507 | ], 508 | "text/markdown": [ 509 | "1. 12\n", 510 | "2. 6\n", 511 | "\n", 512 | "\n" 513 | ], 514 | "text/plain": [ 515 | "[1] 12 6" 516 | ] 517 | }, 518 | "metadata": {}, 519 | "output_type": "display_data" 520 | } 521 | ], 522 | "source": [ 523 | "#Read in metadata\n", 524 | "gMetadataDf <- read.csv(gMetadataFp, stringsAsFactors=FALSE, sep = \"\\t\")\n", 525 | "dim(gMetadataDf)" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 56, 531 | "metadata": {}, 532 | "outputs": [ 533 | { 534 | "data": { 535 | "text/html": [ 536 | "\n", 537 | "\n", 538 | "\n", 539 | "\t\n", 540 | "\t\n", 541 | "\n", 542 | "\n", 543 | "\t\n", 544 | "\t\n", 545 | "\t\n", 546 | "\t\n", 547 | "\t\n", 548 | "\t\n", 549 | "\n", 550 | "
A data.frame: 6 × 6
Sample.NameSample.CodeReference.Genome..Organism.GenderConditionTime.point
<chr><chr><chr><chr><chr><chr>
1VK2_NanH2_PolyB25_2h_1K001HumanFemaleNanH2 treated2 hour
2VK2_NanH2_PolyB25_2h_2K002HumanFemaleNanH2 treated2 hour
3VK2_NanH2_PolyB25_2h_3K003HumanFemaleNanH2 treated2 hour
4VK2_NanH2_PolyB25_2h_4K004HumanFemaleNanH2 treated2 hour
5VK2_NanH2_PolyB25_1h_5K005HumanFemaleNanH2 treated1 hour
6VK2_NanH2_PolyB25_1h_6K006HumanFemaleNanH2 treated1 hour
\n" 551 | ], 552 | "text/latex": [ 553 | "A data.frame: 6 × 6\n", 554 | "\\begin{tabular}{r|llllll}\n", 555 | " & Sample.Name & Sample.Code & Reference.Genome..Organism. & Gender & Condition & Time.point\\\\\n", 556 | " & & & & & & \\\\\n", 557 | "\\hline\n", 558 | "\t1 & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & K001 & Human & Female & NanH2 treated & 2 hour\\\\\n", 559 | "\t2 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & K002 & Human & Female & NanH2 treated & 2 hour\\\\\n", 560 | "\t3 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & K003 & Human & Female & NanH2 treated & 2 hour\\\\\n", 561 | "\t4 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & K004 & Human & Female & NanH2 treated & 2 hour\\\\\n", 562 | "\t5 & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & K005 & Human & Female & NanH2 treated & 1 hour\\\\\n", 563 | "\t6 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & K006 & Human & Female & NanH2 treated & 1 hour\\\\\n", 564 | "\\end{tabular}\n" 565 | ], 566 | "text/markdown": [ 567 | "\n", 568 | "A data.frame: 6 × 6\n", 569 | "\n", 570 | "| | Sample.Name <chr> | Sample.Code <chr> | Reference.Genome..Organism. <chr> | Gender <chr> | Condition <chr> | Time.point <chr> |\n", 571 | "|---|---|---|---|---|---|---|\n", 572 | "| 1 | VK2_NanH2_PolyB25_2h_1 | K001 | Human | Female | NanH2 treated | 2 hour |\n", 573 | "| 2 | VK2_NanH2_PolyB25_2h_2 | K002 | Human | Female | NanH2 treated | 2 hour |\n", 574 | "| 3 | VK2_NanH2_PolyB25_2h_3 | K003 | Human | Female | NanH2 treated | 2 hour |\n", 575 | "| 4 | VK2_NanH2_PolyB25_2h_4 | K004 | Human | Female | NanH2 treated | 2 hour |\n", 576 | "| 5 | VK2_NanH2_PolyB25_1h_5 | K005 | Human | Female | NanH2 treated | 1 hour |\n", 577 | "| 6 | VK2_NanH2_PolyB25_1h_6 | K006 | Human | Female | NanH2 treated | 1 hour |\n", 578 | "\n" 579 | ], 580 | "text/plain": [ 581 | " Sample.Name Sample.Code Reference.Genome..Organism. 
Gender\n", 582 | "1 VK2_NanH2_PolyB25_2h_1 K001 Human Female\n", 583 | "2 VK2_NanH2_PolyB25_2h_2 K002 Human Female\n", 584 | "3 VK2_NanH2_PolyB25_2h_3 K003 Human Female\n", 585 | "4 VK2_NanH2_PolyB25_2h_4 K004 Human Female\n", 586 | "5 VK2_NanH2_PolyB25_1h_5 K005 Human Female\n", 587 | "6 VK2_NanH2_PolyB25_1h_6 K006 Human Female\n", 588 | " Condition Time.point\n", 589 | "1 NanH2 treated 2 hour \n", 590 | "2 NanH2 treated 2 hour \n", 591 | "3 NanH2 treated 2 hour \n", 592 | "4 NanH2 treated 2 hour \n", 593 | "5 NanH2 treated 1 hour \n", 594 | "6 NanH2 treated 1 hour " 595 | ] 596 | }, 597 | "metadata": {}, 598 | "output_type": "display_data" 599 | } 600 | ], 601 | "source": [ 602 | "head(gMetadataDf)" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": 59, 608 | "metadata": {}, 609 | "outputs": [ 610 | { 611 | "data": { 612 | "text/html": [ 613 | "\n", 614 | "\n", 615 | "\n", 616 | "\t\n", 617 | "\t\n", 618 | "\n", 619 | "\n", 620 | "\t\n", 621 | "\t\n", 622 | "\t\n", 623 | "\t\n", 624 | "\t\n", 625 | "\t\n", 626 | "\n", 627 | "
A data.frame: 6 × 7
Sample.NameSample.CodeReference.Genome..Organism.GenderConditionTime.pointCondition_Time
<chr><chr><chr><chr><chr><chr><chr>
1VK2_NanH2_PolyB25_2h_1K001HumanFemaleNanH2_treated2HNanH2_treated_2H
2VK2_NanH2_PolyB25_2h_2K002HumanFemaleNanH2_treated2HNanH2_treated_2H
3VK2_NanH2_PolyB25_2h_3K003HumanFemaleNanH2_treated2HNanH2_treated_2H
4VK2_NanH2_PolyB25_2h_4K004HumanFemaleNanH2_treated2HNanH2_treated_2H
5VK2_NanH2_PolyB25_1h_5K005HumanFemaleNanH2_treated1HNanH2_treated_1H
6VK2_NanH2_PolyB25_1h_6K006HumanFemaleNanH2_treated1HNanH2_treated_1H
\n" 628 | ], 629 | "text/latex": [ 630 | "A data.frame: 6 × 7\n", 631 | "\\begin{tabular}{r|lllllll}\n", 632 | " & Sample.Name & Sample.Code & Reference.Genome..Organism. & Gender & Condition & Time.point & Condition\\_Time\\\\\n", 633 | " & & & & & & & \\\\\n", 634 | "\\hline\n", 635 | "\t1 & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & K001 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n", 636 | "\t2 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & K002 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n", 637 | "\t3 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & K003 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n", 638 | "\t4 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & K004 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n", 639 | "\t5 & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & K005 & Human & Female & NanH2\\_treated & 1H & NanH2\\_treated\\_1H\\\\\n", 640 | "\t6 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & K006 & Human & Female & NanH2\\_treated & 1H & NanH2\\_treated\\_1H\\\\\n", 641 | "\\end{tabular}\n" 642 | ], 643 | "text/markdown": [ 644 | "\n", 645 | "A data.frame: 6 × 7\n", 646 | "\n", 647 | "| | Sample.Name <chr> | Sample.Code <chr> | Reference.Genome..Organism. 
<chr> | Gender <chr> | Condition <chr> | Time.point <chr> | Condition_Time <chr> |\n", 648 | "|---|---|---|---|---|---|---|---|\n", 649 | "| 1 | VK2_NanH2_PolyB25_2h_1 | K001 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n", 650 | "| 2 | VK2_NanH2_PolyB25_2h_2 | K002 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n", 651 | "| 3 | VK2_NanH2_PolyB25_2h_3 | K003 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n", 652 | "| 4 | VK2_NanH2_PolyB25_2h_4 | K004 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n", 653 | "| 5 | VK2_NanH2_PolyB25_1h_5 | K005 | Human | Female | NanH2_treated | 1H | NanH2_treated_1H |\n", 654 | "| 6 | VK2_NanH2_PolyB25_1h_6 | K006 | Human | Female | NanH2_treated | 1H | NanH2_treated_1H |\n", 655 | "\n" 656 | ], 657 | "text/plain": [ 658 | " Sample.Name Sample.Code Reference.Genome..Organism. Gender\n", 659 | "1 VK2_NanH2_PolyB25_2h_1 K001 Human Female\n", 660 | "2 VK2_NanH2_PolyB25_2h_2 K002 Human Female\n", 661 | "3 VK2_NanH2_PolyB25_2h_3 K003 Human Female\n", 662 | "4 VK2_NanH2_PolyB25_2h_4 K004 Human Female\n", 663 | "5 VK2_NanH2_PolyB25_1h_5 K005 Human Female\n", 664 | "6 VK2_NanH2_PolyB25_1h_6 K006 Human Female\n", 665 | " Condition Time.point Condition_Time \n", 666 | "1 NanH2_treated 2H NanH2_treated_2H\n", 667 | "2 NanH2_treated 2H NanH2_treated_2H\n", 668 | "3 NanH2_treated 2H NanH2_treated_2H\n", 669 | "4 NanH2_treated 2H NanH2_treated_2H\n", 670 | "5 NanH2_treated 1H NanH2_treated_1H\n", 671 | "6 NanH2_treated 1H NanH2_treated_1H" 672 | ] 673 | }, 674 | "metadata": {}, 675 | "output_type": "display_data" 676 | } 677 | ], 678 | "source": [ 679 | "#remove spaces\n", 680 | "gMetadataDf$Condition <- gsub(\" \", \"_\", gMetadataDf$Condition)\n", 681 | "gMetadataDf$Time.point <- gsub(\"2 hour\", \"2H\", gMetadataDf$Time.point)\n", 682 | "gMetadataDf$Time.point <- gsub(\"1 hour\", \"1H\", gMetadataDf$Time.point)\n", 683 | "\n", 684 | "\n", 685 | "#add another col\n", 686 | 
"gMetadataDf$Condition_Time <- paste(gMetadataDf$Condition, gMetadataDf$Time.point, sep=\"_\")\n", 687 | "head(gMetadataDf)" 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": 60, 693 | "metadata": {}, 694 | "outputs": [], 695 | "source": [ 696 | "gSampleNames = gMetadataDf[[\"Sample.Name\"]]" 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "metadata": {}, 702 | "source": [ 703 | "Check the dimensions of the count data and the metadata to ensure that the count dataframe has the same number of columns (samples) as the metadata dataframe has rows (again, samples), and that the sample names are the same in both: " 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 61, 709 | "metadata": {}, 710 | "outputs": [ 711 | { 712 | "data": { 713 | "text/html": [ 714 | "\n", 719 | "
  1. 60605
  2. 12
\n" 720 | ], 721 | "text/latex": [ 722 | "\\begin{enumerate*}\n", 723 | "\\item 60605\n", 724 | "\\item 12\n", 725 | "\\end{enumerate*}\n" 726 | ], 727 | "text/markdown": [ 728 | "1. 60605\n", 729 | "2. 12\n", 730 | "\n", 731 | "\n" 732 | ], 733 | "text/plain": [ 734 | "[1] 60605 12" 735 | ] 736 | }, 737 | "metadata": {}, 738 | "output_type": "display_data" 739 | }, 740 | { 741 | "data": { 742 | "text/html": [ 743 | "\n", 748 | "
  1. 12
  2. 7
\n" 749 | ], 750 | "text/latex": [ 751 | "\\begin{enumerate*}\n", 752 | "\\item 12\n", 753 | "\\item 7\n", 754 | "\\end{enumerate*}\n" 755 | ], 756 | "text/markdown": [ 757 | "1. 12\n", 758 | "2. 7\n", 759 | "\n", 760 | "\n" 761 | ], 762 | "text/plain": [ 763 | "[1] 12 7" 764 | ] 765 | }, 766 | "metadata": {}, 767 | "output_type": "display_data" 768 | }, 769 | { 770 | "data": { 771 | "text/html": [ 772 | "TRUE" 773 | ], 774 | "text/latex": [ 775 | "TRUE" 776 | ], 777 | "text/markdown": [ 778 | "TRUE" 779 | ], 780 | "text/plain": [ 781 | "[1] TRUE" 782 | ] 783 | }, 784 | "metadata": {}, 785 | "output_type": "display_data" 786 | } 787 | ], 788 | "source": [ 789 | "dim(gUnorderedGeneCountsDf)\n", 790 | "dim(gMetadataDf)\n", 791 | "\n", 792 | "all(colnames(gUnorderedGeneCountsDf) %in% gSampleNames)" 793 | ] 794 | }, 795 | { 796 | "cell_type": "markdown", 797 | "metadata": {}, 798 | "source": [ 799 | "Assume that the order of the samples shown in the metadata is the desired order, and reorder the columns in the counts table to match it:" 800 | ] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "execution_count": 62, 805 | "metadata": {}, 806 | "outputs": [ 807 | { 808 | "data": { 809 | "text/html": [ 810 | "\n", 811 | "\n", 812 | "\n", 813 | "\t\n", 814 | "\t\n", 815 | "\n", 816 | "\n", 817 | "\t\n", 818 | "\t\n", 819 | "\t\n", 820 | "\t\n", 821 | "\t\n", 822 | "\t\n", 823 | "\n", 824 | "
A data.frame: 6 × 12
VK2_NanH2_PolyB25_2h_1VK2_NanH2_PolyB25_2h_2VK2_NanH2_PolyB25_2h_3VK2_NanH2_PolyB25_2h_4VK2_NanH2_PolyB25_1h_5VK2_NanH2_PolyB25_1h_6VK2_NanH2_PolyB25_1h_7VK2_NanH2_PolyB25_1h_8VK2_Pet28a_PolyB25_2h_9VK2_Pet28a_PolyB25_2h_10VK2_Pet28a_PolyB25_2h_11VK2_Pet28a_PolyB25_2h_12
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
ENSG00000000003.15 356.00 542.00 364.00 299.00 502.00 416.00 424.00 497.00 323.00 387.00 465.00 366.00
ENSG00000000005.6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
ENSG00000000419.142080.113080.002206.801721.002340.002232.042253.002573.361904.052084.002412.422170.07
ENSG00000000457.14 113.88 119.35 85.84 102.65 153.65 94.84 112.33 130.23 119.77 98.34 120.36 91.56
ENSG00000000460.17 181.12 285.65 181.16 209.35 254.35 200.16 242.67 292.77 234.23 189.66 190.64 204.44
ENSG00000000938.13 4.00 21.00 10.00 18.00 10.00 6.00 6.00 9.00 10.00 13.00 17.00 13.00
\n" 825 | ], 826 | "text/latex": [ 827 | "A data.frame: 6 × 12\n", 828 | "\\begin{tabular}{r|llllllllllll}\n", 829 | " & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & VK2\\_NanH2\\_PolyB25\\_1h\\_7 & VK2\\_NanH2\\_PolyB25\\_1h\\_8 & VK2\\_Pet28a\\_PolyB25\\_2h\\_9 & VK2\\_Pet28a\\_PolyB25\\_2h\\_10 & VK2\\_Pet28a\\_PolyB25\\_2h\\_11 & VK2\\_Pet28a\\_PolyB25\\_2h\\_12\\\\\n", 830 | " & & & & & & & & & & & & \\\\\n", 831 | "\\hline\n", 832 | "\tENSG00000000003.15 & 356.00 & 542.00 & 364.00 & 299.00 & 502.00 & 416.00 & 424.00 & 497.00 & 323.00 & 387.00 & 465.00 & 366.00\\\\\n", 833 | "\tENSG00000000005.6 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00\\\\\n", 834 | "\tENSG00000000419.14 & 2080.11 & 3080.00 & 2206.80 & 1721.00 & 2340.00 & 2232.04 & 2253.00 & 2573.36 & 1904.05 & 2084.00 & 2412.42 & 2170.07\\\\\n", 835 | "\tENSG00000000457.14 & 113.88 & 119.35 & 85.84 & 102.65 & 153.65 & 94.84 & 112.33 & 130.23 & 119.77 & 98.34 & 120.36 & 91.56\\\\\n", 836 | "\tENSG00000000460.17 & 181.12 & 285.65 & 181.16 & 209.35 & 254.35 & 200.16 & 242.67 & 292.77 & 234.23 & 189.66 & 190.64 & 204.44\\\\\n", 837 | "\tENSG00000000938.13 & 4.00 & 21.00 & 10.00 & 18.00 & 10.00 & 6.00 & 6.00 & 9.00 & 10.00 & 13.00 & 17.00 & 13.00\\\\\n", 838 | "\\end{tabular}\n" 839 | ], 840 | "text/markdown": [ 841 | "\n", 842 | "A data.frame: 6 × 12\n", 843 | "\n", 844 | "| | VK2_NanH2_PolyB25_2h_1 <dbl> | VK2_NanH2_PolyB25_2h_2 <dbl> | VK2_NanH2_PolyB25_2h_3 <dbl> | VK2_NanH2_PolyB25_2h_4 <dbl> | VK2_NanH2_PolyB25_1h_5 <dbl> | VK2_NanH2_PolyB25_1h_6 <dbl> | VK2_NanH2_PolyB25_1h_7 <dbl> | VK2_NanH2_PolyB25_1h_8 <dbl> | VK2_Pet28a_PolyB25_2h_9 <dbl> | VK2_Pet28a_PolyB25_2h_10 <dbl> | VK2_Pet28a_PolyB25_2h_11 <dbl> | VK2_Pet28a_PolyB25_2h_12 <dbl> |\n", 845 | "|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", 846 
| "| ENSG00000000003.15 | 356.00 | 542.00 | 364.00 | 299.00 | 502.00 | 416.00 | 424.00 | 497.00 | 323.00 | 387.00 | 465.00 | 366.00 |\n", 847 | "| ENSG00000000005.6 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |\n", 848 | "| ENSG00000000419.14 | 2080.11 | 3080.00 | 2206.80 | 1721.00 | 2340.00 | 2232.04 | 2253.00 | 2573.36 | 1904.05 | 2084.00 | 2412.42 | 2170.07 |\n", 849 | "| ENSG00000000457.14 | 113.88 | 119.35 | 85.84 | 102.65 | 153.65 | 94.84 | 112.33 | 130.23 | 119.77 | 98.34 | 120.36 | 91.56 |\n", 850 | "| ENSG00000000460.17 | 181.12 | 285.65 | 181.16 | 209.35 | 254.35 | 200.16 | 242.67 | 292.77 | 234.23 | 189.66 | 190.64 | 204.44 |\n", 851 | "| ENSG00000000938.13 | 4.00 | 21.00 | 10.00 | 18.00 | 10.00 | 6.00 | 6.00 | 9.00 | 10.00 | 13.00 | 17.00 | 13.00 |\n", 852 | "\n" 853 | ], 854 | "text/plain": [ 855 | " VK2_NanH2_PolyB25_2h_1 VK2_NanH2_PolyB25_2h_2\n", 856 | "ENSG00000000003.15 356.00 542.00 \n", 857 | "ENSG00000000005.6 0.00 0.00 \n", 858 | "ENSG00000000419.14 2080.11 3080.00 \n", 859 | "ENSG00000000457.14 113.88 119.35 \n", 860 | "ENSG00000000460.17 181.12 285.65 \n", 861 | "ENSG00000000938.13 4.00 21.00 \n", 862 | " VK2_NanH2_PolyB25_2h_3 VK2_NanH2_PolyB25_2h_4\n", 863 | "ENSG00000000003.15 364.00 299.00 \n", 864 | "ENSG00000000005.6 0.00 0.00 \n", 865 | "ENSG00000000419.14 2206.80 1721.00 \n", 866 | "ENSG00000000457.14 85.84 102.65 \n", 867 | "ENSG00000000460.17 181.16 209.35 \n", 868 | "ENSG00000000938.13 10.00 18.00 \n", 869 | " VK2_NanH2_PolyB25_1h_5 VK2_NanH2_PolyB25_1h_6\n", 870 | "ENSG00000000003.15 502.00 416.00 \n", 871 | "ENSG00000000005.6 0.00 0.00 \n", 872 | "ENSG00000000419.14 2340.00 2232.04 \n", 873 | "ENSG00000000457.14 153.65 94.84 \n", 874 | "ENSG00000000460.17 254.35 200.16 \n", 875 | "ENSG00000000938.13 10.00 6.00 \n", 876 | " VK2_NanH2_PolyB25_1h_7 VK2_NanH2_PolyB25_1h_8\n", 877 | "ENSG00000000003.15 424.00 497.00 \n", 878 | "ENSG00000000005.6 0.00 0.00 \n", 879 | "ENSG00000000419.14 2253.00 
2573.36 \n", 880 | "ENSG00000000457.14 112.33 130.23 \n", 881 | "ENSG00000000460.17 242.67 292.77 \n", 882 | "ENSG00000000938.13 6.00 9.00 \n", 883 | " VK2_Pet28a_PolyB25_2h_9 VK2_Pet28a_PolyB25_2h_10\n", 884 | "ENSG00000000003.15 323.00 387.00 \n", 885 | "ENSG00000000005.6 0.00 0.00 \n", 886 | "ENSG00000000419.14 1904.05 2084.00 \n", 887 | "ENSG00000000457.14 119.77 98.34 \n", 888 | "ENSG00000000460.17 234.23 189.66 \n", 889 | "ENSG00000000938.13 10.00 13.00 \n", 890 | " VK2_Pet28a_PolyB25_2h_11 VK2_Pet28a_PolyB25_2h_12\n", 891 | "ENSG00000000003.15 465.00 366.00 \n", 892 | "ENSG00000000005.6 0.00 0.00 \n", 893 | "ENSG00000000419.14 2412.42 2170.07 \n", 894 | "ENSG00000000457.14 120.36 91.56 \n", 895 | "ENSG00000000460.17 190.64 204.44 \n", 896 | "ENSG00000000938.13 17.00 13.00 " 897 | ] 898 | }, 899 | "metadata": {}, 900 | "output_type": "display_data" 901 | } 902 | ], 903 | "source": [ 904 | "gGeneCountsDf = gUnorderedGeneCountsDf[gSampleNames]\n", 905 | "head(gGeneCountsDf)" 906 | ] 907 | }, 908 | { 909 | "cell_type": "markdown", 910 | "metadata": {}, 911 | "source": [ 912 | "If the count file gene identifiers do NOT include version numbers (e.g., the \".4\" part in a gene identifier like \"ENSG00000268020.4\"), then it is necessary to truncate the version information from the public annotation data to be used below in order to match the annotation data gene identifiers to the count file gene identifiers. 
Set the flag for version removal accordingly:" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": 63, 918 | "metadata": {}, 919 | "outputs": [], 920 | "source": [ 921 | "gRemoveVersion <- FALSE\n" 922 | ] 923 | }, 924 | { 925 | "cell_type": "markdown", 926 | "metadata": {}, 927 | "source": [ 928 | "[Table of Contents](#Table-of-Contents)\n", 929 | "\n", 930 | "### Annotations\n", 931 | "\n", 932 | "If a previously created file of the gene annotations has been provided, load it:" 933 | ] 934 | }, 935 | { 936 | "cell_type": "code", 937 | "execution_count": 64, 938 | "metadata": {}, 939 | "outputs": [ 940 | { 941 | "data": { 942 | "text/html": [ 943 | "\n", 944 | "\n", 945 | "\n", 946 | "\t\n", 947 | "\t\n", 948 | "\n", 949 | "\n", 950 | "\t\n", 951 | "\t\n", 952 | "\t\n", 953 | "\t\n", 954 | "\t\n", 955 | "\t\n", 956 | "\n", 957 | "
A data.frame: 6 × 3
gene_typegene_idtranscript_id
<chr><chr><chr>
1transcribed_unprocessed_pseudogeneENSG00000223972.5ENST00000456328.2
2transcribed_unprocessed_pseudogeneENSG00000223972.5ENST00000450305.2
3unprocessed_pseudogene ENSG00000227232.5ENST00000488147.1
4miRNA ENSG00000278267.1ENST00000619216.1
5lncRNA ENSG00000243485.5ENST00000473358.1
6lncRNA ENSG00000243485.5ENST00000469289.1
\n" 958 | ], 959 | "text/latex": [ 960 | "A data.frame: 6 × 3\n", 961 | "\\begin{tabular}{r|lll}\n", 962 | " & gene\\_type & gene\\_id & transcript\\_id\\\\\n", 963 | " & & & \\\\\n", 964 | "\\hline\n", 965 | "\t1 & transcribed\\_unprocessed\\_pseudogene & ENSG00000223972.5 & ENST00000456328.2\\\\\n", 966 | "\t2 & transcribed\\_unprocessed\\_pseudogene & ENSG00000223972.5 & ENST00000450305.2\\\\\n", 967 | "\t3 & unprocessed\\_pseudogene & ENSG00000227232.5 & ENST00000488147.1\\\\\n", 968 | "\t4 & miRNA & ENSG00000278267.1 & ENST00000619216.1\\\\\n", 969 | "\t5 & lncRNA & ENSG00000243485.5 & ENST00000473358.1\\\\\n", 970 | "\t6 & lncRNA & ENSG00000243485.5 & ENST00000469289.1\\\\\n", 971 | "\\end{tabular}\n" 972 | ], 973 | "text/markdown": [ 974 | "\n", 975 | "A data.frame: 6 × 3\n", 976 | "\n", 977 | "| | gene_type <chr> | gene_id <chr> | transcript_id <chr> |\n", 978 | "|---|---|---|---|\n", 979 | "| 1 | transcribed_unprocessed_pseudogene | ENSG00000223972.5 | ENST00000456328.2 |\n", 980 | "| 2 | transcribed_unprocessed_pseudogene | ENSG00000223972.5 | ENST00000450305.2 |\n", 981 | "| 3 | unprocessed_pseudogene | ENSG00000227232.5 | ENST00000488147.1 |\n", 982 | "| 4 | miRNA | ENSG00000278267.1 | ENST00000619216.1 |\n", 983 | "| 5 | lncRNA | ENSG00000243485.5 | ENST00000473358.1 |\n", 984 | "| 6 | lncRNA | ENSG00000243485.5 | ENST00000469289.1 |\n", 985 | "\n" 986 | ], 987 | "text/plain": [ 988 | " gene_type gene_id transcript_id \n", 989 | "1 transcribed_unprocessed_pseudogene ENSG00000223972.5 ENST00000456328.2\n", 990 | "2 transcribed_unprocessed_pseudogene ENSG00000223972.5 ENST00000450305.2\n", 991 | "3 unprocessed_pseudogene ENSG00000227232.5 ENST00000488147.1\n", 992 | "4 miRNA ENSG00000278267.1 ENST00000619216.1\n", 993 | "5 lncRNA ENSG00000243485.5 ENST00000473358.1\n", 994 | "6 lncRNA ENSG00000243485.5 ENST00000469289.1" 995 | ] 996 | }, 997 | "metadata": {}, 998 | "output_type": "display_data" 999 | } 1000 | ], 1001 | "source": [ 1002 | "\n", 1003 | "if 
(!is.null(gAnnotationsRdataFilename)) {\n", 1004 | " gAnnotationsRdataFp = file.path(gReferenceDir, gAnnotationsRdataFilename) \n", 1005 | " \n", 1006 | " # Import the R data object containing gene annotations and load its dataframe into a variable:\n", 1007 | " gAnnotationEnv = loadToEnvironment(gAnnotationsRdataFp)\n", 1008 | " gGeneTypeAnnotationsDf = gAnnotationEnv$ANNOT\n", 1009 | " \n", 1010 | " head(gGeneTypeAnnotationsDf)\n", 1011 | "} else {\n", 1012 | " print(\"No annotations provided.\")\n", 1013 | "}" 1014 | ] 1015 | }, 1016 | { 1017 | "cell_type": "markdown", 1018 | "metadata": {}, 1019 | "source": [ 1020 | "[Table of Contents](#Table-of-Contents)\n", 1021 | "\n", 1022 | "## Gene Separation By Coding Status" 1023 | ] 1024 | }, 1025 | { 1026 | "cell_type": "markdown", 1027 | "metadata": {}, 1028 | "source": [ 1029 | "Gene annotations are records of each gene's identifier and symbol, where the gene begins and ends on the genome sequence, and whether it is anticipated to be a coding gene or not. There are multiple sources of gene annotations." 1030 | ] 1031 | }, 1032 | { 1033 | "cell_type": "markdown", 1034 | "metadata": {}, 1035 | "source": [ 1036 | "Here we use the human gene annotations from the Gencode project, Release 38 (GRCh38.p13). 
\n", 1037 | "\n" 1038 | ] 1039 | }, 1040 | { 1041 | "cell_type": "code", 1042 | "execution_count": 65, 1043 | "metadata": {}, 1044 | "outputs": [], 1045 | "source": [ 1046 | "splitGeneCountsByCodingStatus = function(geneCountDf, gtfDf, removeVersion=FALSE){\n", 1047 | " #Subset GTF by protein coding and noncoding\n", 1048 | " ANNOT_protein_coding <- subset(gtfDf, gene_type == \"protein_coding\")\n", 1049 | "# ANNOT_ncRNA <- subset(gtfDf, gene_type %in% c(\"lincRNA\", \"antisense\", \"processed_transcript\",\"sense_overlapping\", \"sense_intronic\"))\n", 1050 | " ANNOT_ncRNA <- subset(gtfDf, gene_type %in% c(\"lncRNA\", \"antisense\", \"processed_transcript\",\"sense_overlapping\", \"sense_intronic\"))\n", 1051 | "\n", 1052 | " #make list of IDs to query\n", 1053 | " protein_coding_ids <- ANNOT_protein_coding$gene_id\n", 1054 | " ncRNA_ids <- ANNOT_ncRNA$gene_id\n", 1055 | " \n", 1056 | " if (removeVersion){\n", 1057 | " protein_coding_ids <- removeAccessionVersion(protein_coding_ids)\n", 1058 | " ncRNA_ids <- removeAccessionVersion(ncRNA_ids) \n", 1059 | " }\n", 1060 | "\n", 1061 | " #subset geneCounts\n", 1062 | " geneCount_protein_coding <- subset(geneCountDf, row.names(geneCountDf) %in% protein_coding_ids)\n", 1063 | " geneCount_ncRNA <- subset(geneCountDf, row.names(geneCountDf) %in% ncRNA_ids)\n", 1064 | " return(list(codingGeneCountDf=geneCount_protein_coding, noncodingGeneCountDf=geneCount_ncRNA))\n", 1065 | "}\n", 1066 | "\n", 1067 | "removeAccessionVersion = function(accessionVector){\n", 1068 | " return (gsub(\"\\\\..*\",\"\",accessionVector))\n", 1069 | "}\n", 1070 | "\n", 1071 | "writeSubsetCounts = function(subsetCountsDf, outputDir, runName, fileSuffix){\n", 1072 | " fileName = sprintf(fileSuffix, runName)\n", 1073 | " write.csv(subsetCountsDf, file.path(outputDir, fileName))\n", 1074 | " print(paste0(\"Output file: \",fileName))\n", 1075 | "}\n", 1076 | "\n", 1077 | "writeSubsetsCounts = function(splitGeneCountDfsList, outputDir, runName){\n", 1078 | 
" writeSubsetCounts(splitGeneCountDfsList$codingGeneCountDf, outputDir, runName,\"%s_raw_pc_genes_counts.csv\")\n", 1079 | " writeSubsetCounts(splitGeneCountDfsList$noncodingGeneCountDf, outputDir, runName,\"%s_raw_nc_genes_counts.csv\")\n", 1080 | "}" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "markdown", 1085 | "metadata": {}, 1086 | "source": [ 1087 | "Split the count data into coding and non-coding subsets, and extract each subset into a file based on the annotation file provided in the input parameters:" 1088 | ] 1089 | }, 1090 | { 1091 | "cell_type": "code", 1092 | "execution_count": 66, 1093 | "metadata": {}, 1094 | "outputs": [], 1095 | "source": [ 1096 | "gSplitGeneCountDfsList = splitGeneCountsByCodingStatus(gGeneCountsDf, gGeneTypeAnnotationsDf, gRemoveVersion)" 1097 | ] 1098 | }, 1099 | { 1100 | "cell_type": "code", 1101 | "execution_count": 67, 1102 | "metadata": {}, 1103 | "outputs": [ 1104 | { 1105 | "data": { 1106 | "text/html": [ 1107 | "\n", 1112 | "
  1. 60605
  2. 12
\n" 1113 | ], 1114 | "text/latex": [ 1115 | "\\begin{enumerate*}\n", 1116 | "\\item 60605\n", 1117 | "\\item 12\n", 1118 | "\\end{enumerate*}\n" 1119 | ], 1120 | "text/markdown": [ 1121 | "1. 60605\n", 1122 | "2. 12\n", 1123 | "\n", 1124 | "\n" 1125 | ], 1126 | "text/plain": [ 1127 | "[1] 60605 12" 1128 | ] 1129 | }, 1130 | "metadata": {}, 1131 | "output_type": "display_data" 1132 | }, 1133 | { 1134 | "data": { 1135 | "text/html": [ 1136 | "\n", 1141 | "
  1. 19937
  2. 12
\n" 1142 | ], 1143 | "text/latex": [ 1144 | "\\begin{enumerate*}\n", 1145 | "\\item 19937\n", 1146 | "\\item 12\n", 1147 | "\\end{enumerate*}\n" 1148 | ], 1149 | "text/markdown": [ 1150 | "1. 19937\n", 1151 | "2. 12\n", 1152 | "\n", 1153 | "\n" 1154 | ], 1155 | "text/plain": [ 1156 | "[1] 19937 12" 1157 | ] 1158 | }, 1159 | "metadata": {}, 1160 | "output_type": "display_data" 1161 | }, 1162 | { 1163 | "data": { 1164 | "text/html": [ 1165 | "\n", 1170 | "
  1. 16876
  2. 12
\n" 1171 | ], 1172 | "text/latex": [ 1173 | "\\begin{enumerate*}\n", 1174 | "\\item 16876\n", 1175 | "\\item 12\n", 1176 | "\\end{enumerate*}\n" 1177 | ], 1178 | "text/markdown": [ 1179 | "1. 16876\n", 1180 | "2. 12\n", 1181 | "\n", 1182 | "\n" 1183 | ], 1184 | "text/plain": [ 1185 | "[1] 16876 12" 1186 | ] 1187 | }, 1188 | "metadata": {}, 1189 | "output_type": "display_data" 1190 | } 1191 | ], 1192 | "source": [ 1193 | "dim(gGeneCountsDf)\n", 1194 | "dim(gSplitGeneCountDfsList$codingGeneCountDf)\n", 1195 | "dim(gSplitGeneCountDfsList$noncodingGeneCountDf)" 1196 | ] 1197 | }, 1198 | { 1199 | "cell_type": "markdown", 1200 | "metadata": {}, 1201 | "source": [ 1202 | " Of the original 60,605 Ensembl genes in the dataset, 19,937 are known coding genes. " 1203 | ] 1204 | }, 1205 | { 1206 | "cell_type": "code", 1207 | "execution_count": 68, 1208 | "metadata": {}, 1209 | "outputs": [ 1210 | { 1211 | "name": "stdout", 1212 | "output_type": "stream", 1213 | "text": [ 1214 | "[1] \"Output file: 20230302161642_raw_pc_genes_counts.csv\"\n", 1215 | "[1] \"Output file: 20230302161642_raw_nc_genes_counts.csv\"\n" 1216 | ] 1217 | } 1218 | ], 1219 | "source": [ 1220 | "writeSubsetsCounts(gSplitGeneCountDfsList, gOutputDir_nb1, gRunName)" 1221 | ] 1222 | }, 1223 | { 1224 | "cell_type": "markdown", 1225 | "metadata": {}, 1226 | "source": [ 1227 | "[Table of Contents](#Table-of-Contents)\n", 1228 | "\n", 1229 | "## Data Integration\n", 1230 | "\n", 1231 | "\n", 1232 | "\n", 1233 | "Integrate the count data and the metadata into an edgeR DGEList object for use in downstream analysis:\n", 1234 | "\n", 1235 | "> Our DGEList-object contains a samples data frame that stores both ... group ... and batch ... information, each of which consists of ... distinct levels. Note that within x$samples, library sizes are automatically calculated for each sample and normalisation factors are set to 1. 
([1](#Citations))" 1236 | ] 1237 | }, 1238 | { 1239 | "cell_type": "code", 1240 | "execution_count": 69, 1241 | "metadata": {}, 1242 | "outputs": [], 1243 | "source": [ 1244 | "gGeneType = \"all\"\n", 1245 | "gRelevantGeneCountsDf <- gGeneCountsDf" 1246 | ] 1247 | }, 1248 | { 1249 | "cell_type": "code", 1250 | "execution_count": 70, 1251 | "metadata": {}, 1252 | "outputs": [], 1253 | "source": [ 1254 | "# create a DGEList object\n", 1255 | "makeDgeList = function(countsDf, metadataDf, groupColName){\n", 1256 | " # remove the accession version (.##etc) from the ensembl gene id\n", 1257 | " id_list <- gsub(\"[.].*$\",\"\", row.names(countsDf))\n", 1258 | " row.names(countsDf) <- id_list\n", 1259 | "\n", 1260 | " x <- DGEList(counts = countsDf, lib.size = colSums(countsDf),\n", 1261 | " norm.factors = rep(1,ncol(countsDf)), samples = metadataDf,\n", 1262 | " group = metadataDf[[groupColName]], genes = NULL, remove.zeros = FALSE)\n", 1263 | " return(x)\n", 1264 | "}" 1265 | ] 1266 | }, 1267 | { 1268 | "cell_type": "code", 1269 | "execution_count": 71, 1270 | "metadata": {}, 1271 | "outputs": [], 1272 | "source": [ 1273 | "gGroupCategory = \"Condition_Time\" # e.g., \"day\"" 1274 | ] 1275 | }, 1276 | { 1277 | "cell_type": "code", 1278 | "execution_count": 72, 1279 | "metadata": {}, 1280 | "outputs": [ 1281 | { 1282 | "data": { 1283 | "text/html": [ 1284 | "\n", 1289 | "
  1. 'counts'
  2. 'samples'
\n" 1290 | ], 1291 | "text/latex": [ 1292 | "\\begin{enumerate*}\n", 1293 | "\\item 'counts'\n", 1294 | "\\item 'samples'\n", 1295 | "\\end{enumerate*}\n" 1296 | ], 1297 | "text/markdown": [ 1298 | "1. 'counts'\n", 1299 | "2. 'samples'\n", 1300 | "\n", 1301 | "\n" 1302 | ], 1303 | "text/plain": [ 1304 | "[1] \"counts\" \"samples\"" 1305 | ] 1306 | }, 1307 | "metadata": {}, 1308 | "output_type": "display_data" 1309 | } 1310 | ], 1311 | "source": [ 1312 | "gDgeList = makeDgeList(gRelevantGeneCountsDf, gMetadataDf, gGroupCategory)\n", 1313 | "names(gDgeList)" 1314 | ] 1315 | }, 1316 | { 1317 | "cell_type": "markdown", 1318 | "metadata": {}, 1319 | "source": [ 1320 | "As a sanity-check, look at representative content from the DGEList:" 1321 | ] 1322 | }, 1323 | { 1324 | "cell_type": "code", 1325 | "execution_count": 73, 1326 | "metadata": {}, 1327 | "outputs": [ 1328 | { 1329 | "data": { 1330 | "text/html": [ 1331 | "\n", 1332 | "\n", 1333 | "\n", 1334 | "\t\n", 1335 | "\n", 1336 | "\n", 1337 | "\t\n", 1338 | "\t\n", 1339 | "\t\n", 1340 | "\t\n", 1341 | "\t\n", 1342 | "\t\n", 1343 | "\n", 1344 | "
A matrix: 6 × 12 of type dbl
VK2_NanH2_PolyB25_2h_1VK2_NanH2_PolyB25_2h_2VK2_NanH2_PolyB25_2h_3VK2_NanH2_PolyB25_2h_4VK2_NanH2_PolyB25_1h_5VK2_NanH2_PolyB25_1h_6VK2_NanH2_PolyB25_1h_7VK2_NanH2_PolyB25_1h_8VK2_Pet28a_PolyB25_2h_9VK2_Pet28a_PolyB25_2h_10VK2_Pet28a_PolyB25_2h_11VK2_Pet28a_PolyB25_2h_12
ENSG00000000003 356.00 542.00 364.00 299.00 502.00 416.00 424.00 497.00 323.00 387.00 465.00 366.00
ENSG00000000005 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
ENSG000000004192080.113080.002206.801721.002340.002232.042253.002573.361904.052084.002412.422170.07
ENSG00000000457 113.88 119.35 85.84 102.65 153.65 94.84 112.33 130.23 119.77 98.34 120.36 91.56
ENSG00000000460 181.12 285.65 181.16 209.35 254.35 200.16 242.67 292.77 234.23 189.66 190.64 204.44
ENSG00000000938 4.00 21.00 10.00 18.00 10.00 6.00 6.00 9.00 10.00 13.00 17.00 13.00
\n" 1345 | ], 1346 | "text/latex": [ 1347 | "A matrix: 6 × 12 of type dbl\n", 1348 | "\\begin{tabular}{r|llllllllllll}\n", 1349 | " & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & VK2\\_NanH2\\_PolyB25\\_1h\\_7 & VK2\\_NanH2\\_PolyB25\\_1h\\_8 & VK2\\_Pet28a\\_PolyB25\\_2h\\_9 & VK2\\_Pet28a\\_PolyB25\\_2h\\_10 & VK2\\_Pet28a\\_PolyB25\\_2h\\_11 & VK2\\_Pet28a\\_PolyB25\\_2h\\_12\\\\\n", 1350 | "\\hline\n", 1351 | "\tENSG00000000003 & 356.00 & 542.00 & 364.00 & 299.00 & 502.00 & 416.00 & 424.00 & 497.00 & 323.00 & 387.00 & 465.00 & 366.00\\\\\n", 1352 | "\tENSG00000000005 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00\\\\\n", 1353 | "\tENSG00000000419 & 2080.11 & 3080.00 & 2206.80 & 1721.00 & 2340.00 & 2232.04 & 2253.00 & 2573.36 & 1904.05 & 2084.00 & 2412.42 & 2170.07\\\\\n", 1354 | "\tENSG00000000457 & 113.88 & 119.35 & 85.84 & 102.65 & 153.65 & 94.84 & 112.33 & 130.23 & 119.77 & 98.34 & 120.36 & 91.56\\\\\n", 1355 | "\tENSG00000000460 & 181.12 & 285.65 & 181.16 & 209.35 & 254.35 & 200.16 & 242.67 & 292.77 & 234.23 & 189.66 & 190.64 & 204.44\\\\\n", 1356 | "\tENSG00000000938 & 4.00 & 21.00 & 10.00 & 18.00 & 10.00 & 6.00 & 6.00 & 9.00 & 10.00 & 13.00 & 17.00 & 13.00\\\\\n", 1357 | "\\end{tabular}\n" 1358 | ], 1359 | "text/markdown": [ 1360 | "\n", 1361 | "A matrix: 6 × 12 of type dbl\n", 1362 | "\n", 1363 | "| | VK2_NanH2_PolyB25_2h_1 | VK2_NanH2_PolyB25_2h_2 | VK2_NanH2_PolyB25_2h_3 | VK2_NanH2_PolyB25_2h_4 | VK2_NanH2_PolyB25_1h_5 | VK2_NanH2_PolyB25_1h_6 | VK2_NanH2_PolyB25_1h_7 | VK2_NanH2_PolyB25_1h_8 | VK2_Pet28a_PolyB25_2h_9 | VK2_Pet28a_PolyB25_2h_10 | VK2_Pet28a_PolyB25_2h_11 | VK2_Pet28a_PolyB25_2h_12 |\n", 1364 | "|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", 1365 | "| ENSG00000000003 | 356.00 | 542.00 | 364.00 | 299.00 | 502.00 | 416.00 | 424.00 | 497.00 
| 323.00 | 387.00 | 465.00 | 366.00 |\n", 1366 | "| ENSG00000000005 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |\n", 1367 | "| ENSG00000000419 | 2080.11 | 3080.00 | 2206.80 | 1721.00 | 2340.00 | 2232.04 | 2253.00 | 2573.36 | 1904.05 | 2084.00 | 2412.42 | 2170.07 |\n", 1368 | "| ENSG00000000457 | 113.88 | 119.35 | 85.84 | 102.65 | 153.65 | 94.84 | 112.33 | 130.23 | 119.77 | 98.34 | 120.36 | 91.56 |\n", 1369 | "| ENSG00000000460 | 181.12 | 285.65 | 181.16 | 209.35 | 254.35 | 200.16 | 242.67 | 292.77 | 234.23 | 189.66 | 190.64 | 204.44 |\n", 1370 | "| ENSG00000000938 | 4.00 | 21.00 | 10.00 | 18.00 | 10.00 | 6.00 | 6.00 | 9.00 | 10.00 | 13.00 | 17.00 | 13.00 |\n", 1371 | "\n" 1372 | ], 1373 | "text/plain": [ 1374 | " VK2_NanH2_PolyB25_2h_1 VK2_NanH2_PolyB25_2h_2\n", 1375 | "ENSG00000000003 356.00 542.00 \n", 1376 | "ENSG00000000005 0.00 0.00 \n", 1377 | "ENSG00000000419 2080.11 3080.00 \n", 1378 | "ENSG00000000457 113.88 119.35 \n", 1379 | "ENSG00000000460 181.12 285.65 \n", 1380 | "ENSG00000000938 4.00 21.00 \n", 1381 | " VK2_NanH2_PolyB25_2h_3 VK2_NanH2_PolyB25_2h_4\n", 1382 | "ENSG00000000003 364.00 299.00 \n", 1383 | "ENSG00000000005 0.00 0.00 \n", 1384 | "ENSG00000000419 2206.80 1721.00 \n", 1385 | "ENSG00000000457 85.84 102.65 \n", 1386 | "ENSG00000000460 181.16 209.35 \n", 1387 | "ENSG00000000938 10.00 18.00 \n", 1388 | " VK2_NanH2_PolyB25_1h_5 VK2_NanH2_PolyB25_1h_6\n", 1389 | "ENSG00000000003 502.00 416.00 \n", 1390 | "ENSG00000000005 0.00 0.00 \n", 1391 | "ENSG00000000419 2340.00 2232.04 \n", 1392 | "ENSG00000000457 153.65 94.84 \n", 1393 | "ENSG00000000460 254.35 200.16 \n", 1394 | "ENSG00000000938 10.00 6.00 \n", 1395 | " VK2_NanH2_PolyB25_1h_7 VK2_NanH2_PolyB25_1h_8\n", 1396 | "ENSG00000000003 424.00 497.00 \n", 1397 | "ENSG00000000005 0.00 0.00 \n", 1398 | "ENSG00000000419 2253.00 2573.36 \n", 1399 | "ENSG00000000457 112.33 130.23 \n", 1400 | "ENSG00000000460 242.67 292.77 \n", 1401 | "ENSG00000000938 6.00 9.00 
\n", 1402 | " VK2_Pet28a_PolyB25_2h_9 VK2_Pet28a_PolyB25_2h_10\n", 1403 | "ENSG00000000003 323.00 387.00 \n", 1404 | "ENSG00000000005 0.00 0.00 \n", 1405 | "ENSG00000000419 1904.05 2084.00 \n", 1406 | "ENSG00000000457 119.77 98.34 \n", 1407 | "ENSG00000000460 234.23 189.66 \n", 1408 | "ENSG00000000938 10.00 13.00 \n", 1409 | " VK2_Pet28a_PolyB25_2h_11 VK2_Pet28a_PolyB25_2h_12\n", 1410 | "ENSG00000000003 465.00 366.00 \n", 1411 | "ENSG00000000005 0.00 0.00 \n", 1412 | "ENSG00000000419 2412.42 2170.07 \n", 1413 | "ENSG00000000457 120.36 91.56 \n", 1414 | "ENSG00000000460 190.64 204.44 \n", 1415 | "ENSG00000000938 17.00 13.00 " 1416 | ] 1417 | }, 1418 | "metadata": {}, 1419 | "output_type": "display_data" 1420 | }, 1421 | { 1422 | "data": { 1423 | "text/html": [ 1424 | "\n", 1425 | "\n", 1426 | "\n", 1427 | "\t\n", 1428 | "\t\n", 1429 | "\n", 1430 | "\n", 1431 | "\t\n", 1432 | "\t\n", 1433 | "\t\n", 1434 | "\t\n", 1435 | "\t\n", 1436 | "\t\n", 1437 | "\n", 1438 | "
A data.frame: 6 × 10
grouplib.sizenorm.factorsSample.NameSample.CodeReference.Genome..Organism.GenderConditionTime.pointCondition_Time
<fct><dbl><dbl><chr><chr><chr><chr><chr><chr><chr>
VK2_NanH2_PolyB25_2h_1NanH2_treated_2H224908891VK2_NanH2_PolyB25_2h_1K001HumanFemaleNanH2_treated2HNanH2_treated_2H
VK2_NanH2_PolyB25_2h_2NanH2_treated_2H312324801VK2_NanH2_PolyB25_2h_2K002HumanFemaleNanH2_treated2HNanH2_treated_2H
VK2_NanH2_PolyB25_2h_3NanH2_treated_2H219013421VK2_NanH2_PolyB25_2h_3K003HumanFemaleNanH2_treated2HNanH2_treated_2H
VK2_NanH2_PolyB25_2h_4NanH2_treated_2H196371141VK2_NanH2_PolyB25_2h_4K004HumanFemaleNanH2_treated2HNanH2_treated_2H
VK2_NanH2_PolyB25_1h_5NanH2_treated_1H274680161VK2_NanH2_PolyB25_1h_5K005HumanFemaleNanH2_treated1HNanH2_treated_1H
VK2_NanH2_PolyB25_1h_6NanH2_treated_1H249358031VK2_NanH2_PolyB25_1h_6K006HumanFemaleNanH2_treated1HNanH2_treated_1H
\n" 1439 | ], 1440 | "text/latex": [ 1441 | "A data.frame: 6 × 10\n", 1442 | "\\begin{tabular}{r|llllllllll}\n", 1443 | " & group & lib.size & norm.factors & Sample.Name & Sample.Code & Reference.Genome..Organism. & Gender & Condition & Time.point & Condition\\_Time\\\\\n", 1444 | " & & & & & & & & & & \\\\\n", 1445 | "\\hline\n", 1446 | "\tVK2\\_NanH2\\_PolyB25\\_2h\\_1 & NanH2\\_treated\\_2H & 22490889 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & K001 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n", 1447 | "\tVK2\\_NanH2\\_PolyB25\\_2h\\_2 & NanH2\\_treated\\_2H & 31232480 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & K002 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n", 1448 | "\tVK2\\_NanH2\\_PolyB25\\_2h\\_3 & NanH2\\_treated\\_2H & 21901342 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & K003 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n", 1449 | "\tVK2\\_NanH2\\_PolyB25\\_2h\\_4 & NanH2\\_treated\\_2H & 19637114 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & K004 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n", 1450 | "\tVK2\\_NanH2\\_PolyB25\\_1h\\_5 & NanH2\\_treated\\_1H & 27468016 & 1 & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & K005 & Human & Female & NanH2\\_treated & 1H & NanH2\\_treated\\_1H\\\\\n", 1451 | "\tVK2\\_NanH2\\_PolyB25\\_1h\\_6 & NanH2\\_treated\\_1H & 24935803 & 1 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & K006 & Human & Female & NanH2\\_treated & 1H & NanH2\\_treated\\_1H\\\\\n", 1452 | "\\end{tabular}\n" 1453 | ], 1454 | "text/markdown": [ 1455 | "\n", 1456 | "A data.frame: 6 × 10\n", 1457 | "\n", 1458 | "| | group <fct> | lib.size <dbl> | norm.factors <dbl> | Sample.Name <chr> | Sample.Code <chr> | Reference.Genome..Organism. 
<chr> | Gender <chr> | Condition <chr> | Time.point <chr> | Condition_Time <chr> |\n", 1459 | "|---|---|---|---|---|---|---|---|---|---|---|\n", 1460 | "| VK2_NanH2_PolyB25_2h_1 | NanH2_treated_2H | 22490889 | 1 | VK2_NanH2_PolyB25_2h_1 | K001 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n", 1461 | "| VK2_NanH2_PolyB25_2h_2 | NanH2_treated_2H | 31232480 | 1 | VK2_NanH2_PolyB25_2h_2 | K002 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n", 1462 | "| VK2_NanH2_PolyB25_2h_3 | NanH2_treated_2H | 21901342 | 1 | VK2_NanH2_PolyB25_2h_3 | K003 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n", 1463 | "| VK2_NanH2_PolyB25_2h_4 | NanH2_treated_2H | 19637114 | 1 | VK2_NanH2_PolyB25_2h_4 | K004 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n", 1464 | "| VK2_NanH2_PolyB25_1h_5 | NanH2_treated_1H | 27468016 | 1 | VK2_NanH2_PolyB25_1h_5 | K005 | Human | Female | NanH2_treated | 1H | NanH2_treated_1H |\n", 1465 | "| VK2_NanH2_PolyB25_1h_6 | NanH2_treated_1H | 24935803 | 1 | VK2_NanH2_PolyB25_1h_6 | K006 | Human | Female | NanH2_treated | 1H | NanH2_treated_1H |\n", 1466 | "\n" 1467 | ], 1468 | "text/plain": [ 1469 | " group lib.size norm.factors\n", 1470 | "VK2_NanH2_PolyB25_2h_1 NanH2_treated_2H 22490889 1 \n", 1471 | "VK2_NanH2_PolyB25_2h_2 NanH2_treated_2H 31232480 1 \n", 1472 | "VK2_NanH2_PolyB25_2h_3 NanH2_treated_2H 21901342 1 \n", 1473 | "VK2_NanH2_PolyB25_2h_4 NanH2_treated_2H 19637114 1 \n", 1474 | "VK2_NanH2_PolyB25_1h_5 NanH2_treated_1H 27468016 1 \n", 1475 | "VK2_NanH2_PolyB25_1h_6 NanH2_treated_1H 24935803 1 \n", 1476 | " Sample.Name Sample.Code\n", 1477 | "VK2_NanH2_PolyB25_2h_1 VK2_NanH2_PolyB25_2h_1 K001 \n", 1478 | "VK2_NanH2_PolyB25_2h_2 VK2_NanH2_PolyB25_2h_2 K002 \n", 1479 | "VK2_NanH2_PolyB25_2h_3 VK2_NanH2_PolyB25_2h_3 K003 \n", 1480 | "VK2_NanH2_PolyB25_2h_4 VK2_NanH2_PolyB25_2h_4 K004 \n", 1481 | "VK2_NanH2_PolyB25_1h_5 VK2_NanH2_PolyB25_1h_5 K005 \n", 1482 | "VK2_NanH2_PolyB25_1h_6 VK2_NanH2_PolyB25_1h_6 
K006 \n", 1483 | " Reference.Genome..Organism. Gender Condition \n", 1484 | "VK2_NanH2_PolyB25_2h_1 Human Female NanH2_treated\n", 1485 | "VK2_NanH2_PolyB25_2h_2 Human Female NanH2_treated\n", 1486 | "VK2_NanH2_PolyB25_2h_3 Human Female NanH2_treated\n", 1487 | "VK2_NanH2_PolyB25_2h_4 Human Female NanH2_treated\n", 1488 | "VK2_NanH2_PolyB25_1h_5 Human Female NanH2_treated\n", 1489 | "VK2_NanH2_PolyB25_1h_6 Human Female NanH2_treated\n", 1490 | " Time.point Condition_Time \n", 1491 | "VK2_NanH2_PolyB25_2h_1 2H NanH2_treated_2H\n", 1492 | "VK2_NanH2_PolyB25_2h_2 2H NanH2_treated_2H\n", 1493 | "VK2_NanH2_PolyB25_2h_3 2H NanH2_treated_2H\n", 1494 | "VK2_NanH2_PolyB25_2h_4 2H NanH2_treated_2H\n", 1495 | "VK2_NanH2_PolyB25_1h_5 1H NanH2_treated_1H\n", 1496 | "VK2_NanH2_PolyB25_1h_6 1H NanH2_treated_1H" 1497 | ] 1498 | }, 1499 | "metadata": {}, 1500 | "output_type": "display_data" 1501 | } 1502 | ], 1503 | "source": [ 1504 | "head(gDgeList$counts)\n", 1505 | "head(gDgeList$samples)" 1506 | ] 1507 | }, 1508 | { 1509 | "cell_type": "markdown", 1510 | "metadata": {}, 1511 | "source": [ 1512 | "[Table of Contents](#Table-of-Contents)\n", 1513 | "\n", 1514 | "## Annotation Integration" 1515 | ] 1516 | }, 1517 | { 1518 | "cell_type": "markdown", 1519 | "metadata": {}, 1520 | "source": [ 1521 | "Next, extend the DGEList object with annotation information about the genes that have count data with symbol and EntrezId information, based upon their Ensembl ids.\n", 1522 | "\n", 1523 | "> A second data frame named genes in the DGEList-object is used to store gene-level information associated with rows of the counts matrix. 
This information can be retrieved using organism specific packages such as Mus.musculus (Bioconductor Core Team 2016b) for mouse (or Homo.sapiens (Bioconductor Core Team 2016a) for human) ....\n", 1524 | ">\n", 1525 | "> The type of information that can be retrieved includes gene symbols, gene names, chromosome names and locations, Entrez gene IDs, Refseq gene IDs and Ensembl gene IDs to name just a few. .... Mus.musculus [and other organism-specific packages] packages information from various sources and allows users to choose between many different gene IDs as the key. ([1](#Citations))" 1526 | ] 1527 | }, 1528 | { 1529 | "cell_type": "code", 1530 | "execution_count": 74, 1531 | "metadata": {}, 1532 | "outputs": [], 1533 | "source": [ 1534 | "getGeneDf = function(dgeList, organismPackage){\n", 1535 | " geneid <- rownames(dgeList)\n", 1536 | " genes <- select(organismPackage, keys=geneid, columns=c(\"SYMBOL\", \"ENSEMBL\", \"ENTREZID\"), \n", 1537 | " keytype=\"ENSEMBL\")\n", 1538 | " return(genes)\n", 1539 | "}" 1540 | ] 1541 | }, 1542 | { 1543 | "cell_type": "code", 1544 | "execution_count": 75, 1545 | "metadata": {}, 1546 | "outputs": [ 1547 | { 1548 | "name": "stderr", 1549 | "output_type": "stream", 1550 | "text": [ 1551 | "'select()' returned 1:many mapping between keys and columns\n", 1552 | "\n" 1553 | ] 1554 | }, 1555 | { 1556 | "data": { 1557 | "text/html": [ 1558 | "\n", 1563 | "
  1. 60846
  2. 3
\n" 1564 | ], 1565 | "text/latex": [ 1566 | "\\begin{enumerate*}\n", 1567 | "\\item 60846\n", 1568 | "\\item 3\n", 1569 | "\\end{enumerate*}\n" 1570 | ], 1571 | "text/markdown": [ 1572 | "1. 60846\n", 1573 | "2. 3\n", 1574 | "\n", 1575 | "\n" 1576 | ], 1577 | "text/plain": [ 1578 | "[1] 60846 3" 1579 | ] 1580 | }, 1581 | "metadata": {}, 1582 | "output_type": "display_data" 1583 | } 1584 | ], 1585 | "source": [ 1586 | "gRawGenesDf = getGeneDf(gDgeList, gOrganismPackage)\n", 1587 | "dim(gRawGenesDf)" 1588 | ] 1589 | }, 1590 | { 1591 | "cell_type": "code", 1592 | "execution_count": 76, 1593 | "metadata": {}, 1594 | "outputs": [], 1595 | "source": [ 1596 | "# Add gene type to gRawGenesDf\n", 1597 | "gGeneTypeAnnotationsDf.rmdec <- gGeneTypeAnnotationsDf\n", 1598 | "gGeneTypeAnnotationsDf.rmdec$gene_id <- gsub(\"\\\\..*\",\"\",gGeneTypeAnnotationsDf.rmdec$gene_id)\n", 1599 | "gRawGenesDf$gene_type <- gGeneTypeAnnotationsDf$gene_type[match(gRawGenesDf$ENSEMBL, gGeneTypeAnnotationsDf.rmdec$gene_id)]" 1600 | ] 1601 | }, 1602 | { 1603 | "cell_type": "code", 1604 | "execution_count": 77, 1605 | "metadata": {}, 1606 | "outputs": [ 1607 | { 1608 | "data": { 1609 | "text/html": [ 1610 | "\n", 1611 | "\n", 1612 | "\n", 1613 | "\t\n", 1614 | "\t\n", 1615 | "\n", 1616 | "\n", 1617 | "\t\n", 1618 | "\t\n", 1619 | "\t\n", 1620 | "\t\n", 1621 | "\t\n", 1622 | "\t\n", 1623 | "\n", 1624 | "
A data.frame: 6 × 4
ENSEMBLENTREZIDSYMBOLgene_type
<chr><chr><chr><chr>
1ENSG000000000037105 TSPAN6 protein_coding
2ENSG0000000000564102TNMD protein_coding
3ENSG000000004198813 DPM1 protein_coding
4ENSG0000000045757147SCYL3 protein_coding
5ENSG0000000046055732C1orf112protein_coding
6ENSG000000009382268 FGR protein_coding
\n" 1625 | ], 1626 | "text/latex": [ 1627 | "A data.frame: 6 × 4\n", 1628 | "\\begin{tabular}{r|llll}\n", 1629 | " & ENSEMBL & ENTREZID & SYMBOL & gene\\_type\\\\\n", 1630 | " & & & & \\\\\n", 1631 | "\\hline\n", 1632 | "\t1 & ENSG00000000003 & 7105 & TSPAN6 & protein\\_coding\\\\\n", 1633 | "\t2 & ENSG00000000005 & 64102 & TNMD & protein\\_coding\\\\\n", 1634 | "\t3 & ENSG00000000419 & 8813 & DPM1 & protein\\_coding\\\\\n", 1635 | "\t4 & ENSG00000000457 & 57147 & SCYL3 & protein\\_coding\\\\\n", 1636 | "\t5 & ENSG00000000460 & 55732 & C1orf112 & protein\\_coding\\\\\n", 1637 | "\t6 & ENSG00000000938 & 2268 & FGR & protein\\_coding\\\\\n", 1638 | "\\end{tabular}\n" 1639 | ], 1640 | "text/markdown": [ 1641 | "\n", 1642 | "A data.frame: 6 × 4\n", 1643 | "\n", 1644 | "| | ENSEMBL <chr> | ENTREZID <chr> | SYMBOL <chr> | gene_type <chr> |\n", 1645 | "|---|---|---|---|---|\n", 1646 | "| 1 | ENSG00000000003 | 7105 | TSPAN6 | protein_coding |\n", 1647 | "| 2 | ENSG00000000005 | 64102 | TNMD | protein_coding |\n", 1648 | "| 3 | ENSG00000000419 | 8813 | DPM1 | protein_coding |\n", 1649 | "| 4 | ENSG00000000457 | 57147 | SCYL3 | protein_coding |\n", 1650 | "| 5 | ENSG00000000460 | 55732 | C1orf112 | protein_coding |\n", 1651 | "| 6 | ENSG00000000938 | 2268 | FGR | protein_coding |\n", 1652 | "\n" 1653 | ], 1654 | "text/plain": [ 1655 | " ENSEMBL ENTREZID SYMBOL gene_type \n", 1656 | "1 ENSG00000000003 7105 TSPAN6 protein_coding\n", 1657 | "2 ENSG00000000005 64102 TNMD protein_coding\n", 1658 | "3 ENSG00000000419 8813 DPM1 protein_coding\n", 1659 | "4 ENSG00000000457 57147 SCYL3 protein_coding\n", 1660 | "5 ENSG00000000460 55732 C1orf112 protein_coding\n", 1661 | "6 ENSG00000000938 2268 FGR protein_coding" 1662 | ] 1663 | }, 1664 | "metadata": {}, 1665 | "output_type": "display_data" 1666 | } 1667 | ], 1668 | "source": [ 1669 | "head(gRawGenesDf)" 1670 | ] 1671 | }, 1672 | { 1673 | "cell_type": "markdown", 1674 | "metadata": {}, 1675 | "source": [ 1676 | "> [G]ene IDs may not 
map one-to-one to the gene information of interest. It is important to check for duplicated gene IDs. ([1](#Citations))\n", 1677 | "\n", 1678 | "Examine how many records in the annotation dataset have the same id (for the gene identifier type--either ENSEMBL or ENTREZID--set below) as another record occurring earlier in the dataset:" 1679 | ] 1680 | }, 1681 | { 1682 | "cell_type": "code", 1683 | "execution_count": 78, 1684 | "metadata": {}, 1685 | "outputs": [], 1686 | "source": [ 1687 | "gGeneIdCol <- \"ENSEMBL\"\n", 1688 | "# gGeneIdCol <- \"ENTREZID\"" 1689 | ] 1690 | }, 1691 | { 1692 | "cell_type": "code", 1693 | "execution_count": 79, 1694 | "metadata": {}, 1695 | "outputs": [ 1696 | { 1697 | "data": { 1698 | "text/html": [ 1699 | "241" 1700 | ], 1701 | "text/latex": [ 1702 | "241" 1703 | ], 1704 | "text/markdown": [ 1705 | "241" 1706 | ], 1707 | "text/plain": [ 1708 | "[1] 241" 1709 | ] 1710 | }, 1711 | "metadata": {}, 1712 | "output_type": "display_data" 1713 | } 1714 | ], 1715 | "source": [ 1716 | "gDuplicatesMask = duplicated(gRawGenesDf[[gGeneIdCol]])\n", 1717 | "sum(gDuplicatesMask) # Sum counts only those with a value of TRUE" 1718 | ] 1719 | }, 1720 | { 1721 | "cell_type": "markdown", 1722 | "metadata": {}, 1723 | "source": [ 1724 | "Note that this sum includes only the second (or greater) instances of records for each gene id; the first record for each gene id is not included in this duplicate set.\n", 1725 | "\n", 1726 | "Write a file of the duplicate records that can be examined if desired: " 1727 | ] 1728 | }, 1729 | { 1730 | "cell_type": "code", 1731 | "execution_count": 80, 1732 | "metadata": {}, 1733 | "outputs": [], 1734 | "source": [ 1735 | "writeOutRemovedDuplicates = function(countsDf, duplicatesMask, outputDir, runName, geneType){\n", 1736 | " fileName = sprintf(\"%s_duplicated_%s_genes_records.csv\",runName, geneType)\n", 1737 | " duplicatedCountsDf = countsDf[duplicatesMask,]\n", 1738 | " write.csv(duplicatedCountsDf, file.path(outputDir, 
fileName))\n", 1739 | " print(paste0(\"Output file: \",fileName))\n", 1740 | "}" 1741 | ] 1742 | }, 1743 | { 1744 | "cell_type": "code", 1745 | "execution_count": 81, 1746 | "metadata": {}, 1747 | "outputs": [ 1748 | { 1749 | "name": "stdout", 1750 | "output_type": "stream", 1751 | "text": [ 1752 | "[1] \"Output file: 20230302161642_duplicated_all_genes_records.csv\"\n" 1753 | ] 1754 | } 1755 | ], 1756 | "source": [ 1757 | "writeOutRemovedDuplicates(gRawGenesDf, gDuplicatesMask, gOutputDir_nb1, gRunName, gGeneType)" 1758 | ] 1759 | }, 1760 | { 1761 | "cell_type": "markdown", 1762 | "metadata": {}, 1763 | "source": [ 1764 | " As a basic approach, duplicate records for gene ids already existing in the annotation are removed" 1765 | ] 1766 | }, 1767 | { 1768 | "cell_type": "code", 1769 | "execution_count": 82, 1770 | "metadata": {}, 1771 | "outputs": [], 1772 | "source": [ 1773 | "gDeduplicatedGenesDf = gRawGenesDf[!duplicated(gRawGenesDf[[gGeneIdCol]]),]" 1774 | ] 1775 | }, 1776 | { 1777 | "cell_type": "markdown", 1778 | "metadata": {}, 1779 | "source": [ 1780 | "After deduplication, check the dimensions of the count data and the gene annotation data to ensure that the count dataframe has the same number of rows (genes) as the gene annotation dataframe has rows (again, genes), and that the gene names are the same in both:" 1781 | ] 1782 | }, 1783 | { 1784 | "cell_type": "code", 1785 | "execution_count": 83, 1786 | "metadata": {}, 1787 | "outputs": [ 1788 | { 1789 | "data": { 1790 | "text/html": [ 1791 | "\n", 1796 | "
  1. 60605
  2. 12
\n" 1797 | ], 1798 | "text/latex": [ 1799 | "\\begin{enumerate*}\n", 1800 | "\\item 60605\n", 1801 | "\\item 12\n", 1802 | "\\end{enumerate*}\n" 1803 | ], 1804 | "text/markdown": [ 1805 | "1. 60605\n", 1806 | "2. 12\n", 1807 | "\n", 1808 | "\n" 1809 | ], 1810 | "text/plain": [ 1811 | "[1] 60605 12" 1812 | ] 1813 | }, 1814 | "metadata": {}, 1815 | "output_type": "display_data" 1816 | }, 1817 | { 1818 | "data": { 1819 | "text/html": [ 1820 | "\n", 1825 | "
  1. 60605
  2. 4
\n" 1826 | ], 1827 | "text/latex": [ 1828 | "\\begin{enumerate*}\n", 1829 | "\\item 60605\n", 1830 | "\\item 4\n", 1831 | "\\end{enumerate*}\n" 1832 | ], 1833 | "text/markdown": [ 1834 | "1. 60605\n", 1835 | "2. 4\n", 1836 | "\n", 1837 | "\n" 1838 | ], 1839 | "text/plain": [ 1840 | "[1] 60605 4" 1841 | ] 1842 | }, 1843 | "metadata": {}, 1844 | "output_type": "display_data" 1845 | }, 1846 | { 1847 | "data": { 1848 | "text/html": [ 1849 | "TRUE" 1850 | ], 1851 | "text/latex": [ 1852 | "TRUE" 1853 | ], 1854 | "text/markdown": [ 1855 | "TRUE" 1856 | ], 1857 | "text/plain": [ 1858 | "[1] TRUE" 1859 | ] 1860 | }, 1861 | "metadata": {}, 1862 | "output_type": "display_data" 1863 | } 1864 | ], 1865 | "source": [ 1866 | "dim(gDgeList$counts)\n", 1867 | "dim(gDeduplicatedGenesDf)\n", 1868 | "\n", 1869 | "all(rownames(gDgeList$counts) %in% gDeduplicatedGenesDf[[gGeneIdCol]])" 1870 | ] 1871 | }, 1872 | { 1873 | "cell_type": "markdown", 1874 | "metadata": {}, 1875 | "source": [ 1876 | "Add the annotation information to the DGEList object:" 1877 | ] 1878 | }, 1879 | { 1880 | "cell_type": "code", 1881 | "execution_count": 84, 1882 | "metadata": {}, 1883 | "outputs": [ 1884 | { 1885 | "data": { 1886 | "text/html": [ 1887 | "\n", 1892 | "
  1. 'counts'
  2. 'samples'
  3. 'genes'
\n" 1893 | ], 1894 | "text/latex": [ 1895 | "\\begin{enumerate*}\n", 1896 | "\\item 'counts'\n", 1897 | "\\item 'samples'\n", 1898 | "\\item 'genes'\n", 1899 | "\\end{enumerate*}\n" 1900 | ], 1901 | "text/markdown": [ 1902 | "1. 'counts'\n", 1903 | "2. 'samples'\n", 1904 | "3. 'genes'\n", 1905 | "\n", 1906 | "\n" 1907 | ], 1908 | "text/plain": [ 1909 | "[1] \"counts\" \"samples\" \"genes\" " 1910 | ] 1911 | }, 1912 | "metadata": {}, 1913 | "output_type": "display_data" 1914 | } 1915 | ], 1916 | "source": [ 1917 | "gDgeList$genes = gDeduplicatedGenesDf\n", 1918 | "names(gDgeList)" 1919 | ] 1920 | }, 1921 | { 1922 | "cell_type": "markdown", 1923 | "metadata": {}, 1924 | "source": [ 1925 | "As a sanity-check, look at representative content from the DGEList:" 1926 | ] 1927 | }, 1928 | { 1929 | "cell_type": "code", 1930 | "execution_count": 85, 1931 | "metadata": {}, 1932 | "outputs": [ 1933 | { 1934 | "data": { 1935 | "text/html": [ 1936 | "\n", 1937 | "\n", 1938 | "\n", 1939 | "\t\n", 1940 | "\n", 1941 | "\n", 1942 | "\t\n", 1943 | "\t\n", 1944 | "\t\n", 1945 | "\t\n", 1946 | "\t\n", 1947 | "\t\n", 1948 | "\n", 1949 | "
A matrix: 6 × 12 of type dbl
VK2_NanH2_PolyB25_2h_1VK2_NanH2_PolyB25_2h_2VK2_NanH2_PolyB25_2h_3VK2_NanH2_PolyB25_2h_4VK2_NanH2_PolyB25_1h_5VK2_NanH2_PolyB25_1h_6VK2_NanH2_PolyB25_1h_7VK2_NanH2_PolyB25_1h_8VK2_Pet28a_PolyB25_2h_9VK2_Pet28a_PolyB25_2h_10VK2_Pet28a_PolyB25_2h_11VK2_Pet28a_PolyB25_2h_12
ENSG00000000003 356.00 542.00 364.00 299.00 502.00 416.00 424.00 497.00 323.00 387.00 465.00 366.00
ENSG00000000005 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
ENSG000000004192080.113080.002206.801721.002340.002232.042253.002573.361904.052084.002412.422170.07
ENSG00000000457 113.88 119.35 85.84 102.65 153.65 94.84 112.33 130.23 119.77 98.34 120.36 91.56
ENSG00000000460 181.12 285.65 181.16 209.35 254.35 200.16 242.67 292.77 234.23 189.66 190.64 204.44
ENSG00000000938 4.00 21.00 10.00 18.00 10.00 6.00 6.00 9.00 10.00 13.00 17.00 13.00
\n" 1950 | ], 1951 | "text/latex": [ 1952 | "A matrix: 6 × 12 of type dbl\n", 1953 | "\\begin{tabular}{r|llllllllllll}\n", 1954 | " & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & VK2\\_NanH2\\_PolyB25\\_1h\\_7 & VK2\\_NanH2\\_PolyB25\\_1h\\_8 & VK2\\_Pet28a\\_PolyB25\\_2h\\_9 & VK2\\_Pet28a\\_PolyB25\\_2h\\_10 & VK2\\_Pet28a\\_PolyB25\\_2h\\_11 & VK2\\_Pet28a\\_PolyB25\\_2h\\_12\\\\\n", 1955 | "\\hline\n", 1956 | "\tENSG00000000003 & 356.00 & 542.00 & 364.00 & 299.00 & 502.00 & 416.00 & 424.00 & 497.00 & 323.00 & 387.00 & 465.00 & 366.00\\\\\n", 1957 | "\tENSG00000000005 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00\\\\\n", 1958 | "\tENSG00000000419 & 2080.11 & 3080.00 & 2206.80 & 1721.00 & 2340.00 & 2232.04 & 2253.00 & 2573.36 & 1904.05 & 2084.00 & 2412.42 & 2170.07\\\\\n", 1959 | "\tENSG00000000457 & 113.88 & 119.35 & 85.84 & 102.65 & 153.65 & 94.84 & 112.33 & 130.23 & 119.77 & 98.34 & 120.36 & 91.56\\\\\n", 1960 | "\tENSG00000000460 & 181.12 & 285.65 & 181.16 & 209.35 & 254.35 & 200.16 & 242.67 & 292.77 & 234.23 & 189.66 & 190.64 & 204.44\\\\\n", 1961 | "\tENSG00000000938 & 4.00 & 21.00 & 10.00 & 18.00 & 10.00 & 6.00 & 6.00 & 9.00 & 10.00 & 13.00 & 17.00 & 13.00\\\\\n", 1962 | "\\end{tabular}\n" 1963 | ], 1964 | "text/markdown": [ 1965 | "\n", 1966 | "A matrix: 6 × 12 of type dbl\n", 1967 | "\n", 1968 | "| | VK2_NanH2_PolyB25_2h_1 | VK2_NanH2_PolyB25_2h_2 | VK2_NanH2_PolyB25_2h_3 | VK2_NanH2_PolyB25_2h_4 | VK2_NanH2_PolyB25_1h_5 | VK2_NanH2_PolyB25_1h_6 | VK2_NanH2_PolyB25_1h_7 | VK2_NanH2_PolyB25_1h_8 | VK2_Pet28a_PolyB25_2h_9 | VK2_Pet28a_PolyB25_2h_10 | VK2_Pet28a_PolyB25_2h_11 | VK2_Pet28a_PolyB25_2h_12 |\n", 1969 | "|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", 1970 | "| ENSG00000000003 | 356.00 | 542.00 | 364.00 | 299.00 | 502.00 | 416.00 | 424.00 | 497.00 
| 323.00 | 387.00 | 465.00 | 366.00 |\n", 1971 | "| ENSG00000000005 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |\n", 1972 | "| ENSG00000000419 | 2080.11 | 3080.00 | 2206.80 | 1721.00 | 2340.00 | 2232.04 | 2253.00 | 2573.36 | 1904.05 | 2084.00 | 2412.42 | 2170.07 |\n", 1973 | "| ENSG00000000457 | 113.88 | 119.35 | 85.84 | 102.65 | 153.65 | 94.84 | 112.33 | 130.23 | 119.77 | 98.34 | 120.36 | 91.56 |\n", 1974 | "| ENSG00000000460 | 181.12 | 285.65 | 181.16 | 209.35 | 254.35 | 200.16 | 242.67 | 292.77 | 234.23 | 189.66 | 190.64 | 204.44 |\n", 1975 | "| ENSG00000000938 | 4.00 | 21.00 | 10.00 | 18.00 | 10.00 | 6.00 | 6.00 | 9.00 | 10.00 | 13.00 | 17.00 | 13.00 |\n", 1976 | "\n" 1977 | ], 1978 | "text/plain": [ 1979 | " VK2_NanH2_PolyB25_2h_1 VK2_NanH2_PolyB25_2h_2\n", 1980 | "ENSG00000000003 356.00 542.00 \n", 1981 | "ENSG00000000005 0.00 0.00 \n", 1982 | "ENSG00000000419 2080.11 3080.00 \n", 1983 | "ENSG00000000457 113.88 119.35 \n", 1984 | "ENSG00000000460 181.12 285.65 \n", 1985 | "ENSG00000000938 4.00 21.00 \n", 1986 | " VK2_NanH2_PolyB25_2h_3 VK2_NanH2_PolyB25_2h_4\n", 1987 | "ENSG00000000003 364.00 299.00 \n", 1988 | "ENSG00000000005 0.00 0.00 \n", 1989 | "ENSG00000000419 2206.80 1721.00 \n", 1990 | "ENSG00000000457 85.84 102.65 \n", 1991 | "ENSG00000000460 181.16 209.35 \n", 1992 | "ENSG00000000938 10.00 18.00 \n", 1993 | " VK2_NanH2_PolyB25_1h_5 VK2_NanH2_PolyB25_1h_6\n", 1994 | "ENSG00000000003 502.00 416.00 \n", 1995 | "ENSG00000000005 0.00 0.00 \n", 1996 | "ENSG00000000419 2340.00 2232.04 \n", 1997 | "ENSG00000000457 153.65 94.84 \n", 1998 | "ENSG00000000460 254.35 200.16 \n", 1999 | "ENSG00000000938 10.00 6.00 \n", 2000 | " VK2_NanH2_PolyB25_1h_7 VK2_NanH2_PolyB25_1h_8\n", 2001 | "ENSG00000000003 424.00 497.00 \n", 2002 | "ENSG00000000005 0.00 0.00 \n", 2003 | "ENSG00000000419 2253.00 2573.36 \n", 2004 | "ENSG00000000457 112.33 130.23 \n", 2005 | "ENSG00000000460 242.67 292.77 \n", 2006 | "ENSG00000000938 6.00 9.00 
\n", 2007 | " VK2_Pet28a_PolyB25_2h_9 VK2_Pet28a_PolyB25_2h_10\n", 2008 | "ENSG00000000003 323.00 387.00 \n", 2009 | "ENSG00000000005 0.00 0.00 \n", 2010 | "ENSG00000000419 1904.05 2084.00 \n", 2011 | "ENSG00000000457 119.77 98.34 \n", 2012 | "ENSG00000000460 234.23 189.66 \n", 2013 | "ENSG00000000938 10.00 13.00 \n", 2014 | " VK2_Pet28a_PolyB25_2h_11 VK2_Pet28a_PolyB25_2h_12\n", 2015 | "ENSG00000000003 465.00 366.00 \n", 2016 | "ENSG00000000005 0.00 0.00 \n", 2017 | "ENSG00000000419 2412.42 2170.07 \n", 2018 | "ENSG00000000457 120.36 91.56 \n", 2019 | "ENSG00000000460 190.64 204.44 \n", 2020 | "ENSG00000000938 17.00 13.00 " 2021 | ] 2022 | }, 2023 | "metadata": {}, 2024 | "output_type": "display_data" 2025 | }, 2026 | { 2027 | "data": { 2028 | "text/html": [ 2029 | "\n", 2030 | "\n", 2031 | "\n", 2032 | "\t\n", 2033 | "\t\n", 2034 | "\n", 2035 | "\n", 2036 | "\t\n", 2037 | "\t\n", 2038 | "\t\n", 2039 | "\t\n", 2040 | "\t\n", 2041 | "\t\n", 2042 | "\n", 2043 | "
A data.frame: 6 × 10
grouplib.sizenorm.factorsSample.NameSample.CodeReference.Genome..Organism.GenderConditionTime.pointCondition_Time
<fct><dbl><dbl><chr><chr><chr><chr><chr><chr><chr>
VK2_NanH2_PolyB25_2h_1NanH2_treated_2H224908891VK2_NanH2_PolyB25_2h_1K001HumanFemaleNanH2_treated2HNanH2_treated_2H
VK2_NanH2_PolyB25_2h_2NanH2_treated_2H312324801VK2_NanH2_PolyB25_2h_2K002HumanFemaleNanH2_treated2HNanH2_treated_2H
VK2_NanH2_PolyB25_2h_3NanH2_treated_2H219013421VK2_NanH2_PolyB25_2h_3K003HumanFemaleNanH2_treated2HNanH2_treated_2H
VK2_NanH2_PolyB25_2h_4NanH2_treated_2H196371141VK2_NanH2_PolyB25_2h_4K004HumanFemaleNanH2_treated2HNanH2_treated_2H
VK2_NanH2_PolyB25_1h_5NanH2_treated_1H274680161VK2_NanH2_PolyB25_1h_5K005HumanFemaleNanH2_treated1HNanH2_treated_1H
VK2_NanH2_PolyB25_1h_6NanH2_treated_1H249358031VK2_NanH2_PolyB25_1h_6K006HumanFemaleNanH2_treated1HNanH2_treated_1H
\n" 2044 | ], 2045 | "text/latex": [ 2046 | "A data.frame: 6 × 10\n", 2047 | "\\begin{tabular}{r|llllllllll}\n", 2048 | " & group & lib.size & norm.factors & Sample.Name & Sample.Code & Reference.Genome..Organism. & Gender & Condition & Time.point & Condition\\_Time\\\\\n", 2049 | " & & & & & & & & & & \\\\\n", 2050 | "\\hline\n", 2051 | "\tVK2\\_NanH2\\_PolyB25\\_2h\\_1 & NanH2\\_treated\\_2H & 22490889 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & K001 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n", 2052 | "\tVK2\\_NanH2\\_PolyB25\\_2h\\_2 & NanH2\\_treated\\_2H & 31232480 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & K002 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n", 2053 | "\tVK2\\_NanH2\\_PolyB25\\_2h\\_3 & NanH2\\_treated\\_2H & 21901342 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & K003 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n", 2054 | "\tVK2\\_NanH2\\_PolyB25\\_2h\\_4 & NanH2\\_treated\\_2H & 19637114 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & K004 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n", 2055 | "\tVK2\\_NanH2\\_PolyB25\\_1h\\_5 & NanH2\\_treated\\_1H & 27468016 & 1 & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & K005 & Human & Female & NanH2\\_treated & 1H & NanH2\\_treated\\_1H\\\\\n", 2056 | "\tVK2\\_NanH2\\_PolyB25\\_1h\\_6 & NanH2\\_treated\\_1H & 24935803 & 1 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & K006 & Human & Female & NanH2\\_treated & 1H & NanH2\\_treated\\_1H\\\\\n", 2057 | "\\end{tabular}\n" 2058 | ], 2059 | "text/markdown": [ 2060 | "\n", 2061 | "A data.frame: 6 × 10\n", 2062 | "\n", 2063 | "| | group <fct> | lib.size <dbl> | norm.factors <dbl> | Sample.Name <chr> | Sample.Code <chr> | Reference.Genome..Organism. 
<chr> | Gender <chr> | Condition <chr> | Time.point <chr> | Condition_Time <chr> |\n", 2064 | "|---|---|---|---|---|---|---|---|---|---|---|\n", 2065 | "| VK2_NanH2_PolyB25_2h_1 | NanH2_treated_2H | 22490889 | 1 | VK2_NanH2_PolyB25_2h_1 | K001 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n", 2066 | "| VK2_NanH2_PolyB25_2h_2 | NanH2_treated_2H | 31232480 | 1 | VK2_NanH2_PolyB25_2h_2 | K002 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n", 2067 | "| VK2_NanH2_PolyB25_2h_3 | NanH2_treated_2H | 21901342 | 1 | VK2_NanH2_PolyB25_2h_3 | K003 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n", 2068 | "| VK2_NanH2_PolyB25_2h_4 | NanH2_treated_2H | 19637114 | 1 | VK2_NanH2_PolyB25_2h_4 | K004 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n", 2069 | "| VK2_NanH2_PolyB25_1h_5 | NanH2_treated_1H | 27468016 | 1 | VK2_NanH2_PolyB25_1h_5 | K005 | Human | Female | NanH2_treated | 1H | NanH2_treated_1H |\n", 2070 | "| VK2_NanH2_PolyB25_1h_6 | NanH2_treated_1H | 24935803 | 1 | VK2_NanH2_PolyB25_1h_6 | K006 | Human | Female | NanH2_treated | 1H | NanH2_treated_1H |\n", 2071 | "\n" 2072 | ], 2073 | "text/plain": [ 2074 | " group lib.size norm.factors\n", 2075 | "VK2_NanH2_PolyB25_2h_1 NanH2_treated_2H 22490889 1 \n", 2076 | "VK2_NanH2_PolyB25_2h_2 NanH2_treated_2H 31232480 1 \n", 2077 | "VK2_NanH2_PolyB25_2h_3 NanH2_treated_2H 21901342 1 \n", 2078 | "VK2_NanH2_PolyB25_2h_4 NanH2_treated_2H 19637114 1 \n", 2079 | "VK2_NanH2_PolyB25_1h_5 NanH2_treated_1H 27468016 1 \n", 2080 | "VK2_NanH2_PolyB25_1h_6 NanH2_treated_1H 24935803 1 \n", 2081 | " Sample.Name Sample.Code\n", 2082 | "VK2_NanH2_PolyB25_2h_1 VK2_NanH2_PolyB25_2h_1 K001 \n", 2083 | "VK2_NanH2_PolyB25_2h_2 VK2_NanH2_PolyB25_2h_2 K002 \n", 2084 | "VK2_NanH2_PolyB25_2h_3 VK2_NanH2_PolyB25_2h_3 K003 \n", 2085 | "VK2_NanH2_PolyB25_2h_4 VK2_NanH2_PolyB25_2h_4 K004 \n", 2086 | "VK2_NanH2_PolyB25_1h_5 VK2_NanH2_PolyB25_1h_5 K005 \n", 2087 | "VK2_NanH2_PolyB25_1h_6 VK2_NanH2_PolyB25_1h_6 
K006 \n", 2088 | " Reference.Genome..Organism. Gender Condition \n", 2089 | "VK2_NanH2_PolyB25_2h_1 Human Female NanH2_treated\n", 2090 | "VK2_NanH2_PolyB25_2h_2 Human Female NanH2_treated\n", 2091 | "VK2_NanH2_PolyB25_2h_3 Human Female NanH2_treated\n", 2092 | "VK2_NanH2_PolyB25_2h_4 Human Female NanH2_treated\n", 2093 | "VK2_NanH2_PolyB25_1h_5 Human Female NanH2_treated\n", 2094 | "VK2_NanH2_PolyB25_1h_6 Human Female NanH2_treated\n", 2095 | " Time.point Condition_Time \n", 2096 | "VK2_NanH2_PolyB25_2h_1 2H NanH2_treated_2H\n", 2097 | "VK2_NanH2_PolyB25_2h_2 2H NanH2_treated_2H\n", 2098 | "VK2_NanH2_PolyB25_2h_3 2H NanH2_treated_2H\n", 2099 | "VK2_NanH2_PolyB25_2h_4 2H NanH2_treated_2H\n", 2100 | "VK2_NanH2_PolyB25_1h_5 1H NanH2_treated_1H\n", 2101 | "VK2_NanH2_PolyB25_1h_6 1H NanH2_treated_1H" 2102 | ] 2103 | }, 2104 | "metadata": {}, 2105 | "output_type": "display_data" 2106 | }, 2107 | { 2108 | "data": { 2109 | "text/html": [ 2110 | "\n", 2111 | "\n", 2112 | "\n", 2113 | "\t\n", 2114 | "\t\n", 2115 | "\n", 2116 | "\n", 2117 | "\t\n", 2118 | "\t\n", 2119 | "\t\n", 2120 | "\t\n", 2121 | "\t\n", 2122 | "\t\n", 2123 | "\n", 2124 | "
A data.frame: 6 × 4
ENSEMBLENTREZIDSYMBOLgene_type
<chr><chr><chr><chr>
1ENSG000000000037105 TSPAN6 protein_coding
2ENSG0000000000564102TNMD protein_coding
3ENSG000000004198813 DPM1 protein_coding
4ENSG0000000045757147SCYL3 protein_coding
5ENSG0000000046055732C1orf112protein_coding
6ENSG000000009382268 FGR protein_coding
\n" 2125 | ], 2126 | "text/latex": [ 2127 | "A data.frame: 6 × 4\n", 2128 | "\\begin{tabular}{r|llll}\n", 2129 | " & ENSEMBL & ENTREZID & SYMBOL & gene\\_type\\\\\n", 2130 | " & & & & \\\\\n", 2131 | "\\hline\n", 2132 | "\t1 & ENSG00000000003 & 7105 & TSPAN6 & protein\\_coding\\\\\n", 2133 | "\t2 & ENSG00000000005 & 64102 & TNMD & protein\\_coding\\\\\n", 2134 | "\t3 & ENSG00000000419 & 8813 & DPM1 & protein\\_coding\\\\\n", 2135 | "\t4 & ENSG00000000457 & 57147 & SCYL3 & protein\\_coding\\\\\n", 2136 | "\t5 & ENSG00000000460 & 55732 & C1orf112 & protein\\_coding\\\\\n", 2137 | "\t6 & ENSG00000000938 & 2268 & FGR & protein\\_coding\\\\\n", 2138 | "\\end{tabular}\n" 2139 | ], 2140 | "text/markdown": [ 2141 | "\n", 2142 | "A data.frame: 6 × 4\n", 2143 | "\n", 2144 | "| | ENSEMBL <chr> | ENTREZID <chr> | SYMBOL <chr> | gene_type <chr> |\n", 2145 | "|---|---|---|---|---|\n", 2146 | "| 1 | ENSG00000000003 | 7105 | TSPAN6 | protein_coding |\n", 2147 | "| 2 | ENSG00000000005 | 64102 | TNMD | protein_coding |\n", 2148 | "| 3 | ENSG00000000419 | 8813 | DPM1 | protein_coding |\n", 2149 | "| 4 | ENSG00000000457 | 57147 | SCYL3 | protein_coding |\n", 2150 | "| 5 | ENSG00000000460 | 55732 | C1orf112 | protein_coding |\n", 2151 | "| 6 | ENSG00000000938 | 2268 | FGR | protein_coding |\n", 2152 | "\n" 2153 | ], 2154 | "text/plain": [ 2155 | " ENSEMBL ENTREZID SYMBOL gene_type \n", 2156 | "1 ENSG00000000003 7105 TSPAN6 protein_coding\n", 2157 | "2 ENSG00000000005 64102 TNMD protein_coding\n", 2158 | "3 ENSG00000000419 8813 DPM1 protein_coding\n", 2159 | "4 ENSG00000000457 57147 SCYL3 protein_coding\n", 2160 | "5 ENSG00000000460 55732 C1orf112 protein_coding\n", 2161 | "6 ENSG00000000938 2268 FGR protein_coding" 2162 | ] 2163 | }, 2164 | "metadata": {}, 2165 | "output_type": "display_data" 2166 | } 2167 | ], 2168 | "source": [ 2169 | "head(gDgeList$counts)\n", 2170 | "head(gDgeList$samples)\n", 2171 | "head(gDgeList$genes)" 2172 | ] 2173 | }, 2174 | { 2175 | "cell_type": "markdown", 
2176 | "metadata": {}, 2177 | "source": [ 2178 | "[Table of Contents](#Table-of-Contents)\n", 2179 | "\n", 2180 | "## Summary" 2181 | ] 2182 | }, 2183 | { 2184 | "cell_type": "markdown", 2185 | "metadata": {}, 2186 | "source": [ 2187 | "**Gene annotations**\n", 2188 | "* Human gene annotations were taken from the Gencode project, Release 43 (GRCh38.p13).\n", 2189 | "\n", 2190 | "**Gene type filtering**\n", 2191 | "* This analysis includes protein-coding genes and non-coding genes. Of the original 60,605 Ensembl genes in the dataset, 19,937 are known coding genes. \n" 2192 | ] 2193 | }, 2194 | { 2195 | "cell_type": "markdown", 2196 | "metadata": {}, 2197 | "source": [ 2198 | "Save the workspace objects for future reference:" 2199 | ] 2200 | }, 2201 | { 2202 | "cell_type": "code", 2203 | "execution_count": 86, 2204 | "metadata": {}, 2205 | "outputs": [ 2206 | { 2207 | "name": "stdout", 2208 | "output_type": "stream", 2209 | "text": [ 2210 | "[1] \"Output file: 20230302161642_data_integration.RData\"\n" 2211 | ] 2212 | } 2213 | ], 2214 | "source": [ 2215 | "writeWorkspaceImage(gInterimDir, paste0(gRunName,\"_data_integration\"))" 2216 | ] 2217 | }, 2218 | { 2219 | "cell_type": "markdown", 2220 | "metadata": {}, 2221 | "source": [ 2222 | "[Table of Contents](#Table-of-Contents)\n", 2223 | "\n", 2224 | "## Citations\n", 2225 | "\n", 2226 | "1. Law CW, Alhamdoosh M, Su S, Smyth GK, Ritchie ME. RNA-seq analysis is easy as 1-2-3 with limma, Glimma and edgeR. Version 2. F1000Res. 2016 Jun 17 [revised 2016 Jan 1];5:1408.\n", 2227 | "2. Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. Bioinformatics 26, 139-140.\n", 2228 | "3. 
Huber W, Carey VJ, Gentleman R, Anders S, Carlson M, Carvalho BS, Bravo HC, Davis S, Gatto L, Girke T, Gottardo R, Hahne F, Hansen KD, Irizarry RA, Lawrence M, Love MI, MacDonald J, Obenchain V, Oleś AK, Pagès H, Reyes A, Shannon P, Smyth GK, Tenenbaum D, Waldron L, Morgan M. Orchestrating high-throughput genomic analysis with Bioconductor. Nat Methods. 2015 Feb;12(2):115-21.\n", 2229 | "4. R Core Team (2016). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/." 2230 | ] 2231 | }, 2232 | { 2233 | "cell_type": "markdown", 2234 | "metadata": {}, 2235 | "source": [ 2236 | "[Table of Contents](#Table-of-Contents)\n", 2237 | "\n", 2238 | "## Appendix: R Session Info" 2239 | ] 2240 | }, 2241 | { 2242 | "cell_type": "code", 2243 | "execution_count": 87, 2244 | "metadata": {}, 2245 | "outputs": [ 2246 | { 2247 | "data": { 2248 | "text/plain": [ 2249 | "[1] \"2023-03-03 10:48:37 MST\"" 2250 | ] 2251 | }, 2252 | "metadata": {}, 2253 | "output_type": "display_data" 2254 | }, 2255 | { 2256 | "data": { 2257 | "text/plain": [ 2258 | "R version 4.1.3 (2022-03-10)\n", 2259 | "Platform: x86_64-apple-darwin13.4.0 (64-bit)\n", 2260 | "Running under: macOS Big Sur/Monterey 10.16\n", 2261 | "\n", 2262 | "Matrix products: default\n", 2263 | "BLAS/LAPACK: /Users/dchilinfuentes/opt/anaconda3/envs/RNAseq_env/lib/libopenblasp-r0.3.20.dylib\n", 2264 | "\n", 2265 | "locale:\n", 2266 | "[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8\n", 2267 | "\n", 2268 | "attached base packages:\n", 2269 | "[1] stats4 stats graphics grDevices utils datasets methods \n", 2270 | "[8] base \n", 2271 | "\n", 2272 | "other attached packages:\n", 2273 | " [1] edgeR_3.36.0 \n", 2274 | " [2] limma_3.50.3 \n", 2275 | " [3] Homo.sapiens_1.3.1 \n", 2276 | " [4] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2\n", 2277 | " [5] org.Hs.eg.db_3.14.0 \n", 2278 | " [6] GO.db_3.14.0 \n", 2279 | " [7] OrganismDbi_1.36.0 \n", 
2280 | " [8] GenomicFeatures_1.46.5 \n", 2281 | " [9] GenomicRanges_1.46.1 \n", 2282 | "[10] GenomeInfoDb_1.30.1 \n", 2283 | "[11] AnnotationDbi_1.56.2 \n", 2284 | "[12] IRanges_2.28.0 \n", 2285 | "[13] S4Vectors_0.32.4 \n", 2286 | "[14] Biobase_2.54.0 \n", 2287 | "[15] BiocGenerics_0.40.0 \n", 2288 | "\n", 2289 | "loaded via a namespace (and not attached):\n", 2290 | " [1] MatrixGenerics_1.6.0 httr_1.4.3 \n", 2291 | " [3] bit64_4.0.5 jsonlite_1.8.0 \n", 2292 | " [5] assertthat_0.2.1 BiocManager_1.30.18 \n", 2293 | " [7] BiocFileCache_2.2.1 RBGL_1.70.0 \n", 2294 | " [9] blob_1.2.3 GenomeInfoDbData_1.2.7 \n", 2295 | "[11] Rsamtools_2.10.0 yaml_2.3.5 \n", 2296 | "[13] progress_1.2.2 pillar_1.8.0 \n", 2297 | "[15] RSQLite_2.2.15 lattice_0.20-45 \n", 2298 | "[17] glue_1.6.2 uuid_1.1-0 \n", 2299 | "[19] digest_0.6.29 XVector_0.34.0 \n", 2300 | "[21] htmltools_0.5.3 Matrix_1.4-1 \n", 2301 | "[23] XML_3.99-0.10 pkgconfig_2.0.3 \n", 2302 | "[25] biomaRt_2.50.3 zlibbioc_1.40.0 \n", 2303 | "[27] purrr_0.3.4 BiocParallel_1.28.3 \n", 2304 | "[29] tibble_3.1.8 KEGGREST_1.34.0 \n", 2305 | "[31] generics_0.1.3 ellipsis_0.3.2 \n", 2306 | "[33] cachem_1.0.6 SummarizedExperiment_1.24.0\n", 2307 | "[35] repr_1.1.4 cli_3.3.0 \n", 2308 | "[37] magrittr_2.0.3 crayon_1.5.1 \n", 2309 | "[39] memoise_2.0.1 evaluate_0.16 \n", 2310 | "[41] fansi_1.0.3 xml2_1.3.3 \n", 2311 | "[43] graph_1.72.0 tools_4.1.3 \n", 2312 | "[45] prettyunits_1.1.1 hms_1.1.1 \n", 2313 | "[47] BiocIO_1.4.0 lifecycle_1.0.1 \n", 2314 | "[49] matrixStats_0.62.0 stringr_1.4.0 \n", 2315 | "[51] locfit_1.5-9.6 DelayedArray_0.20.0 \n", 2316 | "[53] Biostrings_2.62.0 compiler_4.1.3 \n", 2317 | "[55] rlang_1.0.4 grid_4.1.3 \n", 2318 | "[57] RCurl_1.98-1.8 pbdZMQ_0.3-7 \n", 2319 | "[59] IRkernel_1.3 rjson_0.2.21 \n", 2320 | "[61] rappdirs_0.3.3 bitops_1.0-7 \n", 2321 | "[63] base64enc_0.1-3 restfulr_0.0.15 \n", 2322 | "[65] DBI_1.1.3 curl_4.3.2 \n", 2323 | "[67] R6_2.5.1 GenomicAlignments_1.30.0 \n", 2324 | "[69] dplyr_1.0.9 
rtracklayer_1.54.0 \n", 2325 | "[71] fastmap_1.1.0 bit_4.0.4 \n", 2326 | "[73] utf8_1.2.2 filelock_1.0.2 \n", 2327 | "[75] stringi_1.7.8 parallel_4.1.3 \n", 2328 | "[77] IRdisplay_1.1 Rcpp_1.0.9 \n", 2329 | "[79] vctrs_0.4.1 png_0.1-7 \n", 2330 | "[81] dbplyr_2.2.1 tidyselect_1.1.2 " 2331 | ] 2332 | }, 2333 | "metadata": {}, 2334 | "output_type": "display_data" 2335 | } 2336 | ], 2337 | "source": [ 2338 | "Sys.time()\n", 2339 | "sessionInfo()" 2340 | ] 2341 | }, 2342 | { 2343 | "cell_type": "markdown", 2344 | "metadata": {}, 2345 | "source": [ 2346 | "[Table of Contents](#Table-of-Contents)\n", 2347 | "\n", 2348 | "Copyright (c) 2018 UC San Diego Center for Computational Biology & Bioinformatics under the MIT License\n", 2349 | "\n", 2350 | "Notebook template by Amanda Birmingham" 2351 | ] 2352 | } 2353 | ], 2354 | "metadata": { 2355 | "kernelspec": { 2356 | "display_name": "R", 2357 | "language": "R", 2358 | "name": "ir" 2359 | }, 2360 | "language_info": { 2361 | "codemirror_mode": "r", 2362 | "file_extension": ".r", 2363 | "mimetype": "text/x-r-source", 2364 | "name": "R", 2365 | "pygments_lexer": "r", 2366 | "version": "4.1.3" 2367 | } 2368 | }, 2369 | "nbformat": 4, 2370 | "nbformat_minor": 2 2371 | } 2372 | --------------------------------------------------------------------------------