├── secondary_analysis_scripts
    ├── README.txt
    ├── src
    │   ├── ChainedNotebookSupport.R
    │   ├── CountsPcaPlotter.R
    │   └── PcaPlotter.R
    └── 1_RNASeq_Count_Metadata_Annotation_Load_and_Integration.ipynb
├── reference
    ├── README.txt
    ├── Homo_sapiens_GRCh38p13_gencodev38_ANNOT.Rdata
    └── KavitaSamplesMetadata.tsv
├── README.md
└── primary_analysis_scripts
    ├── ReadMe.md
    ├── submit_dl_counts.sh
    ├── submit_dl_fastqs.sh
    ├── calculate_counts
        ├── calculate_counts.sh
        ├── RSEM_gene_parser.py
        ├── RSEM_isoform_parser.py
        └── RSEM_count_parser.py
    ├── submit_humanPE.sh
    ├── dl_counts.sh
    └── run_human_PE_aws.sh


/secondary_analysis_scripts/README.txt:
--------------------------------------------------------------------------------
1 | This directory holds Jupyter notebooks and associated source scripts used to perform the analysis.


--------------------------------------------------------------------------------
/reference/README.txt:
--------------------------------------------------------------------------------
1 | This directory holds files referenced by the analysis, such as papers, metadata files, Rdata annotation files, etc.
2 | 


--------------------------------------------------------------------------------
/reference/Homo_sapiens_GRCh38p13_gencodev38_ANNOT.Rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucsd-ccbb/VK2-vaginal-epithelial-cell-RNA-seq-analysis/main/reference/Homo_sapiens_GRCh38p13_gencodev38_ANNOT.Rdata


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # VK2 vaginal epithelial cell RNA seq analysis
2 | 
3 | Comprehensive Jupyter notebooks and supporting code to reproduce the analysis from the paper 'Resident microbes shape the vaginal epithelial glycan landscape' (link coming soon)
4 | 


--------------------------------------------------------------------------------
/primary_analysis_scripts/ReadMe.md:
--------------------------------------------------------------------------------
1 | This folder contains scripts that were used for the following:
2 | 
3 | 1. Downloading fastq files (*submit_dl_fastqs.sh*)
4 | 2. Running QC, trim, and alignment (*submit_humanPE.sh*)
5 | 3. Downloading results and creating a counts matrix file (*submit_dl_counts.sh* and *calculate_counts/calculate_counts.sh*)
6 | 


--------------------------------------------------------------------------------
/primary_analysis_scripts/submit_dl_counts.sh:
--------------------------------------------------------------------------------
 1 | sh dl_counts.sh	VK2_NanH2_PolyB25_1h_5	
 2 | sh dl_counts.sh	VK2_NanH2_PolyB25_1h_6	
 3 | sh dl_counts.sh	VK2_NanH2_PolyB25_1h_7	
 4 | sh dl_counts.sh	VK2_NanH2_PolyB25_1h_8	
 5 | sh dl_counts.sh	VK2_NanH2_PolyB25_2h_1	
 6 | sh dl_counts.sh	VK2_NanH2_PolyB25_2h_2	
 7 | sh dl_counts.sh	VK2_NanH2_PolyB25_2h_3	
 8 | sh dl_counts.sh	VK2_NanH2_PolyB25_2h_4
 9 | sh dl_counts.sh	VK2_Pet28a_PolyB25_2h_9	
10 | sh dl_counts.sh	VK2_Pet28a_PolyB25_2h_10	
11 | sh dl_counts.sh	VK2_Pet28a_PolyB25_2h_11	
12 | sh dl_counts.sh	VK2_Pet28a_PolyB25_2h_12	


--------------------------------------------------------------------------------
/primary_analysis_scripts/submit_dl_fastqs.sh:
--------------------------------------------------------------------------------
 1 | sh dl_fastqs_igm.sh	VK2_NanH2_PolyB25_1h_5	S41
 2 | sh dl_fastqs_igm.sh	VK2_NanH2_PolyB25_1h_6	S42
 3 | sh dl_fastqs_igm.sh	VK2_NanH2_PolyB25_1h_7	S43
 4 | sh dl_fastqs_igm.sh	VK2_NanH2_PolyB25_1h_8	S44
 5 | sh dl_fastqs_igm.sh	VK2_NanH2_PolyB25_2h_1	S37
 6 | sh dl_fastqs_igm.sh	VK2_NanH2_PolyB25_2h_2	S38
 7 | sh dl_fastqs_igm.sh	VK2_NanH2_PolyB25_2h_3	S39
 8 | sh dl_fastqs_igm.sh	VK2_NanH2_PolyB25_2h_4	S40
 9 | sh dl_fastqs_igm.sh	VK2_Pet28a_PolyB25_2h_9	S45
10 | sh dl_fastqs_igm.sh	VK2_Pet28a_PolyB25_2h_10	S46
11 | sh dl_fastqs_igm.sh	VK2_Pet28a_PolyB25_2h_11	S47
12 | sh dl_fastqs_igm.sh	VK2_Pet28a_PolyB25_2h_12	S48


--------------------------------------------------------------------------------
/primary_analysis_scripts/calculate_counts/calculate_counts.sh:
--------------------------------------------------------------------------------
# Build the combined alignment-count, gene and isoform tables by running the
# three RSEM parsers, in order, against the downloaded primary-analysis data.
project_dir=/Volumes/TOSHIBA-EXT/UCSD-CCBB/2023/20230207_Agarwal_Lewis_Human_Bulk_RNA-Seq_Differential_Expression
data_dir=$project_dir/primary_analysis
parser_dir=$project_dir/primary_scripts/calculate_counts

for parser in RSEM_count_parser.py RSEM_gene_parser.py RSEM_isoform_parser.py
do
    python "$parser_dir/$parser" "$data_dir"
done

6 | 


--------------------------------------------------------------------------------
/primary_analysis_scripts/submit_humanPE.sh:
--------------------------------------------------------------------------------
 1 | sbatch -n 16 run_human_PE_aws.sh	VK2_NanH2_PolyB25_1h_5	S41
 2 | sbatch -n 16 run_human_PE_aws.sh	VK2_NanH2_PolyB25_1h_6	S42
 3 | sbatch -n 16 run_human_PE_aws.sh	VK2_NanH2_PolyB25_1h_7	S43
 4 | sbatch -n 16 run_human_PE_aws.sh	VK2_NanH2_PolyB25_1h_8	S44
 5 | sbatch -n 16 run_human_PE_aws.sh	VK2_NanH2_PolyB25_2h_1	S37
 6 | sbatch -n 16 run_human_PE_aws.sh	VK2_NanH2_PolyB25_2h_2	S38
 7 | sbatch -n 16 run_human_PE_aws.sh	VK2_NanH2_PolyB25_2h_3	S39
 8 | sbatch -n 16 run_human_PE_aws.sh	VK2_NanH2_PolyB25_2h_4	S40
 9 | sbatch -n 16 run_human_PE_aws.sh	VK2_Pet28a_PolyB25_2h_9	S45
10 | sbatch -n 16 run_human_PE_aws.sh	VK2_Pet28a_PolyB25_2h_10	S46
11 | sbatch -n 16 run_human_PE_aws.sh	VK2_Pet28a_PolyB25_2h_11	S47
12 | sbatch -n 16 run_human_PE_aws.sh	VK2_Pet28a_PolyB25_2h_12	S48


--------------------------------------------------------------------------------
/reference/KavitaSamplesMetadata.tsv:
--------------------------------------------------------------------------------
 1 | Sample Name	Sample Code	Reference Genome (Organism)	Gender	Condition	Time point
 2 | VK2_NanH2_PolyB25_2h_1	K001	Human	Female	NanH2 treated	2 hour
 3 | VK2_NanH2_PolyB25_2h_2	K002	Human	Female	NanH2 treated	2 hour
 4 | VK2_NanH2_PolyB25_2h_3	K003	Human	Female	NanH2 treated	2 hour
 5 | VK2_NanH2_PolyB25_2h_4	K004	Human	Female	NanH2 treated	2 hour
 6 | VK2_NanH2_PolyB25_1h_5	K005	Human	Female	NanH2 treated	1 hour
 7 | VK2_NanH2_PolyB25_1h_6	K006	Human	Female	NanH2 treated	1 hour
 8 | VK2_NanH2_PolyB25_1h_7	K007	Human	Female	NanH2 treated	1 hour
 9 | VK2_NanH2_PolyB25_1h_8	K008	Human	Female	NanH2 treated	1 hour
10 | VK2_Pet28a_PolyB25_2h_9	K009	Human	Female	Vector treated	2 hour
11 | VK2_Pet28a_PolyB25_2h_10	K010	Human	Female	Vector treated	2 hour
12 | VK2_Pet28a_PolyB25_2h_11	K011	Human	Female	Vector treated	2 hour
13 | VK2_Pet28a_PolyB25_2h_12	K012	Human	Female	Vector treated	2 hour


--------------------------------------------------------------------------------
/secondary_analysis_scripts/src/ChainedNotebookSupport.R:
--------------------------------------------------------------------------------
 1 | # ---------------------------------------------------------------------------------
 2 | # Copyright (c) 2018 UC San Diego Center for Computational Biology & Bioinformatics
 3 | #
 4 | # Distributed under the terms of the MIT License.
 5 | #
 6 | # The full license is in the file LICENSE, distributed with this software.
 7 | # ----------------------------------------------------------------------------------
 8 | # Initial author: Amanda Birmingham
 9 | 
makeRunName = function(gProjectName, gStepName){
	# Build a run identifier of the form "<project>_<step>_<YYYYmmddHHMMSS>"
	# using the current local time.
	#
	# gProjectName: project label placed first in the name
	# gStepName:    analysis-step label placed second in the name
	# Returns a single character string.
	#
	# format() emits the digits-only timestamp directly, rather than parsing
	# and re-printing the time through implicit character conversions whose
	# default text form is not the same in every R version.
	timestamp = format(Sys.time(), "%Y%m%d%H%M%S")
	return(paste0(gProjectName, "_", gStepName, "_", timestamp))
}
13 | 
writeWorkspaceImage = function(outputDir, runName){
    # Save the current workspace (via save.image) to
    # <outputDir>/<runName>.RData and print the name of the file written.
    #
    # outputDir: directory that receives the image file
    # runName:   base name of the image file, without extension
    imageName <- sprintf("%s.RData",runName)
    imagePath <- file.path(outputDir, imageName)
    save.image(file=imagePath)
    # only the file name, not the full path, is reported
    print(paste0("Output file: ",imageName))
}
19 | 
20 | # from https://www.r-bloggers.com/safe-loading-of-rdata-files-2/
loadToEnvironment <- function(RData, env = new.env()){
  # Load the objects stored in the file `RData` into `env` instead of the
  # caller's workspace, then hand that environment back.
  #
  # RData: path to a file readable by load()
  # env:   environment that receives the objects (a fresh one by default)
  load(file = RData, envir = env)
  env
}
25 | 


--------------------------------------------------------------------------------
/primary_analysis_scripts/dl_counts.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #yes | sudo yum install perl-Env
 4 | 
 5 | # Use this script to download fastqc and STAR/rsem files to make counts matrices and compile a multiqc report
 6 | 
 7 | filename=$1
 8 | data_dir="/Volumes/TOSHIBA-EXT/UCSD-CCBB/2023/20230207_Agarwal_Lewis_Human_Bulk_RNA-Seq_Differential_Expression/primary_analysis"
 9 | s3_addr=s3://ccbb-data-upload/2023/20230207_Agarwal_Lewis_Human_Bulk_RNA-Seq_Differential_Expression/primary_analysis
10 | 
11 | workspace=$data_dir/$filename
12 | 
13 | mkdir -p $workspace
14 | cd $workspace
15 | 
16 | # download fastqcs, .genes.results, .isoforms.results, .stat folder
17 | aws s3 cp $s3_addr/$filename/$filename"_R1.trim_fastqc.html" .
18 | aws s3 cp $s3_addr/$filename/$filename"_R1.trim_fastqc.zip" .
19 | aws s3 cp $s3_addr/$filename/$filename"_R2.trim_fastqc.html" .
20 | aws s3 cp $s3_addr/$filename/$filename"_R2.trim_fastqc.zip" .
21 | aws s3 cp $s3_addr/$filename/$filename".genes.results" .
22 | aws s3 cp $s3_addr/$filename/$filename".isoforms.results" .
23 | aws s3 cp $s3_addr/$filename/$filename".stat" . --recursive
24 | 


--------------------------------------------------------------------------------
/primary_analysis_scripts/calculate_counts/RSEM_gene_parser.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'Guorong Xu<g1xu@ucsd.edu>'
 2 | 
 3 | import re
 4 | import os
 5 | import sys
 6 | 
 7 | def read_data_file(input_file):
 8 |     expression_list = []
 9 |     with open(input_file) as f:
10 |         for line in f:
11 |             if not line.startswith("gene_id"):
12 |                expression_list.append(line)
13 |     return expression_list
14 | 
def _merge_gene_results(workspace):
    """Merge every ``*.genes.results`` file under ``workspace`` into one table.

    The table is written to ``<workspace>/all_genes_results.txt``. It has one
    row per gene; the first two columns come from the first sample found and
    are followed by five value columns per sample (length, effective_length,
    expected_count, TPM, FPKM), in the order the files were discovered.

    Returns True when the number of data rows read back from the written
    file equals the number of rows per sample, otherwise False.

    Raises:
        FileNotFoundError: no ``*.genes.results`` file exists under workspace.
        ValueError: the samples differ in row count, or a row's gene id
            differs between samples, so rows cannot be merged by position.
    """
    sample_list = {}
    for dirpath, directories, filenames in os.walk(workspace):
        for filename in filenames:
            if filename.endswith(".genes.results"):     # output of calculate expression
                sample_list[filename] = read_data_file(os.path.join(dirpath, filename))
    # no files are found
    if len(sample_list) == 0:
        raise FileNotFoundError("ERROR! No \"genes.results\" files are found.")

    # Rows are merged purely by position, so every sample must have the same
    # number of rows; otherwise the merge would crash part-way or truncate.
    row_counts = {sample: len(rows) for sample, rows in sample_list.items()}
    if len(set(row_counts.values())) != 1:
        raise ValueError("genes.results files differ in row count: %r" % row_counts)
    row_count = next(iter(row_counts.values()))

    output_path = workspace + "/all_genes_results.txt"
    # Opened only after validation so a failed run leaves no empty output
    # file behind, and closed even if a malformed row raises.
    with open(output_path, "w") as filewriter:
        filewriter.write("gene_id\ttranscript_id(s)")
        for sample in sample_list:
            for suffix in ("_length", "_effective_length", "_expected_count", "_TPM", "_FPKM"):
                filewriter.write("\t" + sample + suffix)
        filewriter.write("\n")

        for line_num in range(row_count):
            row_id = None
            for index, (sample, rows) in enumerate(sample_list.items()):
                fields = re.split(r'\t+', rows[line_num])
                if index == 0:
                    row_id = fields[0]
                    first_field = 0     # keep the gene and transcript id columns once
                else:
                    if fields[0] != row_id:
                        raise ValueError(
                            "row %d of %s is %r but the first sample has %r"
                            % (line_num + 1, sample, fields[0], row_id))
                    first_field = 2     # later samples contribute values only
                    filewriter.write("\t")
                # fields[6] is the last column and still carries the newline
                filewriter.write("\t".join(fields[first_field:6] + [fields[6].rstrip()]))
            filewriter.write("\n")

    # Read the table back; its header starts with "gene_id" and is skipped.
    return row_count == len(read_data_file(output_path))


if __name__ == "__main__":
    # Exit status 0 when the written table has the expected row count, 1 otherwise.
    sys.exit(0 if _merge_gene_results(sys.argv[1]) else 1)


--------------------------------------------------------------------------------
/primary_analysis_scripts/calculate_counts/RSEM_isoform_parser.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'Guorong Xu<g1xu@ucsd.edu>'
 2 | 
 3 | import re
 4 | import os
 5 | import sys
 6 | 
 7 | def read_data_file(input_file):
 8 |     expression_list = []
 9 |     with open(input_file) as f:
10 |         for line in f:
11 |             if not line.startswith("transcript_id"):
12 |                expression_list.append(line)
13 |     return expression_list
14 | 
def _merge_isoform_results(workspace):
    """Merge every ``*.isoforms.results`` file under ``workspace`` into one table.

    The table is written to ``<workspace>/all_isoforms_results.txt``. It has
    one row per transcript; the first two columns come from the first sample
    found and are followed by six value columns per sample (length,
    effective_length, expected_count, TPM, FPKM, IsoPct), in the order the
    files were discovered.

    Returns True when the number of data rows read back from the written
    file equals the number of rows per sample, otherwise False.

    Raises:
        FileNotFoundError: no ``*.isoforms.results`` file exists under workspace.
        ValueError: the samples differ in row count, or a row's transcript id
            differs between samples, so rows cannot be merged by position.
    """
    sample_list = {}
    for dirpath, directories, filenames in os.walk(workspace):
        for filename in filenames:
            if filename.endswith(".isoforms.results"):      # output of calculate expression
                sample_list[filename] = read_data_file(os.path.join(dirpath, filename))
    # no files are found
    if len(sample_list) == 0:
        raise FileNotFoundError("ERROR! No \"isoforms.results\" files are found.")

    # Rows are merged purely by position, so every sample must have the same
    # number of rows; otherwise the merge would crash part-way or truncate.
    row_counts = {sample: len(rows) for sample, rows in sample_list.items()}
    if len(set(row_counts.values())) != 1:
        raise ValueError("isoforms.results files differ in row count: %r" % row_counts)
    row_count = next(iter(row_counts.values()))

    output_path = workspace + "/all_isoforms_results.txt"
    # Opened only after validation so a failed run leaves no empty output
    # file behind, and closed even if a malformed row raises.
    with open(output_path, "w") as filewriter:
        filewriter.write("transcript_id\tgene_id")
        for sample in sample_list:
            for suffix in ("_length", "_effective_length", "_expected_count", "_TPM", "_FPKM", "_IsoPct"):
                filewriter.write("\t" + sample + suffix)
        filewriter.write("\n")

        for line_num in range(row_count):
            row_id = None
            for index, (sample, rows) in enumerate(sample_list.items()):
                fields = re.split(r'\t+', rows[line_num])
                if index == 0:
                    row_id = fields[0]
                    first_field = 0     # keep the transcript and gene id columns once
                else:
                    if fields[0] != row_id:
                        raise ValueError(
                            "row %d of %s is %r but the first sample has %r"
                            % (line_num + 1, sample, fields[0], row_id))
                    first_field = 2     # later samples contribute values only
                    filewriter.write("\t")
                # fields[7] is the last column and still carries the newline
                filewriter.write("\t".join(fields[first_field:7] + [fields[7].rstrip()]))
            filewriter.write("\n")

    # Read the table back; its header starts with "transcript_id" and is skipped.
    return row_count == len(read_data_file(output_path))


if __name__ == "__main__":
    # Exit status 0 when the written table has the expected row count, 1 otherwise.
    sys.exit(0 if _merge_isoform_results(sys.argv[1]) else 1)


--------------------------------------------------------------------------------
/primary_analysis_scripts/run_human_PE_aws.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | yes | sudo yum install perl-Env
 4 | 
 5 | filename=$1
 6 | sample_num=$2
 7 | workspace=/scratch/workspace/$filename
 8 | 
 9 | STAR=/shared/software/STAR/2.5.1a/bin/Linux_x86_64
10 | 
11 | star_ref=/shared/software/STAR_index/Hsapiens_h38p13_v38/Hsapiens_h38p13_v38 #gencode v38
12 | fastqc=/shared/software/FastQC/fastqc #v0.11.8
13 | trimmomatic=/shared/software/Trimmomatic-0.38/trimmomatic-0.38.jar
14 | rsem=/shared/software/RSEM-1.3.0/rsem-calculate-expression
15 | aws_addr=s3://ccbb-data-upload/2023/20230207_Agarwal_Lewis_Human_Bulk_RNA-Seq_Differential_Expression
16 | 
17 | mkdir -p $workspace
18 | 
19 | cd $workspace
20 | echo $PWD
21 | 
22 | ## Download data ##
23 | aws s3 cp $aws_addr"/fastq/"$filename/$filename"_"$sample_num"_L004_R1_001.fastq.gz" $workspace/$filename"_R1.fastq.gz"
24 | aws s3 cp $aws_addr"/fastq/"$filename/$filename"_"$sample_num"_L004_R2_001.fastq.gz" $workspace/$filename"_R2.fastq.gz"
25 | 
26 | export _JAVA_OPTIONS=-Djavax.accessibility.assistive_technologies=
27 | $fastqc $workspace/$filename"_R1.fastq.gz" -o $workspace/
28 | $fastqc $workspace/$filename"_R2.fastq.gz" -o $workspace/
29 | 
30 | ## Trim ##
31 | java -jar $trimmomatic PE -threads 5 -phred33 -trimlog $workspace/trimlog.log $workspace/$filename"_R1.fastq.gz" $workspace/$filename"_R2.fastq.gz" $workspace/$filename"_R1.trim.fastq.gz" $workspace/$filename"_R1.unpaired.fastq.gz" $workspace/$filename"_R2.trim.fastq.gz" $workspace/$filename"_R2.unpaired.fastq.gz" LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:27 ILLUMINACLIP:/shared/software/Trimmomatic-0.38/adapters/NexteraPE-PE.fa:2:30:10
32 | 
33 | ## Check for adapters ##
34 | $fastqc $workspace/$filename"_R1.trim.fastq.gz" -o $workspace/
35 | $fastqc $workspace/$filename"_R2.trim.fastq.gz" -o $workspace/
36 | 
37 | ## Default rsem STAR wrapper ##
38 | $rsem --paired-end --star --star-path $STAR --star-gzipped-read-file -p 8 $workspace/$filename"_R1.trim.fastq.gz" $workspace/$filename"_R2.trim.fastq.gz" $star_ref $workspace/$filename
39 | 
40 | rm $workspace/$filename"_R1.fastq.gz"
41 | rm $workspace/$filename"_R2.fastq.gz"
42 | rm $workspace/$filename"_R1.trim.fastq.gz"
43 | rm $workspace/$filename"_R2.trim.fastq.gz"
44 | rm $workspace/$filename"_R1.unpaired.fastq.gz"
45 | rm $workspace/$filename"_R2.unpaired.fastq.gz"
46 | rm $workspace/trimlog.log
47 | #upload results to S3
48 | aws s3 cp $workspace $aws_addr/primary_analysis/$filename/ --recursive
49 | 


--------------------------------------------------------------------------------
/primary_analysis_scripts/calculate_counts/RSEM_count_parser.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'Guorong Xu<g1xu@ucsd.edu>'
 2 | 
 3 | import re
 4 | import os
 5 | import sys
 6 | 
 7 | def read_data_file(input_file):
 8 |     expression_list = {}
 9 |     with open(input_file) as f:
10 |         for line_num, line in enumerate(f):
11 |             if line_num == 0:
12 |                 expression_list.update({"alignment_statistics":line.rstrip()})
13 |             if line_num == 1:
14 |                 expression_list.update({"alignment_certainty":line.rstrip()})
15 |             if line_num == 2:
16 |                 expression_list.update({"alignment_Hits":line.rstrip()})
17 |             if line_num > 2:
18 |                 # split the string based on tab
19 |                 fields = re.split(r'\t+', line)
20 |                 expression_list.update({fields[0]:fields[1].rstrip()})
21 | 
22 |     return expression_list
23 | 
_SUMMARY_ROWS = ("alignment_statistics", "alignment_certainty", "alignment_Hits")
# Keys "0" .. "99" are written as rows; any other numeric keys present in a
# .cnt file are not written.
_COUNT_ROW_LIMIT = 100


def _write_counts_table(workspace):
    """Combine every ``*.cnt`` file under ``workspace`` into one table.

    The table is written to ``<workspace>/all_counts_results.txt`` with one
    ``<file name>_counts`` column per file, in discovery order. Its rows are
    the three summary lines, then the keys ``0``..``99``, then ``Inf``. A
    count key that a file does not contain is written as ``0``.

    Every row is written as ``label<TAB>value...`` with no trailing tab, so
    all rows have the same number of columns as the header.

    Raises:
        FileNotFoundError: no ``*.cnt`` file exists under workspace.
        ValueError: a file is missing one of its three summary lines.
    """
    sample_list = {}
    for dirpath, directories, filenames in os.walk(workspace):
        for filename in filenames:
            if filename.endswith(".cnt"):       # output of calculate expression
                sample_list[filename] = read_data_file(os.path.join(dirpath, filename))
    # no files are found
    if len(sample_list) == 0:
        raise FileNotFoundError("ERROR! No \"cnt\" files are found.")

    # A file shorter than three lines has no value for a summary row; report
    # it by name instead of failing later on string concatenation with None.
    for sample, counts in sample_list.items():
        missing = [key for key in _SUMMARY_ROWS if key not in counts]
        if missing:
            raise ValueError("%s is missing %s" % (sample, ", ".join(missing)))

    count_rows = [str(number) for number in range(0, _COUNT_ROW_LIMIT)] + ["Inf"]

    # The context manager closes the file even if writing raises.
    with open(workspace + "/all_counts_results.txt", "w") as filewriter:
        # sample is filename.cnt
        header = ["item"] + [sample + "_counts" for sample in sample_list]
        filewriter.write("\t".join(header) + "\n")

        for row_key in _SUMMARY_ROWS:
            cells = [row_key] + [counts[row_key] for counts in sample_list.values()]
            filewriter.write("\t".join(cells) + "\n")

        for row_key in count_rows:
            cells = [row_key] + [counts.get(row_key, str(0)) for counts in sample_list.values()]
            filewriter.write("\t".join(cells) + "\n")


if __name__ == "__main__":
    _write_counts_table(sys.argv[1])


--------------------------------------------------------------------------------
/secondary_analysis_scripts/src/CountsPcaPlotter.R:
--------------------------------------------------------------------------------
  1 | # ---------------------------------------------------------------------------------
  2 | # Copyright (c) 2018 UC San Diego Center for Computational Biology & Bioinformatics
  3 | #
  4 | # Distributed under the terms of the MIT License.
  5 | #
  6 | # The full license is in the file LICENSE, distributed with this software.
  7 | # ----------------------------------------------------------------------------------
  8 | # Initial author: Amanda Birmingham
  9 | 
 10 | library(edgeR)
 11 | 
 12 | expandDesignDf<-function(countsDf, designDf,
 13 |                          sampleNameColName = "sample_name"){
 14 |   aDgeList <- DGEList(counts=countsDf)
 15 |   mergedDesignDf = merge(x=designDf, y=aDgeList$samples,
 16 |                          by.y="row.names",
 17 |                          by.x=sampleNameColName)
 18 |   return(mergedDesignDf)
 19 | }
 20 | 
 21 | syncCountSampleOrderToDesignDf<-function(counts_df, designDf,
 22 |                                          sampleColName="sample_name"){
 23 | 
 24 |   sampleNamesInOrder = designDf[[sampleColName]]
 25 | 
 26 |   # check for samples in the design file that aren't in the counts file
 27 |   missingSamples = setdiff(sampleNamesInOrder, colnames(counts_df))
 28 |   if (length(missingSamples)>0){
 29 |     print(missingSamples)
 30 |     stop("Above samples are in design file but missing from counts file")
 31 |   }
 32 | 
 33 |   # ensure that the order of the samples in the counts table is
 34 |   # the same as the order of the samples in the design table
 35 |   reordered_counts_df = counts_df[sampleNamesInOrder]
 36 |   return(reordered_counts_df)
 37 | }
 38 | 
 39 | reformatDfForPca<-function(counts_df, designDf,
 40 |                            sampleColName="sample_name"){
 41 |   reordered_counts_df = syncCountSampleOrderToDesignDf(
 42 |     counts_df, designDf, sampleColName)
 43 | 
 44 |   # now transform the counts df so it is samples are in rows
 45 |   # (as in the design file) rather than in rows
 46 |   transformed_df = t(reordered_counts_df)
 47 |   return(transformed_df)
 48 | }
 49 | 
 50 | makeAndPrintRawCountsPca<-function(countsDf, designDf,
 51 |                                    pointShapeColName,
 52 |                                    designSampleNameColName="sample_name",
 53 |                                    libSizeColName = "lib.size",
 54 |                                    designColNameForLabels=NULL, labelOnlyOutliers=TRUE){
 55 | 
 56 |   if (!libSizeColName %in% colnames(designDf)){
 57 |     designDf = expandDesignDf(countsDf, designDf,
 58 |                               designSampleNameColName)
 59 |   }
 60 | 
 61 |   rawTitle = "PCA of Raw Counts"
 62 |   display_markdown(rawTitle)
 63 |   countsPca = doPcaFromSamplesAsColsDf(countsDf, designDf,
 64 |                                        designSampleNameColName)
 65 |   rawPlot = make2dPcaPlot(countsPca, designDf, pointShapeColName,
 66 |                           libSizeColName, designColNameForLabels,
 67 |                           labelOnlyOutliers)
 68 |   print(rawPlot + ggtitle(rawTitle))
 69 | }
 70 | 
 71 | makeAndPrintCpmsPca<-function(countsDf, designDf,
 72 |                               pointShapeColName,
 73 |                               designSampleNameColName="sample_name",
 74 |                               libSizeColName = "lib.size",
 75 |                               designColNameForLabels=NULL, labelOnlyOutliers=TRUE){
 76 | 
 77 |   if (!libSizeColName %in% colnames(designDf)){
 78 |     designDf = expandDesignDf(countsDf, designDf,
 79 |                               designSampleNameColName)
 80 |   }
 81 | 
 82 |   cpmsDf = getCpmsDf(countsDf)
 83 | 
 84 |   normTitle = "PCA of Normalized Counts"
 85 |   designAndPca = makeAndPrintPca(normTitle)
 86 | 
 87 |   display_markdown(normTitle)
 88 |   cpmsPca = doPcaFromSamplesAsColsDf(cpmsDf, designDf,
 89 |                                      designSampleNameColName)
 90 |   normPlot = make2dPcaPlot(cpmsPca, designDf, pointShapeColName,
 91 |                            libSizeColName, designColNameForLabels,
 92 |                            labelOnlyOutliers)
 93 |   print(normPlot + ggtitle(normTitle))
 94 | 
 95 |   return(list(rawcounts=countsDf, design=designDf, cpms=cpmsDf,
 96 |               cpmsPca=cpmsPca))
 97 | }
 98 | 
 99 | # TODO: come back and integrate this function with
100 | # makeAndPrintRawCountsPca and makeAndPrintCpmsPca
makeAndPrintPca<-function(title, countsDf, designDf,
                          pointShapeColName,
                          designSampleNameColName="sample_name",
                          libSizeColName = "lib.size",
                          designColNameForLabels=NULL, labelOnlyOutliers=TRUE){
  # Run a PCA on countsDf and print a 2-D plot of it under the given title.
  # Returns a list with elements design (the design table, expanded if it
  # lacked libSizeColName) and pca (the PCA result).
  #
  # NOTE(review): PcaPlotter.R defines a function with this same name and a
  # different signature; whichever file is sourced last replaces the other.

  # the color column comes from the DGEList sample table; add it if absent
  hasLibSize = libSizeColName %in% colnames(designDf)
  if (!hasLibSize){
    designDf = expandDesignDf(countsDf, designDf,
                              designSampleNameColName)
  }

  display_markdown(title)
  pcaOfCounts = doPcaFromSamplesAsColsDf(countsDf, designDf, designSampleNameColName)
  titledPlot = make2dPcaPlot(pcaOfCounts, designDf, pointShapeColName,
                             libSizeColName, designColNameForLabels,
                             labelOnlyOutliers) + ggtitle(title)
  print(titledPlot)

  return(list(design=designDf, pca=pcaOfCounts))
}
121 | 
getCpmsDf<-function(counts_df){
  # Wrap the counts in a DGEList, apply cpm() to it, and return the
  # resulting matrix as a data frame.
  dgeList <- DGEList(counts=counts_df)
  return(data.frame(cpm(dgeList)))
}
128 | 


--------------------------------------------------------------------------------
/secondary_analysis_scripts/src/PcaPlotter.R:
--------------------------------------------------------------------------------
  1 | # ---------------------------------------------------------------------------------
  2 | # Copyright (c) 2018 UC San Diego Center for Computational Biology & Bioinformatics
  3 | #
  4 | # Distributed under the terms of the MIT License.
  5 | #
  6 | # The full license is in the file LICENSE, distributed with this software.
  7 | # ----------------------------------------------------------------------------------
  8 | # Initial author: Amanda Birmingham
  9 | 
 10 | library(cowplot)
 11 | library(ggplot2)
 12 | library(grid)
 13 | library(IRdisplay)
 14 | 
 15 | makeAndPrintPca <-function(data_df, design_df,
 16 |                            design_col_name_for_shapes=NULL, design_col_name_for_colors=NULL,
 17 |                            design_col_name_for_labels=NULL, labelOutliersOnly=TRUE,
 18 |                            shrink_viewport=FALSE) {
 19 | 
 20 |   pcaResults = doAndPrintScaledPcaOnSamplesAsRowsDf(data_df)
 21 |   makeAndPrintPcaPlot(pcaResults, design_df, design_col_name_for_shapes,
 22 |                       design_col_name_for_colors, design_col_name_for_labels,
 23 |                       labelOutliersOnly, TRUE, shrink_viewport)
 24 | }
 25 | 
 26 | makeAndPrintPcaPlot<-function(pcaResults, design_df=NULL,
 27 |                               design_col_name_for_shapes=NULL, design_col_name_for_colors=NULL,
 28 |                               design_col_name_for_labels=NULL, labelOutliersOnly=TRUE,
 29 |                               add_hotelling_ellipse=TRUE, shrink_viewport=FALSE) {
 30 | 
 31 |   pcaPlot = make2dPcaPlot(pcaResults, design_df,
 32 |                           design_col_name_for_shapes, design_col_name_for_colors,
 33 |                           design_col_name_for_labels, labelOutliersOnly,
 34 |                           add_hotelling_ellipse)
 35 |   printPlotInViewport(pcaPlot, shrink_viewport)
 36 | }
 37 | 
 38 | printPlotInViewport<-function(pcaPlot, shrink_viewport=FALSE){
 39 |   # NB that this method does NOT set the canvas back to the
 40 |   # default size after being called--when I try to do that,
 41 |   # the reset happens before the plot is rendered, thus
 42 |   # nullifying my attempts to resize the canvas to fit the
 43 |   # image (even if I try using Sys.sleep, etc).
 44 |   # Until I can spend more time exploring how to prevent that,
 45 |   # it is necessary to call resetPlotSize() after any run
 46 |   # of this method.
 47 | 
 48 |   viewport_val = NULL
 49 |   if (shrink_viewport==TRUE) {
 50 |     viewport_val = viewport(width=unit(0.8, "npc"))
 51 |   }
 52 | 
 53 |   startingWidth = getOption("repr.plot.width")
 54 |   startingHeight = getOption("repr.plot.height")
 55 | 
 56 |   # resize image to max width, appropriate height
 57 |   # to remove excessive whitespace in default square
 58 |   # image canvas if real image is not square
 59 |   aspectRatio = findPlotAspectRatio(pcaPlot)
 60 |   plotHeight = startingWidth/aspectRatio
 61 |   options(repr.plot.width=startingWidth, repr.plot.height=plotHeight)
 62 | 
 63 |   suppressWarnings(print(pcaPlot, vp=viewport_val))
 64 | }
 65 | 
 66 | make2dPcaPlot<-function(pca_result, design_df=NULL,
 67 |                         design_col_name_for_shapes=NULL, design_col_name_for_colors=NULL,
 68 |                         design_col_name_for_labels=NULL, label_only_outliers=TRUE,
 69 |                         add_hotelling_ellipse=TRUE){
 70 | 
 71 |   scores = data.frame(pca_result$x[,1:2])
 72 | 
 73 |   if (add_hotelling_ellipse){
 74 |     hotelling_ellipse = data.frame(getHotellingT2Ellipse(
 75 |       pca_result$x[,1], pca_result$x[,2]))
 76 |     colnames(hotelling_ellipse) = c("PC1", "PC2")
 77 |   }
 78 | 
 79 |   shape_values = rep("",nrow(pca_result$x))
 80 |   color_values = rep("",nrow(pca_result$x))
 81 |   label_values = rep("",nrow(pca_result$x))
 82 |   if (!is.null(design_col_name_for_shapes)){
 83 |     design_df[[design_col_name_for_shapes]] = factor(
 84 |       design_df[[design_col_name_for_shapes]])
 85 |     shape_values = design_df[[design_col_name_for_shapes]]
 86 |     scores = cbind(scores, shape_values)
 87 |   }
 88 |   if (!is.null(design_col_name_for_colors)){
 89 |     color_values = design_df[[design_col_name_for_colors]]
 90 |     scores = cbind(scores, color_values)
 91 |   }
 92 |   if (!is.null(design_col_name_for_labels)){
 93 |     if (label_only_outliers) {
 94 |       includeValues = getWhetherPointsAreOutliers(pca_result)
 95 |       label_values = ifelse(includeValues,
 96 |                             design_df[[design_col_name_for_labels]],'')
 97 |     } else {
 98 |       label_values = design_df[[design_col_name_for_labels]]
 99 |     }
100 |     scores = cbind(scores, label_values)
101 |   }
102 | 
103 |   pc1.2 = ggplot(scores, aes(x=PC1, y=PC2)) +
104 |     geom_point(aes(shape=shape_values,
105 |                    color=color_values), size = 4) + 
106 |     scale_shape_manual(values=c(0:length(shape_values))) +
107 |     coord_fixed(1/1) +
108 |     labs(color=design_col_name_for_colors,
109 |          shape=design_col_name_for_shapes)
110 | 
111 |   if (add_hotelling_ellipse){
112 |     pc1.2 = pc1.2 + geom_path(data=hotelling_ellipse)
113 |   }
114 | 
115 |   if (!is.null(design_col_name_for_labels)){
116 |     pc1.2 = pc1.2 + geom_text(aes(label=label_values),
117 |                               hjust=0, vjust=0)
118 |   }
119 | 
120 |   if (is.numeric(color_values)) {
121 |     pc1.2 = pc1.2 + scale_color_gradient(low="blue", high="red")
122 |   }
123 | 
124 |   pc1.2 = pc1.2 + coord_fixed()
125 |   return (pc1.2)
126 | }
127 | 
findPlotAspectRatio<-function(aGgplot){
  # Compute (width of x-axis range) / (width of y-axis range) for the
  # first panel of a ggplot, using the ranges actually used in the graph.
  #
  # Args:
  #   aGgplot: a ggplot object.
  #
  # Returns:
  #   A single numeric ratio.  If the ranges cannot be located in the built
  #   plot, a warning is raised and NaN is returned.
  builtPlot = ggplot_build(aGgplot)

  # The panel ranges have lived in different places in the built plot
  # across ggplot2 releases; probe each known location in turn.
  candidatePanelLists = list(
    builtPlot$panel$ranges,          # pre-ggplot2 version 2.2
    builtPlot$layout$panel_ranges,   # ggplot2 version 2.2.x
    builtPlot$layout$panel_params    # ggplot2 version 3.0 and later
  )

  xRange = NULL
  yRange = NULL
  for (panelList in candidatePanelLists){
    if (length(panelList) == 0){
      next
    }
    firstPanel = panelList[[1]]
    if (!is.null(firstPanel$y.range) && !is.null(firstPanel$x.range)){
      yRange = firstPanel$y.range
      xRange = firstPanel$x.range
      break
    }
  }

  if (is.null(xRange) || is.null(yRange)){
    # Previously this case fell through to max()/min() of NULL and
    # produced NaN with unrelated-looking warnings; keep the NaN result
    # but say what actually went wrong.
    warning("Could not find panel x/y ranges in the built ggplot; ",
            "returning NaN aspect ratio")
    return(NaN)
  }

  aspectRatio <- (max(xRange)-min(xRange))/(max(yRange)-min(yRange))
  return(aspectRatio)
}
145 | 
doAndPrintScaledPcaOnSamplesAsRowsDf<-function(data_df){
  # Run doScaledPcaOnSamplesAsRowsDf() on data_df, display the
  # `importance` table from the summary of the result, and return the
  # PCA result itself.
  scaledPca <- doScaledPcaOnSamplesAsRowsDf(data_df)
  importanceTable <- summary(scaledPca)$importance
  display(importanceTable)
  scaledPca
}
151 | 
doPcaFromSamplesAsColsDf<-function(samplesAsColsDf, designDf,
                                   sampleNameDesignColName = "sample_name"){
  # Reformat the inputs with reformatDfForPca(), then run (and display the
  # summary of) the scaled PCA on the reformatted data frame.
  #
  # Returns:
  #   Whatever doAndPrintScaledPcaOnSamplesAsRowsDf() returns for the
  #   reformatted data.
  return(doAndPrintScaledPcaOnSamplesAsRowsDf(
    reformatDfForPca(samplesAsColsDf, designDf, sampleNameDesignColName)))
}
160 | 
doScaledPcaOnSamplesAsRowsDf<-function(data_df){
  # Run a unit-variance-scaled PCA (prcomp) on the columns of data_df.
  #
  # Args:
  #   data_df: data frame (or matrix) whose columns are the variables fed
  #     to prcomp.
  #
  # Returns:
  #   The prcomp result computed on the non-constant columns.

  # remove any columns that are constant in order to allow scaling;
  # a column whose variance cannot be computed (NA) is dropped as well,
  # since an NA in the column selector would otherwise abort the subsetting
  column_variances = apply(data_df, 2, var, na.rm=TRUE)
  is_variable = !is.na(column_variances) & column_variances != 0
  if (!any(is_variable)){
    stop("No non-constant columns available for scaled PCA")
  }

  # drop=FALSE keeps a single surviving column as a named column rather
  # than collapsing it to a bare vector
  variable_df = data_df[, is_variable, drop=FALSE]
  pca_result = prcomp(variable_df, scale. = TRUE)
  return(pca_result)
}
166 | 
isPointOutsideEllipse<-function(x, y, ellipseCenterAndRadii){
  # Evaluate the axis-aligned ellipse equation at (x, y) and report
  # whether the point lies strictly outside the ellipse.
  #
  # Args:
  #   x, y: point coordinates.
  #   ellipseCenterAndRadii: vector indexed as
  #     (center x, center y, x radius, y radius).
  #
  # Returns:
  #   TRUE when the ellipse equation value exceeds 1, otherwise FALSE
  #   (points exactly on the boundary are not outside).
  xOffset <- x - ellipseCenterAndRadii[1]
  yOffset <- y - ellipseCenterAndRadii[2]
  xRadius <- ellipseCenterAndRadii[3]
  yRadius <- ellipseCenterAndRadii[4]

  xTerm <- (xOffset^2)/(xRadius^2)
  yTerm <- (yOffset^2)/(yRadius^2)
  isOutside <- (xTerm + yTerm) > 1
  return(isOutside)
}
173 | 
getWhetherPointsAreOutliers<-function(pcaResults){
  # For each row of pcaResults$x, report whether its (column 1, column 2)
  # point falls outside the ellipse described by
  # getHotellingT2EllipseCenterAndRadii() for those two columns.
  #
  # Returns:
  #   The per-point results of isPointOutsideEllipse(), as collected
  #   by mapply.
  pc1Scores <- pcaResults$x[,1]
  pc2Scores <- pcaResults$x[,2]
  sharedArgs <- list(
    ellipseCenterAndRadii=getHotellingT2EllipseCenterAndRadii(pc1Scores,
                                                              pc2Scores))
  mapply(isPointOutsideEllipse, pc1Scores, pc2Scores, MoreArgs=sharedArgs)
}
182 | 
getHotellingT2Ellipse <-function (x, y, alfa = 0.95, len = 200) {
  # Trace the outline of the ellipse whose radii come from
  # getHotellingT2EllipseCenterAndRadii(x, y, alfa), centered on
  # (mean(x), mean(y)).
  #
  # Args:
  #   x, y: coordinate vectors of the points the ellipse is computed for.
  #   alfa: level passed through to the radii computation.
  #   len: number of outline points, spread evenly over angles 0..2*pi.
  #
  # Returns:
  #   A len-by-2 matrix (no column names) of outline x and y coordinates.
  radii <- getHotellingT2EllipseCenterAndRadii(x, y, alfa)[3:4]
  angles <- seq(0, 2 * pi, length.out = len)
  outlineX <- radii[1] * cos(angles) + mean(x)
  outlineY <- radii[2] * sin(angles) + mean(y)
  # deparse.level = 0 keeps the result free of column names
  cbind(outlineX, outlineY, deparse.level = 0)
}
190 | 
getHotellingT2EllipseCenterAndRadii<-function(x, y, alfa = 0.95){
  # NOTE: apart from what is returned, this is a trimmed copy of the
  # simpleEllipse method from the pcaMethods package.  That method is
  # documented as public there but is not actually exported, and loading
  # that package seemed to bog down the notebook for unknown reasons.
  #
  # Args:
  #   x, y: coordinate vectors; the point count is taken from length(x).
  #   alfa: probability passed to qf() for the F quantile.
  #
  # Returns:
  #   c(mean of x, mean of y, radius along x, radius along y).
  nPoints <- length(x)
  fQuantile <- qf(alfa, 2, nPoints - 2)
  sampleSizeTerm <- 2 * (nPoints^2 - 1)/(nPoints * (nPoints - 2))

  # Same formula for both axes; only the variance differs.
  radiusFor <- function(values){
    sqrt(var(values) * fQuantile * sampleSizeTerm)
  }

  c(mean(x), mean(y), radiusFor(x), radiusFor(y))
}
205 | 
206 | 
expandPlot<-function(aPlot, additiveExpandValue=25,
                     shrink_viewport=FALSE){
  # Widen the x axis of aPlot and hand the result to printPlotInViewport().
  #
  # Args:
  #   aPlot: a ggplot object.
  #   additiveExpandValue: additive part of the x-axis expansion (the
  #     multiplicative part is fixed at 0.05).
  #   shrink_viewport: forwarded to printPlotInViewport().
  #
  # Returns:
  #   Whatever printPlotInViewport() returns.
  aPlot = aPlot + scale_x_continuous(
    expand =(c(0.05,additiveExpandValue)))
  # Forward the caller's choice; it used to be hard-coded to FALSE, which
  # silently ignored the shrink_viewport argument.
  printPlotInViewport(aPlot, shrink_viewport=shrink_viewport)
}
213 | 
214 | 
215 | 


--------------------------------------------------------------------------------
/secondary_analysis_scripts/1_RNASeq_Count_Metadata_Annotation_Load_and_Integration.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     " # Dr. Kavita Agarwal RNA-seq analysis of VK2 cells\n"
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "markdown",
  12 |    "metadata": {},
  13 |    "source": [
  14 |     "# RNASeq Data Integration"
  15 |    ]
  16 |   },
  17 |   {
  18 |    "cell_type": "markdown",
  19 |    "metadata": {},
  20 |    "source": [
  21 |     "* Daisy Chilin-Fuentes, CCBB (dchilinfuentes@ucsd.edu)\n",
  22 |     "* Based on upstream analysis by Daisy Chilin-Fuentes, CCBB (dchilinfuentes@ucsd.edu)\n"
  23 |    ]
  24 |   },
  25 |   {
  26 |    "cell_type": "markdown",
  27 |    "metadata": {},
  28 |    "source": [
  29 |     "* Modeled on \"RNA-seq analysis is easy as 1-2-3 with limma, Glimma and edgeR\" ([1](#Citations))\n",
  30 |     "\n",
  31 |     "## Table of Contents\n",
  32 |     "* [Background](#Background)\n",
  33 |     "* [Introduction](#Introduction)\n",
  34 |     "* [Parameter Input](#Parameter-Input)\n",
  35 |     "* [Library Import](#Library-Import)\n",
  36 |     "* [Data Import](#Data-Import)\n",
  37 |     "    * [Count Data](#Count-Data)\n",
  38 |     "    * [Metadata](#Metadata)\n",
  39 |     "    * [Annotations](#Annotations)\n",
  40 |     "* [Gene Separation By Coding Status](#Gene-Separation-By-Coding-Status)\n",
  41 |     "* [Data Integration](#Data-Integration)\n",
  42 |     "* [Annotation Integration](#Annotation-Integration)\n",
  43 |     "* [Summary](#Summary)\n",
  44 |     "* [Citations](#Citations)\n",
  45 |     "* [Appendix: R Session Info](#Appendix:-R-Session-Info)\n",
  46 |     "\n",
  47 |     "\n",
  48 |     "## Background"
  49 |    ]
  50 |   },
  51 |   {
  52 |    "cell_type": "markdown",
  53 |    "metadata": {},
  54 |    "source": [
  55 |     "The count data analyzed in this notebook were produced by the upstream analysis of Daisy Chilin-Fuentes of CCBB, who received raw sequencing data and performed quality control, trimming, alignment, and quantification of reads.\n"
  56 |    ]
  57 |   },
  58 |   {
  59 |    "cell_type": "markdown",
  60 |    "metadata": {},
  61 |    "source": [
  62 |     "[Table of Contents](#Table-of-Contents)\n",
  63 |     "\n",
  64 |     "## Introduction\n",
  65 |     "\n",
  66 |     "This notebook takes in per-gene-per-sample RNASeq count data (prepared either externally or by the \"RNASeq_RSEM_QC_and_Counts_Preparation\" notebook) and per-sample metadata, and uses the edgeR ([2](#Citations)) Bioconductor ([3](#Citations)) package written in R ([4](#Citations)) to integrate and annotate these inputs in preparation for data exploration and preprocessing."
  67 |    ]
  68 |   },
  69 |   {
  70 |    "cell_type": "markdown",
  71 |    "metadata": {},
  72 |    "source": [
  73 |     "[Table of Contents](#Table-of-Contents)\n",
  74 |     "\n",
  75 |     "## Parameter Input"
  76 |    ]
  77 |   },
  78 |   {
  79 |    "cell_type": "code",
  80 |    "execution_count": 1,
  81 |    "metadata": {},
  82 |    "outputs": [],
  83 |    "source": [
  84 |     "gProjectName = \"Agarwal-Lewis_bulkRNAseq\"\n",
  85 |     "gGeneCountsFilename = \"counts.txt\"\n",
  86 |     "gMetadataFilename = \"KavitaSamplesMetadata.tsv\"\n",
  87 |     "\n",
  88 |     "\n",
  89 |     "gAnnotationsRdataFilename = \"Homo_sapiens_GRCh38p13_gencodev38_ANNOT.Rdata\""
  90 |    ]
  91 |   },
  92 |   {
  93 |    "cell_type": "code",
  94 |    "execution_count": 3,
  95 |    "metadata": {},
  96 |    "outputs": [
  97 |     {
  98 |      "name": "stderr",
  99 |      "output_type": "stream",
 100 |      "text": [
 101 |       "Warning message in dir.create(paste0(gOutputDir, \"/data_integration\")):\n",
 102 |       "“'../secondary_analysis_results/data_integration' already exists”\n"
 103 |      ]
 104 |     }
 105 |    ],
 106 |    "source": [
 107 |     "gSourceDir = \"./src/\" # note trailing slash here but not below\n",
 108 |     "gOutputDir = \"../secondary_analysis_results\"\n",
 109 |     "gReferenceDir = \"../reference\"\n",
 110 |     "gInterimDir = \"../interim\"\n",
 111 |     "gGeneCountsFp = file.path(gOutputDir, \"RSEM_QC_and_Counts_Preparation\", gGeneCountsFilename)\n",
 112 |     "gMetadataFp = file.path(gReferenceDir, gMetadataFilename)\n",
 113 |     "\n",
 114 |     "gOutputDir_nb1 <- paste0(gOutputDir, \"/data_integration\")\n",
 115 |     "dir.create(paste0(gOutputDir, \"/data_integration\")) \n"
 116 |    ]
 117 |   },
 118 |   {
 119 |    "cell_type": "code",
 120 |    "execution_count": 4,
 121 |    "metadata": {},
 122 |    "outputs": [],
 123 |    "source": [
 124 |     "# Import shared source code to load and save previous notebooks' environments:\n",
 125 |     "source(paste0(gSourceDir, \"ChainedNotebookSupport.R\"))"
 126 |    ]
 127 |   },
 128 |   {
 129 |    "cell_type": "markdown",
 130 |    "metadata": {},
 131 |    "source": [
 132 |     "Populate the run name parameter automatically to ensure that outputs from different runs do not overwrite each other:"
 133 |    ]
 134 |   },
 135 |   {
 136 |    "cell_type": "code",
 137 |    "execution_count": 5,
 138 |    "metadata": {},
 139 |    "outputs": [
 140 |     {
 141 |      "data": {
 142 |       "text/html": [
 143 |        "'20230302161642'"
 144 |       ],
 145 |       "text/latex": [
 146 |        "'20230302161642'"
 147 |       ],
 148 |       "text/markdown": [
 149 |        "'20230302161642'"
 150 |       ],
 151 |       "text/plain": [
 152 |        "[1] \"20230302161642\""
 153 |       ]
 154 |      },
 155 |      "metadata": {},
 156 |      "output_type": "display_data"
 157 |     }
 158 |    ],
 159 |    "source": [
 160 |     "gRunName = format(Sys.time(), \"%Y%m%d%H%M%S\")\n",
 161 |     "gRunName"
 162 |    ]
 163 |   },
 164 |   {
 165 |    "cell_type": "code",
 166 |    "execution_count": 48,
 167 |    "metadata": {},
 168 |    "outputs": [
 169 |     {
 170 |      "data": {
 171 |       "text/html": [
 172 |        "'../secondary_analysis_results/data_integration/20230302161642/20230302161642'"
 173 |       ],
 174 |       "text/latex": [
 175 |        "'../secondary\\_analysis\\_results/data\\_integration/20230302161642/20230302161642'"
 176 |       ],
 177 |       "text/markdown": [
 178 |        "'../secondary_analysis_results/data_integration/20230302161642/20230302161642'"
 179 |       ],
 180 |       "text/plain": [
 181 |        "[1] \"../secondary_analysis_results/data_integration/20230302161642/20230302161642\""
 182 |       ]
 183 |      },
 184 |      "metadata": {},
 185 |      "output_type": "display_data"
 186 |     }
 187 |    ],
 188 |    "source": [
 189 |     "# Create subdirectory with timestamp to keep multiple iterations separated\n",
 190 |     "gOutputDir_nb1 <- paste0(gOutputDir_nb1, \"/\", gRunName)\n",
 191 |     "dir.create(gOutputDir_nb1) \n",
 192 |     "gOutputDir_nb1"
 193 |    ]
 194 |   },
 195 |   {
 196 |    "cell_type": "code",
 197 |    "execution_count": null,
 198 |    "metadata": {},
 199 |    "outputs": [],
 200 |    "source": []
 201 |   },
 202 |   {
 203 |    "cell_type": "markdown",
 204 |    "metadata": {},
 205 |    "source": [
 206 |     "[Table of Contents](#Table-of-Contents)\n",
 207 |     "\n",
 208 |     "## Library Import\n",
 209 |     "\n",
 210 |     "Import the necessary R, Bioconductor, and CCBB libraries for the analysis:"
 211 |    ]
 212 |   },
 213 |   {
 214 |    "cell_type": "code",
 215 |    "execution_count": 7,
 216 |    "metadata": {},
 217 |    "outputs": [],
 218 |    "source": [
 219 |     "#if (!requireNamespace(\"BiocManager\", quietly = TRUE))\n",
 220 |     "#    install.packages(\"BiocManager\")"
 221 |    ]
 222 |   },
 223 |   {
 224 |    "cell_type": "code",
 225 |    "execution_count": 8,
 226 |    "metadata": {},
 227 |    "outputs": [],
 228 |    "source": [
 229 |     "#BiocManager::install(\"edgeR\", version = \"3.8\")"
 230 |    ]
 231 |   },
 232 |   {
 233 |    "cell_type": "code",
 234 |    "execution_count": 9,
 235 |    "metadata": {},
 236 |    "outputs": [],
 237 |    "source": [
 238 |     "#BiocManager::install(\"Homo.sapiens\", version = \"3.8\")"
 239 |    ]
 240 |   },
 241 |   {
 242 |    "cell_type": "code",
 243 |    "execution_count": 49,
 244 |    "metadata": {},
 245 |    "outputs": [],
 246 |    "source": [
 247 |     "library(Homo.sapiens)\n",
 248 |     "gOrganismPackage = Homo.sapiens"
 249 |    ]
 250 |   },
 251 |   {
 252 |    "cell_type": "code",
 253 |    "execution_count": 50,
 254 |    "metadata": {},
 255 |    "outputs": [],
 256 |    "source": [
 257 |     "library(edgeR)"
 258 |    ]
 259 |   },
 260 |   {
 261 |    "cell_type": "markdown",
 262 |    "metadata": {},
 263 |    "source": [
 264 |     "[Table of Contents](#Table-of-Contents)\n",
 265 |     "\n",
 266 |     "\n",
 267 |     "## Data Import\n",
 268 |     "\n",
 269 |     "### Count Data\n",
 270 |     "\n",
 271 |     "Import the count data file in which rows are gene identifiers, columns are sample identifiers, and row/column intersections contain the number of counts for the relevant gene in the relevant sample:"
 272 |    ]
 273 |   },
 274 |   {
 275 |    "cell_type": "code",
 276 |    "execution_count": 51,
 277 |    "metadata": {},
 278 |    "outputs": [
 279 |     {
 280 |      "data": {
 281 |       "text/html": [
 282 |        "<style>\n",
 283 |        ".list-inline {list-style: none; margin:0; padding: 0}\n",
 284 |        ".list-inline>li {display: inline-block}\n",
 285 |        ".list-inline>li:not(:last-child)::after {content: \"\\00b7\"; padding: 0 .5ex}\n",
 286 |        "</style>\n",
 287 |        "<ol class=list-inline><li>60605</li><li>12</li></ol>\n"
 288 |       ],
 289 |       "text/latex": [
 290 |        "\\begin{enumerate*}\n",
 291 |        "\\item 60605\n",
 292 |        "\\item 12\n",
 293 |        "\\end{enumerate*}\n"
 294 |       ],
 295 |       "text/markdown": [
 296 |        "1. 60605\n",
 297 |        "2. 12\n",
 298 |        "\n",
 299 |        "\n"
 300 |       ],
 301 |       "text/plain": [
 302 |        "[1] 60605    12"
 303 |       ]
 304 |      },
 305 |      "metadata": {},
 306 |      "output_type": "display_data"
 307 |     }
 308 |    ],
 309 |    "source": [
 310 |     "# Read in counts file containing info on all samples and genes\n",
 311 |     "gUnorderedGeneCountsDf <- read.csv(gGeneCountsFp, sep=\"\\t\", stringsAsFactors=FALSE, row.names=1)\n",
 312 |     "dim(gUnorderedGeneCountsDf)"
 313 |    ]
 314 |   },
 315 |   {
 316 |    "cell_type": "code",
 317 |    "execution_count": 52,
 318 |    "metadata": {},
 319 |    "outputs": [
 320 |     {
 321 |      "data": {
 322 |       "text/html": [
 323 |        "<table class=\"dataframe\">\n",
 324 |        "<caption>A data.frame: 6 × 12</caption>\n",
 325 |        "<thead>\n",
 326 |        "\t<tr><th></th><th scope=col>VK2_NanH2_PolyB25_1h_5</th><th scope=col>VK2_NanH2_PolyB25_1h_6</th><th scope=col>VK2_NanH2_PolyB25_1h_7</th><th scope=col>VK2_NanH2_PolyB25_1h_8</th><th scope=col>VK2_NanH2_PolyB25_2h_1</th><th scope=col>VK2_NanH2_PolyB25_2h_2</th><th scope=col>VK2_NanH2_PolyB25_2h_3</th><th scope=col>VK2_NanH2_PolyB25_2h_4</th><th scope=col>VK2_Pet28a_PolyB25_2h_10</th><th scope=col>VK2_Pet28a_PolyB25_2h_11</th><th scope=col>VK2_Pet28a_PolyB25_2h_12</th><th scope=col>VK2_Pet28a_PolyB25_2h_9</th></tr>\n",
 327 |        "\t<tr><th></th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
 328 |        "</thead>\n",
 329 |        "<tbody>\n",
 330 |        "\t<tr><th scope=row>ENSG00000000003.15</th><td> 502.00</td><td> 416.00</td><td> 424.00</td><td> 497.00</td><td> 356.00</td><td> 542.00</td><td> 364.00</td><td> 299.00</td><td> 387.00</td><td> 465.00</td><td> 366.00</td><td> 323.00</td></tr>\n",
 331 |        "\t<tr><th scope=row>ENSG00000000005.6</th><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td></tr>\n",
 332 |        "\t<tr><th scope=row>ENSG00000000419.14</th><td>2340.00</td><td>2232.04</td><td>2253.00</td><td>2573.36</td><td>2080.11</td><td>3080.00</td><td>2206.80</td><td>1721.00</td><td>2084.00</td><td>2412.42</td><td>2170.07</td><td>1904.05</td></tr>\n",
 333 |        "\t<tr><th scope=row>ENSG00000000457.14</th><td> 153.65</td><td>  94.84</td><td> 112.33</td><td> 130.23</td><td> 113.88</td><td> 119.35</td><td>  85.84</td><td> 102.65</td><td>  98.34</td><td> 120.36</td><td>  91.56</td><td> 119.77</td></tr>\n",
 334 |        "\t<tr><th scope=row>ENSG00000000460.17</th><td> 254.35</td><td> 200.16</td><td> 242.67</td><td> 292.77</td><td> 181.12</td><td> 285.65</td><td> 181.16</td><td> 209.35</td><td> 189.66</td><td> 190.64</td><td> 204.44</td><td> 234.23</td></tr>\n",
 335 |        "\t<tr><th scope=row>ENSG00000000938.13</th><td>  10.00</td><td>   6.00</td><td>   6.00</td><td>   9.00</td><td>   4.00</td><td>  21.00</td><td>  10.00</td><td>  18.00</td><td>  13.00</td><td>  17.00</td><td>  13.00</td><td>  10.00</td></tr>\n",
 336 |        "</tbody>\n",
 337 |        "</table>\n"
 338 |       ],
 339 |       "text/latex": [
 340 |        "A data.frame: 6 × 12\n",
 341 |        "\\begin{tabular}{r|llllllllllll}\n",
 342 |        "  & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & VK2\\_NanH2\\_PolyB25\\_1h\\_7 & VK2\\_NanH2\\_PolyB25\\_1h\\_8 & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & VK2\\_Pet28a\\_PolyB25\\_2h\\_10 & VK2\\_Pet28a\\_PolyB25\\_2h\\_11 & VK2\\_Pet28a\\_PolyB25\\_2h\\_12 & VK2\\_Pet28a\\_PolyB25\\_2h\\_9\\\\\n",
 343 |        "  & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl>\\\\\n",
 344 |        "\\hline\n",
 345 |        "\tENSG00000000003.15 &  502.00 &  416.00 &  424.00 &  497.00 &  356.00 &  542.00 &  364.00 &  299.00 &  387.00 &  465.00 &  366.00 &  323.00\\\\\n",
 346 |        "\tENSG00000000005.6 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00\\\\\n",
 347 |        "\tENSG00000000419.14 & 2340.00 & 2232.04 & 2253.00 & 2573.36 & 2080.11 & 3080.00 & 2206.80 & 1721.00 & 2084.00 & 2412.42 & 2170.07 & 1904.05\\\\\n",
 348 |        "\tENSG00000000457.14 &  153.65 &   94.84 &  112.33 &  130.23 &  113.88 &  119.35 &   85.84 &  102.65 &   98.34 &  120.36 &   91.56 &  119.77\\\\\n",
 349 |        "\tENSG00000000460.17 &  254.35 &  200.16 &  242.67 &  292.77 &  181.12 &  285.65 &  181.16 &  209.35 &  189.66 &  190.64 &  204.44 &  234.23\\\\\n",
 350 |        "\tENSG00000000938.13 &   10.00 &    6.00 &    6.00 &    9.00 &    4.00 &   21.00 &   10.00 &   18.00 &   13.00 &   17.00 &   13.00 &   10.00\\\\\n",
 351 |        "\\end{tabular}\n"
 352 |       ],
 353 |       "text/markdown": [
 354 |        "\n",
 355 |        "A data.frame: 6 × 12\n",
 356 |        "\n",
 357 |        "| <!--/--> | VK2_NanH2_PolyB25_1h_5 &lt;dbl&gt; | VK2_NanH2_PolyB25_1h_6 &lt;dbl&gt; | VK2_NanH2_PolyB25_1h_7 &lt;dbl&gt; | VK2_NanH2_PolyB25_1h_8 &lt;dbl&gt; | VK2_NanH2_PolyB25_2h_1 &lt;dbl&gt; | VK2_NanH2_PolyB25_2h_2 &lt;dbl&gt; | VK2_NanH2_PolyB25_2h_3 &lt;dbl&gt; | VK2_NanH2_PolyB25_2h_4 &lt;dbl&gt; | VK2_Pet28a_PolyB25_2h_10 &lt;dbl&gt; | VK2_Pet28a_PolyB25_2h_11 &lt;dbl&gt; | VK2_Pet28a_PolyB25_2h_12 &lt;dbl&gt; | VK2_Pet28a_PolyB25_2h_9 &lt;dbl&gt; |\n",
 358 |        "|---|---|---|---|---|---|---|---|---|---|---|---|---|\n",
 359 |        "| ENSG00000000003.15 |  502.00 |  416.00 |  424.00 |  497.00 |  356.00 |  542.00 |  364.00 |  299.00 |  387.00 |  465.00 |  366.00 |  323.00 |\n",
 360 |        "| ENSG00000000005.6 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |\n",
 361 |        "| ENSG00000000419.14 | 2340.00 | 2232.04 | 2253.00 | 2573.36 | 2080.11 | 3080.00 | 2206.80 | 1721.00 | 2084.00 | 2412.42 | 2170.07 | 1904.05 |\n",
 362 |        "| ENSG00000000457.14 |  153.65 |   94.84 |  112.33 |  130.23 |  113.88 |  119.35 |   85.84 |  102.65 |   98.34 |  120.36 |   91.56 |  119.77 |\n",
 363 |        "| ENSG00000000460.17 |  254.35 |  200.16 |  242.67 |  292.77 |  181.12 |  285.65 |  181.16 |  209.35 |  189.66 |  190.64 |  204.44 |  234.23 |\n",
 364 |        "| ENSG00000000938.13 |   10.00 |    6.00 |    6.00 |    9.00 |    4.00 |   21.00 |   10.00 |   18.00 |   13.00 |   17.00 |   13.00 |   10.00 |\n",
 365 |        "\n"
 366 |       ],
 367 |       "text/plain": [
 368 |        "                   VK2_NanH2_PolyB25_1h_5 VK2_NanH2_PolyB25_1h_6\n",
 369 |        "ENSG00000000003.15  502.00                 416.00               \n",
 370 |        "ENSG00000000005.6     0.00                   0.00               \n",
 371 |        "ENSG00000000419.14 2340.00                2232.04               \n",
 372 |        "ENSG00000000457.14  153.65                  94.84               \n",
 373 |        "ENSG00000000460.17  254.35                 200.16               \n",
 374 |        "ENSG00000000938.13   10.00                   6.00               \n",
 375 |        "                   VK2_NanH2_PolyB25_1h_7 VK2_NanH2_PolyB25_1h_8\n",
 376 |        "ENSG00000000003.15  424.00                 497.00               \n",
 377 |        "ENSG00000000005.6     0.00                   0.00               \n",
 378 |        "ENSG00000000419.14 2253.00                2573.36               \n",
 379 |        "ENSG00000000457.14  112.33                 130.23               \n",
 380 |        "ENSG00000000460.17  242.67                 292.77               \n",
 381 |        "ENSG00000000938.13    6.00                   9.00               \n",
 382 |        "                   VK2_NanH2_PolyB25_2h_1 VK2_NanH2_PolyB25_2h_2\n",
 383 |        "ENSG00000000003.15  356.00                 542.00               \n",
 384 |        "ENSG00000000005.6     0.00                   0.00               \n",
 385 |        "ENSG00000000419.14 2080.11                3080.00               \n",
 386 |        "ENSG00000000457.14  113.88                 119.35               \n",
 387 |        "ENSG00000000460.17  181.12                 285.65               \n",
 388 |        "ENSG00000000938.13    4.00                  21.00               \n",
 389 |        "                   VK2_NanH2_PolyB25_2h_3 VK2_NanH2_PolyB25_2h_4\n",
 390 |        "ENSG00000000003.15  364.00                 299.00               \n",
 391 |        "ENSG00000000005.6     0.00                   0.00               \n",
 392 |        "ENSG00000000419.14 2206.80                1721.00               \n",
 393 |        "ENSG00000000457.14   85.84                 102.65               \n",
 394 |        "ENSG00000000460.17  181.16                 209.35               \n",
 395 |        "ENSG00000000938.13   10.00                  18.00               \n",
 396 |        "                   VK2_Pet28a_PolyB25_2h_10 VK2_Pet28a_PolyB25_2h_11\n",
 397 |        "ENSG00000000003.15  387.00                   465.00                 \n",
 398 |        "ENSG00000000005.6     0.00                     0.00                 \n",
 399 |        "ENSG00000000419.14 2084.00                  2412.42                 \n",
 400 |        "ENSG00000000457.14   98.34                   120.36                 \n",
 401 |        "ENSG00000000460.17  189.66                   190.64                 \n",
 402 |        "ENSG00000000938.13   13.00                    17.00                 \n",
 403 |        "                   VK2_Pet28a_PolyB25_2h_12 VK2_Pet28a_PolyB25_2h_9\n",
 404 |        "ENSG00000000003.15  366.00                   323.00                \n",
 405 |        "ENSG00000000005.6     0.00                     0.00                \n",
 406 |        "ENSG00000000419.14 2170.07                  1904.05                \n",
 407 |        "ENSG00000000457.14   91.56                   119.77                \n",
 408 |        "ENSG00000000460.17  204.44                   234.23                \n",
 409 |        "ENSG00000000938.13   13.00                    10.00                "
 410 |       ]
 411 |      },
 412 |      "metadata": {},
 413 |      "output_type": "display_data"
 414 |     }
 415 |    ],
 416 |    "source": [
 417 |     "head(gUnorderedGeneCountsDf)"
 418 |    ]
 419 |   },
 420 |   {
 421 |    "cell_type": "code",
 422 |    "execution_count": 53,
 423 |    "metadata": {},
 424 |    "outputs": [],
 425 |    "source": [
 426 |     "detectParRecords = function(geneCountsDf){\n",
 427 |     "    gene_names <- rownames(geneCountsDf)\n",
 428 |     "    PAR_genes <- gene_names[grep(\"_PAR_\", gene_names)] \n",
 429 |     "    if (length(PAR_genes) == 0){\n",
 430 |     "        print(\"No PAR genes detected; analysis can proceed.\")\n",
 431 |     "    } else {\n",
 432 |     "        print(\"ERROR: PAR genes found.  These must be removed before continuing analysis.\")\n",
 433 |     "    }\n",
 434 |     "    return(PAR_genes)\n",
 435 |     "}"
 436 |    ]
 437 |   },
 438 |   {
 439 |    "cell_type": "code",
 440 |    "execution_count": 54,
 441 |    "metadata": {},
 442 |    "outputs": [
 443 |     {
 444 |      "name": "stdout",
 445 |      "output_type": "stream",
 446 |      "text": [
 447 |       "[1] \"No PAR genes detected; analysis can proceed.\"\n"
 448 |      ]
 449 |     },
 450 |     {
 451 |      "data": {
 452 |       "text/html": [],
 453 |       "text/latex": [],
 454 |       "text/markdown": [],
 455 |       "text/plain": [
 456 |        "character(0)"
 457 |       ]
 458 |      },
 459 |      "metadata": {},
 460 |      "output_type": "display_data"
 461 |     }
 462 |    ],
 463 |    "source": [
 464 |     "detectParRecords(gUnorderedGeneCountsDf)"
 465 |    ]
 466 |   },
 467 |   {
 468 |    "cell_type": "markdown",
 469 |    "metadata": {},
 470 |    "source": [
 471 |     "No assumption is made that the columns (samples) of the gene count file are currently ordered in the order desirable for the differential expression analysis."
 472 |    ]
 473 |   },
 474 |   {
 475 |    "cell_type": "markdown",
 476 |    "metadata": {},
 477 |    "source": [
 478 |     "[Table of Contents](#Table-of-Contents)\n",
 479 |     "\n",
 480 |     "### Metadata\n",
 481 |     "\n",
 482 |     "> For downstream analysis, sample-level information related to the experimental design needs to be associated with the columns of the counts matrix. This should include experimental variables, both biological and technical, that could have an effect on expression levels. Examples [could] include cell type (basal, LP and ML in this experiment), genotype (wild-type, knock-out), phenotype (disease status, sex, age), sample treatment (drug, control) and batch information (date experiment was performed if samples were collected and analysed at distinct time points) to name just a few. ([1](#Citations))\n",
 483 |     "\n",
 484 |     "Import a metadata file in which rows are sample identifiers, columns are metadata features (e.g., subject id, time point, etc) and row/column intersections contain the value of the relevant feature for the relevant sample:"
 485 |    ]
 486 |   },
 487 |   {
 488 |    "cell_type": "code",
 489 |    "execution_count": 55,
 490 |    "metadata": {},
 491 |    "outputs": [
 492 |     {
 493 |      "data": {
 494 |       "text/html": [
 495 |        "<style>\n",
 496 |        ".list-inline {list-style: none; margin:0; padding: 0}\n",
 497 |        ".list-inline>li {display: inline-block}\n",
 498 |        ".list-inline>li:not(:last-child)::after {content: \"\\00b7\"; padding: 0 .5ex}\n",
 499 |        "</style>\n",
 500 |        "<ol class=list-inline><li>12</li><li>6</li></ol>\n"
 501 |       ],
 502 |       "text/latex": [
 503 |        "\\begin{enumerate*}\n",
 504 |        "\\item 12\n",
 505 |        "\\item 6\n",
 506 |        "\\end{enumerate*}\n"
 507 |       ],
 508 |       "text/markdown": [
 509 |        "1. 12\n",
 510 |        "2. 6\n",
 511 |        "\n",
 512 |        "\n"
 513 |       ],
 514 |       "text/plain": [
 515 |        "[1] 12  6"
 516 |       ]
 517 |      },
 518 |      "metadata": {},
 519 |      "output_type": "display_data"
 520 |     }
 521 |    ],
 522 |    "source": [
 523 |     "#Read in metadata\n",
 524 |     "gMetadataDf <- read.csv(gMetadataFp, stringsAsFactors=FALSE, sep = \"\\t\")\n",
 525 |     "dim(gMetadataDf)"
 526 |    ]
 527 |   },
 528 |   {
 529 |    "cell_type": "code",
 530 |    "execution_count": 56,
 531 |    "metadata": {},
 532 |    "outputs": [
 533 |     {
 534 |      "data": {
 535 |       "text/html": [
 536 |        "<table class=\"dataframe\">\n",
 537 |        "<caption>A data.frame: 6 × 6</caption>\n",
 538 |        "<thead>\n",
 539 |        "\t<tr><th></th><th scope=col>Sample.Name</th><th scope=col>Sample.Code</th><th scope=col>Reference.Genome..Organism.</th><th scope=col>Gender</th><th scope=col>Condition</th><th scope=col>Time.point</th></tr>\n",
 540 |        "\t<tr><th></th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th></tr>\n",
 541 |        "</thead>\n",
 542 |        "<tbody>\n",
 543 |        "\t<tr><th scope=row>1</th><td>VK2_NanH2_PolyB25_2h_1</td><td>K001</td><td>Human</td><td>Female</td><td>NanH2 treated</td><td>2 hour</td></tr>\n",
 544 |        "\t<tr><th scope=row>2</th><td>VK2_NanH2_PolyB25_2h_2</td><td>K002</td><td>Human</td><td>Female</td><td>NanH2 treated</td><td>2 hour</td></tr>\n",
 545 |        "\t<tr><th scope=row>3</th><td>VK2_NanH2_PolyB25_2h_3</td><td>K003</td><td>Human</td><td>Female</td><td>NanH2 treated</td><td>2 hour</td></tr>\n",
 546 |        "\t<tr><th scope=row>4</th><td>VK2_NanH2_PolyB25_2h_4</td><td>K004</td><td>Human</td><td>Female</td><td>NanH2 treated</td><td>2 hour</td></tr>\n",
 547 |        "\t<tr><th scope=row>5</th><td>VK2_NanH2_PolyB25_1h_5</td><td>K005</td><td>Human</td><td>Female</td><td>NanH2 treated</td><td>1 hour</td></tr>\n",
 548 |        "\t<tr><th scope=row>6</th><td>VK2_NanH2_PolyB25_1h_6</td><td>K006</td><td>Human</td><td>Female</td><td>NanH2 treated</td><td>1 hour</td></tr>\n",
 549 |        "</tbody>\n",
 550 |        "</table>\n"
 551 |       ],
 552 |       "text/latex": [
 553 |        "A data.frame: 6 × 6\n",
 554 |        "\\begin{tabular}{r|llllll}\n",
 555 |        "  & Sample.Name & Sample.Code & Reference.Genome..Organism. & Gender & Condition & Time.point\\\\\n",
 556 |        "  & <chr> & <chr> & <chr> & <chr> & <chr> & <chr>\\\\\n",
 557 |        "\\hline\n",
 558 |        "\t1 & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & K001 & Human & Female & NanH2 treated & 2 hour\\\\\n",
 559 |        "\t2 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & K002 & Human & Female & NanH2 treated & 2 hour\\\\\n",
 560 |        "\t3 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & K003 & Human & Female & NanH2 treated & 2 hour\\\\\n",
 561 |        "\t4 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & K004 & Human & Female & NanH2 treated & 2 hour\\\\\n",
 562 |        "\t5 & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & K005 & Human & Female & NanH2 treated & 1 hour\\\\\n",
 563 |        "\t6 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & K006 & Human & Female & NanH2 treated & 1 hour\\\\\n",
 564 |        "\\end{tabular}\n"
 565 |       ],
 566 |       "text/markdown": [
 567 |        "\n",
 568 |        "A data.frame: 6 × 6\n",
 569 |        "\n",
 570 |        "| <!--/--> | Sample.Name &lt;chr&gt; | Sample.Code &lt;chr&gt; | Reference.Genome..Organism. &lt;chr&gt; | Gender &lt;chr&gt; | Condition &lt;chr&gt; | Time.point &lt;chr&gt; |\n",
 571 |        "|---|---|---|---|---|---|---|\n",
 572 |        "| 1 | VK2_NanH2_PolyB25_2h_1 | K001 | Human | Female | NanH2 treated | 2 hour |\n",
 573 |        "| 2 | VK2_NanH2_PolyB25_2h_2 | K002 | Human | Female | NanH2 treated | 2 hour |\n",
 574 |        "| 3 | VK2_NanH2_PolyB25_2h_3 | K003 | Human | Female | NanH2 treated | 2 hour |\n",
 575 |        "| 4 | VK2_NanH2_PolyB25_2h_4 | K004 | Human | Female | NanH2 treated | 2 hour |\n",
 576 |        "| 5 | VK2_NanH2_PolyB25_1h_5 | K005 | Human | Female | NanH2 treated | 1 hour |\n",
 577 |        "| 6 | VK2_NanH2_PolyB25_1h_6 | K006 | Human | Female | NanH2 treated | 1 hour |\n",
 578 |        "\n"
 579 |       ],
 580 |       "text/plain": [
 581 |        "  Sample.Name            Sample.Code Reference.Genome..Organism. Gender\n",
 582 |        "1 VK2_NanH2_PolyB25_2h_1 K001        Human                       Female\n",
 583 |        "2 VK2_NanH2_PolyB25_2h_2 K002        Human                       Female\n",
 584 |        "3 VK2_NanH2_PolyB25_2h_3 K003        Human                       Female\n",
 585 |        "4 VK2_NanH2_PolyB25_2h_4 K004        Human                       Female\n",
 586 |        "5 VK2_NanH2_PolyB25_1h_5 K005        Human                       Female\n",
 587 |        "6 VK2_NanH2_PolyB25_1h_6 K006        Human                       Female\n",
 588 |        "  Condition     Time.point\n",
 589 |        "1 NanH2 treated 2 hour    \n",
 590 |        "2 NanH2 treated 2 hour    \n",
 591 |        "3 NanH2 treated 2 hour    \n",
 592 |        "4 NanH2 treated 2 hour    \n",
 593 |        "5 NanH2 treated 1 hour    \n",
 594 |        "6 NanH2 treated 1 hour    "
 595 |       ]
 596 |      },
 597 |      "metadata": {},
 598 |      "output_type": "display_data"
 599 |     }
 600 |    ],
 601 |    "source": [
 602 |     "head(gMetadataDf)"
 603 |    ]
 604 |   },
 605 |   {
 606 |    "cell_type": "code",
 607 |    "execution_count": 59,
 608 |    "metadata": {},
 609 |    "outputs": [
 610 |     {
 611 |      "data": {
 612 |       "text/html": [
 613 |        "<table class=\"dataframe\">\n",
 614 |        "<caption>A data.frame: 6 × 7</caption>\n",
 615 |        "<thead>\n",
 616 |        "\t<tr><th></th><th scope=col>Sample.Name</th><th scope=col>Sample.Code</th><th scope=col>Reference.Genome..Organism.</th><th scope=col>Gender</th><th scope=col>Condition</th><th scope=col>Time.point</th><th scope=col>Condition_Time</th></tr>\n",
 617 |        "\t<tr><th></th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th></tr>\n",
 618 |        "</thead>\n",
 619 |        "<tbody>\n",
 620 |        "\t<tr><th scope=row>1</th><td>VK2_NanH2_PolyB25_2h_1</td><td>K001</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>2H</td><td>NanH2_treated_2H</td></tr>\n",
 621 |        "\t<tr><th scope=row>2</th><td>VK2_NanH2_PolyB25_2h_2</td><td>K002</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>2H</td><td>NanH2_treated_2H</td></tr>\n",
 622 |        "\t<tr><th scope=row>3</th><td>VK2_NanH2_PolyB25_2h_3</td><td>K003</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>2H</td><td>NanH2_treated_2H</td></tr>\n",
 623 |        "\t<tr><th scope=row>4</th><td>VK2_NanH2_PolyB25_2h_4</td><td>K004</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>2H</td><td>NanH2_treated_2H</td></tr>\n",
 624 |        "\t<tr><th scope=row>5</th><td>VK2_NanH2_PolyB25_1h_5</td><td>K005</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>1H</td><td>NanH2_treated_1H</td></tr>\n",
 625 |        "\t<tr><th scope=row>6</th><td>VK2_NanH2_PolyB25_1h_6</td><td>K006</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>1H</td><td>NanH2_treated_1H</td></tr>\n",
 626 |        "</tbody>\n",
 627 |        "</table>\n"
 628 |       ],
 629 |       "text/latex": [
 630 |        "A data.frame: 6 × 7\n",
 631 |        "\\begin{tabular}{r|lllllll}\n",
 632 |        "  & Sample.Name & Sample.Code & Reference.Genome..Organism. & Gender & Condition & Time.point & Condition\\_Time\\\\\n",
 633 |        "  & <chr> & <chr> & <chr> & <chr> & <chr> & <chr> & <chr>\\\\\n",
 634 |        "\\hline\n",
 635 |        "\t1 & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & K001 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n",
 636 |        "\t2 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & K002 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n",
 637 |        "\t3 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & K003 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n",
 638 |        "\t4 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & K004 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n",
 639 |        "\t5 & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & K005 & Human & Female & NanH2\\_treated & 1H & NanH2\\_treated\\_1H\\\\\n",
 640 |        "\t6 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & K006 & Human & Female & NanH2\\_treated & 1H & NanH2\\_treated\\_1H\\\\\n",
 641 |        "\\end{tabular}\n"
 642 |       ],
 643 |       "text/markdown": [
 644 |        "\n",
 645 |        "A data.frame: 6 × 7\n",
 646 |        "\n",
 647 |        "| <!--/--> | Sample.Name &lt;chr&gt; | Sample.Code &lt;chr&gt; | Reference.Genome..Organism. &lt;chr&gt; | Gender &lt;chr&gt; | Condition &lt;chr&gt; | Time.point &lt;chr&gt; | Condition_Time &lt;chr&gt; |\n",
 648 |        "|---|---|---|---|---|---|---|---|\n",
 649 |        "| 1 | VK2_NanH2_PolyB25_2h_1 | K001 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n",
 650 |        "| 2 | VK2_NanH2_PolyB25_2h_2 | K002 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n",
 651 |        "| 3 | VK2_NanH2_PolyB25_2h_3 | K003 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n",
 652 |        "| 4 | VK2_NanH2_PolyB25_2h_4 | K004 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n",
 653 |        "| 5 | VK2_NanH2_PolyB25_1h_5 | K005 | Human | Female | NanH2_treated | 1H | NanH2_treated_1H |\n",
 654 |        "| 6 | VK2_NanH2_PolyB25_1h_6 | K006 | Human | Female | NanH2_treated | 1H | NanH2_treated_1H |\n",
 655 |        "\n"
 656 |       ],
 657 |       "text/plain": [
 658 |        "  Sample.Name            Sample.Code Reference.Genome..Organism. Gender\n",
 659 |        "1 VK2_NanH2_PolyB25_2h_1 K001        Human                       Female\n",
 660 |        "2 VK2_NanH2_PolyB25_2h_2 K002        Human                       Female\n",
 661 |        "3 VK2_NanH2_PolyB25_2h_3 K003        Human                       Female\n",
 662 |        "4 VK2_NanH2_PolyB25_2h_4 K004        Human                       Female\n",
 663 |        "5 VK2_NanH2_PolyB25_1h_5 K005        Human                       Female\n",
 664 |        "6 VK2_NanH2_PolyB25_1h_6 K006        Human                       Female\n",
 665 |        "  Condition     Time.point Condition_Time  \n",
 666 |        "1 NanH2_treated 2H         NanH2_treated_2H\n",
 667 |        "2 NanH2_treated 2H         NanH2_treated_2H\n",
 668 |        "3 NanH2_treated 2H         NanH2_treated_2H\n",
 669 |        "4 NanH2_treated 2H         NanH2_treated_2H\n",
 670 |        "5 NanH2_treated 1H         NanH2_treated_1H\n",
 671 |        "6 NanH2_treated 1H         NanH2_treated_1H"
 672 |       ]
 673 |      },
 674 |      "metadata": {},
 675 |      "output_type": "display_data"
 676 |     }
 677 |    ],
 678 |    "source": [
 679 |     "#remove spaces\n",
 680 |     "gMetadataDf$Condition <- gsub(\" \", \"_\", gMetadataDf$Condition)\n",
 681 |     "gMetadataDf$Time.point <- gsub(\"2 hour\", \"2H\", gMetadataDf$Time.point)\n",
 682 |     "gMetadataDf$Time.point <- gsub(\"1 hour\", \"1H\", gMetadataDf$Time.point)\n",
 683 |     "\n",
 684 |     "\n",
 685 |     "#add another col\n",
 686 |     "gMetadataDf$Condition_Time <- paste(gMetadataDf$Condition, gMetadataDf$Time.point, sep=\"_\")\n",
 687 |     "head(gMetadataDf)"
 688 |    ]
 689 |   },
 690 |   {
 691 |    "cell_type": "code",
 692 |    "execution_count": 60,
 693 |    "metadata": {},
 694 |    "outputs": [],
 695 |    "source": [
 696 |     "gSampleNames = gMetadataDf[[\"Sample.Name\"]]"
 697 |    ]
 698 |   },
 699 |   {
 700 |    "cell_type": "markdown",
 701 |    "metadata": {},
 702 |    "source": [
 703 |     "Check the dimensions of the count data and the metadata to ensure that the count dataframe has the same number of columns (samples) as the metadata dataframe has rows (again, samples), and that the sample names are the same in both: "
 704 |    ]
 705 |   },
 706 |   {
 707 |    "cell_type": "code",
 708 |    "execution_count": 61,
 709 |    "metadata": {},
 710 |    "outputs": [
 711 |     {
 712 |      "data": {
 713 |       "text/html": [
 714 |        "<style>\n",
 715 |        ".list-inline {list-style: none; margin:0; padding: 0}\n",
 716 |        ".list-inline>li {display: inline-block}\n",
 717 |        ".list-inline>li:not(:last-child)::after {content: \"\\00b7\"; padding: 0 .5ex}\n",
 718 |        "</style>\n",
 719 |        "<ol class=list-inline><li>60605</li><li>12</li></ol>\n"
 720 |       ],
 721 |       "text/latex": [
 722 |        "\\begin{enumerate*}\n",
 723 |        "\\item 60605\n",
 724 |        "\\item 12\n",
 725 |        "\\end{enumerate*}\n"
 726 |       ],
 727 |       "text/markdown": [
 728 |        "1. 60605\n",
 729 |        "2. 12\n",
 730 |        "\n",
 731 |        "\n"
 732 |       ],
 733 |       "text/plain": [
 734 |        "[1] 60605    12"
 735 |       ]
 736 |      },
 737 |      "metadata": {},
 738 |      "output_type": "display_data"
 739 |     },
 740 |     {
 741 |      "data": {
 742 |       "text/html": [
 743 |        "<style>\n",
 744 |        ".list-inline {list-style: none; margin:0; padding: 0}\n",
 745 |        ".list-inline>li {display: inline-block}\n",
 746 |        ".list-inline>li:not(:last-child)::after {content: \"\\00b7\"; padding: 0 .5ex}\n",
 747 |        "</style>\n",
 748 |        "<ol class=list-inline><li>12</li><li>7</li></ol>\n"
 749 |       ],
 750 |       "text/latex": [
 751 |        "\\begin{enumerate*}\n",
 752 |        "\\item 12\n",
 753 |        "\\item 7\n",
 754 |        "\\end{enumerate*}\n"
 755 |       ],
 756 |       "text/markdown": [
 757 |        "1. 12\n",
 758 |        "2. 7\n",
 759 |        "\n",
 760 |        "\n"
 761 |       ],
 762 |       "text/plain": [
 763 |        "[1] 12  7"
 764 |       ]
 765 |      },
 766 |      "metadata": {},
 767 |      "output_type": "display_data"
 768 |     },
 769 |     {
 770 |      "data": {
 771 |       "text/html": [
 772 |        "TRUE"
 773 |       ],
 774 |       "text/latex": [
 775 |        "TRUE"
 776 |       ],
 777 |       "text/markdown": [
 778 |        "TRUE"
 779 |       ],
 780 |       "text/plain": [
 781 |        "[1] TRUE"
 782 |       ]
 783 |      },
 784 |      "metadata": {},
 785 |      "output_type": "display_data"
 786 |     }
 787 |    ],
 788 |    "source": [
 789 |     "dim(gUnorderedGeneCountsDf)\n",
 790 |     "dim(gMetadataDf)\n",
 791 |     "\n",
 792 |     "all(colnames(gUnorderedGeneCountsDf) %in% gSampleNames)"
 793 |    ]
 794 |   },
 795 |   {
 796 |    "cell_type": "markdown",
 797 |    "metadata": {},
 798 |    "source": [
 799 |     "Assume that the order of the samples shown in the metadata is the desired order, and reorder the columns in the counts table to match it:"
 800 |    ]
 801 |   },
 802 |   {
 803 |    "cell_type": "code",
 804 |    "execution_count": 62,
 805 |    "metadata": {},
 806 |    "outputs": [
 807 |     {
 808 |      "data": {
 809 |       "text/html": [
 810 |        "<table class=\"dataframe\">\n",
 811 |        "<caption>A data.frame: 6 × 12</caption>\n",
 812 |        "<thead>\n",
 813 |        "\t<tr><th></th><th scope=col>VK2_NanH2_PolyB25_2h_1</th><th scope=col>VK2_NanH2_PolyB25_2h_2</th><th scope=col>VK2_NanH2_PolyB25_2h_3</th><th scope=col>VK2_NanH2_PolyB25_2h_4</th><th scope=col>VK2_NanH2_PolyB25_1h_5</th><th scope=col>VK2_NanH2_PolyB25_1h_6</th><th scope=col>VK2_NanH2_PolyB25_1h_7</th><th scope=col>VK2_NanH2_PolyB25_1h_8</th><th scope=col>VK2_Pet28a_PolyB25_2h_9</th><th scope=col>VK2_Pet28a_PolyB25_2h_10</th><th scope=col>VK2_Pet28a_PolyB25_2h_11</th><th scope=col>VK2_Pet28a_PolyB25_2h_12</th></tr>\n",
 814 |        "\t<tr><th></th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
 815 |        "</thead>\n",
 816 |        "<tbody>\n",
 817 |        "\t<tr><th scope=row>ENSG00000000003.15</th><td> 356.00</td><td> 542.00</td><td> 364.00</td><td> 299.00</td><td> 502.00</td><td> 416.00</td><td> 424.00</td><td> 497.00</td><td> 323.00</td><td> 387.00</td><td> 465.00</td><td> 366.00</td></tr>\n",
 818 |        "\t<tr><th scope=row>ENSG00000000005.6</th><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td></tr>\n",
 819 |        "\t<tr><th scope=row>ENSG00000000419.14</th><td>2080.11</td><td>3080.00</td><td>2206.80</td><td>1721.00</td><td>2340.00</td><td>2232.04</td><td>2253.00</td><td>2573.36</td><td>1904.05</td><td>2084.00</td><td>2412.42</td><td>2170.07</td></tr>\n",
 820 |        "\t<tr><th scope=row>ENSG00000000457.14</th><td> 113.88</td><td> 119.35</td><td>  85.84</td><td> 102.65</td><td> 153.65</td><td>  94.84</td><td> 112.33</td><td> 130.23</td><td> 119.77</td><td>  98.34</td><td> 120.36</td><td>  91.56</td></tr>\n",
 821 |        "\t<tr><th scope=row>ENSG00000000460.17</th><td> 181.12</td><td> 285.65</td><td> 181.16</td><td> 209.35</td><td> 254.35</td><td> 200.16</td><td> 242.67</td><td> 292.77</td><td> 234.23</td><td> 189.66</td><td> 190.64</td><td> 204.44</td></tr>\n",
 822 |        "\t<tr><th scope=row>ENSG00000000938.13</th><td>   4.00</td><td>  21.00</td><td>  10.00</td><td>  18.00</td><td>  10.00</td><td>   6.00</td><td>   6.00</td><td>   9.00</td><td>  10.00</td><td>  13.00</td><td>  17.00</td><td>  13.00</td></tr>\n",
 823 |        "</tbody>\n",
 824 |        "</table>\n"
 825 |       ],
 826 |       "text/latex": [
 827 |        "A data.frame: 6 × 12\n",
 828 |        "\\begin{tabular}{r|llllllllllll}\n",
 829 |        "  & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & VK2\\_NanH2\\_PolyB25\\_1h\\_7 & VK2\\_NanH2\\_PolyB25\\_1h\\_8 & VK2\\_Pet28a\\_PolyB25\\_2h\\_9 & VK2\\_Pet28a\\_PolyB25\\_2h\\_10 & VK2\\_Pet28a\\_PolyB25\\_2h\\_11 & VK2\\_Pet28a\\_PolyB25\\_2h\\_12\\\\\n",
 830 |        "  & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl>\\\\\n",
 831 |        "\\hline\n",
 832 |        "\tENSG00000000003.15 &  356.00 &  542.00 &  364.00 &  299.00 &  502.00 &  416.00 &  424.00 &  497.00 &  323.00 &  387.00 &  465.00 &  366.00\\\\\n",
 833 |        "\tENSG00000000005.6 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00\\\\\n",
 834 |        "\tENSG00000000419.14 & 2080.11 & 3080.00 & 2206.80 & 1721.00 & 2340.00 & 2232.04 & 2253.00 & 2573.36 & 1904.05 & 2084.00 & 2412.42 & 2170.07\\\\\n",
 835 |        "\tENSG00000000457.14 &  113.88 &  119.35 &   85.84 &  102.65 &  153.65 &   94.84 &  112.33 &  130.23 &  119.77 &   98.34 &  120.36 &   91.56\\\\\n",
 836 |        "\tENSG00000000460.17 &  181.12 &  285.65 &  181.16 &  209.35 &  254.35 &  200.16 &  242.67 &  292.77 &  234.23 &  189.66 &  190.64 &  204.44\\\\\n",
 837 |        "\tENSG00000000938.13 &    4.00 &   21.00 &   10.00 &   18.00 &   10.00 &    6.00 &    6.00 &    9.00 &   10.00 &   13.00 &   17.00 &   13.00\\\\\n",
 838 |        "\\end{tabular}\n"
 839 |       ],
 840 |       "text/markdown": [
 841 |        "\n",
 842 |        "A data.frame: 6 × 12\n",
 843 |        "\n",
 844 |        "| <!--/--> | VK2_NanH2_PolyB25_2h_1 &lt;dbl&gt; | VK2_NanH2_PolyB25_2h_2 &lt;dbl&gt; | VK2_NanH2_PolyB25_2h_3 &lt;dbl&gt; | VK2_NanH2_PolyB25_2h_4 &lt;dbl&gt; | VK2_NanH2_PolyB25_1h_5 &lt;dbl&gt; | VK2_NanH2_PolyB25_1h_6 &lt;dbl&gt; | VK2_NanH2_PolyB25_1h_7 &lt;dbl&gt; | VK2_NanH2_PolyB25_1h_8 &lt;dbl&gt; | VK2_Pet28a_PolyB25_2h_9 &lt;dbl&gt; | VK2_Pet28a_PolyB25_2h_10 &lt;dbl&gt; | VK2_Pet28a_PolyB25_2h_11 &lt;dbl&gt; | VK2_Pet28a_PolyB25_2h_12 &lt;dbl&gt; |\n",
 845 |        "|---|---|---|---|---|---|---|---|---|---|---|---|---|\n",
 846 |        "| ENSG00000000003.15 |  356.00 |  542.00 |  364.00 |  299.00 |  502.00 |  416.00 |  424.00 |  497.00 |  323.00 |  387.00 |  465.00 |  366.00 |\n",
 847 |        "| ENSG00000000005.6 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |\n",
 848 |        "| ENSG00000000419.14 | 2080.11 | 3080.00 | 2206.80 | 1721.00 | 2340.00 | 2232.04 | 2253.00 | 2573.36 | 1904.05 | 2084.00 | 2412.42 | 2170.07 |\n",
 849 |        "| ENSG00000000457.14 |  113.88 |  119.35 |   85.84 |  102.65 |  153.65 |   94.84 |  112.33 |  130.23 |  119.77 |   98.34 |  120.36 |   91.56 |\n",
 850 |        "| ENSG00000000460.17 |  181.12 |  285.65 |  181.16 |  209.35 |  254.35 |  200.16 |  242.67 |  292.77 |  234.23 |  189.66 |  190.64 |  204.44 |\n",
 851 |        "| ENSG00000000938.13 |    4.00 |   21.00 |   10.00 |   18.00 |   10.00 |    6.00 |    6.00 |    9.00 |   10.00 |   13.00 |   17.00 |   13.00 |\n",
 852 |        "\n"
 853 |       ],
 854 |       "text/plain": [
 855 |        "                   VK2_NanH2_PolyB25_2h_1 VK2_NanH2_PolyB25_2h_2\n",
 856 |        "ENSG00000000003.15  356.00                 542.00               \n",
 857 |        "ENSG00000000005.6     0.00                   0.00               \n",
 858 |        "ENSG00000000419.14 2080.11                3080.00               \n",
 859 |        "ENSG00000000457.14  113.88                 119.35               \n",
 860 |        "ENSG00000000460.17  181.12                 285.65               \n",
 861 |        "ENSG00000000938.13    4.00                  21.00               \n",
 862 |        "                   VK2_NanH2_PolyB25_2h_3 VK2_NanH2_PolyB25_2h_4\n",
 863 |        "ENSG00000000003.15  364.00                 299.00               \n",
 864 |        "ENSG00000000005.6     0.00                   0.00               \n",
 865 |        "ENSG00000000419.14 2206.80                1721.00               \n",
 866 |        "ENSG00000000457.14   85.84                 102.65               \n",
 867 |        "ENSG00000000460.17  181.16                 209.35               \n",
 868 |        "ENSG00000000938.13   10.00                  18.00               \n",
 869 |        "                   VK2_NanH2_PolyB25_1h_5 VK2_NanH2_PolyB25_1h_6\n",
 870 |        "ENSG00000000003.15  502.00                 416.00               \n",
 871 |        "ENSG00000000005.6     0.00                   0.00               \n",
 872 |        "ENSG00000000419.14 2340.00                2232.04               \n",
 873 |        "ENSG00000000457.14  153.65                  94.84               \n",
 874 |        "ENSG00000000460.17  254.35                 200.16               \n",
 875 |        "ENSG00000000938.13   10.00                   6.00               \n",
 876 |        "                   VK2_NanH2_PolyB25_1h_7 VK2_NanH2_PolyB25_1h_8\n",
 877 |        "ENSG00000000003.15  424.00                 497.00               \n",
 878 |        "ENSG00000000005.6     0.00                   0.00               \n",
 879 |        "ENSG00000000419.14 2253.00                2573.36               \n",
 880 |        "ENSG00000000457.14  112.33                 130.23               \n",
 881 |        "ENSG00000000460.17  242.67                 292.77               \n",
 882 |        "ENSG00000000938.13    6.00                   9.00               \n",
 883 |        "                   VK2_Pet28a_PolyB25_2h_9 VK2_Pet28a_PolyB25_2h_10\n",
 884 |        "ENSG00000000003.15  323.00                  387.00                 \n",
 885 |        "ENSG00000000005.6     0.00                    0.00                 \n",
 886 |        "ENSG00000000419.14 1904.05                 2084.00                 \n",
 887 |        "ENSG00000000457.14  119.77                   98.34                 \n",
 888 |        "ENSG00000000460.17  234.23                  189.66                 \n",
 889 |        "ENSG00000000938.13   10.00                   13.00                 \n",
 890 |        "                   VK2_Pet28a_PolyB25_2h_11 VK2_Pet28a_PolyB25_2h_12\n",
 891 |        "ENSG00000000003.15  465.00                   366.00                 \n",
 892 |        "ENSG00000000005.6     0.00                     0.00                 \n",
 893 |        "ENSG00000000419.14 2412.42                  2170.07                 \n",
 894 |        "ENSG00000000457.14  120.36                    91.56                 \n",
 895 |        "ENSG00000000460.17  190.64                   204.44                 \n",
 896 |        "ENSG00000000938.13   17.00                    13.00                 "
 897 |       ]
 898 |      },
 899 |      "metadata": {},
 900 |      "output_type": "display_data"
 901 |     }
 902 |    ],
 903 |    "source": [
 904 |     "gGeneCountsDf = gUnorderedGeneCountsDf[gSampleNames]\n",
 905 |     "head(gGeneCountsDf)"
 906 |    ]
 907 |   },
 908 |   {
 909 |    "cell_type": "markdown",
 910 |    "metadata": {},
 911 |    "source": [
 912 |     "If the count file gene identifiers do NOT include version numbers (e.g., the \".4\" part in a gene identifier like \"ENSG00000268020.4\"), then it is necessary to truncate the version information from the public annotation data to be used below in order to match the annotation data gene identifiers to the count file gene identifiers.  Set the flag for version removal accordingly:"
 913 |    ]
 914 |   },
 915 |   {
 916 |    "cell_type": "code",
 917 |    "execution_count": 63,
 918 |    "metadata": {},
 919 |    "outputs": [],
 920 |    "source": [
 921 |     "gRemoveVersion <- FALSE\n"
 922 |    ]
 923 |   },
 924 |   {
 925 |    "cell_type": "markdown",
 926 |    "metadata": {},
 927 |    "source": [
 928 |     "[Table of Contents](#Table-of-Contents)\n",
 929 |     "\n",
 930 |     "###  Annotations\n",
 931 |     "\n",
 932 |     "If a previously created file of the gene annotations has been provided, load it:"
 933 |    ]
 934 |   },
 935 |   {
 936 |    "cell_type": "code",
 937 |    "execution_count": 64,
 938 |    "metadata": {},
 939 |    "outputs": [
 940 |     {
 941 |      "data": {
 942 |       "text/html": [
 943 |        "<table class=\"dataframe\">\n",
 944 |        "<caption>A data.frame: 6 × 3</caption>\n",
 945 |        "<thead>\n",
 946 |        "\t<tr><th></th><th scope=col>gene_type</th><th scope=col>gene_id</th><th scope=col>transcript_id</th></tr>\n",
 947 |        "\t<tr><th></th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th></tr>\n",
 948 |        "</thead>\n",
 949 |        "<tbody>\n",
 950 |        "\t<tr><th scope=row>1</th><td>transcribed_unprocessed_pseudogene</td><td>ENSG00000223972.5</td><td>ENST00000456328.2</td></tr>\n",
 951 |        "\t<tr><th scope=row>2</th><td>transcribed_unprocessed_pseudogene</td><td>ENSG00000223972.5</td><td>ENST00000450305.2</td></tr>\n",
 952 |        "\t<tr><th scope=row>3</th><td>unprocessed_pseudogene            </td><td>ENSG00000227232.5</td><td>ENST00000488147.1</td></tr>\n",
 953 |        "\t<tr><th scope=row>4</th><td>miRNA                             </td><td>ENSG00000278267.1</td><td>ENST00000619216.1</td></tr>\n",
 954 |        "\t<tr><th scope=row>5</th><td>lncRNA                            </td><td>ENSG00000243485.5</td><td>ENST00000473358.1</td></tr>\n",
 955 |        "\t<tr><th scope=row>6</th><td>lncRNA                            </td><td>ENSG00000243485.5</td><td>ENST00000469289.1</td></tr>\n",
 956 |        "</tbody>\n",
 957 |        "</table>\n"
 958 |       ],
 959 |       "text/latex": [
 960 |        "A data.frame: 6 × 3\n",
 961 |        "\\begin{tabular}{r|lll}\n",
 962 |        "  & gene\\_type & gene\\_id & transcript\\_id\\\\\n",
 963 |        "  & <chr> & <chr> & <chr>\\\\\n",
 964 |        "\\hline\n",
 965 |        "\t1 & transcribed\\_unprocessed\\_pseudogene & ENSG00000223972.5 & ENST00000456328.2\\\\\n",
 966 |        "\t2 & transcribed\\_unprocessed\\_pseudogene & ENSG00000223972.5 & ENST00000450305.2\\\\\n",
 967 |        "\t3 & unprocessed\\_pseudogene             & ENSG00000227232.5 & ENST00000488147.1\\\\\n",
 968 |        "\t4 & miRNA                              & ENSG00000278267.1 & ENST00000619216.1\\\\\n",
 969 |        "\t5 & lncRNA                             & ENSG00000243485.5 & ENST00000473358.1\\\\\n",
 970 |        "\t6 & lncRNA                             & ENSG00000243485.5 & ENST00000469289.1\\\\\n",
 971 |        "\\end{tabular}\n"
 972 |       ],
 973 |       "text/markdown": [
 974 |        "\n",
 975 |        "A data.frame: 6 × 3\n",
 976 |        "\n",
 977 |        "| <!--/--> | gene_type &lt;chr&gt; | gene_id &lt;chr&gt; | transcript_id &lt;chr&gt; |\n",
 978 |        "|---|---|---|---|\n",
 979 |        "| 1 | transcribed_unprocessed_pseudogene | ENSG00000223972.5 | ENST00000456328.2 |\n",
 980 |        "| 2 | transcribed_unprocessed_pseudogene | ENSG00000223972.5 | ENST00000450305.2 |\n",
 981 |        "| 3 | unprocessed_pseudogene             | ENSG00000227232.5 | ENST00000488147.1 |\n",
 982 |        "| 4 | miRNA                              | ENSG00000278267.1 | ENST00000619216.1 |\n",
 983 |        "| 5 | lncRNA                             | ENSG00000243485.5 | ENST00000473358.1 |\n",
 984 |        "| 6 | lncRNA                             | ENSG00000243485.5 | ENST00000469289.1 |\n",
 985 |        "\n"
 986 |       ],
 987 |       "text/plain": [
 988 |        "  gene_type                          gene_id           transcript_id    \n",
 989 |        "1 transcribed_unprocessed_pseudogene ENSG00000223972.5 ENST00000456328.2\n",
 990 |        "2 transcribed_unprocessed_pseudogene ENSG00000223972.5 ENST00000450305.2\n",
 991 |        "3 unprocessed_pseudogene             ENSG00000227232.5 ENST00000488147.1\n",
 992 |        "4 miRNA                              ENSG00000278267.1 ENST00000619216.1\n",
 993 |        "5 lncRNA                             ENSG00000243485.5 ENST00000473358.1\n",
 994 |        "6 lncRNA                             ENSG00000243485.5 ENST00000469289.1"
 995 |       ]
 996 |      },
 997 |      "metadata": {},
 998 |      "output_type": "display_data"
 999 |     }
1000 |    ],
1001 |    "source": [
1002 |     "\n",
1003 |     "if (!is.null(gAnnotationsRdataFilename)) {\n",
1004 |     "    gAnnotationsRdataFp = file.path(gReferenceDir, gAnnotationsRdataFilename)  \n",
1005 |     "    \n",
1006 |     "    # Import the R data object containing gene annotations and load its dataframe into a variable:\n",
1007 |     "    gAnnotationEnv = loadToEnvironment(gAnnotationsRdataFp)\n",
1008 |     "    gGeneTypeAnnotationsDf = gAnnotationEnv$ANNOT\n",
1009 |     "    \n",
1010 |     "    head(gGeneTypeAnnotationsDf)\n",
1011 |     "} else {\n",
1012 |     "    print(\"No annotations provided.\")\n",
1013 |     "}"
1014 |    ]
1015 |   },
1016 |   {
1017 |    "cell_type": "markdown",
1018 |    "metadata": {},
1019 |    "source": [
1020 |     "[Table of Contents](#Table-of-Contents)\n",
1021 |     "\n",
1022 |     "## Gene Separation By Coding Status"
1023 |    ]
1024 |   },
1025 |   {
1026 |    "cell_type": "markdown",
1027 |    "metadata": {},
1028 |    "source": [
1029 |     "Gene annotations are records of each gene's identifier and symbol, where the gene begins and ends on the genome sequence, and whether it is anticipated to be a coding gene or not.  There are multiple sources of gene annotations."
1030 |    ]
1031 |   },
1032 |   {
1033 |    "cell_type": "markdown",
1034 |    "metadata": {},
1035 |    "source": [
1036 |     "Here we use the human gene annotations from the Gencode project, Release 38 (GRCh38.p13). \n",
1037 |     "\n"
1038 |    ]
1039 |   },
1040 |   {
1041 |    "cell_type": "code",
1042 |    "execution_count": 65,
1043 |    "metadata": {},
1044 |    "outputs": [],
1045 |    "source": [
1046 |     "splitGeneCountsByCodingStatus = function(geneCountDf, gtfDf, removeVersion=FALSE){\n",
1047 |     "    #Subset GTF by protein coding and noncoding\n",
1048 |     "    ANNOT_protein_coding <- subset(gtfDf, gene_type == \"protein_coding\")\n",
1049 |     "#     ANNOT_ncRNA <- subset(gtfDf, gene_type %in% c(\"lincRNA\", \"antisense\", \"processed_transcript\",\"sense_overlapping\", \"sense_intronic\"))\n",
1050 |     "    ANNOT_ncRNA <- subset(gtfDf, gene_type %in% c(\"lncRNA\", \"antisense\", \"processed_transcript\",\"sense_overlapping\", \"sense_intronic\"))\n",
1051 |     "\n",
1052 |     "    #make list of IDs to query\n",
1053 |     "    protein_coding_ids <- ANNOT_protein_coding$gene_id\n",
1054 |     "    ncRNA_ids <- ANNOT_ncRNA$gene_id\n",
1055 |     "    \n",
1056 |     "    if (removeVersion){\n",
1057 |     "        protein_coding_ids <- removeAccessionVersion(protein_coding_ids)\n",
1058 |     "        ncRNA_ids <- removeAccessionVersion(ncRNA_ids)        \n",
1059 |     "    }\n",
1060 |     "\n",
1061 |     "    #subset geneCounts\n",
1062 |     "    geneCount_protein_coding <- subset(geneCountDf, row.names(geneCountDf) %in% protein_coding_ids)\n",
1063 |     "    geneCount_ncRNA <- subset(geneCountDf, row.names(geneCountDf) %in% ncRNA_ids)\n",
1064 |     "    return(list(codingGeneCountDf=geneCount_protein_coding, noncodingGeneCountDf=geneCount_ncRNA))\n",
1065 |     "}\n",
1066 |     "\n",
1067 |     "removeAccessionVersion = function(accessionVector){\n",
1068 |     "    return (gsub(\"\\\\..*\",\"\",accessionVector))\n",
1069 |     "}\n",
1070 |     "\n",
1071 |     "writeSubsetCounts = function(subsetCountsDf, outputDir, runName, fileSuffix){\n",
1072 |     "    fileName = sprintf(fileSuffix, runName)\n",
1073 |     "    write.csv(subsetCountsDf, file.path(outputDir, fileName))\n",
1074 |     "    print(paste0(\"Output file: \",fileName))\n",
1075 |     "}\n",
1076 |     "\n",
1077 |     "writeSubsetsCounts = function(splitGeneCountDfsList, outputDir, runName){\n",
1078 |     "    writeSubsetCounts(splitGeneCountDfsList$codingGeneCountDf, outputDir, runName,\"%s_raw_pc_genes_counts.csv\")\n",
1079 |     "    writeSubsetCounts(splitGeneCountDfsList$noncodingGeneCountDf, outputDir, runName,\"%s_raw_nc_genes_counts.csv\")\n",
1080 |     "}"
1081 |    ]
1082 |   },
1083 |   {
1084 |    "cell_type": "markdown",
1085 |    "metadata": {},
1086 |    "source": [
1087 |     "Split the count data into coding and non-coding subsets, and extract each subset into a file based on the annotation file provided in the input parameters:"
1088 |    ]
1089 |   },
1090 |   {
1091 |    "cell_type": "code",
1092 |    "execution_count": 66,
1093 |    "metadata": {},
1094 |    "outputs": [],
1095 |    "source": [
1096 |     "gSplitGeneCountDfsList = splitGeneCountsByCodingStatus(gGeneCountsDf, gGeneTypeAnnotationsDf, gRemoveVersion)"
1097 |    ]
1098 |   },
1099 |   {
1100 |    "cell_type": "code",
1101 |    "execution_count": 67,
1102 |    "metadata": {},
1103 |    "outputs": [
1104 |     {
1105 |      "data": {
1106 |       "text/html": [
1107 |        "<style>\n",
1108 |        ".list-inline {list-style: none; margin:0; padding: 0}\n",
1109 |        ".list-inline>li {display: inline-block}\n",
1110 |        ".list-inline>li:not(:last-child)::after {content: \"\\00b7\"; padding: 0 .5ex}\n",
1111 |        "</style>\n",
1112 |        "<ol class=list-inline><li>60605</li><li>12</li></ol>\n"
1113 |       ],
1114 |       "text/latex": [
1115 |        "\\begin{enumerate*}\n",
1116 |        "\\item 60605\n",
1117 |        "\\item 12\n",
1118 |        "\\end{enumerate*}\n"
1119 |       ],
1120 |       "text/markdown": [
1121 |        "1. 60605\n",
1122 |        "2. 12\n",
1123 |        "\n",
1124 |        "\n"
1125 |       ],
1126 |       "text/plain": [
1127 |        "[1] 60605    12"
1128 |       ]
1129 |      },
1130 |      "metadata": {},
1131 |      "output_type": "display_data"
1132 |     },
1133 |     {
1134 |      "data": {
1135 |       "text/html": [
1136 |        "<style>\n",
1137 |        ".list-inline {list-style: none; margin:0; padding: 0}\n",
1138 |        ".list-inline>li {display: inline-block}\n",
1139 |        ".list-inline>li:not(:last-child)::after {content: \"\\00b7\"; padding: 0 .5ex}\n",
1140 |        "</style>\n",
1141 |        "<ol class=list-inline><li>19937</li><li>12</li></ol>\n"
1142 |       ],
1143 |       "text/latex": [
1144 |        "\\begin{enumerate*}\n",
1145 |        "\\item 19937\n",
1146 |        "\\item 12\n",
1147 |        "\\end{enumerate*}\n"
1148 |       ],
1149 |       "text/markdown": [
1150 |        "1. 19937\n",
1151 |        "2. 12\n",
1152 |        "\n",
1153 |        "\n"
1154 |       ],
1155 |       "text/plain": [
1156 |        "[1] 19937    12"
1157 |       ]
1158 |      },
1159 |      "metadata": {},
1160 |      "output_type": "display_data"
1161 |     },
1162 |     {
1163 |      "data": {
1164 |       "text/html": [
1165 |        "<style>\n",
1166 |        ".list-inline {list-style: none; margin:0; padding: 0}\n",
1167 |        ".list-inline>li {display: inline-block}\n",
1168 |        ".list-inline>li:not(:last-child)::after {content: \"\\00b7\"; padding: 0 .5ex}\n",
1169 |        "</style>\n",
1170 |        "<ol class=list-inline><li>16876</li><li>12</li></ol>\n"
1171 |       ],
1172 |       "text/latex": [
1173 |        "\\begin{enumerate*}\n",
1174 |        "\\item 16876\n",
1175 |        "\\item 12\n",
1176 |        "\\end{enumerate*}\n"
1177 |       ],
1178 |       "text/markdown": [
1179 |        "1. 16876\n",
1180 |        "2. 12\n",
1181 |        "\n",
1182 |        "\n"
1183 |       ],
1184 |       "text/plain": [
1185 |        "[1] 16876    12"
1186 |       ]
1187 |      },
1188 |      "metadata": {},
1189 |      "output_type": "display_data"
1190 |     }
1191 |    ],
1192 |    "source": [
1193 |     "dim(gGeneCountsDf)\n",
1194 |     "dim(gSplitGeneCountDfsList$codingGeneCountDf)\n",
1195 |     "dim(gSplitGeneCountDfsList$noncodingGeneCountDf)"
1196 |    ]
1197 |   },
1198 |   {
1199 |    "cell_type": "markdown",
1200 |    "metadata": {},
1201 |    "source": [
1202 |     "Of the original 60,605 Ensembl genes in the dataset, 19,937 are known coding genes and 16,876 fall into the non-coding subset."
1203 |    ]
1204 |   },
1205 |   {
1206 |    "cell_type": "code",
1207 |    "execution_count": 68,
1208 |    "metadata": {},
1209 |    "outputs": [
1210 |     {
1211 |      "name": "stdout",
1212 |      "output_type": "stream",
1213 |      "text": [
1214 |       "[1] \"Output file: 20230302161642_raw_pc_genes_counts.csv\"\n",
1215 |       "[1] \"Output file: 20230302161642_raw_nc_genes_counts.csv\"\n"
1216 |      ]
1217 |     }
1218 |    ],
1219 |    "source": [
1220 |     "writeSubsetsCounts(gSplitGeneCountDfsList, gOutputDir_nb1, gRunName)"
1221 |    ]
1222 |   },
1223 |   {
1224 |    "cell_type": "markdown",
1225 |    "metadata": {},
1226 |    "source": [
1227 |     "[Table of Contents](#Table-of-Contents)\n",
1228 |     "\n",
1229 |     "## Data Integration\n",
1230 |     "\n",
1231 |     "\n",
1232 |     "\n",
1233 |     "Integrate the count data and the metadata into an edgeR DGEList object for use in downstream analysis:\n",
1234 |     "\n",
1235 |     "> Our DGEList-object contains a samples data frame that stores both ... group ... and batch ... information, each of which consists of ... distinct levels. Note that within x$samples, library sizes are automatically calculated for each sample and normalisation factors are set to 1. ([1](#Citations))"
1236 |    ]
1237 |   },
1238 |   {
1239 |    "cell_type": "code",
1240 |    "execution_count": 69,
1241 |    "metadata": {},
1242 |    "outputs": [],
1243 |    "source": [
1244 |     "gGeneType = \"all\"\n",
1245 |     "gRelevantGeneCountsDf <- gGeneCountsDf"
1246 |    ]
1247 |   },
1248 |   {
1249 |    "cell_type": "code",
1250 |    "execution_count": 70,
1251 |    "metadata": {},
1252 |    "outputs": [],
1253 |    "source": [
1254 |     "# create a DGEList object\n",
1255 |     "makeDgeList = function(countsDf, metadataDf, groupColName){\n",
1256 |     "    # remove the accession version (.##etc) from the ensembl gene id\n",
1257 |     "    id_list <- gsub(\"[.].*$\",\"\", row.names(countsDf))\n",
1258 |     "    row.names(countsDf) <- id_list\n",
1259 |     "\n",
1260 |     "    x <- DGEList(counts = countsDf, lib.size = colSums(countsDf),\n",
1261 |     "    norm.factors = rep(1,ncol(countsDf)), samples = metadataDf,\n",
1262 |     "        group = metadataDf[[groupColName]], genes = NULL, remove.zeros = FALSE)\n",
1263 |     "    return(x)\n",
1264 |     "}"
1265 |    ]
1266 |   },
1267 |   {
1268 |    "cell_type": "code",
1269 |    "execution_count": 71,
1270 |    "metadata": {},
1271 |    "outputs": [],
1272 |    "source": [
1273 |     "gGroupCategory = \"Condition_Time\" # e.g., \"day\""
1274 |    ]
1275 |   },
1276 |   {
1277 |    "cell_type": "code",
1278 |    "execution_count": 72,
1279 |    "metadata": {},
1280 |    "outputs": [
1281 |     {
1282 |      "data": {
1283 |       "text/html": [
1284 |        "<style>\n",
1285 |        ".list-inline {list-style: none; margin:0; padding: 0}\n",
1286 |        ".list-inline>li {display: inline-block}\n",
1287 |        ".list-inline>li:not(:last-child)::after {content: \"\\00b7\"; padding: 0 .5ex}\n",
1288 |        "</style>\n",
1289 |        "<ol class=list-inline><li>'counts'</li><li>'samples'</li></ol>\n"
1290 |       ],
1291 |       "text/latex": [
1292 |        "\\begin{enumerate*}\n",
1293 |        "\\item 'counts'\n",
1294 |        "\\item 'samples'\n",
1295 |        "\\end{enumerate*}\n"
1296 |       ],
1297 |       "text/markdown": [
1298 |        "1. 'counts'\n",
1299 |        "2. 'samples'\n",
1300 |        "\n",
1301 |        "\n"
1302 |       ],
1303 |       "text/plain": [
1304 |        "[1] \"counts\"  \"samples\""
1305 |       ]
1306 |      },
1307 |      "metadata": {},
1308 |      "output_type": "display_data"
1309 |     }
1310 |    ],
1311 |    "source": [
1312 |     "gDgeList = makeDgeList(gRelevantGeneCountsDf, gMetadataDf, gGroupCategory)\n",
1313 |     "names(gDgeList)"
1314 |    ]
1315 |   },
1316 |   {
1317 |    "cell_type": "markdown",
1318 |    "metadata": {},
1319 |    "source": [
1320 |     "As a sanity check, look at representative content from the DGEList:"
1321 |    ]
1322 |   },
1323 |   {
1324 |    "cell_type": "code",
1325 |    "execution_count": 73,
1326 |    "metadata": {},
1327 |    "outputs": [
1328 |     {
1329 |      "data": {
1330 |       "text/html": [
1331 |        "<table class=\"dataframe\">\n",
1332 |        "<caption>A matrix: 6 × 12 of type dbl</caption>\n",
1333 |        "<thead>\n",
1334 |        "\t<tr><th></th><th scope=col>VK2_NanH2_PolyB25_2h_1</th><th scope=col>VK2_NanH2_PolyB25_2h_2</th><th scope=col>VK2_NanH2_PolyB25_2h_3</th><th scope=col>VK2_NanH2_PolyB25_2h_4</th><th scope=col>VK2_NanH2_PolyB25_1h_5</th><th scope=col>VK2_NanH2_PolyB25_1h_6</th><th scope=col>VK2_NanH2_PolyB25_1h_7</th><th scope=col>VK2_NanH2_PolyB25_1h_8</th><th scope=col>VK2_Pet28a_PolyB25_2h_9</th><th scope=col>VK2_Pet28a_PolyB25_2h_10</th><th scope=col>VK2_Pet28a_PolyB25_2h_11</th><th scope=col>VK2_Pet28a_PolyB25_2h_12</th></tr>\n",
1335 |        "</thead>\n",
1336 |        "<tbody>\n",
1337 |        "\t<tr><th scope=row>ENSG00000000003</th><td> 356.00</td><td> 542.00</td><td> 364.00</td><td> 299.00</td><td> 502.00</td><td> 416.00</td><td> 424.00</td><td> 497.00</td><td> 323.00</td><td> 387.00</td><td> 465.00</td><td> 366.00</td></tr>\n",
1338 |        "\t<tr><th scope=row>ENSG00000000005</th><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td></tr>\n",
1339 |        "\t<tr><th scope=row>ENSG00000000419</th><td>2080.11</td><td>3080.00</td><td>2206.80</td><td>1721.00</td><td>2340.00</td><td>2232.04</td><td>2253.00</td><td>2573.36</td><td>1904.05</td><td>2084.00</td><td>2412.42</td><td>2170.07</td></tr>\n",
1340 |        "\t<tr><th scope=row>ENSG00000000457</th><td> 113.88</td><td> 119.35</td><td>  85.84</td><td> 102.65</td><td> 153.65</td><td>  94.84</td><td> 112.33</td><td> 130.23</td><td> 119.77</td><td>  98.34</td><td> 120.36</td><td>  91.56</td></tr>\n",
1341 |        "\t<tr><th scope=row>ENSG00000000460</th><td> 181.12</td><td> 285.65</td><td> 181.16</td><td> 209.35</td><td> 254.35</td><td> 200.16</td><td> 242.67</td><td> 292.77</td><td> 234.23</td><td> 189.66</td><td> 190.64</td><td> 204.44</td></tr>\n",
1342 |        "\t<tr><th scope=row>ENSG00000000938</th><td>   4.00</td><td>  21.00</td><td>  10.00</td><td>  18.00</td><td>  10.00</td><td>   6.00</td><td>   6.00</td><td>   9.00</td><td>  10.00</td><td>  13.00</td><td>  17.00</td><td>  13.00</td></tr>\n",
1343 |        "</tbody>\n",
1344 |        "</table>\n"
1345 |       ],
1346 |       "text/latex": [
1347 |        "A matrix: 6 × 12 of type dbl\n",
1348 |        "\\begin{tabular}{r|llllllllllll}\n",
1349 |        "  & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & VK2\\_NanH2\\_PolyB25\\_1h\\_7 & VK2\\_NanH2\\_PolyB25\\_1h\\_8 & VK2\\_Pet28a\\_PolyB25\\_2h\\_9 & VK2\\_Pet28a\\_PolyB25\\_2h\\_10 & VK2\\_Pet28a\\_PolyB25\\_2h\\_11 & VK2\\_Pet28a\\_PolyB25\\_2h\\_12\\\\\n",
1350 |        "\\hline\n",
1351 |        "\tENSG00000000003 &  356.00 &  542.00 &  364.00 &  299.00 &  502.00 &  416.00 &  424.00 &  497.00 &  323.00 &  387.00 &  465.00 &  366.00\\\\\n",
1352 |        "\tENSG00000000005 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00\\\\\n",
1353 |        "\tENSG00000000419 & 2080.11 & 3080.00 & 2206.80 & 1721.00 & 2340.00 & 2232.04 & 2253.00 & 2573.36 & 1904.05 & 2084.00 & 2412.42 & 2170.07\\\\\n",
1354 |        "\tENSG00000000457 &  113.88 &  119.35 &   85.84 &  102.65 &  153.65 &   94.84 &  112.33 &  130.23 &  119.77 &   98.34 &  120.36 &   91.56\\\\\n",
1355 |        "\tENSG00000000460 &  181.12 &  285.65 &  181.16 &  209.35 &  254.35 &  200.16 &  242.67 &  292.77 &  234.23 &  189.66 &  190.64 &  204.44\\\\\n",
1356 |        "\tENSG00000000938 &    4.00 &   21.00 &   10.00 &   18.00 &   10.00 &    6.00 &    6.00 &    9.00 &   10.00 &   13.00 &   17.00 &   13.00\\\\\n",
1357 |        "\\end{tabular}\n"
1358 |       ],
1359 |       "text/markdown": [
1360 |        "\n",
1361 |        "A matrix: 6 × 12 of type dbl\n",
1362 |        "\n",
1363 |        "| <!--/--> | VK2_NanH2_PolyB25_2h_1 | VK2_NanH2_PolyB25_2h_2 | VK2_NanH2_PolyB25_2h_3 | VK2_NanH2_PolyB25_2h_4 | VK2_NanH2_PolyB25_1h_5 | VK2_NanH2_PolyB25_1h_6 | VK2_NanH2_PolyB25_1h_7 | VK2_NanH2_PolyB25_1h_8 | VK2_Pet28a_PolyB25_2h_9 | VK2_Pet28a_PolyB25_2h_10 | VK2_Pet28a_PolyB25_2h_11 | VK2_Pet28a_PolyB25_2h_12 |\n",
1364 |        "|---|---|---|---|---|---|---|---|---|---|---|---|---|\n",
1365 |        "| ENSG00000000003 |  356.00 |  542.00 |  364.00 |  299.00 |  502.00 |  416.00 |  424.00 |  497.00 |  323.00 |  387.00 |  465.00 |  366.00 |\n",
1366 |        "| ENSG00000000005 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |\n",
1367 |        "| ENSG00000000419 | 2080.11 | 3080.00 | 2206.80 | 1721.00 | 2340.00 | 2232.04 | 2253.00 | 2573.36 | 1904.05 | 2084.00 | 2412.42 | 2170.07 |\n",
1368 |        "| ENSG00000000457 |  113.88 |  119.35 |   85.84 |  102.65 |  153.65 |   94.84 |  112.33 |  130.23 |  119.77 |   98.34 |  120.36 |   91.56 |\n",
1369 |        "| ENSG00000000460 |  181.12 |  285.65 |  181.16 |  209.35 |  254.35 |  200.16 |  242.67 |  292.77 |  234.23 |  189.66 |  190.64 |  204.44 |\n",
1370 |        "| ENSG00000000938 |    4.00 |   21.00 |   10.00 |   18.00 |   10.00 |    6.00 |    6.00 |    9.00 |   10.00 |   13.00 |   17.00 |   13.00 |\n",
1371 |        "\n"
1372 |       ],
1373 |       "text/plain": [
1374 |        "                VK2_NanH2_PolyB25_2h_1 VK2_NanH2_PolyB25_2h_2\n",
1375 |        "ENSG00000000003  356.00                 542.00               \n",
1376 |        "ENSG00000000005    0.00                   0.00               \n",
1377 |        "ENSG00000000419 2080.11                3080.00               \n",
1378 |        "ENSG00000000457  113.88                 119.35               \n",
1379 |        "ENSG00000000460  181.12                 285.65               \n",
1380 |        "ENSG00000000938    4.00                  21.00               \n",
1381 |        "                VK2_NanH2_PolyB25_2h_3 VK2_NanH2_PolyB25_2h_4\n",
1382 |        "ENSG00000000003  364.00                 299.00               \n",
1383 |        "ENSG00000000005    0.00                   0.00               \n",
1384 |        "ENSG00000000419 2206.80                1721.00               \n",
1385 |        "ENSG00000000457   85.84                 102.65               \n",
1386 |        "ENSG00000000460  181.16                 209.35               \n",
1387 |        "ENSG00000000938   10.00                  18.00               \n",
1388 |        "                VK2_NanH2_PolyB25_1h_5 VK2_NanH2_PolyB25_1h_6\n",
1389 |        "ENSG00000000003  502.00                 416.00               \n",
1390 |        "ENSG00000000005    0.00                   0.00               \n",
1391 |        "ENSG00000000419 2340.00                2232.04               \n",
1392 |        "ENSG00000000457  153.65                  94.84               \n",
1393 |        "ENSG00000000460  254.35                 200.16               \n",
1394 |        "ENSG00000000938   10.00                   6.00               \n",
1395 |        "                VK2_NanH2_PolyB25_1h_7 VK2_NanH2_PolyB25_1h_8\n",
1396 |        "ENSG00000000003  424.00                 497.00               \n",
1397 |        "ENSG00000000005    0.00                   0.00               \n",
1398 |        "ENSG00000000419 2253.00                2573.36               \n",
1399 |        "ENSG00000000457  112.33                 130.23               \n",
1400 |        "ENSG00000000460  242.67                 292.77               \n",
1401 |        "ENSG00000000938    6.00                   9.00               \n",
1402 |        "                VK2_Pet28a_PolyB25_2h_9 VK2_Pet28a_PolyB25_2h_10\n",
1403 |        "ENSG00000000003  323.00                  387.00                 \n",
1404 |        "ENSG00000000005    0.00                    0.00                 \n",
1405 |        "ENSG00000000419 1904.05                 2084.00                 \n",
1406 |        "ENSG00000000457  119.77                   98.34                 \n",
1407 |        "ENSG00000000460  234.23                  189.66                 \n",
1408 |        "ENSG00000000938   10.00                   13.00                 \n",
1409 |        "                VK2_Pet28a_PolyB25_2h_11 VK2_Pet28a_PolyB25_2h_12\n",
1410 |        "ENSG00000000003  465.00                   366.00                 \n",
1411 |        "ENSG00000000005    0.00                     0.00                 \n",
1412 |        "ENSG00000000419 2412.42                  2170.07                 \n",
1413 |        "ENSG00000000457  120.36                    91.56                 \n",
1414 |        "ENSG00000000460  190.64                   204.44                 \n",
1415 |        "ENSG00000000938   17.00                    13.00                 "
1416 |       ]
1417 |      },
1418 |      "metadata": {},
1419 |      "output_type": "display_data"
1420 |     },
1421 |     {
1422 |      "data": {
1423 |       "text/html": [
1424 |        "<table class=\"dataframe\">\n",
1425 |        "<caption>A data.frame: 6 × 10</caption>\n",
1426 |        "<thead>\n",
1427 |        "\t<tr><th></th><th scope=col>group</th><th scope=col>lib.size</th><th scope=col>norm.factors</th><th scope=col>Sample.Name</th><th scope=col>Sample.Code</th><th scope=col>Reference.Genome..Organism.</th><th scope=col>Gender</th><th scope=col>Condition</th><th scope=col>Time.point</th><th scope=col>Condition_Time</th></tr>\n",
1428 |        "\t<tr><th></th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th></tr>\n",
1429 |        "</thead>\n",
1430 |        "<tbody>\n",
1431 |        "\t<tr><th scope=row>VK2_NanH2_PolyB25_2h_1</th><td>NanH2_treated_2H</td><td>22490889</td><td>1</td><td>VK2_NanH2_PolyB25_2h_1</td><td>K001</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>2H</td><td>NanH2_treated_2H</td></tr>\n",
1432 |        "\t<tr><th scope=row>VK2_NanH2_PolyB25_2h_2</th><td>NanH2_treated_2H</td><td>31232480</td><td>1</td><td>VK2_NanH2_PolyB25_2h_2</td><td>K002</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>2H</td><td>NanH2_treated_2H</td></tr>\n",
1433 |        "\t<tr><th scope=row>VK2_NanH2_PolyB25_2h_3</th><td>NanH2_treated_2H</td><td>21901342</td><td>1</td><td>VK2_NanH2_PolyB25_2h_3</td><td>K003</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>2H</td><td>NanH2_treated_2H</td></tr>\n",
1434 |        "\t<tr><th scope=row>VK2_NanH2_PolyB25_2h_4</th><td>NanH2_treated_2H</td><td>19637114</td><td>1</td><td>VK2_NanH2_PolyB25_2h_4</td><td>K004</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>2H</td><td>NanH2_treated_2H</td></tr>\n",
1435 |        "\t<tr><th scope=row>VK2_NanH2_PolyB25_1h_5</th><td>NanH2_treated_1H</td><td>27468016</td><td>1</td><td>VK2_NanH2_PolyB25_1h_5</td><td>K005</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>1H</td><td>NanH2_treated_1H</td></tr>\n",
1436 |        "\t<tr><th scope=row>VK2_NanH2_PolyB25_1h_6</th><td>NanH2_treated_1H</td><td>24935803</td><td>1</td><td>VK2_NanH2_PolyB25_1h_6</td><td>K006</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>1H</td><td>NanH2_treated_1H</td></tr>\n",
1437 |        "</tbody>\n",
1438 |        "</table>\n"
1439 |       ],
1440 |       "text/latex": [
1441 |        "A data.frame: 6 × 10\n",
1442 |        "\\begin{tabular}{r|llllllllll}\n",
1443 |        "  & group & lib.size & norm.factors & Sample.Name & Sample.Code & Reference.Genome..Organism. & Gender & Condition & Time.point & Condition\\_Time\\\\\n",
1444 |        "  & <fct> & <dbl> & <dbl> & <chr> & <chr> & <chr> & <chr> & <chr> & <chr> & <chr>\\\\\n",
1445 |        "\\hline\n",
1446 |        "\tVK2\\_NanH2\\_PolyB25\\_2h\\_1 & NanH2\\_treated\\_2H & 22490889 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & K001 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n",
1447 |        "\tVK2\\_NanH2\\_PolyB25\\_2h\\_2 & NanH2\\_treated\\_2H & 31232480 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & K002 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n",
1448 |        "\tVK2\\_NanH2\\_PolyB25\\_2h\\_3 & NanH2\\_treated\\_2H & 21901342 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & K003 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n",
1449 |        "\tVK2\\_NanH2\\_PolyB25\\_2h\\_4 & NanH2\\_treated\\_2H & 19637114 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & K004 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n",
1450 |        "\tVK2\\_NanH2\\_PolyB25\\_1h\\_5 & NanH2\\_treated\\_1H & 27468016 & 1 & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & K005 & Human & Female & NanH2\\_treated & 1H & NanH2\\_treated\\_1H\\\\\n",
1451 |        "\tVK2\\_NanH2\\_PolyB25\\_1h\\_6 & NanH2\\_treated\\_1H & 24935803 & 1 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & K006 & Human & Female & NanH2\\_treated & 1H & NanH2\\_treated\\_1H\\\\\n",
1452 |        "\\end{tabular}\n"
1453 |       ],
1454 |       "text/markdown": [
1455 |        "\n",
1456 |        "A data.frame: 6 × 10\n",
1457 |        "\n",
1458 |        "| <!--/--> | group &lt;fct&gt; | lib.size &lt;dbl&gt; | norm.factors &lt;dbl&gt; | Sample.Name &lt;chr&gt; | Sample.Code &lt;chr&gt; | Reference.Genome..Organism. &lt;chr&gt; | Gender &lt;chr&gt; | Condition &lt;chr&gt; | Time.point &lt;chr&gt; | Condition_Time &lt;chr&gt; |\n",
1459 |        "|---|---|---|---|---|---|---|---|---|---|---|\n",
1460 |        "| VK2_NanH2_PolyB25_2h_1 | NanH2_treated_2H | 22490889 | 1 | VK2_NanH2_PolyB25_2h_1 | K001 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n",
1461 |        "| VK2_NanH2_PolyB25_2h_2 | NanH2_treated_2H | 31232480 | 1 | VK2_NanH2_PolyB25_2h_2 | K002 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n",
1462 |        "| VK2_NanH2_PolyB25_2h_3 | NanH2_treated_2H | 21901342 | 1 | VK2_NanH2_PolyB25_2h_3 | K003 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n",
1463 |        "| VK2_NanH2_PolyB25_2h_4 | NanH2_treated_2H | 19637114 | 1 | VK2_NanH2_PolyB25_2h_4 | K004 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n",
1464 |        "| VK2_NanH2_PolyB25_1h_5 | NanH2_treated_1H | 27468016 | 1 | VK2_NanH2_PolyB25_1h_5 | K005 | Human | Female | NanH2_treated | 1H | NanH2_treated_1H |\n",
1465 |        "| VK2_NanH2_PolyB25_1h_6 | NanH2_treated_1H | 24935803 | 1 | VK2_NanH2_PolyB25_1h_6 | K006 | Human | Female | NanH2_treated | 1H | NanH2_treated_1H |\n",
1466 |        "\n"
1467 |       ],
1468 |       "text/plain": [
1469 |        "                       group            lib.size norm.factors\n",
1470 |        "VK2_NanH2_PolyB25_2h_1 NanH2_treated_2H 22490889 1           \n",
1471 |        "VK2_NanH2_PolyB25_2h_2 NanH2_treated_2H 31232480 1           \n",
1472 |        "VK2_NanH2_PolyB25_2h_3 NanH2_treated_2H 21901342 1           \n",
1473 |        "VK2_NanH2_PolyB25_2h_4 NanH2_treated_2H 19637114 1           \n",
1474 |        "VK2_NanH2_PolyB25_1h_5 NanH2_treated_1H 27468016 1           \n",
1475 |        "VK2_NanH2_PolyB25_1h_6 NanH2_treated_1H 24935803 1           \n",
1476 |        "                       Sample.Name            Sample.Code\n",
1477 |        "VK2_NanH2_PolyB25_2h_1 VK2_NanH2_PolyB25_2h_1 K001       \n",
1478 |        "VK2_NanH2_PolyB25_2h_2 VK2_NanH2_PolyB25_2h_2 K002       \n",
1479 |        "VK2_NanH2_PolyB25_2h_3 VK2_NanH2_PolyB25_2h_3 K003       \n",
1480 |        "VK2_NanH2_PolyB25_2h_4 VK2_NanH2_PolyB25_2h_4 K004       \n",
1481 |        "VK2_NanH2_PolyB25_1h_5 VK2_NanH2_PolyB25_1h_5 K005       \n",
1482 |        "VK2_NanH2_PolyB25_1h_6 VK2_NanH2_PolyB25_1h_6 K006       \n",
1483 |        "                       Reference.Genome..Organism. Gender Condition    \n",
1484 |        "VK2_NanH2_PolyB25_2h_1 Human                       Female NanH2_treated\n",
1485 |        "VK2_NanH2_PolyB25_2h_2 Human                       Female NanH2_treated\n",
1486 |        "VK2_NanH2_PolyB25_2h_3 Human                       Female NanH2_treated\n",
1487 |        "VK2_NanH2_PolyB25_2h_4 Human                       Female NanH2_treated\n",
1488 |        "VK2_NanH2_PolyB25_1h_5 Human                       Female NanH2_treated\n",
1489 |        "VK2_NanH2_PolyB25_1h_6 Human                       Female NanH2_treated\n",
1490 |        "                       Time.point Condition_Time  \n",
1491 |        "VK2_NanH2_PolyB25_2h_1 2H         NanH2_treated_2H\n",
1492 |        "VK2_NanH2_PolyB25_2h_2 2H         NanH2_treated_2H\n",
1493 |        "VK2_NanH2_PolyB25_2h_3 2H         NanH2_treated_2H\n",
1494 |        "VK2_NanH2_PolyB25_2h_4 2H         NanH2_treated_2H\n",
1495 |        "VK2_NanH2_PolyB25_1h_5 1H         NanH2_treated_1H\n",
1496 |        "VK2_NanH2_PolyB25_1h_6 1H         NanH2_treated_1H"
1497 |       ]
1498 |      },
1499 |      "metadata": {},
1500 |      "output_type": "display_data"
1501 |     }
1502 |    ],
1503 |    "source": [
1504 |     "head(gDgeList$counts)\n",
1505 |     "head(gDgeList$samples)"
1506 |    ]
1507 |   },
1508 |   {
1509 |    "cell_type": "markdown",
1510 |    "metadata": {},
1511 |    "source": [
1512 |     "[Table of Contents](#Table-of-Contents)\n",
1513 |     "\n",
1514 |     "## Annotation Integration"
1515 |    ]
1516 |   },
1517 |   {
1518 |    "cell_type": "markdown",
1519 |    "metadata": {},
1520 |    "source": [
1521 |     "Next, extend the DGEList object with annotation information (gene symbol and Entrez ID) for the genes that have count data, looked up by their Ensembl ids.\n",
1522 |     "\n",
1523 |     "> A second data frame named genes in the DGEList-object is used to store gene-level information associated with rows of the counts matrix. This information can be retrieved using organism specific packages such as Mus.musculus (Bioconductor Core Team 2016b) for mouse (or Homo.sapiens (Bioconductor Core Team 2016a) for human) ....\n",
1524 |     ">\n",
1525 |     "> The type of information that can be retrieved includes gene symbols, gene names, chromosome names and locations, Entrez gene IDs, Refseq gene IDs and Ensembl gene IDs to name just a few. .... Mus.musculus [and other organism-specific packages] packages information from various sources and allows users to choose between many different gene IDs as the key. ([1](#Citations))"
1526 |    ]
1527 |   },
1528 |   {
1529 |    "cell_type": "code",
1530 |    "execution_count": 74,
1531 |    "metadata": {},
1532 |    "outputs": [],
1533 |    "source": [
1534 |     "getGeneDf = function(dgeList, organismPackage){\n",
1535 |     "    geneid <-  rownames(dgeList)\n",
1536 |     "    genes <- select(organismPackage, keys=geneid, columns=c(\"SYMBOL\", \"ENSEMBL\", \"ENTREZID\"), \n",
1537 |     "                    keytype=\"ENSEMBL\")\n",
1538 |     "    return(genes)\n",
1539 |     "}"
1540 |    ]
1541 |   },
1542 |   {
1543 |    "cell_type": "code",
1544 |    "execution_count": 75,
1545 |    "metadata": {},
1546 |    "outputs": [
1547 |     {
1548 |      "name": "stderr",
1549 |      "output_type": "stream",
1550 |      "text": [
1551 |       "'select()' returned 1:many mapping between keys and columns\n",
1552 |       "\n"
1553 |      ]
1554 |     },
1555 |     {
1556 |      "data": {
1557 |       "text/html": [
1558 |        "<style>\n",
1559 |        ".list-inline {list-style: none; margin:0; padding: 0}\n",
1560 |        ".list-inline>li {display: inline-block}\n",
1561 |        ".list-inline>li:not(:last-child)::after {content: \"\\00b7\"; padding: 0 .5ex}\n",
1562 |        "</style>\n",
1563 |        "<ol class=list-inline><li>60846</li><li>3</li></ol>\n"
1564 |       ],
1565 |       "text/latex": [
1566 |        "\\begin{enumerate*}\n",
1567 |        "\\item 60846\n",
1568 |        "\\item 3\n",
1569 |        "\\end{enumerate*}\n"
1570 |       ],
1571 |       "text/markdown": [
1572 |        "1. 60846\n",
1573 |        "2. 3\n",
1574 |        "\n",
1575 |        "\n"
1576 |       ],
1577 |       "text/plain": [
1578 |        "[1] 60846     3"
1579 |       ]
1580 |      },
1581 |      "metadata": {},
1582 |      "output_type": "display_data"
1583 |     }
1584 |    ],
1585 |    "source": [
1586 |     "gRawGenesDf = getGeneDf(gDgeList, gOrganismPackage)\n",
1587 |     "dim(gRawGenesDf)"
1588 |    ]
1589 |   },
1590 |   {
1591 |    "cell_type": "code",
1592 |    "execution_count": 76,
1593 |    "metadata": {},
1594 |    "outputs": [],
1595 |    "source": [
1596 |     "# Add gene type to gRawGenesDf\n",
1597 |     "gGeneTypeAnnotationsDf.rmdec <- gGeneTypeAnnotationsDf\n",
1598 |     "gGeneTypeAnnotationsDf.rmdec$gene_id <- gsub(\"\\\\..*\",\"\",gGeneTypeAnnotationsDf.rmdec$gene_id)\n",
1599 |     "gRawGenesDf$gene_type <- gGeneTypeAnnotationsDf$gene_type[match(gRawGenesDf$ENSEMBL, gGeneTypeAnnotationsDf.rmdec$gene_id)]"
1600 |    ]
1601 |   },
1602 |   {
1603 |    "cell_type": "code",
1604 |    "execution_count": 77,
1605 |    "metadata": {},
1606 |    "outputs": [
1607 |     {
1608 |      "data": {
1609 |       "text/html": [
1610 |        "<table class=\"dataframe\">\n",
1611 |        "<caption>A data.frame: 6 × 4</caption>\n",
1612 |        "<thead>\n",
1613 |        "\t<tr><th></th><th scope=col>ENSEMBL</th><th scope=col>ENTREZID</th><th scope=col>SYMBOL</th><th scope=col>gene_type</th></tr>\n",
1614 |        "\t<tr><th></th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th></tr>\n",
1615 |        "</thead>\n",
1616 |        "<tbody>\n",
1617 |        "\t<tr><th scope=row>1</th><td>ENSG00000000003</td><td>7105 </td><td>TSPAN6  </td><td>protein_coding</td></tr>\n",
1618 |        "\t<tr><th scope=row>2</th><td>ENSG00000000005</td><td>64102</td><td>TNMD    </td><td>protein_coding</td></tr>\n",
1619 |        "\t<tr><th scope=row>3</th><td>ENSG00000000419</td><td>8813 </td><td>DPM1    </td><td>protein_coding</td></tr>\n",
1620 |        "\t<tr><th scope=row>4</th><td>ENSG00000000457</td><td>57147</td><td>SCYL3   </td><td>protein_coding</td></tr>\n",
1621 |        "\t<tr><th scope=row>5</th><td>ENSG00000000460</td><td>55732</td><td>C1orf112</td><td>protein_coding</td></tr>\n",
1622 |        "\t<tr><th scope=row>6</th><td>ENSG00000000938</td><td>2268 </td><td>FGR     </td><td>protein_coding</td></tr>\n",
1623 |        "</tbody>\n",
1624 |        "</table>\n"
1625 |       ],
1626 |       "text/latex": [
1627 |        "A data.frame: 6 × 4\n",
1628 |        "\\begin{tabular}{r|llll}\n",
1629 |        "  & ENSEMBL & ENTREZID & SYMBOL & gene\\_type\\\\\n",
1630 |        "  & <chr> & <chr> & <chr> & <chr>\\\\\n",
1631 |        "\\hline\n",
1632 |        "\t1 & ENSG00000000003 & 7105  & TSPAN6   & protein\\_coding\\\\\n",
1633 |        "\t2 & ENSG00000000005 & 64102 & TNMD     & protein\\_coding\\\\\n",
1634 |        "\t3 & ENSG00000000419 & 8813  & DPM1     & protein\\_coding\\\\\n",
1635 |        "\t4 & ENSG00000000457 & 57147 & SCYL3    & protein\\_coding\\\\\n",
1636 |        "\t5 & ENSG00000000460 & 55732 & C1orf112 & protein\\_coding\\\\\n",
1637 |        "\t6 & ENSG00000000938 & 2268  & FGR      & protein\\_coding\\\\\n",
1638 |        "\\end{tabular}\n"
1639 |       ],
1640 |       "text/markdown": [
1641 |        "\n",
1642 |        "A data.frame: 6 × 4\n",
1643 |        "\n",
1644 |        "| <!--/--> | ENSEMBL &lt;chr&gt; | ENTREZID &lt;chr&gt; | SYMBOL &lt;chr&gt; | gene_type &lt;chr&gt; |\n",
1645 |        "|---|---|---|---|---|\n",
1646 |        "| 1 | ENSG00000000003 | 7105  | TSPAN6   | protein_coding |\n",
1647 |        "| 2 | ENSG00000000005 | 64102 | TNMD     | protein_coding |\n",
1648 |        "| 3 | ENSG00000000419 | 8813  | DPM1     | protein_coding |\n",
1649 |        "| 4 | ENSG00000000457 | 57147 | SCYL3    | protein_coding |\n",
1650 |        "| 5 | ENSG00000000460 | 55732 | C1orf112 | protein_coding |\n",
1651 |        "| 6 | ENSG00000000938 | 2268  | FGR      | protein_coding |\n",
1652 |        "\n"
1653 |       ],
1654 |       "text/plain": [
1655 |        "  ENSEMBL         ENTREZID SYMBOL   gene_type     \n",
1656 |        "1 ENSG00000000003 7105     TSPAN6   protein_coding\n",
1657 |        "2 ENSG00000000005 64102    TNMD     protein_coding\n",
1658 |        "3 ENSG00000000419 8813     DPM1     protein_coding\n",
1659 |        "4 ENSG00000000457 57147    SCYL3    protein_coding\n",
1660 |        "5 ENSG00000000460 55732    C1orf112 protein_coding\n",
1661 |        "6 ENSG00000000938 2268     FGR      protein_coding"
1662 |       ]
1663 |      },
1664 |      "metadata": {},
1665 |      "output_type": "display_data"
1666 |     }
1667 |    ],
1668 |    "source": [
1669 |     "head(gRawGenesDf)"
1670 |    ]
1671 |   },
1672 |   {
1673 |    "cell_type": "markdown",
1674 |    "metadata": {},
1675 |    "source": [
1676 |     "> [G]ene IDs may not map one-to-one to the gene information of interest. It is important to check for duplicated gene IDs. ([1](#Citations))\n",
1677 |     "\n",
1678 |     "Examine how many records in the annotation dataset have the same id (for the gene identifier type--either ENSEMBL or ENTREZ--set below) as another record occurring earlier in the dataset:"
1679 |    ]
1680 |   },
1681 |   {
1682 |    "cell_type": "code",
1683 |    "execution_count": 78,
1684 |    "metadata": {},
1685 |    "outputs": [],
1686 |    "source": [
1687 |     "gGeneIdCol <- \"ENSEMBL\"\n",
1688 |     "# gGeneIdCol <- \"ENTREZID\""
1689 |    ]
1690 |   },
1691 |   {
1692 |    "cell_type": "code",
1693 |    "execution_count": 79,
1694 |    "metadata": {},
1695 |    "outputs": [
1696 |     {
1697 |      "data": {
1698 |       "text/html": [
1699 |        "241"
1700 |       ],
1701 |       "text/latex": [
1702 |        "241"
1703 |       ],
1704 |       "text/markdown": [
1705 |        "241"
1706 |       ],
1707 |       "text/plain": [
1708 |        "[1] 241"
1709 |       ]
1710 |      },
1711 |      "metadata": {},
1712 |      "output_type": "display_data"
1713 |     }
1714 |    ],
1715 |    "source": [
1716 |     "gDuplicatesMask = duplicated(gRawGenesDf[[gGeneIdCol]])\n",
1717 |     "sum(gDuplicatesMask) # Sum counts only those with a value of TRUE"
1718 |    ]
1719 |   },
1720 |   {
1721 |    "cell_type": "markdown",
1722 |    "metadata": {},
1723 |    "source": [
1724 |     "Note that this sum includes only the second and later records for each gene id; the first record for each gene id is not included in this duplicate set.\n",
1725 |     "\n",
1726 |     "Write a file of the duplicate records that can be examined if desired: "
1727 |    ]
1728 |   },
1729 |   {
1730 |    "cell_type": "code",
1731 |    "execution_count": 80,
1732 |    "metadata": {},
1733 |    "outputs": [],
1734 |    "source": [
1735 |     "writeOutRemovedDuplicates = function(countsDf, duplicatesMask, outputDir, runName, geneType){\n",
1736 |     "    fileName = sprintf(\"%s_duplicated_%s_genes_records.csv\",runName, geneType)\n",
1737 |     "    duplicatedCountsDf = countsDf[duplicatesMask,]\n",
1738 |     "    write.csv(duplicatedCountsDf, file.path(outputDir, fileName))\n",
1739 |     "    print(paste0(\"Output file: \",fileName))\n",
1740 |     "}"
1741 |    ]
1742 |   },
1743 |   {
1744 |    "cell_type": "code",
1745 |    "execution_count": 81,
1746 |    "metadata": {},
1747 |    "outputs": [
1748 |     {
1749 |      "name": "stdout",
1750 |      "output_type": "stream",
1751 |      "text": [
1752 |       "[1] \"Output file: 20230302161642_duplicated_all_genes_records.csv\"\n"
1753 |      ]
1754 |     }
1755 |    ],
1756 |    "source": [
1757 |     "writeOutRemovedDuplicates(gRawGenesDf, gDuplicatesMask, gOutputDir_nb1, gRunName, gGeneType)"
1758 |    ]
1759 |   },
1760 |   {
1761 |    "cell_type": "markdown",
1762 |    "metadata": {},
1763 |    "source": [
1764 |     "As a basic approach, duplicate records for gene ids already existing in the annotation are removed, keeping only the first record for each gene id:"
1765 |    ]
1766 |   },
1767 |   {
1768 |    "cell_type": "code",
1769 |    "execution_count": 82,
1770 |    "metadata": {},
1771 |    "outputs": [],
1772 |    "source": [
1773 |     "gDeduplicatedGenesDf = gRawGenesDf[!duplicated(gRawGenesDf[[gGeneIdCol]]),]"
1774 |    ]
1775 |   },
1776 |   {
1777 |    "cell_type": "markdown",
1778 |    "metadata": {},
1779 |    "source": [
1780 |     "After deduplication, check the dimensions of the count data and the gene annotation data to ensure that the count dataframe has the same number of rows (genes) as the gene annotation dataframe has rows (again, genes), and that the gene names are the same in both:"
1781 |    ]
1782 |   },
1783 |   {
1784 |    "cell_type": "code",
1785 |    "execution_count": 83,
1786 |    "metadata": {},
1787 |    "outputs": [
1788 |     {
1789 |      "data": {
1790 |       "text/html": [
1791 |        "<style>\n",
1792 |        ".list-inline {list-style: none; margin:0; padding: 0}\n",
1793 |        ".list-inline>li {display: inline-block}\n",
1794 |        ".list-inline>li:not(:last-child)::after {content: \"\\00b7\"; padding: 0 .5ex}\n",
1795 |        "</style>\n",
1796 |        "<ol class=list-inline><li>60605</li><li>12</li></ol>\n"
1797 |       ],
1798 |       "text/latex": [
1799 |        "\\begin{enumerate*}\n",
1800 |        "\\item 60605\n",
1801 |        "\\item 12\n",
1802 |        "\\end{enumerate*}\n"
1803 |       ],
1804 |       "text/markdown": [
1805 |        "1. 60605\n",
1806 |        "2. 12\n",
1807 |        "\n",
1808 |        "\n"
1809 |       ],
1810 |       "text/plain": [
1811 |        "[1] 60605    12"
1812 |       ]
1813 |      },
1814 |      "metadata": {},
1815 |      "output_type": "display_data"
1816 |     },
1817 |     {
1818 |      "data": {
1819 |       "text/html": [
1820 |        "<style>\n",
1821 |        ".list-inline {list-style: none; margin:0; padding: 0}\n",
1822 |        ".list-inline>li {display: inline-block}\n",
1823 |        ".list-inline>li:not(:last-child)::after {content: \"\\00b7\"; padding: 0 .5ex}\n",
1824 |        "</style>\n",
1825 |        "<ol class=list-inline><li>60605</li><li>4</li></ol>\n"
1826 |       ],
1827 |       "text/latex": [
1828 |        "\\begin{enumerate*}\n",
1829 |        "\\item 60605\n",
1830 |        "\\item 4\n",
1831 |        "\\end{enumerate*}\n"
1832 |       ],
1833 |       "text/markdown": [
1834 |        "1. 60605\n",
1835 |        "2. 4\n",
1836 |        "\n",
1837 |        "\n"
1838 |       ],
1839 |       "text/plain": [
1840 |        "[1] 60605     4"
1841 |       ]
1842 |      },
1843 |      "metadata": {},
1844 |      "output_type": "display_data"
1845 |     },
1846 |     {
1847 |      "data": {
1848 |       "text/html": [
1849 |        "TRUE"
1850 |       ],
1851 |       "text/latex": [
1852 |        "TRUE"
1853 |       ],
1854 |       "text/markdown": [
1855 |        "TRUE"
1856 |       ],
1857 |       "text/plain": [
1858 |        "[1] TRUE"
1859 |       ]
1860 |      },
1861 |      "metadata": {},
1862 |      "output_type": "display_data"
1863 |     }
1864 |    ],
1865 |    "source": [
1866 |     "dim(gDgeList$counts)\n",
1867 |     "dim(gDeduplicatedGenesDf)\n",
1868 |     "\n",
1869 |     "all(rownames(gDgeList$counts) %in% gDeduplicatedGenesDf[[gGeneIdCol]])"
1870 |    ]
1871 |   },
1872 |   {
1873 |    "cell_type": "markdown",
1874 |    "metadata": {},
1875 |    "source": [
1876 |     "Add the annotation information to the DGEList object:"
1877 |    ]
1878 |   },
1879 |   {
1880 |    "cell_type": "code",
1881 |    "execution_count": 84,
1882 |    "metadata": {},
1883 |    "outputs": [
1884 |     {
1885 |      "data": {
1886 |       "text/html": [
1887 |        "<style>\n",
1888 |        ".list-inline {list-style: none; margin:0; padding: 0}\n",
1889 |        ".list-inline>li {display: inline-block}\n",
1890 |        ".list-inline>li:not(:last-child)::after {content: \"\\00b7\"; padding: 0 .5ex}\n",
1891 |        "</style>\n",
1892 |        "<ol class=list-inline><li>'counts'</li><li>'samples'</li><li>'genes'</li></ol>\n"
1893 |       ],
1894 |       "text/latex": [
1895 |        "\\begin{enumerate*}\n",
1896 |        "\\item 'counts'\n",
1897 |        "\\item 'samples'\n",
1898 |        "\\item 'genes'\n",
1899 |        "\\end{enumerate*}\n"
1900 |       ],
1901 |       "text/markdown": [
1902 |        "1. 'counts'\n",
1903 |        "2. 'samples'\n",
1904 |        "3. 'genes'\n",
1905 |        "\n",
1906 |        "\n"
1907 |       ],
1908 |       "text/plain": [
1909 |        "[1] \"counts\"  \"samples\" \"genes\"  "
1910 |       ]
1911 |      },
1912 |      "metadata": {},
1913 |      "output_type": "display_data"
1914 |     }
1915 |    ],
1916 |    "source": [
1917 |     "gDgeList$genes = gDeduplicatedGenesDf\n",
1918 |     "names(gDgeList)"
1919 |    ]
1920 |   },
1921 |   {
1922 |    "cell_type": "markdown",
1923 |    "metadata": {},
1924 |    "source": [
1925 |     "As a sanity-check, look at representative content from the DGEList:"
1926 |    ]
1927 |   },
1928 |   {
1929 |    "cell_type": "code",
1930 |    "execution_count": 85,
1931 |    "metadata": {},
1932 |    "outputs": [
1933 |     {
1934 |      "data": {
1935 |       "text/html": [
1936 |        "<table class=\"dataframe\">\n",
1937 |        "<caption>A matrix: 6 × 12 of type dbl</caption>\n",
1938 |        "<thead>\n",
1939 |        "\t<tr><th></th><th scope=col>VK2_NanH2_PolyB25_2h_1</th><th scope=col>VK2_NanH2_PolyB25_2h_2</th><th scope=col>VK2_NanH2_PolyB25_2h_3</th><th scope=col>VK2_NanH2_PolyB25_2h_4</th><th scope=col>VK2_NanH2_PolyB25_1h_5</th><th scope=col>VK2_NanH2_PolyB25_1h_6</th><th scope=col>VK2_NanH2_PolyB25_1h_7</th><th scope=col>VK2_NanH2_PolyB25_1h_8</th><th scope=col>VK2_Pet28a_PolyB25_2h_9</th><th scope=col>VK2_Pet28a_PolyB25_2h_10</th><th scope=col>VK2_Pet28a_PolyB25_2h_11</th><th scope=col>VK2_Pet28a_PolyB25_2h_12</th></tr>\n",
1940 |        "</thead>\n",
1941 |        "<tbody>\n",
1942 |        "\t<tr><th scope=row>ENSG00000000003</th><td> 356.00</td><td> 542.00</td><td> 364.00</td><td> 299.00</td><td> 502.00</td><td> 416.00</td><td> 424.00</td><td> 497.00</td><td> 323.00</td><td> 387.00</td><td> 465.00</td><td> 366.00</td></tr>\n",
1943 |        "\t<tr><th scope=row>ENSG00000000005</th><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td><td>   0.00</td></tr>\n",
1944 |        "\t<tr><th scope=row>ENSG00000000419</th><td>2080.11</td><td>3080.00</td><td>2206.80</td><td>1721.00</td><td>2340.00</td><td>2232.04</td><td>2253.00</td><td>2573.36</td><td>1904.05</td><td>2084.00</td><td>2412.42</td><td>2170.07</td></tr>\n",
1945 |        "\t<tr><th scope=row>ENSG00000000457</th><td> 113.88</td><td> 119.35</td><td>  85.84</td><td> 102.65</td><td> 153.65</td><td>  94.84</td><td> 112.33</td><td> 130.23</td><td> 119.77</td><td>  98.34</td><td> 120.36</td><td>  91.56</td></tr>\n",
1946 |        "\t<tr><th scope=row>ENSG00000000460</th><td> 181.12</td><td> 285.65</td><td> 181.16</td><td> 209.35</td><td> 254.35</td><td> 200.16</td><td> 242.67</td><td> 292.77</td><td> 234.23</td><td> 189.66</td><td> 190.64</td><td> 204.44</td></tr>\n",
1947 |        "\t<tr><th scope=row>ENSG00000000938</th><td>   4.00</td><td>  21.00</td><td>  10.00</td><td>  18.00</td><td>  10.00</td><td>   6.00</td><td>   6.00</td><td>   9.00</td><td>  10.00</td><td>  13.00</td><td>  17.00</td><td>  13.00</td></tr>\n",
1948 |        "</tbody>\n",
1949 |        "</table>\n"
1950 |       ],
1951 |       "text/latex": [
1952 |        "A matrix: 6 × 12 of type dbl\n",
1953 |        "\\begin{tabular}{r|llllllllllll}\n",
1954 |        "  & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & VK2\\_NanH2\\_PolyB25\\_1h\\_7 & VK2\\_NanH2\\_PolyB25\\_1h\\_8 & VK2\\_Pet28a\\_PolyB25\\_2h\\_9 & VK2\\_Pet28a\\_PolyB25\\_2h\\_10 & VK2\\_Pet28a\\_PolyB25\\_2h\\_11 & VK2\\_Pet28a\\_PolyB25\\_2h\\_12\\\\\n",
1955 |        "\\hline\n",
1956 |        "\tENSG00000000003 &  356.00 &  542.00 &  364.00 &  299.00 &  502.00 &  416.00 &  424.00 &  497.00 &  323.00 &  387.00 &  465.00 &  366.00\\\\\n",
1957 |        "\tENSG00000000005 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00 &    0.00\\\\\n",
1958 |        "\tENSG00000000419 & 2080.11 & 3080.00 & 2206.80 & 1721.00 & 2340.00 & 2232.04 & 2253.00 & 2573.36 & 1904.05 & 2084.00 & 2412.42 & 2170.07\\\\\n",
1959 |        "\tENSG00000000457 &  113.88 &  119.35 &   85.84 &  102.65 &  153.65 &   94.84 &  112.33 &  130.23 &  119.77 &   98.34 &  120.36 &   91.56\\\\\n",
1960 |        "\tENSG00000000460 &  181.12 &  285.65 &  181.16 &  209.35 &  254.35 &  200.16 &  242.67 &  292.77 &  234.23 &  189.66 &  190.64 &  204.44\\\\\n",
1961 |        "\tENSG00000000938 &    4.00 &   21.00 &   10.00 &   18.00 &   10.00 &    6.00 &    6.00 &    9.00 &   10.00 &   13.00 &   17.00 &   13.00\\\\\n",
1962 |        "\\end{tabular}\n"
1963 |       ],
1964 |       "text/markdown": [
1965 |        "\n",
1966 |        "A matrix: 6 × 12 of type dbl\n",
1967 |        "\n",
1968 |        "| <!--/--> | VK2_NanH2_PolyB25_2h_1 | VK2_NanH2_PolyB25_2h_2 | VK2_NanH2_PolyB25_2h_3 | VK2_NanH2_PolyB25_2h_4 | VK2_NanH2_PolyB25_1h_5 | VK2_NanH2_PolyB25_1h_6 | VK2_NanH2_PolyB25_1h_7 | VK2_NanH2_PolyB25_1h_8 | VK2_Pet28a_PolyB25_2h_9 | VK2_Pet28a_PolyB25_2h_10 | VK2_Pet28a_PolyB25_2h_11 | VK2_Pet28a_PolyB25_2h_12 |\n",
1969 |        "|---|---|---|---|---|---|---|---|---|---|---|---|---|\n",
1970 |        "| ENSG00000000003 |  356.00 |  542.00 |  364.00 |  299.00 |  502.00 |  416.00 |  424.00 |  497.00 |  323.00 |  387.00 |  465.00 |  366.00 |\n",
1971 |        "| ENSG00000000005 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |\n",
1972 |        "| ENSG00000000419 | 2080.11 | 3080.00 | 2206.80 | 1721.00 | 2340.00 | 2232.04 | 2253.00 | 2573.36 | 1904.05 | 2084.00 | 2412.42 | 2170.07 |\n",
1973 |        "| ENSG00000000457 |  113.88 |  119.35 |   85.84 |  102.65 |  153.65 |   94.84 |  112.33 |  130.23 |  119.77 |   98.34 |  120.36 |   91.56 |\n",
1974 |        "| ENSG00000000460 |  181.12 |  285.65 |  181.16 |  209.35 |  254.35 |  200.16 |  242.67 |  292.77 |  234.23 |  189.66 |  190.64 |  204.44 |\n",
1975 |        "| ENSG00000000938 |    4.00 |   21.00 |   10.00 |   18.00 |   10.00 |    6.00 |    6.00 |    9.00 |   10.00 |   13.00 |   17.00 |   13.00 |\n",
1976 |        "\n"
1977 |       ],
1978 |       "text/plain": [
1979 |        "                VK2_NanH2_PolyB25_2h_1 VK2_NanH2_PolyB25_2h_2\n",
1980 |        "ENSG00000000003  356.00                 542.00               \n",
1981 |        "ENSG00000000005    0.00                   0.00               \n",
1982 |        "ENSG00000000419 2080.11                3080.00               \n",
1983 |        "ENSG00000000457  113.88                 119.35               \n",
1984 |        "ENSG00000000460  181.12                 285.65               \n",
1985 |        "ENSG00000000938    4.00                  21.00               \n",
1986 |        "                VK2_NanH2_PolyB25_2h_3 VK2_NanH2_PolyB25_2h_4\n",
1987 |        "ENSG00000000003  364.00                 299.00               \n",
1988 |        "ENSG00000000005    0.00                   0.00               \n",
1989 |        "ENSG00000000419 2206.80                1721.00               \n",
1990 |        "ENSG00000000457   85.84                 102.65               \n",
1991 |        "ENSG00000000460  181.16                 209.35               \n",
1992 |        "ENSG00000000938   10.00                  18.00               \n",
1993 |        "                VK2_NanH2_PolyB25_1h_5 VK2_NanH2_PolyB25_1h_6\n",
1994 |        "ENSG00000000003  502.00                 416.00               \n",
1995 |        "ENSG00000000005    0.00                   0.00               \n",
1996 |        "ENSG00000000419 2340.00                2232.04               \n",
1997 |        "ENSG00000000457  153.65                  94.84               \n",
1998 |        "ENSG00000000460  254.35                 200.16               \n",
1999 |        "ENSG00000000938   10.00                   6.00               \n",
2000 |        "                VK2_NanH2_PolyB25_1h_7 VK2_NanH2_PolyB25_1h_8\n",
2001 |        "ENSG00000000003  424.00                 497.00               \n",
2002 |        "ENSG00000000005    0.00                   0.00               \n",
2003 |        "ENSG00000000419 2253.00                2573.36               \n",
2004 |        "ENSG00000000457  112.33                 130.23               \n",
2005 |        "ENSG00000000460  242.67                 292.77               \n",
2006 |        "ENSG00000000938    6.00                   9.00               \n",
2007 |        "                VK2_Pet28a_PolyB25_2h_9 VK2_Pet28a_PolyB25_2h_10\n",
2008 |        "ENSG00000000003  323.00                  387.00                 \n",
2009 |        "ENSG00000000005    0.00                    0.00                 \n",
2010 |        "ENSG00000000419 1904.05                 2084.00                 \n",
2011 |        "ENSG00000000457  119.77                   98.34                 \n",
2012 |        "ENSG00000000460  234.23                  189.66                 \n",
2013 |        "ENSG00000000938   10.00                   13.00                 \n",
2014 |        "                VK2_Pet28a_PolyB25_2h_11 VK2_Pet28a_PolyB25_2h_12\n",
2015 |        "ENSG00000000003  465.00                   366.00                 \n",
2016 |        "ENSG00000000005    0.00                     0.00                 \n",
2017 |        "ENSG00000000419 2412.42                  2170.07                 \n",
2018 |        "ENSG00000000457  120.36                    91.56                 \n",
2019 |        "ENSG00000000460  190.64                   204.44                 \n",
2020 |        "ENSG00000000938   17.00                    13.00                 "
2021 |       ]
2022 |      },
2023 |      "metadata": {},
2024 |      "output_type": "display_data"
2025 |     },
2026 |     {
2027 |      "data": {
2028 |       "text/html": [
2029 |        "<table class=\"dataframe\">\n",
2030 |        "<caption>A data.frame: 6 × 10</caption>\n",
2031 |        "<thead>\n",
2032 |        "\t<tr><th></th><th scope=col>group</th><th scope=col>lib.size</th><th scope=col>norm.factors</th><th scope=col>Sample.Name</th><th scope=col>Sample.Code</th><th scope=col>Reference.Genome..Organism.</th><th scope=col>Gender</th><th scope=col>Condition</th><th scope=col>Time.point</th><th scope=col>Condition_Time</th></tr>\n",
2033 |        "\t<tr><th></th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th></tr>\n",
2034 |        "</thead>\n",
2035 |        "<tbody>\n",
2036 |        "\t<tr><th scope=row>VK2_NanH2_PolyB25_2h_1</th><td>NanH2_treated_2H</td><td>22490889</td><td>1</td><td>VK2_NanH2_PolyB25_2h_1</td><td>K001</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>2H</td><td>NanH2_treated_2H</td></tr>\n",
2037 |        "\t<tr><th scope=row>VK2_NanH2_PolyB25_2h_2</th><td>NanH2_treated_2H</td><td>31232480</td><td>1</td><td>VK2_NanH2_PolyB25_2h_2</td><td>K002</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>2H</td><td>NanH2_treated_2H</td></tr>\n",
2038 |        "\t<tr><th scope=row>VK2_NanH2_PolyB25_2h_3</th><td>NanH2_treated_2H</td><td>21901342</td><td>1</td><td>VK2_NanH2_PolyB25_2h_3</td><td>K003</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>2H</td><td>NanH2_treated_2H</td></tr>\n",
2039 |        "\t<tr><th scope=row>VK2_NanH2_PolyB25_2h_4</th><td>NanH2_treated_2H</td><td>19637114</td><td>1</td><td>VK2_NanH2_PolyB25_2h_4</td><td>K004</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>2H</td><td>NanH2_treated_2H</td></tr>\n",
2040 |        "\t<tr><th scope=row>VK2_NanH2_PolyB25_1h_5</th><td>NanH2_treated_1H</td><td>27468016</td><td>1</td><td>VK2_NanH2_PolyB25_1h_5</td><td>K005</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>1H</td><td>NanH2_treated_1H</td></tr>\n",
2041 |        "\t<tr><th scope=row>VK2_NanH2_PolyB25_1h_6</th><td>NanH2_treated_1H</td><td>24935803</td><td>1</td><td>VK2_NanH2_PolyB25_1h_6</td><td>K006</td><td>Human</td><td>Female</td><td>NanH2_treated</td><td>1H</td><td>NanH2_treated_1H</td></tr>\n",
2042 |        "</tbody>\n",
2043 |        "</table>\n"
2044 |       ],
2045 |       "text/latex": [
2046 |        "A data.frame: 6 × 10\n",
2047 |        "\\begin{tabular}{r|llllllllll}\n",
2048 |        "  & group & lib.size & norm.factors & Sample.Name & Sample.Code & Reference.Genome..Organism. & Gender & Condition & Time.point & Condition\\_Time\\\\\n",
2049 |        "  & <fct> & <dbl> & <dbl> & <chr> & <chr> & <chr> & <chr> & <chr> & <chr> & <chr>\\\\\n",
2050 |        "\\hline\n",
2051 |        "\tVK2\\_NanH2\\_PolyB25\\_2h\\_1 & NanH2\\_treated\\_2H & 22490889 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_1 & K001 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n",
2052 |        "\tVK2\\_NanH2\\_PolyB25\\_2h\\_2 & NanH2\\_treated\\_2H & 31232480 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_2 & K002 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n",
2053 |        "\tVK2\\_NanH2\\_PolyB25\\_2h\\_3 & NanH2\\_treated\\_2H & 21901342 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_3 & K003 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n",
2054 |        "\tVK2\\_NanH2\\_PolyB25\\_2h\\_4 & NanH2\\_treated\\_2H & 19637114 & 1 & VK2\\_NanH2\\_PolyB25\\_2h\\_4 & K004 & Human & Female & NanH2\\_treated & 2H & NanH2\\_treated\\_2H\\\\\n",
2055 |        "\tVK2\\_NanH2\\_PolyB25\\_1h\\_5 & NanH2\\_treated\\_1H & 27468016 & 1 & VK2\\_NanH2\\_PolyB25\\_1h\\_5 & K005 & Human & Female & NanH2\\_treated & 1H & NanH2\\_treated\\_1H\\\\\n",
2056 |        "\tVK2\\_NanH2\\_PolyB25\\_1h\\_6 & NanH2\\_treated\\_1H & 24935803 & 1 & VK2\\_NanH2\\_PolyB25\\_1h\\_6 & K006 & Human & Female & NanH2\\_treated & 1H & NanH2\\_treated\\_1H\\\\\n",
2057 |        "\\end{tabular}\n"
2058 |       ],
2059 |       "text/markdown": [
2060 |        "\n",
2061 |        "A data.frame: 6 × 10\n",
2062 |        "\n",
2063 |        "| <!--/--> | group &lt;fct&gt; | lib.size &lt;dbl&gt; | norm.factors &lt;dbl&gt; | Sample.Name &lt;chr&gt; | Sample.Code &lt;chr&gt; | Reference.Genome..Organism. &lt;chr&gt; | Gender &lt;chr&gt; | Condition &lt;chr&gt; | Time.point &lt;chr&gt; | Condition_Time &lt;chr&gt; |\n",
2064 |        "|---|---|---|---|---|---|---|---|---|---|---|\n",
2065 |        "| VK2_NanH2_PolyB25_2h_1 | NanH2_treated_2H | 22490889 | 1 | VK2_NanH2_PolyB25_2h_1 | K001 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n",
2066 |        "| VK2_NanH2_PolyB25_2h_2 | NanH2_treated_2H | 31232480 | 1 | VK2_NanH2_PolyB25_2h_2 | K002 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n",
2067 |        "| VK2_NanH2_PolyB25_2h_3 | NanH2_treated_2H | 21901342 | 1 | VK2_NanH2_PolyB25_2h_3 | K003 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n",
2068 |        "| VK2_NanH2_PolyB25_2h_4 | NanH2_treated_2H | 19637114 | 1 | VK2_NanH2_PolyB25_2h_4 | K004 | Human | Female | NanH2_treated | 2H | NanH2_treated_2H |\n",
2069 |        "| VK2_NanH2_PolyB25_1h_5 | NanH2_treated_1H | 27468016 | 1 | VK2_NanH2_PolyB25_1h_5 | K005 | Human | Female | NanH2_treated | 1H | NanH2_treated_1H |\n",
2070 |        "| VK2_NanH2_PolyB25_1h_6 | NanH2_treated_1H | 24935803 | 1 | VK2_NanH2_PolyB25_1h_6 | K006 | Human | Female | NanH2_treated | 1H | NanH2_treated_1H |\n",
2071 |        "\n"
2072 |       ],
2073 |       "text/plain": [
2074 |        "                       group            lib.size norm.factors\n",
2075 |        "VK2_NanH2_PolyB25_2h_1 NanH2_treated_2H 22490889 1           \n",
2076 |        "VK2_NanH2_PolyB25_2h_2 NanH2_treated_2H 31232480 1           \n",
2077 |        "VK2_NanH2_PolyB25_2h_3 NanH2_treated_2H 21901342 1           \n",
2078 |        "VK2_NanH2_PolyB25_2h_4 NanH2_treated_2H 19637114 1           \n",
2079 |        "VK2_NanH2_PolyB25_1h_5 NanH2_treated_1H 27468016 1           \n",
2080 |        "VK2_NanH2_PolyB25_1h_6 NanH2_treated_1H 24935803 1           \n",
2081 |        "                       Sample.Name            Sample.Code\n",
2082 |        "VK2_NanH2_PolyB25_2h_1 VK2_NanH2_PolyB25_2h_1 K001       \n",
2083 |        "VK2_NanH2_PolyB25_2h_2 VK2_NanH2_PolyB25_2h_2 K002       \n",
2084 |        "VK2_NanH2_PolyB25_2h_3 VK2_NanH2_PolyB25_2h_3 K003       \n",
2085 |        "VK2_NanH2_PolyB25_2h_4 VK2_NanH2_PolyB25_2h_4 K004       \n",
2086 |        "VK2_NanH2_PolyB25_1h_5 VK2_NanH2_PolyB25_1h_5 K005       \n",
2087 |        "VK2_NanH2_PolyB25_1h_6 VK2_NanH2_PolyB25_1h_6 K006       \n",
2088 |        "                       Reference.Genome..Organism. Gender Condition    \n",
2089 |        "VK2_NanH2_PolyB25_2h_1 Human                       Female NanH2_treated\n",
2090 |        "VK2_NanH2_PolyB25_2h_2 Human                       Female NanH2_treated\n",
2091 |        "VK2_NanH2_PolyB25_2h_3 Human                       Female NanH2_treated\n",
2092 |        "VK2_NanH2_PolyB25_2h_4 Human                       Female NanH2_treated\n",
2093 |        "VK2_NanH2_PolyB25_1h_5 Human                       Female NanH2_treated\n",
2094 |        "VK2_NanH2_PolyB25_1h_6 Human                       Female NanH2_treated\n",
2095 |        "                       Time.point Condition_Time  \n",
2096 |        "VK2_NanH2_PolyB25_2h_1 2H         NanH2_treated_2H\n",
2097 |        "VK2_NanH2_PolyB25_2h_2 2H         NanH2_treated_2H\n",
2098 |        "VK2_NanH2_PolyB25_2h_3 2H         NanH2_treated_2H\n",
2099 |        "VK2_NanH2_PolyB25_2h_4 2H         NanH2_treated_2H\n",
2100 |        "VK2_NanH2_PolyB25_1h_5 1H         NanH2_treated_1H\n",
2101 |        "VK2_NanH2_PolyB25_1h_6 1H         NanH2_treated_1H"
2102 |       ]
2103 |      },
2104 |      "metadata": {},
2105 |      "output_type": "display_data"
2106 |     },
2107 |     {
2108 |      "data": {
2109 |       "text/html": [
2110 |        "<table class=\"dataframe\">\n",
2111 |        "<caption>A data.frame: 6 × 4</caption>\n",
2112 |        "<thead>\n",
2113 |        "\t<tr><th></th><th scope=col>ENSEMBL</th><th scope=col>ENTREZID</th><th scope=col>SYMBOL</th><th scope=col>gene_type</th></tr>\n",
2114 |        "\t<tr><th></th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th></tr>\n",
2115 |        "</thead>\n",
2116 |        "<tbody>\n",
2117 |        "\t<tr><th scope=row>1</th><td>ENSG00000000003</td><td>7105 </td><td>TSPAN6  </td><td>protein_coding</td></tr>\n",
2118 |        "\t<tr><th scope=row>2</th><td>ENSG00000000005</td><td>64102</td><td>TNMD    </td><td>protein_coding</td></tr>\n",
2119 |        "\t<tr><th scope=row>3</th><td>ENSG00000000419</td><td>8813 </td><td>DPM1    </td><td>protein_coding</td></tr>\n",
2120 |        "\t<tr><th scope=row>4</th><td>ENSG00000000457</td><td>57147</td><td>SCYL3   </td><td>protein_coding</td></tr>\n",
2121 |        "\t<tr><th scope=row>5</th><td>ENSG00000000460</td><td>55732</td><td>C1orf112</td><td>protein_coding</td></tr>\n",
2122 |        "\t<tr><th scope=row>6</th><td>ENSG00000000938</td><td>2268 </td><td>FGR     </td><td>protein_coding</td></tr>\n",
2123 |        "</tbody>\n",
2124 |        "</table>\n"
2125 |       ],
2126 |       "text/latex": [
2127 |        "A data.frame: 6 × 4\n",
2128 |        "\\begin{tabular}{r|llll}\n",
2129 |        "  & ENSEMBL & ENTREZID & SYMBOL & gene\\_type\\\\\n",
2130 |        "  & <chr> & <chr> & <chr> & <chr>\\\\\n",
2131 |        "\\hline\n",
2132 |        "\t1 & ENSG00000000003 & 7105  & TSPAN6   & protein\\_coding\\\\\n",
2133 |        "\t2 & ENSG00000000005 & 64102 & TNMD     & protein\\_coding\\\\\n",
2134 |        "\t3 & ENSG00000000419 & 8813  & DPM1     & protein\\_coding\\\\\n",
2135 |        "\t4 & ENSG00000000457 & 57147 & SCYL3    & protein\\_coding\\\\\n",
2136 |        "\t5 & ENSG00000000460 & 55732 & C1orf112 & protein\\_coding\\\\\n",
2137 |        "\t6 & ENSG00000000938 & 2268  & FGR      & protein\\_coding\\\\\n",
2138 |        "\\end{tabular}\n"
2139 |       ],
2140 |       "text/markdown": [
2141 |        "\n",
2142 |        "A data.frame: 6 × 4\n",
2143 |        "\n",
2144 |        "| <!--/--> | ENSEMBL &lt;chr&gt; | ENTREZID &lt;chr&gt; | SYMBOL &lt;chr&gt; | gene_type &lt;chr&gt; |\n",
2145 |        "|---|---|---|---|---|\n",
2146 |        "| 1 | ENSG00000000003 | 7105  | TSPAN6   | protein_coding |\n",
2147 |        "| 2 | ENSG00000000005 | 64102 | TNMD     | protein_coding |\n",
2148 |        "| 3 | ENSG00000000419 | 8813  | DPM1     | protein_coding |\n",
2149 |        "| 4 | ENSG00000000457 | 57147 | SCYL3    | protein_coding |\n",
2150 |        "| 5 | ENSG00000000460 | 55732 | C1orf112 | protein_coding |\n",
2151 |        "| 6 | ENSG00000000938 | 2268  | FGR      | protein_coding |\n",
2152 |        "\n"
2153 |       ],
2154 |       "text/plain": [
2155 |        "  ENSEMBL         ENTREZID SYMBOL   gene_type     \n",
2156 |        "1 ENSG00000000003 7105     TSPAN6   protein_coding\n",
2157 |        "2 ENSG00000000005 64102    TNMD     protein_coding\n",
2158 |        "3 ENSG00000000419 8813     DPM1     protein_coding\n",
2159 |        "4 ENSG00000000457 57147    SCYL3    protein_coding\n",
2160 |        "5 ENSG00000000460 55732    C1orf112 protein_coding\n",
2161 |        "6 ENSG00000000938 2268     FGR      protein_coding"
2162 |       ]
2163 |      },
2164 |      "metadata": {},
2165 |      "output_type": "display_data"
2166 |     }
2167 |    ],
2168 |    "source": [
2169 |     "head(gDgeList$counts)\n",
2170 |     "head(gDgeList$samples)\n",
2171 |     "head(gDgeList$genes)"
2172 |    ]
2173 |   },
2174 |   {
2175 |    "cell_type": "markdown",
2176 |    "metadata": {},
2177 |    "source": [
2178 |     "[Table of Contents](#Table-of-Contents)\n",
2179 |     "\n",
2180 |     "## Summary"
2181 |    ]
2182 |   },
2183 |   {
2184 |    "cell_type": "markdown",
2185 |    "metadata": {},
2186 |    "source": [
2187 |     "**Gene annotations**\n",
2188 |     "* Human gene annotations were taken from the Gencode project, Release 38 (GRCh38.p13).\n",
2189 |     "\n",
2190 |     "**Gene type filtering**\n",
2191 |     "* This analysis includes both protein-coding and non-coding genes.  Of the original 60,605 Ensembl genes in the dataset, 19,937 are known coding genes.  \n"
2192 |    ]
2193 |   },
2194 |   {
2195 |    "cell_type": "markdown",
2196 |    "metadata": {},
2197 |    "source": [
2198 |     "Save the workspace objects for future reference:"
2199 |    ]
2200 |   },
2201 |   {
2202 |    "cell_type": "code",
2203 |    "execution_count": 86,
2204 |    "metadata": {},
2205 |    "outputs": [
2206 |     {
2207 |      "name": "stdout",
2208 |      "output_type": "stream",
2209 |      "text": [
2210 |       "[1] \"Output file: 20230302161642_data_integration.RData\"\n"
2211 |      ]
2212 |     }
2213 |    ],
2214 |    "source": [
2215 |     "writeWorkspaceImage(gInterimDir, paste0(gRunName,\"_data_integration\"))"
2216 |    ]
2217 |   },
2218 |   {
2219 |    "cell_type": "markdown",
2220 |    "metadata": {},
2221 |    "source": [
2222 |     "[Table of Contents](#Table-of-Contents)\n",
2223 |     "\n",
2224 |     "## Citations\n",
2225 |     "\n",
2226 |     "1. Law CW, Alhamdoosh M, Su S, Smyth GK, Ritchie ME. RNA-seq analysis is easy as 1-2-3 with limma, Glimma and edgeR. Version 2. F1000Res. 2016 Jun 17 [revised 2016 Jan 1];5:1408.\n",
2227 |     "2. Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. Bioinformatics 26, 139-140.\n",
2228 |     "3. Huber W, Carey VJ, Gentleman R, Anders S, Carlson M, Carvalho BS, Bravo HC, Davis S, Gatto L, Girke T, Gottardo R, Hahne F, Hansen KD, Irizarry RA, Lawrence M, Love MI, MacDonald J, Obenchain V, Oleś AK, Pagès H, Reyes A, Shannon P, Smyth GK, Tenenbaum D, Waldron L, Morgan M. Orchestrating high-throughput genomic analysis with Bioconductor. Nat Methods. 2015 Feb;12(2):115-21.\n",
2229 |     "4. R Core Team (2016). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/."
2230 |    ]
2231 |   },
2232 |   {
2233 |    "cell_type": "markdown",
2234 |    "metadata": {},
2235 |    "source": [
2236 |     "[Table of Contents](#Table-of-Contents)\n",
2237 |     "\n",
2238 |     "## Appendix: R Session Info"
2239 |    ]
2240 |   },
2241 |   {
2242 |    "cell_type": "code",
2243 |    "execution_count": 87,
2244 |    "metadata": {},
2245 |    "outputs": [
2246 |     {
2247 |      "data": {
2248 |       "text/plain": [
2249 |        "[1] \"2023-03-03 10:48:37 MST\""
2250 |       ]
2251 |      },
2252 |      "metadata": {},
2253 |      "output_type": "display_data"
2254 |     },
2255 |     {
2256 |      "data": {
2257 |       "text/plain": [
2258 |        "R version 4.1.3 (2022-03-10)\n",
2259 |        "Platform: x86_64-apple-darwin13.4.0 (64-bit)\n",
2260 |        "Running under: macOS Big Sur/Monterey 10.16\n",
2261 |        "\n",
2262 |        "Matrix products: default\n",
2263 |        "BLAS/LAPACK: /Users/dchilinfuentes/opt/anaconda3/envs/RNAseq_env/lib/libopenblasp-r0.3.20.dylib\n",
2264 |        "\n",
2265 |        "locale:\n",
2266 |        "[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8\n",
2267 |        "\n",
2268 |        "attached base packages:\n",
2269 |        "[1] stats4    stats     graphics  grDevices utils     datasets  methods  \n",
2270 |        "[8] base     \n",
2271 |        "\n",
2272 |        "other attached packages:\n",
2273 |        " [1] edgeR_3.36.0                           \n",
2274 |        " [2] limma_3.50.3                           \n",
2275 |        " [3] Homo.sapiens_1.3.1                     \n",
2276 |        " [4] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2\n",
2277 |        " [5] org.Hs.eg.db_3.14.0                    \n",
2278 |        " [6] GO.db_3.14.0                           \n",
2279 |        " [7] OrganismDbi_1.36.0                     \n",
2280 |        " [8] GenomicFeatures_1.46.5                 \n",
2281 |        " [9] GenomicRanges_1.46.1                   \n",
2282 |        "[10] GenomeInfoDb_1.30.1                    \n",
2283 |        "[11] AnnotationDbi_1.56.2                   \n",
2284 |        "[12] IRanges_2.28.0                         \n",
2285 |        "[13] S4Vectors_0.32.4                       \n",
2286 |        "[14] Biobase_2.54.0                         \n",
2287 |        "[15] BiocGenerics_0.40.0                    \n",
2288 |        "\n",
2289 |        "loaded via a namespace (and not attached):\n",
2290 |        " [1] MatrixGenerics_1.6.0        httr_1.4.3                 \n",
2291 |        " [3] bit64_4.0.5                 jsonlite_1.8.0             \n",
2292 |        " [5] assertthat_0.2.1            BiocManager_1.30.18        \n",
2293 |        " [7] BiocFileCache_2.2.1         RBGL_1.70.0                \n",
2294 |        " [9] blob_1.2.3                  GenomeInfoDbData_1.2.7     \n",
2295 |        "[11] Rsamtools_2.10.0            yaml_2.3.5                 \n",
2296 |        "[13] progress_1.2.2              pillar_1.8.0               \n",
2297 |        "[15] RSQLite_2.2.15              lattice_0.20-45            \n",
2298 |        "[17] glue_1.6.2                  uuid_1.1-0                 \n",
2299 |        "[19] digest_0.6.29               XVector_0.34.0             \n",
2300 |        "[21] htmltools_0.5.3             Matrix_1.4-1               \n",
2301 |        "[23] XML_3.99-0.10               pkgconfig_2.0.3            \n",
2302 |        "[25] biomaRt_2.50.3              zlibbioc_1.40.0            \n",
2303 |        "[27] purrr_0.3.4                 BiocParallel_1.28.3        \n",
2304 |        "[29] tibble_3.1.8                KEGGREST_1.34.0            \n",
2305 |        "[31] generics_0.1.3              ellipsis_0.3.2             \n",
2306 |        "[33] cachem_1.0.6                SummarizedExperiment_1.24.0\n",
2307 |        "[35] repr_1.1.4                  cli_3.3.0                  \n",
2308 |        "[37] magrittr_2.0.3              crayon_1.5.1               \n",
2309 |        "[39] memoise_2.0.1               evaluate_0.16              \n",
2310 |        "[41] fansi_1.0.3                 xml2_1.3.3                 \n",
2311 |        "[43] graph_1.72.0                tools_4.1.3                \n",
2312 |        "[45] prettyunits_1.1.1           hms_1.1.1                  \n",
2313 |        "[47] BiocIO_1.4.0                lifecycle_1.0.1            \n",
2314 |        "[49] matrixStats_0.62.0          stringr_1.4.0              \n",
2315 |        "[51] locfit_1.5-9.6              DelayedArray_0.20.0        \n",
2316 |        "[53] Biostrings_2.62.0           compiler_4.1.3             \n",
2317 |        "[55] rlang_1.0.4                 grid_4.1.3                 \n",
2318 |        "[57] RCurl_1.98-1.8              pbdZMQ_0.3-7               \n",
2319 |        "[59] IRkernel_1.3                rjson_0.2.21               \n",
2320 |        "[61] rappdirs_0.3.3              bitops_1.0-7               \n",
2321 |        "[63] base64enc_0.1-3             restfulr_0.0.15            \n",
2322 |        "[65] DBI_1.1.3                   curl_4.3.2                 \n",
2323 |        "[67] R6_2.5.1                    GenomicAlignments_1.30.0   \n",
2324 |        "[69] dplyr_1.0.9                 rtracklayer_1.54.0         \n",
2325 |        "[71] fastmap_1.1.0               bit_4.0.4                  \n",
2326 |        "[73] utf8_1.2.2                  filelock_1.0.2             \n",
2327 |        "[75] stringi_1.7.8               parallel_4.1.3             \n",
2328 |        "[77] IRdisplay_1.1               Rcpp_1.0.9                 \n",
2329 |        "[79] vctrs_0.4.1                 png_0.1-7                  \n",
2330 |        "[81] dbplyr_2.2.1                tidyselect_1.1.2           "
2331 |       ]
2332 |      },
2333 |      "metadata": {},
2334 |      "output_type": "display_data"
2335 |     }
2336 |    ],
2337 |    "source": [
2338 |     "Sys.time()\n",
2339 |     "sessionInfo()"
2340 |    ]
2341 |   },
2342 |   {
2343 |    "cell_type": "markdown",
2344 |    "metadata": {},
2345 |    "source": [
2346 |     "[Table of Contents](#Table-of-Contents)\n",
2347 |     "\n",
2348 |     "Copyright (c) 2018 UC San Diego Center for Computational Biology & Bioinformatics under the MIT License\n",
2349 |     "\n",
2350 |     "Notebook template by Amanda Birmingham"
2351 |    ]
2352 |   }
2353 |  ],
2354 |  "metadata": {
2355 |   "kernelspec": {
2356 |    "display_name": "R",
2357 |    "language": "R",
2358 |    "name": "ir"
2359 |   },
2360 |   "language_info": {
2361 |    "codemirror_mode": "r",
2362 |    "file_extension": ".r",
2363 |    "mimetype": "text/x-r-source",
2364 |    "name": "R",
2365 |    "pygments_lexer": "r",
2366 |    "version": "4.1.3"
2367 |   }
2368 |  },
2369 |  "nbformat": 4,
2370 |  "nbformat_minor": 2
2371 | }
2372 | 


--------------------------------------------------------------------------------