├── R
│   └── script_enrichment_tomato.R
├── data
│   └── produce_non_homologous_val_sets.py
├── moca_blue
│   ├── README.md
│   ├── mo_clu
│   │   ├── mo_cluster_v2.0.R
│   │   └── mo_cluster_v2.3.R
│   ├── mo_imp
│   │   ├── mo_imp_scores.v1.1.R
│   │   └── rdf5_get_epm_contrib_scores.v1.1.R
│   ├── mo_nom
│   │   ├── rdf5_get_cwms_per_pattern.v1.0R.R
│   │   ├── rdf5_get_pfm_per_pattern.v1.0R.R
│   │   └── rdf5_get_pwm_per_pattern.v1.0R.R
│   ├── mo_proj
│   │   ├── mo_check_mapping-performance_V1.5.R
│   │   ├── mo_check_mapping-performance_V1.7.R
│   │   ├── mo_epm_prediction.v1.0.R
│   │   ├── mo_feat-filter.v2.0.R
│   │   ├── mo_feat-filter.v2.3.R
│   │   ├── mo_feat-filter.v2.6.R
│   │   ├── mo_feat-filter.v3.3.R
│   │   ├── mo_feature_tester.v1.0.R
│   │   ├── mo_gene_mapper_v0.1-2.R
│   │   ├── mo_gene_mapper_v0.1.R
│   │   ├── mo_genotype_variance.v1.4.R
│   │   ├── motif_finder.v1.1.R
│   │   ├── motif_predictabilityV1.0.R
│   │   ├── motif_predictabilityV1.5.R
│   │   └── occ_filter_v1.1.R
│   ├── mo_ran
│   │   ├── meta_motif_ranges_characteristics_TSS-TTS.1.4.R
│   │   ├── meta_motif_ranges_characteristics_TSS-TTS.1.5.R
│   │   ├── rdf5_get_seql_per_patternV2.1.R
│   │   └── rdf5_get_seql_per_patternV2.R
│   └── ref_seq
│       ├── blamm_meV1.0.sh
│       ├── extract_range_to_fasta.sh
│       └── split_file.sh
├── model
│   ├── create_generic_features.py
│   ├── create_super_genome_gtf_tpm_for_msr.py
│   ├── cross_specie_test_leaf.py
│   ├── cross_specie_test_root.py
│   ├── effect_of_different_outer_sizes.py
│   ├── effect_of_different_utr_sizes.py
│   ├── extract_motifs_msr.py
│   ├── extract_motifs_ssr.py
│   ├── fetch_genomes_and_annotation.sh
│   ├── motif_discovery_msr_leaf.py
│   ├── motif_discovery_ssr_leaf.py
│   ├── motif_discovery_ssr_root.py
│   ├── random_forest_msr.py
│   ├── random_forest_ssr.py
│   ├── train_msr_models_leaf.py
│   ├── train_msr_models_root.py
│   ├── train_ssr_ssc_models_leaf.py
│   ├── train_ssr_ssc_models_root.py
│   ├── utils.py
│   └── validation_genes.pickle
├── readme.md
└── requirements.txt
--------------------------------------------------------------------------------
/data/produce_non_homologous_val_sets.py:
--------------------------------------------------------------------------------
import os
import pickle

import pandas as pd
from Bio import SeqIO

proteins = ['Arabidopsis_thaliana.TAIR10.pep.all.fa', 'Zea_mays.Zm-B73-REFERENCE-NAM-5.0.pep.all.fa',
            'Solanum_lycopersicum.SL3.0.pep.all.fa', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.pep.all.fa']

blast_outputs = ['BLAST_ara_to_ara', 'BLAST_zea_to_zea', 'BLAST_sol_to_sol', 'BLAST_sor_to_sor']
validation_genes = dict()
for proteome, blast_output in zip(proteins, blast_outputs):
    if os.path.exists(f'proteomes/{blast_output}'):
        # Collect gene ID, protein ID and chromosome for every protein in the proteome
        info = []
        for rec in SeqIO.parse(f'proteomes/{proteome}', 'fasta'):
            description = rec.description.split(' ')
            protein_id = description[0]
            gene_id = description[3].split(':')[-1]
            chrom = description[2].split(':')[2]
            info.append([gene_id, protein_id, chrom])
        info = pd.DataFrame(info, columns=['gene_id', 'protein_id', 'chrom'])
        info.index = info.protein_id.tolist()
        print(info.head())
        # Standard BLAST outfmt 6 columns
        blast_out = pd.read_csv(f'proteomes/{blast_output}', sep='\t',
                                names=['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend',
                                       'sstart', 'send', 'evalue', 'bitscore'])
        blast_out['qgene'] = info.loc[blast_out.qseqid.tolist(), 'gene_id'].values
        blast_out['sgene'] = info.loc[blast_out.sseqid.tolist(), 'gene_id'].values
        blast_out['qchrom'] = info.loc[blast_out.qseqid.tolist(), 'chrom'].values
        blast_out['schrom'] = info.loc[blast_out.sseqid.tolist(), 'chrom'].values
        # Keep only confident hits on nuclear chromosomes
        blast_out = blast_out[blast_out['evalue'] < 0.001]
        blast_out = blast_out[blast_out['bitscore'] >= 50]
        blast_out = blast_out[~blast_out['schrom'].isin(['Pt', 'Mt'])]
        blast_out = blast_out[~blast_out['qchrom'].isin(['Pt', 'Mt'])]
        blast_out['homologs_pairs'] = [x + '_' + y for x, y in zip(blast_out.qchrom, blast_out.schrom)]
        # A gene whose hits are all confined to a single query/subject chromosome
        # pair (i.e. essentially only its self-match) has no detectable homolog
        # elsewhere and is safe to place in a validation set.
        val_set = []
        for gene_grp in blast_out.groupby('qgene'):
            if len(gene_grp[1].homologs_pairs.unique()) == 1:
                val_set.append(gene_grp[0])
        validation_genes[f"{blast_output.split('_')[-1]}"] = val_set

if os.path.exists('../model/validation_genes.pickle'):
    os.remove('../model/validation_genes.pickle')
with open('../model/validation_genes.pickle', 'wb') as pickle_file:
    pickle.dump(validation_genes, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
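The resulting pickle maps a species tag ('ara', 'zea', 'sol', 'sor') to its list of
validation-safe gene IDs. A minimal sketch for inspecting it from the R side of the
repository, assuming the reticulate package and a Python installation are available
(the printed fields are illustrative):

# Sketch: load the validation set produced by produce_non_homologous_val_sets.py.
library(reticulate)
validation_genes <- py_load_object("model/validation_genes.pickle")
str(validation_genes, max.level = 1)   # named list: one entry of gene IDs per species tag
head(validation_genes$ara)             # genes from the Arabidopsis self-BLAST run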
--------------------------------------------------------------------------------
/moca_blue/README.md:
--------------------------------------------------------------------------------
# moca_blue
[2023-06-07]

MOCA BLUE

MOtif
Characterization
& Annotation
from DEEP LEARNING feature enrichment

Welcome to the moca_blue suite!
by Simon M. Zumkeller
RStudio
2022.07.2 Build 576

This is a toolbox for the analysis of DNA motifs
that have been derived from deep-learning model feature extraction.
moca_blue is currently in development.

More detailed descriptions of the directories
and of the scripts' roles within them are given below.

This is a pipeline of consecutive operations that are and will be made available here.

INPUT DIRECTORY      /0MOTIFS                         /ref_seq
                     - HDF5 files                     - fastas
                       [feature extraction files]     - gffs
                             |                        - meta-data
START DIRECTORY      /mo_nom            /mo_range         |
output               - get motif        - get motif       |
                       patterns           meta-data       |
                     - motif annotation                   |
                     - motif modification                 |
                             |____________________________|
                             |                 |
                     /mo_clu               MAPPING to reference (external)
                     - analyze motifs      | use e.g. "blamm"
                     - compare/cluster     | (https://github.com/biointec/blamm)
                             |             | cp occurrences.txt [results] /mo_proj
                             |_____________|
                                    |
                                /mo_proj
                                - filter for meaningful matches
                                - interpret model predictions
                                - gene annotation
                                - module generation



mo_nom --------------------------

Extract motifs from MoDisco hdf5 files and assign nomenclature.
Currently, there are three versions of the same script, one for each format of weight matrix.

rdf5_get_xxx_per_pattern.v1.0R.R

PFM - position frequency matrix
PWM - position weight matrix (best for clustering/comparison)
CWM - contribution weight matrix (best for mapping)

A short sketch for reading these jaspar outputs and converting between matrix types follows below.
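This is a minimal sketch, assuming universalmotif is installed; the file name is one
possible mo_nom output and is illustrative only:

# Load a mo_nom jaspar file and derive a PWM from the stored counts.
library(universalmotif)
motifs <- read_jaspar("mo_nom/out/rdf5_epmZemaC0_pfm-motifs.jaspar")
pwms   <- convert_type(motifs, type = "PWM")  # log-odds, suited for clustering/comparison
view_motifs(pwms[[1]])                        # sequence logo of the first motif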
mo_range ------------------------

Motifs/EPMs are not distributed at random in a genome.
To optimize the search for motifs/EPMs in a genome or gene-space, these tools
extract the positionally preferred ranges for each motif/EPM in an hdf5 file.
A toy example of such a range summary is sketched right after this section.

rdf5_get_seql_per_patternV2.R - Extract a list of seqlets and their positions from the hdf5 file

meta_motif_ranges_characteristics_TSS-TTS.1.1.R - Produce a table from the rdf5_get_seql_per_patternV2.R output
that provides the gene-space statistics for each motif/seqlet in reference to transcription start and stop sites (TSS, TTS)
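A minimal sketch of a positional-range summary; the seqlets data frame, its column
names and the numbers are hypothetical stand-ins, not the scripts' actual output format:

# Toy example: the central 80% range of seqlet positions per motif.
library(dplyr)
seqlets <- data.frame(
  motif = rep(c("epm_Zema_C0_p0m01F", "epm_Zema_C0_p1m02F"), each = 100),
  pos_rel_TSS = c(rnorm(100, -150, 40), rnorm(100, 300, 80)))  # made-up positions
seqlets %>%
  group_by(motif) %>%
  summarise(lower = quantile(pos_rel_TSS, 0.10),
            upper = quantile(pos_rel_TSS, 0.90))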
mo_clu --------------------------

Analyse and edit motif files stored in jaspar format here. Results should be stored in the "out" directory.

mo_cluster_v2.0.R - generates dendrograms/trees based on a distance matrix for different models.


mo_old ---------------------------

Old and outdated scripts used for the moca_blue suite are stored here.



ref_seq -------------------------

Store genome data like fastas, gffs and more here for INPUT.

--------------------------------------------------------------------------------
/moca_blue/mo_clu/mo_cluster_v2.0.R:
--------------------------------------------------------------------------------
#install.packages("BiocManager")
#if (!require("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")
#
#BiocManager::install("DNABin")
#BiocManager::install("ggtree")
#install.packages("phangorn")
#library(TFBSTools)
#library(JASPAR2020)
library(universalmotif)
library(motifmatchr)
library(ape)
library(motifStack)
library(ade4)
#library(phangorn)
library(ggtree)
##################################################
###################### Setup for "moca_blue" environment
NAME0="rdf5_epm"
SPEC ="Zema"
MODEL="C0" # C0 stands for DeepCistrome version 1 (available at 02-may-2023), standard conditions
TYPE ="_pfm-motifs.jaspar"
#######################################################
#FILE1 = paste0(NAME0,SPEC,MODEL,TYPE)

FILE1 = "all_motifs_20230505.jaspar"
#######################################################
dirpath_in = "../mo_nom/out/"   # note: the directory is lower-case "mo_nom" on disk
dirpath_out = "./out/"
# # # # # # # # # # # # # # # # # # # # # # # # # # # #
File1 <- paste0(dirpath_in,FILE1)
# # # # # # # # # # # # # # # # # # # # # # # # # # # #
##################################################
cwm1 <- read_jaspar(File1)
##################################################
# Loop through each motif object in the list
for (i in seq_along(cwm1)) {
  # Extract the motif name and the number after the last "_" underscore
  motif_name <- attr(cwm1[[i]], "name")
  nsites <- as.numeric(sub(".+_(\\d+)$", "\\1", motif_name))
  # Assign the nsites value to the "nsites" field of the motif object
  cwm1[[i]]["nsites"] <- nsites
}
##################################################
# Loop through each motif object in the list
for (i in seq_along(cwm1)) {
  # Extract the Total IC and the Consensus values from the motif object
  total_ic <- attr(cwm1[[i]], "icscore")
  total_ic_rounded <- round(total_ic, 1)

  consensus <- attr(cwm1[[i]], "consensus")

  # Append the Total IC and the Consensus values, separated by "_", to the motif name
  motif_name <- attr(cwm1[[i]], "name")
  new_motif_name <- paste0(motif_name, "_", total_ic_rounded, "_", consensus)

  # Assign the new motif name to the motif object
  attr(cwm1[[i]], "name") <- new_motif_name
}
##################################################
pwm_uni0<-convert_motifs(
  cwm1, class = "TFBSTools-PWMatrix")

pcm<-convert_motifs(
  cwm1, class = "motifStack-pcm")
##################################################
sum<-as.data.frame(summarise_motifs(pcm))
write.csv(sum, file = paste0(dirpath_out,FILE1,"summary.txt"))
##################################################
c_pcm<-clusterMotifs(pcm) ### !!! TIME TO GET A COFFEE !!! ###
hc<- c_pcm
motifs<-pcm[hc$order]
##################################################
write.tree(as.phylo(c_pcm), file = paste0(dirpath_out,FILE1,"-SW.nwk"))

########################################################
#motifs<-pcm[hc$order]
#motifs <- lapply(motifs, pcm2pfm)
#d1o alignment
#compare_motif()
##########################################################
cwm1[[1]]
##########################################################
c<-compare_motifs(cwm1, method = "EUCL")
c0<-as.data.frame(c)
write.csv(c0, file = paste0(dirpath_out,FILE1,"_matrix-EUCL.txt"))
#assign scores from data.frame to branches for selection
tree <- motif_tree(cwm1, layout = "rectangular", db.scores = "scores", method = "EUCL")
write.tree(as.phylo(tree), file = paste0(dirpath_out,FILE1,"-EUCL.nwk"))
##########################################################

##########################################################
c<-compare_motifs(cwm1, method = "WEUCL")
c0<-as.data.frame(c)
#assign scores from data.frame to branches for selection
tree <- motif_tree(cwm1, layout = "rectangular", db.scores = "scores", method = "WEUCL")
write.tree(as.phylo(tree), file = paste0(dirpath_out,FILE1,"-WEUC2.nwk"))
############################################################################
c<-compare_motifs(cwm1, method = "PCC")
c0<-as.data.frame(c)
#assign scores from data.frame to branches for selection
#filter scores based on failed IC (low motif scores)
tree <- motif_tree(cwm1, layout = "rectangular", db.scores = "scores", method = "PCC")
write.tree(as.phylo(tree), file = paste0(dirpath_out,FILE1,"-PCC.nwk"))
############################################################################
c<-compare_motifs(cwm1, method = "ALLR_LL")
c0<-as.data.frame(c)
#assign scores from data.frame to branches for selection
tree <- motif_tree(cwm1, layout = "rectangular", db.scores = "scores", method = "ALLR_LL")
write.tree(as.phylo(tree), file = paste0(dirpath_out,FILE1,"-ALLR_LL.nwk"))
############################################################################

#motif_pvalue(cwm1)



#ggtree::ggtree(b,
#               layout="circular") + geom_tiplab2(size =2.5) + xlim(-15, 5)
#cwm2 <- read_jaspar(File2)
#plot(hclust(dist(a0)))
#a<-compare_motifs(cwm1, method = "WPCC")
#a0<-as.data.frame(a)
#plot(hclust(dist(a0)))
#b<-hclust(dist(a0))

#ggtree::ggtree(b,
#               layout="circular") + geom_tiplab2(size =2.5) + xlim(-15, 5)
#b0<-as.phylo(b)
#write.tree(b0, file = paste(JOB_NAME,".nex"))

######################################################################

#motif_tree(cwm1,
#           label = "name",
#           size=.3,
#           layout = "circular",
#           method = "EUCL",
#           legend = F,
#           db.scores = cwm1)
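The exported Newick trees can be inspected straight away; a quick sketch (the file
name follows the naming scheme used above):

# Re-load and plot one of the trees written by mo_cluster_v2.0.R.
library(ape)
tr <- read.tree("./out/all_motifs_20230505.jaspar-EUCL.nwk")
plot(tr, cex = 0.5, no.margin = TRUE)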
--------------------------------------------------------------------------------
/moca_blue/mo_clu/mo_cluster_v2.3.R:
--------------------------------------------------------------------------------
# CLUSTER MOTIFS BASED ON SANDELIN & WASSERMAN 2004
# GET NWK-TREE AND SETS OF HIGH SIMILARITY
#################################################
###################### Setup for "moca_blue" environment
NAME0="rdf5_epm"
SPEC ="Zema"
MODEL="S0+M0" # C0 stands for DeepCistrome version 1 (available at 02-may-2023), standard conditions
TYPE ="_cwm-motifs.jaspar"
#######################################################
#FILE1 = paste0(NAME0,SPEC,MODEL,TYPE)
FILE1 = "rdf5_ZemaS0+M0_cwm-motifs.jaspar"
#######################################################
dirpath_in = "../mo_nom/out/"
dirpath_out = "./out/"
# # # # # # # # # # # # # # # # # # # # # # # # # # # #
File1 <- paste0(dirpath_in,FILE1)
# # # # # # # # # # # # # # # # # # # # # # # # # # # #
library(grid)
library(TFBSTools)
library(motifStack)
library(universalmotif)
library(ape)
library(ggtree)
##################################################
cwm1 <- read_jaspar(File1)
##################################################
# Loop through each motif object in the list
for (i in seq_along(cwm1)) {
  # Extract the motif name and the number after the last "_" underscore
  motif_name <- attr(cwm1[[i]], "name")
  nsites <- as.numeric(sub(".+_(\\d+)$", "\\1", motif_name))
  # Assign the nsites value to the "nsites" field of the motif object
  cwm1[[i]]["nsites"] <- nsites
}
##################################################
# Loop through each motif object in the list
for (i in seq_along(cwm1)) {
  # Extract the Total IC and the Consensus values from the motif object
  total_ic <- attr(cwm1[[i]], "icscore")
  total_ic_rounded <- round(total_ic, 1)
  consensus <- attr(cwm1[[i]], "consensus")
  # Append the Total IC and the Consensus values, separated by "_", to the motif name
  motif_name <- attr(cwm1[[i]], "name")
  new_motif_name <- paste0(motif_name, "_", total_ic_rounded, "_", consensus)
  # Assign the new motif name to the motif object
  attr(cwm1[[i]], "name") <- new_motif_name
}
##################################################
pwm_uni0<-convert_motifs(
  cwm1, class = "TFBSTools-PWMatrix")
pcm<-convert_motifs(
  cwm1, class = "motifStack-pcm")
##################################################
sum<-as.data.frame(summarise_motifs(pcm))
write.csv(sum, file = paste0(dirpath_out,FILE1,"summary.txt"))
##################################################
##########################################################
c<-compare_motifs(cwm1, method = "SW")
c0<-as.data.frame(c)
c1 <- as.matrix(c0)
lower_percentile <- quantile(c1, probs = 0.05)
upper_percentile <- quantile(c1, probs = 0.95)
##############        ####################        ##################
##############        ####################        ##################
# Binarize the similarity matrix at the 95th percentile: 1 = highly similar pair
c2 <- c0-upper_percentile
c3 <- as.data.frame(ifelse(c2 < 0, 0, 1))
c3$sum <- rowSums(c3)
epms_without_highly_similar_counterparts <- rownames(c3[c3$sum == 2, ])
epms_with_highly_similar_counterparts <- rownames(c3[c3$sum != 2, ])
write.table(epms_without_highly_similar_counterparts, file = paste0(dirpath_out, FILE1, "_epms_without_highly_similar_counterparts-SW.csv"), sep = "\t", col.names = NA, quote = FALSE)
write.table(epms_with_highly_similar_counterparts, file = paste0(dirpath_out, FILE1, "_epms_with_highly_similar_counterparts-SW.csv"), sep = "\t", col.names = NA, quote = FALSE)
write.table(c0, file = paste0(dirpath_out, FILE1, "_matrix-SW.csv"), sep = "\t", col.names = NA, quote = FALSE)
#assign scores from data.frame to branches for selection
# Turn similarities into distances for hierarchical clustering
comp_1 <- 1-c
comp_1 <- as.dist(comp_1)
#labels <- attr(comp_1, "Labels")
comp_2 <- ape::as.phylo(hclust(comp_1))
comp_2[["edge.length"]] <- comp_2[["edge.length"]]+1
comp_2[["edge.length"]]
# Create a rooted phylo object
phylo_tree <- as.phylo(comp_2)
# Save the tree with positive edge lengths
write.tree(comp_2, file = paste0(dirpath_out, FILE1, "-Sandelin-Wassermann.nwk"))
#################################################
c_pcm<-clusterMotifs(pcm, method = "Smith-Waterman") ### !!! TIME TO GET A COFFEE !!! ###
hc<- c_pcm
motifs<-pcm[hc$order]
##################################################
write.tree(as.phylo(c_pcm), file = paste0(dirpath_out,FILE1,"-Smith-Waterman.nwk"))
--------------------------------------------------------------------------------
/moca_blue/mo_imp/mo_imp_scores.v1.1.R:
--------------------------------------------------------------------------------
######################
library(rhdf5)
library(tidyr)
library(ggplot2)
library(ggseqlogo)
#setwd("~/ibg-4/Desktop/Rhome/moca_blue/mo_scores")
###################### Setup for "moca_blue" environment
NAME0="rdf5_"
SPEC="Soly"
MODEL="S0" # C0 stands for DeepCistrome version 1 (available at 02-may-2023), standard conditions
DATE= "20230904"
#######################################################
FILE1= "solanum_scores.h5"
FILE2= "solanum_meta_saliency_info.csv"
#######################################################
dirpath_in1 = "../0MOTIFS/saliency_scores/"
dirpath_out = "./out"
# # # # # # # # # # # # # # # # # # # # # # # # # # # #
file_path1 = file.path(dirpath_in1,FILE1)
# # # # # # # # # # # # # # # # # # # # # # # # # # # #
model_parameter <- read.table(
  file.path(
    dirpath_in1,
    FILE2),
  header=TRUE,
  sep=",")
#######################################################
model_parameter$pred_class <- ifelse(model_parameter$pred_prob >= 0.5, 1, 0)
model_parameter$TRUE_class <- ifelse(model_parameter$pred_class == model_parameter$true_target, TRUE, FALSE)
#######################################################
h5file <- H5Fopen(file_path1, "H5F_ACC_RDONLY")
#h5ls(h5file)
saliency_scores <- h5read(h5file,
                          "contrib_scores")
#######################################################
# Sum the per-base contribution scores into one importance value per position
num_arrays <- dim(saliency_scores)[3]
num_columns <- dim(saliency_scores)[2]
result_matrix <- matrix(0, nrow = num_arrays, ncol = num_columns)
for (i in 1:num_arrays) {
  current_array <- saliency_scores[,,i]
  column_sums <- colSums(current_array)
  result_matrix[i,] <- column_sums
}
imp_scores<-as.data.frame(result_matrix)
##################################################################################
# A gene counts as "predicted high" when its summed importance score is >= 0
model_parameter$sum_imp_score <- rowSums(imp_scores)
model_parameter$sum_imp_score2 <- ifelse(model_parameter$sum_imp_score >= 0, 1, 0)
model_parameter$TRUE_pred_imp <- ifelse(model_parameter$sum_imp_score2 == model_parameter$true_target, TRUE, FALSE)
gene_imp_scores <- as.data.frame(model_parameter$gene_id)
gene_imp_scores <- cbind(gene_imp_scores, imp_scores)
colnames(gene_imp_scores)[1] <- "loc_ID"
##################################################################################
write.table(model_parameter, file = paste0(DATE,NAME0, SPEC, MODEL,"-imp_score_parameter.csv"), sep = ",", col.names = NA, quote = FALSE)
#write.table(gene_imp_scores, file = paste0(DATE,NAME0, SPEC, MODEL,"-imp_score_array.csv"), sep = ",", col.names = NA, quote = FALSE)

#######################################################

selected_gene <- gene_imp_scores[gene_imp_scores[,1] == "Solyc01g111620.3", ]

#selected_gene <- imp_scores[1619, ]
selected_range <- selected_gene[, 750:1500]
# Convert the selected_range matrix to numeric
selected_range <- as.numeric(selected_range)

# Create a data frame for plotting
long_data <- data.frame(
  position = 750:1500,  # positions 750-1500 of the model input window
  importance = selected_range
)

# Create and display the line plot
line_plot <- ggplot(long_data, aes(x = position, y = importance)) +
  geom_line(size = 0.1, color = "black") +
  geom_vline(xintercept = 1500, color = "red", size = 0.5) +  # Added vertical line
  labs(title = "Range contrib scores",
       x = "Position Index",
       y = "Importance Value") +
  theme_minimal() +
  scale_x_continuous(breaks = seq(750, 1500, by = 50)) +  # Set the x-axis breaks
  theme(panel.grid = element_blank()) +  # Remove grid lines
  scale_y_continuous(limits = c(-0.1, 0.1)) +  # Set y-axis limits
  geom_hline(yintercept = 0, color = "gray", linetype = "dashed")  # Add gray line at y = 0

print(line_plot)

####################

####################
row_numbers <- which(gene_imp_scores[, 1] == "Solyc01g111620.3")

desired_matrix_subset <- saliency_scores[1:4, 750:1500, row_numbers]
head(desired_matrix_subset, 10)
#str(desired_matrix_subset)
rownames(desired_matrix_subset) <- c("A", "C", "G", "T")
####### CONTINUE HERE #########
seq_plot <- ggseqlogo(desired_matrix_subset, method = 'custom', seq_type = 'dna') +
  ylab('1')

print(seq_plot)
####################
--------------------------------------------------------------------------------
/moca_blue/mo_imp/rdf5_get_epm_contrib_scores.v1.1.R:
--------------------------------------------------------------------------------
#####################
# Extract the contribution scores of EPMs
# from hdf5 files from modisco
# Dr. SM Zumkeller 2023-08-25
######################


#setwd("~/ibg-4/Desktop/Rhome/moca_blue/mo_nom")
###################### Setup for "moca_blue" environment
NAME0="rdf5_CWMs"
SPEC="Arth"
MODEL="M0" # C0 stands for DeepCistrome version 1 (available at 02-may-2023), standard conditions
DATE = "20230904"
#
#
#
library(dplyr)
library(rhdf5)
#######################################################
FILE1= "Arabidopsis_MSR_modisco.hdf5"
#######################################################
dirpath_in = "../0MOTIFS/MODISCO_MSR"
dirpath_out = "./out"
#
file_path_out <- file.path(dirpath_out, paste0(DATE,"_",SPEC,MODEL,"_contrib_scores"))
# # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # #
h5file <- H5Fopen(file.path(
  dirpath_in,
  FILE1), "H5F_ACC_RDONLY")
#h5ls(h5file)
metacluster_group <- h5read(h5file,
                            "metacluster_idx_to_submetacluster_results")
#######################################################
# loop through the metaclusters 0 and 1
# (note: 'patterns' ends up holding the patterns of the last metacluster)
for (i in names(metacluster_group)) {
  metacluster <- metacluster_group[[i]]
  patterns = metacluster[['seqlets_to_patterns_result']][['patterns']]
}
# Define the pattern names to iterate over
########################################################
length(patterns[['all_pattern_names']])
x0 = length(metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]])-1
x1 = length(metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]])-1
y01 = x0+x1
y02 = x0*2+x1*2
###################################################
pattern_names <- paste0("pattern_", 0:y01) ############################################## !!! MANUAL ADJ REQUIRED
# Initialize an empty list to store the matrices
matricesF0 <- list()
matricesF1 <- list()
matricesR0 <- list()
matricesR1 <- list()
# Loop over the pattern names
for (pattern_name in pattern_names) {
  # Extract the matrix for the current pattern name
  matrixF0 <- metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["task0_contrib_scores"]][["fwd"]]
  matrixF1 <- metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["task0_contrib_scores"]][["fwd"]]
  matrixR0 <- metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["task0_contrib_scores"]][["rev"]]
  matrixR1 <- metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["task0_contrib_scores"]][["rev"]]
  #pfms in sequence
  #cwms in sequence [task0_contrib_scores] : contribution weight matrices
  # Append the matrix to the list
  matricesF0[[pattern_name]] <- matrixF0
  matricesF1[[pattern_name]] <- matrixF1
  matricesR0[[pattern_name]] <- matrixR0
  matricesR1[[pattern_name]] <- matrixR1
}
#####################################################################################
###############  ASSIGN NOMENCLATURE  ###################
###############  ASSIGN NOMENCLATURE  ###################
###############  ASSIGN NOMENCLATURE  ###################

##################################################################################### PFM to PWM to CWM
seqletls_lengths_p1 <- list()
# Loop through each pattern name and get the length of seqletls
for (pattern_name in pattern_names) {
  seqletls <- metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["seqlets_and_alnmts"]][["seqlets"]]
  seqletls_lengths_p1[[pattern_name]] <- length(seqletls)
}

seqletls_lengths_p0 <- list()
# Loop through each pattern name and get the length of seqletls
for (pattern_name in pattern_names) {
  seqletls <- metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["seqlets_and_alnmts"]][["seqlets"]]
  seqletls_lengths_p0[[pattern_name]] <- length(seqletls)
}

################
names(matricesF0) <- paste0(names(matricesF0),
                            "_p0m",
                            sprintf("%02d",
                                    as.numeric(substring(names(matricesF0), 9))))
names(matricesF0) <- substr(names(matricesF0), nchar(names(matricesF0)) - 4, nchar(names(matricesF0)))
names(matricesF0) <- paste0(names(matricesF0),
                            "F")
names(matricesF0) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesF0))
###############      ####################       ###################
names(matricesF1) <- paste0(names(matricesF1),
                            "_p1m",
                            sprintf("%02d",
                                    as.numeric(substring(names(matricesF1), 9))))
names(matricesF1) <- substr(names(matricesF1), nchar(names(matricesF1)) - 4, nchar(names(matricesF1)))
names(matricesF1) <- paste0(names(matricesF1),
                            "F")
names(matricesF1) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesF1))
###############      ####################       ###################
names(matricesR1) <- paste0(names(matricesR1),
                            "_p1m",
                            sprintf("%02d",
                                    as.numeric(substring(names(matricesR1), 9))))
names(matricesR1) <- substr(names(matricesR1), nchar(names(matricesR1)) - 4, nchar(names(matricesR1)))
names(matricesR1) <- paste0(names(matricesR1),
                            "R")
names(matricesR1) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesR1))
###############      ####################       ###################
names(matricesR0) <- paste0(names(matricesR0),
                            "_p0m",
                            sprintf("%02d",
                                    as.numeric(substring(names(matricesR0), 9))))
names(matricesR0) <- substr(names(matricesR0), nchar(names(matricesR0)) - 4, nchar(names(matricesR0)))
names(matricesR0) <- paste0(names(matricesR0),
                            "R")
names(matricesR0) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesR0))
###############      ####################       ###################  # ADD NUMBER OF SEQLETS TO NAMES !!!!
seqlets_count_p0 <- head(seqletls_lengths_p0, x0)

for (i in seq_along(seqlets_count_p0)) {
  name <- paste0(names(matricesF0)[i], "_", seqlets_count_p0[[i]])
  names(matricesF0)[i] <- name
}

for (i in seq_along(seqlets_count_p0)) {
  name <- paste0(names(matricesR0)[i], "_", seqlets_count_p0[[i]])
  names(matricesR0)[i] <- name
}

seqlets_count_p1 <- head(seqletls_lengths_p1, x1)

for (i in seq_along(seqlets_count_p1)) {
  name <- paste0(names(matricesF1)[i], "_", seqlets_count_p1[[i]])
  names(matricesF1)[i] <- name
}

for (i in seq_along(seqlets_count_p1)) {
  name <- paste0(names(matricesR1)[i], "_", seqlets_count_p1[[i]])
  names(matricesR1)[i] <- name
}
#################################################################################### contribution scores to weight matrix
motifs <- c(matricesF0,matricesF1,matricesR0,matricesR1)
#####################################################################################
results_df1 <- data.frame()  # start empty so rbind() below can define the columns
for (matrix_name in names(matricesF0)) {
  matrix <- matricesF0[[matrix_name]]
  row_sums <- rowSums(matrix)
  col_sums <- colSums(matrix)
  total_sum <- 1*(sum(row_sums) + sum(col_sums)) # Calculate the total sum
  max_val <- max(matrix)
  min_val <- min(matrix)
  # Append the result to the data frame
  results_df1 <- rbind(results_df1, data.frame(motif = matrix_name,
                                               contrib_score_sum = total_sum,
                                               contrib_score_max = max_val,
                                               contrib_score_min = min_val))
}
contrib_scores_F0 <- results_df1
#####################################################################################
results_df2 <- data.frame()  # start empty so rbind() below can define the columns
for (matrix_name in names(matricesF1)) {
  matrix <- matricesF1[[matrix_name]]
  row_sums <- rowSums(matrix)
  col_sums <- colSums(matrix)
  total_sum <- 1*((sum(row_sums) + sum(col_sums))) # Calculate the total sum
  max_val <- max(matrix)
  min_val <- min(matrix)
  # Append the result to the data frame
  results_df2 <- rbind(results_df2, data.frame(motif = matrix_name,
                                               contrib_score_sum = total_sum,
                                               contrib_score_max = max_val,
                                               contrib_score_min = min_val))
}
contrib_scores_F1 <- results_df2
#####################################################################################
results_df3 <- data.frame()  # start empty so rbind() below can define the columns
for (matrix_name in names(matricesR0)) {
  matrix <- matricesR0[[matrix_name]]
  row_sums <- rowSums(matrix)
  col_sums <- colSums(matrix)
  total_sum <- ((sum(row_sums) + sum(col_sums))) # Calculate the total sum
  max_val <- max(matrix)
  min_val <- min(matrix)
  # Append the result to the data frame
  results_df3 <- rbind(results_df3, data.frame(motif = matrix_name,
                                               contrib_score_sum = total_sum,
                                               contrib_score_max = max_val,
                                               contrib_score_min = min_val))
}
contrib_scores_R0 <- results_df3
#####################################################################################
results_df4 <- data.frame()  # start empty so rbind() below can define the columns
for (matrix_name in names(matricesR1)) {
  matrix <- matricesR1[[matrix_name]]
  row_sums <- rowSums(matrix)
  col_sums <- colSums(matrix)
  total_sum <- 1*((sum(row_sums) + sum(col_sums))) # Calculate the total sum
  max_val <- max(matrix)
  min_val <- min(matrix)
  # Append the result to the data frame
  results_df4 <- rbind(results_df4, data.frame(motif = matrix_name,
                                               contrib_score_sum = total_sum,
                                               contrib_score_max = max_val,
                                               contrib_score_min = min_val))
}
contrib_scores_R1 <- results_df4
#####################################################################################
contrib_score_table <- rbind(contrib_scores_F0, contrib_scores_R0, contrib_scores_F1, contrib_scores_R1)



write.csv(contrib_score_table, file=paste0(file_path_out,
                                           ".csv"), row.names=FALSE)
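A quick sketch for working with the table written above, e.g. ranking EPMs by their
summed contribution score:

# Top 10 EPMs by summed contribution score.
tab <- read.csv(paste0(file_path_out, ".csv"))
head(tab[order(-tab$contrib_score_sum), ], 10)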
--------------------------------------------------------------------------------
/moca_blue/mo_nom/rdf5_get_cwms_per_pattern.v1.0R.R:
--------------------------------------------------------------------------------
#####################
# This script extracts motifs in CWM format from hdf5 files and stores them in jaspar format.
# The script also names the motifs according to the proposed nomenclature and the selected input specs
######################
library(rhdf5)
library(tidyr)
###################### Setup for "moca_blue" environment
NAME0="rdf5_epm"
SPEC="Zema"
MODEL="C0" # C0 stands for DeepCistrome version 1 (available at 02-may-2023), standard conditions
#######################################################
FILE1= "modisco.hdf5"
#######################################################
dirpath_in = "../Motifs/MOTIFS_DC1_ZEA"
dirpath_out = "./out"
# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# # # # # # # # # # # # # # # # # # # # # # # # # # # #
h5file <- H5Fopen(file.path(
  dirpath_in,
  FILE1), "H5F_ACC_RDONLY")
#h5ls(h5file)
metacluster_group <- h5read(h5file,
                            "metacluster_idx_to_submetacluster_results")
#######################################################
# loop through the metaclusters 0 and 1
for (i in names(metacluster_group)) {
  metacluster <- metacluster_group[[i]]
  patterns = metacluster[['seqlets_to_patterns_result']][['patterns']]
}
# Define the pattern names to iterate over
########################################################
length(patterns[['all_pattern_names']])
x0 = length(metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]])-1
x1 = length(metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]])-1
y01 = x0+x1
y02 = x0*2+x1*2
###################################################
pattern_names <- paste0("pattern_", 0:y01) ############################################## !!! MANUAL ADJ REQUIRED
# Initialize an empty list to store the matrices
matricesF0 <- list()
matricesF1 <- list()
matricesR0 <- list()
matricesR1 <- list()
# Loop over the pattern names
for (pattern_name in pattern_names) {
  # Extract the matrix for the current pattern name
  matrixF0 <- metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["task0_contrib_scores"]][["fwd"]]
  matrixF1 <- metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["task0_contrib_scores"]][["fwd"]]
  matrixR0 <- metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["task0_contrib_scores"]][["rev"]]
  matrixR1 <- metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["task0_contrib_scores"]][["rev"]]
  #pfms in sequence
  #cwms in sequence [task0_contrib_scores] : contribution weight matrices
  # Append the matrix to the list
  matricesF0[[pattern_name]] <- matrixF0
  matricesF1[[pattern_name]] <- matrixF1
  matricesR0[[pattern_name]] <- matrixR0
  matricesR1[[pattern_name]] <- matrixR1
}
##################################################################################### PFM to PWM to CWM
seqletls_lengths_p1 <- list()
# Loop through each pattern name and get the length of seqletls
for (pattern_name in pattern_names) {
  seqletls <- metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["seqlets_and_alnmts"]][["seqlets"]]
  seqletls_lengths_p1[[pattern_name]] <- length(seqletls)
}

seqletls_lengths_p0 <- list()
# Loop through each pattern name and get the length of seqletls
for (pattern_name in pattern_names) {
  seqletls <- metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["seqlets_and_alnmts"]][["seqlets"]]
  seqletls_lengths_p0[[pattern_name]] <- length(seqletls)
}
#paste0(data$motif, "m", sprintf("%02d", as.numeric(substring(data$pattern, 9))))
###############      ####################       ###################
# Assuming seqletls_lengths_p0 is a list of pattern lengths and matricesF0 is a list of matrices with matching pattern names
#for (pattern_name in names(matricesF0)) {
#  if (pattern_name %in% names(seqletls_lengths_p0)) {
#    matricesF0[[pattern_name]] <- matricesF0[[pattern_name]] * seqletls_lengths_p0[[pattern_name]]
#  }
#}
# Assuming seqletls_lengths_p0 is a list of pattern lengths and matricesF0 is a list of matrices with matching pattern names
#for (pattern_name in names(matricesF1)) {
#  if (pattern_name %in% names(seqletls_lengths_p1)) {
#    matricesF1[[pattern_name]] <- matricesF1[[pattern_name]] * seqletls_lengths_p1[[pattern_name]]
#  }
#}
##############
#for (pattern_name in names(matricesR0)) {
#  if (pattern_name %in% names(seqletls_lengths_p0)) {
#    matricesR0[[pattern_name]] <- matricesR0[[pattern_name]] * seqletls_lengths_p0[[pattern_name]]
#  }
#}
# Assuming seqletls_lengths_p0 is a list of pattern lengths and matricesF0 is a list of matrices with matching pattern names
#for (pattern_name in names(matricesR1)) {
#  if (pattern_name %in% names(seqletls_lengths_p1)) {
#    matricesR1[[pattern_name]] <- matricesR1[[pattern_name]] * seqletls_lengths_p1[[pattern_name]]
#  }
#}
###############  ASSIGN NOMENCLATURE  ###################
###############  ASSIGN NOMENCLATURE  ###################
###############  ASSIGN NOMENCLATURE  ###################
names(matricesF0) <- paste0(names(matricesF0),
                            "_p0m",
                            sprintf("%02d",
                                    as.numeric(substring(names(matricesF0), 9))))
names(matricesF0) <- substr(names(matricesF0), nchar(names(matricesF0)) - 4, nchar(names(matricesF0)))
names(matricesF0) <- paste0(names(matricesF0),
                            "F")
names(matricesF0) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesF0))
###############      ####################       ###################
names(matricesF1) <- paste0(names(matricesF1),
                            "_p1m",
                            sprintf("%02d",
                                    as.numeric(substring(names(matricesF1), 9))))
names(matricesF1) <- substr(names(matricesF1), nchar(names(matricesF1)) - 4, nchar(names(matricesF1)))
names(matricesF1) <- paste0(names(matricesF1),
                            "F")
names(matricesF1) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesF1))
###############      ####################       ###################
names(matricesR1) <- paste0(names(matricesR1),
                            "_p1m",
                            sprintf("%02d",
                                    as.numeric(substring(names(matricesR1), 9))))
names(matricesR1) <- substr(names(matricesR1), nchar(names(matricesR1)) - 4, nchar(names(matricesR1)))
names(matricesR1) <- paste0(names(matricesR1),
                            "R")
names(matricesR1) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesR1))
###############      ####################       ###################
names(matricesR0) <- paste0(names(matricesR0),
                            "_p0m",
                            sprintf("%02d",
                                    as.numeric(substring(names(matricesR0), 9))))
names(matricesR0) <- substr(names(matricesR0), nchar(names(matricesR0)) - 4, nchar(names(matricesR0)))
names(matricesR0) <- paste0(names(matricesR0),
                            "R")
names(matricesR0) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesR0))
###############      ####################       ###################  # ADD NUMBER OF SEQLETS TO NAMES !!!!
seqlets_count_p0 <- head(seqletls_lengths_p0, x0)

for (i in seq_along(seqlets_count_p0)) {
  name <- paste0(names(matricesF0)[i], "_", seqlets_count_p0[[i]])
  names(matricesF0)[i] <- name
}

for (i in seq_along(seqlets_count_p0)) {
  name <- paste0(names(matricesR0)[i], "_", seqlets_count_p0[[i]])
  names(matricesR0)[i] <- name
}

seqlets_count_p1 <- head(seqletls_lengths_p1, x1)

for (i in seq_along(seqlets_count_p1)) {
  name <- paste0(names(matricesF1)[i], "_", seqlets_count_p1[[i]])
  names(matricesF1)[i] <- name
}

for (i in seq_along(seqlets_count_p1)) {
  name <- paste0(names(matricesR1)[i], "_", seqlets_count_p1[[i]])
  names(matricesR1)[i] <- name
}
#################################################################################### contribution scores to weight matrix
motifs <- c(matricesF0,matricesF1,matricesR0,matricesR1)

#m0 <- motifs[1]
#m1 <- motifs[[1]]
#name <- names(m0)[1]
#seq_count <- sub(".*_([0-9]+)$", "\\1", name)
#nfcwm <- abs(m1)
#nfcwm <- as.numeric(seq_count)*(nfcwm/max(nfcwm))
####################################################################################
# Rescale each CWM: absolute values are normalized to the motif's maximum and
# multiplied by the seqlet count, then rounded to integer pseudo-counts
for (i in seq_along(motifs)) {
  m0 <- motifs[i]
  m1 <- motifs[[i]]
  name <- names(m0)[1]
  seq_count <- sub(".*_([0-9]+)$", "\\1", name)
  nfcwm <- abs(m1)
  nfcwm <- round(as.numeric(seq_count)*(nfcwm/max(nfcwm)))
  motifs[[i]] <- abs(nfcwm)
}

####################################################################################
# NOTE: dim = c(4, 14, y02) assumes all motifs are 14 bp wide
pfms<- array(unlist(motifs),dim = c(4, 14, y02)) # make correction here - not pfm but cwm!!!!! extracting from sequence gives the PPM! position probability matrix!!!!
ls_pfms<- list()
for (idx in seq(1:y02)){
  ls_pfms[[idx]] <-pfms[, , idx]
}
ls_pfms_str <- lapply(ls_pfms, function(x) {
  apply(x, c(1, 2), as.character)
})
#####################################################################################
create_text <- function(m){
  res <- paste0(">motif", "\n")
  rows <- c('A', 'C', 'G', 'T')
  for(i in 1:nrow(m)){
    res <- paste0(res, paste0(rows[i],' ', paste0('[', paste(as.character(m[i, 1:ncol(m)]), collapse = "\t"), ']', "\n")))
  }
  return(res)
}
##########################   ###############################
############################
text <- lapply(ls_pfms, create_text)
for (idx in seq(1:y02)){
  text[[idx]] <- gsub("motif", paste0(names(motifs)[idx]), text[[idx]])
}
writeLines(unlist(text), paste0(NAME0,SPEC,MODEL,"_cwm-motifs.jaspar"))
#####################################################################################
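A toy illustration of the rescaling above: each CWM is scaled so that its largest
absolute value equals the motif's seqlet count, then rounded to integer pseudo-counts.
The values and the count are made up:

m <- matrix(c(-0.02, 0.10, 0.01, -0.05), nrow = 4)  # one hypothetical CWM column (A,C,G,T)
round(250 * (abs(m) / max(abs(m))))                 # the largest |value| maps to 250 seqlets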
--------------------------------------------------------------------------------
/moca_blue/mo_nom/rdf5_get_pfm_per_pattern.v1.0R.R:
--------------------------------------------------------------------------------
#####################
# This script extracts motifs in PFM format from hdf5 files and stores them in jaspar format.
# The script also names the motifs according to the proposed nomenclature and the selected input specs
######################
library(rhdf5)
library(tidyr)
###################### Setup for "moca_blue" environment
NAME0="rdf5_PFM_pattern"
SPEC="Zema"
MODEL="C0" # C0 stands for DeepCistrome version 1 (available at 02-may-2023), standard conditions
#######################################################
FILE1= "modisco.hdf5"
#######################################################
dirpath_in = "../Motifs/MOTIFS_DC1_ZEA"
dirpath_out = "./out"
# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# # # # # # # # # # # # # # # # # # # # # # # # # # # #
h5file <- H5Fopen(file.path(
  dirpath_in,
  FILE1), "H5F_ACC_RDONLY")
#h5ls(h5file)
metacluster_group <- h5read(h5file,
                            "metacluster_idx_to_submetacluster_results")
#######################################################
# loop through the metaclusters 0 and 1
for (i in names(metacluster_group)) {
  metacluster <- metacluster_group[[i]]
  patterns = metacluster[['seqlets_to_patterns_result']][['patterns']]
}
# Define the pattern names to iterate over
########################################################
length(patterns[['all_pattern_names']])
x0 = length(metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]])-1
x1 = length(metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]])-1
y01 = x0+x1
y02 = x0*2+x1*2
###################################################
pattern_names <- paste0("pattern_", 0:y01) ############################################## !!! MANUAL ADJ REQUIRED
# Initialize an empty list to store the matrices
matricesF0 <- list()
matricesF1 <- list()
matricesR0 <- list()
matricesR1 <- list()
# Loop over the pattern names
for (pattern_name in pattern_names) {
  # Extract the matrix for the current pattern name
  matrixF0 <- metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["sequence"]][["fwd"]]
  matrixF1 <- metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["sequence"]][["fwd"]]
  matrixR0 <- metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["sequence"]][["rev"]]
  matrixR1 <- metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["sequence"]][["rev"]]
  #pfms in sequence
  #cwms in sequence [task0_contrib_scores] : contribution weight matrices
  # Append the matrix to the list
  matricesF0[[pattern_name]] <- matrixF0
  matricesF1[[pattern_name]] <- matrixF1
  matricesR0[[pattern_name]] <- matrixR0
  matricesR1[[pattern_name]] <- matrixR1
}
##################################################################################### PFM to PWM to CWM
seqletls_lengths_p1 <- list()
# Loop through each pattern name and get the length of seqletls
for (pattern_name in pattern_names) {
  seqletls <- metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["seqlets_and_alnmts"]][["seqlets"]]
  seqletls_lengths_p1[[pattern_name]] <- length(seqletls)
}

seqletls_lengths_p0 <- list()
# Loop through each pattern name and get the length of seqletls
for (pattern_name in pattern_names) {
  seqletls <- metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["seqlets_and_alnmts"]][["seqlets"]]
  seqletls_lengths_p0[[pattern_name]] <- length(seqletls)
}

#paste0(data$motif, "m", sprintf("%02d", as.numeric(substring(data$pattern, 9))))
###############      ####################       ###################
# Multiply the per-position probabilities by the seqlet count to obtain counts (PFM)
# Assuming seqletls_lengths_p0 is a list of pattern lengths and matricesF0 is a list of matrices with matching pattern names
for (pattern_name in names(matricesF0)) {
  if (pattern_name %in% names(seqletls_lengths_p0)) {
    matricesF0[[pattern_name]] <- matricesF0[[pattern_name]] * seqletls_lengths_p0[[pattern_name]]
  }
}
# Assuming seqletls_lengths_p1 is a list of pattern lengths and matricesF1 is a list of matrices with matching pattern names
for (pattern_name in names(matricesF1)) {
  if (pattern_name %in% names(seqletls_lengths_p1)) {
    matricesF1[[pattern_name]] <- matricesF1[[pattern_name]] * seqletls_lengths_p1[[pattern_name]]
  }
}
##############
for (pattern_name in names(matricesR0)) {
  if (pattern_name %in% names(seqletls_lengths_p0)) {
    matricesR0[[pattern_name]] <- matricesR0[[pattern_name]] * seqletls_lengths_p0[[pattern_name]]
  }
}
# Assuming seqletls_lengths_p1 is a list of pattern lengths and matricesR1 is a list of matrices with matching pattern names
for (pattern_name in names(matricesR1)) {
  if (pattern_name %in% names(seqletls_lengths_p1)) {
    matricesR1[[pattern_name]] <- matricesR1[[pattern_name]] * seqletls_lengths_p1[[pattern_name]]
  }
}
###############  ASSIGN NOMENCLATURE  ###################
###############  ASSIGN NOMENCLATURE  ###################
###############  ASSIGN NOMENCLATURE  ###################
names(matricesF0) <- paste0(names(matricesF0),
                            "_p0m",
                            sprintf("%02d",
                                    as.numeric(substring(names(matricesF0), 9))))
names(matricesF0) <- substr(names(matricesF0), nchar(names(matricesF0)) - 4, nchar(names(matricesF0)))
names(matricesF0) <- paste0(names(matricesF0),
                            "F")
names(matricesF0) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesF0))
###############      ####################       ###################
names(matricesF1) <- paste0(names(matricesF1),
                            "_p1m",
                            sprintf("%02d",
                                    as.numeric(substring(names(matricesF1), 9))))
names(matricesF1) <- substr(names(matricesF1), nchar(names(matricesF1)) - 4, nchar(names(matricesF1)))
names(matricesF1) <- paste0(names(matricesF1),
                            "F")
names(matricesF1) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesF1))
###############      ####################       ###################
names(matricesR1) <- paste0(names(matricesR1),
                            "_p1m",
                            sprintf("%02d",
                                    as.numeric(substring(names(matricesR1), 9))))
names(matricesR1) <- substr(names(matricesR1), nchar(names(matricesR1)) - 4, nchar(names(matricesR1)))
names(matricesR1) <- paste0(names(matricesR1),
                            "R")
names(matricesR1) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesR1))
###############      ####################       ###################
names(matricesR0) <- paste0(names(matricesR0),
                            "_p0m",
                            sprintf("%02d",
                                    as.numeric(substring(names(matricesR0), 9))))
names(matricesR0) <- substr(names(matricesR0), nchar(names(matricesR0)) - 4, nchar(names(matricesR0)))
names(matricesR0) <- paste0(names(matricesR0),
                            "R")
names(matricesR0) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesR0))
###############      ####################       ###################  # ADD NUMBER OF SEQLETS TO NAMES !!!!
seqlets_count_p0 <- head(seqletls_lengths_p0, x0)

for (i in seq_along(seqlets_count_p0)) {
  name <- paste0(names(matricesF0)[i], "_", seqlets_count_p0[[i]])
  names(matricesF0)[i] <- name
}

for (i in seq_along(seqlets_count_p0)) {
  name <- paste0(names(matricesR0)[i], "_", seqlets_count_p0[[i]])
  names(matricesR0)[i] <- name
}

seqlets_count_p1 <- head(seqletls_lengths_p1, x1)

for (i in seq_along(seqlets_count_p1)) {
  name <- paste0(names(matricesF1)[i], "_", seqlets_count_p1[[i]])
  names(matricesF1)[i] <- name
}

for (i in seq_along(seqlets_count_p1)) {
  name <- paste0(names(matricesR1)[i], "_", seqlets_count_p1[[i]])
  names(matricesR1)[i] <- name
}

############################################################################################################################
motifs <- c(matricesF0,matricesF1,matricesR0,matricesR1)
# NOTE: dim = c(4, 14, y02) assumes all motifs are 14 bp wide
pfms<- array(unlist(motifs),dim = c(4, 14, y02)) # make correction here - not pfm but cwm!!!!! extracting from sequence gives the PPM! position probability matrix!!!!
ls_pfms<- list()
for (idx in seq(1:y02)){
  ls_pfms[[idx]] <-pfms[, , idx]
}
ls_pfms_str <- lapply(ls_pfms, function(x) {
  apply(x, c(1, 2), as.character)
})
#####################################################################################
create_text <- function(m){
  res <- paste0(">motif", "\n")
  rows <- c('A', 'C', 'G', 'T')
  for(i in 1:nrow(m)){
    res <- paste0(res, paste0(rows[i],' ', paste0('[', paste(as.character(m[i, 1:ncol(m)]), collapse = "\t"), ']', "\n")))
  }
  return(res)
}
##########################   ###############################
############################
text <- lapply(ls_pfms, create_text)
for (idx in seq(1:y02)){
  text[[idx]] <- gsub("motif", paste0(names(motifs)[idx]), text[[idx]])
}
writeLines(unlist(text), paste0(NAME0,SPEC,MODEL,"_pfm-motifs.jaspar"))
#####################################################################################
3 | # The script also names the motifs according to proposed nomenclature and the selected input specs 4 | ###################### 5 | library(rhdf5) 6 | library(tidyr) 7 | ###################### Setup for "moca_blue" enviroment 8 | NAME0="rdf5_epm" 9 | SPEC="Zema" 10 | MODEL="C0" # C0 stand for DeepCistrome version 1 (available at 02-may-2023) Standard conditions 11 | ####################################################### 12 | FILE1= "modisco.hdf5" 13 | ####################################################### 14 | dirpath_in = "../Motifs/MOTIFS_DC1_ZEA" 15 | dirpath_out = "./out" 16 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # 17 | 18 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # 19 | h5file <- H5Fopen(file.path( 20 | dirpath_in, 21 | FILE1), "H5F_ACC_RDONLY") 22 | #h5ls(h5file) 23 | metacluster_group <- h5read(h5file, 24 | "metacluster_idx_to_submetacluster_results") 25 | ####################################################### 26 | # loop through the metaclusters 0 and 1 27 | for (i in names(metacluster_group)) { 28 | metacluster <- metacluster_group[[i]] 29 | patterns = metacluster[['seqlets_to_patterns_result']][['patterns']] 30 | } 31 | # Define the pattern names to iterate over 32 | ######################################################## 33 | length(patterns[['all_pattern_names']]) 34 | x0 = length(metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]])-1 35 | x1 = length(metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]])-1 36 | y01 = x0+x1 37 | y02 = x0*2+x1*2 38 | ################################################### 39 | pattern_names <- paste0("pattern_", 0:y01) ############################################## !!! MANUAL ADJ REQUIRED 40 | # Initialize an empty list to store the matrices 41 | matricesF0 <- list() 42 | matricesF1 <- list() 43 | matricesR0 <- list() 44 | matricesR1 <- list() 45 | # Loop over the pattern names 46 | for (pattern_name in pattern_names) { 47 | # Extract the matrix for the current pattern name 48 | matrixF0 <- metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["sequence"]][["fwd"]] 49 | matrixF1 <- metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["sequence"]][["fwd"]] 50 | matrixR0 <- metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["sequence"]][["rev"]] 51 | matrixR1 <- metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["sequence"]][["rev"]] 52 | #pfms in sequence 53 | #cwms in sequence [task0_contrib_scores] : contibution weight matrices 54 | # Append the matrix to the list 55 | matricesF0[[pattern_name]] <- matrixF0 56 | matricesF1[[pattern_name]] <- matrixF1 57 | matricesR0[[pattern_name]] <- matrixR0 58 | matricesR1[[pattern_name]] <- matrixR1 59 | } 60 | ##################################################################################### PFM to PWM to CWM 61 | seqletls_lengths_p1 <- list() 62 | # Loop through each pattern name and get the length of seqletls 63 | for (pattern_name in pattern_names) { 64 | seqletls <- metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["seqlets_and_alnmts"]][["seqlets"]] 65 | seqletls_lengths_p1[[pattern_name]] <- length(seqletls) 66 | } 67 | 68 | seqletls_lengths_p0 <- list() 69 | # Loop through each pattern name and get the length of seqletls 70 | for (pattern_name in pattern_names) { 71 | 
71 |   seqletls <- metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["seqlets_and_alnmts"]][["seqlets"]]
72 |   seqletls_lengths_p0[[pattern_name]] <- length(seqletls)
73 | }
74 | #paste0(data$motif, "m", sprintf("%02d", as.numeric(substring(data$pattern, 9))))
75 | ############### #################### ###################
76 | # Optional seqlet-count weighting: seqletls_lengths_p0 holds the pattern counts and matricesF0 the matrices with matching pattern names
77 | #for (pattern_name in names(matricesF0)) {
78 | #  if (pattern_name %in% names(seqletls_lengths_p0)) {
79 | #    matricesF0[[pattern_name]] <- matricesF0[[pattern_name]] * seqletls_lengths_p0[[pattern_name]]
80 | #  }
81 | #}
82 | # Same optional weighting for matricesF1 with seqletls_lengths_p1
83 | #for (pattern_name in names(matricesF1)) {
84 | #  if (pattern_name %in% names(seqletls_lengths_p1)) {
85 | #    matricesF1[[pattern_name]] <- matricesF1[[pattern_name]] * seqletls_lengths_p1[[pattern_name]]
86 | #  }
87 | #}
88 | ##############
89 | #for (pattern_name in names(matricesR0)) {
90 | #  if (pattern_name %in% names(seqletls_lengths_p0)) {
91 | #    matricesR0[[pattern_name]] <- matricesR0[[pattern_name]] * seqletls_lengths_p0[[pattern_name]]
92 | #  }
93 | #}
94 | # Same optional weighting for matricesR1 with seqletls_lengths_p1
95 | #for (pattern_name in names(matricesR1)) {
96 | #  if (pattern_name %in% names(seqletls_lengths_p1)) {
97 | #    matricesR1[[pattern_name]] <- matricesR1[[pattern_name]] * seqletls_lengths_p1[[pattern_name]]
98 | #  }
99 | #}
100 | ############### ASSIGN NOMENCLATURE ###################
101 | ############### ASSIGN NOMENCLATURE ###################
102 | ############### ASSIGN NOMENCLATURE ###################
103 | names(matricesF0) <- paste0(names(matricesF0),
104 |                             "_p0m",
105 |                             sprintf("%02d",
106 |                                     as.numeric(substring(names(matricesF0), 9))))
107 | names(matricesF0) <- substr(names(matricesF0), nchar(names(matricesF0)) - 4, nchar(names(matricesF0)))
108 | names(matricesF0) <- paste0(names(matricesF0),
109 |                             "F")
110 | names(matricesF0) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesF0))
111 | ############### #################### ###################
112 | names(matricesF1) <- paste0(names(matricesF1),
113 |                             "_p1m",
114 |                             sprintf("%02d",
115 |                                     as.numeric(substring(names(matricesF1), 9))))
116 | names(matricesF1) <- substr(names(matricesF1), nchar(names(matricesF1)) - 4, nchar(names(matricesF1)))
117 | names(matricesF1) <- paste0(names(matricesF1),
118 |                             "F")
119 | names(matricesF1) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesF1))
120 | ############### #################### ###################
121 | names(matricesR1) <- paste0(names(matricesR1),
122 |                             "_p1m",
123 |                             sprintf("%02d",
124 |                                     as.numeric(substring(names(matricesR1), 9))))
125 | names(matricesR1) <- substr(names(matricesR1), nchar(names(matricesR1)) - 4, nchar(names(matricesR1)))
126 | names(matricesR1) <- paste0(names(matricesR1),
127 |                             "R")
128 | names(matricesR1) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesR1))
129 | ############### #################### ###################
130 | names(matricesR0) <- paste0(names(matricesR0),
131 |                             "_p0m",
132 |                             sprintf("%02d",
133 |                                     as.numeric(substring(names(matricesR0), 9))))
134 | names(matricesR0) <- substr(names(matricesR0), nchar(names(matricesR0)) - 4, nchar(names(matricesR0)))
135 | names(matricesR0) <- paste0(names(matricesR0),
| "R") 137 | names(matricesR0) <- paste0("epm_", SPEC, "_", MODEL, "_", names(matricesR0)) 138 | ############### #################### ################### # ADD NUMBER OF SEQLETS TO NAMES !!!! 139 | ############### #################### ################### # ADD NUMBER OF SEQLETS TO NAMES !!!! 140 | seqlets_count_p0 <- head(seqletls_lengths_p0, x0) 141 | 142 | for (i in seq_along(seqlets_count_p0)) { 143 | name <- paste0(names(matricesF0)[i], "_", seqlets_count_p0[[i]]) 144 | names(matricesF0)[i] <- name 145 | } 146 | 147 | for (i in seq_along(seqlets_count_p0)) { 148 | name <- paste0(names(matricesR0)[i], "_", seqlets_count_p0[[i]]) 149 | names(matricesR0)[i] <- name 150 | } 151 | 152 | seqlets_count_p1 <- head(seqletls_lengths_p1, x1) 153 | 154 | for (i in seq_along(seqlets_count_p1)) { 155 | name <- paste0(names(matricesF1)[i], "_", seqlets_count_p1[[i]]) 156 | names(matricesF1)[i] <- name 157 | } 158 | 159 | for (i in seq_along(seqlets_count_p1)) { 160 | name <- paste0(names(matricesR1)[i], "_", seqlets_count_p1[[i]]) 161 | names(matricesR1)[i] <- name 162 | } 163 | #################################################################################### 164 | motifs <- c(matricesF0,matricesF1,matricesR0,matricesR1) 165 | pfms<- array(unlist(motifs),dim = c(4, 14, y02)) # make correction here - not pfm but cwm!!!!! extracting from sequence gives the PPM ! position probability matrix!!!! 166 | ls_pfms<- list() 167 | for (idx in seq(1:y02)){ 168 | ls_pfms[[idx]] <-pfms[, , idx] 169 | } 170 | ls_pfms_str <- lapply(ls_pfms, function(x) { 171 | apply(x, c(1, 2), as.character) 172 | }) 173 | ##################################################################################### 174 | create_text <- function(m){ 175 | res <- paste0(">motif", "\n") 176 | rows <- c('A', 'C', 'G', 'T') 177 | for(i in 1:nrow(m)){ 178 | res <- paste0(res, paste0(rows[i],' ', paste0('[', paste(as.character(m[i, 1:ncol(m)]), collapse = "\t"), ']', "\n"))) 179 | } 180 | return(res) 181 | } 182 | ########################## ############################### 183 | ############################ 184 | text <- lapply(ls_pfms, create_text) 185 | for (idx in seq(1:y02)){ 186 | text[[idx]] <- gsub("motif", paste0(names(motifs)[idx]), text[[idx]]) 187 | } 188 | writeLines(unlist(text), paste0(NAME0,SPEC,MODEL,"_pwm-motifs.jaspar")) 189 | ##################################################################################### 190 | -------------------------------------------------------------------------------- /moca_blue/mo_proj/mo_epm_prediction.v1.0.R: -------------------------------------------------------------------------------- 1 | # EPM Prediction TEST # 2 | # Use the presence of EPMs as predictor for low and high gene expression 3 | # EPMs are generalized representations of seqlets with positive or negative 4 | # contribution scores in deepLift, associated to high and low gene expression prediction 5 | # In general, genes with a positive sum contribution score are predicted to be highly expressed. 6 | # Vice verse, genes with a negative sum contribution score are predicted to be lowly expressed. 7 | # Consequently, EPMs (associated to postive or negative contrib. scores/ prediction) 8 | # they can be used equally for interpretation. 9 | # Here occurences of EPMs per gene are counted. EPMs of p0 are substracted by p1. If the resulting value is positive, 10 | # genes are predicted to be highly expressed and vice verse. 11 | 12 | # Dr. Simon M. 
13 | library(dplyr) # needed below for %>%, group_by() and summarize()
14 | ######## ######### ######### ######### ######## ########
15 | file1 <- "ArthS0Arth-0e3-cwm-W2_gene_none20230825-q1q9.csv"
16 | file2 <- "Arth_S0_predictions.csv" # file with predicted probabilities for expression 0-1
17 | file4 <- "Arabidopsis_thaliana_TPMs-peleke-etal2023.csv" # file with measurements for the models (TPM, quartile classes 0,1,2)
18 | file3 <- "20230825_ArthS0_contrib_scores.csv" # file with average contribution scores for each EPM
19 | dirpath_1 <- "../../ref_seq"
20 | dirpath_2 <- "./out"
21 | dirpath_3 <- "../../mo_nom/out"
22 | file_path_in_file4 <- file.path(dirpath_1, file4)
23 | file_path_in_file3 <- file.path(dirpath_3, file3)
24 | ######## ######### ######### ######### ######## ########
25 | #
26 | #
27 | #
28 | #
29 | ######## ######### ######### ######### ######## ########
30 | mm0 <- read.table(
31 |   file.path(
32 |     dirpath_2,
33 |     file1),
34 |   header=TRUE,
35 |   sep=",")
36 | #
37 | #
38 | #
39 | mm1 <- mm0[, c("loc_ID",
40 |                "motif")]
41 | colnames(mm1)[1]<- "loc_ID"
42 | mm1$p0s <- ifelse(sapply(mm1$motif, grepl, pattern = "p0m"), 1, 0)
43 | mm1$p1s <- ifelse(sapply(mm1$motif, grepl, pattern = "p1m"), 1, 0)
44 | mm2 <- mm1 %>%
45 |   group_by(loc_ID) %>%
46 |   summarize(p0_count = sum(p0s), p1_count = sum(p1s))
47 | mm2$epm_pred_val <- mm2$p0_count- mm2$p1_count
48 | mm2$epm_pred <- ifelse(mm2$epm_pred_val >= 0, 1, 0)
49 | mm3 <- mm2[, c("loc_ID",
50 |                "epm_pred")]
51 | value_counts0 <- table(mm3$epm_pred) # an epm_pred value of 0 stands for low expression, 1 for high
52 | ########### ############## ############ #####################
53 | #
54 | #
55 | #
56 | #
57 | ########### ############## ############ #####################
58 | if (!file.exists(file2)) {
59 |   mm0 <- read.table(file.path(dirpath_2, file1), header=TRUE, sep=",")
60 |   unique_loc_ID <- unique(mm0$loc_ID) # Remove duplicates
61 |   model0 <- data.frame(loc_ID = unique_loc_ID, prob = sample(c(0, 1), size = length(unique_loc_ID), replace = TRUE)) # fallback: random 0/1 predictions
62 |   print("file2 missing")
63 |   file2_state <- c("FALSE")
64 | } else {
65 |   model0 <- read.table(file2, header=TRUE, sep=",")
66 |   colnames(model0) <- c("loc_ID", "prob")
67 |   print("file2 exists")
68 |   file2_state <- c("TRUE")
69 | }
70 | ########### ############# ############# ######################
71 | A0<-merge(model0, mm3, by= "loc_ID")
72 | A0$TF_pred <- ifelse(A0$prob == A0$epm_pred, 1, 0)
73 | ##############################################################
74 | #
75 | #
76 | #
77 | model1 <- read.table(
78 |   file_path_in_file4,
79 |   header=TRUE,
80 |   sep=",")
81 | #
82 | ##############################################################
83 | ##############################################################
84 | ss_model1 <- model1[, c("gene_id", "true_target")]
85 | colnames(ss_model1)[1]<- "loc_ID"
86 | ss_model1 <- subset(ss_model1, true_target != 2) # !!! CHANGES DATASET SIZE BY HALF !!! Genes with class 2 were not included in the model training set.
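# A minimal, self-contained sketch of the voting logic above, with made-up
# gene and motif names (kept in an if (FALSE) block so it never runs):
if (FALSE) {
  toy <- data.frame(loc_ID = c("g1", "g1", "g1", "g2", "g2"),
                    motif  = c("epm_p0m01", "epm_p0m02", "epm_p1m01",
                               "epm_p1m05", "epm_p1m07"))
  toy$p0s <- ifelse(grepl("p0m", toy$motif), 1, 0)
  toy$p1s <- ifelse(grepl("p1m", toy$motif), 1, 0)
  aggregate(cbind(p0s, p1s) ~ loc_ID, data = toy, FUN = sum)
  # g1: p0 = 2, p1 = 1 -> epm_pred_val = +1 -> epm_pred = 1 (high)
  # g2: p0 = 0, p1 = 2 -> epm_pred_val = -2 -> epm_pred = 0 (low)
}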
87 | A1<-merge(A0, ss_model1, by= "loc_ID")
88 | A1$TF_expr <- ifelse(A1$true_target == A1$epm_pred, 1, 0)
89 | ####
90 | head(A1)
91 | A1$TP_TN <- ifelse(A1$prob==A1$true_target, 1, 0)
92 | A1s <- A1[A1$prob == 1, ]
93 | ####
94 | ##############################################################
95 | #
96 | #
97 | #
98 | model3 <- read.table(
99 |   file_path_in_file3,
100 |   header=TRUE,
101 |   sep=",")
102 | #
103 | ##############################################################
104 | mcon_m0<- merge(mm1,model3, by ="motif")
105 | mcon_m0$p0c<- ifelse(mcon_m0$p0s==1, mcon_m0$contrib_score_aver, 0)
106 | mcon_m0$p1c<- ifelse(mcon_m0$p1s==1, mcon_m0$contrib_score_aver, 0)
107 | mcon_m1<- mcon_m0[, c("loc_ID","p0c","p1c")]
108 | mcon_m2 <- mcon_m1 %>%
109 |   group_by(loc_ID) %>%
110 |   summarize(p0c_count = sum(p0c), p1c_count = sum(p1c))
111 | mcon_m2$epm_contrib_pred_val <- mcon_m2$p0c_count + mcon_m2$p1c_count
112 | mcon_m2$epm_contrib_pred_class <- ifelse(mcon_m2$epm_contrib_pred_val >= 0, 1, 0)
113 | A2 <- merge(A1, mcon_m2, by="loc_ID") # CHANGE A1 to A1s to check expr class
114 | A2$TF_ecpc_expr <- ifelse(A2$epm_contrib_pred_class == A2$true_target, 1, 0)
115 | A2$TF_ecpc_pred <- ifelse(A2$epm_contrib_pred_class == A2$prob, 1, 0)
116 | ##############################################################
117 | value_counts1 <- table(A2$TF_ecpc_expr)
118 | value_counts2 <- table(A2$TF_ecpc_pred)
119 | acc <- value_counts1[2]/(value_counts1[2]+value_counts1[1])
120 | acc1 <- value_counts2[2]/(value_counts2[2]+value_counts2[1])
121 | epm_pred_pred <- table(A2$TF_ecpc_expr) # n.b. the names look swapped: this tabulates agreement with measured expression
122 | epm_pred_expr <- table(A2$TF_ecpc_pred) # ...and this tabulates agreement with the model prediction
123 | print(epm_pred_pred)
124 | print(acc)
125 | print(epm_pred_expr)
126 | print(acc1)
127 | 
--------------------------------------------------------------------------------
/moca_blue/mo_proj/mo_feature_tester.v1.0.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 | ##############################################################
3 | PROJECT <- "_on_SolaITAG4_ch01_0e3-cwm"
4 | SPEC <- "Soly"
5 | MODEL <- "MSR"
6 | DATE <- "20230703"
7 | ##############################################################
8 | #file_path <- "GO_enrichment_result_table.csv"
9 | #file_path1 <-"feat_enrichment_result_table.csv" # results
10 | ##############################################################
11 | dirpath_1 <- "../ref_seq"
12 | dirpath_2 <- "./out"
13 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
14 | file1 <- "SolyMSR_on_Spe-ch01-0e3_gene_none20230530feat_mima_q1q9.csv"
15 | file3 <- "mapman_sopen.txt"
16 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
17 | file_path_out <- file.path(dirpath_2, paste0(DATE,"_",PROJECT,"_mo-feat"))
18 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
19 | FILTER<- "q1q9"
20 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
21 | mm0 <- read.table(
22 |   file.path(
23 |     dirpath_2,
24 |     file1),
25 |   header=TRUE,
26 |   sep=",")
27 | 
28 | mapman <- read.table(file3,
29 |                      header=TRUE,
30 |                      sep="\t", quote = "")
31 | 
32 | ########### ############## ############ #####################
33 | colnames(mapman)[colnames(mapman)=="IDENTIFIER"]<-"loc_ID"
34 | 
35 | mm0$loc_ID <- tolower(mm0$loc_ID)
36 | mm0$loc_ID <- gsub("[-']+", "", mm0$loc_ID)
37 | mm0$loc_ID <- gsub("\\..*", "", mm0$loc_ID) # CAREFUL WITH THE DOTS
38 | 
39 | mapman$loc_ID <- tolower(mapman$loc_ID)
40 | mapman$loc_ID <- gsub("[-']+", "", mapman$loc_ID)
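# The dot-stripping on the next line is what the "CAREFUL WITH THE DOTS"
# warnings refer to: everything after the first dot is removed, so isoform
# suffixes are dropped, but IDs that legitimately contain dots lose
# information too. A quick illustration with made-up identifiers:
if (FALSE) {
  ids <- c("solyc01g005000.3.1", "sopen01g001010.1")
  gsub("\\..*", "", ids)  # -> "solyc01g005000" "sopen01g001010"
}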
41 | mapman$loc_ID <- gsub("\\..*", "", mapman$loc_ID) # CAREFUL WITH THE DOTS
42 | 
43 | mapman0 <- mapman[, c("loc_ID",
44 |                       "BINCODE",
45 |                       "NAME")]
46 | 
47 | mm0_mapman <- merge(mm0, mapman0, by= "loc_ID")
48 | # Internal check for the number of rows
49 | if (nrow(mm0_mapman) < 2) {
50 |   error_message <- "Please check identifiers chr#:start-end for fasta and gff input"
51 |   stop(error_message)
52 | }
53 | ########### ############## ############ ##########
54 | mm0_mapman$BINCODE <- gsub("'", "", mm0_mapman$BINCODE)
55 | mm0_mapman <- mm0_mapman[mm0_mapman$BINCODE != 35.2, ]
56 | mm0_mapman <- mm0_mapman[mm0_mapman$BINCODE != 35.1, ]
57 | mm0_mapman <- mm0_mapman[mm0_mapman$BINCODE != 35, ]
58 | #mm0_mapman$BINCODE <- gsub("\\.([^']*)$", "", mm0_mapman$BINCODE)
59 | # (would remove everything after the first dot)
60 | ########### ############## ############ ##########
61 | col_idx <- mm0_mapman[, c("BINCODE", "NAME")]
62 | col_idx <- unique(col_idx)
63 | ##################################################
64 | cont_table_A <- table(mm0_mapman$motif, mm0_mapman$BINCODE)
65 | testA <- as.data.frame(cont_table_A)
66 | tfestA <- testA %>%
67 |   pivot_wider(names_from = Var2, values_from = Freq)
68 | ##################################################
69 | row_idx <- as.data.frame(tfestA[, c("Var1")])
70 | row_idx <- unique(row_idx)
71 | idx1 <- nrow(row_idx)
72 | row_idx$number <- rownames(row_idx)
73 | colnames(row_idx)[1] <- "motif"
74 | # Exclude the first column from tfestA
75 | tfestA <- tfestA[, -1]
76 | # Convert factor columns to numeric
77 | tfestA <- as.data.frame(lapply(tfestA, as.numeric))
78 | # Assuming your dataframe is called 'tfestA'
79 | sample_size <- sum(tfestA)
80 | results <- vector("list", nrow(tfestA))
81 | 
82 | #
83 | #for (i in 1:nrow(tfestA)) {
84 | #  row_result <- vector("list", ncol(tfestA))
85 | #  eval1 <- sum(tfestA[i, ]) * sum(tfestA[i,]) / sample_size
86 | #  calc1 <- (tfestA[i,]-eval1)^2/eval1
87 | #  row_result[[i]] <- calc1
88 | #  results[[i]] <- calc1
89 | #}
90 | #
91 | 
92 | results <- vector("list", nrow(tfestA))
93 | 
94 | for (i in 1:nrow(tfestA)) {
95 |   row_result <- vector("list", ncol(tfestA))
96 |   eval1 <- sum(tfestA[i, ]) * colSums(tfestA) / sample_size # expected counts: rowSum_i * colSum_j / N (was sum(tfestA[, i]), which indexed columns by the row counter)
97 |   calc1 <- (tfestA[i,]-eval1)^2/eval1
98 |   row_result[[i]] <- calc1
99 |   results[[i]] <- calc1
100 | }
101 | 
102 | print(sample_size)
103 | print(min(tfestA))
104 | print(max(tfestA))
105 | print(eval1)
106 | 
107 | result_matrix <- as.data.frame(do.call(rbind, results))
108 | result_matrix$number <- c(1:idx1)
109 | result_matrix0 <- merge(row_idx,
110 |                         result_matrix,
111 |                         by = "number")
112 | 
113 | 
114 | result_df <- pivot_longer(result_matrix0,
115 |                           cols = -c(number, motif),
116 |                           names_to = "column names",
117 |                           values_to = "values")
118 | 
119 | result_df <- result_df[, -1]
120 | colnames(result_df)[2] <- "BINCODE"
121 | colnames(result_df)[3] <- "x_square"
122 | result_df <- result_df %>%
123 |   mutate(BINCODE = str_replace(BINCODE, "X", ""))
124 | result_df <- result_df %>%
125 |   mutate(BINCODE = gsub("^\\.+|\\.+\\$", "", BINCODE))
126 | result_df <- result_df %>%
127 |   mutate(BINCODE = gsub("^\\.+|\\.+?$", "", BINCODE))
128 | #########################################################################
129 | sorted_results <- result_df[order(result_df$x_square), ]
130 | #sorted_results <- result_df[result_df$x_square != 35.2, ]
131 | top_100_rows <- sorted_results %>%
132 |   arrange(desc(x_square)) %>%
133 |   head(100)
134 | top_100_rows0 <- merge(top_100_rows, col_idx, by= "BINCODE")
135 | 
136 | 
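# Sanity check for the enrichment statistic computed above, on a 2x2 toy
# table: the expected count for cell (i, j) is rowSum_i * colSum_j / N, and
# the cell's contribution to the chi-square statistic is (obs - exp)^2 / exp.
if (FALSE) {
  toy <- matrix(c(10, 2, 3, 15), nrow = 2)      # toy motif-by-bin counts
  N <- sum(toy)
  expected <- outer(rowSums(toy), colSums(toy)) / N
  sum((toy - expected)^2 / expected)            # chi-square statistic
  chisq.test(toy, correct = FALSE)$statistic    # cross-check (no continuity correction)
}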
137 | #write.csv(top_100_rows0, file = file_path, row.names = FALSE)
138 | 
139 | write.csv(top_100_rows0, file=paste0(file_path_out,
140 |                                      "-goterm.csv"), row.names=FALSE)
141 | #########################################################################
142 | ################################################################## ######
143 | # Replace motifs and GO terms in top 100
144 | # Print - export
145 | # Alternative strategy: (observed - expected) / expected, test for independence
146 | ################################################################## ######
147 | 
148 | # # # # # # # # # # # # # # # # #
149 | 
150 | unique_values <- unique(mm0_mapman$type.y)
151 | print(unique_values)
152 | 
153 | cont_table_B <- table(mm0_mapman$motif, mm0_mapman$type.y)
154 | 
155 | testB =as.data.frame(cont_table_B)
156 | tfestB = testB %>%
157 |   pivot_wider(names_from = Var2, values_from = Freq)
158 | 
159 | 
160 | 
161 | ##########################################################################################
162 | 
163 | 
164 | ##########################################################################################
165 | ##########################################################################################
166 | 
167 | # Check if "mRNA" and "untranscr" columns are present in the data frame
168 | if ("mRNA" %in% colnames(tfestB) && "untranscr" %in% colnames(tfestB)) {
169 |   tfestB$transcr_class <- apply(tfestB[, c("mRNA", "untranscr")], 1, function(row) {
170 |     result <- chisq.test(row, p = c(0.5, 0.5))
171 |     result$p.value
172 |   })
173 |   tfestB$transcr_pref <- ifelse(tfestB$mRNA < tfestB$untranscr, "untranscribed", "transcribed")
174 | } else {
175 |   print("Error: 'mRNA' and/or 'untranscr' columns are not present in the data frame.")
176 | }
177 | ##########################################################################################
178 | # Check if "exon" and "intron" columns are present in the data frame
179 | if ("exon" %in% colnames(tfestB) && "intron" %in% colnames(tfestB)) {
180 |   tfestB$feature_class <- apply(tfestB[, c("exon", "intron")], 1, function(row) {
181 |     result <- chisq.test(row, p = c(0.5, 0.5))
182 |     result$p.value
183 |   })
184 |   tfestB$feature_pref <- ifelse(tfestB$exon < tfestB$intron, "intronic", "exonic")
185 | } else {
186 |   print("Error: 'exon' and/or 'intron' columns are not present in the data frame.")
187 | }
188 | ########################################################################################## Comparison might be unfair; UTR/CDS would be better
189 | # Check if "CDS" and "intron" columns are present in the data frame
190 | if ("CDS" %in% colnames(tfestB) && "intron" %in% colnames(tfestB)) {
191 |   tfestB$transl_class <- apply(tfestB[, c("CDS", "intron")], 1, function(row) {
192 |     result <- chisq.test(row, p = c(0.5, 0.5))
193 |     result$p.value
194 |   })
195 |   tfestB$transl_pref <- ifelse(tfestB$CDS < tfestB$intron, "intronic", "codogenic")
196 | } else {
197 |   # Handle the case when "CDS" and/or "intron" columns are not present
198 |   # Print an error message or perform alternative actions
199 |   print("Error: 'CDS' and/or 'intron' columns are not present in the data frame.")
200 | }
201 | ##########################################################################################
202 | # Check if "CDS" and "UTR" columns are present (note: this overwrites transl_class/transl_pref from the CDS/intron block above)
203 | if ("CDS" %in% colnames(tfestB) && "UTR" %in% colnames(tfestB)) {
204 |   tfestB$transl_class <- apply(tfestB[, c("CDS", "UTR")], 1, function(row) {
205 |     result <- chisq.test(row, p = c(0.5, 0.5))
206 |     result$p.value
207 |   })
208 |   tfestB$transl_pref <- ifelse(tfestB$CDS < tfestB$UTR, "UTR", "codogenic") # fixed: compare against UTR, not intron
209 | } else {
210 |   # Handle the case when "CDS" and/or "UTR" columns are not present
211 |   # Print an error message or perform alternative actions
212 |   print("Error: 'CDS' and/or 'UTR' columns are not present in the data frame.")
213 | }
214 | 
215 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
216 | 
217 | ##########################################################################################
218 | ##########################################################################################
219 | 
220 | write.csv(tfestB, file=paste0(file_path_out,
221 |                               "-features.csv"), row.names=FALSE)
222 | ##########################################################################################
223 | 
224 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
225 | # Remove columns with sum less than 5
226 | ########### ############## ############ ##########
--------------------------------------------------------------------------------
/moca_blue/mo_proj/mo_gene_mapper_v0.1-2.R:
--------------------------------------------------------------------------------
1 | ##############################################################
2 | #setwd("/home/ibg-4/Desktop/Rhome/solanum_motifs")
3 | ##############################################################
4 | PROJECT <- "SolyMSR_on_Spenn_ch01-2-3_0e3_0e3_q1q9.csv"
5 | SPEC <- "SolyITAG"
6 | MODEL <- "MSR"
7 | DATA_ORIGIN <- "motif_matches"
8 | DATE <- "20230615"
9 | ##############################################################
10 | ##############################################################
11 | dirpath_1 <- "../../ref_seq"
12 | dirpath_2 <- "./out"
13 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
14 | file1 <- "SolyMSR_on_Spenn_ch01-02-03_0e3_gene_none20230615-q1q9.csv"
15 | file2 <- "msr_predictions_on_pennellii.csv"
16 | file3 <- "mapman_sopen.txt" # GO-term enrichment
17 | file4 <- "spennellii.csv" # file with model probabilities
18 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
19 | file_path_out <- file.path(dirpath_2, paste0(DATE,"_",PROJECT,"_mo-predict-map"))
20 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
21 | library(ape)
22 | library(stats)
23 | library(dplyr)
24 | library(stringr)
25 | library(ggplot2)
26 | library(reshape2)
27 | library(tidyr) # needed for pivot_wider() below
28 | 
29 | #library(ape)
30 | #library(hrbrthemes)
31 | #library(randomForestSRC)
32 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
33 | mm0 <- read.table(
34 |   file.path(
35 |     dirpath_2,
36 |     file1),
37 |   header=TRUE,
38 |   sep=",")
39 | model0 <- read.table(file2,
40 |                      header=TRUE,
41 |                      sep=",")
42 | mapman <- read.table(file3,
43 |                      header=TRUE,
44 |                      sep="\t", quote = "")
45 | model1 <- read.table(
46 |   file4,
47 |   header=TRUE,
48 |   sep=",")
49 | colnames(model0) <- c("loc_ID","prob")
50 | ########### ############## ############ #####################
51 | ########### ############## ############ #####################
52 | model0 <- model0[, c(1, 2)]
53 | # Step 1 (optional): subset model0 to genes above a probability cutoff
54 | #high_prob_genes <- model0[model0$prob > 0.01, ]
55 | # Step 2: Merge mm0 and model0 based on loc_ID (full outer join, then drop NA rows = inner join)
56 | merged_df <- merge(mm0, model0,
57 |                    by = "loc_ID",
58 |                    all.x = TRUE,
59 |                    all.y = TRUE)
60 | merged_df <- na.omit(merged_df)
61 | merg_df00 <- merged_df[, c(1, 6, 7, 8,9, 13)]
62 | subset_df <- merg_df00[c("motif", "prob")]
"prob")] 63 | ################################################################################ 64 | # extract the required string from IDENTIFIER 65 | colnames(mapman)[colnames(mapman)=="IDENTIFIER"]<-"loc_ID" 66 | merg_df00$loc_ID <- tolower(merg_df00$loc_ID) 67 | mapman$loc_ID <- tolower(mapman$loc_ID) 68 | mapman$loc_ID <- gsub("[-']+", "", mapman$loc_ID) 69 | mapman$loc_ID <- gsub("\\..*", "", mapman$loc_ID) #CAREFULL WITH THE DOTS 70 | merg_df00$loc_ID <- gsub("[-']+", "", merg_df00$loc_ID) 71 | merg_df00$loc_ID <- gsub("\\..*", "", merg_df00$loc_ID) #CAREFULL WITH THE DOTS 72 | merg_df00map <- merge( 73 | merg_df00, mapman, 74 | by = c("loc_ID"), 75 | all = FALSE, ignore.case = TRUE) 76 | # remove rows with NA cells 77 | ################################################################################ 78 | ss_model1 <- model1[, c("gene_id", "logMaxTPM", "true_target")] 79 | ss_model1 <- subset(ss_model1, logMaxTPM != 0) 80 | #ss_model1 <- subset(ss_model1, true_target %in% c(1, 2)) # GENES ARE CLASSIFIED AS 0 (NO EXPR), 1 (LOW <0.5) AND HIGH (...). 81 | # To EVALUATE the predictiveness of EPMs, genes falgged with 0 are handled as LOW expression genes. 82 | ss_model1$gene_id <- tolower(ss_model1$gene_id) 83 | ss_model1$gene_id <- gsub("[-']+", "", ss_model1$gene_id) 84 | ss_model1$gene_id <- gsub("\\..*", "", ss_model1$gene_id) 85 | colnames(ss_model1) <- c("loc_ID","logMaxTPM","class") 86 | ss_model1$class <- ifelse( 87 | ss_model1$class == 2, 88 | "low", "high") 89 | merg_df01mm<- merge( 90 | merg_df00map, ss_model1, 91 | by = c("loc_ID"), 92 | all = FALSE, ignore.case = TRUE) 93 | 94 | ################################################################################ 95 | a0_fd <- merg_df01mm[, c(5,1,2,3,4,10,11,6,12)] 96 | 97 | # Determine the percentile cutoff values 98 | lower_percentile <- quantile(a0_fd$prob, 0.25) 99 | upper_percentile <- quantile(a0_fd$prob, 0.75) 100 | 101 | # Subset rows based on percentiles 102 | #a0_fd_ssprob <- a0_fd[a0_fd$prob<= lower_percentile | a0_fd$prob >= upper_percentile, ] 103 | # ONLY TAKING UPPER ! 104 | a0_fd_ssprob <- a0_fd[a0_fd$prob >= upper_percentile, ] 105 | a0_fd_ssprob0 <- a0_fd[a0_fd$prob <= lower_percentile, ] 106 | # Determine the percentile cutoff values 107 | lower_percentile1 <- quantile(a0_fd$logMaxTPM, 0.25) 108 | upper_percentile1 <- quantile(a0_fd$logMaxTPM, 0.75) 109 | # Subset rows based on percentiles 110 | 111 | #a0_fd_ssTPM <- a0_fd[a0_fd$logMaxTPM<= lower_percentile1 | a0_fd$logMaxTPM >= upper_percentile1, ] 112 | # ONLY TAKING UPPER ! 113 | # ONLY TAKING UPPER ! 
114 | a0_fd_ssprobTPM2 <- a0_fd_ssprob[a0_fd_ssprob$logMaxTPM >= upper_percentile1, ]  # fixed: index with the subset's own column (a0_fd$... had the wrong length)
115 | a0_fd_ssprobTPM3 <- a0_fd_ssprob0[a0_fd_ssprob0$logMaxTPM >= lower_percentile1, ] # fixed: same for the lower-probability subset
116 | 
117 | 
118 | #a0_fd_ssprobTPM2 <- a0_fd_ssprob[a0_fd_ssprob$logMaxTPM<= lower_percentile1 |
119 | #                                 a0_fd_ssprob$logMaxTPM >= upper_percentile1, ]
120 | 
121 | 
122 | ###############################################################################
123 | head(a0_fd_ssprobTPM2)
124 | # Extracting values from the "motif" column and shortening them
125 | a0_fd_ssprobTPM2 <- a0_fd_ssprobTPM2 %>%
126 |   mutate(motif0 = substr(motif, 4, 19))
127 | 
128 | a0_fd_ssprobTPM3 <- a0_fd_ssprobTPM3 %>%
129 |   mutate(motif0 = substr(motif, 4, 19))
130 | # Adding "strand.x" and "region" values to "motif0"
131 | a0_fd_ssprobTPM2 <- a0_fd_ssprobTPM2 %>%
132 |   mutate(motif0 = paste(motif0, strand.x, ifelse(region == "upstream", "up", "do"), sep = ""))
133 | 
134 | a0_fd_ssprobTPM3 <- a0_fd_ssprobTPM3 %>%
135 |   mutate(motif0 = paste(motif0, strand.x, ifelse(region == "upstream", "up", "do"), sep = ""))
136 | # Creating the "motif1" column by combining "motif0" and "dist_transc_border"
137 | a0_fd_ssprobTPM2 <- a0_fd_ssprobTPM2 %>%
138 |   mutate(motif1 = paste(motif0, dist_transc_border, sep = "/"))
139 | 
140 | a0_fd_ssprobTPM3 <- a0_fd_ssprobTPM3 %>%
141 |   mutate(motif1 = paste(motif0, dist_transc_border, sep = "/"))
142 | ###############################################################################
143 | a1_fd <- a0_fd_ssprobTPM2[, c(2,10,4)]
144 | a1_fd_unique <- distinct(a1_fd)
145 | a1_fd_reshaped <- a1_fd_unique %>%
146 |   pivot_wider(names_from = motif0, values_from = dist_transc_border) #### THIS IS WORKING SOMEHOW... FRAGILE! HANDLE WITH CARE (GREAT!)
147 | 
148 | a1_fd0 <- a0_fd_ssprobTPM3[, c(2,10,4)]
149 | a1_fd0_unique <- distinct(a1_fd0)
150 | a1_fd0_reshaped <- a1_fd0_unique %>%
151 |   pivot_wider(names_from = motif0, values_from = dist_transc_border) #### THIS IS WORKING SOMEHOW... FRAGILE! HANDLE WITH CARE (GREAT!)
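# Why the pivot_wider() calls above are "fragile": if a (loc_ID, motif0) pair
# still occurs more than once after distinct() (e.g. the same motif at two
# distances), pivot_wider() stores a list-column, which is where the "NULL"
# cells handled further below come from. Toy example with made-up values:
if (FALSE) {
  library(tidyr)
  long <- data.frame(loc_ID = c("g1", "g1", "g2"),
                     motif0 = c("m1", "m1", "m1"),
                     dist   = c(100, 250, 80))
  pivot_wider(long, names_from = motif0, values_from = dist)
  # g1/m1 is duplicated -> warning and a list-column; g2/m1 stays a plain value
}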
152 | ################################################################################
153 | 
154 | a2 <- as.data.frame(a1_fd_reshaped)
155 | a3 <- as.data.frame(a1_fd0_reshaped)
156 | 
157 | missing_values <- is.na(a2$loc_ID)
158 | missing_values0 <- is.na(a3$loc_ID)
159 | # Replace missing values with a placeholder or default value
160 | a2$loc_ID[missing_values] <- "NA"
161 | a3$loc_ID[missing_values0] <- "NA"
162 | # Assign row names to the data frame using the modified loc_ID values
163 | rownames(a2) <- a2$loc_ID
164 | rownames(a3) <- a3$loc_ID
165 | a2_0 <- as.data.frame(t(a2))
166 | a3_0 <- as.data.frame(t(a3))
167 | # Remove the first row
168 | a2_0 <- a2_0[-1, ]
169 | a3_0 <- a3_0[-1, ]
170 | # Add a "motif0" column from the row names
171 | a2_0$motif0 <- rownames(a2_0)
172 | a3_0$motif0 <- rownames(a3_0)
173 | # Separate "motif0" into three new columns
174 | a2_0 <- cbind(
175 |   a2_0,
176 |   motif = substr(a2_0$motif0, 1, 16),
177 |   strand = ifelse(grepl("\\+", a2_0$motif0), "+", "-"),
178 |   region = ifelse(grepl("up", a2_0$motif0), "up", "do")
179 | )
180 | # Separate "motif0" into three new columns
181 | a3_0 <- cbind(
182 |   a3_0,
183 |   motif = substr(a3_0$motif0, 1, 16),
184 |   strand = ifelse(grepl("\\+", a3_0$motif0), "+", "-"),
185 |   region = ifelse(grepl("up", a3_0$motif0), "up", "do")
186 | )
187 | # Reorder the columns
188 | a2_0 <- a2_0[, c("motif", "strand", "region", colnames(a2_0)[-1])]
189 | # Reorder the columns
190 | a3_0 <- a3_0[, c("motif", "strand", "region", colnames(a3_0)[-1])]
191 | # Export the updated DataFrame as CSV
192 | #write.csv(a2_0, file = "mo_dist_matrix_preliminary.csv", row.names = FALSE)
193 | # Remove the row and column used for row and column names
194 | # Coerce columns to character type
195 | a2_fixed <- as.data.frame(lapply(a2_0, as.character), stringsAsFactors = FALSE)
196 | a3_fixed <- as.data.frame(lapply(a3_0, as.character), stringsAsFactors = FALSE)
197 | # Replace "NULL" values with NA
198 | a2_fixed[a2_fixed == "NULL"] <- NA
199 | a3_fixed[a3_fixed == "NULL"] <- NA
200 | ####################################
201 | 
202 | a2_3<- as.data.frame(t(a2_fixed))
203 | a2_3_new <- data.frame(FirstColumn = row.names(a2_3), a2_3)
204 | # Export the fixed DataFrame as CSV
205 | a3_3<- as.data.frame(t(a3_fixed))
206 | a3_3_new <- data.frame(FirstColumn = row.names(a3_3), a3_3)
207 | # Export the fixed DataFrame as CSV
208 | write.csv(a2_3_new, file = "moSpennc_dist_matrix_preliminary_upTQ.csv", row.names = FALSE)
209 | write.csv(a3_3_new, file = "moSpennc_dist_matrix_preliminary_loTQ.csv", row.names = FALSE)
210 | ################################################################################
211 | 
212 | ############################################################################### NOTE! DATASET HAS BEEN FILTERED TO THE OUTER PROBABILITY/TPM QUARTILES (0.25/0.75)!
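# The block below clusters motifs by co-occurrence across genes: a
# gene-by-motif count table is correlated column-wise, 1 - correlation is
# used as a distance, and the dendrogram is exported in Newick format.
# Minimal sketch of that pipeline with random toy counts (names made up):
if (FALSE) {
  library(ape)
  set.seed(42)
  counts <- matrix(rpois(40, lambda = 2), nrow = 10,
                   dimnames = list(paste0("g", 1:10), paste0("m", 1:4)))
  hc_toy <- hclust(as.dist(1 - cor(counts)))
  write.tree(as.phylo(hc_toy), file = "toy_dendrogram.nwk")
}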
213 | 
214 | # Contingency table of "motif" and "loc_ID"
215 | contingency_table_loc_ID <- table( a0_fd_ssprobTPM2$loc_ID, a0_fd_ssprobTPM2$motif)
216 | contingency_table_loc_ID0 <- table( a0_fd_ssprobTPM3$loc_ID, a0_fd_ssprobTPM3$motif)
217 | # Contingency table of "motif" and "GO"
218 | #contingency_table_GO <- table(a0_fd_ssprobTPM2$motif, a0_fd_ssprobTPM2$NAME)
219 | # Contingency table of "motif" and "class"
220 | 
221 | testA =as.data.frame(contingency_table_loc_ID)
222 | tfestA = testA %>%
223 |   pivot_wider(names_from = Var2, values_from = Freq)
224 | 
225 | testB =as.data.frame(contingency_table_loc_ID0)
226 | tfestB = testB %>%
227 |   pivot_wider(names_from = Var2, values_from = Freq)
228 | 
229 | #tfestA[tfestA != 0] <- tfestA$Var1
230 | 
231 | # Create a copy of the original dataframe
232 | tfestA0 <- tfestA
233 | tfestB0 <- tfestB
234 | # Convert all columns except Var1 to character type
235 | tfestA0[, -1] <- lapply(tfestA0[, -1], as.character)
236 | tfestB0[, -1] <- lapply(tfestB0[, -1], as.character)
237 | # Iterate over columns and replace non-zero values
238 | for (i in 2:ncol(tfestA0)) {
239 |   tfestA0[tfestA0[, i] != "0", i] <- tfestA0$Var1[tfestA0[, i] != "0"]
240 | }
241 | # Iterate over columns and replace non-zero values
242 | for (i in 2:ncol(tfestB0)) {
243 |   tfestB0[tfestB0[, i] != "0", i] <- tfestB0$Var1[tfestB0[, i] != "0"]
244 | }
245 | # Convert the dataframe to a matrix
246 | mat <- as.matrix(tfestA[, -1])
247 | # Convert the dataframe to a matrix
248 | matB <- as.matrix(tfestB[, -1])
249 | # Compute the correlation matrix
250 | corr <- cor(mat)
251 | # Compute the correlation matrix
252 | corrB <- cor(matB)
253 | # Compute the dissimilarity matrix using 1 - correlation
254 | diss <- 1 - corr
255 | diss0 <- 1 - corrB
256 | # Compute the hierarchical clustering
257 | hc <- hclust(as.dist(diss))
258 | # Compute the hierarchical clustering
259 | hc0 <- hclust(as.dist(diss0))
260 | # Plot the dendrogram
261 | plot(hc, hang = -1)
262 | # Set the filename for the PDF
263 | pdf("dendrogram.pdf")
264 | # Plot the dendrogram
265 | plot(hc, hang = -1)
266 | # Add rectangles to the dendrogram to indicate co-occurrence
267 | #rect.hclust(hc, k = 7, border = "grey")
268 | # Add rectangles to the dendrogram to indicate co-occurrence
269 | #rect.hclust(hc, k = 10, border = "green")
270 | # Close the PDF device
271 | dev.off()
272 | 
273 | # Convert the dendrogram to a phylogenetic tree object
274 | phy <- as.phylo(hc)
275 | phy0 <- as.phylo(hc0)
276 | # Set the filename for the Newick file
277 | newick_filename <- "moSpennc_occ_dendrogram_upQ.nwk"
278 | 
279 | # Set the filename for the Newick file
280 | newick_filename2 <- "moSpennc_occ_dendrogram_loQ.nwk"
281 | 
282 | # Write the dendrogram as a Newick file
283 | write.tree(phy, file = newick_filename)
284 | write.tree(phy0, file = newick_filename2)
285 | 
286 | 
287 | write.csv(tfestA, file = "moSpennc_binary_matrix_preliminaryupQ.csv", row.names = FALSE)
288 | write.csv(tfestA0, file = "moSpennc_gene_list_preliminaryupQ.csv", row.names = FALSE) # fixed: upQ (was loQ, clashing with the tfestB0 output)
289 | 
290 | write.csv(tfestB, file = "moSpennc_binary_matrix_preliminaryloQ.csv", row.names = FALSE) # fixed: loQ (was upQ, overwriting the tfestA output)
291 | write.csv(tfestB0, file = "moSpennc_gene_list_preliminaryloQ.csv", row.names = FALSE)
292 | 
293 | ###############################################################################################
294 | 
295 | ###############################################################################################
296 | 
--------------------------------------------------------------------------------
/moca_blue/mo_proj/mo_gene_mapper_v0.1.R:
--------------------------------------------------------------------------------
1 | ##############################################################
2 | #setwd("/home/ibg-4/Desktop/Rhome/solanum_motifs")
3 | ##############################################################
4 | PROJECT <- "Sly_mo_on_Spe_0e3_percentiles"
5 | SPEC <- "Spenn"
6 | MODEL <- "MSR"
7 | DATA_ORIGIN <- "motif_matches"
8 | DATE <- "20230530"
9 | ##############################################################
10 | ##############################################################
11 | dirpath_1 <- "../ref_seq"
12 | dirpath_2 <- "./out"
13 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
14 | file1 <- "SolyMSR_on_Spe-ch01-0e3_gene_none20230530-q1q9.csv"
15 | file2 <- "msr_predictions_on_pennellii.csv"
16 | file3 <- "mapman_sopen.txt"
17 | file4 <- "spennellii.csv"
18 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
19 | file_path_out <- file.path(dirpath_2, paste0(DATE,"_",PROJECT,"_mo-predict-map"))
20 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
21 | library(stats)
22 | library(dplyr)
23 | library(stringr)
24 | library(ggplot2)
25 | library(reshape2)
26 | library(tidyr) # needed for pivot_wider() below
27 | 
28 | #library(ape)
29 | #library(hrbrthemes)
30 | #library(randomForestSRC)
31 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
32 | mm0 <- read.table(
33 |   file.path(
34 |     dirpath_2,
35 |     file1),
36 |   header=TRUE,
37 |   sep=",")
38 | model0 <- read.table(file2,
39 |                      header=TRUE,
40 |                      sep=",")
41 | mapman <- read.table(file3,
42 |                      header=TRUE,
43 |                      sep="\t", quote = "")
44 | model1 <- read.table(
45 |   file4,
46 |   header=TRUE,
47 |   sep=",")
48 | colnames(model0) <- c("loc_ID","prob")
49 | ########### ############## ############ #####################
50 | ########### ############## ############ #####################
51 | model0 <- model0[, c(1, 2)]
52 | # Step 1 (optional): subset model0 to genes above a probability cutoff
53 | #high_prob_genes <- model0[model0$prob > 0.01, ]
54 | # Step 2: Merge mm0 and model0 based on loc_ID (full outer join, then drop NA rows = inner join)
55 | merged_df <- merge(mm0, model0,
56 |                    by = "loc_ID",
57 |                    all.x = TRUE,
58 |                    all.y = TRUE)
59 | merged_df <- na.omit(merged_df)
60 | merg_df00 <- merged_df[, c(1, 6, 7, 8,9, 13)]
61 | subset_df <- merg_df00[c("motif", "prob")]
62 | ################################################################################
63 | # extract the required string from IDENTIFIER
64 | colnames(mapman)[colnames(mapman)=="IDENTIFIER"]<-"loc_ID"
65 | merg_df00$loc_ID <- tolower(merg_df00$loc_ID)
66 | mapman$loc_ID <- tolower(mapman$loc_ID)
67 | mapman$loc_ID <- gsub("[-']+", "", mapman$loc_ID)
68 | mapman$loc_ID <- gsub("\\..*", "", mapman$loc_ID) # CAREFUL WITH THE DOTS
69 | merg_df00map <- merge(
70 |   merg_df00, mapman,
71 |   by = c("loc_ID"),
72 |   all = FALSE, ignore.case = TRUE)
73 | # remove rows with NA cells
74 | ################################################################################
75 | ss_model1 <- model1[, c("gene_id", "logMaxTPM", "true_target")]
76 | ss_model1 <- subset(ss_model1, logMaxTPM != 0)
77 | #ss_model1 <- subset(ss_model1, true_target %in% c(1, 2)) # GENES ARE CLASSIFIED AS 0 (NO EXPR.), 1 (LOW, <0.5) AND HIGH (...).
78 | # To evaluate the predictiveness of EPMs, genes flagged with 0 are handled as LOW expression genes.
79 | ss_model1$gene_id <- tolower(ss_model1$gene_id)
80 | ss_model1$gene_id <- gsub("[-']+", "", ss_model1$gene_id)
81 | ss_model1$gene_id <- gsub("\\..*\\.", ".", ss_model1$gene_id)
82 | colnames(ss_model1) <- c("loc_ID","logMaxTPM","class")
83 | ss_model1$class <- ifelse(
84 |   ss_model1$class == 2,
85 |   "low", "high")
86 | merg_df01mm<- merge(
87 |   merg_df00map, ss_model1,
88 |   by = c("loc_ID"),
89 |   all = FALSE, ignore.case = TRUE)
90 | 
91 | ################################################################################
92 | a0_fd <- merg_df01mm[, c(5,1,2,3,4,10,11,6,12)]
93 | 
94 | # Determine the percentile cutoff values
95 | lower_percentile <- quantile(a0_fd$prob, 0.25)
96 | upper_percentile <- quantile(a0_fd$prob, 0.75)
97 | 
98 | # Subset rows based on percentiles
99 | a0_fd_ssprob <- a0_fd[a0_fd$prob<= lower_percentile | a0_fd$prob >= upper_percentile, ]
100 | # Determine the percentile cutoff values
101 | lower_percentile1 <- quantile(a0_fd$logMaxTPM, 0.25)
102 | upper_percentile1 <- quantile(a0_fd$logMaxTPM, 0.75)
103 | # Subset rows based on percentiles
104 | a0_fd_ssTPM <- a0_fd[a0_fd$logMaxTPM<= lower_percentile1 | a0_fd$logMaxTPM >= upper_percentile1, ]
105 | 
106 | a0_fd_ssprobTPM2 <- a0_fd_ssprob[a0_fd_ssprob$logMaxTPM<= lower_percentile1 |
107 |                                  a0_fd_ssprob$logMaxTPM >= upper_percentile1, ]
108 | 
109 | 
110 | ###############################################################################
111 | head(a0_fd_ssprobTPM2)
112 | # Extracting values from the "motif" column and shortening them
113 | a0_fd_ssprobTPM2 <- a0_fd_ssprobTPM2 %>%
114 |   mutate(motif0 = substr(motif, 4, 19))
115 | # Adding "strand.x" and "region" values to "motif0"
116 | a0_fd_ssprobTPM2 <- a0_fd_ssprobTPM2 %>%
117 |   mutate(motif0 = paste(motif0, strand.x, ifelse(region == "upstream", "up", "do"), sep = ""))
118 | # Creating the "motif1" column by combining "motif0" and "dist_transc_border"
119 | a0_fd_ssprobTPM2 <- a0_fd_ssprobTPM2 %>%
120 |   mutate(motif1 = paste(motif0, dist_transc_border, sep = "/"))
121 | a1_fd <- a0_fd_ssprobTPM2[, c(2,10,4)]
122 | a1_fd_unique <- distinct(a1_fd)
123 | a1_fd_reshaped <- a1_fd_unique %>%
124 |   pivot_wider(names_from = motif0, values_from = dist_transc_border) #### THIS IS WORKING SOMEHOW... FRAGILE! HANDLE WITH CARE (GREAT!)
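# Caveat for the transpose steps below: t() on a data.frame goes through a
# matrix, so mixed-type columns are coerced to character, and numeric values
# have to be converted back afterwards. Small illustration:
if (FALSE) {
  df <- data.frame(loc_ID = c("g1", "g2"), m1 = c(100, NA))
  t_df <- as.data.frame(t(df))
  str(t_df)  # both rows are character now; numbers need as.numeric() again
}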
125 | ################################################################################
126 | 
127 | a2<-as.data.frame(a1_fd_reshaped)
128 | rownames(a2)<-(a2$loc_ID)
129 | a2_0 <- as.data.frame(t(a2))
130 | # Remove the first row
131 | a2_0 <- a2_0[-1, ]
132 | # Add a "motif0" column from the row names
133 | a2_0$motif0 <- rownames(a2_0)
134 | # Separate "motif0" into three new columns
135 | a2_0 <- cbind(
136 |   a2_0,
137 |   motif = substr(a2_0$motif0, 1, 16),
138 |   strand = ifelse(grepl("\\+", a2_0$motif0), "+", "-"),
139 |   region = ifelse(grepl("up", a2_0$motif0), "up", "do")
140 | )
141 | # Reorder the columns
142 | a2_0 <- a2_0[, c("motif", "strand", "region", colnames(a2_0)[-1])]
143 | # Export the updated DataFrame as CSV
144 | #write.csv(a2_0, file = "mo_dist_matrix_preliminary.csv", row.names = FALSE)
145 | # Remove the row and column used for row and column names
146 | # Coerce columns to character type
147 | a2_fixed <- as.data.frame(lapply(a2_0, as.character), stringsAsFactors = FALSE)
148 | 
149 | # Replace "NULL" values with NA
150 | a2_fixed[a2_fixed == "NULL"] <- NA
151 | 
152 | # Export the fixed DataFrame as CSV
153 | write.csv(a2_fixed, file = "mo_dist_matrix_preliminary.csv", row.names = FALSE)
154 | 
155 | ################################################################################
156 | 
157 | ############################################################################### NOTE! DATASET HAS BEEN FILTERED TO THE OUTER PROBABILITY/TPM QUARTILES (0.25/0.75)!
158 | 
159 | # Contingency table of "motif" and "loc_ID"
160 | contingency_table_loc_ID <- table( a0_fd_ssprobTPM2$loc_ID, a0_fd_ssprobTPM2$motif)
161 | # Contingency table of "motif" and "GO"
162 | #contingency_table_GO <- table(a0_fd_ssprobTPM2$motif, a0_fd_ssprobTPM2$NAME)
163 | # Contingency table of "motif" and "class"
164 | 
165 | testA =as.data.frame(contingency_table_loc_ID)
166 | tfestA = testA %>%
167 |   pivot_wider(names_from = Var2, values_from = Freq)
168 | 
169 | #tfestA[tfestA != 0] <- tfestA$Var1
170 | 
171 | # Create a copy of the original dataframe
172 | tfestA0 <- tfestA
173 | 
174 | # Convert all columns except Var1 to character type
175 | tfestA0[, -1] <- lapply(tfestA0[, -1], as.character)
176 | 
177 | # Iterate over columns and replace non-zero values
178 | for (i in 2:ncol(tfestA0)) {
179 |   tfestA0[tfestA0[, i] != "0", i] <- tfestA0$Var1[tfestA0[, i] != "0"]
180 | }
181 | 
182 | # View the modified dataframe
183 | tfestA0
184 | 
185 | write.csv(tfestA, file = "mo_binary_matrix_preliminary.csv", row.names = FALSE)
186 | 
187 | write.csv(tfestA0, file = "mo_gene_list_preliminary.csv", row.names = FALSE)
188 | 
189 | 
190 | 
191 | ###############################################################################################
192 | 
193 | ###############################################################################################
--------------------------------------------------------------------------------
/moca_blue/mo_proj/motif_predictabilityV1.0.R:
--------------------------------------------------------------------------------
1 | ##############################################################
2 | # This script is designed to find motifs with significantly
3 | # better or worse predictability:
4 | # go for pairwise comparison using Jaccard similarity,
5 | # go for pool-wise comparison using Dice similarity,
6 | # use random forest and estimate importance scores,
7 | # then check whether importance scores behave similarly across different species.
8 | ##############################################################
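# The Jaccard/Dice comparisons sketched in the header above are not computed
# in this version of the script; for binary motif-presence vectors they
# could look like this (minimal sketch, toy vectors):
if (FALSE) {
  jaccard <- function(a, b) sum(a & b) / sum(a | b)
  dice    <- function(a, b) 2 * sum(a & b) / (sum(a) + sum(b))
  m1 <- c(1, 0, 1, 1, 0)  # presence of motif 1 across 5 genes
  m2 <- c(1, 1, 1, 0, 0)
  jaccard(m1, m2)  # 0.5
  dice(m1, m2)     # ~0.667
}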
9 | #setwd("/home/ibg-4/Desktop/Rhome/solanum_motifs")
10 | ##############################################################
11 | PROJECT <- "Sly_mo_on_Spe_0e3_percentiles"
12 | SPEC <- "Spenn"
13 | MODEL <- "MSR"
14 | DATA_ORIGIN <- "motif_matches"
15 | DATE <- "20230530"
16 | ##############################################################
17 | ##############################################################
18 | dirpath_1 <- "../ref_seq"
19 | dirpath_2 <- "./out"
20 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
21 | file1 <- "SolyMSR_on_Spe-ch01-0e3_gene_none20230530-mima.csv"
22 | file2 <- "msr_predictions_on_pennellii.csv"
23 | file3 <- "mapman_sopen.txt"
24 | file4 <- "spennellii.csv"
25 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
26 | file_path_out <- file.path(dirpath_2, paste0(DATE,"_",PROJECT,"_mo-predict-map"))
27 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
28 | library(stats)
29 | library(dplyr)
30 | library(stringr)
31 | library(ggplot2); library(tidyr) # tidyr needed for pivot_wider() below
32 | #library(ape)
33 | #library(hrbrthemes)
34 | #library(randomForestSRC)
35 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
36 | mm0 <- read.table(
37 |   file.path(
38 |     dirpath_2,
39 |     file1),
40 |   header=TRUE,
41 |   sep=",")
42 | model0 <- read.table(file2,
43 |                      header=TRUE,
44 |                      sep=",")
45 | mapman <- read.table(file3,
46 |                      header=TRUE,
47 |                      sep="\t", quote = "")
48 | model1 <- read.table(
49 |   file4,
50 |   header=TRUE,
51 |   sep=",")
52 | colnames(model0) <- c("loc_ID","prob")
53 | ########### ############## ############ #####################
54 | ########### ############## ############ #####################
55 | model0 <- model0[, c(1, 2)]
56 | # Step 1 (optional): subset model0 to genes above a probability cutoff
57 | #high_prob_genes <- model0[model0$prob > 0.01, ]
58 | # Step 2: Merge mm0 and model0 based on loc_ID (full outer join, then drop NA rows = inner join)
59 | merged_df <- merge(mm0, model0,
60 |                    by = "loc_ID",
61 |                    all.x = TRUE,
62 |                    all.y = TRUE)
63 | merged_df <- na.omit(merged_df)
64 | merg_df00 <- merged_df[, c(1, 6, 7, 8,9, 13)]
65 | subset_df <- merg_df00[c("motif", "prob")]
66 | ################################################################################
67 | # extract the required string from IDENTIFIER
68 | colnames(mapman)[colnames(mapman)=="IDENTIFIER"]<-"loc_ID"
69 | merg_df00$loc_ID <- tolower(merg_df00$loc_ID)
70 | mapman$loc_ID <- tolower(mapman$loc_ID)
71 | mapman$loc_ID <- gsub("[-']+", "", mapman$loc_ID)
72 | mapman$loc_ID <- gsub("\\..*", "", mapman$loc_ID) # CAREFUL WITH THE DOTS
73 | merg_df00map <- merge(
74 |   merg_df00, mapman,
75 |   by = c("loc_ID"),
76 |   all = FALSE, ignore.case = TRUE)
77 | # remove rows with NA cells
78 | ################################################################################
79 | ss_model1 <- model1[, c("gene_id", "logMaxTPM", "true_target")]
80 | ss_model1 <- subset(ss_model1, logMaxTPM != 0)
81 | #ss_model1 <- subset(ss_model1, true_target %in% c(1, 2)) # GENES ARE CLASSIFIED AS 0 (NO EXPR.), 1 (LOW, <0.5) AND HIGH (...).
82 | # To evaluate the predictiveness of EPMs, genes flagged with 0 are handled as LOW expression genes.
83 | ss_model1$gene_id <- tolower(ss_model1$gene_id)
84 | ss_model1$gene_id <- gsub("[-']+", "", ss_model1$gene_id)
85 | ss_model1$gene_id <- gsub("\\..*\\.", ".", ss_model1$gene_id)
86 | colnames(ss_model1) <- c("loc_ID","logMaxTPM","class")
87 | ss_model1$class <- ifelse(
88 |   ss_model1$class == 2,
89 |   "low", "high")
90 | merg_df01mm<- merge(
91 |   merg_df00map, ss_model1,
92 |   by = c("loc_ID"),
93 |   all = FALSE, ignore.case = TRUE)
94 | merg_df01mm$prob_class <- ifelse(
95 |   merg_df01mm$prob <= 0.5,
96 |   "low", "high")
97 | 
98 | merg_df01mm$pred_perf<- ifelse(
99 |   merg_df01mm$class == merg_df01mm$prob_class,
100 |   "TRUE", "FALSE")
101 | ################################################################################
102 | a0_fd <- merg_df01mm[, c(5,1,2,3,4,11,6,12,13,14)]
103 | # Contingency table of "motif" and "loc_ID"
104 | contingency_table_loc_ID <- table(a0_fd$motif, a0_fd$loc_ID)
105 | # Contingency table of "motif" and "GO"
106 | contingency_table_GO <- table(merg_df01mm$motif, merg_df01mm$NAME)
107 | # Contingency table of "motif" and "class"
108 | contingency_table_expr_class <- table(a0_fd$motif, a0_fd$class)
109 | # Contingency table of "motif" and "prob"
110 | contingency_table_prob_class <- table(a0_fd$motif, a0_fd$prob_class)
111 | # Contingency table of "motif" and "performance"
112 | contingency_table_pred_perf <- table(a0_fd$motif, a0_fd$pred_perf)
113 | ################################################################################
114 | ################################################################################
115 | testA =as.data.frame(contingency_table_loc_ID)
116 | tfestA = testA %>%
117 |   pivot_wider(names_from = Var2, values_from = Freq)
118 | testB =as.data.frame(contingency_table_GO)
119 | tfestB = testB %>%
120 |   pivot_wider(names_from = Var2, values_from = Freq) # transpose A & B, sum, remove greater than 1,
121 | ################################################################################
122 | 
123 | tra_tfestA <- as.data.frame(t(tfestA))
124 | colnames(tra_tfestA) <- tra_tfestA[1, ]
125 | tra_tfestA <- tra_tfestA[-1, ]
126 | colnames(tra_tfestA) <- sub("^.*?(Soly_M\\d+_p\\d+m\\d+F).*", "\\1", colnames(tra_tfestA))
127 | colnames(tra_tfestA) <- sub("^.*?(Soly_M\\d+_p\\d+m\\d+R).*", "\\1", colnames(tra_tfestA))
128 | tra_tfestA[, -1] <- sapply(tra_tfestA[, -1], as.numeric)
129 | #tra_tfestA[tra_tfestA > 0] <- TRUE
130 | #tra_tfestA0 <- tra_tfestA[rowSums(tra_tfestA == 1) >= 3, ]
131 | #tra_tfestA0[tra_tfestA0 > 0] <- 1
132 | 
133 | filtered_df <- tra_tfestA
134 | #filtered_df[filtered_df > 0] <- 1
135 | filtered_df <- sapply(filtered_df, as.numeric)
136 | sums <- colSums(filtered_df, na.rm = TRUE)
137 | sum_df <- data.frame(columns = colnames(filtered_df), sums = sums)
138 | 
139 | plot <- ggplot(sum_df, aes(x = columns, y = sums, color = columns)) +
140 |   geom_point(size = 3) +
141 |   labs(title = "Column sums", x = "motifs", y = "sums") +
142 |   theme_minimal() +
143 |   theme(legend.position = "bottom")
144 | 
145 | # Save the plot as a PDF file
146 | ggsave(file=paste0(file_path_out,
147 |                    "sum-mima.pdf"), plot, width = 8, height = 6)
148 | 
149 | 
150 | #######################################################################################
151 | 
152 | sums <- colSums(filtered_df, na.rm = TRUE)
153 | averages <- sums / nrow(filtered_df)
154 | 
155 | sum_df <- data.frame(columns = colnames(filtered_df), sums = sums, averages = averages)
156 | 
157 | # Create the dot plot of column averages using ggplot2
158 | plot2 <- ggplot(sum_df, aes(x = columns, y = averages, color = columns)) +
159 |   geom_point(size = 3) +
160 |   labs(title = "Column Averages", x = "Columns", y = "Average") +
161 |   theme_minimal() +
162 |   theme(legend.position = "bottom")
163 | 
164 | # Save the plot as a PDF file
165 | ggsave(file=paste0(file_path_out,
166 |                    "average-mima.pdf"), plot2, width = 8, height = 6)
167 | #######################################################################################
168 | #head(tra_tfestA0, 10)
169 | #######################################################################################
170 | #######################################################################################
171 | # tra_tfestA0 CONTAINS A RAW SET OF ALL MOTIF MATCHES ACROSS THE DIFFERENT GENES AND CAN BE USED TO FIND BEST COMBINATIONS
172 | #######################################################################################
173 | #######################################################################################
174 | 
175 | #######################################################################################
176 | #tra_tfestA0$motifs <- apply(tra_tfestA0, 1, function(row) {
177 | #  names(row)[row == 1] %>% paste(collapse = "-")
178 | #})
179 | #fmo_comb <- data.frame(row.names = row.names(tra_tfestA0), motifs = tra_tfestA0$motifs)
180 | #fmo_comb$loc_ID <- row.names(fmo_comb)
181 | #contingency_table_motifs <- table(fmo_comb$motifs, fmo_comb$loc_ID)
182 | #testF =as.data.frame(contingency_table_motifs)
183 | #tfestF = testF %>%
184 | #  pivot_wider(names_from = Var2, values_from = Freq)
185 | #######################################################################################
186 | 
187 | ################################################################################
188 | 
189 | testC =as.data.frame(contingency_table_expr_class)
190 | tfestC = testC %>%
191 |   pivot_wider(names_from = Var2, values_from = Freq)
192 | colnames(tfestC)<- c("Var1", "expr_class0", "expr_class1")
193 | testD =as.data.frame(contingency_table_prob_class)
194 | tfestD = testD %>%
195 |   pivot_wider(names_from = Var2, values_from = Freq)
196 | colnames(tfestD)<- c("Var1", "prob_class0", "prob_class1")
197 | testE =as.data.frame(contingency_table_pred_perf)
198 | tfestE = testE %>%
199 |   pivot_wider(names_from = Var2, values_from = Freq)
200 | ################################################################################
201 | tfestCDE <- merge( tfestC,
202 |                    merge(tfestD, tfestE, by = "Var1"), by = "Var1")
203 | tfestCDE$mo_metacluster <- 0
204 | tfestCDE$mo_metacluster[grep("_p1m", tfestCDE$Var1)] <- 1
205 | 
206 | 
207 | tfestCDE$prob_rat01 <- ifelse(
208 |   ifelse(tfestCDE$mo_metacluster == 0,
209 |          tfestCDE$prob_class0/tfestCDE$prob_class1-1,
210 |          tfestCDE$prob_class1/tfestCDE$prob_class0-1) > 0, 1, 0)
211 | 
212 | tfestCDE$expr_rat01 <- ifelse(
213 |   ifelse(tfestCDE$mo_metacluster == 0,
214 |          tfestCDE$expr_class0/tfestCDE$expr_class1-1,
215 |          tfestCDE$expr_class1/tfestCDE$expr_class0-1) > 0, 1, 0)
216 | 
217 | tfestCDE$reliability <- c(tfestCDE$`TRUE`/(tfestCDE$`FALSE`+tfestCDE$`TRUE`))
218 | 
219 | tfestCDE$chi_expr_class <- apply(tfestCDE[, c("expr_class0", "expr_class1")], 1, function(row) {
220 |   result <- chisq.test(row, p = c(0.5, 0.5))
221 |   result$p.value
222 | })
223 | 
224 | tfestCDE$chi_prob_class <- apply(tfestCDE[, c("prob_class0", "prob_class1")], 1, function(row) {
225 |   result <- chisq.test(row, p = c(0.5, 0.5))
226 |   result$p.value
227 | })
228 | 
229 | tfestCDE$chi_TF <- apply(tfestCDE[, c("FALSE", "TRUE")], 1, function(row) {
230 |   result <- chisq.test(row, p = c(0.5, 0.5))
231 |   result$p.value
232 | })
233 | 
234 | ##################################################################################################
235 | 
236 | write.csv(tfestCDE, file=paste0(file_path_out,
237 |                                 "-mima.csv"), row.names=FALSE)
238 | 
239 | 
240 | #tfestCDE$perf_row <- rowMeans(tfestCDE[, c(9, 10, 11)])
241 | #tfestCDE$perf_tot<- colMeans(tfestCDE[, c(9, 10, 11)])
--------------------------------------------------------------------------------
/moca_blue/mo_proj/occ_filter_v1.1.R:
--------------------------------------------------------------------------------
1 | # This script filters the occurrences of motifs in a genome (mapped with BLAMM).
2 | # Motifs must lie within 1500 bp of the gene start or end, respectively.
3 | # Before the occurrences can be filtered for motif preferences, the genes' orientations must be determined.
4 | # ATTENTION! The occurrence file from BLAMM can be very large.
5 | # To keep the computational load low, it is highly recommended to split the occurrence file into smaller ones (e.g. 1 million lines each, as in the snippet below).
6 | #############################################################
7 | ######################################### USE THIS AS BASH ##
8 | #project="Soly_blamm_20230417"
9 | #inputFile="occurrences_0.00001.txt"
10 | #outputSize="1000000"
11 | #split -l $outputSize --numeric-suffixes $inputFile smallfile
12 | #mkdir occ$project
13 | #mv smallfile* occ$project/
14 | #############################################################
15 | 
16 | 
17 | setwd("G:/Machina_Eva/R_Home/moca_blue/moca_blue/2023_mo_slyc_spenn")
18 | 
19 | 
20 | ############################################################# IT'S WORKING!
21 | dirpath <- "./occSpen_ch01_0e3"
22 | output_file <- "./out/outSpen_ch01_0e3.txt"
23 | file_paths <- list.files(dirpath, pattern = "^smallfile*", full.names = TRUE)
24 | #############################################################
25 | library(tidyr)
26 | library(dplyr)
27 | library(readr)
28 | library(magrittr)
29 | #############################################################
30 | # initialize an empty data frame to store the results
31 | combined_df <- data.frame()
32 | 
33 | # loop over all files and append the results to the combined data frame
34 | for (filepath in file_paths) {
35 |   # read the file; fill = TRUE pads rows with fewer than 9 fields
36 |   occurrence_df <- read.table(filepath, header = FALSE, fill = TRUE)
37 |   # select rows with the specified pattern and update the column names
38 |   # occurrence_df <- subset(occurrence_df, grepl("^epmSola", V3))
39 |   colnames(occurrence_df) <- c("loc",
40 |                                "source",
41 |                                "motif",
42 |                                "mstart",
43 |                                "mend",
44 |                                "score",
45 |                                "strand",
46 |                                "V7",
47 |                                "V8")
48 | 
49 |   # split the "loc" column into "chr", "gene_start", and "gene_end"
50 |   # selected_df <- separate(occurrence_df, loc, into = c("chr", "gene_start", "gene_end"), sep = "[:-]", fill = "left")
51 |   selected_df <- separate(occurrence_df,
52 |                           loc,
53 |                           into = c("chr", "gene_loc"),
54 |                           sep = ":",
55 |                           fill = "left") %>%
56 |     separate(
57 |       gene_loc, into = c("gene_start", "gene_end"),
58 |       sep = "-",
59 |       fill = "left")
60 | 
61 |   # combine "chr", "gene_start", and "gene_end" into a new "loc" column
62 |   selected_df$loc <- paste(selected_df$chr,
63 |                            selected_df$gene_start,
64 |                            sep = ":")
65 |   selected_df$loc <- ifelse(
66 |     is.na(
67 |       selected_df$gene_end),
68 |     selected_df$loc,
69 |     paste(
70 |       selected_df$loc,
71 |       selected_df$gene_end,
72 |       sep = "-"))
73 | 
74 |   # filter rows based on the specified condition
75 |   filtered_df <- selected_df %>%
mutate(across(c(mstart, gene_start, gene_end), 77 | as.numeric)) %>% # convert columns to numeric 78 | filter(mstart <= 1500 | abs(mstart - (gene_end-gene_start +1)) <= 1500) 79 | 80 | # append the filtered data frame to the combined data frame 81 | combined_df <- rbind(combined_df, filtered_df) 82 | } 83 | 84 | # write the combined data frame to a file 85 | write.table(combined_df, 86 | output_file, 87 | sep = "\t", 88 | row.names = FALSE) 89 | 90 | -------------------------------------------------------------------------------- /moca_blue/mo_ran/meta_motif_ranges_characteristics_TSS-TTS.1.4.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | #library(MASS) 3 | #setwd("~/ibg-4/Desktop/Rhome/moca_blue/mo_range") 4 | ######################################################################################################################### 5 | NAME0="rdf5_seqlet_pattern" 6 | SPEC="Sobi" 7 | MODEL="M0" 8 | FILE= "arabidopsis_SSR_modisco.hdf5" 9 | ######################################################################################################################### 10 | dirpath_in = "./out" 11 | dirpath_out = "./out" 12 | ############################################################ 13 | file_path_in <- file.path(dirpath_in, paste0(NAME0,SPEC,MODEL)) 14 | #data <- read.csv("rdf5_seqlet_patternArthM0.txt"), sep=",") 15 | data <- read.csv(file = paste0(file_path_in,".txt"), sep=",") 16 | ######################################################################################################################### 17 | ######################################################################################################################## 18 | ######################################################################################################################### 19 | data$motif <- ifelse(data$metacluster == "metacluster_0", "p0", "p1") 20 | data$motif <- paste0(data$motif, "m", sprintf("%02d", as.numeric(substring(data$pattern, 9)))) 21 | ######################################################################################################################### 22 | 23 | #data$start <- as.numeric(data$start) 24 | range1 <- data %>% filter(start >= 1 & end <= 1500) 25 | range2 <- data %>% filter(start >= 1520 & end <= 3000) 26 | 27 | range1$trunc_start <- floor(range1$start / 10) 28 | range2$trunc_start <- floor(range2$start / 10) 29 | ##################################################### Create function to calculate mode ################################# 30 | 31 | customMode <- function(x) { 32 | freq <- table(x) 33 | mode <- as.numeric(names(freq)[which.max(freq)]) 34 | return(mode) 35 | } 36 | ######################################################################################################################### 37 | 38 | # calculate the summary statistics, including the mode and its frequency 39 | result1 <- range1 %>% 40 | group_by(motif) %>% 41 | summarize(min = min(start), 42 | max = max(start), 43 | q10 = quantile(start, 0.10), 44 | median = median(start), 45 | q90 = quantile(start, 0.90), 46 | mode = customMode(trunc_start), 47 | mean = mean(start), 48 | sd = sd(start), 49 | cv = sd(start) / mean(start) * 100, 50 | iqr = q90 - q10, 51 | number = n()) 52 | 53 | # extract the mode value and frequency from the mode vector and multiply the mode by 10 54 | result1$mode <- c(result1$mode * 10) #mode uses decimal 55 | 56 | result2 <- range2 %>% 57 | group_by(motif) %>% 58 | summarize(min = min(start), 59 | max = max(start), 60 | q10 = quantile(start, 
0.10), 61 | median = median(start), 62 | q90 = quantile(start, 0.90), 63 | mode = customMode(trunc_start), 64 | mean = mean(start), 65 | sd = sd(start), 66 | cv = sd(start) / mean(start) * 100, 67 | iqr = q90 - q10, 68 | number = n()) 69 | 70 | # extract the mode value and frequency from the mode vector and multiply the mode by 10 71 | result2$mode <- c(result2$mode * 10) #mode uses decimal 72 | 73 | # add additional information to the result data frames 74 | result2$Species <- c(SPEC) 75 | result2$Model <- c(MODEL) 76 | result2$source <- c(FILE) 77 | 78 | result1$Species <- c(SPEC) 79 | result1$Model <- c(MODEL) 80 | result1$source <- c(FILE) 81 | 82 | result1$epm <- paste("epm", result1$Species, result1$Model, result1$motif, sep="_") 83 | result2$epm <- paste("epm", result2$Species, result2$Model, result2$motif, sep="_") 84 | 85 | # select only the desired columns in the result data frames 86 | result1 <- result1 %>% 87 | select(epm, min, max, mean, median, mode, q10, q90, sd, cv, iqr, number, source) 88 | 89 | result2 <- result2 %>% 90 | select(epm, min, max, mean, median, mode, q10, q90, sd, cv, iqr, number, source) 91 | 92 | ######################################################################################################################### 93 | file_path_out <- file.path(dirpath_out, paste0(SPEC,MODEL)) 94 | 95 | write.csv(result1, file=paste0(file_path_out,"-TSS_motif_ranges.csv"), row.names=FALSE) 96 | write.csv(result2, file=paste0(file_path_out,"-TTS_motif_ranges.csv"), row.names=FALSE) -------------------------------------------------------------------------------- /moca_blue/mo_ran/meta_motif_ranges_characteristics_TSS-TTS.1.5.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | #library(MASS) 3 | #setwd("~/ibg-4/Desktop/Rhome/moca_blue/mo_range") 4 | ######################################################################################################################### 5 | NAME0="rdf5_seqlet_pattern" 6 | SPEC="Zema" 7 | MODEL="S0" 8 | FILE= "zea_modisco.hdf5" 9 | ######################################################################################################################### 10 | dirpath_in = "./out" 11 | dirpath_out = "./out" 12 | ############################################################ 13 | file_path_in <- file.path(dirpath_in, paste0(NAME0,SPEC,MODEL)) 14 | #data <- read.csv("rdf5_seqlet_patternArthM0.txt"), sep=",") 15 | data <- read.csv(file = paste0(file_path_in,".txt"), sep=",") 16 | ######################################################################################################################### 17 | ######################################################################################################################## 18 | ######################################################################################################################### 19 | data$motif <- ifelse(data$metacluster == "metacluster_0", "p0", "p1") 20 | data$motif <- paste0(data$motif, "m", sprintf("%02d", as.numeric(substring(data$pattern, 9)))) 21 | ######################################################################################################################### 22 | 23 | #data$start <- as.numeric(data$start) 24 | range1 <- data %>% filter(start >= 1 & end <= 1500) 25 | range2 <- data %>% filter(start >= 1520 & end <= 3000) 26 | 27 | range1$trunc_start <- floor(range1$start / 10) 28 | range2$trunc_start <- floor(range2$start / 10) 29 | 30 | range2$start <- c(3020-range2$start) 31 | range2$end <- c(3020-range2$end) 
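# note: the two subtractions above re-orient TTS-side seqlets so that positions
# count backwards from the sequence end; this is consistent with the 3020 bp
# sequences used for motif discovery elsewhere in this repository (1500 bp per
# flank plus a 20 bp 'N' spacer), so a seqlet starting at position 2900 is
# reported as 3020 - 2900 = 120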
32 | ##################################################### Create function to calculate mode ################################# 33 | 34 | customMode <- function(x) { 35 | freq <- table(x) 36 | mode <- as.numeric(names(freq)[which.max(freq)]) 37 | return(mode) 38 | } 39 | ######################################################################################################################### 40 | 41 | # calculate the summary statistics, including the mode and its frequency 42 | result1 <- range1 %>% 43 | group_by(motif) %>% 44 | summarize(min = min(start), 45 | max = max(start), 46 | q10 = quantile(start, 0.1), 47 | median = median(start), 48 | q90 = quantile(start, 0.9), 49 | mode = customMode(trunc_start), 50 | mean = mean(start), 51 | sd = sd(start), 52 | cv = sd(start) / mean(start) * 100, 53 | iqr = q90 - q10, 54 | number = n()) 55 | 56 | # extract the mode value and frequency from the mode vector and multiply the mode by 10 57 | result1$mode <- c(result1$mode * 10) #mode uses decimal 58 | 59 | result2 <- range2 %>% 60 | group_by(motif) %>% 61 | summarize(min = min(start), 62 | max = max(start), 63 | q10 = quantile(start, 0.1), 64 | median = median(start), 65 | q90 = quantile(start, 0.9), 66 | mode = customMode(trunc_start), 67 | mean = mean(start), 68 | sd = sd(start), 69 | cv = sd(start) / mean(start) * 100, 70 | iqr = q90 - q10, 71 | number = n()) 72 | 73 | # extract the mode value and frequency from the mode vector and multiply the mode by 10 74 | result2$mode <- c(result2$mode * 10) #mode uses decimal 75 | 76 | # add additional information to the result data frames 77 | result2$Species <- c(SPEC) 78 | result2$Model <- c(MODEL) 79 | result2$source <- c(FILE) 80 | 81 | result1$Species <- c(SPEC) 82 | result1$Model <- c(MODEL) 83 | result1$source <- c(FILE) 84 | 85 | result1$epm <- paste("epm", result1$Species, result1$Model, result1$motif, sep="_") 86 | result2$epm <- paste("epm", result2$Species, result2$Model, result2$motif, sep="_") 87 | 88 | # select only the desired columns in the result data frames 89 | result1 <- result1 %>% 90 | select(epm, min, max, mean, median, mode, q10, q90, sd, cv, iqr, number, source) 91 | 92 | result2 <- result2 %>% 93 | select(epm, min, max, mean, median, mode, q10, q90, sd, cv, iqr, number, source) 94 | 95 | ######################################################################################################################### 96 | file_path_out <- file.path(dirpath_out, paste0(SPEC,MODEL)) 97 | 98 | write.csv(result1, file=paste0(file_path_out,"-TSS_motif_ranges_q1q9.csv"), row.names=FALSE) 99 | write.csv(result2, file=paste0(file_path_out,"-TTS_motif_ranges_q1q9.csv"), row.names=FALSE) -------------------------------------------------------------------------------- /moca_blue/mo_ran/rdf5_get_seql_per_patternV2.1.R: -------------------------------------------------------------------------------- 1 | library(rhdf5) 2 | #install.packages("plyr") 3 | #library(plyr) 4 | library(tidyr) 5 | 6 | #setwd("~/ibg-4/Desktop/Rhome/moca_blue/mo_range") 7 | ################################################################################### 8 | NAME0="rdf5_seqlet_pattern" 9 | SPEC="Soly" 10 | MODEL="S0" # C0 stand for DeepCistrome version 1 (available at 02-may-2023) Standard conditions 11 | ####################################################### 12 | FILE1= "solanum_modisco.hdf5" 13 | ################################################################################### 14 | dirpath_in = "../0MOTIFS/MODISCO_SSR_RAW" 15 | dirpath_out = "./out" 16 | # Define the 
pattern names to iterate over 17 | 18 | ################################################### 19 | #motifs in metacluster 0 -automatize this step 20 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 21 | #X=20 22 | #motifs in metacluster 1 -automatize this step 23 | #Y=10 24 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 25 | h5file <- H5Fopen(file.path( 26 | dirpath_in, 27 | FILE1), "H5F_ACC_RDONLY") 28 | h5ls(h5file) 29 | metacluster_group <- h5read(h5file, "metacluster_idx_to_submetacluster_results") 30 | # loop through the metaclusters 0 and 1 31 | for (i in c(0, 1)) { 32 | metacluster <- metacluster_group[[paste0("metacluster_", i)]] 33 | } 34 | ####################################################### 35 | # loop through the metaclusters 0 and 1 36 | for (i in names(metacluster_group)) { 37 | metacluster <- metacluster_group[[i]] 38 | patterns = metacluster[['seqlets_to_patterns_result']][['patterns']] 39 | } 40 | # Define the pattern names to iterate over 41 | ######################################################## 42 | length(patterns[['all_pattern_names']]) 43 | X = length(metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]])-2 44 | Y = length(metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]])-2 45 | X1 = X+Y 46 | Y1 = X*2+Y*2 47 | 48 | ############################################## METACLUSTER_0 ###### 49 | # Initialize a list to store the results 50 | ls_list <- list() 51 | ls_list1 <- list() 52 | seqlets_all_mc0 <- data.frame() 53 | seqlets_all_mc1 <- data.frame() 54 | 55 | for (i in 0:X) { 56 | pattern_name <- paste0("pattern_", i) 57 | seqletls <- metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["seqlets_and_alnmts"]][["seqlets"]] 58 | ls <- as.data.frame(seqletls) 59 | ls_list[[pattern_name]] <- ls 60 | seqlets_i <- as.data.frame(ls_list[[paste0("pattern_", i)]][["seqletls"]]) 61 | colnames(seqlets_i) <- c("seqlets") 62 | seqlets_i$pattern <- paste0("pattern_", i) 63 | seqlets_all_mc0 <- rbind(seqlets_all_mc0, seqlets_i) 64 | } 65 | seqlets_all_mc0$metacluster <- c("metacluster_0") 66 | 67 | ############################################## METACLUSTER_1 ############# 68 | 69 | for (i in 0:Y) { 70 | pattern_name <- paste0("pattern_", i) 71 | seqletls <- metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["seqlets_and_alnmts"]][["seqlets"]] 72 | ls <- as.data.frame(seqletls) 73 | ls_list[[pattern_name]] <- ls 74 | seqlets_i <- as.data.frame(ls_list[[paste0("pattern_", i)]][["seqletls"]]) 75 | colnames(seqlets_i) <- c("seqlets") 76 | seqlets_i$pattern <- paste0("pattern_", i) 77 | seqlets_all_mc1 <- rbind(seqlets_all_mc1, seqlets_i) 78 | } 79 | seqlets_all_mc1$metacluster <- c("metacluster_1") 80 | 81 | ############################################################################ ###### 82 | 83 | seqlet_mc01 <- rbind.data.frame(seqlets_all_mc0, seqlets_all_mc1) 84 | 85 | df <- seqlet_mc01 %>% 86 | mutate(example = NA, start = NA, end = NA, rc = NA) %>% 87 | separate(col = seqlets, into = c("example", "start", "end", "rc"), sep = "[,]") 88 | 89 | df$example <- gsub("example:", "", df$example) 90 | df$start <- gsub("start:", "", df$start) 91 | df$end <- gsub("end:", "", df$end) 92 | df$rc <- gsub("rc:", "", df$rc) 93 | 94 | ############################################################################ ###### 95 | file_path_out <- file.path(dirpath_out, paste0(NAME0,SPEC,MODEL)) 96 | 97 | write.csv(df, 
file = paste0(file_path_out,".txt"), row.names = FALSE) 98 | -------------------------------------------------------------------------------- /moca_blue/mo_ran/rdf5_get_seql_per_patternV2.R: -------------------------------------------------------------------------------- 1 | library(rhdf5) 2 | #install.packages("plyr") # install once, then keep commented out 3 | library(plyr) 4 | library(tidyr) 5 | 6 | setwd("~/Desktop/Rhome/moca_blue/Mo_range") 7 | ################################################################################### 8 | NAME0="rdf5_seqlet_pattern" 9 | SPEC="Soly" 10 | MODEL="M0" # C0 stands for DeepCistrome version 1 (available at 02-may-2023), standard conditions 11 | ####################################################### 12 | FILE1= "Solanum_MSR_modisco.hdf5" 13 | ################################################################################### 14 | dirpath_in = "../0MOTIFS/MOTIFS_MSR/SL/" 15 | dirpath_out = "./out" 16 | #motifs in metacluster 0 - automate this step 17 | 18 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 19 | X=20 20 | #motifs in metacluster 1 - automate this step 21 | Y=10 22 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 23 | h5file <- H5Fopen(file.path( 24 | dirpath_in, 25 | FILE1), "H5F_ACC_RDONLY") 26 | h5ls(h5file) 27 | metacluster_group <- h5read(h5file, "metacluster_idx_to_submetacluster_results") 28 | # loop through the metaclusters 0 and 1 29 | for (i in c(0, 1)) { 30 | metacluster <- metacluster_group[[paste0("metacluster_", i)]] 31 | } 32 | ############################################## METACLUSTER_0 ###### 33 | # Initialize a list to store the results 34 | ls_list <- list() 35 | ls_list1 <- list() 36 | seqlets_all_mc0 <- data.frame() 37 | seqlets_all_mc1 <- data.frame() 38 | 39 | for (i in 0:19) { 40 | pattern_name <- paste0("pattern_", i) 41 | seqletls <- metacluster_group[["metacluster_0"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["seqlets_and_alnmts"]][["seqlets"]] 42 | ls <- as.data.frame(seqletls) 43 | ls_list[[pattern_name]] <- ls 44 | seqlets_i <- as.data.frame(ls_list[[paste0("pattern_", i)]][["seqletls"]]) 45 | colnames(seqlets_i) <- c("seqlets") 46 | seqlets_i$pattern <- paste0("pattern_", i) 47 | seqlets_all_mc0 <- rbind(seqlets_all_mc0, seqlets_i) 48 | } 49 | seqlets_all_mc0$metacluster <- c("metacluster_0") 50 | 51 | ############################################## METACLUSTER_1 ############# 52 | 53 | for (i in 0:9) { 54 | pattern_name <- paste0("pattern_", i) 55 | seqletls <- metacluster_group[["metacluster_1"]][["seqlets_to_patterns_result"]][["patterns"]][[pattern_name]][["seqlets_and_alnmts"]][["seqlets"]] 56 | ls <- as.data.frame(seqletls) 57 | ls_list[[pattern_name]] <- ls 58 | seqlets_i <- as.data.frame(ls_list[[paste0("pattern_", i)]][["seqletls"]]) 59 | colnames(seqlets_i) <- c("seqlets") 60 | seqlets_i$pattern <- paste0("pattern_", i) 61 | seqlets_all_mc1 <- rbind(seqlets_all_mc1, seqlets_i) 62 | } 63 | seqlets_all_mc1$metacluster <- c("metacluster_1") 64 | 65 | ############################################################################ ###### 66 | 67 | seqlet_mc01 <- rbind.data.frame(seqlets_all_mc0, seqlets_all_mc1) 68 | 69 | df <- seqlet_mc01 %>% 70 | mutate(example = NA, start = NA, end = NA, rc = NA) %>% 71 | separate(col = seqlets, into = c("example", "start", "end", "rc"), sep = "[,]") 72 | 73 | df$example <- gsub("example:", "", df$example) 74 | df$start <- gsub("start:", "", df$start) 75 | df$end <- gsub("end:", "", df$end) 76 | df$rc <- gsub("rc:", "", df$rc) 77 | 78 | 
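# illustrative example of the parsing above: a seqlet string such as
#   "example:5,start:1184,end:1205,rc:False"
# is split on commas and stripped of its prefixes, yielding
#   example = "5", start = "1184", end = "1205", rc = "False"
# (the fields remain character columns at this point)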
############################################################################ ###### 79 | write.csv(df, file = paste0(NAME0,SPEC,MODEL,".txt"), row.names = FALSE) 80 | -------------------------------------------------------------------------------- /moca_blue/ref_seq/blamm_meV1.0.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define variables 4 | project="Spen_ch01_0e3" 5 | pt_score=0.0001 6 | motifs_file="Soly-MSR_20230118_uu.jaspar" 7 | 8 | # 9 | # Start recording runtime and resource information 10 | start_time=$(date +%s) 11 | start_resources=$(ps -o pid,%cpu,%mem,vsz,rss,tty,stat,start_time --no-headers $$) 12 | 13 | 14 | # Run the commands with variables 15 | ./blamm dict sequences.mf 16 | ./blamm hist -e "$motifs_file" sequences.mf #-e generated empirical PWM scores 17 | ./blamm scan -rc -pt "$pt_score" "$motifs_file" sequences.mf 18 | 19 | # 20 | echo "CLEANING UP" 21 | 22 | mkdir occ$project 23 | mv hist_* occ$project/ 24 | mv occurrences.txt occ$project/ 25 | mv PWMthresholds.txt occ$project/ 26 | 27 | # 28 | # Stop recording runtime and resource information 29 | end_time=$(date +%s) 30 | end_resources=$(ps -o pid,%cpu,%mem,vsz,rss,tty,stat,start_time --no-headers $$) 31 | 32 | # Calculate runtime 33 | runtime=$((end_time - start_time)) 34 | # Print runtime and resource information 35 | echo "Script runtime: $runtime seconds" 36 | echo "Resource usage:" 37 | echo "$start_resources" | awk '{print "Start:", $0}' 38 | echo "$end_resources" | awk '{print "End:", $0}' 39 | -------------------------------------------------------------------------------- /moca_blue/ref_seq/extract_range_to_fasta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #It extracts the gene ranges from the GFF file and saves them to the extracted_ranges.txt file. 4 | #After that, it uses samtools to extract the corresponding sequences from the FASTA file (fasname) based on the ranges provided in output_file. 5 | #The extracted sequences are then saved to fasout. 6 | 7 | #20230628 Simon M. 
Zumkeller 8 | # Specify the integer value to subtract/add from feature start and end 9 | # ALL features are extracted from the leading strand 10 | flank_size=1000 11 | #################################################### 12 | filename="ITAG4.0_gene_models.gff" # Replace with the path to your GFF file 13 | fasname="S_lycopersicum_chromosomes.4.00.fa" # Replace with the path to your fas file 14 | #################################################### 15 | output_file="extracted_ranges.txt" 16 | ##################################################### 17 | fasout="${fasname%.*}_1kbp-flank.fa" 18 | 19 | 20 | 21 | while IFS=$'\t' read -r col1 col2 col3 col4 col5 col6 col7 col8 col9; do 22 | if [[ $col3 == *"gene"* ]]; then 23 | col4=$((col4 - flank_size)) 24 | col5=$((col5 + flank_size)) 25 | echo -e "$col1:$col4-$col5" 26 | fi 27 | done < "$filename" > "$output_file" 28 | 29 | samtools faidx "$fasname" -o "$fasout" -r "$output_file" --mark-strand rc 30 | -------------------------------------------------------------------------------- /moca_blue/ref_seq/split_file.sh: -------------------------------------------------------------------------------- 1 | 2 | 3 | project="Spen_ch01_0e2" 4 | inputFile="occurrences.txt" 5 | outputSize="1000000" 6 | 7 | # Start recording runtime and resource information 8 | start_time=$(date +%s) 9 | start_resources=$(ps -o pid,%cpu,%mem,vsz,rss,tty,stat,start_time --no-headers $$) 10 | 11 | split -l $outputSize --numeric-suffixes occ$project/$inputFile smallfile 12 | #mkdir occ$project 13 | mv smallfile* occ$project/ 14 | # 15 | # Stop recording runtime and resource information 16 | end_time=$(date +%s) 17 | end_resources=$(ps -o pid,%cpu,%mem,vsz,rss,tty,stat,start_time --no-headers $$) 18 | 19 | # Calculate runtime 20 | runtime=$((end_time - start_time)) 21 | # Print runtime and resource information 22 | echo "Script runtime: $runtime seconds" 23 | echo "Resource usage:" 24 | echo "$start_resources" | awk '{print "Start:", $0}' 25 | echo "$end_resources" | awk '{print "End:", $0}' 26 | -------------------------------------------------------------------------------- /model/create_generic_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pyranges as pr 3 | from pyfaidx import Fasta 4 | pd.options.display.width=0 5 | 6 | 7 | def gc(seq): 8 | seq = str(seq) 9 | gc_count = seq.count('G') + seq.count('C') 10 | return gc_count/len(seq) 11 | 12 | 13 | def cpg_perc(seq): 14 | cpg_counts = seq.count('CG') 15 | return cpg_counts/len(seq) * 100 16 | 17 | 18 | def get_proximal_prom_and_term_features(gtf, fasta, chroms, flank=1000): 19 | gene_models = pr.read_gtf(f'gene_models/{gtf}', as_df=True) 20 | gene_models = gene_models[gene_models['Chromosome'].isin(chroms)] 21 | fasta = Fasta(f'genomes/{fasta}', as_raw=False, sequence_always_upper=True, read_ahead=10000) 22 | gene_models = gene_models[gene_models['gene_biotype'] == 'protein_coding'] 23 | gene_models = gene_models[gene_models['Feature'] == 'gene'] 24 | gene_models = gene_models[['Chromosome', 'Start', 'End', 'Strand', 'gene_id']] 25 | chrom_num, gene_id, prom_gc, term_gc, prom_cpg, term_cpg = [], [], [], [], [], [] 26 | for chrom, start, end, strand, gene in gene_models.values: 27 | if strand == '-': 28 | prom_start, prom_end = end, end + flank 29 | term_start, term_end = start - flank, start 30 | term_start = 0 if term_start < 0 else term_start 31 | else: 32 | prom_start, prom_end = start - flank, start 33 | prom_start = 0 if prom_start < 0 else 
prom_start 34 | term_start, term_end = end, end + flank 35 | 36 | promoter = fasta[chrom][prom_start:prom_end] 37 | terminator = fasta[chrom][term_start:term_end] 38 | 39 | if strand == '-': 40 | promoter = promoter.reverse.complement.seq 41 | terminator = terminator.reverse.complement.seq 42 | else: 43 | promoter = promoter.seq 44 | terminator = terminator.seq 45 | gene_id.append(gene) 46 | chrom_num.append(chrom) 47 | prom_gc.append(gc(promoter)) 48 | term_gc.append(gc(terminator)) 49 | prom_cpg.append(cpg_perc(promoter)) 50 | term_cpg.append(cpg_perc(terminator)) 51 | 52 | return gene_id, chrom_num, prom_gc, prom_cpg, term_gc, term_cpg 53 | 54 | 55 | def get_utr_features(gtf, fasta, chroms): 56 | utr_length_5, gc_content_5, cpg_5, utr_length_3, gc_content_3, cpg_3, gene_ids = [], [], [], [], [], [], [] 57 | fasta = Fasta(f'genomes/{fasta}', as_raw=False, sequence_always_upper=True, read_ahead=10000) 58 | gene_models = pr.read_gtf(f'gene_models/{gtf}', as_df=True) 59 | gene_models = gene_models[gene_models['Chromosome'].isin(chroms)] 60 | gene_models = gene_models[gene_models['gene_biotype'] == 'protein_coding'] 61 | gene_models = gene_models[['Chromosome', 'Feature', 'Start', 'End', 'Strand', 'gene_id']] 62 | gene_models = gene_models[gene_models['Feature'].isin(['five_prime_utr', 'three_prime_utr'])] 63 | for gene in gene_models['gene_id'].unique(): 64 | gene_model_gene = gene_models.copy() 65 | gene_model_gene = gene_model_gene[gene_model_gene['gene_id'] == gene] 66 | gene_ids.append(gene) 67 | 68 | for utr_df in gene_model_gene.groupby('Feature'): 69 | # If no UTR is annotated for the gene 70 | # For genes with no UTR annotation we consider the UTR to be absent 71 | if utr_df[1].shape[0] == 0: 72 | if utr_df[0] == 'five_prime_utr': 73 | utr_length_5.append(0) 74 | gc_content_5.append(0) 75 | cpg_5.append(0) 76 | elif utr_df[0] == 'three_prime_utr': 77 | utr_length_3.append(0) 78 | gc_content_3.append(0) 79 | cpg_3.append(0) 80 | 81 | # If gene has a UTR annotated 82 | # some genes have more than one of a specific UTR annotation, so we take the longest 83 | else: 84 | utr_df_copy = utr_df[1] 85 | utr_df_copy['utr_lengths'] = utr_df_copy['End'] - utr_df_copy['Start'] 86 | utr_df_copy = utr_df_copy[utr_df_copy['utr_lengths'] == utr_df_copy['utr_lengths'].max()] 87 | chrom = utr_df_copy.Chromosome.values[0] 88 | utr_start = utr_df_copy.Start.values[0] 89 | utr_end = utr_df_copy.End.values[0] 90 | strand = utr_df_copy.Strand.values[0] 91 | utr_length = utr_end - utr_start 92 | utr_sequence = fasta[chrom][utr_start:utr_end] 93 | if strand == '-': 94 | utr_sequence = utr_sequence.reverse.complement.seq 95 | else: 96 | utr_sequence = utr_sequence.seq 97 | if utr_df[0] == 'five_prime_utr': 98 | utr_length_5.append(utr_length) 99 | gc_content_5.append(gc(utr_sequence)) 100 | cpg_5.append(cpg_perc(utr_sequence)) 101 | elif utr_df[0] == 'three_prime_utr': 102 | utr_length_3.append(utr_length) 103 | gc_content_3.append(gc(utr_sequence)) 104 | cpg_3.append(cpg_perc(utr_sequence)) 105 | return gene_ids, utr_length_5, gc_content_5, cpg_5, utr_length_3, gc_content_3, cpg_3 106 | 107 | 108 | gtfs = ['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.52.gtf', 'Solanum_lycopersicum.SL3.0.52.gtf', 109 | 'Arabidopsis_thaliana.TAIR10.52.gtf', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.52.gtf'] 110 | genomes = ['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.dna.toplevel.fa', 'Solanum_lycopersicum.SL3.0.dna.toplevel.fa', 111 | 'Arabidopsis_thaliana.TAIR10.dna.toplevel.fa', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa'] 112 | 
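# quick illustrative check of the helpers defined above:
#   gc('ATGC')         -> 0.5        (2 of 4 bases are G or C)
#   cpg_perc('ACGACG') -> 33.33...   (two 'CG' dinucleotides in 6 bases)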
num_chromosomes = [10, 12, 5, 10] 113 | 114 | for gtf_file, genome, num_chroms in zip(gtfs, genomes, num_chromosomes): 115 | chromosomes = [str(x) for x in range(1, num_chroms+1)] 116 | print(gtf_file) 117 | 118 | genes, utr_5, gc_5, cpg5, utr_3, gc_3, cpg3 = get_utr_features(gtf_file, genome, chromosomes) 119 | df_utr_feats = pd.DataFrame({ 120 | 'gene_id': genes, 121 | "5'UTR length": utr_5, 122 | "GC 5'UTR ": gc_5, 123 | "CpG 5'UTR": cpg5, 124 | "3'UTR length": utr_3, 125 | "GC 3'UTR ": gc_3, 126 | "CpG 3'UTR": cpg3, 127 | }) 128 | print(df_utr_feats.head()) 129 | 130 | g_ids, chrom_nums, p_gc, p_cpg, t_gc, t_cpg = get_proximal_prom_and_term_features(gtf_file, genome, chromosomes) 131 | df_prox_prom_term = pd.DataFrame({ 132 | 'gene_id': g_ids, 133 | 'Chromosome': chrom_nums, 134 | 'GC promoter': p_gc, 135 | 'CpG promoter': p_cpg, 136 | 'GC terminator': t_gc, 137 | 'CpG terminator': t_cpg 138 | }) 139 | print(df_prox_prom_term.head()) 140 | 141 | generated_features = df_prox_prom_term.merge(df_utr_feats, how='inner', on='gene_id') 142 | generated_features.to_csv(f"{gtf_file.split('_')[0]}_generated_features.csv") 143 | -------------------------------------------------------------------------------- /model/create_super_genome_gtf_tpm_for_msr.py: -------------------------------------------------------------------------------- 1 | import pyranges as pr 2 | import pandas as pd 3 | import numpy as np 4 | from Bio.SeqIO import SeqRecord 5 | from Bio import SeqIO 6 | 7 | print('Now generating zea_sol_ara_sor_52.gtf') 8 | gene_models = ['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.52.gtf', 'Solanum_lycopersicum.SL3.0.52.gtf', 9 | 'Arabidopsis_thaliana.TAIR10.52.gtf', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.52.gtf'] 10 | num_chromosomes = [10, 12, 5, 10] 11 | gtfs = [] 12 | for idx, specie in enumerate(gene_models): 13 | gm = pr.read_gtf(f'gene_models/{specie}', as_df=True)  # gm avoids shadowing the gene_models list being iterated 14 | gm = gm[gm['Feature'] == 'gene'] 15 | gm = gm[gm['gene_biotype'] == 'protein_coding'] 16 | gm = gm[['Chromosome', 'Start', 'End', 'Strand', 'gene_id']] 17 | gm = gm[gm['Chromosome'].isin([str(x) for x in range(1, num_chromosomes[idx]+1)])] 18 | gm['Chromosome'] = specie.split('_')[0] + '_' + gm['Chromosome'] 19 | gm['specie'] = [x.split('_')[0] for x in gm['Chromosome']] 20 | gtfs.append(gm) 21 | print(gm.head()) 22 | 23 | gtfs = pd.concat(gtfs) 24 | gtfs.to_csv('gene_models/zea_sol_ara_sor_52.gtf', sep='\t', index=False) 25 | 26 | # For tpm counts 27 | counts_root = ['zea_root_counts.csv', 'solanum_root_counts.csv', 'arabidopsis_root_counts.csv', 28 | 'sbicolor_root_counts.csv'] 29 | counts_leaf = ['zea_counts.csv', 'solanum_counts.csv', 'arabidopsis_counts.csv', 30 | 'sbicolor_counts.csv'] 31 | 32 | print('Now generating zea_sol_ara_sor_roots.csv and zea_sol_ara_sor.csv for root and leaf respectively') 33 | for tissue, tissue_counts in zip(['leaf', 'root'], [counts_leaf, counts_root]): 34 | df_counts = [] 35 | for count in tissue_counts: 36 | counts_df = pd.read_csv(f'tpm_counts/{count}') 37 | true_targets = [] 38 | print(count) 39 | print(np.percentile(counts_df['logMaxTPM'], 25)) 40 | print(np.percentile(counts_df['logMaxTPM'], 75)) 41 | for log_count in counts_df['logMaxTPM'].values: 42 | if log_count <= np.percentile(counts_df['logMaxTPM'], 25): 43 | true_targets.append(0) 44 | elif log_count >= np.percentile(counts_df['logMaxTPM'], 75): 45 | true_targets.append(1)
46 | else: 47 | true_targets.append(2) 48 | counts_df['true_target'] = true_targets 49 | counts_df = counts_df[counts_df['true_target'].isin([0, 1])] 50 | df_counts.append(counts_df[['gene_id', 'true_target']]) 51 | print(counts_df.head()) 52 | 53 | df_counts = pd.concat(df_counts) 54 | print(df_counts.head()) 55 | if tissue == 'root': 56 | df_counts.to_csv('tpm_counts/zea_sol_ara_sor_roots.csv', index=False) 57 | else: 58 | df_counts.to_csv('tpm_counts/zea_sol_ara_sor.csv', index=False) 59 | 60 | print('Now generating zea_sol_ara_sor_dna.fa') 61 | genomes = ['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.dna.toplevel.fa', 'Solanum_lycopersicum.SL3.0.dna.toplevel.fa', 62 | 'Arabidopsis_thaliana.TAIR10.dna.toplevel.fa', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa'] 63 | 64 | 65 | records = [] 66 | for genome in genomes: 67 | for rec in SeqIO.parse(f'genomes/{genome}', format='fasta'): 68 | print(f"{genome.split('_')[0]}_"+rec.id) 69 | print(f"{genome.split('_')[0]}_"+rec.description) 70 | records.append(SeqRecord(seq=rec.seq, id=f"{genome.split('_')[0]}_"+rec.id, 71 | description=f"{genome.split('_')[0]}_"+rec.description)) 72 | 73 | SeqIO.write(records, handle='genomes/zea_sol_ara_sor_dna.fa', format='fasta') -------------------------------------------------------------------------------- /model/cross_specie_test_leaf.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras import models 2 | import os 3 | import pandas as pd 4 | from utils import prepare_valid_seqs 5 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 6 | 7 | mapped_read_counts = ['zea_counts.csv', 'solanum_counts.csv', 'arabidopsis_counts.csv', 'sbicolor_counts.csv'] 8 | gene_models = ['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.52.gtf', 'Solanum_lycopersicum.SL3.0.52.gtf', 9 | 'Arabidopsis_thaliana.TAIR10.52.gtf', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.52.gtf'] 10 | genomes = ['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.dna.toplevel.fa', 'Solanum_lycopersicum.SL3.0.dna.toplevel.fa', 11 | 'Arabidopsis_thaliana.TAIR10.dna.toplevel.fa', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa'] 12 | chrom_nums = {'Arabidopsis': 5, 'Solanum': 12, 'Zea': 10, 'Sorghum': 10} 13 | 14 | cross_specie_results = [] 15 | for tpm, gene_model, genome in zip(mapped_read_counts, gene_models, genomes): 16 | chroms = [str(x) for x in range(1, chrom_nums[genome.split('_')[0]]+1)] 17 | x_val, y_val, _ = prepare_valid_seqs(genome, gene_model, tpm, chroms) 18 | for model in os.listdir('saved_models'): 19 | if model.split('_')[0] != tpm.split('_')[0] and model.endswith('terminator.h5'): 20 | loaded_model = models.load_model(f'saved_models/{model}') 21 | evaluation = loaded_model.evaluate(x_val, y_val) 22 | print(genome.split('_')[0], model.split('_')[0]) 23 | cross_specie_results.append([model.split('_')[0], genome.split('_')[0], evaluation[-1]]) 24 | 25 | cross_specie_results = pd.DataFrame(cross_specie_results, 26 | columns=['train_specie', 'cross_specie', 'accuracy']) 27 | print(cross_specie_results.head()) 28 | cross_specie_results.to_csv('../results/cross_specie_leaf_result.csv', index=False) 29 | 30 | -------------------------------------------------------------------------------- /model/cross_specie_test_root.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras import models 2 | import os 3 | import pandas as pd 4 | from utils import prepare_valid_seqs 5 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 6 | 7 | mapped_read_counts = 
['zea_root_counts.csv', 'solanum_root_counts.csv', 'arabidopsis_root_counts.csv', 8 | 'sbicolor_root_counts.csv'] 9 | gene_models = ['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.52.gtf', 'Solanum_lycopersicum.SL3.0.52.gtf', 10 | 'Arabidopsis_thaliana.TAIR10.52.gtf', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.52.gtf'] 11 | genomes = ['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.dna.toplevel.fa', 'Solanum_lycopersicum.SL3.0.dna.toplevel.fa', 12 | 'Arabidopsis_thaliana.TAIR10.dna.toplevel.fa', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa'] 13 | chrom_nums = {'Arabidopsis': 5, 'Solanum': 12, 'Zea': 10, 'Sorghum': 10} 14 | 15 | cross_specie_results = [] 16 | for tpm, gene_model, genome in zip(mapped_read_counts, gene_models, genomes): 17 | chroms = [str(x) for x in range(1, chrom_nums[genome.split('_')[0]]+1)] 18 | x_val, y_val, _ = prepare_valid_seqs(genome, gene_model, tpm, chroms) 19 | for model in os.listdir('saved_models'): 20 | if model.split('_')[0] != tpm.split('_')[0] and model.endswith('terminatorroot.h5'): 21 | print(model) 22 | loaded_model = models.load_model(f'saved_models/{model}') 23 | evaluation = loaded_model.evaluate(x_val, y_val) 24 | print(genome.split('_')[0], model.split('_')[0]) 25 | cross_specie_results.append([model.split('_')[0], genome.split('_')[0], evaluation[-1]]) 26 | 27 | cross_specie_results = pd.DataFrame(cross_specie_results, 28 | columns=['train_specie', 'cross_specie', 'accuracy']) 29 | print(cross_specie_results.head()) 30 | cross_specie_results.to_csv('../results/cross_specie_root_result.csv', index=False) -------------------------------------------------------------------------------- /model/effect_of_different_outer_sizes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | from utils import FastaSequenceLoader, ConvNetwork 5 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 6 | mapped_read_counts = ['zea_counts.csv', 'solanum_counts.csv', 'arabidopsis_counts.csv', 'sbicolor_counts.csv'] 7 | gene_models = ['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.52.gtf', 'Solanum_lycopersicum.SL3.0.52.gtf', 8 | 'Arabidopsis_thaliana.TAIR10.52.gtf', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.52.gtf'] 9 | genomes = ['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.dna.toplevel.fa', 'Solanum_lycopersicum.SL3.0.dna.toplevel.fa', 10 | 'Arabidopsis_thaliana.TAIR10.dna.toplevel.fa', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa'] 11 | pickle_keys = ['zea', 'sol', 'ara', 'sor'] 12 | num_chromosomes = [10, 12, 5, 10] 13 | 14 | if not os.path.isdir('../results'): 15 | os.mkdir('../results') 16 | if not os.path.isdir('saved_models/size_effect'): 17 | os.mkdir('saved_models/size_effect') 18 | 19 | for m_reads, gene_model, genome, num_chr, p_key in zip(mapped_read_counts, gene_models, genomes, num_chromosomes, 20 | pickle_keys): 21 | for outer_flank in [500, 1500, 2000, 2500, 3000]: 22 | print('-----------------------------------------------------------------------------\n') 23 | print(f"Plant: {m_reads.split('_')[0]} Outer flank: {outer_flank}nt inner flank: 500nt") 24 | print('-----------------------------------------------------------------------------\n') 25 | 26 | if not os.path.exists(f"../results/{m_reads.split('_')[0]}_{outer_flank}_outer_result.csv"): 27 | final_training_output = [] 28 | tpm_counts = pd.read_csv(f'tpm_counts/{m_reads}', index_col=0) 29 | true_targets = [] 30 | 31 | for log_count in tpm_counts['logMaxTPM'].values: 32 | if log_count <= np.percentile(tpm_counts['logMaxTPM'], 25): 33 | 
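# quartile-based labelling: genes in the bottom quartile of logMaxTPM become
# class 0 (low expression), the top quartile class 1 (high expression), and
# everything in between class 2 (intermediate)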
true_targets.append(0) 34 | elif log_count >= np.percentile(tpm_counts['logMaxTPM'], 75): 35 | true_targets.append(1) 36 | else: 37 | true_targets.append(2) 38 | tpm_counts['true_target'] = true_targets 39 | print(tpm_counts.head()) 40 | 41 | for val_chromosome in np.arange(1, num_chr+1): 42 | fastaloader = FastaSequenceLoader(f'genomes/{genome}', f'gene_models/{gene_model}', 43 | val_chromosome, pickled_val_ids='validation_genes.pickle', 44 | pickled_key=p_key, upstream=outer_flank, downstream=500) 45 | enc_train, enc_val, train_ids, val_ids = fastaloader.extract_seq() 46 | convnet = ConvNetwork(enc_train, enc_val, train_ids, val_ids, val_chromosome, tpm_counts, 47 | m_reads.split('_')[0], 'promoter_terminator', outer_flank=outer_flank, 48 | size_effect=True) 49 | val_acc, val_auroc, specie_name, _, _ = convnet.train_network() 50 | final_training_output.append([val_acc, val_auroc, specie_name, outer_flank]) 51 | 52 | final_training_output = pd.DataFrame(final_training_output, columns=['val_acc', 'val_auROC', 'plant', 53 | 'size']) 54 | final_training_output.to_csv(f"../results/{m_reads.split('_')[0]}_{outer_flank}_outer_result.csv", 55 | index=False) 56 | -------------------------------------------------------------------------------- /model/effect_of_different_utr_sizes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | from utils import FastaSequenceLoader, ConvNetwork 5 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 6 | mapped_read_counts = ['zea_counts.csv', 'solanum_counts.csv', 'arabidopsis_counts.csv', 'sbicolor_counts.csv'] 7 | gene_models = ['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.52.gtf', 'Solanum_lycopersicum.SL3.0.52.gtf', 8 | 'Arabidopsis_thaliana.TAIR10.52.gtf', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.52.gtf'] 9 | genomes = ['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.dna.toplevel.fa', 'Solanum_lycopersicum.SL3.0.dna.toplevel.fa', 10 | 'Arabidopsis_thaliana.TAIR10.dna.toplevel.fa', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa'] 11 | pickle_keys = ['zea', 'sol', 'ara', 'sor'] 12 | num_chromosomes = [10, 12, 5, 10] 13 | 14 | if not os.path.isdir('../results'): 15 | os.mkdir('../results') 16 | if not os.path.isdir('saved_models/size_effect'): 17 | os.mkdir('saved_models/size_effect') 18 | 19 | for m_reads, gene_model, genome, num_chr, p_key in zip(mapped_read_counts, gene_models, genomes, num_chromosomes, 20 | pickle_keys): 21 | for inner_flank in [100, 300, 700]: 22 | print('-----------------------------------------------------------------------------\n') 23 | print(f"Plant: {m_reads.split('_')[0]} Outer flank: 1000nt inner flank: {inner_flank}nt") 24 | print('-----------------------------------------------------------------------------\n') 25 | 26 | if not os.path.exists(f"../results/{m_reads.split('_')[0]}_{inner_flank}_utr_result.csv"): 27 | final_training_output = [] 28 | tpm_counts = pd.read_csv(f'tpm_counts/{m_reads}', index_col=0) 29 | true_targets = [] 30 | 31 | for log_count in tpm_counts['logMaxTPM'].values: 32 | if log_count <= np.percentile(tpm_counts['logMaxTPM'], 25): 33 | true_targets.append(0) 34 | elif log_count >= np.percentile(tpm_counts['logMaxTPM'], 75): 35 | true_targets.append(1) 36 | else: 37 | true_targets.append(2) 38 | tpm_counts['true_target'] = true_targets 39 | print(tpm_counts.head()) 40 | 41 | for val_chromosome in np.arange(1, num_chr+1): 42 | fastaloader = FastaSequenceLoader(f'genomes/{genome}', f'gene_models/{gene_model}', 43 | val_chromosome, 
pickled_val_ids='validation_genes.pickle', 44 | pickled_key=p_key, upstream=1000, downstream=inner_flank) 45 | enc_train, enc_val, train_ids, val_ids = fastaloader.extract_seq() 46 | convnet = ConvNetwork(enc_train, enc_val, train_ids, val_ids, val_chromosome, tpm_counts, 47 | m_reads.split('_')[0], 'promoter_terminator', inner_flank=inner_flank, 48 | size_effect=True) 49 | val_acc, val_auroc, specie_name, _, _ = convnet.train_network() 50 | final_training_output.append([val_acc, val_auroc, specie_name, inner_flank]) 51 | 52 | final_training_output = pd.DataFrame(final_training_output, columns=['val_acc', 'val_auROC', 'plant', 53 | 'size']) 54 | final_training_output.to_csv(f"../results/{m_reads.split('_')[0]}_{inner_flank}_utr_result.csv", 55 | index=False) -------------------------------------------------------------------------------- /model/extract_motifs_msr.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import numpy as np 4 | import pandas as pd 5 | from modisco.visualization.viz_sequence import plot_weights 6 | 7 | # ------------- Extracting motifs from modisco results for visualisation----------------# 8 | if not os.path.exists('../figures/MOTIFS_MSR/'): 9 | os.mkdir('../figures/MOTIFS_MSR/') 10 | 11 | 12 | def get_predictive_pwms(mod_file, specie): 13 | print(specie) 14 | cwm = [] 15 | motif_id = [] 16 | strand = [] 17 | metacluster_id = [] 18 | n_motif_seqlets = [] 19 | 20 | f = h5py.File(mod_file, "r") 21 | for metacluster_idx, metacluster_key in enumerate(f["metacluster_idx_to_submetacluster_results"]): 22 | print(metacluster_idx, metacluster_key) 23 | metacluster = f["metacluster_idx_to_submetacluster_results"][metacluster_key] 24 | patterns = metacluster['seqlets_to_patterns_result']['patterns'] 25 | 26 | for pattern_idx, pattern_name in enumerate(patterns['all_pattern_names']): 27 | pattern = patterns[pattern_name.decode()] 28 | pattern_seqlets = pattern["seqlets_and_alnmts"]["seqlets"] 29 | # add motif 30 | nfcwm = np.absolute(pattern["task0_contrib_scores"]["fwd"][:]) 31 | nfcwm = len(pattern_seqlets) * (nfcwm / np.max(nfcwm.flat)) 32 | cwm.append(nfcwm) 33 | motif_id.append(pattern_name.decode()) 34 | metacluster_id.append(metacluster_key) 35 | n_motif_seqlets.append(len(pattern_seqlets)) 36 | strand.append('fwd') 37 | save_fwd = f'../figures/MOTIFS_MSR/{specie}/{metacluster_key}_{pattern_name.decode()}_fwd.png' 38 | plot_weights(pattern["task0_contrib_scores"]["fwd"][:], save_fwd) 39 | 40 | # add motif reverse 41 | nrcwm = np.absolute(pattern["task0_contrib_scores"]["rev"][:]) 42 | nrcwm = len(pattern_seqlets) * (nrcwm / np.max(nrcwm.flat)) 43 | cwm.append(nrcwm) 44 | motif_id.append(pattern_name.decode()) 45 | metacluster_id.append(metacluster_key) 46 | n_motif_seqlets.append(len(pattern_seqlets)) 47 | strand.append('rev') 48 | save_fwd = f'../figures/MOTIFS_MSR/{specie}/{metacluster_key}_{pattern_name.decode()}_rev.png' 49 | plot_weights(pattern["task0_contrib_scores"]["rev"][:], save_fwd) 50 | 51 | # Save PFMs in H5 file 52 | cwm = np.array(cwm) 53 | motif_save = f'../figures/MOTIFS_MSR/{specie}/motifs.h5' 54 | os.system(f'rm -rf {motif_save}') 55 | h = h5py.File(motif_save, 'w') 56 | h.create_dataset('CWMs', data=cwm) 57 | h.close() 58 | meta_info = pd.DataFrame([motif_id, strand, metacluster_id, n_motif_seqlets]).T 59 | meta_info.columns = ['motifID', 'strand', 'metacluster', 'n_seqlets'] 60 | meta_info.to_csv(f'../figures/MOTIFS_MSR/{specie}/meta_info.csv', sep='\t', index=False) 61 | 62 | 63 | 
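# note on the scaling inside get_predictive_pwms above: each contribution-weight
# matrix is normalised by its maximum absolute value and then multiplied by the
# motif's seqlet count, so well-supported motifs carry proportionally larger
# weights in the saved CWM array. Illustrative call, matching the loop below:
#   get_predictive_pwms('modisco_multi/Arabidopsis_modisco.hdf5', 'AT')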
modisco_feats = ['Arabidopsis_modisco.hdf5', 'Zea_modisco.hdf5', 'Solanum_modisco.hdf5', 'Sorghum_modisco.hdf5'] 64 | species = ['AT', 'ZM', 'SL', 'SB'] 65 | 66 | 67 | for feats, sp in zip(modisco_feats, species): 68 | if not os.path.exists(f'../figures/MOTIFS_MSR/{sp}'): 69 | os.mkdir(f'../figures/MOTIFS_MSR/{sp}') 70 | feats_path = f'modisco_multi/{feats}' 71 | get_predictive_pwms(feats_path, sp) 72 | -------------------------------------------------------------------------------- /model/extract_motifs_ssr.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import numpy as np 4 | import pandas as pd 5 | from modisco.visualization.viz_sequence import plot_weights 6 | 7 | # ------------- Extracting motifs from modisco results for visualisation----------------# 8 | if not os.path.exists('../figures/MOTIFS/'): 9 | os.mkdir('../figures/MOTIFS/') 10 | 11 | 12 | def get_predictive_pwms(mod_file, specie): 13 | print(specie) 14 | cwm = [] 15 | motif_id = [] 16 | strand = [] 17 | metacluster_id = [] 18 | n_motif_seqlets = [] 19 | 20 | f = h5py.File(mod_file, "r") 21 | for metacluster_idx, metacluster_key in enumerate(f["metacluster_idx_to_submetacluster_results"]): 22 | metacluster = f["metacluster_idx_to_submetacluster_results"][metacluster_key] 23 | patterns = metacluster['seqlets_to_patterns_result']['patterns'] 24 | print(metacluster_idx, metacluster_key, len(patterns['all_pattern_names'])) 25 | 26 | for pattern_idx, pattern_name in enumerate(patterns['all_pattern_names']): 27 | pattern = patterns[pattern_name.decode()] 28 | pattern_seqlets = pattern["seqlets_and_alnmts"]["seqlets"] 29 | # add motif 30 | nfcwm = np.absolute(pattern["task0_contrib_scores"]["fwd"][:]) 31 | nfcwm = len(pattern_seqlets) * (nfcwm / np.max(nfcwm.flat)) 32 | cwm.append(nfcwm) 33 | motif_id.append(pattern_name.decode()) 34 | metacluster_id.append(metacluster_key) 35 | n_motif_seqlets.append(len(pattern_seqlets)) 36 | strand.append('fwd') 37 | save_fwd = f'../figures/MOTIFS/{specie}/{metacluster_key}_{pattern_name.decode()}_fwd.png' 38 | plot_weights(pattern["task0_contrib_scores"]["fwd"][:], save_fwd) 39 | 40 | 41 | # add motif reverse 42 | nrcwm = np.absolute(pattern["task0_contrib_scores"]["rev"][:]) 43 | nrcwm = len(pattern_seqlets) * (nrcwm / np.max(nrcwm.flat)) 44 | cwm.append(nrcwm) 45 | motif_id.append(pattern_name.decode()) 46 | metacluster_id.append(metacluster_key) 47 | n_motif_seqlets.append(len(pattern_seqlets)) 48 | strand.append('rev') 49 | save_fwd = f'../figures/MOTIFS/{specie}/{metacluster_key}_{pattern_name.decode()}_rev.png' 50 | plot_weights(pattern["task0_contrib_scores"]["rev"][:], save_fwd) 51 | 52 | # Save PFMs in H5 file 53 | cwm = np.array(cwm) 54 | motif_save = f'../figures/MOTIFS/{specie}/motifs.h5' 55 | os.system(f'rm -rf {motif_save}') 56 | h = h5py.File(motif_save, 'w') 57 | h.create_dataset('CWMs', data=cwm) 58 | h.close() 59 | meta_info = pd.DataFrame([motif_id, strand, metacluster_id, n_motif_seqlets]).T 60 | meta_info.columns = ['motifID', 'strand', 'metacluster', 'n_seqlets'] 61 | meta_info.to_csv(f'../figures/MOTIFS/{specie}/meta_info.csv', sep='\t', index=False) 62 | 63 | 64 | modisco_feats = ['arabidopsis_modisco.hdf5', 'zea_modisco.hdf5', 'solanum_modisco.hdf5', 'sbicolor_modisco.hdf5'] 65 | # For root, use 66 | # modisco_feats = ['arabidopsis_modisco_root.hdf5', 'zea_modisco_root.hdf5', 'solanum_modisco_root.hdf5', 'sbicolor_modisco_root.hdf5'] 67 | 68 | species = ['AT', 'ZM', 'SL', 'SB'] 69 | 70 | for feats, sp 
in zip(modisco_feats, species): 71 | if not os.path.exists(f'../figures/MOTIFS/{sp}'): 72 | os.mkdir(f'../figures/MOTIFS/{sp}') 73 | feats_path = f'modisco/{feats}' 74 | get_predictive_pwms(feats_path, sp) 75 | -------------------------------------------------------------------------------- /model/fetch_genomes_and_annotation.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # creating the required directories 4 | echo "Creating required directories" 5 | mkdir genomes 6 | mkdir gene_models 7 | mkdir tpm_counts 8 | 9 | # Downloading reference genomes from Ensembl plants 10 | echo "Downloading reference genomes" 11 | wget https://ftp.ebi.ac.uk/ensemblgenomes/pub/release-52/plants/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz 12 | wget https://ftp.ebi.ac.uk/ensemblgenomes/pub/release-52/plants/fasta/solanum_lycopersicum/dna/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.gz 13 | wget https://ftp.ebi.ac.uk/ensemblgenomes/pub/release-52/plants/fasta/sorghum_bicolor/dna/Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa.gz 14 | wget https://ftp.ebi.ac.uk/ensemblgenomes/pub/release-52/plants/fasta/zea_mays/dna/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.dna.toplevel.fa.gz 15 | 16 | # Downloading corresponding annotation files in gtf format from Ensembl plants 17 | echo "Downloading reference annotations" 18 | wget https://ftp.ebi.ac.uk/ensemblgenomes/pub/release-52/plants/gtf/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.52.gtf.gz 19 | wget https://ftp.ebi.ac.uk/ensemblgenomes/pub/release-52/plants/gtf/solanum_lycopersicum/Solanum_lycopersicum.SL3.0.52.gtf.gz 20 | wget https://ftp.ebi.ac.uk/ensemblgenomes/pub/release-52/plants/gtf/sorghum_bicolor/Sorghum_bicolor.Sorghum_bicolor_NCBIv3.52.gtf.gz 21 | wget https://ftp.ebi.ac.uk/ensemblgenomes/pub/release-52/plants/gtf/zea_mays/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.52.gtf.gz 22 | 23 | gunzip *.gz 24 | mv *.gtf gene_models 25 | mv *toplevel.fa genomes 26 | 27 | -------------------------------------------------------------------------------- /model/motif_discovery_msr_leaf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from utils import onehot 3 | from pyfaidx import Fasta 4 | from tensorflow.keras import models 5 | from tensorflow.keras import backend 6 | import os 7 | import numpy as np 8 | import deeplift 9 | from deeplift.util import get_shuffle_seq_ref_function 10 | from deeplift.dinuc_shuffle import dinuc_shuffle 11 | from deeplift.conversion import kerasapi_conversion as kc 12 | from sklearn.metrics import accuracy_score 13 | from deeplift.util import get_hypothetical_contribs_func_onehot 14 | import h5py 15 | import modisco 16 | from importlib import reload 17 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 18 | backend.clear_session() 19 | if not os.path.exists('modisco_multi'): 20 | os.mkdir('modisco_multi') 21 | 22 | 23 | def compute_actual_and_hypothetical_scores(x_val, y_val, specie, chrom): 24 | saved_name = f'saved_models/multi_specie_model_{specie}_{chrom}.h5' 25 | loaded_model = models.load_model(saved_name) 26 | predicted_class = loaded_model.predict(x_val) 27 | predicted_class = predicted_class > 0.5 28 | print(f'{specie} Accuracy {accuracy_score(y_val, predicted_class)}') 29 | 30 | x = [] 31 | for idx, seq in enumerate(x_val): 32 | if predicted_class[idx] == 1 and y_val[idx] == 1: 33 | x.append(seq) 34 | elif predicted_class[idx] == 0 and y_val[idx] == 0: 35 | x.append(seq) 36 | 37 | x 
= np.array(x) 38 | 39 | print(f'Number of correct predictions {x.shape[0]}') 40 | # ---------- Computing importance and hypothetical scores-------------------------------------------# 41 | deeplift_model = kc.convert_model_from_saved_files(saved_name, 42 | nonlinear_mxts_mode=deeplift.layers.NonlinearMxtsMode.DeepLIFT_GenomicsDefault) 43 | 44 | deeplift_contribs_func = deeplift_model.get_target_contribs_func(find_scores_layer_idx=0, 45 | target_layer_idx=-2) 46 | 47 | contribs_many_refs_func = get_shuffle_seq_ref_function( 48 | score_computation_function=deeplift_contribs_func, 49 | shuffle_func=dinuc_shuffle) 50 | 51 | multipliers_func = deeplift_model.get_target_multipliers_func(find_scores_layer_idx=0, 52 | target_layer_idx=-2) 53 | hypothetical_contribs_func = get_hypothetical_contribs_func_onehot(multipliers_func) 54 | 55 | # Once again, we rely on multiple shuffled references 56 | hypothetical_contribs_many_refs_func = get_shuffle_seq_ref_function( 57 | score_computation_function=hypothetical_contribs_func, 58 | shuffle_func=dinuc_shuffle) 59 | 60 | actual_scores = np.squeeze(np.sum(contribs_many_refs_func(task_idx=0, 61 | input_data_sequences=x, 62 | num_refs_per_seq=10, 63 | batch_size=50, 64 | progress_update=4000), axis=2))[:, :, None] * x 65 | 66 | hyp_scores = hypothetical_contribs_many_refs_func(task_idx=0, 67 | input_data_sequences=x, 68 | num_refs_per_seq=10, 69 | batch_size=50, 70 | progress_update=4000) 71 | 72 | return x, actual_scores, hyp_scores 73 | 74 | 75 | def run_modisco(specie): 76 | save_file = f"modisco_multi/{specie}_modisco.hdf5" 77 | os.system(f'rm -rf {save_file}') 78 | 79 | h5_data = h5py.File(f'modisco_multi/{specie}_scores.h5', 'r') 80 | contribution_scores = h5_data.get('contrib_scores') 81 | hypothetical_scores = h5_data.get('hypothetical_scores') 82 | one_hots = h5_data.get('one_hots') 83 | 84 | print('contributions', contribution_scores.shape) 85 | print('hypothetical contributions', hypothetical_scores.shape) 86 | print('correct predictions', one_hots.shape) 87 | # -----------------------Running modisco----------------------------------------------# 88 | # Uncomment to refresh modules for when tweaking code during development: 89 | reload(modisco.util) 90 | reload(modisco.pattern_filterer) 91 | reload(modisco.aggregator) 92 | reload(modisco.core) 93 | reload(modisco.seqlet_embedding.advanced_gapped_kmer) 94 | reload(modisco.affinitymat.transformers) 95 | reload(modisco.affinitymat.core) 96 | reload(modisco.affinitymat) 97 | reload(modisco.cluster.core) 98 | reload(modisco.cluster) 99 | reload(modisco.tfmodisco_workflow.seqlets_to_patterns) 100 | reload(modisco.tfmodisco_workflow) 101 | reload(modisco) 102 | 103 | null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(num_to_samp=5000) 104 | tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow( 105 | # Slight modifications from the default settings 106 | sliding_window_size=21, 107 | flank_size=5, 108 | target_seqlet_fdr=0.01, 109 | seqlets_to_patterns_factory=modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory( 110 | # Note: as of version 0.5.6.0, it's possible to use the results of a motif discovery 111 | # software like MEME to improve the TF-MoDISco clustering. 
To use the meme-based 112 | # initialization, you would specify the initclusterer_factory as shown in the 113 | # commented-out code below: 114 | # initclusterer_factory=modisco.clusterinit.memeinit.MemeInitClustererFactory( 115 | # meme_command="meme", base_outdir="meme_out", 116 | # max_num_seqlets_to_use=10000, nmotifs=10, n_jobs=1), 117 | trim_to_window_size=10, 118 | initial_flank_to_add=2, 119 | final_flank_to_add=0, 120 | final_min_cluster_size=60, 121 | # use_pynnd=True can be used for faster nn comp at coarse grained step 122 | # (it will use pynndescent), but note that pynndescent may crash 123 | # use_pynnd=True, 124 | n_cores=40) 125 | )( 126 | task_names=['task0'], 127 | contrib_scores={'task0': contribution_scores}, 128 | hypothetical_contribs={'task0': hypothetical_scores}, 129 | one_hot=one_hots, 130 | null_per_pos_scores=null_per_pos_scores) 131 | 132 | reload(modisco.util) 133 | grp = h5py.File(save_file, "w") 134 | tfmodisco_results.save_hdf5(grp) 135 | grp.close() 136 | 137 | 138 | def encode_seq(gene_models, fasta, labels, upstream=1000, downstream=500): 139 | encoded_seqs, true_labels = [], [] 140 | for chrom, start, end, strand, gene_id, _ in gene_models.values: 141 | if gene_id in labels.index: 142 | if strand == '+': 143 | prom_start, prom_end = start - upstream, start + downstream 144 | term_start, term_end = end - downstream, end + upstream 145 | if prom_start > 0 and term_start > 0: 146 | seq = str(fasta[chrom][prom_start:prom_end]) + 'N'*20 + str(fasta[chrom][term_start:term_end]) 147 | 148 | if len(seq) == 2 * (upstream + downstream) + 20: 149 | encoded_seqs.append(onehot(seq)) 150 | true_labels.append(labels.loc[gene_id, 'true_target']) 151 | 152 | else: 153 | prom_start, prom_end = end - downstream, end + upstream 154 | term_start, term_end = start - upstream, start + downstream 155 | if prom_start > 0 and term_start > 0: 156 | seq = str(fasta[chrom][prom_start:prom_end].reverse.complement) + 'N'*20 +\ 157 | str(fasta[chrom][term_start:term_end].reverse.complement) 158 | 159 | if len(seq) == 2 * (upstream + downstream) + 20: 160 | encoded_seqs.append(onehot(seq)) 161 | true_labels.append(labels.loc[gene_id, 'true_target']) 162 | 163 | encoded_seqs = np.array(encoded_seqs) 164 | true_labels = np.array(true_labels) 165 | 166 | return encoded_seqs, true_labels 167 | 168 | 169 | gene_model = pd.read_csv('gene_models/zea_sol_ara_sor_52.gtf', sep='\t') 170 | gene_model['specie'] = [x.split('_')[0] for x in gene_model['Chromosome']] 171 | genes_labels = pd.read_csv('tpm_counts/zea_sol_ara_sor.csv', sep='\t', index_col=0) 172 | genome = Fasta('genomes/zea_sol_ara_sor_dna.fa', as_raw=False, sequence_always_upper=True, read_ahead=10000) 173 | print(gene_model.head()) 174 | 175 | plant_species = ['Arabidopsis', 'Zea', 'Sorghum', 'Solanum'] 176 | for specie in plant_species: 177 | if not os.path.exists(f"modisco_multi/{specie}_modisco.hdf5"): 178 | final_results = [] 179 | print(f'Testing on {specie}') 180 | test_specie = gene_model.copy() 181 | test_specie = test_specie[test_specie['specie'] == specie] 182 | 183 | actual_scores_all, hypothetical_scores_all, onehot_all = [], [], [] 184 | for chrom_num in test_specie['Chromosome'].unique(): 185 | test_specie_chrom = test_specie.copy() 186 | test_specie_chrom = test_specie_chrom[test_specie_chrom['Chromosome'] == chrom_num] 187 | test_seqs, test_targets = encode_seq(test_specie_chrom, genome, genes_labels) 188 | 189 | print(f'Running Deeplift {chrom_num}') 190 | encoded_x, act_imp, hyp_imp = 
compute_actual_and_hypothetical_scores(test_seqs, test_targets, specie, 191 | chrom_num) 192 | actual_scores_all.append(act_imp) 193 | hypothetical_scores_all.append(hyp_imp) 194 | onehot_all.append(encoded_x) 195 | 196 | # Save scores in h5 format 197 | if os.path.isfile(f'modisco_multi/{specie}_scores.h5'): 198 | os.system(f'rm -rf modisco_multi/{specie}_scores.h5') 199 | 200 | actual_scores_all = np.concatenate(actual_scores_all, axis=0) 201 | hypothetical_scores_all = np.concatenate(hypothetical_scores_all, axis=0) 202 | onehot_all = np.concatenate(onehot_all, axis=0) 203 | 204 | h = h5py.File(f'modisco_multi/{specie}_scores.h5', 'w') 205 | h.create_dataset('contrib_scores', data=actual_scores_all) 206 | h.create_dataset('hypothetical_scores', data=hypothetical_scores_all) 207 | h.create_dataset('one_hots', data=onehot_all) 208 | h.close() 209 | 210 | print('Running modisco') 211 | run_modisco(specie) 212 | -------------------------------------------------------------------------------- /model/motif_discovery_ssr_leaf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from tensorflow.keras import models 3 | from tensorflow.keras import backend 4 | import os 5 | import numpy as np 6 | import deeplift 7 | from deeplift.util import get_shuffle_seq_ref_function 8 | from deeplift.dinuc_shuffle import dinuc_shuffle 9 | from deeplift.conversion import kerasapi_conversion as kc 10 | from utils import prepare_valid_seqs 11 | from sklearn.metrics import accuracy_score 12 | from deeplift.util import get_hypothetical_contribs_func_onehot 13 | import h5py 14 | import modisco 15 | from importlib import reload 16 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 17 | backend.clear_session() 18 | if not os.path.exists('modisco'): 19 | os.mkdir('modisco') 20 | 21 | 22 | def compute_actual_and_hypothetical_scores(fasta, gtf, tpms, specie): 23 | actual_scores_all, hypothetical_scores_all, onehot_all = [], [], [] 24 | for saved_model in os.listdir('saved_models'): 25 | if saved_model.startswith(specie) and saved_model.endswith('terminator.h5'): 26 | print(saved_model) 27 | val_chrom = saved_model.split('_')[2] 28 | x_val, y_val, genes_val = prepare_valid_seqs(fasta, gtf, tpms, val_chrom, pkey=False) 29 | 30 | saved_name = f'saved_models/{saved_model}' 31 | loaded_model = models.load_model(saved_name) 32 | predicted_prob = loaded_model.predict(x_val) 33 | predicted_class = predicted_prob > 0.5 34 | print('Accuracy', accuracy_score(y_val, predicted_class)) 35 | x = [] 36 | for idx, seq in enumerate(x_val): 37 | if predicted_class[idx] == 0 and y_val[idx] == 0: 38 | x.append(seq) 39 | for idx, seq in enumerate(x_val): 40 | if predicted_class[idx] == 1 and y_val[idx] == 1: 41 | x.append(seq) 42 | 43 | x = np.array(x) 44 | 45 | print(f'Number of correct predictions {x.shape[0]}') 46 | # ---------- Computing importance and hypothetical scores-------------------------------------------# 47 | deeplift_model = kc.convert_model_from_saved_files(saved_name, 48 | nonlinear_mxts_mode=deeplift.layers.NonlinearMxtsMode.DeepLIFT_GenomicsDefault) 49 | 50 | deeplift_contribs_func = deeplift_model.get_target_contribs_func(find_scores_layer_idx=0, 51 | target_layer_idx=-2) 52 | 53 | contribs_many_refs_func = get_shuffle_seq_ref_function( 54 | score_computation_function=deeplift_contribs_func, 55 | shuffle_func=dinuc_shuffle) 56 | 57 | multipliers_func = deeplift_model.get_target_multipliers_func(find_scores_layer_idx=0, 58 | target_layer_idx=-2) 59 | 
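            # The multipliers above convert DeepLIFT's reference-vs-actual differences into
            # per-position scores; get_hypothetical_contribs_func_onehot (below) projects
            # them onto all four bases, yielding the "hypothetical" contribution scores that
            # TF-MoDISco uses alongside the actual scores when building motifs.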
hypothetical_contribs_func = get_hypothetical_contribs_func_onehot(multipliers_func) 60 | 61 | # Once again, we rely on multiple shuffled references 62 | hypothetical_contribs_many_refs_func = get_shuffle_seq_ref_function( 63 | score_computation_function=hypothetical_contribs_func, 64 | shuffle_func=dinuc_shuffle) 65 | 66 | actual_scores = np.squeeze(np.sum(contribs_many_refs_func(task_idx=0, 67 | input_data_sequences=x, 68 | num_refs_per_seq=10, 69 | batch_size=50, 70 | progress_update=4000), axis=2))[:, :, None] * x 71 | 72 | hyp_scores = hypothetical_contribs_many_refs_func(task_idx=0, 73 | input_data_sequences=x, 74 | num_refs_per_seq=10, 75 | batch_size=50, 76 | progress_update=4000) 77 | 78 | actual_scores_all.append(actual_scores) 79 | hypothetical_scores_all.append(hyp_scores) 80 | onehot_all.append(x) 81 | 82 | # Save scores in h5 format 83 | if os.path.isfile(f'modisco/{specie}_scores.h5'): 84 | os.system(f'rm -rf modisco/{specie}_scores.h5') 85 | 86 | actual_scores_all = np.concatenate(actual_scores_all, axis=0) 87 | hypothetical_scores_all = np.concatenate(hypothetical_scores_all, axis=0) 88 | onehot_all = np.concatenate(onehot_all, axis=0) 89 | 90 | h = h5py.File(f'modisco/{specie}_scores.h5', 'w') 91 | h.create_dataset('contrib_scores', data=actual_scores_all) 92 | h.create_dataset('hypothetical_scores', data=hypothetical_scores_all) 93 | h.create_dataset('one_hots', data=onehot_all) 94 | h.close() 95 | 96 | 97 | def run_modisco(specie): 98 | save_file = f"modisco/{specie}_modisco.hdf5" 99 | os.system(f'rm -rf {save_file}') 100 | 101 | h5_data = h5py.File(f'modisco/{specie}_scores.h5', 'r') 102 | contribution_scores = h5_data.get('contrib_scores') 103 | hypothetical_scores = h5_data.get('hypothetical_scores') 104 | one_hots = h5_data.get('one_hots') 105 | 106 | print('contributions', contribution_scores.shape) 107 | print('hypothetical contributions', hypothetical_scores.shape) 108 | print('correct predictions', one_hots.shape) 109 | # -----------------------Running modisco----------------------------------------------# 110 | # Uncomment to refresh modules for when tweaking code during development: 111 | reload(modisco.util) 112 | reload(modisco.pattern_filterer) 113 | reload(modisco.aggregator) 114 | reload(modisco.core) 115 | reload(modisco.seqlet_embedding.advanced_gapped_kmer) 116 | reload(modisco.affinitymat.transformers) 117 | reload(modisco.affinitymat.core) 118 | reload(modisco.affinitymat) 119 | reload(modisco.cluster.core) 120 | reload(modisco.cluster) 121 | reload(modisco.tfmodisco_workflow.seqlets_to_patterns) 122 | reload(modisco.tfmodisco_workflow) 123 | reload(modisco) 124 | 125 | null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(num_to_samp=5000) 126 | tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow( 127 | # Slight modifications from the default settings 128 | sliding_window_size=21, 129 | flank_size=5, 130 | target_seqlet_fdr=0.01, 131 | seqlets_to_patterns_factory=modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory( 132 | # Note: as of version 0.5.6.0, it's possible to use the results of a motif discovery 133 | # software like MEME to improve the TF-MoDISco clustering. 
To use the meme-based 134 | # initialization, you would specify the initclusterer_factory as shown in the 135 | # commented-out code below: 136 | # initclusterer_factory=modisco.clusterinit.memeinit.MemeInitClustererFactory( 137 | # meme_command="meme", base_outdir="meme_out", 138 | # max_num_seqlets_to_use=10000, nmotifs=10, n_jobs=1), 139 | trim_to_window_size=10, 140 | initial_flank_to_add=2, 141 | final_flank_to_add=0, 142 | final_min_cluster_size=60, 143 | # use_pynnd=True can be used for faster nn comp at coarse grained step 144 | # (it will use pynndescent), but note that pynndescent may crash 145 | # use_pynnd=True, 146 | n_cores=50) 147 | )( 148 | task_names=['task0'], 149 | contrib_scores={'task0': contribution_scores}, 150 | hypothetical_contribs={'task0': hypothetical_scores}, 151 | one_hot=one_hots, 152 | null_per_pos_scores=null_per_pos_scores) 153 | 154 | reload(modisco.util) 155 | grp = h5py.File(save_file, "w") 156 | tfmodisco_results.save_hdf5(grp) 157 | grp.close() 158 | 159 | 160 | species = ['arabidopsis', 'zea', 'solanum', 'sbicolor'] 161 | gene_models = ['Arabidopsis_thaliana.TAIR10.52.gtf', 'Zea_mays.Zm-B73-REFERENCE-NAM-5.0.52.gtf', 162 | 'Solanum_lycopersicum.SL3.0.52.gtf', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.52.gtf'] 163 | genomes = ['Arabidopsis_thaliana.TAIR10.dna.toplevel.fa', 'Zea_mays.Zm-B73-REFERENCE-NAM-5.0.dna.toplevel.fa', 164 | 'Solanum_lycopersicum.SL3.0.dna.toplevel.fa', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa'] 165 | pickle_keys = ['ara', 'zea', 'sol', 'sor'] 166 | mapped_read_counts = ['arabidopsis_counts.csv', 'zea_counts.csv', 'solanum_counts.csv', 'sbicolor_counts.csv'] 167 | 168 | for plant, fasta_file, gtf_file, pickled_key, counts in zip(species, genomes, gene_models, pickle_keys, 169 | mapped_read_counts): 170 | if not os.path.exists(f'modisco/{plant}_modisco.hdf5'): 171 | print(f'Computing contribution and hypothetical contribution scores for {plant}-----------------------------\n') 172 | compute_actual_and_hypothetical_scores(fasta_file, gtf_file, counts, plant) 173 | print(f'Running TFMoDisco on {plant}------------------------------------------------------------------------\n') 174 | run_modisco(plant) 175 | -------------------------------------------------------------------------------- /model/motif_discovery_ssr_root.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras import models 2 | from tensorflow.keras import backend 3 | import os 4 | import numpy as np 5 | import deeplift 6 | from deeplift.util import get_shuffle_seq_ref_function 7 | from deeplift.dinuc_shuffle import dinuc_shuffle 8 | from deeplift.conversion import kerasapi_conversion as kc 9 | from utils import prepare_valid_seqs 10 | from sklearn.metrics import accuracy_score, roc_auc_score 11 | from deeplift.util import get_hypothetical_contribs_func_onehot 12 | import h5py 13 | import modisco 14 | from importlib import reload 15 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 16 | backend.clear_session() 17 | if not os.path.exists('modisco'): 18 | os.mkdir('modisco') 19 | 20 | 21 | def compute_actual_and_hypothetical_scores(fasta, gtf, tpms, specie): 22 | actual_scores_all, hypothetical_scores_all, onehot_all = [], [], [] 23 | for saved_model in os.listdir('saved_models'): 24 | if saved_model.startswith(specie) and saved_model.endswith('terminatorroot.h5'): 25 | print(saved_model) 26 | val_chrom = saved_model.split('_')[2] 27 | x_val, y_val, _ = prepare_valid_seqs(fasta=fasta, gtf=gtf, 
tpms=tpms, val_chrom=val_chrom) 28 | 29 | saved_name = f'saved_models/{saved_model}' 30 | loaded_model = models.load_model(saved_name) 31 | predicted_class = loaded_model.predict(x_val) 32 | predicted_class = predicted_class > 0.5 33 | print('Accuracy', accuracy_score(y_val, predicted_class)) 34 | x = [] 35 | for idx, seq in enumerate(x_val): 36 | if predicted_class[idx] == 0 and y_val[idx] == 0: 37 | x.append(seq) 38 | for idx, seq in enumerate(x_val): 39 | if predicted_class[idx] == 1 and y_val[idx] == 1: 40 | x.append(seq) 41 | x = np.array(x) 42 | 43 | print(f'Number of correct predictions {x.shape[0]}, {x_val.shape}') 44 | # ---------- Computing importance and hypothetical scores-------------------------------------------# 45 | deeplift_model = kc.convert_model_from_saved_files(saved_name, 46 | nonlinear_mxts_mode=deeplift.layers.NonlinearMxtsMode.DeepLIFT_GenomicsDefault) 47 | 48 | deeplift_contribs_func = deeplift_model.get_target_contribs_func(find_scores_layer_idx=0, 49 | target_layer_idx=-2) 50 | 51 | contribs_many_refs_func = get_shuffle_seq_ref_function( 52 | score_computation_function=deeplift_contribs_func, 53 | shuffle_func=dinuc_shuffle) 54 | 55 | multipliers_func = deeplift_model.get_target_multipliers_func(find_scores_layer_idx=0, 56 | target_layer_idx=-2) 57 | hypothetical_contribs_func = get_hypothetical_contribs_func_onehot(multipliers_func) 58 | 59 | # Once again, we rely on multiple shuffled references 60 | hypothetical_contribs_many_refs_func = get_shuffle_seq_ref_function( 61 | score_computation_function=hypothetical_contribs_func, 62 | shuffle_func=dinuc_shuffle) 63 | 64 | actual_scores = np.squeeze(np.sum(contribs_many_refs_func(task_idx=0, 65 | input_data_sequences=x, 66 | num_refs_per_seq=10, 67 | batch_size=50, 68 | progress_update=4000), axis=2))[:, :, None] * x 69 | 70 | hyp_scores = hypothetical_contribs_many_refs_func(task_idx=0, 71 | input_data_sequences=x, 72 | num_refs_per_seq=10, 73 | batch_size=50, 74 | progress_update=4000) 75 | 76 | actual_scores_all.append(actual_scores) 77 | hypothetical_scores_all.append(hyp_scores) 78 | onehot_all.append(x) 79 | 80 | # Save scores in h5 format 81 | if os.path.isfile(f'modisco/{specie}_scores_root.h5'): 82 | os.system(f'rm -rf modisco/{specie}_scores_root.h5') 83 | 84 | actual_scores_all = np.concatenate(actual_scores_all, axis=0) 85 | hypothetical_scores_all = np.concatenate(hypothetical_scores_all, axis=0) 86 | onehot_all = np.concatenate(onehot_all, axis=0) 87 | 88 | h = h5py.File(f'modisco/{specie}_scores_root.h5', 'w') 89 | h.create_dataset('contrib_scores', data=actual_scores_all) 90 | h.create_dataset('hypothetical_scores', data=hypothetical_scores_all) 91 | h.create_dataset('one_hots', data=onehot_all) 92 | h.close() 93 | 94 | 95 | def run_modisco(specie): 96 | save_file = f"modisco/{specie}_modisco_root.hdf5" 97 | os.system(f'rm -rf {save_file}') 98 | 99 | h5_data = h5py.File(f'modisco/{specie}_scores_root.h5', 'r') 100 | contribution_scores = h5_data.get('contrib_scores') 101 | hypothetical_scores = h5_data.get('hypothetical_scores') 102 | one_hots = h5_data.get('one_hots') 103 | 104 | print('contributions', contribution_scores.shape) 105 | print('hypothetical contributions', hypothetical_scores.shape) 106 | print('correct predictions', one_hots.shape) 107 | # -----------------------Running modisco----------------------------------------------# 108 | # Uncomment to refresh modules for when tweaking code during development: 109 | reload(modisco.util) 110 | reload(modisco.pattern_filterer) 111 | 
reload(modisco.aggregator) 112 | reload(modisco.core) 113 | reload(modisco.seqlet_embedding.advanced_gapped_kmer) 114 | reload(modisco.affinitymat.transformers) 115 | reload(modisco.affinitymat.core) 116 | reload(modisco.affinitymat) 117 | reload(modisco.cluster.core) 118 | reload(modisco.cluster) 119 | reload(modisco.tfmodisco_workflow.seqlets_to_patterns) 120 | reload(modisco.tfmodisco_workflow) 121 | reload(modisco) 122 | 123 | null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(num_to_samp=5000) 124 | tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow( 125 | # Slight modifications from the default settings 126 | sliding_window_size=21, 127 | flank_size=5, 128 | target_seqlet_fdr=0.01, 129 | max_seqlets_per_metacluster=50000, 130 | seqlets_to_patterns_factory=modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory( 131 | # Note: as of version 0.5.6.0, it's possible to use the results of a motif discovery 132 | # software like MEME to improve the TF-MoDISco clustering. To use the meme-based 133 | # initialization, you would specify the initclusterer_factory as shown in the 134 | # commented-out code below: 135 | # initclusterer_factory=modisco.clusterinit.memeinit.MemeInitClustererFactory( 136 | # meme_command="meme", base_outdir="meme_out", 137 | # max_num_seqlets_to_use=10000, nmotifs=10, n_jobs=1), 138 | trim_to_window_size=10, 139 | initial_flank_to_add=2, 140 | final_flank_to_add=0, 141 | final_min_cluster_size=60, 142 | # use_pynnd=True can be used for faster nn comp at coarse grained step 143 | # (it will use pynndescent), but note that pynndescent may crash 144 | # use_pynnd=True, 145 | n_cores=50) 146 | )( 147 | task_names=['task0'], 148 | contrib_scores={'task0': contribution_scores}, 149 | hypothetical_contribs={'task0': hypothetical_scores}, 150 | one_hot=one_hots, 151 | null_per_pos_scores=null_per_pos_scores) 152 | 153 | reload(modisco.util) 154 | grp = h5py.File(save_file, "w") 155 | tfmodisco_results.save_hdf5(grp) 156 | grp.close() 157 | 158 | 159 | species = ['arabidopsis', 'zea', 'solanum', 'sbicolor'] 160 | gene_models = ['Arabidopsis_thaliana.TAIR10.52.gtf', 'Zea_mays.Zm-B73-REFERENCE-NAM-5.0.52.gtf', 161 | 'Solanum_lycopersicum.SL3.0.52.gtf', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.52.gtf'] 162 | genomes = ['Arabidopsis_thaliana.TAIR10.dna.toplevel.fa', 'Zea_mays.Zm-B73-REFERENCE-NAM-5.0.dna.toplevel.fa', 163 | 'Solanum_lycopersicum.SL3.0.dna.toplevel.fa', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa'] 164 | pickle_keys = ['ara', 'zea', 'sol', 'sor'] 165 | mapped_read_counts = ['arabidopsis_root_counts.csv', 'zea_root_counts.csv', 'solanum_root_counts.csv', 166 | 'sbicolor_root_counts.csv'] 167 | 168 | 169 | for plant, fasta_file, gtf_file, pickled_key, counts in zip(species, genomes, gene_models, pickle_keys, 170 | mapped_read_counts): 171 | if not os.path.exists(f'modisco/{plant}_modisco_root.hdf5'): 172 | print(f'Computing contribution and hypothetical contribution scores for {plant}-----------------------------\n') 173 | compute_actual_and_hypothetical_scores(fasta_file, gtf_file, counts, plant) 174 | print(f'Running TFMoDisco on {plant}------------------------------------------------------------------------\n') 175 | run_modisco(plant) 176 | -------------------------------------------------------------------------------- /model/random_forest_msr.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from 
sklearn.preprocessing import StandardScaler 4 | from imblearn.over_sampling import RandomOverSampler 5 | from sklearn.metrics import accuracy_score 6 | from sklearn.ensemble import RandomForestClassifier 7 | pd.options.display.width = 0 8 | 9 | feat_imp_scores, species, feat_names = [], [], [] 10 | scores = [] 11 | 12 | for cls_task in ['Low-medium-High', 'Low-High']: 13 | mapped_read_counts = ['arabidopsis_counts.csv', 'zea_counts.csv', 'solanum_counts.csv', 'sbicolor_counts.csv'] 14 | generic_feats = ['Arabidopsis_generated_features.csv', 'Zea_generated_features.csv', 15 | 'Solanum_generated_features.csv', 'Sorghum_generated_features.csv'] 16 | dfs = [] 17 | for idx in range(4): 18 | tpm = pd.read_csv(f'tpm_counts/{mapped_read_counts[idx]}') 19 | predictors = pd.read_csv(generic_feats[idx], index_col=0) 20 | targets = [] 21 | for log_count in tpm['logMaxTPM']: 22 | if log_count <= np.percentile(tpm['logMaxTPM'], 25): 23 | targets.append(0) 24 | elif log_count >= np.percentile(tpm['logMaxTPM'], 75): 25 | targets.append(1) 26 | else: 27 | targets.append(2) 28 | tpm['label'] = targets 29 | tpm = tpm[['gene_id', 'label']] 30 | data = predictors.merge(tpm, how='inner', on='gene_id') 31 | if cls_task == 'Low-High': 32 | data = data[data['label'] != 2] 33 | data['species'] = [generic_feats[idx].split('_')[0]]*data.shape[0] 34 | dfs.append(data) 35 | 36 | dfs = pd.concat(dfs) 37 | 38 | for sp in dfs['species'].unique(): 39 | # Get test species, balance data and standardize 40 | data_train = dfs.copy() 41 | data_train = data_train[data_train['species'] != sp] 42 | print(f'Train species {data_train.species.unique()}') 43 | data_train.drop(columns=['gene_id', 'Chromosome', 'species'], inplace=True) 44 | sampler = RandomOverSampler(random_state=42) 45 | x_train, y_train = data_train.values[:, :-1], data_train['label'].values 46 | x_train, y_train = sampler.fit_resample(x_train, y_train) 47 | scaler = StandardScaler() 48 | scaler.fit(x_train) 49 | x_train_std = scaler.transform(x_train) 50 | random_forest = RandomForestClassifier(100) 51 | random_forest.fit(x_train_std, y_train) 52 | if cls_task == 'Low-High': 53 | feat_imp = random_forest.feature_importances_ 54 | feat_imp_scores.extend(feat_imp) 55 | feat_names.extend(data_train.columns[:-1]) 56 | species.extend([sp] * len(data_train.columns[:-1])) 57 | 58 | data_test = dfs.copy() 59 | data_test = data_test[data_test['species'] == sp] 60 | print(f'Test species {data_test.species.unique()}') 61 | 62 | for chrom in data_test['Chromosome'].unique(): 63 | data_test_chrom = data_test.copy() 64 | data_test_chrom = data_test_chrom[data_test_chrom['Chromosome'] == chrom] 65 | data_test_chrom.drop(columns=['gene_id', 'Chromosome', 'species'], inplace=True) 66 | x_test, y_test = data_test_chrom.values[:, :-1], data_test_chrom['label'].values 67 | x_test_std = scaler.transform(x_test) 68 | y_pred = random_forest.predict(x_test_std) 69 | acc = accuracy_score(y_test, y_pred) 70 | scores.append([sp, acc, cls_task]) 71 | print(sp, random_forest.score(x_test_std, y_test), cls_task) 72 | 73 | 74 | df_feat_imp = pd.DataFrame({'Importance scores': feat_imp_scores, 75 | 'Species': species, 76 | 'Feature': feat_names}) 77 | df_feat_imp.sort_values(by='Species', inplace=True) 78 | df_performance = pd.DataFrame(scores, columns=['Species', 'accuracy', 79 | 'task']).to_csv('../results/rand_for_perf_MSR.csv', index=False) 80 | df_feat_imp.to_csv('../results/rand_for_feat_imp_MSR.csv', index=False) 81 | 
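# A sketch of the quartile labelling used above (the same labelling is repeated in
# random_forest_ssr.py and in the SSR/SSC training scripts), written in vectorised
# form; a minimal equivalent, assuming a dataframe `tpm` with a 'logMaxTPM' column:
#
#     low, high = np.percentile(tpm['logMaxTPM'], [25, 75])
#     tpm['label'] = np.where(tpm['logMaxTPM'] <= low, 0,
#                             np.where(tpm['logMaxTPM'] >= high, 1, 2))
#
# Genes in the bottom expression quartile are labelled 0 (low), those in the top
# quartile 1 (high) and the rest 2 (medium); the 'Low-High' task then drops label 2.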
-------------------------------------------------------------------------------- /model/random_forest_ssr.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.preprocessing import StandardScaler 4 | from imblearn.over_sampling import RandomOverSampler 5 | from sklearn.metrics import accuracy_score 6 | from sklearn.ensemble import RandomForestClassifier 7 | pd.options.display.width = 0 8 | 9 | feat_imp_scores, species, feat_names = [], [], [] 10 | scores = [] 11 | 12 | for cls_task in ['Low-medium-High', 'Low-High']: 13 | mapped_read_counts = ['arabidopsis_counts.csv', 'zea_counts.csv', 'solanum_counts.csv', 'sbicolor_counts.csv'] 14 | generic_feats = ['Arabidopsis_generated_features.csv', 'Zea_generated_features.csv', 15 | 'Solanum_generated_features.csv', 'Sorghum_generated_features.csv'] 16 | 17 | for idx in range(4): 18 | tpm = pd.read_csv(f'tpm_counts/{mapped_read_counts[idx]}') 19 | predictors = pd.read_csv(generic_feats[idx], index_col=0) 20 | targets = [] 21 | for log_count in tpm['logMaxTPM']: 22 | if log_count <= np.percentile(tpm['logMaxTPM'], 25): 23 | targets.append(0) 24 | elif log_count >= np.percentile(tpm['logMaxTPM'], 75): 25 | targets.append(1) 26 | else: 27 | targets.append(2) 28 | tpm['label'] = targets 29 | tpm = tpm[['gene_id', 'label']] 30 | data = predictors.merge(tpm, how='inner', on='gene_id') 31 | if cls_task == 'Low-High': 32 | data = data[data['label'] != 2] 33 | print(data.head()) 34 | 35 | for chrom in data['Chromosome'].unique(): 36 | data_train = data.copy() 37 | data_train = data_train[data_train['Chromosome'] != chrom] 38 | data_train.drop(columns=['gene_id', 'Chromosome'], inplace=True) 39 | data_test = data.copy() 40 | data_test = data_test[data_test['Chromosome'] == chrom] 41 | data_test.drop(columns=['gene_id', 'Chromosome'], inplace=True) 42 | 43 | # Balance data and standardizing 44 | sampler = RandomOverSampler(random_state=42) 45 | x_train, y_train = data_train.values[:, :-1], data_train['label'].values 46 | x_train, y_train = sampler.fit_resample(x_train, y_train) 47 | x_test, y_test = data_test.values[:, :-1], data_test['label'].values 48 | scaler = StandardScaler() 49 | scaler.fit(x_train) 50 | x_train_std = scaler.transform(x_train) 51 | x_test_std = scaler.transform(x_test) 52 | random_forest = RandomForestClassifier(100) 53 | random_forest.fit(x_train_std, y_train) 54 | if cls_task == 'Low-High': 55 | feat_imp = random_forest.feature_importances_ 56 | feat_imp_scores.extend(feat_imp) 57 | feat_names.extend(data_train.columns[:-1]) 58 | species.extend([generic_feats[idx].split('_')[0]] * len(data_train.columns[:-1])) 59 | y_pred = random_forest.predict(x_test_std) 60 | acc = accuracy_score(y_test, y_pred) 61 | scores.append([generic_feats[idx].split('_')[0], acc, cls_task]) 62 | print(generic_feats[idx].split('_')[0], random_forest.score(x_test_std, y_test), cls_task) 63 | 64 | df_feat_imp = pd.DataFrame({'Importance scores': feat_imp_scores, 65 | 'Species': species, 66 | 'Feature': feat_names}) 67 | df_feat_imp.sort_values(by='Species', inplace=True) 68 | df_performance = pd.DataFrame(scores, columns=['Species', 'accuracy', 69 | 'task']).to_csv('../results/rand_for_perf.csv', index=False) 70 | df_feat_imp.to_csv('../results/rand_for_feat_imp_SSR.csv', index=False) 71 | -------------------------------------------------------------------------------- /model/train_msr_models_leaf.py: 
-------------------------------------------------------------------------------- 1 | """In this script we train models with several plant species and test on single species""" 2 | import pandas as pd 3 | import numpy as np 4 | from utils import onehot 5 | from pyfaidx import Fasta 6 | from sklearn.utils import shuffle 7 | from tensorflow.keras import Sequential, optimizers, backend, models 8 | from tensorflow.keras.layers import Conv1D, Dense, MaxPool1D, Dropout, Flatten 9 | from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 10 | from sklearn.metrics import accuracy_score, roc_auc_score 11 | import os 12 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 13 | 14 | 15 | gene_model = pd.read_csv('gene_models/zea_sol_ara_sor_52.gtf', sep='\t') 16 | gene_model['specie'] = [x.split('_')[0] for x in gene_model['Chromosome']] 17 | genes_labels = pd.read_csv('tpm_counts/zea_sol_ara_sor.csv', index_col=0) 18 | genome = Fasta('genomes/zea_sol_ara_sor_dna.fa', as_raw=True, sequence_always_upper=True, read_ahead=10000) 19 | 20 | 21 | def encode_seq(gene_models, fasta, labels, upstream=1000, downstream=500, training=False): 22 | encoded_seqs, true_labels = [], [] 23 | 24 | for chrom, start, end, strand, gene_id, _ in gene_models.values: 25 | if gene_id in labels.index: 26 | if strand == '+': 27 | prom_start, prom_end = start - upstream, start + downstream 28 | term_start, term_end = end - downstream, end + upstream 29 | if prom_start > 0 and term_start > 0: 30 | encoded_seq = np.concatenate([onehot(fasta[chrom][prom_start:prom_end]), 31 | np.zeros(shape=(20, 4)), 32 | onehot(fasta[chrom][term_start:term_end])]) 33 | if encoded_seq.shape[0] == 2 * (upstream + downstream) + 20: 34 | encoded_seqs.append(encoded_seq) 35 | true_labels.append(labels.loc[gene_id, 'true_target']) 36 | 37 | else: 38 | prom_start, prom_end = end - downstream, end + upstream 39 | term_start, term_end = start - upstream, start + downstream 40 | if prom_start > 0 and term_start > 0: 41 | encoded_seq = np.concatenate([onehot(fasta[chrom][prom_start:prom_end])[::-1, ::-1], 42 | np.zeros(shape=(20, 4)), 43 | onehot(fasta[chrom][term_start:term_end])[::-1, ::-1]]) 44 | 45 | if encoded_seq.shape[0] == 2 * (upstream + downstream) + 20: 46 | encoded_seqs.append(encoded_seq) 47 | true_labels.append(labels.loc[gene_id, 'true_target']) 48 | 49 | encoded_seqs = np.array(encoded_seqs) 50 | true_labels = np.array(true_labels) 51 | 52 | if training: 53 | print('Encoding and balancing Training set') 54 | # Random down sampling to balance data 55 | low_train, high_train = np.where(true_labels == 0)[0], np.where(true_labels == 1)[0] 56 | min_class = min([len(low_train), len(high_train)]) 57 | selected_low_train = np.random.choice(low_train, min_class, replace=False) 58 | selected_high_train = np.random.choice(high_train, min_class, replace=False) 59 | encoded_seqs = np.concatenate([ 60 | np.take(encoded_seqs, selected_low_train, axis=0), 61 | np.take(encoded_seqs, selected_high_train, axis=0) 62 | ], axis=0) 63 | true_labels = np.concatenate([ 64 | np.take(true_labels, selected_low_train, axis=0), 65 | np.take(true_labels, selected_high_train, axis=0) 66 | ], axis=0) 67 | encoded_seqs, true_labels = shuffle(encoded_seqs, true_labels, random_state=42) 68 | 69 | return encoded_seqs, true_labels 70 | 71 | 72 | def build_network(x_train, x_val, y_train, y_val, specie_name, val_chrom): 73 | backend.clear_session() 74 | model = Sequential([ 75 | # Conv Block 1 76 | Conv1D(64, kernel_size=8, activation='relu', 
padding='same',
77 |                input_shape=(x_train.shape[1], x_train.shape[2])),
78 |         Conv1D(64, kernel_size=8, activation='relu', padding='same'),
79 |         MaxPool1D(8, padding='same'),
80 |         Dropout(0.25),
81 | 
82 |         # Conv Block 2
83 |         Conv1D(128, kernel_size=8, activation='relu', padding='same'),
84 |         Conv1D(128, kernel_size=8, activation='relu', padding='same'),
85 |         MaxPool1D(8, padding='same'),
86 |         Dropout(0.25),
87 | 
88 |         # Conv Block 3
89 |         Conv1D(64, kernel_size=8, activation='relu', padding='same'),
90 |         Conv1D(64, kernel_size=8, activation='relu', padding='same'),
91 |         MaxPool1D(8, padding='same'),
92 |         Dropout(0.25),
93 | 
94 |         # Fully connected Block
95 |         Flatten(),
96 |         Dense(128, activation='relu'),
97 |         Dropout(0.25),
98 |         Dense(64, activation='relu'),
99 |         Dense(1, activation='sigmoid')
100 | 
101 |     ])
102 | 
103 |     print(model.summary())
104 | 
105 |     model_save_name = f'saved_models/multi_specie_model_{specie_name}_{val_chrom}.h5'
106 |     model_chkpt = ModelCheckpoint(model_save_name, save_best_only=True, verbose=1)
107 |     early_stop = EarlyStopping(patience=10)
108 |     reduce_lr = ReduceLROnPlateau(patience=5, factor=0.1)
109 |     model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(0.0001), metrics=['accuracy'])
110 |     model.fit(x_train, y_train, batch_size=64, epochs=100, validation_data=(x_val, y_val),
111 |               callbacks=[early_stop, model_chkpt, reduce_lr])
112 |     saved_model = models.load_model(model_save_name)
113 |     predictions = saved_model.predict(x_val)
114 |     val_auroc = roc_auc_score(y_val, predictions)
115 |     predictions = predictions > 0.5
116 |     val_acc = accuracy_score(y_val, predictions)
117 |     print('Best model performance--------------------------\n')
118 |     print(f'Accuracy: {val_acc}, auROC: {val_auroc}\n')
119 |     print('------------------------------------------------')
120 | 
121 |     performance = [val_acc, val_auroc, specie_name]
122 | 
123 |     return performance
124 | 
125 | 
126 | final_results = []
127 | for specie in gene_model['specie'].unique():
128 |     test_specie = gene_model.copy()
129 |     test_specie = test_specie[test_specie['specie'] == specie]
130 |     train_specie = gene_model.copy()
131 |     train_specie = train_specie[train_specie['specie'] != specie]
132 |     train_seqs, train_targets = encode_seq(train_specie, genome, genes_labels, training=True)
133 | 
134 |     for chrom in test_specie['Chromosome'].unique():
135 |         test_specie_chrom = test_specie.copy()
136 |         test_specie_chrom = test_specie_chrom[test_specie_chrom['Chromosome'] == chrom]
137 |         test_seqs, test_targets = encode_seq(test_specie_chrom, genome, genes_labels)
138 |         print(f'Testing on {specie}, chromosome {chrom}')
139 |         result = build_network(train_seqs, test_seqs, train_targets, test_targets, specie, chrom)
140 |         final_results.append(result)
141 | 
142 | final_results = pd.DataFrame(final_results, columns=['accuracy', 'auRoc', 'test specie'])
143 | final_results.to_csv("../results/multi_specie_result_leaf.csv", index=False)
144 | 
145 | 
146 | 
147 | 
148 | 
149 | 
--------------------------------------------------------------------------------
/model/train_msr_models_root.py:
--------------------------------------------------------------------------------
1 | """In this script we train models with several plant species and test on single species"""
2 | import pandas as pd
3 | import numpy as np
4 | from utils import onehot
5 | from pyfaidx import Fasta
6 | from sklearn.utils import shuffle
7 | from tensorflow.keras import Sequential, optimizers, backend, models
8 | from tensorflow.keras.layers import Conv1D, Dense,
MaxPool1D, Dropout, Flatten 9 | from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 10 | from sklearn.metrics import accuracy_score, roc_auc_score 11 | import os 12 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 13 | 14 | 15 | gene_model = pd.read_csv('gene_models/zea_sol_ara_sor_52.gtf', sep='\t') 16 | gene_model['specie'] = [x.split('_')[0] for x in gene_model['Chromosome']] 17 | print(gene_model.head()) 18 | genes_labels = pd.read_csv('tpm_counts/zea_sol_ara_sor_roots.csv', index_col=0) 19 | print(genes_labels.head()) 20 | genome = Fasta('genomes/zea_sol_ara_sor_dna.fa', as_raw=True, sequence_always_upper=True, read_ahead=10000) 21 | 22 | 23 | def encode_seq(gene_models, fasta, labels, upstream=1000, downstream=500, training=False): 24 | encoded_seqs, true_labels = [], [] 25 | 26 | for chrom, start, end, strand, gene_id, _ in gene_models.values: 27 | if gene_id in labels.index: 28 | if strand == '+': 29 | prom_start, prom_end = start - upstream, start + downstream 30 | term_start, term_end = end - downstream, end + upstream 31 | if prom_start > 0 and term_start > 0: 32 | encoded_seq = np.concatenate([onehot(fasta[chrom][prom_start:prom_end]), 33 | np.zeros(shape=(20, 4)), 34 | onehot(fasta[chrom][term_start:term_end])]) 35 | if encoded_seq.shape[0] == 2 * (upstream + downstream) + 20: 36 | encoded_seqs.append(encoded_seq) 37 | true_labels.append(labels.loc[gene_id, 'true_target']) 38 | 39 | else: 40 | prom_start, prom_end = end - downstream, end + upstream 41 | term_start, term_end = start - upstream, start + downstream 42 | if prom_start > 0 and term_start > 0: 43 | encoded_seq = np.concatenate([onehot(fasta[chrom][prom_start:prom_end])[::-1, ::-1], 44 | np.zeros(shape=(20, 4)), 45 | onehot(fasta[chrom][term_start:term_end])[::-1, ::-1]]) 46 | 47 | if encoded_seq.shape[0] == 2 * (upstream + downstream) + 20: 48 | encoded_seqs.append(encoded_seq) 49 | true_labels.append(labels.loc[gene_id, 'true_target']) 50 | 51 | encoded_seqs = np.array(encoded_seqs) 52 | true_labels = np.array(true_labels) 53 | 54 | if training: 55 | print('Encoding and balancing Training set') 56 | # Random down sampling to balance data 57 | low_train, high_train = np.where(true_labels == 0)[0], np.where(true_labels == 1)[0] 58 | min_class = min([len(low_train), len(high_train)]) 59 | selected_low_train = np.random.choice(low_train, min_class, replace=False) 60 | selected_high_train = np.random.choice(high_train, min_class, replace=False) 61 | encoded_seqs = np.concatenate([ 62 | np.take(encoded_seqs, selected_low_train, axis=0), 63 | np.take(encoded_seqs, selected_high_train, axis=0) 64 | ], axis=0) 65 | true_labels = np.concatenate([ 66 | np.take(true_labels, selected_low_train, axis=0), 67 | np.take(true_labels, selected_high_train, axis=0) 68 | ], axis=0) 69 | encoded_seqs, true_labels = shuffle(encoded_seqs, true_labels, random_state=42) 70 | 71 | return encoded_seqs, true_labels 72 | 73 | 74 | def build_network(x_train, x_val, y_train, y_val, specie_name, val_chrom): 75 | backend.clear_session() 76 | model = Sequential([ 77 | # Conv Block 1 78 | Conv1D(64, kernel_size=8, activation='relu', padding='same', 79 | input_shape=(x_train.shape[1], x_train.shape[2])), 80 | Conv1D(64, kernel_size=8, activation='relu', padding='same'), 81 | MaxPool1D(8, padding='same'), 82 | Dropout(0.25), 83 | 84 | # Conv Block 2 85 | Conv1D(128, kernel_size=8, activation='relu', padding='same'), 86 | Conv1D(128, kernel_size=8, activation='relu', padding='same'), 87 | MaxPool1D(8, 
padding='same'),
88 |         Dropout(0.25),
89 | 
90 |         # Conv Block 3
91 |         Conv1D(64, kernel_size=8, activation='relu', padding='same'),
92 |         Conv1D(64, kernel_size=8, activation='relu', padding='same'),
93 |         MaxPool1D(8, padding='same'),
94 |         Dropout(0.25),
95 | 
96 |         # Fully connected Block
97 |         Flatten(),
98 |         Dense(128, activation='relu'),
99 |         Dropout(0.25),
100 |         Dense(64, activation='relu'),
101 |         Dense(1, activation='sigmoid')
102 | 
103 |     ])
104 | 
105 |     print(model.summary())
106 | 
107 |     model_save_name = f'saved_models/multi_specie_model_{specie_name}_{val_chrom}_root.h5'
108 |     model_chkpt = ModelCheckpoint(model_save_name, save_best_only=True, verbose=1)
109 |     early_stop = EarlyStopping(patience=10)
110 |     reduce_lr = ReduceLROnPlateau(patience=5, factor=0.1)
111 |     model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(0.0001), metrics=['accuracy'])
112 |     model.fit(x_train, y_train, batch_size=64, epochs=100, validation_data=(x_val, y_val),
113 |               callbacks=[early_stop, model_chkpt, reduce_lr])
114 |     saved_model = models.load_model(model_save_name)
115 |     predictions = saved_model.predict(x_val)
116 |     val_auroc = roc_auc_score(y_val, predictions)
117 |     predictions = predictions > 0.5
118 |     val_acc = accuracy_score(y_val, predictions)
119 |     print('Best model performance--------------------------\n')
120 |     print(f'Accuracy: {val_acc}, auROC: {val_auroc}\n')
121 |     print('------------------------------------------------')
122 | 
123 |     performance = [val_acc, val_auroc, specie_name]
124 | 
125 |     return performance
126 | 
127 | 
128 | final_results = []
129 | for specie in gene_model['specie'].unique():
130 |     test_specie = gene_model.copy()
131 |     test_specie = test_specie[test_specie['specie'] == specie]
132 |     train_specie = gene_model.copy()
133 |     train_specie = train_specie[train_specie['specie'] != specie]
134 |     train_seqs, train_targets = encode_seq(train_specie, genome, genes_labels, training=True)
135 | 
136 |     for chrom in test_specie['Chromosome'].unique():
137 |         test_specie_chrom = test_specie.copy()
138 |         test_specie_chrom = test_specie_chrom[test_specie_chrom['Chromosome'] == chrom]
139 |         test_seqs, test_targets = encode_seq(test_specie_chrom, genome, genes_labels)
140 |         print(f'Training on {train_specie["specie"].unique()}')
141 |         print(f'Testing on {test_specie["specie"].unique()}, chromosome {chrom}')
142 |         result = build_network(train_seqs, test_seqs, train_targets, test_targets, specie, chrom)
143 |         final_results.append(result)
144 | 
145 | final_results = pd.DataFrame(final_results, columns=['accuracy', 'auRoc', 'test specie'])
146 | final_results.to_csv("../results/multi_specie_result_root.csv", index=False)
147 | 
148 | 
149 | 
150 | 
151 | 
152 | 
153 | 
154 | 
--------------------------------------------------------------------------------
/model/train_ssr_ssc_models_leaf.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import os
4 | from utils import FastaSequenceLoader, ConvNetwork
5 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
6 | mapped_read_counts = ['zea_counts.csv', 'solanum_counts.csv',
7 |                       'arabidopsis_counts.csv', 'sbicolor_counts.csv']
8 | gene_models = ['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.52.gtf', 'Solanum_lycopersicum.SL3.0.52.gtf',
9 |                'Arabidopsis_thaliana.TAIR10.52.gtf', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.52.gtf']
10 | genomes = ['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.dna.toplevel.fa', 'Solanum_lycopersicum.SL3.0.dna.toplevel.fa',
11 |            'Arabidopsis_thaliana.TAIR10.dna.toplevel.fa', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa']
12 | pickle_keys = ['zea', 'sol', 'ara', 'sor']
13 | num_chromosomes = [10, 12, 5, 10]
14 | 
15 | if not os.path.isdir('../results'):
16 |     os.mkdir('../results')
17 | if not os.path.isdir('saved_models'):
18 |     os.mkdir('saved_models')
19 | 
20 | for m_reads, gene_model, genome, num_chr, p_key in zip(mapped_read_counts, gene_models, genomes, num_chromosomes,
21 |                                                        pickle_keys):
22 |     if not os.path.exists(f"../results/{m_reads.split('_')[0]}_leaf_result.csv"):
23 |         final_training_output = []
24 |         tpm_counts = pd.read_csv(f'tpm_counts/{m_reads}', index_col=0)
25 |         true_targets = []
26 | 
27 |         for log_count in tpm_counts['logMaxTPM'].values:
28 |             if log_count <= np.percentile(tpm_counts['logMaxTPM'], 25):
29 |                 true_targets.append(0)
30 |             elif log_count >= np.percentile(tpm_counts['logMaxTPM'], 75):
31 |                 true_targets.append(1)
32 |             else:
33 |                 true_targets.append(2)
34 |         tpm_counts['true_target'] = true_targets
35 |         print(tpm_counts.head())
36 | 
37 |         for val_chromosome in np.arange(1, num_chr+1):
38 |             fastaloader = FastaSequenceLoader(f'genomes/{genome}', f'gene_models/{gene_model}',
39 |                                               val_chromosome, pickled_val_ids='validation_genes.pickle',
40 |                                               pickled_key=p_key)
41 |             enc_train, enc_val, train_ids, val_ids = fastaloader.extract_seq()
42 | 
43 |             print('-----------------------------------------------------------------------------\n')
44 |             print(f"Plant: {m_reads.split('_')[0]} Case: promoter_terminator")
45 |             print('-------------------------------------------------------------------------------')
46 |             convnet = ConvNetwork(enc_train, enc_val, train_ids, val_ids, val_chromosome, tpm_counts,
47 |                                   m_reads.split('_')[0], 'promoter_terminator')
48 |             output = convnet.train_network()
49 |             final_training_output.append(output)
50 | 
51 |             # Train models with shuffled sequences
52 |             print('-----------------------------------------------------------------------------\n')
53 |             print(f"Plant: {m_reads.split('_')[0]} Case: si-nucleotide_shuffle")
54 |             print('-------------------------------------------------------------------------------')
55 |             shuffle_enc_train = []
56 |             for train_seq in enc_train.copy():
57 |                 np.random.shuffle(train_seq)
58 |                 shuffle_enc_train.append(train_seq)
59 | 
60 |             shuffle_convnet = ConvNetwork(shuffle_enc_train, enc_val, train_ids, val_ids, val_chromosome, tpm_counts,
61 |                                           m_reads.split('_')[0], 'si-nucleotide_shuffle')
62 |             shuffle_output = shuffle_convnet.train_network()
63 |             final_training_output.append(shuffle_output)
64 | 
65 |         final_training_output = pd.DataFrame(final_training_output, columns=['val_acc', 'val_auROC', 'plant', 'case',
66 |                                                                              'training size'])
67 |         final_training_output.to_csv(f"../results/{m_reads.split('_')[0]}_leaf_result.csv", index=False)
68 | 
69 | 
--------------------------------------------------------------------------------
/model/train_ssr_ssc_models_root.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import os
4 | from utils import FastaSequenceLoader, ConvNetwork
5 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
6 | mapped_read_counts = ['zea_root_counts.csv', 'solanum_root_counts.csv',
7 |                       'arabidopsis_root_counts.csv', 'sbicolor_root_counts.csv']
8 | gene_models = ['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.52.gtf', 'Solanum_lycopersicum.SL3.0.52.gtf',
9 |                'Arabidopsis_thaliana.TAIR10.52.gtf', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.52.gtf']
10 | genomes =
['Zea_mays.Zm-B73-REFERENCE-NAM-5.0.dna.toplevel.fa', 'Solanum_lycopersicum.SL3.0.dna.toplevel.fa', 11 | 'Arabidopsis_thaliana.TAIR10.dna.toplevel.fa', 'Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa'] 12 | pickle_keys = ['zea', 'sol', 'ara', 'sor'] 13 | num_chromosomes = [10, 12, 5, 10] 14 | 15 | if not os.path.isdir('../results'): 16 | os.mkdir('../results') 17 | if not os.path.isdir('saved_models'): 18 | os.mkdir('saved_models') 19 | 20 | for m_reads, gene_model, genome, num_chr, p_key in zip(mapped_read_counts, gene_models, genomes, num_chromosomes, 21 | pickle_keys): 22 | if not os.path.exists(f"../results/{m_reads.split('_')[0]}_root_result.csv"): 23 | final_training_output = [] 24 | tpm_counts = pd.read_csv(f'tpm_counts/{m_reads}', index_col=0) 25 | true_targets = [] 26 | 27 | for log_count in tpm_counts['logMaxTPM'].values: 28 | if log_count <= np.percentile(tpm_counts['logMaxTPM'], 25): 29 | true_targets.append(0) 30 | elif log_count >= np.percentile(tpm_counts['logMaxTPM'], 75): 31 | true_targets.append(1) 32 | else: 33 | true_targets.append(2) 34 | tpm_counts['true_target'] = true_targets 35 | print(tpm_counts.head()) 36 | 37 | for val_chromosome in np.arange(1, num_chr+1): 38 | fastaloader = FastaSequenceLoader(f'genomes/{genome}', f'gene_models/{gene_model}', 39 | val_chromosome, pickled_val_ids='validation_genes.pickle', 40 | pickled_key=p_key) 41 | enc_train, enc_val, train_ids, val_ids = fastaloader.extract_seq() 42 | 43 | print('-----------------------------------------------------------------------------\n') 44 | print(f"Plant: {m_reads.split('_')[0]} Case: promoter_terminator") 45 | print('-------------------------------------------------------------------------------') 46 | convnet = ConvNetwork(enc_train, enc_val, train_ids, val_ids, val_chromosome, tpm_counts, 47 | m_reads.split('_')[0], 'promoter_terminator', tissue="root") 48 | output = convnet.train_network() 49 | final_training_output.append(output) 50 | 51 | # Train models with shuffled sequences 52 | print('-----------------------------------------------------------------------------\n') 53 | print(f"Plant: {m_reads.split('_')[0]} Case: si-nucleotide_shuffle") 54 | print('-------------------------------------------------------------------------------') 55 | shuffle_enc_train = [] 56 | for train_seq in enc_train.copy(): 57 | np.random.shuffle(train_seq) 58 | shuffle_enc_train.append(train_seq) 59 | 60 | shuffle_convnet = ConvNetwork(shuffle_enc_train, enc_val, train_ids, val_ids, val_chromosome, tpm_counts, 61 | m_reads.split('_')[0], 'si-nucleotide_shuffle', tissue='root') 62 | shuffle_output = shuffle_convnet.train_network() 63 | final_training_output.append(shuffle_output) 64 | 65 | final_training_output = pd.DataFrame(final_training_output, columns=['val_acc', 'val_auROC', 'plant', 'case', 66 | 'training size']) 67 | final_training_output.to_csv(f"../results/{m_reads.split('_')[0]}_root_result.csv", index=False) 68 | -------------------------------------------------------------------------------- /model/validation_genes.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NAMlab/DeepCRE/72838636b880e6d0b7759741e30a603524cbfb04/model/validation_genes.pickle -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ## Deep learning the cis-regulatory code for gene expression in selected model plants 2 | 3 | 
[![DOI](https://zenodo.org/badge/632932657.svg)](https://zenodo.org/doi/10.5281/zenodo.10822013)
4 | 
5 | Please follow the steps below to reproduce the results from our work.
6 | 1. Download this repository.
7 | 2. Change directory into the **model** subdirectory and run the *fetch_genomes_and_annotation.sh* script. First, this
8 | will create 3 new subdirectories: genomes, gene_models, tpm_counts. Then, it will download the genomes and gene models
9 | for the 4 plant species, uncompress them and store them in the genomes and gene_models subdirectories, respectively.
10 | 3. Download the expression counts for the project from the supplementary data of the publication and save these files
11 | within the tpm_counts subdirectory. NB: you should have 8 files, corresponding to 4 plant species and 2 tissues.
12 | For example, for *Arabidopsis thaliana* you would have arabidopsis_counts.csv and arabidopsis_root_counts.csv
13 | for leaf and root tissues, respectively.
14 | 
15 | ### Training convolutional neural networks
16 | - To train the SSR and SSC models, run the *train_ssr_ssc_models_leaf.py* and *train_ssr_ssc_models_root.py* scripts
17 | for leaf and root tissues, respectively.
18 | - Train the MSR models using *train_msr_models_leaf.py* and *train_msr_models_root.py*.
19 | 
20 | Only after training the CNN models can you run the scripts below that compute importance scores and generate motifs.
21 | Also note that deepLIFT, which is used to compute importance scores, is currently only compatible with tensorflow 1.x,
22 | so if you build models with tensorflow 2.x, you won't be able to use these scripts.
23 | ### Computing importance scores and obtaining motifs
24 | - Run the *motif_discovery_...* scripts for the respective tissues and models.
25 | - Then run *extract_motifs_ssr.py* or *extract_motifs_msr.py* to get the motifs out of the output produced by
26 | modisco.
27 | 
28 | ### Random forest models
29 | - First, create the features using the *create_generic_features.py* script.
30 | - Then run either *random_forest_msr.py* or *random_forest_ssr.py* for the MSR and SSR models, respectively.
31 | 
32 | ### Investigate effect of different sequence lengths
33 | To investigate the effects of different UTR or promoter sequence lengths, use the *effect_of_different_...*
34 | scripts. These scripts will build several models based on the different lengths specified within the scripts.
35 | 
36 | 
37 | #### Generating validation_genes.pickle file
38 | This file contains information on genes that have homologs only within their own chromosome, such that when we use
39 | chromosome-level cross-validation for training, we mitigate the effect of homologs leaking information between our
40 | training and test sets. While I provide the pickle file used for this project, one can generate it themselves by
41 | first going into the data directory, which sits as a sibling directory to the model directory, and then running the
42 | commands below in the terminal:
43 | 
44 | ```shell
45 | wget https://ftp.ebi.ac.uk/ensemblgenomes/pub/release-52/plants/fasta/arabidopsis_thaliana/pep/Arabidopsis_thaliana.TAIR10.pep.all.fa.gz
46 | ```
47 | ```shell
48 | gunzip Arabidopsis_thaliana.TAIR10.pep.all.fa.gz
49 | ```
50 | ```shell
51 | makeblastdb -in Arabidopsis_thaliana.TAIR10.pep.all.fa -dbtype prot -title arabidopsis -parse_seqids -hash_index -out arabidopsis
52 | ```
53 | ```shell
54 | blastp -db arabidopsis -query Arabidopsis_thaliana.TAIR10.pep.all.fa -out BLAST_ara_to_ara -outfmt 6
55 | ```
56 | The above assumes that you have BLAST installed on your computer.
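The four commands above cover *Arabidopsis thaliana* only and have to be repeated for the other three species. As a
sketch, the *Sorghum bicolor* run would look as follows (the FTP path is assumed to follow the same release-52 pattern
as for Arabidopsis; the output name BLAST_sor_to_sor is the one *produce_non_homologous_val_sets.py* expects):

```shell
wget https://ftp.ebi.ac.uk/ensemblgenomes/pub/release-52/plants/fasta/sorghum_bicolor/pep/Sorghum_bicolor.Sorghum_bicolor_NCBIv3.pep.all.fa.gz
gunzip Sorghum_bicolor.Sorghum_bicolor_NCBIv3.pep.all.fa.gz
makeblastdb -in Sorghum_bicolor.Sorghum_bicolor_NCBIv3.pep.all.fa -dbtype prot -title sorghum -parse_seqids -hash_index -out sorghum
blastp -db sorghum -query Sorghum_bicolor.Sorghum_bicolor_NCBIv3.pep.all.fa -out BLAST_sor_to_sor -outfmt 6
```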
57 | Once this has been done for all four species, run the *produce_non_homologous_val_sets.py* script. Note that the
58 | script expects the proteome FASTA files and the BLAST outputs in a `proteomes` subdirectory of the data directory.
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | biopython==1.79
2 | deeplift==0.6.13.0
3 | h5py==2.10.0
4 | imbalanced_learn==0.8.1
5 | matplotlib==3.3.4
6 | numpy==1.19.5
7 | pandas==1.1.5
8 | pyfaidx==0.7.2.1
9 | pyranges==0.0.120
10 | scikit_learn==1.3.0
11 | scipy==1.5.3
12 | seaborn==0.11.2
13 | tensorflow==1.14.0
14 | tqdm==4.64.1
15 | modisco==0.5.16.0
16 | cudatoolkit==10.1.243
17 | cudnn==7.6.5
--------------------------------------------------------------------------------