├── .DS_Store ├── README.md ├── afdb-regions-network-R ├── .DS_Store ├── README.md ├── Rscript_1.R ├── Rscript_10.R ├── Rscript_2.R ├── Rscript_3.R ├── Rscript_4.R ├── Rscript_5.R ├── Rscript_6.R ├── Rscript_7.R ├── Rscript_8.R └── Rscript_9.R ├── commands ├── commands_darkening ├── commands_distribution ├── commands_purity ├── commands_sapiens ├── commands_share_db ├── dark_distribution └── dark_distribution.ipynb ├── purity ├── pfam_consistency.ipynb ├── purity_figure.ipynb └── subsitute_clan.ipynb ├── sapiens_go_exploration └── find_immune_related_human_cluster.ipynb └── sapiens_plot └── go_plot.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/steineggerlab/afdb-clusters-analysis/4c38d80184fbb967b5fc9fe75d7a765a1c6cf98e/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # afdb-clusters-analysis 2 | Scripts to generate and analyze afdb clusters 3 | 4 | [Barrio-Hernandez I, Yeo J, Jänes J, Wein T, Varadi M, Velankar S, Beltrao P, Steinegger M. Clustering predicted structures at the scale of the known protein universe. 
Nature, doi.org/10.1038/s41586-023-06510-w (2023)](https://www.nature.com/articles/s41586-023-06510-w) 5 | -------------------------------------------------------------------------------- /afdb-regions-network-R/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/steineggerlab/afdb-clusters-analysis/4c38d80184fbb967b5fc9fe75d7a765a1c6cf98e/afdb-regions-network-R/.DS_Store -------------------------------------------------------------------------------- /afdb-regions-network-R/README.md: -------------------------------------------------------------------------------- 1 | # afdb-regions-network 2 | 3 | Scripts to analyse protein-protein similarities based on Foldseek output, as described in the Methods 4 | 5 | Barrio-Hernandez I, Yeo J, Jänes J, Wein T, Varadi M, Velankar S, Beltrao P, Steinegger M. Clustering predicted structures at the scale of the known protein universe. 
bioRxiv, doi.org:10.1101/2023.03.09.531927 (2023) 6 | 7 | SCRIPT 1: partition of Foldseek output for parallelization 8 | SCRIPT 2: filtering edges for e-value <= 0.001 plus protein files per partition (loop) 9 | SCRIPT 3: getting ready for hierarchical clustering of regions per protein 10 | SCRIPT 4: hierarchical clustering of regions per protein (loop) 11 | SCRIPT 5: assembling clustering results 12 | SCRIPT 6: recoding the edges (based on clustering results) 13 | SCRIPT 7: assembling the recoded tables, selection of connected components 14 | SCRIPT 8: Pfam annotation of regions, first part: extracting information from the database and splitting it into files 15 | SCRIPT 9: annotation of regions using Pfam (loop) 16 | SCRIPT 10: trimming of the network, clustering of regions from connected components and connecting modules 17 | -------------------------------------------------------------------------------- /afdb-regions-network-R/Rscript_1.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ###SCRIPT1: partition of original table to handle size 5 | 6 | ######Due to file size, we split it in files containing 2,000,000 lines (561 files in total) 7 | 8 | ####open connexion 9 | 10 | raw_data_path <- "/nfs/research/beltrao/ibarrioh/AF-3D/071022/afdb50best_clu_nosingleton_repseq_ava.m8.gz" 11 | raw_data_nrow <- length(count.fields(raw_data_path)) 12 | 13 | con = file(raw_data_path, "r") 14 | 15 | ###cutting loop 16 | 17 | for (i in 0:560){ 18 | 19 | pairs=read.delim(con, nrows=2000000,header=F) 20 | 21 | temp_node=unique(c(paste(as.character(pairs[,1]), 22 | pairs[,7], 23 | pairs[,8],sep="_"), 24 | paste(as.character(pairs[,2]), 25 | pairs[,9], 26 | pairs[,10],sep="_"))) 27 | 28 | temp_edge=cbind(paste(as.character(pairs[,1]), 29 | pairs[,7], 30 | pairs[,8],sep="_"), 31 | paste(as.character(pairs[,2]), 32 | pairs[,9], 33 | pairs[,10],sep="_"), 34 | pairs[,11], 35 | pairs[,12]) 36 | 37 | 
path_1=paste("interactome_nodes_temp/nodes_",i,".rds",sep="") 38 | path_2=paste("interactome_edges_temp/edges_",i,".rds",sep="") 39 | 40 | saveRDS(temp_node,path_1) 41 | saveRDS(temp_edge,path_2) 42 | 43 | } 44 | close(con) 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | # -------------------------------------------------------------------------------- /afdb-regions-network-R/Rscript_10.R: -------------------------------------------------------------------------------- 1 | 2 | ####SCRIPT-10: trimming of the network, clustering of regions from connected components and connecting modules 3 | 4 | 5 | library(igraph) 6 | 7 | pairs=readRDS("cut_001eval/recode_edges_all_FILTER.rds") 8 | modules=readRDS("cut_001eval/result_modules_all.rds") 9 | colnames(pairs)=c("Query","Target","Eval","score") 10 | 11 | pairs=pairs[as.numeric(pairs[,"Eval"])<=0.00001,] 12 | 13 | modules=cbind(modules,"","") 14 | colnames(modules)[(ncol(modules)-1):ncol(modules)]=c("subnet_trim","clust_trim") 15 | 16 | ####sizes of recoded regions 17 | 18 | length_structures=as.numeric(modules[,"end"])-as.numeric(modules[,"start"]) 19 | 20 | ####stablishing filter to trim the network, length<350 and pval<=0.00001 -5 21 | 22 | criteria=pairs[,"Query"]%in%rownames(modules)[length_structures<=350] & 23 | pairs[,"Target"]%in%rownames(modules)[length_structures<=350] 24 | 25 | net=graph_from_data_frame(d=pairs[criteria,c("Query","Target")],directed=F) 26 | E(net)$weight=as.numeric(pairs[criteria,"Eval"]) 27 | E(net)$score=as.numeric(pairs[criteria,"score"]) 28 | subnet=components(net) 29 | 30 | modules[names(subnet$membership),"subnet_trim"]=subnet$membership 31 | 32 | saveRDS(modules,"cut_001eval/result_modules_all_trim350_pval5.rds") 33 | 34 | ####Selection of connected components for walktrap clustering (at least 100 members) 35 | 36 | sel=table(subnet$membership) 37 | sel=sel[sel>=100] 38 | 39 | 40 | 
criteria=pairs[,"Query"]%in%rownames(modules)[modules[,"subnet_trim"]%in%names(sel)] & 41 | pairs[,"Target"]%in%rownames(modules)[modules[,"subnet_trim"]%in%names(sel)] 42 | 43 | pairs=pairs[criteria,] 44 | rm(net) 45 | gc() 46 | 47 | for (i in 1:length(sel)){ 48 | 49 | criteria=pairs[,"Query"]%in%rownames(modules)[modules[,"subnet_trim"]%in%names(sel)[i]] & 50 | pairs[,"Target"]%in%rownames(modules)[modules[,"subnet_trim"]%in%names(sel)[i]] 51 | 52 | 53 | net=graph_from_data_frame(d=pairs[criteria,c("Query","Target")],directed=F) 54 | E(net)$weight=as.numeric(pairs[criteria,"Eval"]) 55 | E(net)$score=as.numeric(pairs[criteria,"score"]) 56 | 57 | cwt=cluster_walktrap( net, 58 | weights = E(net)$weight, 59 | steps = 6, 60 | merges = TRUE, 61 | modularity = TRUE, 62 | membership = TRUE) 63 | 64 | modules[V(net)$name,"clust_trim"]=cwt$membership 65 | 66 | } 67 | 68 | ###saving the tables with connected components and clustr information 69 | 70 | saveRDS(modules,"cut_001eval/result_modules_all_trim350_pval5.rds") 71 | 72 | ################################################################################ 73 | ################################################################################ 74 | ################################################################################ 75 | ####Table of modules (connected components plus clusters) 76 | 77 | modules=readRDS("cut_001eval/result_modules_all_trim350_pval5.rds") 78 | sum(modules[,"clust_trim"]!="") 79 | sum(modules[,"subnet_trim"]!="") 80 | 81 | modules[modules[,"clust_trim"]!="","subnet_trim"]=paste(modules[modules[,"clust_trim"]!="","subnet_trim"], 82 | modules[modules[,"clust_trim"]!="","clust_trim"], 83 | sep=";") 84 | 85 | ###Protein regions that are part of a community (connected component or cluster) 86 | 87 | modules=modules[modules[,"subnet_trim"]!="",] 88 | 89 | mod_pfam_long=readRDS("cut_001eval/result_longCut_all.rds") 90 | mod_pfam_long=mod_pfam_long[mod_pfam_long[,"ID"]%in%rownames(modules),] 91 | 
mod_pfam_long[,"subnet"]=modules[mod_pfam_long[,"ID"],"subnet_trim"] 92 | 93 | #####Table of modules 94 | 95 | pfam_clusters=cbind(table(modules[,"subnet_trim"]), 96 | "", 97 | "", 98 | "", 99 | "", 100 | "", 101 | "", 102 | "", 103 | "") 104 | 105 | colnames(pfam_clusters)=c("counts_chunks", 106 | "counts_chunks_pfam", 107 | "counts_chunks_pfam_075", 108 | "pfam_first", 109 | "pfam_first_count", 110 | "pfam_first_rel", 111 | "pfam_second", 112 | "pfam_second_count", 113 | "pfam_second_rel") 114 | 115 | pfam_clusters=pfam_clusters[as.numeric(pfam_clusters[,1])>=5,] 116 | 117 | ####Annotating the modules with pfam from regions 118 | 119 | modules=modules[modules[,"subnet_trim"]%in%rownames(pfam_clusters),] 120 | mod_pfam_long=mod_pfam_long[mod_pfam_long[,"subnet"]%in%rownames(pfam_clusters),] 121 | 122 | temp=table(modules[modules[,"pfam_counts"]!="","subnet_trim"]) 123 | temp=temp[names(temp)%in%rownames(pfam_clusters)] 124 | pfam_clusters[names(temp),"counts_chunks_pfam"]=temp 125 | 126 | ###intermediate table 127 | 128 | intermedio=mod_pfam_long[,c("subnet","pfam_names","ID")] 129 | intermedio=intermedio[!duplicated(intermedio),] 130 | 131 | for (i in 1:nrow(pfam_clusters)){ 132 | 133 | temp=cbind(intermedio[intermedio[,"subnet"]%in%rownames(pfam_clusters)[i],"pfam_names"], 134 | intermedio[intermedio[,"subnet"]%in%rownames(pfam_clusters)[i],"ID"]) 135 | 136 | if(length(temp)>2){ 137 | 138 | temp1=table(temp[,1]) 139 | temp1=temp1[order(temp1,decreasing=T)] 140 | 141 | if(length(temp1)>1){ 142 | 143 | pfam_clusters[i,c("pfam_first","pfam_second")]=names(temp1)[1:2] 144 | pfam_clusters[i,c("pfam_first_count","pfam_second_count")]=temp1[1:2] 145 | pfam_clusters[i,c("counts_chunks_pfam_075")]=sum(!duplicated(temp[,2])) 146 | 147 | }else{ 148 | 149 | pfam_clusters[i,c("pfam_first","pfam_second")]=c(names(temp1),"") 150 | pfam_clusters[i,c("pfam_first_count","pfam_second_count")]=c(temp1,"") 151 | pfam_clusters[i,c("counts_chunks_pfam_075")]=sum(!duplicated(temp[,2])) 
152 | } 153 | 154 | }else{ 155 | 156 | pfam_clusters[i,c("pfam_first","pfam_second")]=c(temp[1],"") 157 | pfam_clusters[i,c("pfam_first_count","pfam_second_count")]=c(1,"") 158 | pfam_clusters[i,c("counts_chunks_pfam_075")]=1 159 | 160 | } 161 | 162 | } 163 | 164 | ####REcoding for missing 165 | 166 | sum(is.na(pfam_clusters[,"pfam_first"])) 167 | sum(is.na(pfam_clusters[,"pfam_second"])) 168 | 169 | pfam_clusters[is.na(pfam_clusters[,"pfam_first"]),"counts_chunks_pfam_075"]="0" 170 | pfam_clusters[is.na(pfam_clusters[,"pfam_first"]),"pfam_first_count"]="0" 171 | pfam_clusters[is.na(pfam_clusters[,"pfam_first"]),"pfam_first"]="" 172 | pfam_clusters[pfam_clusters[,"pfam_second_count"]=="","pfam_second_count"]="0" 173 | pfam_clusters[pfam_clusters[,"pfam_first_count"]=="","pfam_first_count"]="0" 174 | pfam_clusters[pfam_clusters[,"counts_chunks_pfam_075"]=="0","pfam_first_count"]="0" 175 | pfam_clusters[pfam_clusters[,"counts_chunks_pfam_075"]=="0","pfam_second_count"]="0" 176 | 177 | pfam_clusters[pfam_clusters[,"counts_chunks_pfam_075"]!="0","pfam_first_rel"]=as.numeric(pfam_clusters[pfam_clusters[,"counts_chunks_pfam_075"]!="0","pfam_first_count"])/ 178 | as.numeric(pfam_clusters[pfam_clusters[,"counts_chunks_pfam_075"]!="0","counts_chunks_pfam_075"]) 179 | 180 | pfam_clusters[pfam_clusters[,"counts_chunks_pfam_075"]!="0","pfam_second_rel"]=as.numeric(pfam_clusters[pfam_clusters[,"counts_chunks_pfam_075"]!="0","pfam_second_count"])/ 181 | as.numeric(pfam_clusters[pfam_clusters[,"counts_chunks_pfam_075"]!="0","counts_chunks_pfam_075"]) 182 | 183 | saveRDS(pfam_clusters,"cut_001eval/communities_trim350_pval5_min5struc.rds")###supplementary table 184 | 185 | 186 | ################################################################################ 187 | ################################################################################ 188 | #####Selection of edges connecting modules 189 | 190 | pairs=readRDS("cut_001eval/recode_edges_all_FILTER.rds") 191 | 
colnames(pairs)=c("Query","Target","Eval","score") 192 | 193 | modules=readRDS("cut_001eval/result_modules_all_trim350_pval5.rds") 194 | 195 | modules[modules[,"clust_trim"]!="","subnet_trim"]=paste(modules[modules[,"clust_trim"]!="","subnet_trim"], 196 | modules[modules[,"clust_trim"]!="","clust_trim"], 197 | sep=";") 198 | 199 | ###Filtering 200 | 201 | modules=modules[modules[,"subnet_trim"]!="",] 202 | 203 | temp=table(modules[,"subnet_trim"]) 204 | temp=temp[temp>=10] 205 | 206 | modules=modules[modules[,"subnet_trim"]%in%names(temp),] 207 | 208 | ####Finding the edges connecting modules 209 | 210 | recoding=modules[,"subnet_trim"] 211 | names(recoding)=rownames(modules) 212 | 213 | pairs=pairs[pairs[,"Query"]%in%names(recoding) & 214 | pairs[,"Target"]%in%names(recoding) ,] 215 | 216 | pairs[,"Query"]=recoding[pairs[,"Query"]] 217 | pairs[,"Target"]=recoding[pairs[,"Target"]] 218 | 219 | ###Keeping the ones with smaller evalues 220 | 221 | net=graph_from_data_frame(d=pairs[,c("Query","Target")],directed=F) 222 | E(net)$weight=as.numeric(pairs[,"Eval"]) 223 | E(net)$score=as.numeric(pairs[,"score"]) 224 | 225 | net=igraph::simplify(net,remove.loops = T,remove.multiple = T , edge.attr.comb = c(weight="min",score="max","ignore")) 226 | 227 | saveRDS(cbind(get.edgelist(net,names = T), 228 | E(net)$weight, 229 | E(net)$score), 230 | "cut_001eval/recode_edges_all_comunities_trim350_pval5.rds") 231 | 232 | ################################################################################ 233 | ################################################################################ 234 | ################################################################################ 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | -------------------------------------------------------------------------------- /afdb-regions-network-R/Rscript_2.R: -------------------------------------------------------------------------------- 1 
| 2 | 3 | ###SCRIPT2: cutting for evalue<=0.001 plus protein tables 4 | 5 | 6 | 7 | ###Argument to loop in a cluster system 8 | 9 | i <- as.numeric(commandArgs(trailingOnly = TRUE)) 10 | 11 | path1="interactome_edges_temp/edges_" ###path with the 561 original file partitions 12 | 13 | path2_edges="cut_001eval/interactome_edges_temp/edges_"###new location of filtered tables 14 | path2_nodes="cut_001eval/interactome_nodes_temp/nodes_" 15 | path2_nodes_nonred="cut_001eval/interactome_nodes_temp/nodes_prot" 16 | 17 | library(igraph) 18 | 19 | setwd("/hps/nobackup/beltrao/ibarrioh/AF-3D/071022/interactome_edges_temp") 20 | 21 | ###open original tables, filter names to keep uniprot IDs 22 | pairs=readRDS(paste(path1,i,".rds",sep="")) 23 | colnames(pairs)=c("Query","Target","Eval","score") 24 | pairs=pairs[as.numeric(pairs[,"Eval"])<=0.001,] 25 | pairs[,"Query"]=gsub("-F1-model_v3.cif","",pairs[,"Query"]) 26 | pairs[,"Query"]=gsub("AF-","",pairs[,"Query"]) 27 | pairs[,"Target"]=gsub("-F1-model_v3.cif","",pairs[,"Target"]) 28 | pairs[,"Target"]=gsub("AF-","",pairs[,"Target"]) 29 | gc() 30 | 31 | ###Use igraph to remove duplicities keeping max score and min evalue 32 | net=graph_from_data_frame(d=pairs[,c("Query","Target")],directed=F) 33 | E(net)$weight=as.numeric(pairs[,"Eval"]) 34 | E(net)$score=as.numeric(pairs[,"score"]) 35 | net=igraph::simplify(net,remove.loops = T,remove.multiple = T , edge.attr.comb = c(weight="min",score="max","ignore")) 36 | Isolated = which(igraph::degree(net)==0) 37 | length(Isolated) 38 | net = delete.vertices(net, Isolated) 39 | 40 | nodes=V(net)$name 41 | 42 | ###Saving protein sequence regions 43 | saveRDS(nodes,paste(path2_nodes,i,".rds",sep="")) 44 | saveRDS(cbind(get.edgelist(net,names = T), 45 | E(net)$weight, 46 | E(net)$score),paste(path2_edges,i,".rds",sep="")) 47 | 48 | ###saving unique roteins the regions beong to 49 | nodes=unlist(strsplit(nodes,"_"))[c(T,F,F)] 50 | nodes=nodes[!duplicated(nodes)] 51 | 
saveRDS(nodes,paste(path2_nodes_nonred,i,".rds",sep="")) 52 | -------------------------------------------------------------------------------- /afdb-regions-network-R/Rscript_3.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | ####SCRIPT3: getting ready for hierarchical clustering of regions 4 | 5 | ####Here we generate vectors with uniprot identifiers, start and stop positions for each region 6 | 7 | setwd("cut_001eval/interactome_nodes_temp") 8 | 9 | files=list.files(pattern = "nodes_prot") 10 | files1=list.files(pattern = "node") 11 | files1=files1[!files1%in%files] 12 | 13 | nodes=readRDS(files1[1]) 14 | 15 | for (i in 2:length(files1)){ 16 | 17 | nodes=c(nodes,readRDS(files1[i])) 18 | 19 | } 20 | 21 | nodes=nodes[!duplicated(nodes)] 22 | saveRDS(nodes,"cut_001eval/all_proteins.rds") 23 | 24 | 25 | names_unlist=unlist(strsplit(nodes,"_"))[c(T,F,F)] 26 | saveRDS(names_unlist,"cut_001eval/all_uniprotID.rds") 27 | prots=table(names_unlist) 28 | 29 | print(sum(prots==1)) 30 | print(length(prots)) 31 | 32 | rm(names_unlist) 33 | gc() 34 | 35 | start_unlist=unlist(strsplit(nodes,"_"))[c(F,T,F)] 36 | saveRDS(start_unlist,"cut_001eval/all_start.rds") 37 | 38 | rm(start_unlist) 39 | gc() 40 | 41 | stop_unlist= unlist(strsplit(nodes,"_"))[c(F,F,T)] 42 | saveRDS(stop_unlist,"cut_001eval/all_stop.rds") 43 | 44 | 45 | ####################################################### 46 | #####Generating 580 part files with protein ID, start and stop positions 47 | 48 | ####Lines for proteins with only one region (no clustering) 49 | 50 | names_unlist=readRDS("cut_001eval/all_uniprotID.rds") 51 | start_unlist=readRDS("cut_001eval/all_start.rds") 52 | stop_unlist=readRDS("cut_001eval/all_stop.rds") 53 | 54 | prots=table(names_unlist) 55 | 56 | ##clean out proteins with one region 57 | prots=prots[prots>1] 58 | start_unlist=start_unlist[names_unlist%in%names(prots)] 59 | stop_unlist=stop_unlist[names_unlist%in%names(prots)] 60 | 
names_unlist=names_unlist[names_unlist%in%names(prots)] 61 | 62 | ### regions cut 63 | 64 | coor=cbind(seq(from=1, to=2004717, by=3452), 65 | c(seq(from=3452, to=2004717, by=3452),2004717)) 66 | 67 | pathID="cut_001eval/hierar/uniprotID_" 68 | pathstart="cut_001eval/hierar/start_" 69 | pathstop="cut_001eval/hierar/stop_" 70 | 71 | for (i in 1:nrow(coor)){ 72 | 73 | cluster_temp=names_unlist[names_unlist%in%names(prots)[coor[i,1]:coor[i,2]]] 74 | start_temp=start_unlist[names_unlist%in%cluster_temp] 75 | stop_temp=stop_unlist[names_unlist%in%cluster_temp] 76 | 77 | saveRDS(cluster_temp,paste(pathID,i,sep="")) 78 | saveRDS(start_temp,paste(pathstart,i,sep="")) 79 | saveRDS(stop_temp,paste(pathstop,i,sep="")) 80 | 81 | } 82 | 83 | #################################################### 84 | #################################################### 85 | 86 | 87 | ###Creasome seed 88 | 89 | i <- as.numeric(commandArgs(trailingOnly = TRUE)) 90 | 91 | pathID="/hps/nobackup/beltrao/ibarrioh/AF-3D/181022/cut_001eval/hierar/uniprotID_" 92 | pathstart="/hps/nobackup/beltrao/ibarrioh/AF-3D/181022/cut_001eval/hierar/start_" 93 | pathstop="/hps/nobackup/beltrao/ibarrioh/AF-3D/181022/cut_001eval/hierar/stop_" 94 | pathcluster="/hps/nobackup/beltrao/ibarrioh/AF-3D/181022/cut_001eval/hierar/cluster_" 95 | ####load the vectors (protein ID, start and stop of each region; same masks index all three) 96 | 97 | vector_guia_uniprot=readRDS(paste(pathID,i,sep="")) 98 | vector_guia_from=readRDS(paste(pathstart,i,sep="")) 99 | vector_guia_to=readRDS(paste(pathstop,i,sep="")) 100 | vector_guia_cluster=rep(1,length(vector_guia_to)) 101 | protes=table(vector_guia_uniprot) 102 | 103 | ###per protein: hierarchical clustering of its (start, stop) pairs, tree cut at height 250 104 | for (j in 1:length(protes)){ 105 | 106 | temp=cbind(vector_guia_from[vector_guia_uniprot%in%names(protes)[j]], 107 | vector_guia_to[vector_guia_uniprot%in%names(protes)[j]]) 108 | 109 | vector_guia_cluster[vector_guia_uniprot%in%names(protes)[j]]=cutree(hclust(dist(temp,method = "euclidean")),h= 250) 110 | 111 | } 112 | ###sep="" so the output is cluster_<i>, named like uniprotID_<i>/start_<i>/stop_<i> (paste() defaults to sep=" ") 113 | saveRDS(vector_guia_cluster,paste(pathcluster,i,sep="")) 114 | 
115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /afdb-regions-network-R/Rscript_4.R: -------------------------------------------------------------------------------- 1 | #################################################### 2 | 3 | ###SCRIPT 4: hierarchical clustering of regions per protein 4 | 5 | ###This script is meant to run inside a loop (partition index i comes from the command line) 6 | 7 | i <- as.numeric(commandArgs(trailingOnly = TRUE)) 8 | 9 | pathID="cut_001eval/hierar/uniprotID_" 10 | pathstart="cut_001eval/hierar/start_" 11 | pathstop="cut_001eval/hierar/stop_" 12 | pathcluster="cut_001eval/hierar/cluster_" 13 | 14 | ####Protein ID plus start stop positions 15 | ####(the same masks index all three vectors: one entry per region) 16 | vector_guia_uniprot=readRDS(paste(pathID,i,sep="")) 17 | vector_guia_from=readRDS(paste(pathstart,i,sep="")) 18 | vector_guia_to=readRDS(paste(pathstop,i,sep="")) 19 | vector_guia_cluster=rep(1,length(vector_guia_to)) 20 | protes=table(vector_guia_uniprot) 21 | 22 | ###per protein: hierarchical clustering of its (start, stop) pairs, tree cut at height 250 23 | for (j in 1:length(protes)){ 24 | 25 | temp=cbind(vector_guia_from[vector_guia_uniprot%in%names(protes)[j]], 26 | vector_guia_to[vector_guia_uniprot%in%names(protes)[j]]) 27 | 28 | vector_guia_cluster[vector_guia_uniprot%in%names(protes)[j]]=cutree(hclust(dist(temp,method = "euclidean")),h= 250) 29 | 30 | } 31 | ###sep="" so the output is cluster_<i>, named like uniprotID_<i>/start_<i>/stop_<i> (paste() defaults to sep=" ") 32 | saveRDS(vector_guia_cluster,paste(pathcluster,i,sep="")) 33 | 34 | 35 | -------------------------------------------------------------------------------- /afdb-regions-network-R/Rscript_5.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | ####SCRIPT5: assembling clustering results to one file 4 | 5 | setwd("cut_001eval/hierar") 6 | 7 | paths_clust=files=list.files(pattern="cluster") 8 | paths_start=files=list.files(pattern="start") 9 | paths_stop=files=list.files(pattern="stop") 10 | 
paths_uniprot=files=list.files(pattern="uniprot") 11 | 12 | paths_uniprot=paths_uniprot[!paths_uniprot%in%"uniprotID_0"] 13 | paths_start=paths_start[!paths_start%in%"start_0"] 14 | paths_stop=paths_stop[!paths_stop%in%"stop_0"] 15 | 16 | ####Empezamos 17 | 18 | temp=readRDS("uniprotID_0") 19 | 20 | names_vect=paste(gsub("_1","",temp), 21 | readRDS("start_0"), 22 | readRDS("stop_0"),sep="_") 23 | 24 | final_vect=temp 25 | names(final_vect)=names_vect 26 | 27 | for (i in 1:length(paths_uniprot)){ 28 | 29 | temp_uni=readRDS(paths_uniprot[i]) 30 | temp_from=readRDS(paths_start[i]) 31 | temp_to=readRDS(paths_stop[i]) 32 | temp_cl=readRDS(paths_clust[i]) 33 | 34 | temp=paste(temp_uni,temp_cl,sep="_") 35 | names(temp)=paste(temp_uni,temp_from,temp_to,sep="_") 36 | 37 | final_vect=c(final_vect,temp) 38 | 39 | } 40 | 41 | saveRDS(final_vect,"cut_001eval/mapping_chunks.rds") 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /afdb-regions-network-R/Rscript_6.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | #####SCRIPT6: recoding the edges (similarity based on clustering results) 7 | 8 | ####Script meant to run inside a loop 9 | 10 | library("igraph") 11 | 12 | i <- as.numeric(commandArgs(trailingOnly = TRUE)) 13 | 14 | path=paste("cut_001eval/interactome_edges_temp/edges_",i,".rds",sep="") 15 | 16 | ####loading previous files 17 | pairs=readRDS(path) 18 | colnames(pairs)=c("Query","Target","Eval","score") 19 | net=graph_from_data_frame(d=pairs[,c("Query","Target")],directed=F) 20 | 21 | #loading mapping 22 | nodes=readRDS("cut_001eval/mapping_chunks.rds") 23 | nodes=nodes[names(nodes)%in%V(net)$name] 24 | gc() 25 | 26 | pairs=pairs[pairs[,"Query"]%in%names(nodes) & 27 | pairs[,"Target"]%in%names(nodes),] 28 | 29 | pairs[,"Query"]=nodes[pairs[,"Query"]] 30 | pairs[,"Target"]=nodes[pairs[,"Target"]] 31 | 32 | ###recoding the network 
and eliminating duplicates 33 | 34 | net=graph_from_data_frame(d=pairs[,c("Query","Target")],directed=F) 35 | E(net)$weight=as.numeric(pairs[,"Eval"]) 36 | E(net)$score=as.numeric(pairs[,"score"]) 37 | net=igraph::simplify(net,remove.loops = T,remove.multiple = T , edge.attr.comb = c(weight="min",score="max","ignore")) 38 | Isolated = which(igraph::degree(net)==0) 39 | length(Isolated) 40 | net = delete.vertices(net, Isolated) 41 | 42 | ###New edges 43 | 44 | path_node=paste("cut_001eval/recode/node_",i,".rds",sep="") 45 | path_edge=paste("cut_001eval/recode/edge_",i,".rds",sep="") 46 | 47 | saveRDS(nodes,path_node) 48 | saveRDS(cbind(get.edgelist(net,names = T), 49 | E(net)$weight, 50 | E(net)$score),path_edge) 51 | 52 | ############## 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /afdb-regions-network-R/Rscript_7.R: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | ############################################################################ 3 | 4 | ###SCRIPT7: assembling the recoded tables, selection of connected components 5 | 6 | #####Part1: assembly 7 | ####EDGES 8 | 9 | path="cut_001eval/recode/edge_" 10 | 11 | 12 | i=0 13 | 14 | edges=readRDS(paste(path,i,".rds",sep="")) 15 | 16 | for (i in 1:560){ 17 | 18 | edges=rbind(edges, 19 | readRDS(paste(path,i,".rds",sep=""))) 20 | 21 | } 22 | 23 | colnames(edges)=c("Query","Target","Eval","score") 24 | 25 | ###saving the assembly 26 | 27 | saveRDS(edges,"cut_001eval/recode_edges_all.rds") 28 | 29 | ####NODES 30 | 31 | path="cut_001eval/recode/node_" 32 | 33 | i=0 34 | ###node files are named vectors (names = original regions, values = recoded IDs); unique() would drop the names that scripts 8 and 9 read, so partition 0 is loaded as-is like partitions 1..560 35 | nodes=readRDS(paste(path,i,".rds",sep="")) 36 | 37 | for (i in 1:560){ 38 | 39 | nodes=c(nodes, 40 | readRDS(paste(path,i,".rds",sep=""))) 41 | 42 | } 43 | 44 | ###saving the assembly 45 | 46 | 
saveRDS(nodes,"cut_001eval/recode_nodes_all.rds") 47 | 48 | rm(edges) 49 | rm(nodes) 50 | 51 | ############################################################### 52 | ############################################################### 53 | 54 | ###Part2: components 55 | 56 | ####We are going to selected connected components with at least 5 nodes 57 | 58 | library(igraph) 59 | 60 | ################################################### 61 | ####Volvemos localmente a ver que sale de todo esto 62 | 63 | pairs=readRDS("cut_001eval/recode_edges_all.rds") 64 | 65 | net=graph_from_data_frame(d=pairs[,c("Query","Target")],directed=F) 66 | E(net)$weight=as.numeric(pairs[,"Eval"]) 67 | E(net)$score=as.numeric(pairs[,"score"]) 68 | net=igraph::simplify(net,remove.loops = T,remove.multiple = T , edge.attr.comb = c(weight="min",score="max","ignore")) 69 | Isolated = which(igraph::degree(net)==0) 70 | length(Isolated) 71 | net = delete.vertices(net, Isolated) 72 | 73 | ##saving assmbled edges after removing dups 74 | 75 | saveRDS(cbind(get.edgelist(net,names = T), 76 | E(net)$weight, 77 | E(net)$score),"cut_001eval/recode_edges_all_FILTER.rds") 78 | 79 | ###Components 80 | 81 | subnet=components(net) 82 | 83 | nodes_sel=subnet$membership[subnet$membership%in%names(table(subnet$membership)[table(subnet$membership)>=5])] 84 | 85 | saveRDS(nodes_sel,"cut_001eval/recode_nodes_all_FILTER_components5.rds") 86 | saveRDS(pairs[pairs[,1]%in%names(nodes_sel) & 87 | pairs[,2]%in%names(nodes_sel) ,],"cut_001eval/recode_edges_all_FILTER_components5.rds") 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /afdb-regions-network-R/Rscript_8.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ####Script8: PFAM annotation 5 | 6 | ###Selecting pfam mapped to our sequences from the database 7 | 8 | genes=readRDS("cut_001eval/recode_nodes_all_FILTER_components5.rds") 9 | 
genes=names(genes) 10 | genes=unlist(strsplit(genes,"_"))[c(T,F)] 11 | length(genes) 12 | 13 | raw_data_path <- "Pfam-A.regions.uniprot.tsv.gz" 14 | raw_data_nrow <- length(count.fields(raw_data_path)) 15 | 16 | ###open connection 17 | con = file(raw_data_path, "r") 18 | ###Getting ready for filtering: first read (with header) takes all rows except the last 2.72e+08, which the loop below reads in 136 chunks of 2,000,000 19 | filtered_data=read.delim(con, nrows=raw_data_nrow-2.72e+08) 20 | filtered_data=filtered_data[filtered_data[,1]%in%genes ,c("uniprot_acc","pfamA_acc","seq_start","seq_end")] 21 | rownames(filtered_data)=NULL 22 | gc() 23 | 24 | for (i in 1:136){ 25 | 26 | temp=read.delim(con, nrows=2000000,header=F) 27 | temp=temp[temp[,1]%in%genes ,c(1,5,6,7)] 28 | 29 | colnames(temp)= colnames(filtered_data) 30 | 31 | filtered_data=rbind(filtered_data, 32 | temp) 33 | gc() 34 | print(i) 35 | 36 | } 37 | close(con) 38 | ###written to cut_001eval/ (not cut_eval001/): the second half of this script reads it back from there 39 | saveRDS(filtered_data, 40 | "cut_001eval/filtered_pfam_cut001.rds") 41 | 42 | ########################################################################################## 43 | 44 | ####Splitting data to run annotation in parallel 45 | 46 | ###All positions 47 | nodes=readRDS("cut_001eval/recode_nodes_all.rds") 48 | ###All components 49 | modules=readRDS("cut_001eval/recode_nodes_all_FILTER_components5.rds") 50 | modules=cbind(modules, 51 | unlist(strsplit(names(modules),"_"))[c(T,F)]) 52 | modules=cbind(modules,"","") 53 | colnames(modules)=c("subnet","uniprot","from","to") 54 | ###pfam domains 55 | pfam=as.matrix(readRDS("cut_001eval/filtered_pfam_cut001.rds")) 56 | pfam=pfam[pfam[,"uniprot_acc"]%in%modules[,"uniprot"],] 57 | 58 | ####coordinates to cut in 1000 files 59 | coor=cbind(seq(from=1, to=3725915, by=3726), 60 | c(seq(from=3726, to=3725915, by=3726),3725915)) 61 | 62 | path_node="cut_001eval/pfam_anot/recode_nodes_" 63 | path_module="cut_001eval/pfam_anot/modules_" 64 | path_pfam="cut_001eval/pfam_anot/pfam_" 65 | 66 | for (i in 1:nrow(coor)){ 67 | 68 | temp_module=modules[coor[i,1]:coor[i,2],] 69 | 
saveRDS(nodes[nodes%in%rownames(temp_module)],paste(path_node,i,sep="")) 70 | saveRDS(pfam[pfam[,"uniprot_acc"]%in%temp_module[,"uniprot"],],paste(path_pfam,i,sep="")) 71 | saveRDS(temp_module,paste(path_module,i,sep="")) 72 | } 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /afdb-regions-network-R/Rscript_9.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #####SCRIPT9 : annotation of regions using pfam 4 | 5 | ####Lines to annotate a region using pfam domains,, 75% of the domain must be included in the region to be considered 6 | 7 | ###Ment to run inside a loop 8 | 9 | 10 | i <- as.numeric(commandArgs(trailingOnly = TRUE)) 11 | 12 | 13 | path_node="cut_001eval/pfam_anot/recode_nodes_" 14 | path_module="cut_001eval/pfam_anot/modules_" 15 | path_pfam="cut_001eval/pfam_anot/pfam_" 16 | 17 | nodes=readRDS(paste(path_node,i,sep="")) 18 | modules=readRDS(paste(path_module,i,sep="")) 19 | pfam=readRDS(paste(path_pfam,i,sep="")) 20 | 21 | for (j in 1:nrow(modules)){ 22 | 23 | temp=names(nodes)[nodes==rownames(modules)[j]] 24 | modules[j,"from"]=min(as.numeric(unlist(strsplit(temp,"_"))[c(F,T,F)])) 25 | modules[j,"to"]=max(as.numeric(unlist(strsplit(temp,"_"))[c(F,F,T)])) 26 | 27 | } 28 | 29 | modules=cbind(modules,"","") 30 | colnames(modules)=c("subnet","uniprot","start","end","pfam_names","pfam_counts") 31 | 32 | ###Ahora anotacion de pfams 33 | 34 | for (j in c(1:nrow(modules))[modules[,"uniprot"]%in%pfam[,"uniprot_acc"]]){ 35 | 36 | criteria=pfam[,"uniprot_acc"]%in%modules[j,"uniprot"] & ( 37 | (as.numeric(pfam[,"seq_start"])>=as.numeric( modules[j,"start"]) & 38 | as.numeric(pfam[,"seq_end"])<=as.numeric( modules[j,"end"])) | 39 | (as.numeric(pfam[,"seq_start"])<=as.numeric( modules[j,"start"]) & 40 | as.numeric(pfam[,"seq_end"])>=as.numeric( modules[j,"start"])) | 41 | (as.numeric(pfam[,"seq_start"])<=as.numeric( modules[j,"end"]) & 42 
| as.numeric(pfam[,"seq_end"])>=as.numeric( modules[j,"end"]))) 43 | temp=unique(pfam[criteria,"pfamA_acc"]) 44 | modules[j,"pfam_names"]=paste(temp,collapse=";") 45 | modules[j,"pfam_counts"]=length(temp) 46 | 47 | } 48 | 49 | ####Ahora vamos a buscar el porcentaje 50 | 51 | modules=cbind(modules,"") 52 | colnames(modules)[ncol(modules)]="pfam_chunk_rel" 53 | 54 | for (j in c(1:nrow(modules))[modules[,"pfam_counts"]=="1"]){ 55 | 56 | temp=as.numeric(pfam[pfam[,"uniprot_acc"]%in%modules[j,"uniprot"] & ( 57 | (as.numeric(pfam[,"seq_start"])>=as.numeric( modules[j,"start"]) & 58 | as.numeric(pfam[,"seq_end"])<=as.numeric( modules[j,"end"])) | 59 | (as.numeric(pfam[,"seq_start"])<=as.numeric( modules[j,"start"]) & 60 | as.numeric(pfam[,"seq_end"])>=as.numeric( modules[j,"start"])) | 61 | (as.numeric(pfam[,"seq_start"])<=as.numeric( modules[j,"end"]) & 62 | as.numeric(pfam[,"seq_end"])>=as.numeric( modules[j,"end"]))) ,c("seq_start","seq_end")]) 63 | 64 | A=c(temp[1]:temp[2]) 65 | B=c(as.numeric(modules[j,"start"]):as.numeric(modules[j,"end"])) 66 | 67 | modules[j,"pfam_chunk_rel"]=round(sum(A%in%B)/length(A),3) 68 | 69 | } 70 | 71 | 72 | modules=cbind(modules,"") 73 | colnames(modules)[ncol(modules)]="pfam_all" 74 | 75 | modules[modules[,"pfam_counts"]=="1","pfam_all"]=modules[modules[,"pfam_counts"]=="1","pfam_names"] 76 | 77 | 78 | for (j in c(1:nrow(modules))[modules[,"pfam_counts"]%in%c("2","3","4","5","6","7","8","9")]){ 79 | 80 | temp=cbind(pfam[pfam[,"uniprot_acc"]%in%modules[j,"uniprot"] & ( 81 | (as.numeric(pfam[,"seq_start"])>=as.numeric( modules[j,"start"]) & 82 | as.numeric(pfam[,"seq_end"])<=as.numeric( modules[j,"end"])) | 83 | (as.numeric(pfam[,"seq_start"])<=as.numeric( modules[j,"start"]) & 84 | as.numeric(pfam[,"seq_end"])>=as.numeric( modules[j,"start"])) | 85 | (as.numeric(pfam[,"seq_start"])<=as.numeric( modules[j,"end"]) & 86 | as.numeric(pfam[,"seq_end"])>=as.numeric( modules[j,"end"]))) ,c("pfamA_acc","seq_start","seq_end")],"") 87 | 88 | for 
(k in 1:nrow(temp)){ 89 | 90 | A=c(as.numeric(temp[k,"seq_start"]):as.numeric(temp[k,"seq_end"])) 91 | B=c(as.numeric(modules[j,"start"]):as.numeric(modules[j,"end"])) 92 | temp[k,4]=round(sum(A%in%B)/length(A),3) 93 | 94 | } 95 | 96 | modules[j,"pfam_chunk_rel"]=paste( temp[,4],collapse=";") 97 | modules[j,"pfam_all"]=paste( temp[,1],collapse=";") 98 | 99 | } 100 | 101 | 102 | ################################################ 103 | ################################################ 104 | ############Long version table (one line per pfam per domain) 105 | 106 | 107 | mod_pfam_long=modules[modules[,"pfam_counts"]=="1",c("subnet", 108 | "uniprot", 109 | "start", 110 | "end", 111 | "pfam_names", 112 | "pfam_counts", 113 | "pfam_chunk_rel")] 114 | mod_pfam_long=cbind(rownames(mod_pfam_long),mod_pfam_long) 115 | colnames(mod_pfam_long)[1]="ID" 116 | 117 | temp_pfam=modules[modules[,"pfam_counts"]%in%c("2","3","4","5","6","7","8","9"),] 118 | 119 | temp=unlist(strsplit(temp_pfam[1,"pfam_all"],";")) 120 | primer=cbind(rep(rownames(temp_pfam)[1],length(temp)), 121 | rep(temp_pfam[1,"subnet"],length(temp)), 122 | rep(temp_pfam[1,"uniprot"],length(temp)), 123 | rep(temp_pfam[1,"start"],length(temp)), 124 | rep(temp_pfam[1,"end"],length(temp)), 125 | temp, 126 | rep(temp_pfam[1,"pfam_counts"],length(temp)), 127 | unlist(strsplit(temp_pfam[1,"pfam_chunk_rel"],";"))) 128 | 129 | for (j in 2:nrow(temp_pfam)){ 130 | 131 | temp=unlist(strsplit(temp_pfam[j,"pfam_all"],";")) 132 | temp1=cbind(rep(rownames(temp_pfam)[j],length(temp)), 133 | rep(temp_pfam[j,"subnet"],length(temp)), 134 | rep(temp_pfam[j,"uniprot"],length(temp)), 135 | rep(temp_pfam[j,"start"],length(temp)), 136 | rep(temp_pfam[j,"end"],length(temp)), 137 | temp, 138 | rep(temp_pfam[j,"pfam_counts"],length(temp)), 139 | unlist(strsplit(temp_pfam[j,"pfam_chunk_rel"],";"))) 140 | 141 | primer=rbind(primer,temp1) 142 | 143 | } 144 | 145 | mod_pfam_long=rbind(mod_pfam_long, 146 | primer) 147 | rownames(mod_pfam_long)=NULL 148 
| 149 | ############################################ 150 | ############################################ 151 | ############################################ 152 | ####Saving the files 153 | 154 | path_result="cut_001eval/pfam_anot/result_modules_"###Table with annotation 155 | path_result_long="cut_001eval/pfam_anot/result_long_"###long format 156 | path_result_long_cut="cut_001eval/pfam_anot/result_longCut_"###Long format keeping only annotation 157 | 158 | saveRDS(modules,paste(path_result,i,sep="")) 159 | saveRDS(mod_pfam_long,paste(path_result_long,i,sep="")) 160 | saveRDS(mod_pfam_long[as.numeric(mod_pfam_long[,"pfam_chunk_rel"])>=0.75,],paste(path_result_long_cut,i,sep="")) 161 | 162 | 163 | 164 | ############################################ 165 | ############################################ 166 | #### 167 | ####Assembling all together 168 | 169 | path_result="cut_001eval/pfam_anot/result_modules_" 170 | path_result_long="cut_001eval/pfam_anot/result_long_" 171 | path_result_long_cut="cut_001eval/pfam_anot/result_longCut_" 172 | 173 | ####Primero los resultados 174 | 175 | table1=readRDS(paste(path_result,1,sep="")) 176 | 177 | for (i in 2:1000){ 178 | 179 | table1=rbind(table1, 180 | readRDS(paste(path_result,i,sep=""))) 181 | 182 | } 183 | 184 | saveRDS(table1,"cut_001eval/result_modules_all.rds") 185 | 186 | ######## 187 | 188 | table1=readRDS(paste(path_result_long,1,sep="")) 189 | 190 | for (i in 2:1000){ 191 | 192 | table1=rbind(table1, 193 | readRDS(paste(path_result_long,i,sep=""))) 194 | 195 | } 196 | 197 | saveRDS(table1,"cut_001eval/result_long_all.rds") 198 | 199 | ######## 200 | 201 | table1=readRDS(paste(path_result_long_cut,1,sep="")) 202 | 203 | for (i in 2:1000){ 204 | 205 | table1=rbind(table1, 206 | readRDS(paste(path_result_long_cut,i,sep=""))) 207 | 208 | } 209 | 210 | saveRDS(table1,"cut_001eval/result_longCut_all.rds") 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 
231 | -------------------------------------------------------------------------------- /commands: -------------------------------------------------------------------------------- 1 | ### Removing fragments 2 | 3 | # removing fragments and alter the centroid 4 | awk 'BEGIN{newclu=""} FNR==NR{f[$2]=1;next} ($1 != rename){newclu="";rename=""} ($1 in f && $2 in f){rename=$1; next} ($1 == rename && !($2 in f) && newclu==""){newclu=$2; print $2"\t"$2; next} ($1 == rename && !($2 in f) && newclu!=""){print newclu"\t"$2; next} !($2 in f){print}' /storage/martin/foldseek_cluster/uniprot_trembl_sprot_fragments.ids /storage/martin/foldseek_cluster/afdb50best_foldseek_clu.tsv > afdb50best_foldseek_clu_nofragments.tsv & 5 | 6 | # tsv2db 7 | foldseek tsv2db afdb50best_foldseek_clu_nofragments.tsv ./databases/afdb50best_foldseek_clu_nofragments --output-dbtype 6 8 | 9 | # w/o frag to tsv file 10 | mmseqs createtsv /storage/martin/foldseek_cluster/afdb /storage/martin/foldseek_cluster/afdb ./databases/afdb50best_foldseek_clu_nofragments ./databases/afdb50best_foldseek_clu_nofragments.tsv 11 | 12 | # count the number w/o frag 13 | wc -l databases/afdb50best_foldseek_clu_nofragments.index # 15315246 14 | 15 | ### Removing singletons 16 | 17 | # count the number of members 18 | awk '{id[$1]+=1; next; } END { for (key in id) print key"\t"id[key]}' afdb50best_foldseek_clu_nofragments.tsv > afdb50best_foldseek_clu_nofragments-repIndex_nMem.tsv 19 | 20 | # count the number of singletones 21 | awk '$2==1 {n+=1;} END {print n}' afdb50best_foldseek_clu_nofragments-repIndex_nMem.tsv # 13012338 22 | 23 | # find the rep Ids of singleotns 24 | awk '$2==1 {print $1}' afdb50best_foldseek_clu_nofragments-repIndex_nMem.tsv > singleton_repIndex 25 | 26 | # remove singletons 27 | awk 'FNR==NR {singletons[$1]=1; next;} !($1 in singletons) {print $0}' singleton_repIndex afdb50best_foldseek_clu_nofragments.tsv > afdb50best_foldseek_clu_nofrag_nosingletons.tsv 28 | 29 | # create nosingleton db 30 | 
foldseek tsv2db afdb50best_foldseek_clu_nofrag_nosingletons.tsv ./databases/afdb50best_foldseek_clu_nofrag_nosingletons --output-dbtype 6 31 | 32 | # tsv - repId memId 33 | mmseqs createtsv /storage/martin/foldseek_cluster/afdb /storage/martin/foldseek_cluster/afdb ./databases/afdb50best_foldseek_clu_nofrag_nosingletons ./databases/afdb50best_foldseek_clu_nofrag_nosingletons.tsv 34 | 35 | ### Create AFDB clusters database 36 | 37 | # pick reps 38 | awk '!($1 in id) {print $1; id[$1]=1; next;}' databases/afdb50best_foldseek_clu_nofrag_nosingletons.tsv > ./databases/afdb50best_foldseek_clu_nofrag_nosingletons-reps.tsv 39 | 40 | # representative sequences db 41 | foldseek createsubdb databases/afdb50best_foldseek_clu_nofrag_nosingletons-reps.tsv /storage/martin/foldseek_cluster/afdb databases/afdb50best_foldseek_clu_nofrag_nosingletons_repseqs --id-mode 1 42 | foldseek createsubdb databases/afdb50best_foldseek_clu_nofrag_nosingletons_repseqs.index /storage/martin/foldseek_cluster/afdb_ss databases/afdb50best_foldseek_clu_nofrag_nosingletons_repseqs_ss 43 | foldseek createsubdb databases/afdb50best_foldseek_clu_nofrag_nosingletons_repseqs.index /storage/martin/foldseek_cluster/afdb_ca databases/afdb50best_foldseek_clu_nofrag_nosingletons_repseqs_ca 44 | 45 | ### Analyze AFDB clusters 46 | 47 | # count how many clusters left 48 | wc -l ./databases/afdb50best_foldseek_clu_nofrag_nosingletons_repseqs.index # 2302908 49 | 50 | # compute LCA 51 | mmseqs lca /storage/martin/foldseek_cluster/afdb50best databases/afdb50best_foldseek_clu_nofrag_nosingletons lca/afdb50best_foldseek_nofrag_nosingletons_lca --tax-lineage 1 52 | 53 | # make lca report 54 | mmseqs taxonomyreport /storage/martin/foldseek_cluster/afdb50best lca/afdb50best_foldseek_nofrag_nosingletons_lca lca/afdb50best_foldseek_nofrag_nosingletons_lca.report 55 | 56 | # create lca tsv 57 | mmseqs createtsv /storage/martin/foldseek_cluster/afdb lca/afdb50best_foldseek_nofrag_nosingletons_lca 
./lca/afdb50best_foldseek_clu_nofragments_nosingleton_lca.tsv 58 | 59 | # print the number of clusters conserved to high superkingdoms 60 | awk -F "\t" 'BEGIN {interest["Bacteria"]=1; interest["Archaea"]=1; interest["cellular organisms"]=1; interest["Eukaryota"]=1;} {gsub(/^\s*/, "", $6)} $6 in interest {print $6"\t"$3}' lca/afdb50best_foldseek_nofrag_nosingletons_lca.report 61 | # OUTPUT is 62 | # cellular organisms 529373 63 | # Bacteria 370762 64 | # Eukaryota 311226 65 | # Archaea 11336 66 | 67 | # revise Fig 3 A 68 | 69 | # align lddt, tm-score values 70 | 71 | 72 | ### Summary file 73 | 74 | # number of members 75 | awk '{id[$1]+=1;} END {for (key in id) print key"\t"id[key]}' ./databases/afdb50best_foldseek_clu_nofrag_nosingletons.tsv > ./summary/repId_nMem.tsv & 76 | 77 | # length 78 | awk 'FNR==NR {id[$1]=$2; next;} $1 in id {print id[$1]"\t"$3-2}' /storage/martin/foldseek_cluster/afdb.lookup /storage/martin/foldseek_cluster/afdb.index > ./summary/entryId_length.tsv & 79 | 80 | # avg length 81 | awk 'FNR==NR {len[$1]=$2; next;} {slen[$1] += len[$2]; n[$1]+=1;} END {for (key in slen) print key"\t"slen[key]/n[key]} ' ./summary/entryId_length.tsv ./databases/afdb50best_foldseek_clu_nofrag_nosingletons.tsv > ./summary/repId_avgLen.tsv & 82 | 83 | # avg plddt 84 | awk 'FNR==NR {plddt[$1]=$2; next;} {splddt[$1] += plddt[$2]; n[$1]+=1;} END {for (key in splddt) print key"\t"splddt[key]/n[key]} ' ./plddt/entryId_plddt.tsv ./databases/afdb50best_foldseek_clu_nofrag_nosingletons.tsv > ./summary/repId_avgPlddt.tsv & 85 | 86 | # + repLen 87 | awk 'FNR==NR {nMem[$1]=$0; next;} $1 in nMem {print nMem[$1]"\t"$2}' ./summary/repId_nMem.tsv ./summary/entryId_length.tsv > ./summary/repId_nMem_repLen.tsv & 88 | # + avgLen 89 | awk 'FNR==NR {prev[$1]=$0; next;} $1 in prev {print prev[$1]"\t"$2}' ./summary/repId_nMem_repLen.tsv ./summary/repId_avgLen.tsv > ./summary/repId_nMem_repLen_avgLen.tsv & 90 | # + repPlddt 91 | awk 'FNR==NR {prev[$1]=$0; next;} $1 in prev {print
prev[$1]"\t"$2}' ./summary/repId_nMem_repLen_avgLen.tsv ./plddt/entryId_plddt.tsv > ./summary/repId_nMem_repLen_avgLen_repPlddt.tsv & 92 | # + avgPlddt 93 | awk 'FNR==NR {prev[$1]=$0; next;} $1 in prev {print prev[$1]"\t"$2}' ./summary/repId_nMem_repLen_avgLen_repPlddt.tsv ./summary/repId_avgPlddt.tsv > ./summary/repId_nMem_repLen_avgLen_repPlddt_avgPlddt.tsv & 94 | # + taxId, lineage 95 | awk -F "\t" 'FNR==NR {prev[$1]=$0; next;} $1 in prev {print prev[$1]"\t"$2"\t"$5}' ./summary/repId_nMem_repLen_avgLen_repPlddt_avgPlddt.tsv lca/afdb50best_foldseek_clu_nofragments_nosingleton_lca.tsv > ./summary/repId_nMem_repLen_avgLen_repPlddt_avgPlddt_taxId_lineage.tsv 96 | -------------------------------------------------------------------------------- /commands_darkening: -------------------------------------------------------------------------------- 1 | ### Darkening 2 | 3 | # mmseqs pfam search 4 | srun -p compute -w super002 -c 16 -t 2-0 mmseqs search ./databases/afdb50best_foldseek_clu_nofrag_nosingletons_repseqs ../../cluster_analysis_old/pfam_pdb/pfam darkening/mmseqs_pfam/mmseqs_pfam tmp -e 0.1 --threads 32 -s 7.5 --max-seqs 100000 5 | 6 | # convert mmseqs pfam search result to alns 7 | mmseqs convertalis ./databases/afdb50best_foldseek_clu_nofrag_nosingletons_repseqs ../../cluster_analysis_old/pfam_pdb/pfam ./darkening/mmseqs_pfam/mmseqs_pfam ./darkening/aln_mmseqs_pfam 8 | 9 | # foldseek pdb search 10 | srun -p compute -w super001 -c 16 -t 2-0 foldseek search ./databases/afdb50best_foldseek_clu_nofrag_nosingletons_repseqs ../../cluster_analysis_old/pfam_pdb/pdb ./darkening/foldseek_pdb/foldseek_pdb tmp -e 0.1 --threads 32 11 | 12 | # convert foldseek pdb search result to alns 13 | foldseek convertalis ./databases/afdb50best_foldseek_clu_nofrag_nosingletons_repseqs ../../cluster_analysis_old/pfam_pdb/pdb ./darkening/foldseek_pdb/foldseek_pdb ./darkening/aln_foldseek_pdb 14 | 15 | ## uniprot&sprot pfam, tigrfam search 16 | 17 | # sprot pfam 18 | awk 'FNR==NR
{id[$1]=$2; next;} $2 in id {print $1"\t"$2"\t"id[$2]} ' ../replicate_find_dark_cluster/web_hit/accession_pfam_sprot_long_semi.tsv databases/afdb50best_foldseek_clu_nofrag_nosingletons.tsv > ./darkening/ftp_pfam_tigrfam/hit_pfam_sprot.tsv & 19 | # uniprot pfam 20 | awk 'FNR==NR {id[$1]=$2; next;} $2 in id {print $1"\t"$2"\t"id[$2]} ' ../replicate_find_dark_cluster/web_hit/accession_pfam_uniprot_long_semi.tsv databases/afdb50best_foldseek_clu_nofrag_nosingletons.tsv > ./darkening/ftp_pfam_tigrfam/hit_pfam_uniprot.tsv & 21 | # sprot tigrfam 22 | awk 'FNR==NR {id[$1]=$2; next;} $2 in id {print $1"\t"$2"\t"id[$2]} ' ../replicate_find_dark_cluster/web_hit/accession_tigrfam_sprot_long_semi.tsv databases/afdb50best_foldseek_clu_nofrag_nosingletons.tsv > ./darkening/ftp_pfam_tigrfam/hit_tigrfam_sprot.tsv & 23 | # uniprot tigrfam 24 | awk 'FNR==NR {id[$1]=$2; next;} $2 in id {print $1"\t"$2"\t"id[$2]} ' ../replicate_find_dark_cluster/web_hit/accession_tigrfam_uniprot_long_semi.tsv databases/afdb50best_foldseek_clu_nofrag_nosingletons.tsv > ./darkening/ftp_pfam_tigrfam/hit_tigrfam_uniprot.tsv & 25 | 26 | # integrate all ftp hits 27 | awk '{id[$1]=1} END { for (key in id) print key}' ./darkening/ftp_pfam_tigrfam/hit_pfam_sprot.tsv ./darkening/ftp_pfam_tigrfam/hit_pfam_uniprot.tsv ./darkening/ftp_pfam_tigrfam/hit_tigrfam_sprot.tsv ./darkening/ftp_pfam_tigrfam/hit_tigrfam_uniprot.tsv > ./darkening/all_ftp_pfam_tigrfam 28 | 29 | # find reps w/o pdb hit by foldseek 30 | awk 'FNR==NR {id[$1]=1; next;} !($1 in id) {print $0}' ./darkening/aln_foldseek_pdb ./databases/afdb50best_foldseek_clu_nofrag_nosingletons-reps.tsv > ./darkening/without_foldseek-pdb 31 | # residues 32 | wc -l ./darkening/without_foldseek-pdb # 1135118 33 | 34 | # find reps w/o ( * ) & pfam hit by mmseqs 35 | awk 'FNR==NR {id[$1]=1; next;} !($1 in id) {print $0}' ./darkening/aln_mmseqs_pfam ./darkening/without_foldseek-pdb > ./darkening/without_foldseek-pdb_mmseqs-pfam 36 | # residues 37 | wc -l
./darkening/without_foldseek-pdb_mmseqs-pfam # 883788 38 | 39 | # find reps w/o ( * ) & pfam and tigrfam by ftp data 40 | awk 'FNR==NR {id[$1]=1; next;} !($1 in id) {print $0}' ./darkening/all_ftp_pfam_tigrfam ./darkening/without_foldseek-pdb_mmseqs-pfam > ./darkening/without_foldseek-pdb_mmseqs-pfam_ftp-pfam-tigrfram 41 | # residues 42 | wc -l ./darkening/without_foldseek-pdb_mmseqs-pfam_ftp-pfam-tigrfram # 711705 43 | 44 | ### Dark clusters analysis 45 | 46 | # pick dark clusters representatives and members 47 | awk 'FNR==NR {id[$1]=1; next;} $1 in id {print $0}' darkening/without_foldseek-pdb_mmseqs-pfam_ftp-pfam-tigrfram databases/afdb50best_foldseek_clu_nofrag_nosingletons.tsv > darkening/dark-repId_memId.tsv 48 | 49 | # dark clusters summary file 50 | awk 'FNR==NR {id[$1]=1; next;} $1 in id {print $0}' darkening/without_foldseek-pdb_mmseqs-pfam_ftp-pfam-tigrfram summary/repId_nMem_repLen_avgLen_repPlddt_avgPlddt_taxId_lineage.tsv > darkening/dark-repId_nMem_repLen_avgLen_repPlddt_avgPlddt_taxId_lineage.tsv 51 | 52 | # pick high avgPlddt clusters 53 | awk '$6 > 90 {print $0}' darkening/dark-repId_nMem_repLen_avgLen_repPlddt_avgPlddt_taxId_lineage.tsv > darkening/highAvgPlddt-dark-repId_nMem_repLen_avgLen_repPlddt_avgPlddt_taxId_lineage.tsv 54 | 55 | # pick highest pLDDT member in each cluster 56 | awk 'FNR==NR {plddt[$1]=$2; next;} !($1 in id) {id[$1]=$2; highest[$1]=plddt[$2]; next;} ($1 in id) && plddt[$2]>highest[$1] {id[$1]=$2; highest[$1]=plddt[$2];} END {for (key in id) print key"\t"id[key]"\t"highest[key]}' plddt/entryId_plddt.tsv ./darkening/dark-repId_memId.tsv > darkening/dark-repId_highestId_plddt.tsv & 57 | 58 | # pick the high plddt members from the clusters >90% avgPlddt 59 | awk 'FNR==NR && $6 > 90 {id[$1]=1; next;} $1 in id {print "wget https://alphafold.ebi.ac.uk/files/"$2}' ./darkening/dark-repId_nMem_repLen_avgLen_repPlddt_avgPlddt_taxId_lineage.tsv darkening/dark-repId_highestId_plddt.tsv | sed -E 's/cif/pdb/' > 
darkening/enzyme_analysis_pdb_download.sh 60 | 61 | # pick the highets plddt members from all clusters 62 | awk '{print "wget https://alphafold.ebi.ac.uk/files/"$2}' darkening/dark-repId_highestId_plddt.tsv | sed -E 's/cif/pdb/' > darkening/all_dark_clusters_pdb_download.sh -------------------------------------------------------------------------------- /commands_distribution: -------------------------------------------------------------------------------- 1 | ### Let's generate Fig 1 D - the distribution of dark clusters 2 | 3 | # repId_isDark_nMem 4 | awk 'FNR==NR {dark[$1]=1; next;} {d=0;} $1 in dark {d=1} {print $1"\t"d"\t"$2}' darkening/without_foldseek-pdb_mmseqs-pfam_ftp-pfam-tigrfram summary/repId_nMem_repLen_avgLen_repPlddt_avgPlddt_taxId_lineage.tsv > ./summary/repId_isDark_nMem.tsv 5 | 6 | # find AFDB removed, dark, bright numbers 7 | awk 'FNR==NR && $2==1 {dark[$1]=1; next;} FNR==NR && $2==0 {bright[$1]=1; next;} $1 in dark {nDark+=1; next;} $1 in bright {nBright+=1; next;} {nRemoved+=1; next;} END {print nRemoved"\t"nDark"\t"nBright}' summary/repId_isDark_nMem.tsv share_db/all-repId_memId_cluFlag-1234-2.tsv > summary/AFDB-removed_dark_bright.tsv 8 | 9 | # find AFDB clusters dark and bright numbers 10 | awk '$2==1 {nDark += 1; next;} $2==0 {nBright += 1; next;} END {print "0\t"nDark"\t"nBright}' summary/repId_isDark_nMem.tsv > summary/AFDB-Clusters-removed_dark-bright.tsv 11 | 12 | # Draw dark and bright clusters' distribution 13 | # execution: dark_distribution.ipynb -------------------------------------------------------------------------------- /commands_purity: -------------------------------------------------------------------------------- 1 | ### lddt, tm-score, pfam-consistency per clusters 2 | 3 | # align structurally members to representative 4 | foldseek structurealign /storage/martin/foldseek_cluster/afdb50best /storage/martin/foldseek_cluster/afdb50best ./databases/afdb50best_foldseek_clu_nofrag_nosingletons purity_analysis/database/aln_db 
-a -e INF --threads 64 5 | 6 | # convert align output to tsv 7 | foldseek convertalis /storage/martin/foldseek_cluster/afdb50best /storage/martin/foldseek_cluster/afdb50best purity_analysis/database/aln_db purity_analysis/aln-query_target_fident_alnlen_mismatch_gapopen_qstart_qend_tstart_tend_evalue_bits_lddt_alntmscore.tsv --format-output query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,lddt,alntmscore --threads 128 8 | 9 | # lddt 10 | awk '{ if($(NF-1) != "-NAN"){ sum[$1]+=$(NF-1); cnt[$1]++; } }END{for(key in sum){print key"\t"sum[key]"\t"cnt[key]"\t"sum[key]/cnt[key]}}' purity_analysis/aln-query_target_fident_alnlen_mismatch_gapopen_qstart_qend_tstart_tend_evalue_bits_lddt_alntmscore.tsv > purity_analysis/lddt-repId_sumLddt_nMem_avgLddt.tsv 11 | 12 | # tm-score 13 | awk '{ if($NF != "-NAN"){ sum[$1]+=$NF; cnt[$1]++; } }END{for(key in sum){print key"\t"sum[key]"\t"cnt[key]"\t"sum[key]/cnt[key]}}' purity_analysis/aln-query_target_fident_alnlen_mismatch_gapopen_qstart_qend_tstart_tend_evalue_bits_lddt_alntmscore.tsv > purity_analysis/tmScore-repId_sumTmScore_nMem_avgTmScore.tsv -------------------------------------------------------------------------------- /commands_sapiens: -------------------------------------------------------------------------------- 1 | ### Homo Sapiens clusters analysis 2 | 3 | # pick homo sapiens (9606) proteins 4 | mmseqs filtertaxseqdb /storage/martin/foldseek_cluster/afdb ./homo_sapiens/afdb_v3_human --taxon-list 9606 --threads 64 5 | 6 | # extract homo sapiens containing clusters 7 | awk 'FNR==1 {fn+=1;} fn==1 {id[$2]=1; next;} fn==2 && $2 in id {rep[$1]=1; next;} fn==3 && $1 in rep {print $0}' ./homo_sapiens/afdb_v3_human.index afdb50best_foldseek_clu_nofrag_nosingletons.tsv afdb50best_foldseek_clu_nofrag_nosingletons.tsv > homo_sapiens/afdb50best_foldseek_clu_nofrag_nosingletons_containing_human_index.tsv 8 | awk 'FNR==1 {fn+=1;} fn==1 {id[$1]=1; next;} fn==2 && $2 in id {rep[$1]=1; next;} fn==3 && $1 
in rep {print $0}' ./homo_sapiens/afdb_v3_human.index afdb50best_foldseek_clu_nofrag_nosingletons.tsv afdb50best_foldseek_clu_nofrag_nosingletons.tsv > homo_sapiens/afdb50best_foldseek_clu_nofrag_nosingletons_containing_human_index.tsv 9 | 10 | # create homo sapiens clusters 11 | mmseqs tsv2db homo_sapiens/afdb50best_foldseek_clu_nofrag_nosingletons_containing_human_index.tsv homo_sapiens/afdb50best_foldseek_clu_nofrag_nosingletons_containing_human 12 | 13 | # repId-memId tsv file 14 | mmseqs createtsv /storage/martin/foldseek_cluster/afdb /storage/martin/foldseek_cluster/afdb homo_sapiens/afdb50best_foldseek_clu_nofrag_nosingletons_containing_human homo_sapiens/afdb50best_foldseek_clu_nofrag_nosingletons_containing_human.tsv 15 | 16 | # LCA 17 | mmseqs createsubdb ./homo_sapiens/afdb50best_foldseek_clu_nofrag_nosingletons_containing_human.index ./lca/afdb50best_foldseek_nofrag_nosingletons_lca ./homo_sapiens/human_containing_lca 18 | 19 | # taxonomyreport 20 | mmseqs taxonomyreport /storage/martin/foldseek_cluster/afdb50best ./homo_sapiens/human_containing_lca ./homo_sapiens/human_containing_lca.report 21 | 22 | ### GO data set up (accession-to-GO pairs parsed from uniprot_trembl.dat / uniprot_sprot.dat; the _semicolon file holds one ";"-joined GO list per accession) 23 | awk '{gsub(";", "")} $1=="AC" { AC=$2} $1=="DR" && $2=="GO" {print AC"\t"$3} ' ../../cluster_analysis_old/pfam_pdb/uniprot_trembl.dat > ./go/trembl_accession_GO.tsv 24 | awk ' !($1 in go) {go[$1] = $2; next} $1 in go {go[$1]=go[$1]";"$2} END { for (key in go) print key"\t"go[key]}' ./go/trembl_accession_GO.tsv > ./go/trembl_accession_GO_semicolon.tsv 25 | awk '{gsub(";", "")} $1=="AC" { AC=$2} $1=="DR" && $2=="GO" {print AC"\t"$3} ' ../../cluster_analysis_old/pfam_pdb/uniprot_sprot.dat > ./go/sprot_accession_GO.tsv & 26 | awk 'FNR==NR {print $0; id[$1$2]=1; next} !($1$2 in id) {print $0}' go/sprot_accession_GO.tsv go/trembl_accession_GO.tsv > go/union_accession_GO.tsv 27 | 28 | # find homo sapiens AFDB id 29 | awk 'FNR==NR {id[$1]=1; next;} $1 in id {print $2}' homo_sapiens/afdb_v3_human.index homo_sapiens/afdb_v3_human.lookup >
homo_sapiens/afdb_v3_human.ids 30 | 31 | # map GO to human proteins 32 | awk 'FNR==NR {id[$1]=1; next;} "AF-"$1"-F1-model_v3.cif" in id {print "AF-"$1"-F1-model_v3.cif\t"$2}' homo_sapiens/afdb_v3_human.ids go/union_accession_GO.tsv > homo_sapiens/human-sapId_GO.tsv 33 | 34 | ### pick the higher plddt sapiens protein in each cluster 35 | # map plddt to homo sapiens AFDB proteins 36 | awk 'FNR==NR {id[$1]=1; next;} $1 in id {print }' homo_sapiens/afdb_v3_human.ids plddt/entryId_plddt.tsv > homo_sapiens/human-sapId_plddt.tsv 37 | 38 | # pick the highest plddt sapiens protein 39 | awk 'FNR==NR {plddt[$1]=$2; next;} !($1 in id) {id[$1]=$2; highest[$1]=plddt[$2]; next;} ($1 in id) && plddt[$2]>highest[$1] {id[$1]=$2; highest[$1]=plddt[$2];} END {for (key in id) print key"\t"id[key]"\t"highest[key]}' homo_sapiens/human-sapId_plddt.tsv homo_sapiens/afdb50best_foldseek_clu_nofrag_nosingletons_containing_human.tsv > homo_sapiens/human-repId_highestSapId_plddt.tsv & 40 | 41 | # map plddt and GO to highest Plddt sapiens protein 42 | awk 'FNR==NR {id[$2]=$0; next;} $1 in id {print id[$1]"\t"$2}' homo_sapiens/human-repId_highestSapId_plddt.tsv homo_sapiens/human-sapId_GO.tsv > homo_sapiens/human-repId_highestSapId_sapPlddt_GO.tsv 43 | 44 | # map lca rank info 45 | awk -F "\t" 'FNR==NR {rep[$1]=$2"\t"$4; next;} $1 in rep {print $1"\t"rep[$1]"\t"$2"\t"$3"\t"$4}' lca/afdb50best_foldseek_clu_nofragments_nosingleton_lca.tsv homo_sapiens/human-repId_highestSapId_sapPlddt_GO.tsv > homo_sapiens/clu-sap-repId_lcaTaxId_lcaRankName_sapId_sapPlddt_sapGO.tsv 46 | 47 | # annotate taxonomy to spaiens cluster members 48 | awk -F "\t" 'FNR==NR {mem[$2]=$1; next} $2 in mem {print mem[$2]"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6}' homo_sapiens/afdb50best_foldseek_clu_nofrag_nosingletons_containing_human.tsv ./taxonomy/afdb50best_clu_lineage.tsv > homo_sapiens/repId_memId_taxId_rank_rankName_lieage.tsv 49 | 50 | # find out immune related cluster with python (below) 51 | # by the file - 
find_immune_related_human_cluster.ipynb 52 | # the ipynb file returns the output file - homo_sapiens/human-immune-go-repId_sapId_sapGO_lca_sapGOFunc.tsv 53 | 54 | # find human immunity clusters that have Bacteria or Archaea in them 55 | grep cellular homo_sapiens/human-immune-go-repId_sapId_sapGO_lca_sapGOFunc.tsv > homo_sapiens/across-kingdom-human-immune-go-repId_sapId_sapGO_lca_sapGOFunc.tsv 56 | 57 | # find the examples in Fig 3 58 | # A 59 | 60 | # find the D2N2J3 cluster from nucleus (GO:0005634) annotated clusters 61 | grep GO:0005634 homo_sapiens/clu-sap-repId_lcaTaxId_lcaRankName_sapId_sapPlddt_sapGO.tsv 62 | # output: 63 | # ... 64 | # AF-D2N2J3-F1-model_v3.cif 131567 cellular organisms AF-A0A2R8Y619-F1-model_v3.cif 84.45 GO:0005634 65 | # ... 66 | 67 | # find the Bacteria protein (A0A1G5ASE0) in the D2N2J3 cluster 68 | grep AF-D2N2J3 homo_sapiens/repId_memId_taxId_rank_rankName_lieage.tsv | grep Bacteria 69 | # output: 70 | # ... 71 | # AF-D2N2J3-F1-model_v3.cif AF-A0A1G5ASE0-F1-model_v3.cif 582692 species Paenibacillus polysaccharolyticus -_cellular organisms;d_Bacteria;-_Terrabacteria group;p_Firmicutes;c_Bacilli;o_Bacillales;f_Paenibacillaceae;g_Paenibacillus;s_Paenibacillus polysaccharolyticus 72 | # ... 73 | 74 | # B 75 | 76 | # find the B4DKH6 cluster from the immune related GO annotated clusters 77 | grep B4DKH6 homo_sapiens/across-kingdom-human-immune-go-repId_sapId_sapGO_lca_sapGOFunc.tsv 78 | # output: 79 | # ... 80 | # AF-A0A401S3L8-F1-model_v3.cif AF-B4DKH6-F1-model_v3.cif GO:0006955 cellular organisms immune response 81 | # ... 82 | 83 | # find the Bacteria protein (A0A2D5ZNG0) in the A0A401S3L8 cluster 84 | grep A0A401S3L8 homo_sapiens/repId_memId_taxId_rank_rankName_lieage.tsv | grep Bacteria 85 | # output: 86 | # ...
87 | # AF-A0A401S3L8-F1-model_v3.cif AF-A0A2D5ZNG0-F1-model_v3.cif 2026742 species Gemmatimonadetes bacterium -_cellular organisms;d_Bacteria;-_FCB group;p_Gemmatimonadetes;-_unclassified Gemmatimonadetes;s_Gemmatimonadetes bacterium 88 | # ... 89 | 90 | # C 91 | 92 | # find the O14862 cluster from the immune related GO annotated clusters 93 | grep O14862 homo_sapiens/across-kingdom-human-immune-go-repId_sapId_sapGO_lca_sapGOFunc.tsv 94 | # output: 95 | # ... 96 | # AF-A0A286S9Y4-F1-model_v3.cif AF-O14862-F1-model_v3.cif GO:0002218 cellular organisms activation of innate immune response 97 | # ... 98 | 99 | # find the Bacteria protein (A0A1C5UEQ5) in the A0A286S9Y4 cluster 100 | grep A0A286S9Y4 homo_sapiens/repId_memId_taxId_rank_rankName_lieage.tsv | grep Bacteria 101 | # output: 102 | # ... 103 | # AF-A0A286S9Y4-F1-model_v3.cif AF-A0A1C5UEQ5-F1-model_v3.cif 59620 species uncultured Clostridium sp. -_cellular organisms;d_Bacteria;-_Terrabacteria group;p_Firmicutes;c_Clostridia;o_Eubacteriales;f_Clostridiaceae;g_Clostridium;-_environmental samples;s_uncultured Clostridium sp. 104 | # ... 
105 | 106 | -------------------------------------------------------------------------------- /commands_share_db: -------------------------------------------------------------------------------- 1 | ### Generate DB for sharing 2 | 3 | # Foldseek cluster 4 | mmseqs createtsv /storage/martin/foldseek_cluster/afdb /storage/martin/foldseek_cluster/afdb /storage/martin/foldseek_cluster/afdb50best_foldseek_clu afdb50best_foldseek_clu.tsv 5 | 6 | # Flag 2 - AFDB clusters 7 | awk '{print $1"\t"$2"\t2"}' databases/afdb50best_foldseek_clu_nofrag_nosingletons.tsv > share_db/all-repId_memId_cluFlag-2.tsv 8 | 9 | # Flag 4 - add singletons 10 | awk 'FNR==NR {id[$2]=1; print $0; next;} !($2 in id) {print $1"\t"$2"\t4"}' share_db/all-repId_memId_cluFlag-2.tsv databases/afdb50best_foldseek_clu_nofragments.tsv > share_db/all-repId_memId_cluFlag-24.tsv 11 | 12 | # Find fragments repId-memId (repId should be modified) 13 | awk 'FNR==NR {rep[$2]=$1; next;} !($2 in rep) {print $0}' databases/afdb50best_foldseek_clu_nofragments.tsv afdb50best_foldseek_clu.tsv > ./share_db/fragments-removed_foldseekRepId_memId-2 & 14 | 15 | # find other protein if there is any other member in the AFDB clusters 16 | awk 'FNR==NR {mem2rep[$2]=$1; rep2other[$1]="N/A"; next;} $1 in rep2other && !($2 in mem2rep) {rep2other[$1]=$2;} END {for (key in mem2rep) print mem2rep[key]"\t"key"\t"rep2other[mem2rep[key]]}' ./share_db/fragments-removed_foldseekRepId_memId-2 afdb50best_foldseek_clu.tsv > ./share_db/fragments-other-member_repId_memId_otherId-2 & 17 | 18 | # get the altered rep iD. If there is other Id, get the repId from the other Id. If not, the fragment's repId remains the same. 
19 | awk 'FNR==NR {mem2rep[$2]=$1; next;} $3=="N/A" {print $1"\t"$2"\t"$3"\t"$1; next;} $3!="N/A" {print $1"\t"$2"\t"$3"\t"mem2rep[$3]} ' ./databases/afdb50best_foldseek_clu_nofragments.tsv ./share_db/fragments-other-member_repId_memId_otherId-2 > ./share_db/fragments-other-member_repId_memId_otherId_alteredRepId-2 & 20 | 21 | # Flag 3 - add fragments 22 | awk 'FNR==NR {id[$2]=1; print $0; next;} !($2 in id) {print $4"\t"$2"\t3"}' share_db/all-repId_memId_cluFlag-24.tsv ./share_db/fragments-other-member_repId_memId_otherId_alteredRepId-2 > share_db/all-repId_memId_cluFlag-234-2.tsv & 23 | 24 | # AFDB50 25 | mmseqs createtsv /storage/martin/foldseek_cluster/afdb /storage/martin/foldseek_cluster/afdb /storage/martin/foldseek_cluster/afdb50best_clu ./afdb50best_clu_repId_memId.tsv 26 | 27 | # Flag 1 - add AFDB50 28 | awk 'FNR==NR {mem2rep[$2]=$1; print $0; next;} !($2 in mem2rep) {print mem2rep[$1]"\t"$2"\t1"}' share_db/all-repId_memId_cluFlag-234-2.tsv afdb50best_clu_repId_memId.tsv | sort > share_db/all-repId_memId_cluFlag-1234-2.tsv & 29 | 30 | # create entry taxonomy 31 | foldseek addtaxonomy /storage/martin/foldseek_cluster/afdb /storage/martin/foldseek_cluster/afdb50best_clu ./taxonomy/afdb50best_clu_lineage --tax-lineage 1 && foldseek createtsv /storage/martin/foldseek_cluster/afdb /storage/martin/foldseek_cluster/afdb ./taxonomy/afdb50best_clu_lineage ./taxonomy/afdb50best_clu_lineage.tsv 32 | 33 | # a file for the website 34 | awk 'FNR==NR {mem2tax[$2]=$3; next;} {print $1"\t"$2"\t"$3"\t"mem2tax[$2]}' ./taxonomy/afdb50best_clu_lineage.tsv ./share_db/all-repId_memId_cluFlag-1234-2.tsv > ./share_db/website-all-repId_memId_cluFlag_taxId.tsv & 35 | 36 | # Cluster file with non-AFDB-clusters rep Ids (for web) 37 | awk 'FNR==NR {print $1"\t1\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8; id[$1]=1; next;} !($1 in id) {id[$1]=1; print $1"\t0\t0\t0\t0\t0\t0\t0\tN/A"}' summary/repId_nMem_repLen_avgLen_repPlddt_avgPlddt_taxId_lineage.tsv 
share_db/all-repId_memId_cluFlag-1234-2.tsv > share_db/all-clusters-repId_isAFDBCluster_nMem_repLen_avgLen_repPlddt_avgPlddt_taxId_lineage.tsv 38 | 39 | ### Supplements 40 | awk -F "\t" 'FNR==NR {dark[$1]=1; next;} ($1 in dark) {print $1"\t1\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7; next;} !($1 in darK) {print $1"\t0\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7}' ./darkening/without_foldseek-pdb_mmseqs-pfam_ftp-pfam-tigrfram summary/repId_nMem_repLen_avgLen_repPlddt_avgPlddt_taxId_lineage.tsv | sed -E 's/-F1-model_v3.cif//' | sed -E 's/AF-//' > share_db/2-repId_isDark_nMem_repLen_avgLen_repPlddt_avgPlddt_LCAtaxId.tsv -------------------------------------------------------------------------------- /dark_distribution/dark_distribution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# read file\n", 10 | "fn = \"repId_isDark_nMem.tsv\"\n", 11 | "f = open(f'../summary/{fn}', 'r')\n", 12 | "\n", 13 | "bright_nMem_hit = {}\n", 14 | "dark_nMem_hit = {}\n", 15 | "\n", 16 | "while True:\n", 17 | " line = f.readline().strip()\n", 18 | "\n", 19 | " if not line:\n", 20 | " break\n", 21 | "\n", 22 | " [repId, isDark, nMem] = line.split('\\t')\n", 23 | " isDark = int(isDark)\n", 24 | " nMem = int(nMem)\n", 25 | "\n", 26 | " if isDark == 1:\n", 27 | " if dark_nMem_hit.get(nMem):\n", 28 | " dark_nMem_hit[nMem] += 1\n", 29 | " else:\n", 30 | " dark_nMem_hit[nMem] = 1\n", 31 | " else:\n", 32 | " if bright_nMem_hit.get(nMem):\n", 33 | " bright_nMem_hit[nMem] += 1\n", 34 | " else:\n", 35 | " bright_nMem_hit[nMem] = 1\n", 36 | "\n", 37 | "f.close()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 23, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# dictionary to array\n", 47 | "max_nMem = max(list(bright_nMem_hit.keys()) + list(dark_nMem_hit.keys())) \n", 48 | "bright_hist_nMem = [ 
bright_nMem_hit[i] if bright_nMem_hit.get(i) else 0 for i in range(max_nMem+1)]\n", 49 | "dark_hist_nMem = [ dark_nMem_hit[i] if dark_nMem_hit.get(i) else 0 for i in range(max_nMem+1)]" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 59, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# arrange histogram - set variables\n", 59 | "delimiters = [4, 10, 20, 40, 80, 160, 320, 700, 1800, 38901]\n", 60 | "delimiters_starting_point = [2] + delimiters\n", 61 | "Xs = [f\"{delimiters_starting_point[i-1]}~{delimiters_starting_point[i]}\" for i in range(1, len(delimiters_starting_point)) ]\n", 62 | "num_bins = len(delimiters)\n", 63 | "\n", 64 | "num_member_AFDB_Clusters = 0\n", 65 | "\n", 66 | "dark_bins = [0] * num_bins\n", 67 | "bright_bins = [0] * num_bins\n", 68 | "occupations_per_bins = [0] * num_bins\n", 69 | "\n", 70 | "dark_rate_bins = [0] * num_bins\n", 71 | "bright_rate_bins = [0] * num_bins" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 60, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# histogram into 10 bins\n", 81 | "bin_i = 0\n", 82 | "nMem_i = 2\n", 83 | "\n", 84 | "for bin_i in range(len(delimiters)):\n", 85 | " bin_nMem_upper_bound = delimiters[bin_i]\n", 86 | "\n", 87 | " # traverse hist array\n", 88 | " while True:\n", 89 | " if nMem_i > max_nMem:\n", 90 | " break\n", 91 | " \n", 92 | " dark_bins[bin_i] += dark_hist_nMem[nMem_i]\n", 93 | " bright_bins[bin_i] += bright_hist_nMem[nMem_i]\n", 94 | "\n", 95 | " occupations_per_bins[bin_i] += ( dark_hist_nMem[nMem_i] + bright_hist_nMem[nMem_i] ) * nMem_i\n", 96 | " num_member_AFDB_Clusters += ( dark_hist_nMem[nMem_i] + bright_hist_nMem[nMem_i] ) * nMem_i\n", 97 | "\n", 98 | " nMem_i += 1\n", 99 | "\n", 100 | " if nMem_i > bin_nMem_upper_bound:\n", 101 | " break" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 61, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | 
"output_type": "stream", 112 | "text": [ 113 | "12.24%, 10.59%, 9.20%, 10.07%, 10.46%, 10.05%, 9.04%, 9.20%, 9.19%, 9.96%, " 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "# find occupations\n", 119 | "for i in range(num_bins):\n", 120 | " occupations_per_bins[i] = occupations_per_bins[i] / num_member_AFDB_Clusters * 100\n", 121 | "\n", 122 | "for e in occupations_per_bins:\n", 123 | " print(\"{:.2f}%\".format(e), end=\", \")" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 63, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "# generate ratio per bin\n", 133 | "for i in range(num_bins):\n", 134 | " denominator = bright_bins[i] + dark_bins[i]\n", 135 | "\n", 136 | " bright_rate_bins[i] = bright_bins[i] / denominator * 100\n", 137 | " dark_rate_bins[i] = dark_bins[i] / denominator * 100" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 65, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "image/png": "", 148 | "text/plain": [ 149 | "
" 150 | ] 151 | }, 152 | "metadata": {}, 153 | "output_type": "display_data" 154 | } 155 | ], 156 | "source": [ 157 | "import matplotlib.pyplot as plt\n", 158 | "\n", 159 | "colors = ['#d4453d', '#67c8fb', '#67c8fb']\n", 160 | "labels = ['w/o annotation',\n", 161 | " 'with annotation',\n", 162 | " 'sequences without annotation in (1)',]\n", 163 | "\n", 164 | "plt.bar(Xs, bright_rate_bins, bottom=dark_rate_bins, color=colors[2], label=labels[1])\n", 165 | "plt.bar(Xs, dark_rate_bins, color=colors[0], label=labels[0])\n", 166 | "plt.xticks(Xs, rotation=-35, ha='left')\n", 167 | "plt.xlabel(\"cluster size range\")\n", 168 | "plt.ylabel(\"occupation in each bin (%)\")\n", 169 | "plt.legend()\n", 170 | "\n", 171 | "plt.tight_layout()\n", 172 | "plt.savefig('darks.svg')" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 5, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "import matplotlib.pyplot as plt\n", 182 | "import numpy as np\n", 183 | "\n", 184 | "# set variables\n", 185 | "label_bars = ['removed (fragments, singletons)', 'w/o annotation', 'with annotation']\n", 186 | "colors = ['#b6b6b6', '#d4453d', '#67c8fb']\n", 187 | "Xs = ['AFDB\\n214M', 'AFDB clusters\\n2.27M']\n", 188 | "Ys_top = np.arange(len(Xs))\n", 189 | "\n", 190 | "afdb_r_d_b = [0, 0, 0] # removed, dark, bright\n", 191 | "afdb_clusters_r_d_b = [0, 0, 0]" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 6, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "fn_afdb = 'AFDB-removed_dark_bright.tsv'\n", 201 | "f_afdb = open(f\"../summary/{fn_afdb}\")\n", 202 | "\n", 203 | "afdb_r_d_b = list(map(int, f_afdb.readline().strip().split()))\n", 204 | "\n", 205 | "f_afdb.close()\n", 206 | "\n", 207 | "fn_afdb_clusters = 'AFDB-Clusters-removed_dark-bright.tsv'\n", 208 | "f_afdb_clusters = open(f'../summary/{fn_afdb_clusters}')\n", 209 | "\n", 210 | "afdb_clusters_r_d_b = list(map(int, 
f_afdb_clusters.readline().strip().split()))\n", 211 | "\n", 212 | "f_afdb_clusters.close()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 7, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# generate ratio\n", 222 | "layers = np.array([afdb_r_d_b, afdb_clusters_r_d_b]).T\n", 223 | "denominators = np.sum(layers, axis=0)\n", 224 | "layers = layers / denominators * 100" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 11, 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "image/png": "", 235 | "text/plain": [ 236 | "
" 237 | ] 238 | }, 239 | "metadata": {}, 240 | "output_type": "display_data" 241 | } 242 | ], 243 | "source": [ 244 | "\n", 245 | "fig = plt.figure()\n", 246 | "gs = fig.add_gridspec(1, 3)\n", 247 | "ax1 = fig.add_subplot(gs[0, 2])\n", 248 | "ax2 = fig.add_subplot(gs[0, 0:1])\n", 249 | "\n", 250 | "w = 0.5\n", 251 | "bottoms = np.zeros(len(Ys_top))\n", 252 | "[I, J] = layers.shape\n", 253 | "\n", 254 | "for i in range(len(layers)):\n", 255 | " ax2.bar(Ys_top, layers[i], w, bottom=bottoms, color=colors[i], label=label_bars[i])\n", 256 | " bottoms = bottoms + layers[i]\n", 257 | "\n", 258 | " for j in range(J-1):\n", 259 | " ax2.plot([Ys_top[j] + w/2, Ys_top[j+1] - w/2],\n", 260 | " [bottoms[j]-0.3, bottoms[j+1]-0.3],\\\n", 261 | " color='C7', ls='--', zorder=1)\n", 262 | "\n", 263 | "ax2.set_xticks(Ys_top, Xs, rotation=-35, ha='left')\n", 264 | "ax2.set_ylabel('occupation in each bin (%)')\n", 265 | "ax2.set_ylim([0, 105])\n", 266 | "# ax2.legend()\n", 267 | "\n", 268 | "plt.legend()\n", 269 | "plt.tight_layout()\n", 270 | "plt.savefig('two_bars2.svg')\n", 271 | "# plt.show()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [] 280 | } 281 | ], 282 | "metadata": { 283 | "kernelspec": { 284 | "display_name": "tutorial", 285 | "language": "python", 286 | "name": "python3" 287 | }, 288 | "language_info": { 289 | "codemirror_mode": { 290 | "name": "ipython", 291 | "version": 3 292 | }, 293 | "file_extension": ".py", 294 | "mimetype": "text/x-python", 295 | "name": "python", 296 | "nbconvert_exporter": "python", 297 | "pygments_lexer": "ipython3", 298 | "version": "3.10.6" 299 | }, 300 | "orig_nbformat": 4 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 2 304 | } 305 | -------------------------------------------------------------------------------- /purity/pfam_consistency.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# open file\n", 10 | "fn = \"pfamOrClan-uniprot-numHits_repId_pfamOrClan.tsv\"\n", 11 | "f = open(f\"../purity/{fn}\")\n", 12 | "\n", 13 | "# data\n", 14 | "rep_pfams = {}\n", 15 | "\n", 16 | "# read file\n", 17 | "while True:\n", 18 | " line = f.readline().strip()\n", 19 | " \n", 20 | " if not line:\n", 21 | " break\n", 22 | " \n", 23 | " tokens = line.split()\n", 24 | " hit = int(tokens[0])\n", 25 | " repId = tokens[1]\n", 26 | " pfams = set(tokens[2].split(';')[:-1])\n", 27 | " \n", 28 | " if not rep_pfams.get(repId):\n", 29 | " rep_pfams[repId] = []\n", 30 | " \n", 31 | " rep_pfams[repId].append([pfams, hit])\n", 32 | "\n", 33 | "# close file\n", 34 | "f.close()" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 4, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# count the diversity of pfam\n", 44 | "def compute_sum_hits(pfam_hits):\n", 45 | " L = len(pfam_hits)\n", 46 | " \n", 47 | " sum_hits = 0\n", 48 | " for i in range(L):\n", 49 | " [pfam, hits] = pfam_hits[i]\n", 50 | " \n", 51 | " sum_hits += hits\n", 52 | " \n", 53 | " return sum_hits" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 7, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "import math\n", 63 | "\n", 64 | "rep_cov = {}\n", 65 | "iteration = math.inf\n", 66 | "\n", 67 | "check_id = 'AF-A0A009G5I8-F1-model_v3.cif'\n", 68 | "\n", 69 | "for repId, pfam_hits in rep_pfams.items():\n", 70 | " \n", 71 | " # if there is only one hit at a cluster, we don't measure the consistency value\n", 72 | " if compute_sum_hits(pfam_hits) < 2:\n", 73 | " continue\n", 74 | " \n", 75 | " rep_cov[repId] = 0\n", 76 | " N = len(pfam_hits) \n", 77 | " \n", 78 | " repId_hits = 0\n", 79 | " \n", 80 | " for i in range(N):\n", 81 | " pairwise_score = 0\n", 82 | " \n", 83 | " query_pfams = pfam_hits[i][0]\n", 84 | " 
query_hits = pfam_hits[i][1]\n", 85 | " repId_hits += query_hits\n", 86 | " \n", 87 | " query_N_pfams = len(query_pfams)\n", 88 | " \n", 89 | " for j in range(N):\n", 90 | " coverage = 0\n", 91 | " target_pfams = pfam_hits[j][0]\n", 92 | " target_hits = pfam_hits[j][1]\n", 93 | " \n", 94 | " # w/o self-pair\n", 95 | " if i == j :\n", 96 | " target_hits -= 1\n", 97 | " \n", 98 | " for pfam in query_pfams:\n", 99 | " if pfam in target_pfams:\n", 100 | " coverage += 1\n", 101 | " \n", 102 | " coverage = coverage/ query_N_pfams * target_hits\n", 103 | " pairwise_score += coverage\n", 104 | " \n", 105 | " pairwise_score *= query_hits\n", 106 | " rep_cov[repId] += pairwise_score\n", 107 | " \n", 108 | " rep_cov[repId] /= (repId_hits**2 - repId_hits)\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 8, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "fon = \"pfam-consistency_repId_cov.tsv\"\n", 118 | "fo = open(f'../purity/{fon}', 'w')\n", 119 | "\n", 120 | "for key, value in rep_cov.items():\n", 121 | " fo.write(f'{key}\\t{value}\\n')\n", 122 | "\n", 123 | "fo.close()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [] 132 | } 133 | ], 134 | "metadata": { 135 | "kernelspec": { 136 | "display_name": "tutorial", 137 | "language": "python", 138 | "name": "python3" 139 | }, 140 | "language_info": { 141 | "codemirror_mode": { 142 | "name": "ipython", 143 | "version": 3 144 | }, 145 | "file_extension": ".py", 146 | "mimetype": "text/x-python", 147 | "name": "python", 148 | "nbconvert_exporter": "python", 149 | "pygments_lexer": "ipython3", 150 | "version": "3.10.6" 151 | }, 152 | "orig_nbformat": 4 153 | }, 154 | "nbformat": 4, 155 | "nbformat_minor": 2 156 | } 157 | -------------------------------------------------------------------------------- /purity/purity_figure.ipynb: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/steineggerlab/afdb-clusters-analysis/4c38d80184fbb967b5fc9fe75d7a765a1c6cf98e/purity/purity_figure.ipynb -------------------------------------------------------------------------------- /purity/subsitute_clan.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# open the pfam-to-clan table\n", 10 | "fcn = \"pfam-clan.tsv\"\n", 11 | "fc = open(f'../purity/{fcn}')\n", 12 | "\n", 13 | "# data: first token of each line (pfam) -> second token (clan)\n", 14 | "pfam_clan = {}\n", 15 | "\n", 16 | "# read file (stops at the first empty line or at EOF)\n", 17 | "while True:\n", 18 | " line = fc.readline().strip()\n", 19 | " \n", 20 | " if not line:\n", 21 | " break\n", 22 | " \n", 23 | " tokens = line.split()\n", 24 | " \n", 25 | " pfam = tokens[0]\n", 26 | " clan = tokens[1]\n", 27 | " \n", 28 | " pfam_clan[pfam] = clan\n", 29 | "\n", 30 | "# close file\n", 31 | "fc.close()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 6, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# open the per-cluster hit file (tokens: numHit, repId, ;-terminated id list)\n", 41 | "fn = \"pfam-uniprot-numHits_repId_pfams.tsv\"\n", 42 | "f = open(f'../darkening/ftp_pfam_tigrfam/{fn}')\n", 43 | "\n", 44 | "# data: rewritten output lines, written to disk in the last cell\n", 45 | "lines = []\n", 46 | "\n", 47 | "# read file & clan substitution: an id found in pfam_clan is replaced by its clan, any other id is kept as is\n", 48 | "while True:\n", 49 | " line = f.readline().strip()\n", 50 | " \n", 51 | " if not line:\n", 52 | " break\n", 53 | " \n", 54 | " tokens = line.split()\n", 55 | " \n", 56 | " numHit = tokens[0]\n", 57 | " repId = tokens[1]\n", 58 | " pfams = tokens[2].split(';')[:-1]\n", 59 | " \n", 60 | " for i in range(len(pfams)):\n", 61 | " pfam = pfams[i]\n", 62 | " \n", 63 | " if pfam_clan.get(pfam):\n", 64 | " pfams[i] = pfam_clan[pfam]\n", 65 | " \n", 66 | " pfam_string = ';'.join(pfams)\n", 67 | " \n", 68 | " lines.append(f'{numHit}\\t{repId}\\t{pfam_string};')\n", 69 | 
"\n", 70 | "# close file\n", 71 | "f.close()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 7, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "fon = \"pfamOrClan-uniprot-numHits_repId_pfamOrClan.tsv\"\n", 81 | "fo = open(f'../purity/{fon}', 'w')\n", 82 | "\n", 83 | "for line in lines:\n", 84 | " fo.write(f'{line}\\n')\n", 85 | "\n", 86 | "fo.close()" 87 | ] 88 | } 89 | ], 90 | "metadata": { 91 | "kernelspec": { 92 | "display_name": "Python 3", 93 | "language": "python", 94 | "name": "python3" 95 | }, 96 | "language_info": { 97 | "codemirror_mode": { 98 | "name": "ipython", 99 | "version": 3 100 | }, 101 | "file_extension": ".py", 102 | "mimetype": "text/x-python", 103 | "name": "python", 104 | "nbconvert_exporter": "python", 105 | "pygments_lexer": "ipython3", 106 | "version": "3.10.6" 107 | }, 108 | "orig_nbformat": 4 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 2 112 | } 113 | --------------------------------------------------------------------------------