├── Gui_pipeline.png ├── git_hub_modiDeC.png ├── data_curation_tutorial ├── Analysis_GUI.png ├── analysis_plot1.png ├── analysis_plot2.png ├── Training_figure_2.png ├── Figura_data_generation.png ├── data_creation_example.png ├── tutorial_training.md ├── tutorial_ModiDeC_analysis.md └── tutorial_data_creation.md ├── CITATION.cff ├── LICENSE ├── Coverage_check.py ├── Load_data_for_training_V2.py ├── README.md ├── ModiDec_NN.py ├── remora_TF2_env.yml ├── Analyze_data_NN.py ├── Analyze_data_NN_V2.py ├── Resquigle_remora_GUI.py ├── Training_NN_GUI.py ├── Analysis_platform_GUI.py └── Remora_resquigle_generate_data.py /Gui_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mem3nto0/ModiDeC-RNA-modification-classifier/HEAD/Gui_pipeline.png -------------------------------------------------------------------------------- /git_hub_modiDeC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mem3nto0/ModiDeC-RNA-modification-classifier/HEAD/git_hub_modiDeC.png -------------------------------------------------------------------------------- /data_curation_tutorial/Analysis_GUI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mem3nto0/ModiDeC-RNA-modification-classifier/HEAD/data_curation_tutorial/Analysis_GUI.png -------------------------------------------------------------------------------- /data_curation_tutorial/analysis_plot1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mem3nto0/ModiDeC-RNA-modification-classifier/HEAD/data_curation_tutorial/analysis_plot1.png -------------------------------------------------------------------------------- /data_curation_tutorial/analysis_plot2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mem3nto0/ModiDeC-RNA-modification-classifier/HEAD/data_curation_tutorial/analysis_plot2.png -------------------------------------------------------------------------------- /data_curation_tutorial/Training_figure_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mem3nto0/ModiDeC-RNA-modification-classifier/HEAD/data_curation_tutorial/Training_figure_2.png -------------------------------------------------------------------------------- /data_curation_tutorial/Figura_data_generation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mem3nto0/ModiDeC-RNA-modification-classifier/HEAD/data_curation_tutorial/Figura_data_generation.png -------------------------------------------------------------------------------- /data_curation_tutorial/data_creation_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mem3nto0/ModiDeC-RNA-modification-classifier/HEAD/data_curation_tutorial/data_creation_example.png -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.1.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: Alagna 5 | given-names: Nicolo 6 | orcid: https://orcid.org/0009-0006-0804-5774 7 | title:mem3nto0/ModiDeC-RNA-modification-classifier: ModiDeC_RNA_modification_classifier 8 | version: ModiDeC V1 9 | date-released: 2025-06-23 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Nicolò Alagna 2024 2 | 3 | Permission is granted to use, copy, and modify this software and its documentation for academic and non-commercial research purposes only, provided that the following conditions are met: 4 | 5 | 1. The software is used for scientific or educational purposes only. 6 | 2. Proper citation of the original author and source must be included in any publications or presentations using this software. 7 | 3. Commercial use, redistribution, or modification is prohibited without prior written permission from the author. 8 | 4. For any derivative works or reuse in future research projects, users must contact the original author for permission. 9 | 10 | This software is provided "as is", without warranty of any kind, express or implied. 11 | -------------------------------------------------------------------------------- /Coverage_check.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # /// Function to penalize low covarage sites while calculating frequency /// 4 | 5 | def Coverage_analysis(N_max, Data, Track_coverage, threshold = 0.2): 6 | 7 | Filtered_covarage = Track_coverage[ Track_coverage > 1] 8 | median_covarage = np.median(Filtered_covarage) 9 | MAD_covarage = np.median(np.absolute(Filtered_covarage - median_covarage)) 10 | 11 | if median_covarage > 2*N_max*threshold: 12 | 13 | lower_bound = median_covarage - MAD_covarage 14 | mask = np.where((Track_coverage >= lower_bound), 1, 0) 15 | 16 | Division_factor = N_max*(1 - (1 - Track_coverage/N_max)*mask) 17 | Final_results = Data / Division_factor[:,np.newaxis] 18 | 19 | else: 20 | 21 | if median_covarage < N_max*threshold: 22 | 23 | Final_results = (Data/N_max) 24 | 25 | else: 26 | 27 | lower_bound = median_covarage 28 | mask = np.where((Track_coverage >= lower_bound), 1, 0) 29 | 30 | Division_factor = N_max*(1 - (1 - Track_coverage/N_max)*mask) 31 | Final_results = Data / Division_factor[:,np.newaxis] 32 | 33 | 34 | return Final_results -------------------------------------------------------------------------------- /data_curation_tutorial/tutorial_training.md: -------------------------------------------------------------------------------- 1 | # Tutorial Part 2: Training of the neural network 2 | 3 | ![GUI for retraining ModiDeC](https://github.com/mem3nto0/ModiDeC-RNA-modification-classifier/blob/main/data_curation_tutorial/Training_figure_2.png) 4 | 5 | 6 | ## Training Data Input 7 | For training the neural network with your own data, the directories containing the data need to be specified. data has to be created using the "Resguiggle_remora_Gui.py" to use the training user interface. 8 | 1) select the folder where the training data were created by pressing the button "training data folder". 9 | 2) (optional) if validation data are created as well using the "Resguiggle_remora_Gui.py", select the validation folder using the "validation data folder" button. if you don't have validation data, remember to set the "validation during training" variable with "no". 10 | 3) Select the folder where the retrained ModiDeC model will be saved by pressing the button "save model folder". This will specify the path where the model will be stored. 11 | 12 | ## General Variables 13 | Additional specifications are needed for training the neural network: 14 | 1) The batch size (1) specifies the number of samples, which are propagated through the network during training. Batch sizes like 128 or 256 are recommended depending on the memory available for training. 15 | 2) The k-mer model (2) can be adjusted to the type of data used for training. Both data sequenced from RNA002 and RNA004 flowcells from Oxford Nanopore Technologies can be used for training the model: for RNA002, insert the number 5 (5-mer). For RNA004, insert the number 9 (9-mer). 16 | 3) Insert the number of epochs for the training (3). We recommend setting the number of epochs to three. 17 | 4) The variable name NN specifies the name of the saved model (4). 18 | 5) The user can also specify if a validation of the retrained model is needed, by typing yes or no (5). 19 | 20 | After the settings are settled, press the button "Start training" to retrain ModiDeC. When the training is finished, a folder with the name of the neural network will be created in the "save folder" that contains the trained neural network. 21 | -------------------------------------------------------------------------------- /Load_data_for_training_V2.py: -------------------------------------------------------------------------------- 1 | from tensorflow import keras 2 | import numpy as np 3 | import os 4 | 5 | class Load_data_RNA(keras.utils.Sequence): 6 | 7 | """generate data in sequence mode for training the neural network""" 8 | 9 | def __init__(self, batch_size, N_batches, path, files_list, chunck_size, labels, batch_loading, max_seq_len): 10 | 11 | self.batch_size = batch_size 12 | self.N_batches = N_batches 13 | self.batch_loading = batch_loading 14 | 15 | self.X_train = np.zeros([self.batch_size,chunck_size,1]) 16 | self.X_train2 = np.zeros([self.batch_size,max_seq_len,4,1]) 17 | self.labels = np.zeros([self.batch_size,max_seq_len,labels]) 18 | 19 | # The set of characters accepted in the transcription. 20 | self.path = path 21 | self.files_list = files_list 22 | self.ind_rand = np.arange(0,self.N_batches,1) 23 | np.random.shuffle(self.ind_rand) 24 | np.random.shuffle(self.files_list) 25 | 26 | def __len__(self): 27 | ''' 28 | Denotes the number of batches per epoch 29 | ''' 30 | return int(self.N_batches) 31 | 32 | def __getitem__(self, index): 33 | 34 | selected_ind = self.ind_rand[index] 35 | 36 | const = self.batch_loading 37 | 38 | for i in range(int(self.batch_size/const)): 39 | 40 | try: 41 | with np.load(self.path + "/" + self.files_list[int(self.batch_size/const)*selected_ind + i]) as data: 42 | 43 | 44 | new_x_train = data["train_input"] 45 | new_x_train2 = data["train_input2"] 46 | y_train = data["train_output"] 47 | 48 | except: 49 | 50 | with np.load(self.path + "/" + self.files_list[0]) as data: 51 | 52 | new_x_train = data["train_input"] 53 | new_x_train2 = data["train_input2"] 54 | y_train = data["train_output"] 55 | 56 | self.X_train[i*const:(i+1)*const,:,0] = new_x_train 57 | self.X_train2[i*const:(i+1)*const,:,:,0] = new_x_train2 58 | 59 | self.labels[i*const:(i+1)*const,:,:] = y_train 60 | 61 | X_total = { 62 | "Input_1": self.X_train, 63 | "Input_2": self.X_train2 64 | } 65 | 66 | return X_total, self.labels 67 | 68 | 69 | def on_epoch_end(self): 70 | """Shuffle indices after each epoch""" 71 | np.random.shuffle(self.ind_rand) 72 | -------------------------------------------------------------------------------- /data_curation_tutorial/tutorial_ModiDeC_analysis.md: -------------------------------------------------------------------------------- 1 | # Tutorial Part 3: Data Analysis with ModiDeC 2 | 3 | ![GUI for retraining ModiDeC](https://github.com/mem3nto0/ModiDeC-RNA-modification-classifier/blob/main/data_curation_tutorial/Analysis_GUI.png) 4 | 5 | The data analysis GUI allows us to visualize the ModiDeC analysis in a simple way. 6 | 7 | ## Starting the analysis using ModiDeC neural network 8 | 9 | from the above figure, it is possible to see that several buttons can be pressed to load data, neural network, and reference for the data analysis. 10 | ModiDeC reconstructs the analysis directly on the reference. The data has to be basecalled using dorado (with the --emit-move flag) and aligned 11 | using sametools. 12 | 13 | This user interface allows the analysis of one reference at a time. 14 | 15 | Here is a list of what each button does: 16 | 17 | 1) "Pod5 file folder": load the folder containing the pod5 files that you want to analyze. 18 | 2) "bam file": load the bam file for the analysis of the transcript 19 | 3) "Neural network folder": load the folder where the model is stored. 20 | 4) "kmer-level table file": kmer level table that is gives from ONT. 21 | 5) "reference": load the reference for your single transcript. 22 | 23 | After these steps, press the button "initialize the data". this can take a few seconds to load the model. 24 | When the initialization is finished, you can select the total amount of reads to analyze. for example, if you want to analyze the initial 25 | 1000 reads, put as start_index = 0 and end_index = 1000. if you want to analyze all the reads, put start_index = 0 and end_index = -1. 26 | The analysis of a lot of reads can take a lot of time. For a good statistical analysis, we suggest a value of 5000 for the first analysis. 27 | 28 | Press the "start analysis with Neural Network" to let ModiDeC analyze your data. At the end of the analysis, a "ModiDeC_analysis.npz" file 29 | will be created in the current working folder. The file contains the analysis of ModiDeC, which shows the modification frequency for each 30 | modification that ModiDeC was trained on and for each nucleotide. 31 | 32 | The results can be also visualized using the GUI by pressing the "visualize results" button. A new window is open where the data can 33 | be visualized. 34 | 35 | ![figure plot](https://github.com/mem3nto0/ModiDeC-RNA-modification-classifier/blob/main/data_curation_tutorial/analysis_plot1.png) 36 | 37 | select the start and end reference points to visualize and press "plot". the window will change and the results of the selected reference 38 | region are shown. 39 | 40 | ![figure plot 2](https://github.com/mem3nto0/ModiDeC-RNA-modification-classifier/blob/main/data_curation_tutorial/analysis_plot2.png) 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ModiDeC-RNA-modification-classifier 2 | 3 | ![GUI for retraining ModiDeC](https://github.com/mem3nto0/ModiDeC-RNA-modification-classifier/blob/main/git_hub_modiDeC.png) 4 | 5 | ModiDeC is a Personalized two input neural network that was designed to identify RNA modifications from direct RNA sequencing using 6 | RNA002 or RNA004 Oxford Nanopore technology (ONT) kits. In detail, ModiDeC combines LSTM and a newly designed inception-res-net block for 7 | the multi-classification process. In this GitHub repository, we offer the ModiDeC models and several user graphic interfaces to retrain 8 | from scratch the neural network to readapt ModiDeC to your specific problem. 9 | 10 | ## Requirements and Installation 11 | 12 | ModiDeC uses simple libraries such as NumPy and TensorFlow. It also uses the pre-compiled library "ont-remora" from ONT. 13 | Here below is a list of the libraries used for ModiDeC creation: 14 | 15 | python == 3.10.14 16 | TensorFlow == 2.15 17 | pyqt5 == 5.15.11 18 | matplotlib == 3.9.1 19 | numpy == 1.26.4 20 | ont-remora == 3.2.0 21 | 22 | To install the Conda environment to run the ModiDeC GUI, we suggest running the two following command lines in the prompt 23 | of Linux or WSL. 24 | 25 | sudo apt install gcc -y 26 | conda env create -f /path_to_ModiDeC_folder/remora_TF2_env.yml 27 | 28 | These two command lines will install C and C++ interpreters (needed for Remora), and it will create a new conda environment called 29 | "Remora_TF2", which contains all the libraries necessary to run ModiDeC. 30 | 31 | IMPORTANT: The ont-remora library is a Linux-based library, which means that ModiDeC can be used in the Linx system or Windows with WSL. 32 | 33 | ## General information ModiDeC GUI 34 | 35 | ModiDeC GUI is divided in three sub-interfaces (see figure below), which each of them has a specific design. The ModiDeC GUI can be used in several ways, from retraining the neural network to directly 36 | analyzing an aligned sample using a pre-trained neural network. We decided to create the GUIs to give the opportunity to adapt and customize ModiDeC for specific problems. 37 | 38 | the figure below shows a general overview of ModiDeC GUIs. The first thing that can be observed is that the GUIs are divided into ”ModiDeC data curation”, “ModiDeC training” and “ModiDeC analysis”. 39 | 40 | ![GUI for retraining ModiDeC](https://github.com/mem3nto0/ModiDeC-RNA-modification-classifier/blob/main/Gui_pipeline.png) 41 | 42 | in the tutorial folder it is possible to find detailed tutorials for each of the three GUI. 43 | 44 | ## Epi2Me pipeline link 45 | 46 | We also implemented ModiDeC in Epi2Me. Epi2Me links repositary can be found here below. 47 | 48 | https://github.com/Nanopore-Hackathon/wf-modidec_data-curation 49 | 50 | https://github.com/Nanopore-Hackathon/wf-modidec_training 51 | 52 | https://github.com/Nanopore-Hackathon/wf-modidec_analysis 53 | 54 | ## Collaboration 55 | 56 | This work is a collaboration partnership with the group of Prof. Dr. Susanne Gerber, Uni Medical Center, Mainz. https://csg.uni-mainz.de/group-member/susanne-gerber/ 57 | 58 | ## Credit and Licence 59 | 60 | This code is provided by Dr. Nicolo Alagna and the Computational Systems Genetics Group of the University Medical Center of Mainz. © 2024 All rights reserved. 61 | 62 | This code is licensed for non-commercial academic use only. See LICENSE file. 63 | -------------------------------------------------------------------------------- /ModiDec_NN.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from keras.models import Model 3 | from keras.layers import Conv1D, MaxPooling1D , Add, Dropout , Dense , Conv2D 4 | from keras.layers import Input, Activation , Concatenate, LSTM , BatchNormalization, Reshape 5 | from keras.layers import Resizing , Masking, Multiply 6 | 7 | 8 | 9 | def Conv1D_swish_bn(x, N_filters, kernel, strides): 10 | 11 | x = Conv1D(N_filters, kernel, strides=strides, padding="same")(x) 12 | x = BatchNormalization()(x) 13 | x = Activation("swish")(x) 14 | 15 | return x 16 | 17 | 18 | def Conv2D_swish_bn(x, N_filters, kernel, strides): 19 | 20 | x = Conv2D(N_filters, kernel, strides=strides, padding="same")(x) 21 | x = BatchNormalization()(x) 22 | x = Activation("swish")(x) 23 | 24 | return x 25 | 26 | 27 | def Inception_res_block(x, N_filters): 28 | 29 | short = Conv1D(N_filters, 1, strides = 1, padding="same")(x) 30 | short = BatchNormalization()(short) 31 | 32 | x_Inc_1 = Conv1D_swish_bn(x, int(0.2*N_filters), kernel= 1, strides= 1) 33 | x_Inc_2 = Conv1D_swish_bn(x_Inc_1, int(0.35*N_filters), kernel= 3, strides= 1) 34 | x_Inc_3 = Conv1D_swish_bn(x_Inc_2, int(0.5*N_filters), kernel= 5, strides= 1) 35 | 36 | x_conc = Concatenate(axis=-1)([x_Inc_1, x_Inc_2, x_Inc_3]) 37 | x_conc = Conv1D(N_filters, 1, strides=1, padding="same")(x_conc) 38 | x_conc = BatchNormalization()(x_conc) 39 | 40 | out = Add()([short,x_conc]) 41 | out = BatchNormalization()(out) 42 | out = Activation("swish")(out) 43 | 44 | return out 45 | 46 | def Inception_res_block_2D(x, N_filters, kernel): 47 | 48 | short = Conv2D(N_filters, 1, strides = 1, padding="same")(x) 49 | short = BatchNormalization()(short) 50 | 51 | x_Inc_1 = Conv2D_swish_bn(x, int(0.2*N_filters), kernel= 1, strides= 1) 52 | x_Inc_2 = Conv2D_swish_bn(x_Inc_1, int(0.35*N_filters), kernel= kernel, strides= 1) 53 | x_Inc_3 = Conv2D_swish_bn(x_Inc_2, int(0.5*N_filters) + 1, kernel= kernel, strides= 1) 54 | 55 | x_conc = Concatenate(axis=-1)([x_Inc_1, x_Inc_2, x_Inc_3]) 56 | 57 | # Final 1x1 Conv to reduce dimensions to N_filters 58 | x_conc = Conv2D(N_filters, 1, strides=1, padding="same")(x_conc) 59 | x_conc = BatchNormalization()(x_conc) 60 | 61 | out = Add()([short,x_conc]) 62 | out = BatchNormalization()(out) 63 | out = Activation("swish")(out) 64 | 65 | return out 66 | 67 | 68 | def ModiDeC_model(Inp_1, Inp_2, labels, kmer_model): 69 | 70 | input_layer1 = Input((Inp_1,1), name='Input_1') 71 | 72 | x1 = Inception_res_block(input_layer1,256) 73 | x1 = MaxPooling1D(2)(x1) 74 | x1 = tf.keras.layers.Bidirectional(LSTM(128, return_sequences=True))(x1) 75 | 76 | x1 = Inception_res_block(x1,512) 77 | x1 = MaxPooling1D(2)(x1) 78 | x1 = tf.keras.layers.Bidirectional(LSTM(256, return_sequences=True))(x1) 79 | 80 | x1 = Inception_res_block(x1,1024) 81 | x1 = MaxPooling1D(2)(x1) 82 | x1 = tf.keras.layers.Bidirectional(LSTM(512, return_sequences=True))(x1) 83 | 84 | x1 = Reshape((Inp_2, int(x1.shape[1]*1024/Inp_2)))(x1) 85 | 86 | input_layer2 = Input((Inp_2,4,1), name='Input_2') 87 | masked_input = Masking(mask_value=0.0)(input_layer2) 88 | 89 | x2 = Inception_res_block_2D(masked_input , 128, (kmer_model,4)) 90 | x2 = Inception_res_block_2D(x2 , 256, (3,3)) 91 | 92 | x2 = Reshape((x2.shape[-3], x2.shape[-2] * x2.shape[-1]))(x2) 93 | x2 = Inception_res_block(x2,512) # 512 94 | 95 | x_con = Concatenate(axis=-1)([x1,x2]) 96 | 97 | x_con = Dense(1024)(x_con) 98 | x_con = BatchNormalization()(x_con) 99 | x_con = Activation("swish")(x_con) 100 | 101 | x_con = Dropout(0.2)(x_con) 102 | 103 | x_con = Dense(1024)(x_con) 104 | x_con = BatchNormalization()(x_con) 105 | x_con = Activation("swish")(x_con) 106 | 107 | x_con = Dropout(0.2)(x_con) 108 | 109 | x_LSTM = tf.keras.layers.Bidirectional(LSTM(256, return_sequences=True))(x_con) 110 | 111 | out_2 = Dense(labels, activation="sigmoid")(x_LSTM) 112 | model = Model(inputs = [input_layer1, input_layer2] , outputs = [out_2]) 113 | 114 | return model 115 | -------------------------------------------------------------------------------- /remora_TF2_env.yml: -------------------------------------------------------------------------------- 1 | name: remora_TF2 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=conda_forge 7 | - _openmp_mutex=4.5=2_gnu 8 | - asttokens=2.4.1=pyhd8ed1ab_0 9 | - bzip2=1.0.8=h5eee18b_6 10 | - ca-certificates=2024.7.4=hbcca054_0 11 | - comm=0.2.2=pyhd8ed1ab_0 12 | - debugpy=1.6.7=py310h6a678d5_0 13 | - decorator=5.1.1=pyhd8ed1ab_0 14 | - exceptiongroup=1.2.2=pyhd8ed1ab_0 15 | - executing=2.0.1=pyhd8ed1ab_0 16 | - importlib-metadata=8.2.0=pyha770c72_0 17 | - importlib_metadata=8.2.0=hd8ed1ab_0 18 | - ipykernel=6.29.5=pyh3099207_0 19 | - ipython=8.26.0=pyh707e725_0 20 | - jedi=0.19.1=pyhd8ed1ab_0 21 | - jupyter_client=8.6.2=pyhd8ed1ab_0 22 | - jupyter_core=5.7.2=py310hff52083_0 23 | - ld_impl_linux-64=2.38=h1181459_1 24 | - libffi=3.4.4=h6a678d5_1 25 | - libgcc-ng=14.1.0=h77fa898_0 26 | - libgomp=14.1.0=h77fa898_0 27 | - libsodium=1.0.18=h36c2ea0_1 28 | - libstdcxx-ng=11.2.0=h1234567_1 29 | - libuuid=1.41.5=h5eee18b_0 30 | - matplotlib-inline=0.1.7=pyhd8ed1ab_0 31 | - ncurses=6.4=h6a678d5_0 32 | - nest-asyncio=1.6.0=pyhd8ed1ab_0 33 | - openssl=3.3.1=h4bc722e_2 34 | - packaging=24.1=pyhd8ed1ab_0 35 | - parso=0.8.4=pyhd8ed1ab_0 36 | - pexpect=4.9.0=pyhd8ed1ab_0 37 | - pickleshare=0.7.5=py_1003 38 | - pip=24.0=py310h06a4308_0 39 | - platformdirs=4.2.2=pyhd8ed1ab_0 40 | - prompt-toolkit=3.0.47=pyha770c72_0 41 | - psutil=6.0.0=py310hc51659f_0 42 | - ptyprocess=0.7.0=pyhd3deb0d_0 43 | - pure_eval=0.2.3=pyhd8ed1ab_0 44 | - pygments=2.18.0=pyhd8ed1ab_0 45 | - python=3.10.14=h955ad1f_1 46 | - python_abi=3.10=2_cp310 47 | - pyzmq=25.1.2=py310h6a678d5_0 48 | - readline=8.2=h5eee18b_0 49 | - setuptools=69.5.1=py310h06a4308_0 50 | - six=1.16.0=pyh6c4a22f_0 51 | - sqlite=3.45.3=h5eee18b_0 52 | - stack_data=0.6.2=pyhd8ed1ab_0 53 | - tk=8.6.14=h39e8969_0 54 | - tornado=6.4.1=py310hc51659f_0 55 | - traitlets=5.14.3=pyhd8ed1ab_0 56 | - typing_extensions=4.12.2=pyha770c72_0 57 | - wcwidth=0.2.13=pyhd8ed1ab_0 58 | - wheel=0.43.0=py310h06a4308_0 59 | - xz=5.4.6=h5eee18b_1 60 | - zeromq=4.3.5=h6a678d5_0 61 | - zipp=3.19.2=pyhd8ed1ab_0 62 | - zlib=1.2.13=h5eee18b_1 63 | - pip: 64 | - absl-py==2.1.0 65 | - astunparse==1.6.3 66 | - cachetools==5.4.0 67 | - certifi==2024.7.4 68 | - charset-normalizer==3.3.2 69 | - contourpy==1.2.1 70 | - cycler==0.12.1 71 | - filelock==3.15.4 72 | - flatbuffers==24.3.25 73 | - fonttools==4.53.1 74 | - fsspec==2024.6.1 75 | - gast==0.6.0 76 | - google-auth==2.32.0 77 | - google-auth-oauthlib==1.2.1 78 | - google-pasta==0.2.0 79 | - grpcio==1.65.1 80 | - h5py==3.11.0 81 | - idna==3.7 82 | - importlib-resources==6.4.0 83 | - iso8601==2.1.0 84 | - jinja2==3.1.4 85 | - joblib==1.4.2 86 | - keras==2.15.0 87 | - kiwisolver==1.4.5 88 | - lib-pod5==0.3.12 89 | - libclang==18.1.1 90 | - markdown==3.6 91 | - markdown-it-py==3.0.0 92 | - markupsafe==2.1.5 93 | - matplotlib==3.9.1 94 | - mdurl==0.1.2 95 | - mizani==0.9.3 96 | - ml-dtypes==0.2.0 97 | - more-itertools==10.3.0 98 | - mpmath==1.3.0 99 | - namex==0.0.8 100 | - networkx==3.3 101 | - numpy==1.26.4 102 | - nvidia-cublas-cu12==12.1.3.1 103 | - nvidia-cuda-cupti-cu12==12.1.105 104 | - nvidia-cuda-nvcc-cu12==12.3.107 105 | - nvidia-cuda-nvrtc-cu12==12.1.105 106 | - nvidia-cuda-runtime-cu12==12.1.105 107 | - nvidia-cudnn-cu12==8.9.2.26 108 | - nvidia-cufft-cu12==11.0.2.54 109 | - nvidia-curand-cu12==10.3.2.106 110 | - nvidia-cusolver-cu12==11.4.5.107 111 | - nvidia-cusparse-cu12==12.1.0.106 112 | - nvidia-nccl-cu12==2.20.5 113 | - nvidia-nvjitlink-cu12==12.3.101 114 | - nvidia-nvtx-cu12==12.1.105 115 | - oauthlib==3.2.2 116 | - ont-remora==3.2.0 117 | - opt-einsum==3.3.0 118 | - optree==0.12.1 119 | - pandas==2.2.2 120 | - parasail==1.3.4 121 | - patsy==0.5.6 122 | - pillow==10.4.0 123 | - plotnine==0.12.4 124 | - pod5==0.3.12 125 | - polars==0.20.31 126 | - protobuf==4.25.3 127 | - pyarrow==16.1.0 128 | - pyasn1==0.6.0 129 | - pyasn1-modules==0.4.0 130 | - pyparsing==3.1.2 131 | - pyqt5==5.15.11 132 | - pyqt5-qt5==5.15.14 133 | - pyqt5-sip==12.15.0 134 | - pysam==0.22.1 135 | - python-dateutil==2.9.0.post0 136 | - pytz==2024.1 137 | - requests==2.32.3 138 | - requests-oauthlib==2.0.0 139 | - rich==13.7.1 140 | - rsa==4.9 141 | - scikit-learn==1.5.1 142 | - scipy==1.14.0 143 | - statsmodels==0.14.2 144 | - sympy==1.13.1 145 | - tensorboard==2.15.2 146 | - tensorboard-data-server==0.7.2 147 | - tensorflow==2.15.0 148 | - tensorflow-estimator==2.15.0 149 | - tensorflow-io-gcs-filesystem==0.37.1 150 | - termcolor==2.4.0 151 | - thop==0.1.1-2209072238 152 | - threadpoolctl==3.5.0 153 | - toml==0.10.2 154 | - torch==2.3.1 155 | - tqdm==4.66.4 156 | - triton==2.3.1 157 | - tzdata==2024.1 158 | - urllib3==2.2.2 159 | - vbz-h5py-plugin==1.0.1 160 | - werkzeug==3.0.3 161 | - wrapt==1.14.1 162 | prefix: /home/nicolo/anaconda3/envs/remora_TF2 163 | -------------------------------------------------------------------------------- /data_curation_tutorial/tutorial_data_creation.md: -------------------------------------------------------------------------------- 1 | # Tutorial Part 1: Data creation for training ModiDeC 2 | 3 | ![GUI for retraining ModiDeC](https://github.com/mem3nto0/ModiDeC-RNA-modification-classifier/blob/main/data_curation_tutorial/Figura_data_generation.png) 4 | 5 | The "Data Curation" (or data creation) GUI was created to give the opportunity to the user to generate personalized training data for ModiDeC, which can be used 6 | for further steps and retraining the neural network. 7 | 8 | The figure shows three sections with several variables as inputs. In this Tutorial, we will explain the several steps to do to correctly generate your own 9 | dataset for training ModiDeC for your specific problem. 10 | 11 | Initially, we will give a description of the inputs that can be introduced in the GUI for data creation. In the second part of the file, an example will be 12 | provide to show what are the steps to do for creating the training data. 13 | 14 | ## Important Steps for running the GUI 15 | 16 | data has to be basecalled using Dorado and aligned using samtools: 17 | 18 | 1) Basecall your data using Dorado with the --emit-move. It is necessary for resquiggleling process. 19 | 2) aligned using "samtools" to generate a .bam file 20 | 21 | ## Select Input files and save-directory (Section 1) 22 | 23 | In this section, the input files can be selected using the several Gui buttons. 24 | 25 | 1) "Pod5 file folder" button: Select the folder where the pod5 files are stored. The folder must contain only pod5 files. 26 | 2) "bam file folder" button: Select the folder where the bam files are stored. The folder must contain only pod5 files. 27 | 3) "Save path" button: select the folder where the training data will be saved. Create a specific folder for it. 28 | 4) "kmer-level table file" button: select the k-mer level table for the 004 or 002 kit. These files are provided by ONT. 29 | 30 | We use the bam folder instead of a single file selection because, in certain cases, multiple .bam files can be obtained by the same pod5 measurement. 31 | If this is the case, generating multiple .bam files the GUI automatically analyzes all the bam files without any data overwriting. 32 | 33 | Example for multiple bam: if you used the first alignment flag during the alignment using samtools, use samtools to generate a single .bam file for each 34 | reference. then create a folder containing all the bam files created in this way. Use this folder for the GUI and all the bam files will be used for the data generation of training data. 35 | 36 | ## General variable for training data (Section 2) 37 | 38 | This second section of the GUI focuses on giving sequence information for the data sets the user wants to use for training. Information like "modification position" or "modified data" 39 | can be selected and let the users use their oligos for retraining the neural network. here below, a description of the input is provided: 40 | 41 | 1) "modification_data?": it is a yes or no question. The user can specify if the data are modified or not. It is useful if the user wants to add un-modfied reads for the training. 42 | 2) "take_modification_region?": it is a yes or no question. The user can decide to use all the read for the analysis or use only the signal region around the 43 | modification position that can be selected a few steps later. For example, it is useful for un-modified data for taking more k-mer for the analysis. 44 | 3) "name_save_file": specify the name of the file that will be saved. For each modification that you want to analyze or if the data are modified or not, give a new name. 45 | 4) "What type of modification?: it is a string linked also to the modification dictionary. For example, if you have in your dictionary two modifications (m6A and Gm), type Gm if you want 46 | to create training data for Gm, or type m6A to create training data for m6A. 47 | 5) "Bases before modification": It can be a positive or negative integer. Choose the number of bases to consider before (positive values) or after (negative values) for the resquiggle. Use 0 48 | if you want to take only a few bases around the modification position. This feature can be useful depending on the oligos design. 49 | 6) "Modification dictionary": come separated list of the total modifications that ModiDeC has to learn. For example, For Gm and m6A write in the box "Gm,m6A". 50 | 51 | ## Segmentation variables for training data (Section 3) 52 | 53 | This third section focuses on raw signal and neural network features that can be personalized by the user. A description of the input is provided: 54 | 55 | 1) "batch size": it is the number of raw signal that we will be saved in a single file. This is helpful to reduce memory problems during the saving process. Recommended value 16. 56 | 2) "max seq- length": it is an integer linked to one of the inputs of the neural network. It is linked to the maximum number of bases to use for the input. A Good value is "chunk length" divided by 10. 57 | 3) "chunk length": it is an integer that tells you how much is bit the time window to extract from the raw signal. IT is linked to one of the inputs of the neural network. 58 | 4) "shift in time": indicates how many time points to move for creating a new representation of the modified raw signal. suggested value is "chunk length" divided "batch size". 59 | 5) "start read number" and "end read number": Integers to select the pod5 reads indexes to use for generating data. 60 | 61 | After filling all the variables, press the button "Start resguigle" and .npz files will be generated in the save-folder. 62 | 63 | ## Practical example training data generation: Create a training data set containing Gm and m6A modification 64 | 65 | We want to give a practical example on how to fill the GUI for generating training data for ModiDeC. We have two oligos, one containing one Gm modification at the reference position 64 and another one 66 | containing m6A at the reference position 75. Additionally, we also have an un-modified oligo as well. 67 | 68 | First step, basecall each of the three oligos pod5 files indipently Using Dorado with the --emit-move flag. this means that I will have a .ubam file for Gm, one for m6A and one for Un-mod. After it, 69 | Use sametools to align each basecalled data to its corresponding sequence to obtain three .bam files. in the end, we should have something like this: 70 | 71 | 1) Gm_pod5_folder + Gm_bam_folder(containing "Gm_aligned.bam" file) 72 | 2) m6A_pod5_folder + m6A_bam_folder(containing "m6A_aligned.bam" file) 73 | 3) unmodified_pod5_folder + unmofied_bam_folder(containing "unmodifed_aligned.bam" file) 74 | 75 | You want to save all of them in the same folder for the training, then create a folder called "training_data". 76 | 77 | Now, for this case we want that ModiDeC analyzes closely the modified signal. Having this purpose, we can set the "chunck size" parameter to 400, which means that the "max seq. length" is 40. 78 | additionally, we want to save 16 raw signals per file, which means that "batch size" is 16 and consequently shift in time is 25 (400/16). Setting in mind these values, we can run the GUI and start 79 | fo fill the variable for analyzing first Gm, m6a, and unmodified data. in the figure below you can see how the GUI was filled with our goal with the three runs. The red squares 80 | show what was changed in the GUI between each run. In each run, press the button "start resguille" for creating the training data. 81 | 82 | ![generating the data](https://github.com/mem3nto0/ModiDeC-RNA-modification-classifier/blob/main/data_curation_tutorial/data_creation_example.png) 83 | 84 | In the figure it is possible to observe that "pod5 file folder" and "bam file folder" are also marked in red. This because for each run you have to load the corresponding pod5 and bam folder. 85 | The "training_data" folder that we created will be filled with .npz files containing the modification resquiglle signal for the training phase. 86 | -------------------------------------------------------------------------------- /Analyze_data_NN.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import pod5 4 | from remora import io 5 | import matplotlib.pyplot as plt 6 | 7 | def NN_analyzer(variables, pod5_dr, bam_fh, read_id, sig_map_refiner, model, reference, labels_mod = 4): 8 | 9 | chunck_size = variables[2] 10 | max_seq_len = variables[3] 11 | labels = 4 12 | N_miss = 0 13 | 14 | reference_track_mod = np.zeros([len(reference), labels_mod]) 15 | 16 | if variables[1] == -1: 17 | 18 | variables[1] = len(read_id) 19 | 20 | if len(read_id) > variables[1]: 21 | 22 | end_reads = variables[1] 23 | 24 | else: 25 | end_reads = len(read_id) 26 | 27 | if end_reads < variables[0]: 28 | 29 | if end_reads - np.abs(variables[0] - variables[1]) < 0: 30 | start_reads = 0 31 | 32 | else: 33 | start_reads = end_reads - np.abs(variables[0] - variables[1]) 34 | 35 | else: 36 | start_reads = variables[0] 37 | 38 | #print(len(read_id)) 39 | #print(start_reads, end_reads) 40 | 41 | for name_id in read_id[start_reads: end_reads]: 42 | 43 | pod5_read = pod5_dr.get_read(name_id) 44 | bam_read = bam_fh.get_first_alignment(name_id) 45 | 46 | seq_resquigle = "" 47 | position_adjusting = 0 48 | Error_read = False 49 | 50 | if bam_read.is_reverse: #correct the signal for forward direction 51 | flip = False 52 | else: 53 | flip = True 54 | 55 | try: 56 | #/// read data 57 | read_analysed = io.Read.from_pod5_and_alignment(pod5_read, bam_read, reverse_signal = flip) 58 | 59 | #/// If data were aligned with U, U in sequence will be replaced by the T. Important for resquiggle 60 | prob_ref = read_analysed.ref_seq 61 | prob_ref = prob_ref.replace("U", "T") 62 | read_analysed.ref_seq = prob_ref 63 | 64 | # // resquigle the data with the reference 65 | read_analysed.set_refine_signal_mapping(sig_map_refiner, ref_mapping=True) 66 | 67 | start_of_mapping = read_analysed.extract_ref_reg( 68 | read_analysed.ref_reg.adjust(start_adjust = 0, end_adjust=read_analysed.ref_reg.len)) 69 | 70 | Raw_signal = start_of_mapping.norm_signal 71 | seq_resquigle = start_of_mapping.seq 72 | start_end_resquigle = start_of_mapping.seq_to_sig_map 73 | 74 | # /// check if the modification position has to be adjusted /// 75 | position_adjusting = start_of_mapping.ref_reg.start 76 | 77 | except: 78 | position_adjusting = 0 79 | seq_resquigle = "" 80 | Error_read = True 81 | 82 | if Error_read == False: 83 | 84 | base_dict = {"A":1, "C":2, "G":3, "T":4} 85 | bases_onehot = np.zeros([len(Raw_signal),4 + 1]) 86 | 87 | try: 88 | 89 | for k in range(len(seq_resquigle)): 90 | 91 | start_resq = start_end_resquigle[k] 92 | bases_onehot[start_resq,base_dict[seq_resquigle[k]]] = 1 93 | 94 | 95 | N_segments = int(len(Raw_signal)/chunck_size) 96 | Input_1 = np.zeros([N_segments +1,chunck_size]) # initialize the first input of the NN 97 | Input_2 = np.zeros([N_segments +1,max_seq_len,labels]) # initialize the second input of the NN 98 | 99 | for k in range (N_segments): 100 | 101 | start = k*chunck_size 102 | Input_1[k] = Raw_signal[start: start + chunck_size] 103 | 104 | window_onehot = bases_onehot[start: start + chunck_size,:] 105 | probe = np.argmax(window_onehot, axis=-1) 106 | probe = probe[probe != 0] 107 | probe = probe -1 108 | 109 | for kk in range(len(probe)): 110 | 111 | Input_2[k, kk, probe[kk]] = 1 112 | 113 | #find the number of point not overlapping 114 | not_overlaping_last_seg = len(Raw_signal) - (start + chunck_size) 115 | 116 | # the extention to +1 is for keeping the full dimention of the output 117 | Input_1[N_segments] = Raw_signal[-chunck_size:] 118 | 119 | Additional_window = bases_onehot[-chunck_size:,:] 120 | probe = np.argmax(Additional_window, axis = -1) 121 | probe = probe[probe != 0] 122 | probe = probe - 1 123 | 124 | for kk in range (len(probe)): 125 | 126 | Input_2[N_segments, kk, probe[kk]] = 1 127 | 128 | #probe the overlapping bases for the last segment 129 | Window_overlap = bases_onehot[-chunck_size:-not_overlaping_last_seg,:] 130 | seq_overlap = np.zeros([Window_overlap.shape[0],4]) 131 | probe = np.argmax(Window_overlap, axis = -1) 132 | probe = probe[probe != 0] 133 | probe = probe - 1 134 | 135 | for kk in range (len(probe)): 136 | 137 | seq_overlap[kk, probe[kk]] = 1 138 | 139 | seq_overlap = np.sum(seq_overlap, axis = 1) 140 | seq_overlap = np.where(seq_overlap > 0.5)[0] 141 | len_overlap = len(seq_overlap) 142 | 143 | Input_1 = np.expand_dims(Input_1, axis=-1) 144 | #Input_2 = np.expand_dims(Input_2, axis=-1) 145 | 146 | X_total ={"Input_1": Input_1, "Input_2": Input_2} 147 | 148 | #analyze the read with the NN 149 | 150 | prediction = model.predict(X_total, verbose=0) # 151 | 152 | # reconstruct the final output removing the null part of the predictions 153 | Final_seq_binary = [] 154 | 155 | for kk in range(N_segments): # 156 | 157 | full_position = np.sum(prediction[kk], axis = 1) 158 | full_position = np.where(full_position> 0.5)[0] 159 | 160 | real_part = np.argmax(prediction[kk,:len(full_position)], axis=-1) 161 | Final_seq_binary = np.concatenate((Final_seq_binary,real_part), axis=0) 162 | 163 | full_position = np.sum(prediction[N_segments], axis = 1) 164 | full_position = np.where(full_position> 0.5)[0] 165 | 166 | real_part = np.argmax(prediction[N_segments,:len(full_position)], axis=-1) 167 | not_overlaping_part = real_part[len_overlap:] 168 | Final_seq_binary = np.concatenate((Final_seq_binary,not_overlaping_part), axis=0) 169 | 170 | if (len(Final_seq_binary) - len(seq_resquigle)) != 0: 171 | 172 | N_miss += 1 173 | 174 | else: 175 | 176 | where_mod = np.where(Final_seq_binary >= 1)[0] 177 | modific_detec = np.zeros(len(where_mod)) 178 | 179 | for j in range(len(where_mod)): 180 | 181 | modific_detec[j] = Final_seq_binary[where_mod[j]] 182 | 183 | if len(modific_detec) > 1: 184 | 185 | for n in range(len(modific_detec)): 186 | 187 | mod_probe_position = where_mod[n] 188 | mod_probe_predicted = modific_detec[n] 189 | 190 | reference_track_mod[int(mod_probe_position) + int(position_adjusting), int(mod_probe_predicted -1)] += 1 191 | 192 | else: 193 | 194 | mod_probe_position = where_mod[0] 195 | mod_probe_predicted = modific_detec[0] 196 | 197 | reference_track_mod[int(mod_probe_position) + int(position_adjusting), int(mod_probe_predicted -1)] += 1 198 | 199 | except: 200 | 201 | None 202 | 203 | 204 | print("analysis finished") 205 | print("Total data to analyize:", np.abs(end_reads - start_reads)) 206 | print("data analyized:", np.abs(end_reads - start_reads) - N_miss) 207 | 208 | #return reference_track_mod 209 | 210 | # /// calculate the modification frequency lust by the number or reads analyzed/// 211 | 212 | return (reference_track_mod)/(np.abs(end_reads - start_reads - N_miss)) 213 | -------------------------------------------------------------------------------- /Analyze_data_NN_V2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import pod5 4 | from remora import io 5 | import matplotlib.pyplot as plt 6 | from Coverage_check import Coverage_analysis 7 | 8 | def NN_analyzer(variables, pod5_dr, bam_fh, read_id, sig_map_refiner, model, reference, labels_mod): 9 | 10 | chunck_size = variables[2] 11 | max_seq_len = variables[3] 12 | labels = 4 13 | N_miss = 0 14 | 15 | reference_track_mod = np.zeros([len(reference), labels_mod]) # matrix to track the modification 16 | Track_coverage = np.zeros([len(reference)]) 17 | 18 | if variables[1] == -1: 19 | 20 | variables[1] = len(read_id) 21 | 22 | if len(read_id) > variables[1]: 23 | 24 | end_reads = variables[1] 25 | 26 | else: 27 | end_reads = len(read_id) 28 | 29 | if end_reads < variables[0]: 30 | 31 | if end_reads - np.abs(variables[0] - variables[1]) < 0: 32 | start_reads = 0 33 | 34 | else: 35 | start_reads = end_reads - np.abs(variables[0] - variables[1]) 36 | 37 | else: 38 | start_reads = variables[0] 39 | 40 | #print(len(read_id)) 41 | #print(start_reads, end_reads) 42 | 43 | for name_id in read_id[start_reads: end_reads]: 44 | 45 | pod5_read = pod5_dr.get_read(name_id) 46 | bam_read = bam_fh.get_first_alignment(name_id) 47 | 48 | seq_resquigle = "" 49 | position_adjusting = 0 50 | Error_read = False 51 | 52 | if bam_read.is_reverse: #correct the signal for forward direction 53 | flip = False 54 | else: 55 | flip = True 56 | 57 | try: 58 | #/// read data 59 | read_analysed = io.Read.from_pod5_and_alignment(pod5_read, bam_read, reverse_signal = flip) 60 | 61 | #/// If data were aligned with U, U in sequence will be replaced by the T. Important for resquiggle 62 | prob_ref = read_analysed.ref_seq 63 | prob_ref = prob_ref.replace("U", "T") 64 | read_analysed.ref_seq = prob_ref 65 | 66 | # // resquigle the data with the refence 67 | read_analysed.set_refine_signal_mapping(sig_map_refiner, ref_mapping=True) 68 | 69 | start_of_mapping = read_analysed.extract_ref_reg( 70 | read_analysed.ref_reg.adjust(start_adjust = 0, end_adjust=read_analysed.ref_reg.len)) 71 | 72 | Raw_signal = start_of_mapping.norm_signal 73 | seq_resquigle = start_of_mapping.seq 74 | start_end_resquigle = start_of_mapping.seq_to_sig_map 75 | 76 | # /// check if the modification position has to be adjusted /// 77 | position_adjusting = start_of_mapping.ref_reg.start 78 | end_of_sequence = start_of_mapping.ref_reg.len 79 | 80 | except: 81 | position_adjusting = 0 82 | seq_resquigle = "" 83 | Error_read = True 84 | 85 | if Error_read == False: 86 | 87 | base_dict = {"A":1, "C":2, "G":3, "T":4} 88 | bases_onehot = np.zeros([len(Raw_signal),4 + 1]) 89 | 90 | try: 91 | 92 | for k in range(len(seq_resquigle)): 93 | 94 | start_resq = start_end_resquigle[k] 95 | bases_onehot[start_resq,base_dict[seq_resquigle[k]]] = 1 96 | 97 | 98 | N_segments = int(len(Raw_signal)/chunck_size) 99 | Input_1 = np.zeros([N_segments +1,chunck_size]) # initialize the first input of the NN 100 | Input_2 = np.zeros([N_segments +1,max_seq_len,labels]) # initialize the second input of the NN 101 | 102 | for k in range (N_segments): 103 | 104 | start = k*chunck_size 105 | Input_1[k] = Raw_signal[start: start + chunck_size] 106 | 107 | window_onehot = bases_onehot[start: start + chunck_size,:] 108 | probe = np.argmax(window_onehot, axis=-1) 109 | probe = probe[probe != 0] 110 | probe = probe -1 111 | 112 | for kk in range(len(probe)): 113 | 114 | Input_2[k, kk, probe[kk]] = 1 115 | 116 | #find the number of point not overlapping 117 | not_overlaping_last_seg = len(Raw_signal) - (start + chunck_size) 118 | 119 | # the extention to +1 is for keeping the full dimention of the output 120 | Input_1[N_segments] = Raw_signal[-chunck_size:] 121 | 122 | Additional_window = bases_onehot[-chunck_size:,:] 123 | probe = np.argmax(Additional_window, axis = -1) 124 | probe = probe[probe != 0] 125 | probe = probe - 1 126 | 127 | for kk in range (len(probe)): 128 | 129 | Input_2[N_segments, kk, probe[kk]] = 1 130 | 131 | #probe the overlapping bases for the last segment 132 | Window_overlap = bases_onehot[-chunck_size:-not_overlaping_last_seg,:] 133 | seq_overlap = np.zeros([Window_overlap.shape[0],4]) 134 | probe = np.argmax(Window_overlap, axis = -1) 135 | probe = probe[probe != 0] 136 | probe = probe - 1 137 | 138 | for kk in range (len(probe)): 139 | 140 | seq_overlap[kk, probe[kk]] = 1 141 | 142 | seq_overlap = np.sum(seq_overlap, axis = 1) 143 | seq_overlap = np.where(seq_overlap > 0.5)[0] 144 | len_overlap = len(seq_overlap) 145 | 146 | Input_1 = np.expand_dims(Input_1, axis=-1) 147 | 148 | X_total ={"Input_1": Input_1, "Input_2": Input_2} 149 | 150 | #analyze the read with the NN 151 | 152 | prediction = model.predict(X_total, verbose=0) # 153 | 154 | # reconstruct the final output removing the null part of the predictions 155 | Final_seq_binary = [] 156 | 157 | for kk in range(N_segments): # 158 | 159 | full_position = np.sum(Input_2[kk], axis = 1) 160 | full_position = np.where(full_position> 0.5)[0] 161 | 162 | real_part = np.argmax(prediction[kk,:len(full_position)], axis=-1) 163 | Final_seq_binary = np.concatenate((Final_seq_binary,real_part), axis=0) 164 | 165 | full_position = np.sum(Input_2[N_segments], axis = 1) 166 | full_position = np.where(full_position> 0.5)[0] 167 | 168 | real_part = np.argmax(prediction[N_segments,:len(full_position)], axis=-1) 169 | not_overlaping_part = real_part[len_overlap:] 170 | Final_seq_binary = np.concatenate((Final_seq_binary,not_overlaping_part), axis=0) 171 | 172 | if (len(Final_seq_binary) - len(seq_resquigle)) != 0: 173 | 174 | N_miss += 1 175 | 176 | else: 177 | 178 | where_mod = np.where(Final_seq_binary >= 1)[0] 179 | modific_detec = np.zeros(len(where_mod)) 180 | 181 | for j in range(len(where_mod)): 182 | 183 | modific_detec[j] = Final_seq_binary[where_mod[j]] 184 | 185 | if len(modific_detec) > 1: 186 | 187 | for n in range(len(modific_detec)): 188 | 189 | mod_probe_position = where_mod[n] 190 | mod_probe_predicted = modific_detec[n] 191 | 192 | reference_track_mod[int(mod_probe_position) + int(position_adjusting), int(mod_probe_predicted -1)] += 1 193 | 194 | else: 195 | 196 | mod_probe_position = where_mod[0] 197 | mod_probe_predicted = modific_detec[0] 198 | 199 | reference_track_mod[int(mod_probe_position) + int(position_adjusting), int(mod_probe_predicted -1)] += 1 200 | 201 | Track_coverage[ int(position_adjusting): int(position_adjusting) + int(end_of_sequence)] += 1 202 | 203 | except: 204 | 205 | None 206 | 207 | 208 | # //////// caculate the weight for each base calculating the distance from avarage covarage //////// 209 | # the median seems to work bad in few cases. need to check where is the error. 210 | 211 | N_tot_analyzed = np.abs(end_reads - start_reads - N_miss) 212 | Final_results = Coverage_analysis(N_tot_analyzed, reference_track_mod, Track_coverage, threshold = 0.2) 213 | 214 | print("analysis finished") 215 | print("Total data to analyize:", np.abs(end_reads - start_reads)) 216 | print("data analyized:", N_tot_analyzed) 217 | 218 | # /// calculate the modification frequency lust by the number or reads analyzed/// 219 | 220 | return Final_results -------------------------------------------------------------------------------- /Resquigle_remora_GUI.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import sys 3 | from PyQt5.QtWidgets import QApplication, QMainWindow, QPushButton, QFileDialog, QVBoxLayout, QWidget, QLineEdit, QLabel, QHBoxLayout, QCheckBox 4 | from Remora_resquigle_generate_data import Remora_resquigle_Generation_data 5 | import json 6 | import os 7 | import numpy as np 8 | 9 | class MainWindow(QMainWindow): 10 | def __init__(self): 11 | super().__init__() 12 | 13 | # list of variables 14 | self.paths = {"folder1": None, "folder2": None, "folder3": None, "folder4": None, "folder5": None} 15 | 16 | # Set up the main window 17 | self.setWindowTitle('Remora Resquigle - Generata training data for NN') 18 | self.setGeometry(100, 100, 320, 100) 19 | 20 | # Create a QWidget and set it as the central widget 21 | self.central_widget = QWidget() 22 | self.setCentralWidget(self.central_widget) 23 | 24 | # Create a vertical layout 25 | layout = QVBoxLayout() 26 | 27 | # Create buttons and add them to the layout 28 | self.button1 = QPushButton('Pod5 file folder') 29 | self.button1.clicked.connect(lambda: self.open_directory_dialog('folder1')) 30 | layout.addWidget(self.button1) 31 | 32 | self.button2 = QPushButton('bam file folder') 33 | self.button2.clicked.connect(lambda: self.open_directory_dialog('folder2')) 34 | layout.addWidget(self.button2) 35 | 36 | self.button3 = QPushButton('Save path') 37 | self.button3.clicked.connect(lambda: self.open_directory_dialog('folder4')) 38 | layout.addWidget(self.button3) 39 | 40 | self.button3 = QPushButton('kmer-level table file') 41 | self.button3.clicked.connect(lambda: self.open_filename_dialog('folder5')) 42 | layout.addWidget(self.button3) 43 | 44 | # set the first set of variables 45 | textbox1 = QLabel("General variables for training data:") 46 | layout.addWidget(textbox1) 47 | self.setup_variables(layout) 48 | 49 | # set the second set of variables 50 | textbox1 = QLabel("segmentation variables for training data:") 51 | layout.addWidget(textbox1) 52 | self.setup_variables_segmentation(layout) 53 | 54 | # Create buttons and add them to the layout 55 | self.button4 = QPushButton('Start resquigle') 56 | self.button4.clicked.connect(lambda: self.start_resquigle()) 57 | layout.addWidget(self.button4) 58 | 59 | 60 | # Set the layout on the central widget 61 | self.central_widget.setLayout(layout) 62 | 63 | 64 | """ list of function used in the main""" 65 | 66 | def open_directory_dialog(self, folder_name): 67 | # Open a dialog to choose a directory 68 | directory = QFileDialog.getExistingDirectory(self, f"Select {folder_name}") 69 | if directory: 70 | self.paths[folder_name] = directory 71 | print(f"Selected path for {folder_name}: {directory}") 72 | 73 | 74 | def open_filename_dialog(self, file_type): 75 | # Open a dialog to choose a file 76 | options = QFileDialog.Options() 77 | options |= QFileDialog.ReadOnly 78 | 79 | file_name, _ = QFileDialog.getOpenFileName(self, f"Select {file_type}", "", "All Files (*);;FASTA Files (*.fasta)", options=options) 80 | if file_name: 81 | self.paths[file_type] = file_name 82 | print(f"Selected path for {file_type}: {file_name}") 83 | 84 | def setup_variables(self, layout): 85 | # Creating layout and widgets for each variable in Variables tuple # "mod_mapping or basecalling?", 86 | labels = ["modified_data? (bool)", 87 | "take_modifed_region? (bool)", "name_save_file (str)", 88 | "what type of modification? (str)", 89 | "modification pos. (int)", "Bases before modfication (int)", "modification dictionary (str)"] 90 | 91 | self.vars_entries = ["mod_mapping"] # "mod_mapping" 92 | for i, label in enumerate(labels): 93 | row_layout = QHBoxLayout() 94 | label_widget = QLabel(label + ":") 95 | input_widget = QLineEdit() 96 | row_layout.addWidget(label_widget) 97 | row_layout.addWidget(input_widget) 98 | layout.addLayout(row_layout) 99 | self.vars_entries.append(input_widget) 100 | 101 | 102 | def setup_variables_segmentation(self, layout): 103 | labels_segmentation = ["batch size (int)", "max seq. length (int)", 104 | "chunk length (int)", "shift in time (int)", 105 | "start read number (int)", "end read number (int)"] 106 | 107 | self.segmentation_entries = [] 108 | for label in labels_segmentation: 109 | row_layout = QHBoxLayout() 110 | label_widget = QLabel(label + ":") 111 | input_widget = QLineEdit() 112 | row_layout.addWidget(label_widget) 113 | row_layout.addWidget(input_widget) 114 | layout.addLayout(row_layout) 115 | self.segmentation_entries.append(input_widget) 116 | 117 | """ """ 118 | def start_resquigle(self): 119 | 120 | #level_table_folder = self.paths["folder5"] 121 | #level_table_list = os.listdir(level_table_folder) #maybe to change to read the file and not the folder 122 | #level_table_file = level_table_folder + "/" + level_table_list[0] 123 | 124 | level_table_file = self.paths["folder5"] 125 | 126 | save_path = self.paths["folder4"] 127 | 128 | pod5_folder = self.paths["folder1"] 129 | bam_folder = self.paths["folder2"] 130 | bam_list = os.listdir(bam_folder) 131 | 132 | var1_bool = [] 133 | var2_bool = [] 134 | 135 | if self.vars_entries[1].text() == "yes" or self.vars_entries[1].text() == "Yes": 136 | 137 | var1_bool = True 138 | 139 | else: 140 | 141 | var1_bool = False 142 | 143 | 144 | if self.vars_entries[2].text() == "yes" or self.vars_entries[2].text() == "Yes": 145 | 146 | var2_bool = True 147 | 148 | else: 149 | 150 | var2_bool = False 151 | 152 | 153 | Variables = (self.vars_entries[0], #.text() 154 | var1_bool, #bool, 155 | var2_bool, #bool, 156 | self.vars_entries[3].text(), 157 | self.vars_entries[4].text(), 158 | int( self.vars_entries[5].text()), 159 | int( self.vars_entries[6].text()) 160 | ) 161 | 162 | variables_segmentation = (int( self.segmentation_entries[0].text()), 163 | int( self.segmentation_entries[1].text()), 164 | int( self.segmentation_entries[2].text()), 165 | int( self.segmentation_entries[3].text()) 166 | ) 167 | 168 | Indexes = (int( self.segmentation_entries[4].text()), 169 | int( self.segmentation_entries[5].text())) 170 | 171 | # /// create a dictionary /// 172 | 173 | probe_names = self.vars_entries[7].text() 174 | probe_names = probe_names.split(',') 175 | 176 | values = np.arange(2, len(probe_names) + 2, 1) 177 | 178 | # Convert the list into a dictionary with default values 179 | mod_dictionary = {probe_names[i]: values[i] for i in range(len(probe_names))} 180 | 181 | 182 | print(mod_dictionary) 183 | print(Variables) 184 | print(variables_segmentation) 185 | print(Indexes) 186 | 187 | for i in range (len(bam_list)): 188 | 189 | bam_file = bam_folder + "/" + bam_list[i] 190 | 191 | Remora_resquigle_Generation_data(pod5_folder, bam_file, 192 | level_table_file, save_path, 193 | Variables, variables_segmentation, 194 | Indexes, mod_dictionary, i) 195 | 196 | """ 197 | # /// save each bam file in a different generated folder 198 | bam_file = bam_folder + "/" + bam_list[i] 199 | 200 | #this creates several folder and save inside the data 201 | Directory = self.vars_entries[3].text() + f"reference_{i}" 202 | Final_path = os.path.join(save_path, Directory) 203 | os.mkdir(Final_path) 204 | 205 | Remora_resquigle_Generation_data(pod5_folder, bam_file, 206 | level_table_file, Final_path, #save_path 207 | Variables, variables_segmentation, 208 | Indexes, mod_dictionary, i) 209 | """ 210 | 211 | print("Resquigle finished") 212 | 213 | def main(): 214 | app = QApplication(sys.argv) 215 | window = MainWindow() 216 | window.show() 217 | sys.exit(app.exec_()) 218 | 219 | if __name__ == '__main__': 220 | main() -------------------------------------------------------------------------------- /Training_NN_GUI.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from PyQt5.QtWidgets import QApplication, QMainWindow, QPushButton, QFileDialog, QVBoxLayout, QWidget, QLineEdit, QLabel, QHBoxLayout, QCheckBox 3 | import tensorflow as tf 4 | from keras.callbacks import LearningRateScheduler 5 | from Load_data_for_training_V2 import Load_data_RNA 6 | from ModiDec_NN import ModiDeC_model 7 | import os 8 | import numpy as np 9 | 10 | class MainWindow(QMainWindow): 11 | def __init__(self): 12 | super().__init__() 13 | 14 | # list of variables 15 | self.paths = {"folder1": None, "folder2": None , "folder3": None} 16 | 17 | # Set up the main window 18 | self.setWindowTitle("Training Nueral network - modification classifier") 19 | self.setGeometry(100, 100, 320, 100) 20 | 21 | # Create a QWidget and set it as the central widget 22 | self.central_widget = QWidget() 23 | self.setCentralWidget(self.central_widget) 24 | 25 | # Create a vertical layout 26 | layout = QVBoxLayout() 27 | 28 | # Create buttons and add them to the layout 29 | self.button1 = QPushButton('training data folder') 30 | self.button1.clicked.connect(lambda: self.open_directory_dialog('folder1')) 31 | layout.addWidget(self.button1) 32 | 33 | # Create buttons and add them to the layout 34 | self.button2 = QPushButton('Validation data folder') 35 | self.button2.clicked.connect(lambda: self.open_directory_dialog('folder2')) 36 | layout.addWidget(self.button2) 37 | 38 | self.button3 = QPushButton('save model folder') 39 | self.button3.clicked.connect(lambda: self.open_directory_dialog('folder3')) 40 | layout.addWidget(self.button3) 41 | 42 | # set the first set of variables 43 | textbox1 = QLabel("General variables for training data:") 44 | layout.addWidget(textbox1) 45 | self.setup_variables(layout) 46 | 47 | # Create buttons and add them to the layout 48 | self.button4 = QPushButton('Start training') 49 | self.button4.clicked.connect(lambda: self.start_training()) 50 | layout.addWidget(self.button4) 51 | 52 | # Set the layout on the central widget 53 | self.central_widget.setLayout(layout) 54 | 55 | 56 | """ list of function used in the main""" 57 | 58 | def open_directory_dialog(self, folder_name): 59 | # Open a dialog to choose a directory 60 | directory = QFileDialog.getExistingDirectory(self, f"Select {folder_name}") 61 | if directory: 62 | self.paths[folder_name] = directory 63 | print(f"Selected path for {folder_name}: {directory}") 64 | 65 | """ 66 | labels = ["chunck_size (int)", "batch_size (int)", 67 | "single_data_size (int)", "max seq. length (int)", "k-mer model (int)", 68 | "labels (int)", "epoches (suggeste 4) (int)", "name NN (str)" ] 69 | """ 70 | 71 | def setup_variables(self, layout): 72 | # Creating layout and widgets for each variable in Variables tuple 73 | labels = ["batch_size (int)", "k-mer model (int)", "epoches (suggeste 4) (int)", "name NN (str)", "validation during training? (bool)" ] 74 | 75 | self.vars_entries = [] 76 | for i, label in enumerate(labels): 77 | row_layout = QHBoxLayout() 78 | label_widget = QLabel(label + ":") 79 | input_widget = QLineEdit() 80 | row_layout.addWidget(label_widget) 81 | row_layout.addWidget(input_widget) 82 | layout.addLayout(row_layout) 83 | self.vars_entries.append(input_widget) 84 | 85 | def start_training(self): 86 | 87 | """load the variables""" 88 | 89 | path_data = self.paths["folder1"] 90 | data_list = os.listdir(path_data) 91 | 92 | 93 | path_eval = self.paths["folder2"] 94 | eval_list = os.listdir(path_data) 95 | 96 | var1_bool = [] 97 | 98 | # //// extract variable for training from data training datasets /// 99 | 100 | probe_data = np.load(path_data + "/" + data_list[0]) 101 | 102 | probe_x1_data = probe_data["train_input"] 103 | probe_y_data = probe_data["train_output"] 104 | 105 | chunck_size = int(probe_x1_data.shape[1]) 106 | single_data_size = int(probe_x1_data.shape[0]) 107 | labels = int(probe_y_data.shape[2]) 108 | max_seq_len = int(probe_y_data.shape[1]) 109 | 110 | batch_size = int(self.vars_entries[0].text()) 111 | k_mer = int( self.vars_entries[1].text()) 112 | N_epoch = int( self.vars_entries[2].text()) 113 | 114 | "validation during training? (bool)" 115 | 116 | if self.vars_entries[4].text() == "yes" or self.vars_entries[4].text() == "Yes": 117 | 118 | var1_bool = True 119 | 120 | else: 121 | 122 | var1_bool = False 123 | 124 | 125 | """ /////define the model /////""" 126 | 127 | model = ModiDeC_model(Inp_1 = chunck_size, Inp_2 = max_seq_len, labels = labels, kmer_model=k_mer) 128 | 129 | """ /////compile the model for the training ///""" 130 | 131 | opt_adam =tf.keras.optimizers.Adam(learning_rate= 0.0001) 132 | 133 | model.compile(optimizer=opt_adam, 134 | loss= tf.losses.binary_crossentropy, 135 | metrics=["accuracy"]) 136 | 137 | def lr_schedule(epoch, optimizer): 138 | 139 | min_lr = 0.0000125 # Set the minimum learning rate 140 | 141 | # Update the learning rate if needed (similar to your original code) 142 | if epoch % 2 == 0 and epoch > 0: 143 | 144 | new_lr = tf.keras.backend.get_value(model.optimizer.lr) * 0.5 # You can adjust the decay factor as needed 145 | model.optimizer.lr.assign(new_lr) 146 | return max(new_lr, min_lr) 147 | 148 | else: 149 | return tf.keras.backend.get_value(model.optimizer.lr) 150 | 151 | lr_scheduler = LearningRateScheduler(lambda epoch: lr_schedule(epoch, optimizer=opt_adam)) 152 | 153 | if var1_bool == False: 154 | 155 | N_batches = int(len(data_list)/(batch_size/single_data_size)) 156 | 157 | """loading function used for training""" 158 | 159 | training_generator = Load_data_RNA(batch_size, N_batches, 160 | path_data, 161 | data_list, 162 | chunck_size = chunck_size, 163 | labels= labels , 164 | batch_loading = single_data_size, 165 | max_seq_len= max_seq_len) 166 | 167 | """start the training""" 168 | 169 | model.fit(training_generator, 170 | shuffle = True, 171 | epochs=N_epoch, 172 | workers= 6, 173 | max_queue_size=128, 174 | callbacks= [lr_scheduler]) 175 | 176 | """save the model""" 177 | 178 | model.save( self.paths["folder3"] + "/" + self.vars_entries[3].text()) 179 | 180 | print("training complete") 181 | 182 | else: 183 | 184 | N_batches = int(len(data_list)/(batch_size/single_data_size)) 185 | N_batches_2 = int(len(eval_list)/(batch_size/single_data_size)) 186 | 187 | """loading function used for training""" 188 | 189 | training_generator = Load_data_RNA(batch_size, N_batches, 190 | path_data, 191 | data_list, 192 | chunck_size = chunck_size, 193 | labels= labels , 194 | batch_loading = single_data_size, 195 | max_seq_len= max_seq_len) 196 | 197 | validation_generator = Load_data_RNA(batch_size, N_batches_2, 198 | path_eval, 199 | eval_list, 200 | chunck_size = chunck_size, 201 | labels= labels, 202 | batch_loading = single_data_size, 203 | max_seq_len= max_seq_len) 204 | 205 | """start the training""" 206 | 207 | model.fit(training_generator, 208 | validation_data = validation_generator, 209 | shuffle = True, 210 | epochs=N_epoch, 211 | workers= 6, 212 | max_queue_size=256, 213 | callbacks= [lr_scheduler]) 214 | 215 | """save the model""" 216 | 217 | model.save( self.paths["folder3"] + "/" + self.vars_entries[3].text()) 218 | 219 | print("training complete") 220 | 221 | 222 | def main(): 223 | app = QApplication(sys.argv) 224 | window = MainWindow() 225 | window.show() 226 | sys.exit(app.exec_()) 227 | 228 | if __name__ == '__main__': 229 | main() 230 | -------------------------------------------------------------------------------- /Analysis_platform_GUI.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | from PyQt5.QtWidgets import QApplication, QMainWindow, QPushButton, QVBoxLayout, QWidget, QLabel, QSlider, QLineEdit, QHBoxLayout , QFileDialog 4 | from PyQt5.QtCore import Qt 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas 8 | import pod5 9 | from remora import io , refine_signal_map, util 10 | import tensorflow as tf 11 | from Analyze_data_NN_V2 import NN_analyzer 12 | 13 | 14 | """ to generate a window into a window two main class has to be defined""" 15 | """ second window generator""" 16 | class RNA_analysis_platform(QWidget): 17 | def __init__(self, Analysis_NN): 18 | super().__init__() 19 | 20 | self.Analysis_NN = Analysis_NN 21 | self.initUI() 22 | 23 | def initUI(self): 24 | self.layout = QVBoxLayout() 25 | 26 | self.start_label = QLabel('Start Point:') 27 | self.end_label = QLabel('End Point:') 28 | self.start_input = QLineEdit(self) 29 | self.end_input = QLineEdit(self) 30 | self.start_slider = QSlider(Qt.Horizontal, self) 31 | self.end_slider = QSlider(Qt.Horizontal, self) 32 | 33 | self.start_input.setText('0') 34 | self.end_input.setText('10') 35 | self.start_slider.setMinimum(0) 36 | self.start_slider.setMaximum(self.Analysis_NN.shape[0]) 37 | self.start_slider.setValue(0) 38 | self.end_slider.setMinimum(0) 39 | self.end_slider.setMaximum(self.Analysis_NN.shape[0]) 40 | self.end_slider.setValue(10) 41 | 42 | self.start_input.textChanged.connect(self.update_start_slider) 43 | self.end_input.textChanged.connect(self.update_end_slider) 44 | self.start_slider.valueChanged.connect(self.update_start_input) 45 | self.end_slider.valueChanged.connect(self.update_end_input) 46 | 47 | self.hbox1 = QHBoxLayout() 48 | self.hbox1.addWidget(self.start_label) 49 | self.hbox1.addWidget(self.start_input) 50 | 51 | self.hbox2 = QHBoxLayout() 52 | self.hbox2.addWidget(self.end_label) 53 | self.hbox2.addWidget(self.end_input) 54 | 55 | self.layout.addLayout(self.hbox1) 56 | self.layout.addWidget(self.start_slider) 57 | self.layout.addLayout(self.hbox2) 58 | self.layout.addWidget(self.end_slider) 59 | 60 | self.plot_button = QPushButton('Plot', self) 61 | self.plot_button.clicked.connect(self.plot) 62 | 63 | self.layout.addWidget(self.plot_button) 64 | 65 | self.figure, self.ax = plt.subplots() 66 | self.canvas = FigureCanvas(self.figure) 67 | self.layout.addWidget(self.canvas) 68 | 69 | self.setLayout(self.layout) 70 | 71 | def update_start_slider(self, text): 72 | try: 73 | value = int(text) 74 | self.start_slider.setValue(value) 75 | except ValueError: 76 | pass 77 | 78 | def update_end_slider(self, text): 79 | try: 80 | value = int(text) 81 | self.end_slider.setValue(value) 82 | except ValueError: 83 | pass 84 | 85 | def update_start_input(self, value): 86 | self.start_input.setText(str(value)) 87 | 88 | def update_end_input(self, value): 89 | self.end_input.setText(str(value)) 90 | 91 | def plot(self): 92 | start = int(self.start_input.text()) 93 | end = int(self.end_input.text()) 94 | 95 | x_axis = np.arange(0,self.Analysis_NN.shape[0],1) 96 | 97 | self.ax.clear() 98 | 99 | """ 100 | self.ax.plot(x_axis,self.Analysis_NN[:,0], marker= "o" , label="Gm") 101 | self.ax.plot(x_axis,self.Analysis_NN[:,1], marker= "o" , label="$m^6A$") 102 | self.ax.plot(x_axis,self.Analysis_NN[:,2], marker= "o" , label="Ino") 103 | self.ax.plot(x_axis,self.Analysis_NN[:,3], marker= "o" , label="Psi") 104 | """ 105 | self.ax.plot(x_axis,self.Analysis_NN, marker= "o" , label="label") 106 | 107 | self.ax.set_xlim(start,end) 108 | self.ax.set_ylim(-0.05,1) 109 | #ax2.set_xlim(0,len(reference)) #len(reference) 110 | self.ax.set_xlabel("ref. seq. position") 111 | self.ax.set_ylabel("freq. modif.") 112 | self.ax.legend() 113 | self.canvas.draw() 114 | 115 | 116 | """ first window generator""" 117 | 118 | class MainWindow(QMainWindow): 119 | def __init__(self): 120 | super().__init__() 121 | 122 | # list of variables 123 | self.paths = {"folder1": None, "folder2": None, "folder3": None, "folder4": None, "folder5": None} 124 | 125 | # Set up the main window 126 | self.setWindowTitle('Analysis data Neural network') 127 | self.setGeometry(100, 100, 320, 100) 128 | 129 | # Create a QWidget and set it as the central widget 130 | self.central_widget = QWidget() 131 | self.setCentralWidget(self.central_widget) 132 | 133 | # Create a vertical layout 134 | layout = QVBoxLayout() 135 | 136 | # Create buttons and add them to the layout 137 | self.button1 = QPushButton('Pod5 file folder') 138 | self.button1.clicked.connect(lambda: self.open_directory_dialog('folder1')) 139 | layout.addWidget(self.button1) 140 | 141 | self.button2 = QPushButton('bam file') 142 | self.button2.clicked.connect(lambda: self.open_filename_dialog('folder2')) 143 | layout.addWidget(self.button2) 144 | 145 | self.button3 = QPushButton('Neural Network folder') 146 | self.button3.clicked.connect(lambda: self.open_directory_dialog('folder3')) 147 | layout.addWidget(self.button3) 148 | 149 | self.button4 = QPushButton('kmer-level table file') 150 | self.button4.clicked.connect(lambda: self.open_filename_dialog('folder4')) 151 | layout.addWidget(self.button4) 152 | 153 | self.button4 = QPushButton('reference') 154 | self.button4.clicked.connect(lambda: self.open_filename_dialog('folder5')) 155 | layout.addWidget(self.button4) 156 | 157 | self.button4 = QPushButton('Initialize the data') 158 | self.button4.clicked.connect(lambda: self.Initialize_Analysis()) 159 | layout.addWidget(self.button4) 160 | 161 | # set the first set of variables 162 | textbox1 = QLabel("General variables for the analysis:") 163 | layout.addWidget(textbox1) 164 | self.setup_variables(layout) 165 | 166 | self.button4 = QPushButton('start analysis with Neural network') 167 | self.button4.clicked.connect(lambda: self.Analysis_Neural_network()) 168 | layout.addWidget(self.button4) 169 | 170 | self.button4 = QPushButton('Visualize results') 171 | self.button4.clicked.connect(lambda: self.open_visualization_results()) 172 | layout.addWidget(self.button4) 173 | 174 | # Set the layout on the central widget 175 | self.central_widget.setLayout(layout) 176 | 177 | 178 | """ list of function used in the mainWindow""" 179 | 180 | def open_directory_dialog(self, folder_name): 181 | # Open a dialog to choose a directory 182 | directory = QFileDialog.getExistingDirectory(self, f"Select {folder_name}") 183 | if directory: 184 | self.paths[folder_name] = directory 185 | print(f"Selected path for {folder_name}: {directory}") 186 | 187 | def open_filename_dialog(self, file_type): 188 | # Open a dialog to choose a file 189 | options = QFileDialog.Options() 190 | options |= QFileDialog.ReadOnly 191 | 192 | file_name, _ = QFileDialog.getOpenFileName(self, f"Select {file_type}", "", "All Files (*);;FASTA Files (*.fasta)", options=options) 193 | if file_name: 194 | self.paths[file_type] = file_name 195 | print(f"Selected path for {file_type}: {file_name}") 196 | 197 | def setup_variables(self, layout): 198 | # Creating layout and widgets for each variable in Variables tuple 199 | labels = ["start_index", "end_index"] 200 | 201 | self.vars_entries = [] 202 | for i, label in enumerate(labels): 203 | row_layout = QHBoxLayout() 204 | label_widget = QLabel(label + ":") 205 | input_widget = QLineEdit() 206 | row_layout.addWidget(label_widget) 207 | row_layout.addWidget(input_widget) 208 | layout.addLayout(row_layout) 209 | self.vars_entries.append(input_widget) 210 | 211 | 212 | def Initialize_Analysis(self): 213 | 214 | pod5_path = self.paths["folder1"] 215 | bam_pathr = self.paths["folder2"] 216 | model_path = self.paths["folder3"] 217 | level_table_file = self.paths["folder4"] 218 | 219 | self.pod5_dr = pod5.DatasetReader(pod5_path) 220 | self.bam_fh = io.ReadIndexedBam(bam_pathr) 221 | 222 | self.read_id = self.bam_fh.read_ids 223 | 224 | self.sig_map_refiner = refine_signal_map.SigMapRefiner( 225 | kmer_model_filename=level_table_file, 226 | do_rough_rescale=True, 227 | scale_iters=0, 228 | do_fix_guage=True) 229 | 230 | self.NN_model = tf.keras.models.load_model(model_path) 231 | 232 | input_shapes = self.NN_model.input_shape 233 | output_shape = self.NN_model.output_shape 234 | 235 | self.chunck_size = int(input_shapes[0][1]) 236 | self.max_seq_len = int(input_shapes[1][1]) 237 | self.total_mod = output_shape[2] - 1 238 | 239 | print("initialize: Done") 240 | 241 | def Analysis_Neural_network(self): 242 | 243 | Variables = (int(self.vars_entries[0].text()), 244 | int(self.vars_entries[1].text()), 245 | int(self.chunck_size), 246 | int(self.max_seq_len)) 247 | 248 | 249 | reference_path = self.paths["folder5"] 250 | reference = open(reference_path) 251 | reference = reference.read() 252 | 253 | self.Analysis_NN = NN_analyzer(Variables, 254 | self.pod5_dr, 255 | self.bam_fh, 256 | self.read_id, 257 | self.sig_map_refiner, 258 | self.NN_model, 259 | reference, 260 | labels_mod = self.total_mod) 261 | 262 | print("Analysis finished") 263 | 264 | def open_visualization_results(self): 265 | self.gaussian_plot = RNA_analysis_platform(self.Analysis_NN) 266 | self.gaussian_plot.show() 267 | 268 | 269 | def main(): 270 | app = QApplication(sys.argv) 271 | window = MainWindow() 272 | window.show() 273 | sys.exit(app.exec_()) 274 | 275 | if __name__ == '__main__': 276 | main() 277 | -------------------------------------------------------------------------------- /Remora_resquigle_generate_data.py: -------------------------------------------------------------------------------- 1 | import pod5 2 | from remora import io , refine_signal_map, util 3 | import os 4 | import numpy as np 5 | 6 | 7 | def Remora_resquigle_Generation_data(data_path, bam_file, level_table_file, save_path, Variables, variables_segmentation, Indexes, mod_dictionary, ind_loop): 8 | 9 | #initial variable 10 | type_analysis = Variables[0] 11 | modified_data = Variables[1] 12 | take_mod_region = Variables[2] 13 | name_save_file = Variables[3] 14 | Modfied_base = Variables[4] 15 | mod_pos_initial = Variables[5] 16 | start_base_resquigle = Variables[6] 17 | 18 | #second variable for chunk size creations 19 | batch_size = variables_segmentation[0] 20 | max_label_length = variables_segmentation[1] 21 | time_segment = variables_segmentation[2] 22 | shift = variables_segmentation[3] 23 | 24 | # /////// read the files ////// 25 | 26 | pod5_dr = pod5.DatasetReader(data_path) 27 | bam_fh = io.ReadIndexedBam(bam_file) 28 | 29 | # /////// take the name of reads//// 30 | 31 | read_id = bam_fh.read_ids 32 | 33 | # /// define the function for resquile from Remora /// 34 | # // old version used for DNA. maybe DNA data has to be analysed again // 35 | 36 | sig_map_refiner = refine_signal_map.SigMapRefiner( 37 | kmer_model_filename=level_table_file, 38 | do_rough_rescale=True, 39 | scale_iters=0, 40 | do_fix_guage=True) 41 | 42 | if type_analysis == "mod_mapping": 43 | 44 | labels = len(mod_dictionary) 45 | 46 | if type_analysis == "basecalling": 47 | 48 | labels = 4 49 | 50 | start_Index = Indexes[0] 51 | 52 | for name_id in read_id[Indexes[0]: Indexes[1]]: #need to find a way to choose the ids. 53 | 54 | start_Index += 1 55 | print(start_Index) 56 | seq_resquigle = "" 57 | position_adjusting = 0 58 | Error_read = False 59 | 60 | # /// extract the select read and info from bam file /// 61 | 62 | pod5_read = pod5_dr.get_read(name_id) 63 | bam_read = bam_fh.get_first_alignment(name_id) 64 | 65 | # /// after extraction, obtain the basecalling information /// 66 | 67 | if bam_read.is_reverse: #correct the signal for forward direction 68 | flip = False 69 | else: 70 | flip = True 71 | 72 | try: 73 | #/// read data 74 | read_analysed = io.Read.from_pod5_and_alignment(pod5_read, bam_read, reverse_signal = flip) 75 | 76 | #/// If data were aligned with U, U in sequence will be replaced by the T. Important for resquiggle 77 | prob_ref = read_analysed.ref_seq 78 | prob_ref = prob_ref.replace("U", "T") 79 | read_analysed.ref_seq = prob_ref 80 | 81 | # // resquigle the data with the refence 82 | read_analysed.set_refine_signal_mapping(sig_map_refiner, ref_mapping=True) 83 | 84 | start_of_mapping = read_analysed.extract_ref_reg( 85 | read_analysed.ref_reg.adjust(start_adjust = 0, end_adjust=read_analysed.ref_reg.len)) 86 | 87 | Raw_signal = start_of_mapping.norm_signal 88 | seq_resquigle = start_of_mapping.seq 89 | start_end_resquigle = start_of_mapping.seq_to_sig_map 90 | 91 | # /// check if the modification position has to be adjusted /// 92 | position_adjusting =start_of_mapping.ref_reg.start 93 | 94 | except: 95 | 96 | print("error") 97 | position_adjusting = 0 98 | seq_resquigle = "" 99 | Error_read = True 100 | 101 | """ 102 | mod_pos = mod_pos_initial - position_adjusting - 1 103 | max_signal_length = Raw_signal[0 : mod_pos + time_segment] 104 | """ 105 | 106 | val_total_seq = position_adjusting + len(seq_resquigle) 107 | high_threshold = mod_pos_initial + 20 108 | 109 | # // select only high score quality, extrapolate signal and save data // 110 | 111 | start_analysis = False 112 | 113 | if take_mod_region == True: 114 | 115 | if high_threshold < val_total_seq and position_adjusting < mod_pos_initial and Error_read == False: 116 | 117 | start_analysis = True 118 | 119 | else: 120 | 121 | if Error_read == False: 122 | 123 | start_analysis = True 124 | 125 | 126 | if start_analysis == True: # ///////// TO CHECK !!! //////////// 127 | 128 | Signal_onehot = np.zeros([len(Raw_signal),4 + 1]) 129 | Output_onehot = np.zeros([len(Raw_signal), labels + 2]) 130 | 131 | mod_pos = mod_pos_initial - position_adjusting - 1 132 | 133 | if modified_data == True: 134 | 135 | seq_resquigle_mod = seq_resquigle[:mod_pos] + "X" + seq_resquigle[mod_pos +1:] 136 | 137 | else: 138 | 139 | seq_resquigle_mod = seq_resquigle 140 | 141 | if type_analysis == "mod_mapping": 142 | 143 | #modification_dict = {"G":2, "M":3, "I":4, "P":5} 144 | value_modification = int(mod_dictionary[Modfied_base]) 145 | base_dict_output = { "A":1, "C":1, "G":1, "T":1,"X":value_modification} # variable 146 | 147 | if type_analysis == "basecalling": 148 | 149 | base_dict_output = { "A":1, "C":2, "G":3, "T":4, "X":5} 150 | 151 | base_dict = {"A":1, "C":2, "G":3, "T":4} 152 | 153 | try: 154 | 155 | for k in range(len(seq_resquigle)): 156 | 157 | start_resq = start_end_resquigle[k] 158 | Signal_onehot[start_resq,base_dict[seq_resquigle[k]]] = 1 159 | Output_onehot[start_resq,base_dict_output[seq_resquigle_mod[k]]] = 1 160 | 161 | if type_analysis == "mod_mapping" and modified_data == True: 162 | 163 | mod_position = np.where(Output_onehot[:,value_modification] > 0)[0][0] 164 | 165 | if type_analysis == "mod_mapping" and modified_data == False: 166 | 167 | if take_mod_region == True: 168 | 169 | mod_position = np.where(Output_onehot[:,1] > 0)[0][mod_pos] 170 | 171 | else: 172 | 173 | mod_position = 0 174 | 175 | if type_analysis == "basecalling" and modified_data == True: 176 | 177 | mod_position = np.where(Output_onehot[:,5] > 0)[0][0] 178 | 179 | if type_analysis == "basecalling" and modified_data == False: # to check for the others 180 | 181 | if take_mod_region == True: 182 | 183 | mod_position = np.where(Output_onehot[:,1] > 0)[0][mod_pos] 184 | 185 | else: 186 | 187 | mod_position = 0 188 | 189 | if take_mod_region == True: 190 | 191 | minus_start = np.abs(start_end_resquigle[mod_pos - start_base_resquigle] - mod_position) 192 | 193 | N_shift = int((time_segment + minus_start)/shift) 194 | 195 | else: 196 | 197 | N_shift = int((len(Raw_signal) - time_segment)/shift) 198 | 199 | for n in range(int(N_shift/batch_size)): 200 | 201 | train1_batch = np.zeros([batch_size, time_segment]) 202 | train2_batch = np.zeros([batch_size, max_label_length, 4]) 203 | output_batch = np.zeros([batch_size, max_label_length, 1 + labels]) 204 | 205 | for m in range(batch_size): 206 | 207 | if take_mod_region == True: 208 | 209 | midlle_mod_position = mod_position #+ int(0.5*np.abs(start_end_resquigle[mod_pos + 1] - start_end_resquigle[mod_pos])) 210 | start = midlle_mod_position - n*batch_size*shift - m*shift 211 | end = start + time_segment 212 | 213 | else: 214 | 215 | start = n*batch_size*shift + m*shift 216 | end = start + time_segment 217 | 218 | output_for_batch = np.zeros([max_label_length,1 + labels]) 219 | train2_for_batch = np.zeros([max_label_length,4]) 220 | 221 | # // here I am using a trick. All the bases has no zero value 222 | # making again the one-hot into an array and removing the 0 values, 223 | # I obtain the index of the final one-hot sequence for train2 and output 224 | 225 | probe_1 = np.argmax(Signal_onehot[start:end,:], axis = -1) 226 | probe_1 = probe_1[probe_1 != 0] 227 | probe_1 = probe_1 - 1 228 | 229 | probe_2 = np.argmax(Output_onehot[start:end,:], axis = -1) 230 | probe_2 = probe_2[probe_2 != 0] 231 | probe_2 = probe_2 - 1 232 | 233 | try: 234 | 235 | for kk in range(len(probe_1)): 236 | 237 | train2_for_batch[kk, probe_1[kk]] = 1 238 | output_for_batch[kk, probe_2[kk]] = 1 239 | 240 | except: 241 | 242 | for kk in range(max_label_length): 243 | 244 | train2_for_batch[kk, probe_1[kk]] = 1 245 | output_for_batch[kk, probe_2[kk]] = 1 246 | 247 | # try/expect is places for data that are too short for storage 248 | # the problem is only related to modified data. 249 | 250 | try: 251 | 252 | train1_batch[m] = Raw_signal[start:end] 253 | train2_batch[m] = train2_for_batch 254 | output_batch[m] = output_for_batch 255 | 256 | except: 257 | 258 | if mod_position < int(time_segment/2): 259 | start = mod_position 260 | end = start + time_segment 261 | 262 | else: 263 | start = mod_position - int(time_segment/2) 264 | end = start + time_segment 265 | 266 | probe_1 = np.argmax(Signal_onehot[start:end,:], axis = -1) 267 | probe_1 = probe_1[probe_1 != 0] 268 | probe_1 = probe_1 - 1 269 | 270 | probe_2 = np.argmax(Output_onehot[start:end,:], axis = -1) 271 | probe_2 = probe_2[probe_2 != 0] 272 | probe_2 = probe_2 - 1 273 | 274 | try: 275 | 276 | for kk in range(len(probe_1)): 277 | 278 | train2_for_batch[kk, probe_1[kk]] = 1 279 | output_for_batch[kk, probe_2[kk]] = 1 280 | 281 | except: 282 | 283 | for kk in range(max_label_length): 284 | 285 | train2_for_batch[kk, probe_1[kk]] = 1 286 | output_for_batch[kk, probe_2[kk]] = 1 287 | 288 | train1_batch[m] = Raw_signal[start:end] 289 | train2_batch[m] = train2_for_batch 290 | output_batch[m] = output_for_batch 291 | 292 | file_name = name_save_file + f"{int(ind_loop)}_{int(start_Index)}" + f"_{n}.npz" 293 | 294 | np.savez_compressed(os.path.join(save_path,file_name), 295 | train_input = train1_batch, 296 | train_input2 = train2_batch, 297 | train_output = output_batch) 298 | 299 | # // save long rads enter in the quality check. maybe is not necessary 300 | 301 | 302 | except: 303 | print("resquigle error") 304 | 305 | --------------------------------------------------------------------------------