├── LICENSE ├── README.md ├── file_handling ├── clustal_to_fasta.py ├── clustal_to_tsv.py ├── compare_bed.py ├── extract_accession_no.py ├── extract_fasta_headers.py ├── extract_fasta_records.py ├── extract_seq.py ├── fasta2db_feed.py ├── fasta2fastq.py ├── fasta_concatenator.py ├── fasta_record_finder.py ├── fastq2fasta.py ├── file_comparison.py ├── ftp_download.py ├── gdc_download.py ├── multi_fasta_deconcatenator.py ├── mysqldb_find.py └── seq_concatenator.py ├── machine_learning ├── brca_classifier-1.py └── dna_classifier-1.py ├── sequence_analysis ├── PSSM.py ├── aa_comp.py ├── alignment2consensus.py ├── base_composition.py ├── consensus.py ├── gc_percent.py ├── hydrophobicity_plot.py ├── k-mer_constructor.py ├── kmer_constructor-1.py ├── orf_analyzer.py ├── prot_mol_weight_calculator.py ├── prototype_aligner1.py ├── random_seq_generator-1.py ├── random_seq_generator.py ├── temp_to_cod.py └── translate.py ├── supplementary_data └── images │ ├── dna_helix.png │ └── nt_seq_logo.png └── visualization ├── dna_fasta_visualization.py ├── dna_helix_visualizer.py └── streamlit_base_comp.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Rajan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bioinfo_py_Scripts 2 | 3 | ## Python Scripts for Bioinformatics 4 | 5 | ### Introduction 6 | Bioinfo_py_Scripts is a curated GitHub repository with useful scripts for bioinformatics. The scripts are command line based and written/tested on python 3.7 or higher versions within Linux operating system. This is a free and open-source repository so feel free to use and contribute. 7 | 8 | ### Table of Content 9 | 1. [File Handling](https://github.com/rajanbit/Bioinfo_py_Scripts/wiki/File-Handling) 10 | 2. [Sequence Analysis](https://github.com/rajanbit/Bioinfo_py_Scripts/wiki/Sequence-Analysis) 11 | 3. [Visualization](https://github.com/rajanbit/Bioinfo_py_Scripts/wiki/Visualization) 12 | 4. 
[Machine Learning](https://github.com/rajanbit/Bioinfo_py_Scripts/wiki/Machine-Learning) 13 | 14 | ### Documentation 15 | ****Wiki***: https://github.com/rajanbit/Bioinfo_py_Scripts/wiki* 16 | 17 | -------------------------------------------------------------------------------- /file_handling/clustal_to_fasta.py: -------------------------------------------------------------------------------- 1 | # Clustal Omega to FASTA 2 | 3 | import sys 4 | import re 5 | 6 | msa_open = open(sys.argv[1], "r") 7 | msa_fast = open(sys.argv[2], "w+") 8 | msa_rl = msa_open.readlines() 9 | 10 | s_char = ["*",":"] 11 | 12 | temp_l = [] 13 | head_l = [] 14 | for line in msa_rl: 15 | line = line.strip() 16 | 17 | if "CLUSTAL" in line: 18 | pass 19 | 20 | elif line == "": 21 | pass 22 | 23 | elif line == "\n": 24 | pass 25 | 26 | else: 27 | m = re.search(r'\d+$', line) 28 | 29 | if m is not None: 30 | line = line.split("\t") 31 | line = re.sub("\s\s+", ",",line[0]) 32 | line = line.split(",") 33 | temp_l.append(line) 34 | head_l.append(line[0]) 35 | 36 | head_l = list(set(head_l)) 37 | 38 | for head in head_l: 39 | msa_fast.write(">"+head+"\n") 40 | for data in temp_l: 41 | if head in data: 42 | msa_fast.write(data[1]+"\n") 43 | 44 | # python clustal_to_fasta.py 45 | -------------------------------------------------------------------------------- /file_handling/clustal_to_tsv.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | 4 | msa_open = open(sys.argv[1], "r") 5 | msa_rl = msa_open.readlines() 6 | tsv = open("result.tsv", "w+") 7 | 8 | s_char = ["*",":",] 9 | 10 | temp = "" 11 | for line in msa_rl: 12 | line = line.strip() 13 | 14 | if "CLUSTAL" in line: 15 | temp+=line+"\n" 16 | 17 | elif line == "": 18 | temp += "\n" 19 | 20 | elif line == "\n": 21 | temp += "\n" 22 | 23 | else: 24 | m = re.search(r'\d+$', line) 25 | 26 | if m is not None: 27 | line = line.replace("\t", ",") 28 | line = re.sub("\s\s+", 
",",line).split(",") 29 | temp += line[0]+"\t" 30 | for i in range(0,len(line[1])): 31 | temp += line[1][i]+"\t" 32 | temp += line[2]+"\n" 33 | tsv.write(temp) 34 | -------------------------------------------------------------------------------- /file_handling/compare_bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | 5 | # Compare two bed files for sequence overlaps 6 | 7 | import sys 8 | 9 | BED2 = open(sys.argv[2], "r") 10 | BED1 = open(sys.argv[1], "r") 11 | BED2_rl = BED2.readlines() 12 | BED1_rl = BED1.readlines() 13 | print("\n# Overlapping Regions . . . . . . . . . . . . . . . .\n") 14 | for line2 in BED2_rl: 15 | BED2_tl= list(range(int(line2.split("\t")[1]),int(line2.split("\t")[2]))) 16 | for line1 in BED1_rl: 17 | BED1_tl= list(range(int(line1.split("\t")[1]),int(line1.split("\t")[2]))) 18 | 19 | out = any(check in BED1_tl for check in BED2_tl) 20 | 21 | if out: 22 | print("BED2:"+line2+"--BED1:"+line1) 23 | 24 | # $ python compare_bed.py file1.bed file2.bed 25 | -------------------------------------------------------------------------------- /file_handling/extract_accession_no.py: -------------------------------------------------------------------------------- 1 | import sys 2 | f_file = sys.argv[1] 3 | mlti_fasta = open(f_file, "r") 4 | acc_no = open("accession_no.txt", "w+") 5 | for line in mlti_fasta: 6 | if line[0] == ">": 7 | acc_no.write(line[1:10] + "\n") 8 | 9 | # python extract_accession_no.py 10 | -------------------------------------------------------------------------------- /file_handling/extract_fasta_headers.py: -------------------------------------------------------------------------------- 1 | import sys 2 | f_fasta = sys.argv[1] 3 | mlti_fasta = open(f_fasta, "r") 4 | head = open("fasta_headers.txt", "w") 5 | for line in mlti_fasta: 6 | if line[0] == ">": 7 | head.write(line[1:]) 8 | head.close() 9 | 10 | # python extract_fasta_headers.py 11 | 
-------------------------------------------------------------------------------- /file_handling/extract_fasta_records.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Extracts Fasta_records from Multi_Fasta file 4 | # whose ids are in Accession_Ids file 5 | 6 | import sys 7 | file1 = sys.argv[1] # MULTI_FASTA File 8 | file2 = sys.argv[2] # ACCESSION_IDS File 9 | acc_id = open(file2, "r") 10 | mlti_fasta = open(file1, "r") 11 | out_fasta = open("output.fasta", "w+") 12 | acc_list = [] 13 | for line in acc_id: 14 | acc_list.append(line.strip()) 15 | seq ="" 16 | for line in mlti_fasta: 17 | if line[0] == ">" and seq == "": 18 | header = line 19 | elif line[0]!= ">": 20 | seq += line 21 | elif line[0] == ">" and seq != "": 22 | for i in acc_list: 23 | if i in header: 24 | out_fasta.write(header + seq) 25 | seq = "" 26 | header = line 27 | if i in header: 28 | out_fasta.write(header + seq) 29 | 30 | # python extract_fasta_records.py 31 | -------------------------------------------------------------------------------- /file_handling/extract_seq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Program to extract nucleotide or protein 4 | # sequence of particular index from a Fasta file 5 | 6 | import sys 7 | 8 | file_db = open(sys.argv[1], "r") 9 | 10 | head = "" 11 | seq = "" 12 | fasta_seq = "" 13 | 14 | for line in file_db: 15 | if line[0:1] == ">" and seq == "": 16 | head = line 17 | elif line[0:1] != ">": 18 | seq += line.strip() 19 | elif line[0:1] == ">" and seq != "": 20 | print("Multi FASTA record found ...\nProgram Break ...\n") 21 | quit() 22 | 23 | index_b = int(input("Enter the sequence index [FROM]: ")) 24 | index_e = int(input("Enter the sequence index [TO]: ")) 25 | print("Sequence index: "+str(index_b)+"..."+str(index_e)) 26 | 27 | dot_in = head.index(".") 28 | out_head = 
head[:dot_in+2]+":"+str(index_b)+"-"+str(index_e)+head[dot_in+2:] 29 | out_seq = seq[index_b-1:index_e] 30 | for i in range(0, len(out_seq), 70): 31 | fasta_seq += out_seq[i:i+70]+"\n" 32 | print(out_head+fasta_seq) 33 | 34 | file_out = open(head[1:dot_in+2]+"_out.fasta","w") 35 | file_out.write(out_head+fasta_seq) 36 | 37 | print("Saving sequence ... Done\nFile: "+head[1:dot_in+2]+"_out.fasta\n") 38 | -------------------------------------------------------------------------------- /file_handling/fasta2db_feed.py: -------------------------------------------------------------------------------- 1 | # Importing modules 2 | import sys 3 | from Bio import SeqIO 4 | import mysql.connector as mc 5 | import hashlib 6 | 7 | ## Connecting to database 8 | mysql=mc.connect( host="localhost", user="user1", passwd="", database="16srRNAdb") 9 | 10 | ## Feeding sequence data into database 11 | fasta_rec = open(sys.argv[1]) 12 | data = SeqIO.parse(fasta_rec, "fasta") 13 | for record in data: 14 | head, seq = record.id, str(record.seq) 15 | seq_hash = hashlib.md5(seq.encode()) 16 | seq_md5 = seq_hash.hexdigest() 17 | mycursor=mysql.cursor() 18 | sql="insert into myseq(seqID, seq) values(%s, %s)" 19 | val=[(head, seq_md5)] 20 | mycursor.executemany(sql, val) 21 | mysql.commit() 22 | 23 | ## Usage: $ python fasta2db_feed.py 24 | 25 | -------------------------------------------------------------------------------- /file_handling/fasta2fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | # Importing Modules 5 | import sys 6 | from Bio import SeqIO 7 | from random import randrange 8 | 9 | # Function for FASTA to FASTQ conversion 10 | def fastq_convertor(seq, header, frame): 11 | for i in range(frame,len(seq),int(sys.argv[4])): 12 | temphead = "@"+header+"_"+str(i)+"-"+str(i+int(sys.argv[4])) 13 | tempseq = seq[i:i+int(sys.argv[4])] 14 | qscore="" 15 | for j in range(1, 101): 16 | 
qscore+=score[randrange(36,42)] 17 | for k in range(101, 141): 18 | qscore+=score[randrange(31,36)] 19 | for l in range(141, 148): 20 | qscore+=score[randrange(20,31)] 21 | for m in range(148, len(tempseq)+1): 22 | qscore+=score[randrange(0,20)] 23 | fastq_out.write(temphead+"\n"+tempseq+"\n+\n"+qscore+"\n") 24 | 25 | score="""!"#$&'()*+,-./0123456789:;<>=?@ABCDEFGHIJK""" 26 | 27 | # Running everything ... 28 | fasta_in = open(sys.argv[2]) 29 | fastq_out = open(sys.argv[8], "w+") 30 | fasta_rec = SeqIO.parse(fasta_in, "fasta") 31 | for rec in fasta_rec: 32 | seq, header = str(rec.seq), rec.id 33 | for x in range(1,int(sys.argv[6])+1): 34 | fastq_convertor(seq, header, x) 35 | 36 | fasta_in.close() 37 | fastq_out.close() 38 | 39 | #Usage: $ python fasta2fastq.py -f -l -x -o 40 | -------------------------------------------------------------------------------- /file_handling/fasta_concatenator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | # Extracting files name with the extension .fasta 5 | import os 6 | def get_name(): 7 | f_fasta = [] 8 | root = os.getcwd() 9 | for files in os.listdir(root): 10 | if files.endswith(".fasta"): 11 | f_fasta.append(files) 12 | return(f_fasta) 13 | 14 | # Concatenating all the fasta files to create multi_fasta file 15 | def concatenate(): 16 | f_out = open("multi_fasta", "w+") 17 | f_file = get_name() 18 | for data in f_file: 19 | fasta_rec = open(data, "r") 20 | for lines in fasta_rec: 21 | f_out.write(lines) 22 | concatenate() 23 | 24 | # python fasta_concatenator.py 25 | -------------------------------------------------------------------------------- /file_handling/fasta_record_finder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Extract Fasta_record from Multi_Fasta file 4 | # whose Accession_no is inputted by the user and 5 | # write the record in a new file (NC_XXXXXX.fasta) 6 | 7 
| import os 8 | import sys 9 | file1 = sys.argv[1] # MULTI_FASTA File 10 | mlti_fasta = open(file1) 11 | lines = mlti_fasta.readlines() 12 | acc_id = input("Enter the Accession_No: ") 13 | f_name = acc_id+".fasta" 14 | file2 = open(f_name,"w+") 15 | head_index = None 16 | for i in range(0, len(lines)): 17 | data = lines[i] 18 | if acc_id in data: 19 | file2.write(data) 20 | head_index = i 21 | break 22 | if head_index != None: 23 | for i in range(head_index+1, len(lines)): 24 | data = lines[i] 25 | if data[0] != ">": 26 | file2.write(data) 27 | else: 28 | break 29 | file2.close() 30 | else: 31 | print("Error: FASTA record not found") 32 | os.remove(f_name) 33 | 34 | # python fasta_record_finder.py 35 | -------------------------------------------------------------------------------- /file_handling/fastq2fasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | # Importing Modules 5 | import sys 6 | from Bio import SeqIO 7 | 8 | # Function for FASTQ to FASTA conversion 9 | fastq_in = open(sys.argv[1]) 10 | fasta_out = open(sys.argv[1]+".fasta", "w+") 11 | fastq_rec = SeqIO.parse(fastq_in, "fastq") 12 | for rec in fastq_rec: 13 | fasta_out.write(">"+rec.id+"\n"+str(rec.seq)+"\n") 14 | 15 | fastq_in.close() 16 | fasta_out.close() 17 | 18 | # Usage: python fastq2fasta.py 19 | 20 | -------------------------------------------------------------------------------- /file_handling/file_comparison.py: -------------------------------------------------------------------------------- 1 | ''' \ 2 | Usage: 3 | python file_comparison.py -f1 -f2 ''' 4 | 5 | import sys 6 | ... 
7 | 8 | inputs = sys.argv 9 | if '-f1' not in inputs or '-f2' not in inputs: 10 | print (__doc__) 11 | else: 12 | file1 = inputs[inputs.index('-f1') + 1] 13 | file2 = inputs[inputs.index('-f2') + 1] 14 | db = open(file1, "r") 15 | qr = open(file2, "r") 16 | db_list = [] 17 | qr_list = [] 18 | for line in db: 19 | db_list.append(line) 20 | for line in qr: 21 | qr_list.append(line) 22 | print("Data not in f2:") 23 | for data1 in db_list: 24 | if data1 not in qr_list: 25 | f1_data = data1 26 | print(f1_data) 27 | print("Data not in f1:") 28 | for data2 in qr_list: 29 | if data2 not in db_list: 30 | f2_data = data2 31 | print(f2_data) 32 | ... 33 | 34 | # python file_comparison.py -f1 -f2 35 | -------------------------------------------------------------------------------- /file_handling/ftp_download.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | file1 = sys.argv[1] 5 | ftp_file = open(file1, "r") 6 | ftps = ftp_file.readlines() 7 | os.system("mkdir ftp_downloads") 8 | log = open("logfile.txt", "w") 9 | for ftp in ftps: 10 | os.system("wget -P ftp_downloads/ " + ftp) 11 | log = open("logfile.txt", "a") 12 | log.write(ftp) 13 | log.close() 14 | ftp_file.close() 15 | 16 | # python ftp_download.py 17 | -------------------------------------------------------------------------------- /file_handling/gdc_download.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | file1 = sys.argv[1] 5 | gdc_manifest_file = open(file1, "r") 6 | gdc_data = gdc_manifest_file.readlines() 7 | os.system("mkdir gdc_downloads") 8 | log = open("logfile.txt", "w") 9 | for data in gdc_data[1:]: 10 | data_ls = data.split("\t") 11 | gdc_id = data_ls[0] 12 | os.system("wget https://api.gdc.cancer.gov/data/"+gdc_id+" -O gdc_downloads/"+data_ls[1]) 13 | log = open("logfile.txt", "a") 14 | log.write(gdc_id+"\n") 15 | log.close() 16 | gdc_manifest_file() 17 | 18 | # 
python gdc_download.py 19 | -------------------------------------------------------------------------------- /file_handling/multi_fasta_deconcatenator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | # Input and Read the Multi_FASTA record 5 | import sys 6 | file1 = sys.argv[1] 7 | mlti_fasta = open(file1) 8 | fasta_rec = mlti_fasta.readlines() 9 | acc_list = [] 10 | for line in fasta_rec: 11 | if line[0] == ">": 12 | acc_list.append(line[1:10]) 13 | 14 | # Create Files with Accession_ids 15 | def file_creater(acc_id): 16 | file_name = acc_id+".fasta" 17 | file_c = open(file_name,"w+") 18 | return(file_c) 19 | 20 | # Writing FASTA records in Files 21 | def file_writer(file_w, data): 22 | file_w.write(data) 23 | return(file_w) 24 | 25 | # Parsing Multi_FASTA records 26 | def parsing_data(acc_id, data): 27 | head_index = None 28 | seq = "" 29 | header = "" 30 | for i in range(0, len(data)): 31 | data_f = data[i] 32 | if acc_id in data_f: 33 | header += data_f 34 | head_index = i 35 | break 36 | if head_index != None: 37 | for i in range(head_index+1, len(data)): 38 | data_s = data[i] 39 | if data_s[0] != ">": 40 | seq += data_s 41 | else: 42 | break 43 | return(header+seq) 44 | 45 | # Executing all the Functions 46 | def out(): 47 | for i in range(0, len(acc_list)): 48 | acc_id = acc_list[i] 49 | f1 = file_creater(acc_id) 50 | f2 = parsing_data(acc_id, fasta_rec) 51 | file_writer(f1, f2) 52 | out() 53 | 54 | # python multi_fasta_deconcatenator.py 55 | -------------------------------------------------------------------------------- /file_handling/mysqldb_find.py: -------------------------------------------------------------------------------- 1 | # Importing modules 2 | import mysql.connector as mc 3 | 4 | # Connecting to local MySQL database 5 | mysql=mc.connect( host="localhost", user="user1", passwd="", database="16srRNAdb") 6 | 7 | mycursor = mysql.cursor() 8 | # Select everything from 
myseq table where 7dd1e0c5450f0ff6c59187d02ae5783b (hash) found 9 | mycursor.execute("SELECT * FROM myseq WHERE seq LIKE '7dd1e0c5450f0ff6c59187d02ae5783b'") 10 | myresult = mycursor.fetchall() 11 | 12 | # Printing column-1 of the result 13 | for x in myresult: 14 | print(x[0]) 15 | 16 | # Usage: python mysqldb_find.py 17 | -------------------------------------------------------------------------------- /file_handling/seq_concatenator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | file_in = sys.argv[1] 4 | os.system("cp "+file_in+" tempoxc1265.fasta") 5 | 6 | name_list = ["ECO","SAL"] 7 | for name in name_list: 8 | cmd = "sed -i "+"'/"+name+"/c\>"+name+"' tempoxc1265.fasta" 9 | os.system(cmd) 10 | fasta_f = open("tempoxc1265.fasta", "r") 11 | file_out = open ("concatenated_seq.fasta","w+") 12 | fast = fasta_f.readlines() 13 | l = [] 14 | new = [] 15 | for line in fast: 16 | if line not in l and line[0] == ">": 17 | new.append(line) 18 | l.append(line) 19 | elif line[0] != ">": 20 | new.append(line) 21 | else: 22 | pass 23 | for line in new: 24 | file_out.write(line) 25 | os.system("rm tempoxc1265.fasta") 26 | file_out.close() 27 | 28 | # python seq_concatenator.py 29 | -------------------------------------------------------------------------------- /machine_learning/brca_classifier-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | # Importing Modules 5 | from sklearn.datasets import load_breast_cancer 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.decomposition import PCA 9 | from sklearn.neural_network import MLPClassifier 10 | from sklearn.decomposition import PCA 11 | from sklearn.metrics import confusion_matrix 12 | from sklearn.metrics import ConfusionMatrixDisplay 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | 
import pandas as pd 16 | 17 | # Loading Dataset 18 | breast_cancer = load_breast_cancer() 19 | 20 | # Converting to data.frame 21 | df = pd.DataFrame(breast_cancer.data, columns = breast_cancer.feature_names) 22 | df['diagnosis'] = breast_cancer.target 23 | 24 | # Get the features and label from the original dataframe 25 | X = df.iloc[:,:-1] 26 | y = df.iloc[:,-1] 27 | 28 | # Performing standardization 29 | sc = StandardScaler() 30 | X_scaled = sc.fit_transform(X) 31 | 32 | # Converting features to PCs 33 | pca = PCA(n_components=3, whiten=True) 34 | X_pca = pca.fit_transform(X_scaled) 35 | df1 = pd.DataFrame(data = X_pca, columns = ["PC-1", "PC-2", "PC-3"]) 36 | 37 | # Subsets PCA data.frame into testing and training dataset 38 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=True) 39 | 40 | # Classifier 41 | clf = MLPClassifier(random_state=1, max_iter=1000) 42 | 43 | # Training 44 | clf.fit(X_train, y_train) 45 | 46 | # Testing 47 | pred_val = clf.predict(X_test) 48 | 49 | # Output 50 | output = pd.DataFrame() 51 | output['Expected Output'] = y_test 52 | output['Predicted Output'] = pred_val 53 | 54 | # Confusion Matrix 55 | cmat = confusion_matrix(y_test, pred_val, labels=clf.classes_, normalize="true") 56 | disp_cmat = ConfusionMatrixDisplay(confusion_matrix=cmat, display_labels=clf.classes_) 57 | 58 | # Plotting Confusion Matrix 59 | disp_cmat.plot(xticks_rotation='vertical') 60 | plt.savefig('cmat.png',bbox_inches ="tight", pad_inches = 0.5) 61 | 62 | # Usage: python brca_classifier-1.py 63 | -------------------------------------------------------------------------------- /machine_learning/dna_classifier-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | # Importing Modules 5 | import sys 6 | import numpy as np 7 | from sklearn.neural_network import MLPClassifier 8 | from sklearn.metrics import confusion_matrix 9 | from sklearn.metrics import 
ConfusionMatrixDisplay 10 | import matplotlib.pyplot as plt 11 | 12 | # Input Data 13 | training_data = (open(sys.argv[1], "r")).readlines() 14 | test_data = (open(sys.argv[2], "r")).readlines() 15 | 16 | # Sequence Encoder (one hot encoding) 17 | def sequence_encoder(seq): 18 | seq_mat = [] 19 | temp = "" 20 | for i in range(0, len(seq)): 21 | if seq[i].upper() == "A": 22 | temp+="1000" 23 | elif seq[i].upper() == "T": 24 | temp+="0100" 25 | elif seq[i].upper() == "G": 26 | temp+="0010" 27 | elif seq[i].upper() == "C": 28 | temp+="0001" 29 | else: 30 | temp+="0000" 31 | temp = list(temp) 32 | temp1 = [int(j) for j in temp] 33 | return(temp1) 34 | 35 | # Feature Matrix Generator 36 | def matrix_generator(data): 37 | 38 | feature_matrix = [] 39 | seq = "" 40 | label = [] 41 | for line in data: 42 | if line[0] == ">" and seq == "": 43 | label.append(line.strip().replace(">", "")) 44 | elif line[0]!= ">": 45 | seq += line.strip() 46 | elif line[0] == ">" and seq != "": 47 | feature_matrix.append(sequence_encoder(seq)) 48 | seq="" 49 | label.append(line.strip().replace(">", "")) 50 | feature_matrix.append(sequence_encoder(seq)) 51 | return(feature_matrix, label) 52 | 53 | 54 | # Generate feature matrix for training dataset 55 | dataset1 = matrix_generator(training_data) 56 | #print(dataset1) 57 | # Feature matrix for training 58 | X_train = np.array(dataset1[0]) 59 | 60 | # Labels for training 61 | y_train = np.array(dataset1[1]) 62 | 63 | # Classifier 64 | model = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(4), max_iter=5000) 65 | 66 | # Training 67 | model = model.fit(X_train, y_train) 68 | 69 | # Generate feature matrix for testing dataset 70 | dataset2 = matrix_generator(test_data) 71 | 72 | # Feature matrix for testing 73 | X_test = np.array(dataset2[0]) 74 | 75 | # Labels for testing 76 | y_test = np.array(dataset2[1]) 77 | 78 | # Prediction 79 | y_predict = model.predict(X_test) 80 | 81 | # Plotting Confusion Matrix 82 | cmat = confusion_matrix(y_test, 
y_predict, labels=model.classes_) 83 | disp_cmat = ConfusionMatrixDisplay(confusion_matrix=cmat, display_labels=model.classes_) 84 | disp_cmat.plot(xticks_rotation='vertical') 85 | plt.savefig('cmat.png',bbox_inches ="tight", pad_inches = 0.5) 86 | 87 | # Usage: $ python dna_classifier-1.py 88 | -------------------------------------------------------------------------------- /sequence_analysis/PSSM.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | import sys 5 | import numpy as np 6 | 7 | fasta_o = open(sys.argv[1], "r") 8 | fasta_r = fasta_o.readlines() 9 | 10 | seq_list = [] 11 | for line in fasta_r: 12 | if line[0] != ">" and line != "\n": 13 | seq_list.append(line.strip()) 14 | 15 | #Function 16 | def frequency(base,total): 17 | try: 18 | f_base = base/total 19 | except ZeroDivisionError: 20 | f_base = 0 21 | return(f_base) 22 | 23 | # Raw Frequency Matrix 24 | n = len(seq_list[0]) 25 | m = 4 26 | mat = np.full((m, n), 0.0) 27 | for i in range(0,len(seq_list[0])): 28 | A = T = G = C = 0 29 | for data in seq_list: 30 | if data[i] == "A": 31 | A += 1 32 | elif data[i] == "G": 33 | G += 1 34 | elif data[i] == "T": 35 | T += 1 36 | elif data[i] == "C": 37 | C += 1 38 | tot = A+G+C+T 39 | 40 | fA = frequency(A,tot) 41 | fG = frequency(G,tot) 42 | fC = frequency(C,tot) 43 | fT = frequency(T,tot) 44 | 45 | mat[0,i] = fA 46 | mat[1,i] = fT 47 | mat[2,i] = fG 48 | mat[3,i] = fC 49 | 50 | A = T = G = C = 0 51 | 52 | #Normalized Matrix 53 | over_mat = mat.sum(axis = 1)/n 54 | norm_mat = mat/over_mat[:,None] 55 | 56 | #Convert Normalized Matrix to Log Odd Scores 57 | scores = np.log2(norm_mat) 58 | print(np.around(scores,decimals=2)) 59 | 60 | -------------------------------------------------------------------------------- /sequence_analysis/aa_comp.py: -------------------------------------------------------------------------------- 1 | # Importing modules 2 | import sys 3 | import pandas as pd 4 
| from Bio import SeqIO 5 | import matplotlib.pyplot as plt 6 | 7 | # Amino acid composition dict 8 | aa_dict = { "nonpolar_aliphatic":["G", "A", "V", "L", "I", "P", "M"], 9 | "nonpolar_aromatic":["F", "W"], 10 | "polar_uncharged":["S", "T", "N", "Q", "C", "Y"], 11 | "positive":["H", "K", "R"], "negative":["D", "E"], "NA":["-", "*"]} 12 | 13 | # Reading Protein FASTA file 14 | data = SeqIO.parse(open(sys.argv[1]), "fasta") 15 | seq = [] 16 | for record in data: 17 | seq.append(list(str(record.seq))) 18 | 19 | # Creating dataframe containing aa comp infor for each seq 20 | df = pd.DataFrame(columns = ["pos", "nonpolar_aliphatic", "nonpolar_aromatic","polar_uncharged", "positive", "negative", "NA"]) 21 | for i in range(len(seq[0])): 22 | temp = {"nonpolar_aliphatic":0, "nonpolar_aromatic":0, 23 | "polar_uncharged":0, "positive":0, "negative":0, "NA":0} 24 | for s in seq: 25 | temp["".join([j for j in aa_dict if s[i] in aa_dict[j]])] = temp["".join([j for j in aa_dict if s[i] in aa_dict[j]])]+1 26 | temp = { i:(temp[i]/sum(temp.values()))*100 for i in temp} 27 | temp["pos"] = i+1 28 | df.loc[len(df)]=temp 29 | 30 | # Plotting stacked barplot 31 | df.plot(x='pos', kind='bar', stacked=True, figsize=(15, 8)) 32 | plt.xlabel("Amino Acid Position") 33 | plt.ylabel("Percentage(%)") 34 | plt.legend(loc = "lower right") 35 | plt.savefig('aa_composition_plot.png',bbox_inches ="tight", pad_inches = 0.1) 36 | plt.close() 37 | 38 | # Usage: python aa_comp.py aligned_seq.fasta 39 | -------------------------------------------------------------------------------- /sequence_analysis/alignment2consensus.py: -------------------------------------------------------------------------------- 1 | ### Import Modules 2 | import sys 3 | from Bio import AlignIO 4 | from collections import Counter 5 | 6 | ### File Handling 7 | out_file = open(sys.argv[1]+"_consen.fasta", "w") 8 | conseq = ">consensus_"+sys.argv[1]+"\n" 9 | 10 | ### Read Alignment File 11 | alignment = AlignIO.read(sys.argv[1], 
"fasta") 12 | 13 | ### Generate Consensus Sequence 14 | for i in range (0, alignment.get_alignment_length()): 15 | temp_ls = [] 16 | col_data = alignment[:,i] 17 | for data in Counter(col_data): 18 | if ((Counter(col_data)[data]/len(col_data))*100) >= 1 and data in ["A","T","G","C"]: 19 | temp_ls.append(data) 20 | if len(temp_ls) == 1 and ((Counter(col_data)[temp_ls[0]]/len(col_data))*100) >= 99: 21 | conseq += temp_ls[0] 22 | else: 23 | conseq += "-" 24 | 25 | ### Writing Output 26 | out_file.write(conseq+"\n") 27 | -------------------------------------------------------------------------------- /sequence_analysis/base_composition.py: -------------------------------------------------------------------------------- 1 | # Base Compositions of DNAs from Multi/Fasta Record 2 | 3 | import sys 4 | 5 | f_file = sys.argv[1] 6 | file_op = open(f_file, "r") 7 | lines = file_op.readlines() 8 | file_wr = open("base_composition.tsv", "w+") 9 | file_wr.write("DNA origin A T G C Length AT% GC%\n") 10 | seq = "" 11 | head = "" 12 | for line in lines: 13 | if line[0] == ">" and seq == "": 14 | head = line 15 | elif line[0]!= ">": 16 | seq += line.strip() 17 | elif line[0] == ">" and seq != "": 18 | 19 | file_wr.write(str(head[1:12])+"\t"+str(seq.count("A"))+"\t"+str(seq.count("T"))+\ 20 | "\t"+str(seq.count("G"))+"\t"+str(seq.count("C"))+"\t"+str(len(seq))+"\t"+"%.2f" %(((seq.count("A")+seq.count("T"))/len(seq))*100)+"\t"\ 21 | +"%.2f" %(((seq.count("G")+seq.count("C"))/len(seq))*100)+"\t"+"\n") 22 | seq = "" 23 | head = line 24 | 25 | file_wr.write(str(head[1:12])+"\t"+str(seq.count("A"))+"\t"+str(seq.count("T"))+\ 26 | "\t"+str(seq.count("G"))+"\t"+str(seq.count("C"))+"\t"+str(len(seq))+"\t"+"%.2f" %(((seq.count("A")+seq.count("T"))/len(seq))*100)+"\t"\ 27 | +"%.2f" %(((seq.count("G")+seq.count("C"))/len(seq))*100)+"\t"+"\n") 28 | 29 | # python base_composition.py 30 | -------------------------------------------------------------------------------- 
/sequence_analysis/consensus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | # import sys module 5 | import sys 6 | # import NumPy library 7 | import numpy as np 8 | 9 | # file handling 10 | f_file = sys.argv[1] 11 | f_open = open(f_file, "r") 12 | f_read = f_open.readlines() 13 | 14 | # declaring variables 15 | seq_list = [] 16 | seq = "" 17 | header = "" 18 | 19 | # assigning values to variables 20 | for line in f_read: 21 | if ">" in line[0:1]: 22 | header = line[1:] 23 | elif ">" not in line[0:1]: 24 | seq += line.strip() 25 | seq_list.append(seq) 26 | seq = "" 27 | seq_list = ' '.join(seq_list).split() 28 | 29 | # calculating consensus 30 | n = len(seq_list[0]) 31 | con_array = np.full((4, n), 0) 32 | for seq in seq_list: 33 | for i, char in enumerate(seq): 34 | if char == "A": 35 | con_array[0][i] +=1 36 | elif char == "G": 37 | con_array[1][i] +=1 38 | elif char == "T": 39 | con_array[2][i] +=1 40 | elif char == "C": 41 | con_array[3][i] +=1 42 | 43 | # printing output 44 | print("A:",con_array[0],"\nG:",con_array[1],"\nT:",con_array[2],"\nC:",con_array[3]) 45 | 46 | # python consensus.py 47 | -------------------------------------------------------------------------------- /sequence_analysis/gc_percent.py: -------------------------------------------------------------------------------- 1 | import sys 2 | file1 = sys.argv[1] 3 | Dna = open(file1) 4 | dna = Dna.readlines() 5 | seq = "" 6 | for line in dna: 7 | if line[0] != ">": 8 | seq = seq + line 9 | no_a = seq.count("A") 10 | no_t = seq.count("T") 11 | no_g = seq.count("G") 12 | no_c = seq.count("C") 13 | dna_len = no_a + no_t + no_g + no_c 14 | gc_percent = ((no_g + no_c)/dna_len)*100.0 15 | print("GC content: " "%.2f" % gc_percent+"%") 16 | 17 | # python gc_percent.py 18 | -------------------------------------------------------------------------------- /sequence_analysis/hydrophobicity_plot.py: 
# -------------------------------------------------------------------------------
#!/usr/bin/python
#!python


# Python script for calculating and plotting
# hydrophobicity of a given peptide/protein
# sequence using Kyte-Doolittle scale


# Importing modules
import sys

# Kyte-Doolittle hydropathy scale: one propensity value per standard residue.
kydo = { 'A': 1.8,'R':-4.5,'N':-3.5,'D':-3.5,'C': 2.5,
         'Q':-3.5,'E':-3.5,'G':-0.4,'H':-3.2,'I': 4.5,
         'L': 3.8,'K':-3.9,'M': 1.9,'F': 2.8,'P':-1.6,
         'S':-0.8,'T':-0.7,'W':-0.9,'Y':-1.3,'V': 4.2 }

x_plot = []
y_plot = []

# Function for converting sequence to Kyte-Doolittle propensity
def seq_to_kydo(seq):
    """Map a protein sequence to its per-residue Kyte-Doolittle values.

    Raises KeyError on non-standard residue codes (e.g. X, B, U).
    """
    return [kydo[aa] for aa in seq]

# Function for smoothing the data
def smoothing(values_list):
    """Sliding-window mean of values_list (window=9, ends zero-padded).

    Returns (positions, smoothed) where positions are 1-based indices.
    NOTE: the zero padding biases the first/last half-window of points
    toward zero -- kept for backward compatibility with the original.
    """
    window = 9  # Adjust window size here (odd, so the window is centred)
    half_window = (window - 1) // 2
    new_values = [0]*half_window + values_list + [0]*half_window
    y = []  # Smoothened Kyte-Doolittle values
    for i in range(half_window, len(new_values) - half_window):
        y.append(sum(new_values[i-half_window:i+1+half_window])/window)
    x = list(range(1, len(values_list) + 1))  # Amino acid positions
    return (x, y)

def main():
    """Read record(s) from the FASTA in sys.argv[1] and save the plot.

    As before, if the file holds several records only the last one is
    plotted (each iteration overwrites x_plot/y_plot).
    """
    # Heavy third-party imports are local so the module itself can be
    # imported (e.g. for testing) without Biopython/matplotlib installed.
    from Bio import SeqIO
    import matplotlib.pyplot as plt

    global x_plot, y_plot
    # Reading fasta record (with-block closes the handle; original leaked it)
    with open(sys.argv[1]) as fasta_rec:
        for record in SeqIO.parse(fasta_rec, "fasta"):
            sequence = str(record.seq)
            # Fixed: smoothing(seq_to_kydo(...)) was previously evaluated
            # twice per record just to take [0] and [1] separately.
            x_plot, y_plot = smoothing(seq_to_kydo(sequence))

    # Plotting the data
    plt.plot(x_plot, y_plot)
    plt.title("Kyte-Doolittle Hydrophobicity Plot")
    plt.xlabel("Amino Acid Position")
    plt.ylabel("Hydrophobicity Score")
    plt.savefig('hydrophobicity_plot.png',bbox_inches ="tight", pad_inches = 0.5, dpi = 500)
    plt.close()

if __name__ == "__main__":
    main()

# Usage:
# python hydrophobicity_plot.py prot.fasta   (tail of the Usage comment above)
# -------------------------------------------------------------------------------
# /sequence_analysis/k-mer_constructor.py:
# -------------------------------------------------------------------------------
import sys


def build_kmers(seq, k):
    """Return every overlapping k-mer of seq, including the final one.

    Fixes an off-by-one bug: the original iterated range(len(seq) - k),
    which silently dropped the last k-mer; the correct upper bound for
    k-mer start positions is len(seq) - k + 1.
    """
    return [seq[i:i + k] for i in range(len(seq) - k + 1)]


def main():
    """Print every k-mer of the FASTA sequence in sys.argv[1] (k from stdin)."""
    DNA_file = sys.argv[1]
    k_mer_size = input("Enter the K-mer length: ")
    seq = ""
    with open(DNA_file, "r") as DNA:
        for line in DNA:
            if line.startswith(">"):
                pass  # skip FASTA headers
            else:
                seq += line.replace('\n', "")
    for kmer in build_kmers(seq, int(k_mer_size)):
        print(kmer)


if __name__ == "__main__":
    main()

# python k-mer_constructor.py
# -------------------------------------------------------------------------------
# /sequence_analysis/kmer_constructor-1.py:
# -------------------------------------------------------------------------------
#!/usr/bin/python
#!python

# Importing Modules
import sys
from Bio import SeqIO


# Function for Overlapping Kmers
# (sys.argv[2] is either a single k like "3" or an inclusive range "3..7";
#  note this variant also prints the short tail slices, unchanged here)
def overlapping_kmer_constructor(seq):
    if len(sys.argv[2].split("..")) == 1:
        k = int(sys.argv[2])
        for i in range(len(seq)):
            print(seq[i:i+k])

    elif len(sys.argv[2].split("..")) > 1:
        k_list = sys.argv[2].split("..")
        k_list = [i for i in range(int(k_list[0]), int(k_list[1])+1)]
        for k in k_list:
            k = int(k)
            for i in range(len(seq)):
                print(seq[i:i+k])

# Function for Non-Overlapping Kmers
def nonoverlapping_kmer_constructor(seq):
    if len(sys.argv[2].split("..")) == 1:
        k = int(sys.argv[2])
        for i in range(0, len(seq), k):
            print(seq[i:i+k])

    elif len(sys.argv[2].split("..")) > 1:
        k_list = sys.argv[2].split("..")
        k_list = [i for i in range(int(k_list[0]), int(k_list[1])+1)]
        for k in k_list:
            k = int(k)
            for i in range(0, len(seq), k):
                print(seq[i:i+k])


# Reading Fasta Records (continues in the next chunk)
fasta_in = open(sys.argv[1])
fasta_recs = SeqIO.parse(fasta_in, "fasta")
# (kmer_constructor-1.py continued -- dispatch over the parsed FASTA records)
for rec in fasta_recs:
    head, seq = rec.id, str(rec.seq)

    # Generating Overlapping Kmers
    if sys.argv[3] == "-O":
        overlapping_kmer_constructor(seq)

    # Generating Non-Overlapping Kmers
    elif sys.argv[3] == "-N":
        nonoverlapping_kmer_constructor(seq)

# Usage: $ python kmer_constructor-1.py a.fasta 3/3..7 -N/-O
# -------------------------------------------------------------------------------
# /sequence_analysis/orf_analyzer.py:
# -------------------------------------------------------------------------------
# Importing modules
from Bio.SeqUtils import GC
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from Bio.Seq import Seq
from Bio import SeqIO
import sys

# Function to plot nucleotide composition
def nucl_comp(seq):
    # percentage of each base, rounded to 2 decimal places
    nucl_dict = {"A": 0, "T": 0, "G": 0, "C": 0}
    for nt in nucl_dict:
        nucl_dict[nt] = round((seq.count(nt)/len(seq))*100, 2)
    plt.bar(list(nucl_dict.keys()), list(nucl_dict.values()),
        color =['blue', 'red', 'green', 'orange'])

    plt.xlabel("Nucleotides")
    plt.ylabel("Percentage (%)")
    plt.title("Nucleotides Composition Plot")
    plt.savefig('nucleotide_composition.png',bbox_inches ="tight", pad_inches = 0.5)
    plt.close()

# Function to return GC composition
def gc_comp(seq):
    gc_perc = GC(seq)
    gc_dict = {"GC":gc_perc,"AT":100-gc_perc}
    plt.bar(list(gc_dict.keys()), list(gc_dict.values()), color =['blue', 'red'])
    plt.xticks(fontsize=8, rotation=90)
    plt.xlabel("Compositions")
    plt.ylabel("Percentage (%)")
    plt.title("GC & AT Composition Plot")
    plt.savefig('gc_at_composition.png',bbox_inches ="tight", pad_inches = 0.5, dpi = 500)
    plt.close()

# Function to plot codon composition
def codon_comp(seq):
    # NOTE(review): range(0, len(seq)+1, 3) may emit a trailing partial codon;
    # it is filtered out below by the len(key)==3 comprehension.
    codons = [seq[x:x+3] for x in range(0, len(seq)+1, 3)]
    codon_dict = dict.fromkeys(list(set(codons)), 0)
    for codon in codon_dict:
        codon_dict[codon] = (codons.count(codon)/len(codons))*100
    codon_dict={key:value for key,value in codon_dict.items() if len(key)==3}
    plt.figure(figsize=(10,4))
    plt.bar(list(codon_dict.keys()), list(codon_dict.values()), color=get_cmap("Accent").colors)
    plt.xticks(fontsize=8, rotation=90)
    plt.xlabel("Codons")
    plt.ylabel("Percentage (%)")
    plt.title("Codons Composition Plot")
    plt.savefig('codon_composition.png',bbox_inches ="tight", pad_inches = 0.5, dpi = 500)
    plt.close()

# Function to plot amino acid composition
def aa_comp(seq):
    a_acid_seq = Seq(seq).translate()
    a_acid_seq = str(a_acid_seq).replace("*", "")
    a_acids = [ aa for aa in a_acid_seq]
    a_acids_dict = dict.fromkeys(list(set(a_acids)), 0)
    for aa in a_acids_dict:
        a_acids_dict[aa] = (a_acids.count(aa)/len(a_acids))*100
    plt.figure(figsize=(10,4))
    plt.bar(list(a_acids_dict.keys()), list(a_acids_dict.values()), color=get_cmap("tab20").colors)
    plt.xticks(fontsize=8, rotation=90)
    plt.xlabel("Amino Acids")
    plt.ylabel("Percentage (%)")
    plt.title("Amino Acid Composition Plot")
    plt.savefig('amino_acid_composition.png',bbox_inches ="tight", pad_inches = 0.5, dpi = 500)
    plt.close()

# Reading FASTA file and running all
# (only the last record survives the loop; plots describe that record)
fasta_rec = open(sys.argv[1])
seq = ""
data = SeqIO.parse(fasta_rec, "fasta")
for record in data:
    seq = record.id, str(record.seq)
seq = str(seq[1])
nucl_comp(seq)
gc_comp(seq)
codon_comp(seq)
aa_comp(seq)

# Usage: $ python orf_analyzer.py
# -------------------------------------------------------------------------------
# /sequence_analysis/prot_mol_weight_calculator.py:
# -------------------------------------------------------------------------------
#!/usr/bin/python

# Program to calculate protein molecular weight
# from the amino acid sequence in Fasta file

import sys
# (prot_mol_weight_calculator.py continued -- "import sys" appears just above)

# Average residue masses in daltons; one water (18.0153 Da) is added per chain.
aa_mw = {"A":71.0779, "C":103.1429,"D":115.0874,"E":129.114,"F":147.1739,"G":57.0513,"H":137.1393,"I":113.1576,"K":128.1723,
    "L":113.1576,"M":131.1961,"N":114.1026,"P":97.1152,"Q":128.1292,"R":156.1857,"S":87.0773,"T":101.1039,"V":99.1311,"W":186.2099,"Y":163.1733}


def calc(prot_seq, header):
    """Print the report for one record and return its weight in daltons.

    prot_seq -- amino acid sequence in one-letter codes
    header   -- FASTA header line including the leading '>'

    Raises KeyError naming the offending residue for codes not in aa_mw;
    the original used aa_mw.get(), whose None return produced an opaque
    TypeError on the += instead.
    """
    aa_molwt = 18.0153 + sum(aa_mw[aa] for aa in prot_seq)
    print("Sequence Info: "+header[1:])
    print("Number of amino acid residues: "+str(len(prot_seq)))
    print("Protein molecular weight: "+str(round(aa_molwt/1000, 2))+" kDa ["+str(round(aa_molwt, 2))+" Da]\n")
    return aa_molwt


def _main():
    """Report the molecular weight of every record in the FASTA sys.argv[1]."""
    # File handling moved out of module scope and into a `with` block, so
    # importing this module performs no I/O and the handle is not leaked.
    with open(sys.argv[1], "r") as file_f:
        file_rl = file_f.readlines()
    seq = ""
    head = ""
    print("\nProtein Molecular Weight Calculator Result\n")
    for line in file_rl:
        if line[0:1] == ">" and seq == "":
            head = line.strip()
        elif line[0:1] != ">":
            seq += line.strip()
        elif line[0:1] == ">" and seq != "":
            calc(seq, head)
            seq = ""
            head = line.strip()
    calc(seq, head)  # final record has no trailing header to trigger it


if __name__ == "__main__":
    _main()

# Usage: python prot_mol_weight_calculator.py prot.fasta
# -------------------------------------------------------------------------------
# /sequence_analysis/prototype_aligner1.py:
# -------------------------------------------------------------------------------
# importing modules
import sys
import numpy as np

# Function for Alignment
def Aligner():
    n=len(seq1)
    m=len(seq2)
    matrix = np.full((n, m), 0)
    for i in range(0,n):
        for j in range(0, m):
            if seq1[i] == seq2[j]:
                matrix[i,j] = 1
            else:
                matrix[i,j] = 0

    al_seq = ""
    for x in range(0, m):
        if matrix[x,x] == 1:
            al_seq += seq1[x]
        elif matrix[x,x] == 0:
            al_seq += "-"

    fast_al_seq = ""
    for y in range(0, len(al_seq), 70):
        fast_al_seq += al_seq[y:y+70]+"\n"
    print(">Aligned Sequence")
    print(fast_al_seq)

# Accessory
code 31 | seq1="" 32 | seq2="" 33 | mf_file = sys.argv[1] 34 | mf_open = open(mf_file, "r") 35 | mf_read = mf_open.readlines() 36 | l1= [] 37 | for id_index, line in enumerate(mf_read, 1): 38 | if ">" in line: 39 | l1.append(id_index) 40 | seq1_temp = mf_read[l1[0]: l1[1]-1] 41 | seq2_temp = mf_read[l1[1]:] 42 | for data1 in seq1_temp: 43 | seq1 += data1.strip() 44 | for data2 in seq2_temp: 45 | seq2 += data2.strip() 46 | 47 | Aligner() 48 | 49 | # Usage: python prototype_aligner1.py 50 | -------------------------------------------------------------------------------- /sequence_analysis/random_seq_generator-1.py: -------------------------------------------------------------------------------- 1 | # Importing modules 2 | import numpy as np 3 | import pandas as pd 4 | from random import uniform, randint, sample 5 | 6 | # Functions to generate random sequence of length -> l and gc_content -> gc 7 | 8 | # Approach 1 | Using Probabilistic Model 9 | # Iterate through all the positions in increasing order 10 | # and for each position generate random probablity between 11 | # 1% and 100%. 
If the probablity is < given GC percent 12 | # then randomly assigning G/C to that positions and 13 | # else randomly assigning A/T to that position 14 | def generate_seq_1(l:int, gc:float): 15 | seq = "" 16 | nt = ["A", "T", "G", "C"] 17 | for i in range(l): 18 | if uniform(0.01, 1.0) < gc: 19 | seq += nt[randint(2, 3)] 20 | else: 21 | seq += nt[randint(0, 1)] 22 | return seq 23 | 24 | # Approach 2 | Using Random Sampling 25 | # Sample random positions where nt should be G/C 26 | # and then randomly assigning G/C to that positions and 27 | # for rest of the positions randomly assigning A/T 28 | def generate_seq_2(l:int, gc:float): 29 | seq = "" 30 | nt = ["A", "T", "G", "C"] 31 | gc_pos = sample(range(l), int(gc*l)) 32 | for i in range(l): 33 | if i in gc_pos: 34 | seq += nt[randint(2, 3)] 35 | else: 36 | seq += nt[randint(0, 1)] 37 | return seq 38 | -------------------------------------------------------------------------------- /sequence_analysis/random_seq_generator.py: -------------------------------------------------------------------------------- 1 | import random 2 | nt_type = input("Enter the sequence type [RNA/DNA]: ") 3 | nt_len = int(input("Enter the sequence length: ")) 4 | nt_DNA = "ATGC" 5 | nt_RNA = "AUGC" 6 | nt_seq = "" 7 | x = "" 8 | for i in range (int(nt_len)): 9 | x = random.randint(0, 3) 10 | if nt_type == "DNA": 11 | nt_seq = nt_seq + nt_DNA[x] 12 | if nt_type == "RNA": 13 | nt_seq = nt_seq + nt_RNA[x] 14 | fasta_seq = "" 15 | for i in range(0, len(nt_seq), 70): 16 | fasta_seq += nt_seq[i:i+70]+"\n" 17 | print("> "+nt_type+"_Sequence") 18 | print(fasta_seq) 19 | 20 | # python random_seq_generator.py 21 | -------------------------------------------------------------------------------- /sequence_analysis/temp_to_cod.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | temp_strand = input("Enter the DNA Sequence: ") 5 | 6 | cod_strand = "" 7 | 8 | for i in range(0, 
len(temp_strand)): 9 | 10 | if temp_strand[i] == "A": 11 | cod_strand = cod_strand + "T" 12 | elif temp_strand[i] == "T": 13 | cod_strand = cod_strand + "A" 14 | elif temp_strand[i] == "G": 15 | cod_strand = cod_strand + "C" 16 | elif temp_strand[i] == "C": 17 | cod_strand = cod_strand + "G" 18 | 19 | print(cod_strand) 20 | -------------------------------------------------------------------------------- /sequence_analysis/translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | import sys 5 | 6 | f_file = sys.argv[1] 7 | f_open = open(f_file, "r") 8 | f_read = f_open.readlines() 9 | 10 | codon_table = {'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 11 | 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 12 | 'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 13 | 'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R', 14 | 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 15 | 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 16 | 'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 17 | 'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', 18 | 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 19 | 'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', 20 | 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 21 | 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 22 | 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 23 | 'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', 24 | 'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*', 25 | 'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W'} 26 | 27 | head = "" 28 | seq = "" 29 | rev_seq = "" 30 | for line in f_read: 31 | if ">" in line[0:2]: 32 | head = line 33 | elif ">" not in line[0:2]: 34 | seq += line.strip() 35 | for i in reversed(range(0, len(seq))): 36 | if seq[i] == "A": 37 | rev_seq = rev_seq + "T" 38 | elif seq[i] == "T": 39 | rev_seq = rev_seq + "A" 40 | elif seq[i] == "G": 41 | rev_seq = rev_seq + "C" 42 | elif seq[i] == "C": 43 | rev_seq = rev_seq + "G" 44 | 45 | # FRAME I Forward 46 | prot_seq_IF = "" 47 | for x in range(0,len(seq),3): 48 | 
codon = seq[x:x+3] 49 | if len(codon) == 3: 50 | aa = codon_table.get(codon) 51 | prot_seq_IF += aa 52 | # FRAME II Forward 53 | prot_seq_IIF = "" 54 | for x in range(1,len(seq),3): 55 | codon = seq[x:x+3] 56 | if len(codon) == 3: 57 | aa = codon_table.get(codon) 58 | prot_seq_IIF += aa 59 | # FRAME III Forward 60 | prot_seq_IIIF = "" 61 | for x in range(2,len(seq),3): 62 | codon = seq[x:x+3] 63 | if len(codon) == 3: 64 | aa = codon_table.get(codon) 65 | prot_seq_IIIF += aa 66 | # FRAME I Reverse 67 | prot_seq_IR = "" 68 | for x in range(0,len(rev_seq),3): 69 | codon = rev_seq[x:x+3] 70 | if len(codon) == 3: 71 | aa = codon_table.get(codon) 72 | prot_seq_IR += aa 73 | # FRAME II Reverse 74 | prot_seq_IIR = "" 75 | for x in range(1,len(rev_seq),3): 76 | codon = rev_seq[x:x+3] 77 | if len(codon) == 3: 78 | aa = codon_table.get(codon) 79 | prot_seq_IIR += aa 80 | # FRAME III Reverse 81 | prot_seq_IIIR = "" 82 | for x in range(2,len(rev_seq),3): 83 | codon = rev_seq[x:x+3] 84 | if len(codon) == 3: 85 | aa = codon_table.get(codon) 86 | prot_seq_IIIR += aa 87 | 88 | def out(prot_seq): 89 | prot_fasta = "" 90 | for i in range(0, len(prot_seq), 70): 91 | prot_fasta += prot_seq[i:i+70]+"\n" 92 | print(prot_fasta) 93 | 94 | print(">5'3' Frame I") 95 | out(prot_seq_IF) 96 | print(">5'3' Frame II") 97 | out(prot_seq_IIF) 98 | print(">5'3' Frame III") 99 | out(prot_seq_IIIF) 100 | print(">3'5' Frame I") 101 | out(prot_seq_IR) 102 | print(">3'5' Frame II") 103 | out(prot_seq_IIR) 104 | print(">3'5' Frame III") 105 | out(prot_seq_IIIR) 106 | 107 | # Usage: python translate.py file.fasta 108 | -------------------------------------------------------------------------------- /supplementary_data/images/dna_helix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajanbit/Bioinfo_py_Scripts/e11b8cb44227893138ea531dc478f9d787cab338/supplementary_data/images/dna_helix.png 
# -------------------------------------------------------------------------------
# /supplementary_data/images/nt_seq_logo.png:
# https://raw.githubusercontent.com/rajanbit/Bioinfo_py_Scripts/e11b8cb44227893138ea531dc478f9d787cab338/supplementary_data/images/nt_seq_logo.png
# -------------------------------------------------------------------------------
# /visualization/dna_fasta_visualization.py:
# -------------------------------------------------------------------------------
''' \
Usage:
python dna_fasta_visualization.py -n '''

import sys
import re

...

# ANSI escape codes used to colour the terminal output
class text():
    red = '\033[31m'
    green = '\033[32m'
    yellow = '\033[33m'
    blue = '\033[34m'
    purple='\033[35m'
    default = '\033[0m'
    black = "\x1b[30m"
    cyan = '\033[36m'
    bg = "\x1b[107m"

inputs = sys.argv
if '-n' not in inputs:
    print (__doc__)
else:
    in_file = inputs[inputs.index('-n') + 1]
    file1 = open(in_file)
    seq_data = file1.readlines()
    header = ""
    seq = ""
    for line in seq_data:
        if line.startswith(">"):
            header += line.strip()
        else:
            seq += line

    # colourise each base with its ANSI code
    f_seq1 = seq.replace("A",(text.red+"A"+text.default))
    f_seq2 = f_seq1.replace("T",(text.blue+"T"+text.default))
    f_seq3 = f_seq2.replace("G",(text.green+"G"+text.default))
    f_seq4 = f_seq3.replace("C",(text.yellow+"C"+text.default))

    print("\n"+text.bg+text.black+header+text.default)
    if len(seq) <= 10000:
        print(f_seq4+"\n")
    else:
        print(text.red+"\n #######################################################\n\
 ## DNA sequence is extremely long for visualization! ##\n\
 #######################################################"+text.default)

    no_A = seq.count("A")
    no_T = seq.count("T")
    no_G = seq.count("G")
    no_C = seq.count("C")

    # RefSeq-style accession, e.g. NC_012920.1
    acc_regex= re.compile(r'\S\S_\d\d\d\d\d\d.\d')
    acc_id = acc_regex.search(header)

    print("\n"+text.bg+text.black+"Sequence Information:"+text.default+"\n")

    print("Accession number: " + text.cyan+acc_id.group()+text.default+"\n")

    list_head = header.split()
    start_ind = 1
    if "complete" in list_head:
        end_ind = list_head.index("complete")
        print("Organism/Origin: "+text.cyan+(" ".join(list_head[start_ind:end_ind]).replace(",",""))+text.default+"\n")

    print("Number of Adenine residues" +text.red+" [A]"+text.default+": "+str(no_A))
    print("Number of Guanine residues" +text.green+" [G]"+text.default+": "+str(no_G))
    print("Number of Cytosine residues" +text.yellow+" [C]"+text.default+": "+str(no_C))
    print("Number of Thymine residues" +text.blue+" [T]"+text.default+": "+str(no_T)+"\n")

    print("DNA Sequence length" +text.purple+" [l]"+text.default+": "+str(len(seq))+"\n")

    gc_per = ((no_G + no_C)/len(seq))*100.0
    at_per = ((no_A + no_T)/len(seq))*100.0

    print("GC content"+text.green+" [G"+text.default+text.yellow+"C]"+text.default+": "+"%.2f" % gc_per+"%")
    print("AT content"+text.red+" [A"+text.default+text.blue+"T]"+text.default+": "+"%.2f" % at_per+"%"+"\n")

    if gc_per >= 60:
        print("The DNA is"+text.green+" [G"+text.default+text.yellow+"C] "+text.default+"rich"+"\n")
    elif at_per >= 60:
        print("The DNA is"+text.red+" [A"+text.default+text.blue+"T] "+text.default+"rich"+"\n")
...

# python dna_fasta_visualization.py -n
# -------------------------------------------------------------------------------
# /visualization/dna_helix_visualizer.py:
# -------------------------------------------------------------------------------
# ANSI escape codes used to colour the helix
class text():
    red = '\033[31m'
    green = '\033[32m'
    yellow = '\033[33m'
    blue = '\033[34m'
    default = '\033[0m'

t_seq = input("Enter the Sequence: ")
c_seq = ""
for i in range(0, len(t_seq)):
    if t_seq[i] == "A":
        c_seq = c_seq + "T"
    elif t_seq[i] == "T":
        c_seq = c_seq + "A"
    elif t_seq[i] == "G":
        c_seq = c_seq + "C"
    elif t_seq[i] == "C":
        c_seq = c_seq + "G"


# spacer/rung widths that trace one helical turn per 8 base pairs
spacer_list = [3,2,1,0,0,1,2,3]
bp_list = [0,2,4,6,6,4,2,0]
data = "\n"
x = 0
l = len(t_seq)
while x != l:

    for y in range(0,len(spacer_list)):
        try:
            data += "\t"+" "*spacer_list[y] + t_seq[x] + "-"*bp_list[y] + c_seq[x] + "\n"
            x+=1
        except:
            # NOTE(review): bare except hides real errors; it only needs to
            # catch IndexError when the sequence runs out mid-turn.
            break

data = data.replace("A",(text.red+"A"+text.default))
data = data.replace("T",(text.blue+"T"+text.default))
data = data.replace("G",(text.green+"G"+text.default))
data = data.replace("C",(text.yellow+"C"+text.default))
print("\n\tDNA Helix\n"+data)

# python dna_helix_visualizer.py
# -------------------------------------------------------------------------------
# /visualization/streamlit_base_comp.py:
# -------------------------------------------------------------------------------
#!/usr/bin/python
#!python

# Importing modules
import streamlit as st
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt

# Page config
st.set_page_config(page_title="Base_Comp")

# Title
st.title("Nucleotide Composition Analysis")

# Subheader
st.subheader("This tool take FASTA sequence as input and generate nucleotide composition table and plots.")

# File uploader
st.header("FASTA File Upload")
uploaded_file = st.file_uploader("Upload a file", type=["fasta"])

# Creating empty dataframe
df = pd.DataFrame(columns=["Accession_IDs","Length","A","T","G","C","GC(%)"])

# Generating base composition table
if uploaded_file is not None:

    # Reading fasta records and filling the dataframe
    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
    data = SeqIO.parse(stringio, "fasta")
    for record in data:
        header, seq = record.id, record.seq
        nucl_dict = {"Accession_IDs":0,"Length":0,"A": 0, "T": 0, "G": 0, "C": 0,"GC(%)":0,}
        # per-base percentages first; the three non-base keys are overwritten below
        for nt in nucl_dict:
            nucl_dict[nt] = str(round((str(seq).count(nt)/len(seq))*100, 2))
        nucl_dict["Accession_IDs"] = header
        nucl_dict["GC(%)"] = gc_fraction(seq)*100
        nucl_dict["Length"] = len(str(seq))
        df = pd.concat([df, pd.DataFrame([nucl_dict])],ignore_index=True)

    # Writing table in app
    df.rename(columns = {"A":"A(%)", "T":"T(%)","G":"G(%)","C":"C(%)"}, inplace = True)
    st.text("Table 1. Nucleotide composition of given sequences.")
    st.write(df)

    # Plotting nucleotide base composition boxplot in app
    st.text("Figure 1. Boxplot showing nucleotide base composition of given sequences.")
    fig, ax = plt.subplots()
    new_df = df[["A(%)","T(%)","G(%)","C(%)"]].astype(float)
    new_df.boxplot(column=["A(%)","T(%)","G(%)","C(%)"], grid=False)
    st.pyplot(fig)

    # Plotting GC composition boxplot in app
    st.text("Figure 2. Boxplot showing GC composition of given sequences.")
    fig, ax = plt.subplots()
    df.boxplot(column=["GC(%)"], grid=False)
    st.pyplot(fig)

    # Plotting length distribution boxplot in app
    st.text("Figure 3. Boxplot showing length distribution of given sequences.")
    fig, ax = plt.subplots()
    new_df = df[["Length"]].astype(float)
    new_df.boxplot(column=["Length"], grid=False)
    st.pyplot(fig)

## Usage: $ streamlit run streamlit_base_comp.py
# -------------------------------------------------------------------------------