├── LICENSE ├── README.md ├── file_handling ├── clustal_to_fasta.py ├── clustal_to_tsv.py ├── compare_bed.py ├── extract_accession_no.py ├── extract_fasta_headers.py ├── extract_fasta_records.py ├── extract_seq.py ├── fasta2db_feed.py ├── fasta2fastq.py ├── fasta_concatenator.py ├── fasta_record_finder.py ├── fastq2fasta.py ├── file_comparison.py ├── ftp_download.py ├── gdc_download.py ├── multi_fasta_deconcatenator.py ├── mysqldb_find.py └── seq_concatenator.py ├── machine_learning ├── brca_classifier-1.py └── dna_classifier-1.py ├── sequence_analysis ├── PSSM.py ├── aa_comp.py ├── alignment2consensus.py ├── base_composition.py ├── consensus.py ├── gc_percent.py ├── hydrophobicity_plot.py ├── k-mer_constructor.py ├── kmer_constructor-1.py ├── orf_analyzer.py ├── prot_mol_weight_calculator.py ├── prototype_aligner1.py ├── random_seq_generator-1.py ├── random_seq_generator.py ├── temp_to_cod.py └── translate.py ├── supplementary_data └── images │ ├── dna_helix.png │ └── nt_seq_logo.png └── visualization ├── dna_fasta_visualization.py ├── dna_helix_visualizer.py └── streamlit_base_comp.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Rajan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bioinfo_py_Scripts 2 | 3 | ## Python Scripts for Bioinformatics 4 | 5 | ### Introduction 6 | Bioinfo_py_Scripts is a curated GitHub repository with useful scripts for bioinformatics. The scripts are command line based and written/tested on python 3.7 or higher versions within Linux operating system. This is a free and open-source repository so feel free to use and contribute. 7 | 8 | ### Table of Content 9 | 1. [File Handling](https://github.com/rajanbit/Bioinfo_py_Scripts/wiki/File-Handling) 10 | 2. [Sequence Analysis](https://github.com/rajanbit/Bioinfo_py_Scripts/wiki/Sequence-Analysis) 11 | 3. [Visualization](https://github.com/rajanbit/Bioinfo_py_Scripts/wiki/Visualization) 12 | 4. 
[Machine Learning](https://github.com/rajanbit/Bioinfo_py_Scripts/wiki/Machine-Learning) 13 | 14 | ### Documentation 15 | ****Wiki***: https://github.com/rajanbit/Bioinfo_py_Scripts/wiki* 16 | 17 | -------------------------------------------------------------------------------- /file_handling/clustal_to_fasta.py: -------------------------------------------------------------------------------- 1 | # Clustal Omega to FASTA 2 | 3 | import sys 4 | import re 5 | 6 | msa_open = open(sys.argv[1], "r") 7 | msa_fast = open(sys.argv[2], "w+") 8 | msa_rl = msa_open.readlines() 9 | 10 | s_char = ["*",":"] 11 | 12 | temp_l = [] 13 | head_l = [] 14 | for line in msa_rl: 15 | line = line.strip() 16 | 17 | if "CLUSTAL" in line: 18 | pass 19 | 20 | elif line == "": 21 | pass 22 | 23 | elif line == "\n": 24 | pass 25 | 26 | else: 27 | m = re.search(r'\d+$', line) 28 | 29 | if m is not None: 30 | line = line.split("\t") 31 | line = re.sub("\s\s+", ",",line[0]) 32 | line = line.split(",") 33 | temp_l.append(line) 34 | head_l.append(line[0]) 35 | 36 | head_l = list(set(head_l)) 37 | 38 | for head in head_l: 39 | msa_fast.write(">"+head+"\n") 40 | for data in temp_l: 41 | if head in data: 42 | msa_fast.write(data[1]+"\n") 43 | 44 | # python clustal_to_fasta.py 45 | -------------------------------------------------------------------------------- /file_handling/clustal_to_tsv.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | 4 | msa_open = open(sys.argv[1], "r") 5 | msa_rl = msa_open.readlines() 6 | tsv = open("result.tsv", "w+") 7 | 8 | s_char = ["*",":",] 9 | 10 | temp = "" 11 | for line in msa_rl: 12 | line = line.strip() 13 | 14 | if "CLUSTAL" in line: 15 | temp+=line+"\n" 16 | 17 | elif line == "": 18 | temp += "\n" 19 | 20 | elif line == "\n": 21 | temp += "\n" 22 | 23 | else: 24 | m = re.search(r'\d+$', line) 25 | 26 | if m is not None: 27 | line = line.replace("\t", ",") 28 | line = re.sub("\s\s+", 
",",line).split(",") 29 | temp += line[0]+"\t" 30 | for i in range(0,len(line[1])): 31 | temp += line[1][i]+"\t" 32 | temp += line[2]+"\n" 33 | tsv.write(temp) 34 | -------------------------------------------------------------------------------- /file_handling/compare_bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | 5 | # Compare two bed files for sequence overlaps 6 | 7 | import sys 8 | 9 | BED2 = open(sys.argv[2], "r") 10 | BED1 = open(sys.argv[1], "r") 11 | BED2_rl = BED2.readlines() 12 | BED1_rl = BED1.readlines() 13 | print("\n# Overlapping Regions . . . . . . . . . . . . . . . .\n") 14 | for line2 in BED2_rl: 15 | BED2_tl= list(range(int(line2.split("\t")[1]),int(line2.split("\t")[2]))) 16 | for line1 in BED1_rl: 17 | BED1_tl= list(range(int(line1.split("\t")[1]),int(line1.split("\t")[2]))) 18 | 19 | out = any(check in BED1_tl for check in BED2_tl) 20 | 21 | if out: 22 | print("BED2:"+line2+"--BED1:"+line1) 23 | 24 | # $ python compare_bed.py file1.bed file2.bed 25 | -------------------------------------------------------------------------------- /file_handling/extract_accession_no.py: -------------------------------------------------------------------------------- 1 | import sys 2 | f_file = sys.argv[1] 3 | mlti_fasta = open(f_file, "r") 4 | acc_no = open("accession_no.txt", "w+") 5 | for line in mlti_fasta: 6 | if line[0] == ">": 7 | acc_no.write(line[1:10] + "\n") 8 | 9 | # python extract_accession_no.py 10 | -------------------------------------------------------------------------------- /file_handling/extract_fasta_headers.py: -------------------------------------------------------------------------------- 1 | import sys 2 | f_fasta = sys.argv[1] 3 | mlti_fasta = open(f_fasta, "r") 4 | head = open("fasta_headers.txt", "w") 5 | for line in mlti_fasta: 6 | if line[0] == ">": 7 | head.write(line[1:]) 8 | head.close() 9 | 10 | # python extract_fasta_headers.py 11 | 
-------------------------------------------------------------------------------- /file_handling/extract_fasta_records.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Extracts Fasta_records from Multi_Fasta file 4 | # whose ids are in Accession_Ids file 5 | 6 | import sys 7 | file1 = sys.argv[1] # MULTI_FASTA File 8 | file2 = sys.argv[2] # ACCESSION_IDS File 9 | acc_id = open(file2, "r") 10 | mlti_fasta = open(file1, "r") 11 | out_fasta = open("output.fasta", "w+") 12 | acc_list = [] 13 | for line in acc_id: 14 | acc_list.append(line.strip()) 15 | seq ="" 16 | for line in mlti_fasta: 17 | if line[0] == ">" and seq == "": 18 | header = line 19 | elif line[0]!= ">": 20 | seq += line 21 | elif line[0] == ">" and seq != "": 22 | for i in acc_list: 23 | if i in header: 24 | out_fasta.write(header + seq) 25 | seq = "" 26 | header = line 27 | if i in header: 28 | out_fasta.write(header + seq) 29 | 30 | # python extract_fasta_records.py 31 | -------------------------------------------------------------------------------- /file_handling/extract_seq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Program to extract nucleotide or protein 4 | # sequence of particular index from a Fasta file 5 | 6 | import sys 7 | 8 | file_db = open(sys.argv[1], "r") 9 | 10 | head = "" 11 | seq = "" 12 | fasta_seq = "" 13 | 14 | for line in file_db: 15 | if line[0:1] == ">" and seq == "": 16 | head = line 17 | elif line[0:1] != ">": 18 | seq += line.strip() 19 | elif line[0:1] == ">" and seq != "": 20 | print("Multi FASTA record found ...\nProgram Break ...\n") 21 | quit() 22 | 23 | index_b = int(input("Enter the sequence index [FROM]: ")) 24 | index_e = int(input("Enter the sequence index [TO]: ")) 25 | print("Sequence index: "+str(index_b)+"..."+str(index_e)) 26 | 27 | dot_in = head.index(".") 28 | out_head = 
head[:dot_in+2]+":"+str(index_b)+"-"+str(index_e)+head[dot_in+2:] 29 | out_seq = seq[index_b-1:index_e] 30 | for i in range(0, len(out_seq), 70): 31 | fasta_seq += out_seq[i:i+70]+"\n" 32 | print(out_head+fasta_seq) 33 | 34 | file_out = open(head[1:dot_in+2]+"_out.fasta","w") 35 | file_out.write(out_head+fasta_seq) 36 | 37 | print("Saving sequence ... Done\nFile: "+head[1:dot_in+2]+"_out.fasta\n") 38 | -------------------------------------------------------------------------------- /file_handling/fasta2db_feed.py: -------------------------------------------------------------------------------- 1 | # Importing modules 2 | import sys 3 | from Bio import SeqIO 4 | import mysql.connector as mc 5 | import hashlib 6 | 7 | ## Connecting to database 8 | mysql=mc.connect( host="localhost", user="user1", passwd="", database="16srRNAdb") 9 | 10 | ## Feeding sequence data into database 11 | fasta_rec = open(sys.argv[1]) 12 | data = SeqIO.parse(fasta_rec, "fasta") 13 | for record in data: 14 | head, seq = record.id, str(record.seq) 15 | seq_hash = hashlib.md5(seq.encode()) 16 | seq_md5 = seq_hash.hexdigest() 17 | mycursor=mysql.cursor() 18 | sql="insert into myseq(seqID, seq) values(%s, %s)" 19 | val=[(head, seq_md5)] 20 | mycursor.executemany(sql, val) 21 | mysql.commit() 22 | 23 | ## Usage: $ python fasta2db_feed.py 24 | 25 | -------------------------------------------------------------------------------- /file_handling/fasta2fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | # Importing Modules 5 | import sys 6 | from Bio import SeqIO 7 | from random import randrange 8 | 9 | # Function for FASTA to FASTQ conversion 10 | def fastq_convertor(seq, header, frame): 11 | for i in range(frame,len(seq),int(sys.argv[4])): 12 | temphead = "@"+header+"_"+str(i)+"-"+str(i+int(sys.argv[4])) 13 | tempseq = seq[i:i+int(sys.argv[4])] 14 | qscore="" 15 | for j in range(1, 101): 16 | 
qscore+=score[randrange(36,42)] 17 | for k in range(101, 141): 18 | qscore+=score[randrange(31,36)] 19 | for l in range(141, 148): 20 | qscore+=score[randrange(20,31)] 21 | for m in range(148, len(tempseq)+1): 22 | qscore+=score[randrange(0,20)] 23 | fastq_out.write(temphead+"\n"+tempseq+"\n+\n"+qscore+"\n") 24 | 25 | score="""!"#$&'()*+,-./0123456789:;<>=?@ABCDEFGHIJK""" 26 | 27 | # Running everything ... 28 | fasta_in = open(sys.argv[2]) 29 | fastq_out = open(sys.argv[8], "w+") 30 | fasta_rec = SeqIO.parse(fasta_in, "fasta") 31 | for rec in fasta_rec: 32 | seq, header = str(rec.seq), rec.id 33 | for x in range(1,int(sys.argv[6])+1): 34 | fastq_convertor(seq, header, x) 35 | 36 | fasta_in.close() 37 | fastq_out.close() 38 | 39 | #Usage: $ python fasta2fastq.py -f -l -x -o 40 | -------------------------------------------------------------------------------- /file_handling/fasta_concatenator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | # Extracting files name with the extension .fasta 5 | import os 6 | def get_name(): 7 | f_fasta = [] 8 | root = os.getcwd() 9 | for files in os.listdir(root): 10 | if files.endswith(".fasta"): 11 | f_fasta.append(files) 12 | return(f_fasta) 13 | 14 | # Concatenating all the fasta files to create multi_fasta file 15 | def concatenate(): 16 | f_out = open("multi_fasta", "w+") 17 | f_file = get_name() 18 | for data in f_file: 19 | fasta_rec = open(data, "r") 20 | for lines in fasta_rec: 21 | f_out.write(lines) 22 | concatenate() 23 | 24 | # python fasta_concatenator.py 25 | -------------------------------------------------------------------------------- /file_handling/fasta_record_finder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Extract Fasta_record from Multi_Fasta file 4 | # whose Accession_no is inputted by the user and 5 | # write the record in a new file (NC_XXXXXX.fasta) 6 | 7 
| import os 8 | import sys 9 | file1 = sys.argv[1] # MULTI_FASTA File 10 | mlti_fasta = open(file1) 11 | lines = mlti_fasta.readlines() 12 | acc_id = input("Enter the Accession_No: ") 13 | f_name = acc_id+".fasta" 14 | file2 = open(f_name,"w+") 15 | head_index = None 16 | for i in range(0, len(lines)): 17 | data = lines[i] 18 | if acc_id in data: 19 | file2.write(data) 20 | head_index = i 21 | break 22 | if head_index != None: 23 | for i in range(head_index+1, len(lines)): 24 | data = lines[i] 25 | if data[0] != ">": 26 | file2.write(data) 27 | else: 28 | break 29 | file2.close() 30 | else: 31 | print("Error: FASTA record not found") 32 | os.remove(f_name) 33 | 34 | # python fasta_record_finder.py 35 | -------------------------------------------------------------------------------- /file_handling/fastq2fasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | # Importing Modules 5 | import sys 6 | from Bio import SeqIO 7 | 8 | # Function for FASTQ to FASTA conversion 9 | fastq_in = open(sys.argv[1]) 10 | fasta_out = open(sys.argv[1]+".fasta", "w+") 11 | fastq_rec = SeqIO.parse(fastq_in, "fastq") 12 | for rec in fastq_rec: 13 | fasta_out.write(">"+rec.id+"\n"+str(rec.seq)+"\n") 14 | 15 | fastq_in.close() 16 | fasta_out.close() 17 | 18 | # Usage: python fastq2fasta.py 19 | 20 | -------------------------------------------------------------------------------- /file_handling/file_comparison.py: -------------------------------------------------------------------------------- 1 | ''' \ 2 | Usage: 3 | python file_comparison.py -f1 -f2 ''' 4 | 5 | import sys 6 | ... 
7 | 8 | inputs = sys.argv 9 | if '-f1' not in inputs or '-f2' not in inputs: 10 | print (__doc__) 11 | else: 12 | file1 = inputs[inputs.index('-f1') + 1] 13 | file2 = inputs[inputs.index('-f2') + 1] 14 | db = open(file1, "r") 15 | qr = open(file2, "r") 16 | db_list = [] 17 | qr_list = [] 18 | for line in db: 19 | db_list.append(line) 20 | for line in qr: 21 | qr_list.append(line) 22 | print("Data not in f2:") 23 | for data1 in db_list: 24 | if data1 not in qr_list: 25 | f1_data = data1 26 | print(f1_data) 27 | print("Data not in f1:") 28 | for data2 in qr_list: 29 | if data2 not in db_list: 30 | f2_data = data2 31 | print(f2_data) 32 | ... 33 | 34 | # python file_comparison.py -f1 -f2 35 | -------------------------------------------------------------------------------- /file_handling/ftp_download.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | file1 = sys.argv[1] 5 | ftp_file = open(file1, "r") 6 | ftps = ftp_file.readlines() 7 | os.system("mkdir ftp_downloads") 8 | log = open("logfile.txt", "w") 9 | for ftp in ftps: 10 | os.system("wget -P ftp_downloads/ " + ftp) 11 | log = open("logfile.txt", "a") 12 | log.write(ftp) 13 | log.close() 14 | ftp_file.close() 15 | 16 | # python ftp_download.py 17 | -------------------------------------------------------------------------------- /file_handling/gdc_download.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | file1 = sys.argv[1] 5 | gdc_manifest_file = open(file1, "r") 6 | gdc_data = gdc_manifest_file.readlines() 7 | os.system("mkdir gdc_downloads") 8 | log = open("logfile.txt", "w") 9 | for data in gdc_data[1:]: 10 | data_ls = data.split("\t") 11 | gdc_id = data_ls[0] 12 | os.system("wget https://api.gdc.cancer.gov/data/"+gdc_id+" -O gdc_downloads/"+data_ls[1]) 13 | log = open("logfile.txt", "a") 14 | log.write(gdc_id+"\n") 15 | log.close() 16 | gdc_manifest_file() 17 | 18 | # 
python gdc_download.py 19 | -------------------------------------------------------------------------------- /file_handling/multi_fasta_deconcatenator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | # Input and Read the Multi_FASTA record 5 | import sys 6 | file1 = sys.argv[1] 7 | mlti_fasta = open(file1) 8 | fasta_rec = mlti_fasta.readlines() 9 | acc_list = [] 10 | for line in fasta_rec: 11 | if line[0] == ">": 12 | acc_list.append(line[1:10]) 13 | 14 | # Create Files with Accession_ids 15 | def file_creater(acc_id): 16 | file_name = acc_id+".fasta" 17 | file_c = open(file_name,"w+") 18 | return(file_c) 19 | 20 | # Writing FASTA records in Files 21 | def file_writer(file_w, data): 22 | file_w.write(data) 23 | return(file_w) 24 | 25 | # Parsing Multi_FASTA records 26 | def parsing_data(acc_id, data): 27 | head_index = None 28 | seq = "" 29 | header = "" 30 | for i in range(0, len(data)): 31 | data_f = data[i] 32 | if acc_id in data_f: 33 | header += data_f 34 | head_index = i 35 | break 36 | if head_index != None: 37 | for i in range(head_index+1, len(data)): 38 | data_s = data[i] 39 | if data_s[0] != ">": 40 | seq += data_s 41 | else: 42 | break 43 | return(header+seq) 44 | 45 | # Executing all the Functions 46 | def out(): 47 | for i in range(0, len(acc_list)): 48 | acc_id = acc_list[i] 49 | f1 = file_creater(acc_id) 50 | f2 = parsing_data(acc_id, fasta_rec) 51 | file_writer(f1, f2) 52 | out() 53 | 54 | # python multi_fasta_deconcatenator.py 55 | -------------------------------------------------------------------------------- /file_handling/mysqldb_find.py: -------------------------------------------------------------------------------- 1 | # Importing modules 2 | import mysql.connector as mc 3 | 4 | # Connecting to local MySQL database 5 | mysql=mc.connect( host="localhost", user="user1", passwd="", database="16srRNAdb") 6 | 7 | mycursor = mysql.cursor() 8 | # Select everything from 
myseq table where 7dd1e0c5450f0ff6c59187d02ae5783b (hash) found 9 | mycursor.execute("SELECT * FROM myseq WHERE seq LIKE '7dd1e0c5450f0ff6c59187d02ae5783b'") 10 | myresult = mycursor.fetchall() 11 | 12 | # Printing column-1 of the result 13 | for x in myresult: 14 | print(x[0]) 15 | 16 | # Usage: python mysqldb_find.py 17 | -------------------------------------------------------------------------------- /file_handling/seq_concatenator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | file_in = sys.argv[1] 4 | os.system("cp "+file_in+" tempoxc1265.fasta") 5 | 6 | name_list = ["ECO","SAL"] 7 | for name in name_list: 8 | cmd = "sed -i "+"'/"+name+"/c\>"+name+"' tempoxc1265.fasta" 9 | os.system(cmd) 10 | fasta_f = open("tempoxc1265.fasta", "r") 11 | file_out = open ("concatenated_seq.fasta","w+") 12 | fast = fasta_f.readlines() 13 | l = [] 14 | new = [] 15 | for line in fast: 16 | if line not in l and line[0] == ">": 17 | new.append(line) 18 | l.append(line) 19 | elif line[0] != ">": 20 | new.append(line) 21 | else: 22 | pass 23 | for line in new: 24 | file_out.write(line) 25 | os.system("rm tempoxc1265.fasta") 26 | file_out.close() 27 | 28 | # python seq_concatenator.py 29 | -------------------------------------------------------------------------------- /machine_learning/brca_classifier-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | # Importing Modules 5 | from sklearn.datasets import load_breast_cancer 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.decomposition import PCA 9 | from sklearn.neural_network import MLPClassifier 10 | from sklearn.decomposition import PCA 11 | from sklearn.metrics import confusion_matrix 12 | from sklearn.metrics import ConfusionMatrixDisplay 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | 
import pandas as pd 16 | 17 | # Loading Dataset 18 | breast_cancer = load_breast_cancer() 19 | 20 | # Converting to data.frame 21 | df = pd.DataFrame(breast_cancer.data, columns = breast_cancer.feature_names) 22 | df['diagnosis'] = breast_cancer.target 23 | 24 | # Get the features and label from the original dataframe 25 | X = df.iloc[:,:-1] 26 | y = df.iloc[:,-1] 27 | 28 | # Performing standardization 29 | sc = StandardScaler() 30 | X_scaled = sc.fit_transform(X) 31 | 32 | # Converting features to PCs 33 | pca = PCA(n_components=3, whiten=True) 34 | X_pca = pca.fit_transform(X_scaled) 35 | df1 = pd.DataFrame(data = X_pca, columns = ["PC-1", "PC-2", "PC-3"]) 36 | 37 | # Subsets PCA data.frame into testing and training dataset 38 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=True) 39 | 40 | # Classifier 41 | clf = MLPClassifier(random_state=1, max_iter=1000) 42 | 43 | # Training 44 | clf.fit(X_train, y_train) 45 | 46 | # Testing 47 | pred_val = clf.predict(X_test) 48 | 49 | # Output 50 | output = pd.DataFrame() 51 | output['Expected Output'] = y_test 52 | output['Predicted Output'] = pred_val 53 | 54 | # Confusion Matrix 55 | cmat = confusion_matrix(y_test, pred_val, labels=clf.classes_, normalize="true") 56 | disp_cmat = ConfusionMatrixDisplay(confusion_matrix=cmat, display_labels=clf.classes_) 57 | 58 | # Plotting Confusion Matrix 59 | disp_cmat.plot(xticks_rotation='vertical') 60 | plt.savefig('cmat.png',bbox_inches ="tight", pad_inches = 0.5) 61 | 62 | # Usage: python brca_classifier-1.py 63 | -------------------------------------------------------------------------------- /machine_learning/dna_classifier-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | # Importing Modules 5 | import sys 6 | import numpy as np 7 | from sklearn.neural_network import MLPClassifier 8 | from sklearn.metrics import confusion_matrix 9 | from sklearn.metrics import 
ConfusionMatrixDisplay 10 | import matplotlib.pyplot as plt 11 | 12 | # Input Data 13 | training_data = (open(sys.argv[1], "r")).readlines() 14 | test_data = (open(sys.argv[2], "r")).readlines() 15 | 16 | # Sequence Encoder (one hot encoding) 17 | def sequence_encoder(seq): 18 | seq_mat = [] 19 | temp = "" 20 | for i in range(0, len(seq)): 21 | if seq[i].upper() == "A": 22 | temp+="1000" 23 | elif seq[i].upper() == "T": 24 | temp+="0100" 25 | elif seq[i].upper() == "G": 26 | temp+="0010" 27 | elif seq[i].upper() == "C": 28 | temp+="0001" 29 | else: 30 | temp+="0000" 31 | temp = list(temp) 32 | temp1 = [int(j) for j in temp] 33 | return(temp1) 34 | 35 | # Feature Matrix Generator 36 | def matrix_generator(data): 37 | 38 | feature_matrix = [] 39 | seq = "" 40 | label = [] 41 | for line in data: 42 | if line[0] == ">" and seq == "": 43 | label.append(line.strip().replace(">", "")) 44 | elif line[0]!= ">": 45 | seq += line.strip() 46 | elif line[0] == ">" and seq != "": 47 | feature_matrix.append(sequence_encoder(seq)) 48 | seq="" 49 | label.append(line.strip().replace(">", "")) 50 | feature_matrix.append(sequence_encoder(seq)) 51 | return(feature_matrix, label) 52 | 53 | 54 | # Generate feature matrix for training dataset 55 | dataset1 = matrix_generator(training_data) 56 | #print(dataset1) 57 | # Feature matrix for training 58 | X_train = np.array(dataset1[0]) 59 | 60 | # Labels for training 61 | y_train = np.array(dataset1[1]) 62 | 63 | # Classifier 64 | model = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(4), max_iter=5000) 65 | 66 | # Training 67 | model = model.fit(X_train, y_train) 68 | 69 | # Generate feature matrix for testing dataset 70 | dataset2 = matrix_generator(test_data) 71 | 72 | # Feature matrix for testing 73 | X_test = np.array(dataset2[0]) 74 | 75 | # Labels for testing 76 | y_test = np.array(dataset2[1]) 77 | 78 | # Prediction 79 | y_predict = model.predict(X_test) 80 | 81 | # Plotting Confusion Matrix 82 | cmat = confusion_matrix(y_test, 
y_predict, labels=model.classes_) 83 | disp_cmat = ConfusionMatrixDisplay(confusion_matrix=cmat, display_labels=model.classes_) 84 | disp_cmat.plot(xticks_rotation='vertical') 85 | plt.savefig('cmat.png',bbox_inches ="tight", pad_inches = 0.5) 86 | 87 | # Usage: $ python dna_classifier-1.py 88 | -------------------------------------------------------------------------------- /sequence_analysis/PSSM.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | import sys 5 | import numpy as np 6 | 7 | fasta_o = open(sys.argv[1], "r") 8 | fasta_r = fasta_o.readlines() 9 | 10 | seq_list = [] 11 | for line in fasta_r: 12 | if line[0] != ">" and line != "\n": 13 | seq_list.append(line.strip()) 14 | 15 | #Function 16 | def frequency(base,total): 17 | try: 18 | f_base = base/total 19 | except ZeroDivisionError: 20 | f_base = 0 21 | return(f_base) 22 | 23 | # Raw Frequency Matrix 24 | n = len(seq_list[0]) 25 | m = 4 26 | mat = np.full((m, n), 0.0) 27 | for i in range(0,len(seq_list[0])): 28 | A = T = G = C = 0 29 | for data in seq_list: 30 | if data[i] == "A": 31 | A += 1 32 | elif data[i] == "G": 33 | G += 1 34 | elif data[i] == "T": 35 | T += 1 36 | elif data[i] == "C": 37 | C += 1 38 | tot = A+G+C+T 39 | 40 | fA = frequency(A,tot) 41 | fG = frequency(G,tot) 42 | fC = frequency(C,tot) 43 | fT = frequency(T,tot) 44 | 45 | mat[0,i] = fA 46 | mat[1,i] = fT 47 | mat[2,i] = fG 48 | mat[3,i] = fC 49 | 50 | A = T = G = C = 0 51 | 52 | #Normalized Matrix 53 | over_mat = mat.sum(axis = 1)/n 54 | norm_mat = mat/over_mat[:,None] 55 | 56 | #Convert Normalized Matrix to Log Odd Scores 57 | scores = np.log2(norm_mat) 58 | print(np.around(scores,decimals=2)) 59 | 60 | -------------------------------------------------------------------------------- /sequence_analysis/aa_comp.py: -------------------------------------------------------------------------------- 1 | # Importing modules 2 | import sys 3 | import pandas as pd 4 
| from Bio import SeqIO 5 | import matplotlib.pyplot as plt 6 | 7 | # Amino acid composition dict 8 | aa_dict = { "nonpolar_aliphatic":["G", "A", "V", "L", "I", "P", "M"], 9 | "nonpolar_aromatic":["F", "W"], 10 | "polar_uncharged":["S", "T", "N", "Q", "C", "Y"], 11 | "positive":["H", "K", "R"], "negative":["D", "E"], "NA":["-", "*"]} 12 | 13 | # Reading Protein FASTA file 14 | data = SeqIO.parse(open(sys.argv[1]), "fasta") 15 | seq = [] 16 | for record in data: 17 | seq.append(list(str(record.seq))) 18 | 19 | # Creating dataframe containing aa comp infor for each seq 20 | df = pd.DataFrame(columns = ["pos", "nonpolar_aliphatic", "nonpolar_aromatic","polar_uncharged", "positive", "negative", "NA"]) 21 | for i in range(len(seq[0])): 22 | temp = {"nonpolar_aliphatic":0, "nonpolar_aromatic":0, 23 | "polar_uncharged":0, "positive":0, "negative":0, "NA":0} 24 | for s in seq: 25 | temp["".join([j for j in aa_dict if s[i] in aa_dict[j]])] = temp["".join([j for j in aa_dict if s[i] in aa_dict[j]])]+1 26 | temp = { i:(temp[i]/sum(temp.values()))*100 for i in temp} 27 | temp["pos"] = i+1 28 | df.loc[len(df)]=temp 29 | 30 | # Plotting stacked barplot 31 | df.plot(x='pos', kind='bar', stacked=True, figsize=(15, 8)) 32 | plt.xlabel("Amino Acid Position") 33 | plt.ylabel("Percentage(%)") 34 | plt.legend(loc = "lower right") 35 | plt.savefig('aa_composition_plot.png',bbox_inches ="tight", pad_inches = 0.1) 36 | plt.close() 37 | 38 | # Usage: python aa_comp.py aligned_seq.fasta 39 | -------------------------------------------------------------------------------- /sequence_analysis/alignment2consensus.py: -------------------------------------------------------------------------------- 1 | ### Import Modules 2 | import sys 3 | from Bio import AlignIO 4 | from collections import Counter 5 | 6 | ### File Handling 7 | out_file = open(sys.argv[1]+"_consen.fasta", "w") 8 | conseq = ">consensus_"+sys.argv[1]+"\n" 9 | 10 | ### Read Alignment File 11 | alignment = AlignIO.read(sys.argv[1], 
"fasta") 12 | 13 | ### Generate Consensus Sequence 14 | for i in range (0, alignment.get_alignment_length()): 15 | temp_ls = [] 16 | col_data = alignment[:,i] 17 | for data in Counter(col_data): 18 | if ((Counter(col_data)[data]/len(col_data))*100) >= 1 and data in ["A","T","G","C"]: 19 | temp_ls.append(data) 20 | if len(temp_ls) == 1 and ((Counter(col_data)[temp_ls[0]]/len(col_data))*100) >= 99: 21 | conseq += temp_ls[0] 22 | else: 23 | conseq += "-" 24 | 25 | ### Writing Output 26 | out_file.write(conseq+"\n") 27 | -------------------------------------------------------------------------------- /sequence_analysis/base_composition.py: -------------------------------------------------------------------------------- 1 | # Base Compositions of DNAs from Multi/Fasta Record 2 | 3 | import sys 4 | 5 | f_file = sys.argv[1] 6 | file_op = open(f_file, "r") 7 | lines = file_op.readlines() 8 | file_wr = open("base_composition.tsv", "w+") 9 | file_wr.write("DNA origin A T G C Length AT% GC%\n") 10 | seq = "" 11 | head = "" 12 | for line in lines: 13 | if line[0] == ">" and seq == "": 14 | head = line 15 | elif line[0]!= ">": 16 | seq += line.strip() 17 | elif line[0] == ">" and seq != "": 18 | 19 | file_wr.write(str(head[1:12])+"\t"+str(seq.count("A"))+"\t"+str(seq.count("T"))+\ 20 | "\t"+str(seq.count("G"))+"\t"+str(seq.count("C"))+"\t"+str(len(seq))+"\t"+"%.2f" %(((seq.count("A")+seq.count("T"))/len(seq))*100)+"\t"\ 21 | +"%.2f" %(((seq.count("G")+seq.count("C"))/len(seq))*100)+"\t"+"\n") 22 | seq = "" 23 | head = line 24 | 25 | file_wr.write(str(head[1:12])+"\t"+str(seq.count("A"))+"\t"+str(seq.count("T"))+\ 26 | "\t"+str(seq.count("G"))+"\t"+str(seq.count("C"))+"\t"+str(len(seq))+"\t"+"%.2f" %(((seq.count("A")+seq.count("T"))/len(seq))*100)+"\t"\ 27 | +"%.2f" %(((seq.count("G")+seq.count("C"))/len(seq))*100)+"\t"+"\n") 28 | 29 | # python base_composition.py 30 | -------------------------------------------------------------------------------- 
/sequence_analysis/consensus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | # import sys module 5 | import sys 6 | # import NumPy library 7 | import numpy as np 8 | 9 | # file handling 10 | f_file = sys.argv[1] 11 | f_open = open(f_file, "r") 12 | f_read = f_open.readlines() 13 | 14 | # declaring variables 15 | seq_list = [] 16 | seq = "" 17 | header = "" 18 | 19 | # assigning values to variables 20 | for line in f_read: 21 | if ">" in line[0:1]: 22 | header = line[1:] 23 | elif ">" not in line[0:1]: 24 | seq += line.strip() 25 | seq_list.append(seq) 26 | seq = "" 27 | seq_list = ' '.join(seq_list).split() 28 | 29 | # calculating consensus 30 | n = len(seq_list[0]) 31 | con_array = np.full((4, n), 0) 32 | for seq in seq_list: 33 | for i, char in enumerate(seq): 34 | if char == "A": 35 | con_array[0][i] +=1 36 | elif char == "G": 37 | con_array[1][i] +=1 38 | elif char == "T": 39 | con_array[2][i] +=1 40 | elif char == "C": 41 | con_array[3][i] +=1 42 | 43 | # printing output 44 | print("A:",con_array[0],"\nG:",con_array[1],"\nT:",con_array[2],"\nC:",con_array[3]) 45 | 46 | # python consensus.py 47 | -------------------------------------------------------------------------------- /sequence_analysis/gc_percent.py: -------------------------------------------------------------------------------- 1 | import sys 2 | file1 = sys.argv[1] 3 | Dna = open(file1) 4 | dna = Dna.readlines() 5 | seq = "" 6 | for line in dna: 7 | if line[0] != ">": 8 | seq = seq + line 9 | no_a = seq.count("A") 10 | no_t = seq.count("T") 11 | no_g = seq.count("G") 12 | no_c = seq.count("C") 13 | dna_len = no_a + no_t + no_g + no_c 14 | gc_percent = ((no_g + no_c)/dna_len)*100.0 15 | print("GC content: " "%.2f" % gc_percent+"%") 16 | 17 | # python gc_percent.py 18 | -------------------------------------------------------------------------------- /sequence_analysis/hydrophobicity_plot.py: 
# -------------------------------------------------------------------------------
#!/usr/bin/python
#!python


# Python script for calculating and plotting
# hydrophobicity of a given peptide/protein
# sequence using Kyte-Doolittle scale


# Importing modules
import sys

# Kyte-Doolittle hydropathy scale: one propensity value per standard residue.
kydo = { 'A': 1.8,'R':-4.5,'N':-3.5,'D':-3.5,'C': 2.5,
         'Q':-3.5,'E':-3.5,'G':-0.4,'H':-3.2,'I': 4.5,
         'L': 3.8,'K':-3.9,'M': 1.9,'F': 2.8,'P':-1.6,
         'S':-0.8,'T':-0.7,'W':-0.9,'Y':-1.3,'V': 4.2 }

x_plot = []
y_plot = []

# Function for converting sequence to Kyte-Doolittle propensity
def seq_to_kydo(seq):
    """Map a protein sequence to its per-residue Kyte-Doolittle values.

    Raises KeyError on non-standard residue codes (e.g. X, B, U).
    """
    return [kydo[aa] for aa in seq]

# Function for smoothing the data
def smoothing(values_list):
    """Sliding-window mean of values_list (window=9, ends zero-padded).

    Returns (positions, smoothed) where positions are 1-based indices.
    NOTE: the zero padding biases the first/last half-window of points
    toward zero -- kept for backward compatibility with the original.
    """
    window = 9  # Adjust window size here (odd, so the window is centred)
    half_window = (window - 1) // 2
    new_values = [0]*half_window + values_list + [0]*half_window
    y = []  # Smoothened Kyte-Doolittle values
    for i in range(half_window, len(new_values) - half_window):
        y.append(sum(new_values[i-half_window:i+1+half_window])/window)
    x = list(range(1, len(values_list) + 1))  # Amino acid positions
    return (x, y)

def main():
    """Read record(s) from the FASTA in sys.argv[1] and save the plot.

    As before, if the file holds several records only the last one is
    plotted (each iteration overwrites x_plot/y_plot).
    """
    # Heavy third-party imports are local so the module itself can be
    # imported (e.g. for testing) without Biopython/matplotlib installed.
    from Bio import SeqIO
    import matplotlib.pyplot as plt

    global x_plot, y_plot
    # Reading fasta record (with-block closes the handle; original leaked it)
    with open(sys.argv[1]) as fasta_rec:
        for record in SeqIO.parse(fasta_rec, "fasta"):
            sequence = str(record.seq)
            # Fixed: smoothing(seq_to_kydo(...)) was previously evaluated
            # twice per record just to take [0] and [1] separately.
            x_plot, y_plot = smoothing(seq_to_kydo(sequence))

    # Plotting the data
    plt.plot(x_plot, y_plot)
    plt.title("Kyte-Doolittle Hydrophobicity Plot")
    plt.xlabel("Amino Acid Position")
    plt.ylabel("Hydrophobicity Score")
    plt.savefig('hydrophobicity_plot.png',bbox_inches ="tight", pad_inches = 0.5, dpi = 500)
    plt.close()

if __name__ == "__main__":
    main()

# Usage:
# python hydrophobicity_plot.py prot.fasta   (tail of the Usage comment above)
# -------------------------------------------------------------------------------
# /sequence_analysis/k-mer_constructor.py:
# -------------------------------------------------------------------------------
import sys


def build_kmers(seq, k):
    """Return every overlapping k-mer of seq, including the final one.

    Fixes an off-by-one bug: the original iterated range(len(seq) - k),
    which silently dropped the last k-mer; the correct upper bound for
    k-mer start positions is len(seq) - k + 1.
    """
    return [seq[i:i + k] for i in range(len(seq) - k + 1)]


def main():
    """Print every k-mer of the FASTA sequence in sys.argv[1] (k from stdin)."""
    DNA_file = sys.argv[1]
    k_mer_size = input("Enter the K-mer length: ")
    seq = ""
    with open(DNA_file, "r") as DNA:
        for line in DNA:
            if line.startswith(">"):
                pass  # skip FASTA headers
            else:
                seq += line.replace('\n', "")
    for kmer in build_kmers(seq, int(k_mer_size)):
        print(kmer)


if __name__ == "__main__":
    main()

# python k-mer_constructor.py
# -------------------------------------------------------------------------------
# /sequence_analysis/kmer_constructor-1.py:
# -------------------------------------------------------------------------------
#!/usr/bin/python
#!python

# Importing Modules
import sys
from Bio import SeqIO


# Function for Overlapping Kmers
# (sys.argv[2] is either a single k like "3" or an inclusive range "3..7";
#  note this variant also prints the short tail slices, unchanged here)
def overlapping_kmer_constructor(seq):
    if len(sys.argv[2].split("..")) == 1:
        k = int(sys.argv[2])
        for i in range(len(seq)):
            print(seq[i:i+k])

    elif len(sys.argv[2].split("..")) > 1:
        k_list = sys.argv[2].split("..")
        k_list = [i for i in range(int(k_list[0]), int(k_list[1])+1)]
        for k in k_list:
            k = int(k)
            for i in range(len(seq)):
                print(seq[i:i+k])

# Function for Non-Overlapping Kmers
def nonoverlapping_kmer_constructor(seq):
    if len(sys.argv[2].split("..")) == 1:
        k = int(sys.argv[2])
        for i in range(0, len(seq), k):
            print(seq[i:i+k])

    elif len(sys.argv[2].split("..")) > 1:
        k_list = sys.argv[2].split("..")
        k_list = [i for i in range(int(k_list[0]), int(k_list[1])+1)]
        for k in k_list:
            k = int(k)
            for i in range(0, len(seq), k):
                print(seq[i:i+k])


# Reading Fasta Records (continues in the next chunk)
fasta_in = open(sys.argv[1])
fasta_recs = SeqIO.parse(fasta_in, "fasta")
# (kmer_constructor-1.py continued -- dispatch over the parsed FASTA records)
for rec in fasta_recs:
    head, seq = rec.id, str(rec.seq)

    # Generating Overlapping Kmers
    if sys.argv[3] == "-O":
        overlapping_kmer_constructor(seq)

    # Generating Non-Overlapping Kmers
    elif sys.argv[3] == "-N":
        nonoverlapping_kmer_constructor(seq)

# Usage: $ python kmer_constructor-1.py a.fasta 3/3..7 -N/-O
# -------------------------------------------------------------------------------
# /sequence_analysis/orf_analyzer.py:
# -------------------------------------------------------------------------------
# Importing modules
from Bio.SeqUtils import GC
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from Bio.Seq import Seq
from Bio import SeqIO
import sys

# Function to plot nucleotide composition
def nucl_comp(seq):
    # percentage of each base, rounded to 2 decimal places
    nucl_dict = {"A": 0, "T": 0, "G": 0, "C": 0}
    for nt in nucl_dict:
        nucl_dict[nt] = round((seq.count(nt)/len(seq))*100, 2)
    plt.bar(list(nucl_dict.keys()), list(nucl_dict.values()),
        color =['blue', 'red', 'green', 'orange'])

    plt.xlabel("Nucleotides")
    plt.ylabel("Percentage (%)")
    plt.title("Nucleotides Composition Plot")
    plt.savefig('nucleotide_composition.png',bbox_inches ="tight", pad_inches = 0.5)
    plt.close()

# Function to return GC composition
def gc_comp(seq):
    gc_perc = GC(seq)
    gc_dict = {"GC":gc_perc,"AT":100-gc_perc}
    plt.bar(list(gc_dict.keys()), list(gc_dict.values()), color =['blue', 'red'])
    plt.xticks(fontsize=8, rotation=90)
    plt.xlabel("Compositions")
    plt.ylabel("Percentage (%)")
    plt.title("GC & AT Composition Plot")
    plt.savefig('gc_at_composition.png',bbox_inches ="tight", pad_inches = 0.5, dpi = 500)
    plt.close()

# Function to plot codon composition
def codon_comp(seq):
    # NOTE(review): range(0, len(seq)+1, 3) may emit a trailing partial codon;
    # it is filtered out below by the len(key)==3 comprehension.
    codons = [seq[x:x+3] for x in range(0, len(seq)+1, 3)]
    codon_dict = dict.fromkeys(list(set(codons)), 0)
    for codon in codon_dict:
        codon_dict[codon] = (codons.count(codon)/len(codons))*100
    codon_dict={key:value for key,value in codon_dict.items() if len(key)==3}
    plt.figure(figsize=(10,4))
    plt.bar(list(codon_dict.keys()), list(codon_dict.values()), color=get_cmap("Accent").colors)
    plt.xticks(fontsize=8, rotation=90)
    plt.xlabel("Codons")
    plt.ylabel("Percentage (%)")
    plt.title("Codons Composition Plot")
    plt.savefig('codon_composition.png',bbox_inches ="tight", pad_inches = 0.5, dpi = 500)
    plt.close()

# Function to plot amino acid composition
def aa_comp(seq):
    a_acid_seq = Seq(seq).translate()
    a_acid_seq = str(a_acid_seq).replace("*", "")
    a_acids = [ aa for aa in a_acid_seq]
    a_acids_dict = dict.fromkeys(list(set(a_acids)), 0)
    for aa in a_acids_dict:
        a_acids_dict[aa] = (a_acids.count(aa)/len(a_acids))*100
    plt.figure(figsize=(10,4))
    plt.bar(list(a_acids_dict.keys()), list(a_acids_dict.values()), color=get_cmap("tab20").colors)
    plt.xticks(fontsize=8, rotation=90)
    plt.xlabel("Amino Acids")
    plt.ylabel("Percentage (%)")
    plt.title("Amino Acid Composition Plot")
    plt.savefig('amino_acid_composition.png',bbox_inches ="tight", pad_inches = 0.5, dpi = 500)
    plt.close()

# Reading FASTA file and running all
# (only the last record survives the loop; plots describe that record)
fasta_rec = open(sys.argv[1])
seq = ""
data = SeqIO.parse(fasta_rec, "fasta")
for record in data:
    seq = record.id, str(record.seq)
seq = str(seq[1])
nucl_comp(seq)
gc_comp(seq)
codon_comp(seq)
aa_comp(seq)

# Usage: $ python orf_analyzer.py
# -------------------------------------------------------------------------------
# /sequence_analysis/prot_mol_weight_calculator.py:
# -------------------------------------------------------------------------------
#!/usr/bin/python

# Program to calculate protein molecular weight
# from the amino acid sequence in Fasta file

import sys
# (prot_mol_weight_calculator.py continued -- "import sys" appears just above)

# Average residue masses in daltons; one water (18.0153 Da) is added per chain.
aa_mw = {"A":71.0779, "C":103.1429,"D":115.0874,"E":129.114,"F":147.1739,"G":57.0513,"H":137.1393,"I":113.1576,"K":128.1723,
    "L":113.1576,"M":131.1961,"N":114.1026,"P":97.1152,"Q":128.1292,"R":156.1857,"S":87.0773,"T":101.1039,"V":99.1311,"W":186.2099,"Y":163.1733}


def calc(prot_seq, header):
    """Print the report for one record and return its weight in daltons.

    prot_seq -- amino acid sequence in one-letter codes
    header   -- FASTA header line including the leading '>'

    Raises KeyError naming the offending residue for codes not in aa_mw;
    the original used aa_mw.get(), whose None return produced an opaque
    TypeError on the += instead.
    """
    aa_molwt = 18.0153 + sum(aa_mw[aa] for aa in prot_seq)
    print("Sequence Info: "+header[1:])
    print("Number of amino acid residues: "+str(len(prot_seq)))
    print("Protein molecular weight: "+str(round(aa_molwt/1000, 2))+" kDa ["+str(round(aa_molwt, 2))+" Da]\n")
    return aa_molwt


def _main():
    """Report the molecular weight of every record in the FASTA sys.argv[1]."""
    # File handling moved out of module scope and into a `with` block, so
    # importing this module performs no I/O and the handle is not leaked.
    with open(sys.argv[1], "r") as file_f:
        file_rl = file_f.readlines()
    seq = ""
    head = ""
    print("\nProtein Molecular Weight Calculator Result\n")
    for line in file_rl:
        if line[0:1] == ">" and seq == "":
            head = line.strip()
        elif line[0:1] != ">":
            seq += line.strip()
        elif line[0:1] == ">" and seq != "":
            calc(seq, head)
            seq = ""
            head = line.strip()
    calc(seq, head)  # final record has no trailing header to trigger it


if __name__ == "__main__":
    _main()

# Usage: python prot_mol_weight_calculator.py prot.fasta
# -------------------------------------------------------------------------------
# /sequence_analysis/prototype_aligner1.py:
# -------------------------------------------------------------------------------
# importing modules
import sys
import numpy as np

# Function for Alignment
def Aligner():
    n=len(seq1)
    m=len(seq2)
    matrix = np.full((n, m), 0)
    for i in range(0,n):
        for j in range(0, m):
            if seq1[i] == seq2[j]:
                matrix[i,j] = 1
            else:
                matrix[i,j] = 0

    al_seq = ""
    for x in range(0, m):
        if matrix[x,x] == 1:
            al_seq += seq1[x]
        elif matrix[x,x] == 0:
            al_seq += "-"

    fast_al_seq = ""
    for y in range(0, len(al_seq), 70):
        fast_al_seq += al_seq[y:y+70]+"\n"
    print(">Aligned Sequence")
    print(fast_al_seq)

# Accessory
code 31 | seq1="" 32 | seq2="" 33 | mf_file = sys.argv[1] 34 | mf_open = open(mf_file, "r") 35 | mf_read = mf_open.readlines() 36 | l1= [] 37 | for id_index, line in enumerate(mf_read, 1): 38 | if ">" in line: 39 | l1.append(id_index) 40 | seq1_temp = mf_read[l1[0]: l1[1]-1] 41 | seq2_temp = mf_read[l1[1]:] 42 | for data1 in seq1_temp: 43 | seq1 += data1.strip() 44 | for data2 in seq2_temp: 45 | seq2 += data2.strip() 46 | 47 | Aligner() 48 | 49 | # Usage: python prototype_aligner1.py 50 | -------------------------------------------------------------------------------- /sequence_analysis/random_seq_generator-1.py: -------------------------------------------------------------------------------- 1 | # Importing modules 2 | import numpy as np 3 | import pandas as pd 4 | from random import uniform, randint, sample 5 | 6 | # Functions to generate random sequence of length -> l and gc_content -> gc 7 | 8 | # Approach 1 | Using Probabilistic Model 9 | # Iterate through all the positions in increasing order 10 | # and for each position generate random probablity between 11 | # 1% and 100%. 
If the probablity is < given GC percent 12 | # then randomly assigning G/C to that positions and 13 | # else randomly assigning A/T to that position 14 | def generate_seq_1(l:int, gc:float): 15 | seq = "" 16 | nt = ["A", "T", "G", "C"] 17 | for i in range(l): 18 | if uniform(0.01, 1.0) < gc: 19 | seq += nt[randint(2, 3)] 20 | else: 21 | seq += nt[randint(0, 1)] 22 | return seq 23 | 24 | # Approach 2 | Using Random Sampling 25 | # Sample random positions where nt should be G/C 26 | # and then randomly assigning G/C to that positions and 27 | # for rest of the positions randomly assigning A/T 28 | def generate_seq_2(l:int, gc:float): 29 | seq = "" 30 | nt = ["A", "T", "G", "C"] 31 | gc_pos = sample(range(l), int(gc*l)) 32 | for i in range(l): 33 | if i in gc_pos: 34 | seq += nt[randint(2, 3)] 35 | else: 36 | seq += nt[randint(0, 1)] 37 | return seq 38 | -------------------------------------------------------------------------------- /sequence_analysis/random_seq_generator.py: -------------------------------------------------------------------------------- 1 | import random 2 | nt_type = input("Enter the sequence type [RNA/DNA]: ") 3 | nt_len = int(input("Enter the sequence length: ")) 4 | nt_DNA = "ATGC" 5 | nt_RNA = "AUGC" 6 | nt_seq = "" 7 | x = "" 8 | for i in range (int(nt_len)): 9 | x = random.randint(0, 3) 10 | if nt_type == "DNA": 11 | nt_seq = nt_seq + nt_DNA[x] 12 | if nt_type == "RNA": 13 | nt_seq = nt_seq + nt_RNA[x] 14 | fasta_seq = "" 15 | for i in range(0, len(nt_seq), 70): 16 | fasta_seq += nt_seq[i:i+70]+"\n" 17 | print("> "+nt_type+"_Sequence") 18 | print(fasta_seq) 19 | 20 | # python random_seq_generator.py 21 | -------------------------------------------------------------------------------- /sequence_analysis/temp_to_cod.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | temp_strand = input("Enter the DNA Sequence: ") 5 | 6 | cod_strand = "" 7 | 8 | for i in range(0, 
len(temp_strand)): 9 | 10 | if temp_strand[i] == "A": 11 | cod_strand = cod_strand + "T" 12 | elif temp_strand[i] == "T": 13 | cod_strand = cod_strand + "A" 14 | elif temp_strand[i] == "G": 15 | cod_strand = cod_strand + "C" 16 | elif temp_strand[i] == "C": 17 | cod_strand = cod_strand + "G" 18 | 19 | print(cod_strand) 20 | -------------------------------------------------------------------------------- /sequence_analysis/translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #!python 3 | 4 | import sys 5 | 6 | f_file = sys.argv[1] 7 | f_open = open(f_file, "r") 8 | f_read = f_open.readlines() 9 | 10 | codon_table = {'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 11 | 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 12 | 'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 13 | 'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R', 14 | 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 15 | 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 16 | 'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 17 | 'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', 18 | 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 19 | 'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', 20 | 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 21 | 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 22 | 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 23 | 'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', 24 | 'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*', 25 | 'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W'} 26 | 27 | head = "" 28 | seq = "" 29 | rev_seq = "" 30 | for line in f_read: 31 | if ">" in line[0:2]: 32 | head = line 33 | elif ">" not in line[0:2]: 34 | seq += line.strip() 35 | for i in reversed(range(0, len(seq))): 36 | if seq[i] == "A": 37 | rev_seq = rev_seq + "T" 38 | elif seq[i] == "T": 39 | rev_seq = rev_seq + "A" 40 | elif seq[i] == "G": 41 | rev_seq = rev_seq + "C" 42 | elif seq[i] == "C": 43 | rev_seq = rev_seq + "G" 44 | 45 | # FRAME I Forward 46 | prot_seq_IF = "" 47 | for x in range(0,len(seq),3): 48 | 
codon = seq[x:x+3] 49 | if len(codon) == 3: 50 | aa = codon_table.get(codon) 51 | prot_seq_IF += aa 52 | # FRAME II Forward 53 | prot_seq_IIF = "" 54 | for x in range(1,len(seq),3): 55 | codon = seq[x:x+3] 56 | if len(codon) == 3: 57 | aa = codon_table.get(codon) 58 | prot_seq_IIF += aa 59 | # FRAME III Forward 60 | prot_seq_IIIF = "" 61 | for x in range(2,len(seq),3): 62 | codon = seq[x:x+3] 63 | if len(codon) == 3: 64 | aa = codon_table.get(codon) 65 | prot_seq_IIIF += aa 66 | # FRAME I Reverse 67 | prot_seq_IR = "" 68 | for x in range(0,len(rev_seq),3): 69 | codon = rev_seq[x:x+3] 70 | if len(codon) == 3: 71 | aa = codon_table.get(codon) 72 | prot_seq_IR += aa 73 | # FRAME II Reverse 74 | prot_seq_IIR = "" 75 | for x in range(1,len(rev_seq),3): 76 | codon = rev_seq[x:x+3] 77 | if len(codon) == 3: 78 | aa = codon_table.get(codon) 79 | prot_seq_IIR += aa 80 | # FRAME III Reverse 81 | prot_seq_IIIR = "" 82 | for x in range(2,len(rev_seq),3): 83 | codon = rev_seq[x:x+3] 84 | if len(codon) == 3: 85 | aa = codon_table.get(codon) 86 | prot_seq_IIIR += aa 87 | 88 | def out(prot_seq): 89 | prot_fasta = "" 90 | for i in range(0, len(prot_seq), 70): 91 | prot_fasta += prot_seq[i:i+70]+"\n" 92 | print(prot_fasta) 93 | 94 | print(">5'3' Frame I") 95 | out(prot_seq_IF) 96 | print(">5'3' Frame II") 97 | out(prot_seq_IIF) 98 | print(">5'3' Frame III") 99 | out(prot_seq_IIIF) 100 | print(">3'5' Frame I") 101 | out(prot_seq_IR) 102 | print(">3'5' Frame II") 103 | out(prot_seq_IIR) 104 | print(">3'5' Frame III") 105 | out(prot_seq_IIIR) 106 | 107 | # Usage: python translate.py file.fasta 108 | -------------------------------------------------------------------------------- /supplementary_data/images/dna_helix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajanbit/Bioinfo_py_Scripts/e11b8cb44227893138ea531dc478f9d787cab338/supplementary_data/images/dna_helix.png 
# -------------------------------------------------------------------------------
# /supplementary_data/images/nt_seq_logo.png:
# https://raw.githubusercontent.com/rajanbit/Bioinfo_py_Scripts/e11b8cb44227893138ea531dc478f9d787cab338/supplementary_data/images/nt_seq_logo.png
# -------------------------------------------------------------------------------
# /visualization/dna_fasta_visualization.py:
# -------------------------------------------------------------------------------
''' \
Usage:
python dna_fasta_visualization.py -n '''

import sys
import re

...

# ANSI escape codes used to colour the terminal output
class text():
    red = '\033[31m'
    green = '\033[32m'
    yellow = '\033[33m'
    blue = '\033[34m'
    purple='\033[35m'
    default = '\033[0m'
    black = "\x1b[30m"
    cyan = '\033[36m'
    bg = "\x1b[107m"

inputs = sys.argv
if '-n' not in inputs:
    print (__doc__)
else:
    in_file = inputs[inputs.index('-n') + 1]
    file1 = open(in_file)
    seq_data = file1.readlines()
    header = ""
    seq = ""
    for line in seq_data:
        if line.startswith(">"):
            header += line.strip()
        else:
            seq += line

    # colourise each base with its ANSI code
    f_seq1 = seq.replace("A",(text.red+"A"+text.default))
    f_seq2 = f_seq1.replace("T",(text.blue+"T"+text.default))
    f_seq3 = f_seq2.replace("G",(text.green+"G"+text.default))
    f_seq4 = f_seq3.replace("C",(text.yellow+"C"+text.default))

    print("\n"+text.bg+text.black+header+text.default)
    if len(seq) <= 10000:
        print(f_seq4+"\n")
    else:
        print(text.red+"\n #######################################################\n\
 ## DNA sequence is extremely long for visualization! ##\n\
 #######################################################"+text.default)

    no_A = seq.count("A")
    no_T = seq.count("T")
    no_G = seq.count("G")
    no_C = seq.count("C")

    # RefSeq-style accession, e.g. NC_012920.1
    acc_regex= re.compile(r'\S\S_\d\d\d\d\d\d.\d')
    acc_id = acc_regex.search(header)

    print("\n"+text.bg+text.black+"Sequence Information:"+text.default+"\n")

    print("Accession number: " + text.cyan+acc_id.group()+text.default+"\n")

    list_head = header.split()
    start_ind = 1
    if "complete" in list_head:
        end_ind = list_head.index("complete")
        print("Organism/Origin: "+text.cyan+(" ".join(list_head[start_ind:end_ind]).replace(",",""))+text.default+"\n")

    print("Number of Adenine residues" +text.red+" [A]"+text.default+": "+str(no_A))
    print("Number of Guanine residues" +text.green+" [G]"+text.default+": "+str(no_G))
    print("Number of Cytosine residues" +text.yellow+" [C]"+text.default+": "+str(no_C))
    print("Number of Thymine residues" +text.blue+" [T]"+text.default+": "+str(no_T)+"\n")

    print("DNA Sequence length" +text.purple+" [l]"+text.default+": "+str(len(seq))+"\n")

    gc_per = ((no_G + no_C)/len(seq))*100.0
    at_per = ((no_A + no_T)/len(seq))*100.0

    print("GC content"+text.green+" [G"+text.default+text.yellow+"C]"+text.default+": "+"%.2f" % gc_per+"%")
    print("AT content"+text.red+" [A"+text.default+text.blue+"T]"+text.default+": "+"%.2f" % at_per+"%"+"\n")

    if gc_per >= 60:
        print("The DNA is"+text.green+" [G"+text.default+text.yellow+"C] "+text.default+"rich"+"\n")
    elif at_per >= 60:
        print("The DNA is"+text.red+" [A"+text.default+text.blue+"T] "+text.default+"rich"+"\n")
...

# python dna_fasta_visualization.py -n
# -------------------------------------------------------------------------------
# /visualization/dna_helix_visualizer.py:
# -------------------------------------------------------------------------------
# ANSI escape codes used to colour the helix
class text():
    red = '\033[31m'
    green = '\033[32m'
    yellow = '\033[33m'
    blue = '\033[34m'
    default = '\033[0m'

t_seq = input("Enter the Sequence: ")
c_seq = ""
for i in range(0, len(t_seq)):
    if t_seq[i] == "A":
        c_seq = c_seq + "T"
    elif t_seq[i] == "T":
        c_seq = c_seq + "A"
    elif t_seq[i] == "G":
        c_seq = c_seq + "C"
    elif t_seq[i] == "C":
        c_seq = c_seq + "G"


# spacer/rung widths that trace one helical turn per 8 base pairs
spacer_list = [3,2,1,0,0,1,2,3]
bp_list = [0,2,4,6,6,4,2,0]
data = "\n"
x = 0
l = len(t_seq)
while x != l:

    for y in range(0,len(spacer_list)):
        try:
            data += "\t"+" "*spacer_list[y] + t_seq[x] + "-"*bp_list[y] + c_seq[x] + "\n"
            x+=1
        except:
            # NOTE(review): bare except hides real errors; it only needs to
            # catch IndexError when the sequence runs out mid-turn.
            break

data = data.replace("A",(text.red+"A"+text.default))
data = data.replace("T",(text.blue+"T"+text.default))
data = data.replace("G",(text.green+"G"+text.default))
data = data.replace("C",(text.yellow+"C"+text.default))
print("\n\tDNA Helix\n"+data)

# python dna_helix_visualizer.py
# -------------------------------------------------------------------------------
# /visualization/streamlit_base_comp.py:
# -------------------------------------------------------------------------------
#!/usr/bin/python
#!python

# Importing modules
import streamlit as st
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt

# Page config
st.set_page_config(page_title="Base_Comp")

# Title
st.title("Nucleotide Composition Analysis")

# Subheader
st.subheader("This tool take FASTA sequence as input and generate nucleotide composition table and plots.")

# File uploader
st.header("FASTA File Upload")
uploaded_file = st.file_uploader("Upload a file", type=["fasta"])

# Creating empty dataframe
df = pd.DataFrame(columns=["Accession_IDs","Length","A","T","G","C","GC(%)"])

# Generating base composition table
if uploaded_file is not None:

    # Reading fasta records and filling the dataframe
    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
    data = SeqIO.parse(stringio, "fasta")
    for record in data:
        header, seq = record.id, record.seq
        nucl_dict = {"Accession_IDs":0,"Length":0,"A": 0, "T": 0, "G": 0, "C": 0,"GC(%)":0,}
        # per-base percentages first; the three non-base keys are overwritten below
        for nt in nucl_dict:
            nucl_dict[nt] = str(round((str(seq).count(nt)/len(seq))*100, 2))
        nucl_dict["Accession_IDs"] = header
        nucl_dict["GC(%)"] = gc_fraction(seq)*100
        nucl_dict["Length"] = len(str(seq))
        df = pd.concat([df, pd.DataFrame([nucl_dict])],ignore_index=True)

    # Writing table in app
    df.rename(columns = {"A":"A(%)", "T":"T(%)","G":"G(%)","C":"C(%)"}, inplace = True)
    st.text("Table 1. Nucleotide composition of given sequences.")
    st.write(df)

    # Plotting nucleotide base composition boxplot in app
    st.text("Figure 1. Boxplot showing nucleotide base composition of given sequences.")
    fig, ax = plt.subplots()
    new_df = df[["A(%)","T(%)","G(%)","C(%)"]].astype(float)
    new_df.boxplot(column=["A(%)","T(%)","G(%)","C(%)"], grid=False)
    st.pyplot(fig)

    # Plotting GC composition boxplot in app
    st.text("Figure 2. Boxplot showing GC composition of given sequences.")
    fig, ax = plt.subplots()
    df.boxplot(column=["GC(%)"], grid=False)
    st.pyplot(fig)

    # Plotting length distribution boxplot in app
    st.text("Figure 3. Boxplot showing length distribution of given sequences.")
    fig, ax = plt.subplots()
    new_df = df[["Length"]].astype(float)
    new_df.boxplot(column=["Length"], grid=False)
    st.pyplot(fig)

## Usage: $ streamlit run streamlit_base_comp.py
# -------------------------------------------------------------------------------