├── Results
│   ├── readme
│   ├── workflow.png
│   └── giraffe_logo.png
├── Giraffe_View
│   ├── __init__.py
│   ├── giraffe_run_demo
│   ├── regional_modification.py
│   ├── function.py
│   ├── estimated_read_accuracy.py
│   ├── homopolymer.py
│   ├── observed_read_accuracy.py
│   ├── gc_bias.py
│   ├── giraffe_plot
│   ├── plot.py
│   ├── summary_html.py
│   └── giraffe
├── scripts
│   ├── renormalization_sequencing_bias
│   ├── homopolymer_count
│   └── replot_sequencing_bias
├── LICENSE
├── setup.py
└── README.md
/Results/readme:
--------------------------------------------------------------------------------
1 | The results of the demo dataset!
2 |
--------------------------------------------------------------------------------
/Giraffe_View/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals, absolute_import
2 |
--------------------------------------------------------------------------------
/Results/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lrslab/Giraffe_View/HEAD/Results/workflow.png
--------------------------------------------------------------------------------
/Results/giraffe_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lrslab/Giraffe_View/HEAD/Results/giraffe_logo.png
--------------------------------------------------------------------------------
/scripts/renormalization_sequencing_bias:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import argparse
4 | import pandas as pd
5 |
6 | if __name__ == "__main__":
7 |     parser = argparse.ArgumentParser(description="A script that renormalizes the sequencing depth over a user-specified GC-content range.")
8 |     parser.add_argument("-i", "--input", type=str, metavar="", required=True, help="the GC-bias relationship table produced by giraffe")
9 |     parser.add_argument("-l", "--left", type=int, metavar="", required=True, help="the lower bound of the GC-content range (in percent)")
10 |     parser.add_argument("-r", "--right", type=int, metavar="", required=True, help="the upper bound of the GC-content range (in percent)")
11 |     parser.add_argument("-o", "--out", type=str, metavar="", required=True, help="name of the output file")
12 |     args = parser.parse_args()
13 |
14 |     df = pd.read_csv(args.input, sep=r"\s+")
15 |     nor_df = df[(args.left <= df["GC_content"]) & (df["GC_content"] <= args.right)].copy()
16 |
17 |     ave_dp = nor_df["Depth"].mean()
18 |     nor_df["Normalized_depth"] = nor_df.apply(lambda row: row["Depth"]/ave_dp, axis=1)
19 |     nor_df.to_csv(args.out, sep="\t", index=False, header=True)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Raymond
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="Giraffe_View", 8 | version="0.2.3", 9 | author="Xudong Liu", 10 | author_email="xudongliu98@gmail.com", 11 | description="Giraffe_View is specially designed to provide a comprehensive assessment of the accuracy of long-read sequencing datasets obtained from both the PacBio and Nanopore platforms.", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/lxd98/Giraffe_View", 15 | packages=setuptools.find_packages(), 16 | classifiers=[ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ], 21 | python_requires = '>=3', 22 | install_requires=[ 23 | 'pysam >= 0.17.0', 24 | 'numpy >= 1.7.0', 25 | 'pandas >= 1.5.0', 26 | 'seaborn >= 0.13.2', 27 | 'termcolor >= 2.0.0', 28 | 'biopython >= 1.6.2' 29 | ], 30 | scripts = ["Giraffe_View/giraffe","Giraffe_View/giraffe_run_demo", "Giraffe_View/giraffe_plot", "scripts/homopolymer_count", "scripts/renormalization_sequencing_bias"] 31 | ) 32 | -------------------------------------------------------------------------------- /Giraffe_View/giraffe_run_demo: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from os import system 3 | 4 | # get the table list 5 | system("wget https://figshare.com/ndownloader/files/44967445 -O read.txt") 6 | system("wget https://figshare.com/ndownloader/files/44967442 -O methyl.txt") 7 | system("wget https://figshare.com/ndownloader/files/44967499 -O aligned.txt") 8 | 9 | # get the reference and ONT reads (R10.4.1 and R9.4.1) of E.coli 10 | system("wget https://figshare.com/ndownloader/files/44967436 -O Read.tar.gz") 11 | 12 | # The 5mC methylation files of zebrafish blood and kidney samples. 13 | # The bed file is the gene promoter position in chromosome 1. 
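14 | # Note: after the sed step below, zf_promoter.db becomes a tab-separated target
15 | # file in the four-column layout that regional_modification.py expects
16 | # (CHROM, START, END, ID); an illustrative row (hypothetical coordinates):
17 | #   chr1    12035   13035   promoter_geneA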
14 | system("wget https://figshare.com/ndownloader/files/44967427 -O Methylation.tar.gz") 15 | 16 | system("tar -xzvf Read.tar.gz") 17 | system("tar -xzvf Methylation.tar.gz") 18 | system("sed -i 's/,/\t/g' Methylation/zf_promoter.db") 19 | system("rm Read.tar.gz Methylation.tar.gz") 20 | 21 | system("giraffe --read read.txt --ref Read/ecoli_chrom.fa --cpu 2") 22 | system("giraffe estimate --read read.txt --plot --cpu 2") 23 | system("giraffe observe --read read.txt --ref Read/ecoli_chrom.fa --plot --cpu 2") 24 | system("giraffe observe --aligned aligned.txt --plot --cpu 2") 25 | system("giraffe gcbias --ref Read/ecoli_chrom.fa --aligned aligned.txt --plot --cpu 2") 26 | system("giraffe modbin --methyl methyl.txt --region Methylation/zf_promoter.db --plot --cpu 2") -------------------------------------------------------------------------------- /scripts/homopolymer_count: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import argparse 5 | 6 | if __name__ == '__main__': 7 | parser = argparse.ArgumentParser(description="A script to count the position and type of homopolymer in your reference.") 8 | parser.add_argument("--ref", type=str, metavar="", required=True, help="Input reference (FASTA)") 9 | args = parser.parse_args() 10 | 11 | database = {} 12 | for read in SeqIO.parse(args.ref, "fasta"): 13 | # read.id 14 | # read.seq 15 | database[read.id] = {} 16 | 17 | count = 0 18 | number = 1 19 | tmp = [] 20 | frame = "" 21 | 22 | for base in read.seq: 23 | if len(tmp) == 0 and frame == "": 24 | frame = base.upper() 25 | tmp.append(frame) 26 | count += 1 27 | 28 | else: 29 | if frame == base.upper(): 30 | tmp.append(frame) 31 | count += 1 32 | 33 | else: 34 | if len(tmp) >= 3: 35 | database[read.id][str(number)] = {} 36 | database[read.id][str(number)]["length"] = len(tmp) 37 | database[read.id][str(number)]["position"] = count 38 | database[read.id][str(number)]["base"] = tmp[0] 39 | number += 1 40 | 41 | tmp =[] 42 | frame = base.upper() 43 | tmp.append(frame) 44 | count += 1 45 | 46 | for k in database.keys(): 47 | # ref = string(k) 48 | for n in database[k].keys(): 49 | start = database[k][n]["position"] - database[k][n]["length"] + 1 50 | end = database[k][n]["position"] 51 | basetype = database[k][n]["base"] 52 | feature = str(database[k][n]["length"]) + str(database[k][n]["base"]) 53 | mes = str(k) + "\t" + str(start) + "\t" + str(end) + "\t" + str(basetype) + "\t" + str(feature) 54 | print(mes) 55 | -------------------------------------------------------------------------------- /scripts/replot_sequencing_bias: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from plotnine import * 3 | import argparse 4 | import pandas as pd 5 | import numpy as np 6 | import warnings 7 | warnings.filterwarnings('ignore') 8 | 9 | def plot_GC_bias(input_file, output_name): 10 | df1=pd.read_csv(input_file, sep="\t") 11 | 12 | dif = df1["GC_content"].max() - df1["GC_content"].min() 13 | if dif <= 15: 14 | gc_breaks = [i for i in range(0, 101, 1)] 15 | elif dif <= 30: 16 | gc_breaks = [i for i in range(0, 101, 2)] 17 | elif dif <= 50: 18 | gc_breaks = [i for i in range(0, 101, 5)] 19 | else: 20 | gc_breaks = [i for i in range(0, 101, 10)] 21 | 22 | GC_bias=( 23 | ggplot(df1, aes(x="GC_content", y="Normalized_depth", 24 | group="Group", fill="Group", color="Group")) + 25 | geom_hline(aes(yintercept=1), color="grey", linetype="dotted") + 26 | 
geom_line(size=1.5, alpha=.3) + 27 | geom_point(size=1.5,color="black") + 28 | scale_y_continuous(name="Normalized depth", 29 | limits=[0, 2], breaks=np.arange(0, 2.1, 0.2)) + 30 | theme_classic() + 31 | scale_x_continuous(name="GC content (%)", breaks=gc_breaks) + 32 | theme(axis_text=element_text(size=12, color="black"), 33 | axis_title=element_text(size=12, color="black"), 34 | legend_title = element_blank(), 35 | legend_text = element_text(size=12, color="black"), 36 | legend_position = "bottom" 37 | ) 38 | ) 39 | 40 | GC_bias.save(filename = output_name+".pdf", width=8, height=5, dpi=300, path="./") 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser(description="") 44 | parser.add_argument("-i", "--input", type=str, metavar="", required=True, help="") 45 | parser.add_argument("-o", "--out", type=str, metavar="", required=True, help="") 46 | args = parser.parse_args() 47 | 48 | plot_GC_bias(args.input, args.out) 49 | -------------------------------------------------------------------------------- /Giraffe_View/regional_modification.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import multiprocessing 3 | 4 | def run_regional_methylation(input_methyl, input_target, sample_ID, num_processes): 5 | # Read methylation and target data from the input files 6 | methyl = pd.read_csv(input_methyl, sep='\t', header=None, names=["CHROM", "START", "END", "VALUE"]) 7 | target = pd.read_csv(input_target, sep='\t', header=None, names=["CHROM", "START", "END", "ID"]) 8 | 9 | # Determine if VALUE is in range 0-1 or 0-100 10 | max_value = methyl['VALUE'].max() 11 | value_scale = 100 if max_value > 1 else 1 12 | 13 | # Get the unique chromosomes from the target data 14 | unique_chromosomes = set(target['CHROM']) 15 | 16 | # Create a multiprocessing pool with the specified number of processes 17 | with multiprocessing.Pool(processes=num_processes) as pool: 18 | jobs = [] 19 | for chromosome in unique_chromosomes: 20 | # Filter the methylation and target data for the current chromosome 21 | sub_methyl = methyl[methyl["CHROM"] == chromosome][["START", "END", "VALUE"]] 22 | sub_target = target[target["CHROM"] == chromosome][["START", "END", "ID"]] 23 | # Create an asynchronous job for processing the data 24 | jobs.append(pool.apply_async(regional_methylation_bed_worker, (sub_methyl, sub_target, str(sample_ID), chromosome, value_scale))) 25 | 26 | # Wait for all jobs to complete 27 | for job in jobs: 28 | job.get() 29 | 30 | def regional_methylation_bed_worker(input_methyl, input_target, sample_ID, chromosome, value_scale): 31 | # Open the output file for writing 32 | with open(f"Giraffe_Results/4_Regional_modification/Temp_methy_{sample_ID}_{chromosome}.txt", "w") as ff: 33 | # Iterate over each row in the target data 34 | for row in input_target.itertuples(index=True, name='Pandas'): 35 | start = row.START 36 | end = row.END 37 | target_ID = row.ID 38 | 39 | # Filter the methylation data for the current region 40 | target_data = input_methyl[(start <= input_methyl["START"]) & (input_methyl["END"] <= end)] 41 | 42 | # Calculate the mean methylation value and scale it 43 | mean_methylation = target_data["VALUE"].mean() / value_scale 44 | 45 | # Write the result to the output file 46 | ff.write(f"{target_ID}\t{mean_methylation:.4f}\t{sample_ID}\n") 47 | -------------------------------------------------------------------------------- /Giraffe_View/function.py: 
--------------------------------------------------------------------------------
1 | import subprocess
2 | import re
3 | import os
4 | from termcolor import colored
5 | from subprocess import Popen, PIPE
6 | import pandas as pd
7 |
8 | def print_with_color(input_string):
9 |     print(colored(input_string, "green"))
10 |
11 | def error_with_color(input_string):
12 |     print(colored(input_string, "red"))
13 |
14 | def loading_dataset(input_file):
15 |     dataset = {}
16 |     with open(input_file) as ff:
17 |         for l in ff:
18 |             l = l.replace("\n", "")
19 |             l = l.split()
20 |
21 |             # check that the data file exists
22 |             if os.path.exists(l[2]):
23 |                 dataset[l[0]] = {}
24 |                 dataset[l[0]]["type"] = l[1]
25 |                 dataset[l[0]]["path"] = l[2]
26 |
27 |             else:
28 |                 error_with_color("Please check the path of " + str(l[0]) + "!")
29 |
30 |     return dataset
31 |
32 | def cmd_shell(commands, string):
33 |     process = Popen(commands.split(' '), stdout=subprocess.DEVNULL, universal_newlines=True)
34 |     process.wait()
35 |     err = process.communicate()
36 |
37 |     if process.returncode == 0:
38 |         # print('{} SUCCESS'.format(string))
39 |         pass
40 |     else:
41 |         # print('{} FAILED'.format(string))
42 |         error_with_color(err)
43 |
44 | def mkdir_d(input_name):
45 |     mes = "Giraffe_Results/" + str(input_name)
46 |     cmd = ["mkdir", "-p", str(mes)]
47 |     subprocess.run(cmd, check=True)
48 |
49 | def count_indel_and_snv(align_str):
50 |     counts = {}
51 |     for i in align_str:
52 |         counts[i] = counts.get(i, 0) + 1
53 |     return counts
54 |
55 |
56 | def bam2fastq(input_bam, CPU):
57 |     with open("bam2fq.sh", "w") as ff:
58 |         ff.write("samtools fastq " + str(input_bam) + " -@ " + str(CPU) + " > giraffe_tmp.fastq")
59 |     ff.close()
60 |
61 | # remove the insertions (I) at the tail of the string
62 | def remove_I(string):
63 |     while string[-1] == "I":
64 |         string = string[:-1]
65 |     return(string)
66 |
67 | # remove soft (S) and hard (H) clips from the CIGAR and return the matched pairs
68 | def remove_clip_list(input_cigar, input_pairs, input_ID):
69 |     remove_cigarstring = re.findall(r"\d+[SH]", input_cigar)
70 |     # HH & 0H & H0 & 00
71 |     if ((len(remove_cigarstring) == 2) and (remove_cigarstring[0][-1] == remove_cigarstring[1][-1] == "H")) or ((len(remove_cigarstring) == 1) and (remove_cigarstring[-1][-1] == "H")) or (len(remove_cigarstring) == 0):
72 |         valid_pairs = input_pairs
73 |     # SS
74 |     elif (len(remove_cigarstring) == 2) and (remove_cigarstring[0][-1] == remove_cigarstring[1][-1] == "S"):
75 |         remove_start_site = int(remove_cigarstring[0][:-1])
76 |         tmp_pairs = input_pairs[remove_start_site:]
77 |         remove_end_site = int(remove_cigarstring[1][:-1])
78 |         valid_pairs = tmp_pairs[:len(tmp_pairs)-remove_end_site]
79 |     # 0S & HS
80 |     elif ((len(remove_cigarstring) == 1) and (input_cigar[-1] == "S")) or (len(remove_cigarstring) == 2) and (remove_cigarstring[0][-1] == "H") and ((remove_cigarstring[1][-1] == "S")):
81 |         remove_end_site = int(remove_cigarstring[-1][:-1])
82 |         valid_pairs = input_pairs[:len(input_pairs)-remove_end_site]
83 |     # S0 & SH
84 |     elif (len(remove_cigarstring) == 1) and (input_cigar[-1] != "S") or ((len(remove_cigarstring) == 2) and (remove_cigarstring[0][-1] == "S") and (remove_cigarstring[1][-1] == "H")):
85 |         remove_start_site = int(remove_cigarstring[0][:-1])
86 |         valid_pairs = input_pairs[remove_start_site:]
87 |     else:
88 |         print(str(input_ID) + ", please recheck this CIGAR and MD!")
89 |     return(valid_pairs)
90 |
91 | """
92 | only for base A T G C
93 | (read_position, ref_position, "ref_base")
94 | none    √       √               Deletion(D)
95 | √       none    none            Insertion(I)
96 | √       √       N(A,T,G,C)      Match(M)
97
| √ √ n(a,t,g,c) Substitution(S) 98 | """ 99 | def get_base_alignment(input_list): 100 | map_list = ["A", "T", "G", "C"] 101 | result = "" 102 | if input_list[0] == None: 103 | result = "D" # D = deletion 104 | else: 105 | if input_list[1] == None: 106 | result = "I" # I = insertion 107 | else: 108 | if input_list[2] in map_list: 109 | result = "M" # M = match 110 | else: 111 | result = "S" # S = substitution 112 | return result 113 | 114 | def process_in_chunks(file_path, chunk_size=10000): 115 | chunks = pd.read_csv(file_path, chunksize=chunk_size, sep="\t") 116 | results = [] 117 | for chunk in chunks: 118 | results.append(chunk) 119 | return pd.concat(results) -------------------------------------------------------------------------------- /Giraffe_View/estimated_read_accuracy.py: -------------------------------------------------------------------------------- 1 | from os import system 2 | import numpy as np 3 | import math 4 | import multiprocessing 5 | import gzip 6 | 7 | def GC_content(string): 8 | read = str(string).upper() 9 | length = len(read) 10 | c = read.count("C") 11 | g = read.count("G") 12 | GC = (c+g)/length 13 | return[length, GC] 14 | 15 | def Qvalue_to_accuracy(string): 16 | error_list = [] 17 | for base_value in string: 18 | ascii_value = ord(base_value) - 33 19 | error_proporation = math.pow(10, (-1) * int(ascii_value) / 10) 20 | error_list.append(error_proporation) 21 | error_mean = np.mean(error_list) 22 | return [1 - error_mean, error_mean, (-10) * math.log10(error_mean)] 23 | 24 | def process_chunk(chunk): 25 | results = [] 26 | for line in chunk: 27 | read_id, sequence, quality = line 28 | GC = GC_content(sequence) 29 | quality = Qvalue_to_accuracy(quality) 30 | results.append([read_id, quality[0], quality[1], quality[2], GC[0], GC[1]]) 31 | return results 32 | 33 | def calculate_estimated_accuracy(input_type, input_file, num_processes, chunk_size=1000): 34 | pool = multiprocessing.Pool(processes=num_processes) 35 | results = [] 36 | whether_compressed = "" 37 | 38 | 39 | # judge whether input file is compressed (.gz) or not 40 | if input_file.endswith('.gz'): 41 | open_func = gzip.open 42 | whether_compressed = "yes" 43 | 44 | else: 45 | open_func = open 46 | whether_compressed = "no" 47 | 48 | with open_func(input_file, "r") as input_file: 49 | count = 1 50 | chunk = [] 51 | for line in input_file: 52 | line = line.strip() 53 | 54 | if whether_compressed == "yes": 55 | line = line.decode('ascii') 56 | 57 | if count % 4 == 1: 58 | read_id = line.split(" ")[0] 59 | elif count % 4 == 2: 60 | sequence = line 61 | elif count % 4 == 0: 62 | chunk.append((read_id, sequence, line)) 63 | 64 | count += 1 65 | 66 | if len(chunk) == chunk_size: 67 | results.append(pool.apply_async(process_chunk, (chunk,))) 68 | chunk = [] 69 | 70 | if len(chunk) > 0: 71 | results.append(pool.apply_async(process_chunk, (chunk,))) 72 | pool.close() 73 | pool.join() 74 | input_file.close() 75 | 76 | file = "Giraffe_Results/1_Estimated_quality/" + str(input_type) + ".tmp" 77 | with open(file, "w") as output_file: 78 | for result in results: 79 | for line in result.get(): 80 | message = f"{line[0]}\t{line[1]:.4f}\t{line[2]:.4f}\t{line[3]:.4f}" 81 | message += f"\t{line[4]}\t{line[5]:.4f}\t{input_type}" 82 | output_file.write(message + "\n") 83 | output_file.close() 84 | output_file.close() 85 | 86 | def process_chunk_slow(queue, output_files, input_type): 87 | while True: 88 | chunk = queue.get() 89 | if chunk is None: 90 | break 91 | output_file = output_files[chunk['process_id']] 92 | with 
open(output_file, "a") as f: 93 | for line in chunk['data']: 94 | read_id, sequence, quality = line 95 | GC = GC_content(sequence) 96 | quality = Qvalue_to_accuracy(quality) 97 | message = (f"{read_id}\t{quality[0]:.4f}\t{quality[1]:.4f}\t{quality[2]:.4f}\t" 98 | f"{GC[0]}\t{GC[1]:.4f}\t{input_type}") 99 | f.write(message + "\n") 100 | 101 | def calculate_estimated_accuracy_slow(input_type, input_file, num_processes, chunk_size=1000): 102 | output_dir = "Giraffe_Results/1_Estimated_quality/" 103 | output_files = [f"{output_dir}{input_type}_{i}.tmp" for i in range(num_processes)] 104 | queue = multiprocessing.Queue(maxsize=num_processes * 2) 105 | 106 | if input_file.endswith('.gz'): 107 | open_func = gzip.open 108 | else: 109 | open_func = open 110 | 111 | workers = [] 112 | for process_id in range(num_processes): 113 | worker = multiprocessing.Process(target=process_chunk_slow, args=(queue, output_files, input_type)) 114 | worker.start() 115 | workers.append(worker) 116 | 117 | with open_func(input_file, "rt") as input_file_handle: 118 | chunk = [] 119 | chunk_counter = 0 120 | for count, line in enumerate(input_file_handle, 1): 121 | line = line.strip() 122 | if count % 4 == 1: 123 | read_id = line.split(" ")[0] 124 | elif count % 4 == 2: 125 | sequence = line 126 | elif count % 4 == 0: 127 | chunk.append((read_id, sequence, line)) 128 | 129 | if len(chunk) == chunk_size: 130 | queue.put({'process_id': chunk_counter % num_processes, 'data': chunk}) 131 | chunk = [] 132 | chunk_counter += 1 133 | 134 | if chunk: 135 | queue.put({'process_id': chunk_counter % num_processes, 'data': chunk}) 136 | 137 | for _ in range(num_processes): 138 | queue.put(None) 139 | 140 | for worker in workers: 141 | worker.join() 142 | 143 | def merge_results(): 144 | with open("Giraffe_Results/1_Estimated_quality/header", "a") as ff: 145 | ff.write("ReadID\tAccuracy\tError\tQ_value\tLength\tGC_content\tGroup\n") 146 | ff.close() 147 | system("cat Giraffe_Results/1_Estimated_quality/header \ 148 | Giraffe_Results/1_Estimated_quality/*tmp > \ 149 | Giraffe_Results/1_Estimated_quality/Estimated_information.txt") 150 | system("rm Giraffe_Results/1_Estimated_quality/*tmp Giraffe_Results/1_Estimated_quality/header") 151 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Giraffe 2 | ![PyPI](https://img.shields.io/pypi/v/Giraffe-View?color=green) ![License](https://img.shields.io/pypi/l/nanoCEM?color=orange) 3 | 4 | **Giraffe** is specially designed to provide a comprehensive assessment of the accuracy of long-read sequencing datasets obtained from both the Pacific Biosciences (PacBio) and Oxford Nanopore Technologies (ONT) platforms, offering four distinct functions. 5 | 6 | 7 | 8 | `estimate` Calculation of estimated read accuracy (Q score), length, and GC content. 9 | 10 | `observe` Calculation of observed read accuracy, mismatch proportion, and homopolymer identification (e.g. AAAA). 11 | 12 | `gcbias` Calculation of the relationship between GC content and sequencing depth. 13 | 14 | `modbin` Calculation of the distribution of modification (e.g. 5mC or 6mA methylation) at the regional level. 
15 |
16 |
17 |
18 | # Installation
19 |
20 | ## Installation by [Conda](https://conda.io/projects/conda/en/latest/index.html)
21 |
22 | ```shell
23 | # install in the current environment
24 | conda install -c raymond_liu giraffe_view -y
25 |
26 | # install in a new environment
27 | conda create -n giraffe -c raymond_liu giraffe_view -y
28 | ```
29 |
30 |
31 |
32 | ## Installation by [PyPI](https://pypi.org/)
33 |
34 | Before using this tool, you need to install additional dependencies for read processing: [samtools](https://www.htslib.org/), [minimap2](https://github.com/lh3/minimap2), and [bedtools](https://github.com/arq5x/bedtools2). The following commands install both the software package and its dependencies.
35 |
36 | ```shell
37 | # Tested versions
38 | # samtools 1.17
39 | # minimap2 2.17-r941
40 | # bedtools 2.30.0
41 |
42 | # install in the current environment
43 | conda install -c bioconda -c conda-forge samtools minimap2 bedtools -y
44 |
45 | # install in a new environment
46 | conda create -n giraffe -c bioconda -c conda-forge python==3.9 samtools==1.17 minimap2==2.17 bedtools==2.30.0 -y && conda activate giraffe
47 | ```
48 |
49 | To install this tool, please use the following command.
50 | ```shell
51 | pip install Giraffe-View
52 | ```
53 |
54 |
55 |
56 | # Quick usage
57 |
58 | **Giraffe** can be run with a one-button command or by executing individual functions.
59 |
60 | ## ONE-button pattern
61 |
62 | ```shell
63 | # Run the "estimate", "observe", and "gcbias" functions with FASTQ files
64 | giraffe --read <read table> --ref <reference FASTA> --cpu <number of CPUs>
65 |
66 | # Run the "estimate", "observe", and "gcbias" functions with unaligned SAM/BAM files
67 | giraffe --unaligned <read table> --ref <reference FASTA> --cpu <number of CPUs>
68 |
69 | # Example for input table (sample_ID data_type file_path)
70 | sample_A ONT /home/user/data/S1.fastq
71 | sample_B ONT /home/user/data/S2.fastq
72 | sample_C ONT /home/user/data/S3.fastq
73 | ...
74 | ```
75 |
76 | Here the data_type can be ONT DNA reads (ONT), ONT direct RNA sequencing reads (ONT_RNA), or PacBio DNA reads (Pacbio).
77 |
78 |
79 |
80 | ## Estimate function
81 |
82 | ```shell
83 | # For FASTQ reads
84 | giraffe estimate --read <read table>
85 |
86 | # For unaligned SAM/BAM files
87 | giraffe estimate --unaligned <read table>
88 | ```
89 |
90 |
91 |
92 | ## Observe function
93 |
94 | ```shell
95 | # For FASTQ reads
96 | giraffe observe --read <read table> --ref <reference FASTA>
97 |
98 | # For unaligned SAM/BAM files
99 | giraffe observe --unaligned <read table> --ref <reference FASTA>
100 |
101 | # For aligned SAM/BAM files
102 | giraffe observe --aligned <aligned read table>
103 | ```
104 |
105 | **Note:** If you use aligned SAM/BAM files as input, please discard secondary alignments (**--secondary=no**) and add the MD tag (**--MD**) when mapping.
106 |
107 |
108 |
109 | ## GCbias function
110 |
111 | ```shell
112 | giraffe gcbias --ref <reference FASTA> --aligned <aligned read table>
113 | ```
114 |
115 |
116 |
117 | ## Modbin function
118 |
119 | ```shell
120 | giraffe modbin --methyl <methylation table> --region <target region file>
121 |
122 | # Example for methylation file (Chrom Start End Value):
123 | contig_A 132 133 0.92
124 | contig_A 255 256 0.27
125 | contig_A 954 955 0.52
126 | ...
127 | ```
128 |
129 |
130 |
131 | # Example
132 |
133 | Here, we provide demo datasets for testing **Giraffe**. The following command downloads them and runs the demo.
134 |
135 | ```shell
136 | giraffe_run_demo
137 | ```
138 |
139 | The demo data comprise an *E. coli* set (a 4.2 MB reference, 79 MB of R10.4.1 reads, and 121 MB of R9.4.1 reads) plus two 5mC methylation files from zebrafish blood (23 MB) and kidney (19 KB). The demo takes about 7 minutes and 20 seconds with a maximum memory of 391 MB, and exercises both the one-command pattern and the four individual functions.
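140 |
141 | Once the demo finishes, the main result tables land under `Giraffe_Results/`. The sketch below is assembled from the output paths and column headers in the package source; exact contents depend on the run.
142 |
143 | ```shell
144 | Giraffe_Results/
145 | ├── 1_Estimated_quality/Estimated_information.txt  # ReadID, Accuracy, Error, Q_value, Length, GC_content, Group
146 | ├── 2_Observed_quality/Observed_information.txt    # ID, Ins, Del, Sub, Mat, Iden, Acc, Group
147 | ├── 2_Observed_quality/Homoploymer_summary.txt     # Base, Accuracy, Group
148 | ├── 3_GC_bias/Relationship_normalization.txt       # GC_content, Depth, Number, Group, Normalized_depth
149 | └── 4_Regional_modification/                       # per-region mean modification values
150 | ```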
151 |
152 |
153 |
154 | # Tool showcase
155 |
156 | The one-command pattern generates a summary in [HTML](https://lxd98.github.io/giraffe.github.io) format. If the scale of the X/Y-axis is not reasonable, the `giraffe_plot` script can be used to replot the figures.
157 |
158 | # Documentation
159 |
160 | For more details about the usage of Giraffe and the profiling of its results, please refer to the [documentation](https://giraffe-documentation.readthedocs.io/en/latest).
161 |
162 | # Citation
163 |
164 | Liu, X., Shao, Y., Guo, Z., Ni, Y., Sun, X., Leung, A. Y. H., & Li, R. (2024). Giraffe: A tool for comprehensive processing and visualization of multiple long-read sequencing data. *Computational and Structural Biotechnology Journal, 23,* 3241-3246. https://doi.org/10.1016/j.csbj.2024.08.003
--------------------------------------------------------------------------------
/Giraffe_View/homopolymer.py:
--------------------------------------------------------------------------------
1 | from os import system
2 | import pandas as pd
3 | import pysam
4 | import re
5 | from Giraffe_View.function import *
6 | import multiprocessing
7 |
8 | def homopolymer_summary_1(input_file, sample_ID, chromosome):
9 |     data = {}
10 |     with open(input_file) as ff:
11 |         for line in ff:
12 |             line = line.replace("\n", "")
13 |             line = line.split("\t")
14 |             position = line[0] + "_" + line[1] + "_" + line[2]
15 |
16 |             if position not in data.keys():
17 |                 data[position] = {}
18 |                 data[position]["type"] = str(line[3]) + line[4]
19 |                 data[position]["depth"] = 1
20 |                 data[position]["mat"] = 0
21 |
22 |                 if int(line[3]) == int(line[5]):
23 |                     data[position]["mat"] += 1
24 |
25 |             elif position in data.keys():
26 |                 data[position]["depth"] += 1
27 |                 if int(line[3]) == int(line[5]):
28 |                     data[position]["mat"] += 1
29 |     ff.close()
30 |
31 |     output_1 = f"Giraffe_Results/2_Observed_quality/{sample_ID}_homopolymer_in_reference_{chromosome}.txt"
32 |     with open(output_1, "w") as ff:
33 |         # ff.write("pos\tnum_of_mat\tdepth\ttype\tGroup\n")
34 |         for i in data.keys():
35 |             mes = str(i) + "\t" + str(data[i]["mat"]) + "\t" + str(data[i]["depth"]) + "\t" + data[i]["type"]
36 |             ff.write(mes + "\t" + str(sample_ID) + "\n")
37 |     ff.close()
38 |
39 | def homopolymer_summary_2(sample_ID):
40 |     input_file = "Giraffe_Results/2_Observed_quality/" + str(sample_ID) + ".homopolymer_in_reference.txt"
41 |     output_file = "Giraffe_Results/2_Observed_quality/" + str(sample_ID) + ".homo_tmp"
42 |     data = pd.read_table(input_file, sep="\t")
43 |     valid = data[data["depth"] >= 3].copy()
44 |
45 |     if len(valid) != 0:
46 |         ff = open(output_file, "w")
47 |         valid["rate"] = valid["num_of_mat"] / valid["depth"]
48 |
49 |         def Abase(x):
50 |             if re.search(".*A", x):
51 |                 return(True)
52 |             else:
53 |                 return(False)
54 |
55 |         def Tbase(x):
56 |             if re.search(".*T", x):
57 |                 return(True)
58 |             else:
59 |                 return(False)
60 |
61 |         def Gbase(x):
62 |             if re.search(".*G", x):
63 |                 return(True)
64 |             else:
65 |                 return(False)
66 |
67 |         def Cbase(x):
68 |             if re.search(".*C", x):
69 |                 return(True)
70 |             else:
71 |                 return(False)
72 |
73 |         T_acc = valid[valid["type"].apply(Tbase)]["rate"].mean()
74 |         G_acc = valid[valid["type"].apply(Gbase)]["rate"].mean()
75 |         C_acc = valid[valid["type"].apply(Cbase)]["rate"].mean()
76 |         A_acc =
valid[valid["type"].apply(Abase)]["rate"].mean() 77 | 78 | ff.write(f"A\t{A_acc:.4f}\t{sample_ID}\n") 79 | ff.write(f"T\t{T_acc:.4f}\t{sample_ID}\n") 80 | ff.write(f"C\t{C_acc:.4f}\t{sample_ID}\n") 81 | ff.write(f"G\t{G_acc:.4f}\t{sample_ID}\n") 82 | ff.close() 83 | else: 84 | error_with_color("The read coverage of data was too shallow to conduct the homopolymer analysis!!!") 85 | 86 | def merge_results_observed_homopolymer(): 87 | with open("Giraffe_Results/2_Observed_quality/header", "a") as ff: 88 | ff.write("Base\tAccuracy\tGroup\n") 89 | ff.close() 90 | 91 | system("cat Giraffe_Results/2_Observed_quality/header \ 92 | Giraffe_Results/2_Observed_quality/*.homo_tmp > \ 93 | Giraffe_Results/2_Observed_quality/Homoploymer_summary.txt") 94 | 95 | system("rm Giraffe_Results/2_Observed_quality/*homo_tmp \ 96 | Giraffe_Results/2_Observed_quality/header" 97 | ) 98 | 99 | def homopolymer_from_bam_worker(input_bamfile, sample_ID, chromosome): 100 | bamfile = pysam.AlignmentFile(input_bamfile, "rb") 101 | output = f"Giraffe_Results/2_Observed_quality/{sample_ID}_homopolymer_detail_{chromosome}.txt" 102 | 103 | with open(output, "w") as ff: 104 | for read in bamfile.fetch(chromosome): 105 | read_ID = read.query_name 106 | read_pair = read.get_aligned_pairs(matches_only=False, with_seq=True) 107 | read_cigar = read.cigarstring 108 | read_ref_id = read.reference_name 109 | read_valid_pair = remove_clip_list(read_cigar, read_pair, read_ID) 110 | 111 | homopolymer_ref = "" 112 | homopolymer_read = "" 113 | homopolymer_ref_pos = [] 114 | count = 1 115 | 116 | for base in read_valid_pair: 117 | base_alignment = get_base_alignment(base) 118 | if homopolymer_ref == "": 119 | if base_alignment != "I": 120 | homopolymer_ref = str(base[2]).upper() 121 | homopolymer_read = str(base_alignment) 122 | homopolymer_ref_pos.append(base[1]) 123 | else: 124 | if base[2] is None: 125 | homopolymer_read += str(base_alignment) 126 | else: 127 | base_ref = str(base[2]).upper() 128 | if base_ref == homopolymer_ref[0]: 129 | homopolymer_ref += base_ref 130 | homopolymer_read += str(base_alignment) 131 | homopolymer_ref_pos.append(base[1]) 132 | else: 133 | if len(homopolymer_ref) >= 4: 134 | homopolymer_ref_pos = [ 135 | str(len(homopolymer_ref)) + homopolymer_ref[0], 136 | str(remove_I(homopolymer_read)), 137 | str(read_ref_id), 138 | *homopolymer_ref_pos 139 | ] 140 | stat_info = count_indel_and_snv(homopolymer_ref_pos[1]) 141 | stat_info = {k: stat_info.get(k, 0) for k in ['M', 'D', 'S', 'I']} 142 | 143 | mes = ( 144 | f"{homopolymer_ref_pos[2]}\t{homopolymer_ref_pos[3]}\t{homopolymer_ref_pos[-1]}\t" 145 | f"{homopolymer_ref_pos[0][:-1]}\t{homopolymer_ref_pos[0][-1]}\t" 146 | f"{stat_info['M']}\t{stat_info['D']}\t{stat_info['I']}\t{stat_info['S']}\t" 147 | f"{read_ID}\t{sample_ID}") 148 | 149 | ff.write(mes + "\n") 150 | count += 1 151 | 152 | homopolymer_ref = base_ref 153 | homopolymer_read = str(base_alignment) 154 | homopolymer_ref_pos = [base[1]] 155 | bamfile.close() 156 | ff.close() 157 | homopolymer_summary_1(output, sample_ID, chromosome) 158 | 159 | def run_homopolymer_from_bam(input_bamfile, sample_ID, num_processes=10): 160 | bamfile = pysam.AlignmentFile(input_bamfile, "rb") 161 | chromosomes = bamfile.references 162 | 163 | with multiprocessing.Pool(processes=num_processes) as pool: 164 | jobs = [] 165 | for chromosome in chromosomes: 166 | jobs.append(pool.apply_async(homopolymer_from_bam_worker, (input_bamfile, sample_ID, chromosome))) 167 | 168 | for job in jobs: 169 | job.get() 170 | 
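# A minimal usage sketch (hypothetical file names): the BAM must be sorted and
# indexed, mapped with the MD tag, and Giraffe_Results/2_Observed_quality/ must
# already exist (e.g. created via mkdir_d from Giraffe_View.function):
#
#   from Giraffe_View.homopolymer import run_homopolymer_from_bam
#   run_homopolymer_from_bam("sample_A.bam", "sample_A", num_processes=4)
#
# This writes {sample}_homopolymer_detail_{chrom}.txt and, via
# homopolymer_summary_1, {sample}_homopolymer_in_reference_{chrom}.txt for each
# chromosome under Giraffe_Results/2_Observed_quality/.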
-------------------------------------------------------------------------------- /Giraffe_View/observed_read_accuracy.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import re 3 | from os import system 4 | from Giraffe_View.function import * 5 | import multiprocessing 6 | 7 | def data_process(sample_ID, data_type, data_path, ref, threads=10): 8 | output = "Giraffe_Results/2_Observed_quality/" + str(sample_ID) + ".bam" 9 | if data_type == "ONT": 10 | cmd1 = ["minimap2", "-ax", "map-ont", "-o", "Giraffe_Results/2_Observed_quality/tmp.sam", "--MD", \ 11 | "--secondary=no", "-L", "-t", str(threads), ref, data_path] 12 | 13 | elif data_type == "ONT_RNA": 14 | cmd1 = ["minimap2", "-ax", "splice", "-uf", "-k14", "-o", "Giraffe_Results/2_Observed_quality/tmp.sam", "--MD", \ 15 | "--secondary=no", "-L", "-t", str(threads), ref, data_path] 16 | 17 | elif data_type == "Pacbio": 18 | cmd1 = ["minimap2", "-ax", "map-pb", "-o", "Giraffe_Results/2_Observed_quality/tmp.sam", "--MD", \ 19 | "--secondary=no", "-L", "-t", str(threads), ref, data_path] 20 | 21 | else: 22 | error_with_color("Please check your data type!!! [ONT, Pacbio, ONT_RNA]") 23 | 24 | cmd2 = ["samtools", "view", "-bS", "-F4", "-@", str(threads), "-o", "Giraffe_Results/2_Observed_quality/tmp.bam", "Giraffe_Results/2_Observed_quality/tmp.sam"] 25 | cmd3 = ["samtools", "sort", "-@", str(threads), "-o", output, "Giraffe_Results/2_Observed_quality/tmp.bam"] 26 | cmd4 = ["samtools", "index", "-@", str(threads), output] 27 | cmd5 = ["rm", "-rf", "Giraffe_Results/2_Observed_quality/tmp.bam", "Giraffe_Results/2_Observed_quality/tmp.sam"] 28 | 29 | # Run each command and check the return code 30 | for i, cmd in enumerate([cmd1, cmd2, cmd3, cmd4, cmd5]): 31 | try: 32 | subprocess.run(cmd, check=True) 33 | # print("Command {} succeeded".format(i + 1)) 34 | except subprocess.CalledProcessError as e: 35 | print("Command {} failed with error code {}".format(i + 1, e.returncode)) 36 | print(e.output) 37 | # Raise an exception to indicate that processing failed 38 | raise Exception("Data processing failed") 39 | 40 | def identify_match(cigar): 41 | # Identifies the number of matching bases in a read from its CIGAR string. 42 | cigar_mat = re.findall(r"\d+M", cigar) 43 | base_num_mat = sum(int(i[:-1]) for i in cigar_mat) 44 | return base_num_mat 45 | 46 | def identify_insertion(cigar): 47 | # Identifies the number of inserted bases in a read from its CIGAR string. 48 | cigar_ins = re.findall(r"\d+I", cigar) 49 | base_num_ins = sum(int(i[:-1]) for i in cigar_ins) 50 | return base_num_ins 51 | 52 | def identify_deletion(cigar): 53 | # Identifies the number of deleted bases in a read from its CIGAR string. 54 | cigar_del = re.findall(r"\d+D", cigar) 55 | base_num_del = sum(int(i[:-1]) for i in cigar_del) 56 | return base_num_del 57 | 58 | def identify_substitution(md): 59 | # Identifies the number of substitutions in a read from its MD tag. 
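# e.g. for MD:Z:10A5^AC6 this counts one substitution (the "10A"); the deleted
# bases after "^" are not preceded by digits, so r"\d+[ATCG]" skips them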
60 | return len(re.findall(r"\d+[ATCG]", md)) 61 | 62 | def merge_results_observed_acc(): 63 | with open("Giraffe_Results/2_Observed_quality/header", "a") as ff: 64 | ff.write("ID\tIns\tDel\tSub\tMat\tIden\tAcc\tGroup\n") 65 | ff.close() 66 | 67 | system("cat Giraffe_Results/2_Observed_quality/header \ 68 | Giraffe_Results/2_Observed_quality/*_primary_* \ 69 | Giraffe_Results/2_Observed_quality/*_supplementary.total.txt > \ 70 | Giraffe_Results/2_Observed_quality/Observed_information.txt") 71 | 72 | system("rm Giraffe_Results/2_Observed_quality/*_primary_* \ 73 | Giraffe_Results/2_Observed_quality/header \ 74 | Giraffe_Results/2_Observed_quality/*_supplementary.total.txt") 75 | 76 | def observed_accuracy_worker(bam_file, sample_ID, chromosome): 77 | output_1 = f"Giraffe_Results/2_Observed_quality/{sample_ID}_primary_{chromosome}.txt" 78 | output_2 = f"Giraffe_Results/2_Observed_quality/{sample_ID}_supplementary_{chromosome}.txt" 79 | bamfile= pysam.AlignmentFile(bam_file, 'rb') 80 | 81 | with open(output_1, "w") as pri_f: 82 | with open(output_2, "w") as sup_f: 83 | for read in bamfile.fetch(chromosome): 84 | # filter the unmapped reads 85 | if read.flag == 4: 86 | continue 87 | else: 88 | read_ID = read.query_name 89 | read_cigar = read.cigarstring 90 | read_md = read.get_tag("MD") 91 | 92 | # count the number of matched and mismatched base 93 | Ins = identify_insertion(read_cigar) 94 | Del = identify_deletion(read_cigar) 95 | Sub = identify_substitution(read_md) 96 | Mat = identify_match(read_cigar) - Sub 97 | 98 | # check the presence of supplementary reads 99 | if read.has_tag("SA"): 100 | sup_f.write(f"{read_ID}\t{Ins}\t{Del}\t{Sub}\t{Mat}\t{sample_ID}\n") 101 | else: 102 | # calculate the observed accuracy and identification 103 | total = Ins + Del + Sub + Mat 104 | Acc = Mat / total if total > 0 else 0 105 | Iden = Mat / (Mat + Sub) if (Mat + Sub) > 0 else 0 106 | pri_f.write(f"{read_ID}\t{Ins}\t{Del}\t{Sub}\t{Mat}\t{Iden:.4f}\t{Acc:.4f}\t{sample_ID}\n") 107 | pri_f.close() 108 | sup_f.close() 109 | bamfile.close() 110 | 111 | def run_observed_accuracy(input_bamfile, sample_ID, num_processes=10): 112 | bamfile = pysam.AlignmentFile(input_bamfile, "rb") 113 | chromosomes = bamfile.references 114 | 115 | with multiprocessing.Pool(processes=num_processes) as pool: 116 | jobs = [] 117 | for chromosome in chromosomes: 118 | jobs.append(pool.apply_async(observed_accuracy_worker, (input_bamfile, sample_ID, chromosome))) 119 | 120 | for job in jobs: 121 | job.get() 122 | 123 | def supplementary_read_processing(sample_ID): 124 | total = {} 125 | 126 | with open("Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt", "r") as ff: 127 | for line in ff.readlines(): 128 | line = line.rstrip("\n") 129 | data = line.split("\t") 130 | 131 | read_ID = str(data[0]) 132 | Ins = int(data[1]) 133 | Del = int(data[2]) 134 | Sub = int(data[3]) 135 | Mat = int(data[4]) 136 | sample_ID = str(data[5]) 137 | 138 | if read_ID not in total: 139 | total[read_ID] = {} 140 | total[read_ID]["Ins"] = Ins 141 | total[read_ID]["Del"] = Del 142 | total[read_ID]["Sub"] = Sub 143 | total[read_ID]["Mat"] = Mat 144 | total[read_ID]["sample"] = sample_ID 145 | else: 146 | total[read_ID]["Ins"] += Ins 147 | total[read_ID]["Del"] += Del 148 | total[read_ID]["Sub"] += Sub 149 | total[read_ID]["Mat"] += Mat 150 | ff.close() 151 | 152 | output = f"Giraffe_Results/2_Observed_quality/{sample_ID}_supplementary.total.txt" 153 | with open(output, "w") as ff: 154 | for read_key, read_data in total.items(): 155 | read_ID = 
read_key 156 | Ins = read_data["Ins"] 157 | Del = read_data["Del"] 158 | Sub = read_data["Sub"] 159 | Mat = read_data["Mat"] 160 | sample_ID = read_data["sample"] 161 | 162 | All = Ins + Del + Sub + Mat 163 | Acc = Mat / All if All > 0 else 0 164 | Iden = Mat / (Mat + Sub) if (Mat + Sub) > 0 else 0 165 | ff.write(f"{read_ID}\t{Ins}\t{Del}\t{Sub}\t{Mat}\t{Iden:.4f}\t{Acc:.4f}\t{sample_ID}\n") 166 | ff.close() -------------------------------------------------------------------------------- /Giraffe_View/gc_bias.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os import system 3 | import pandas as pd 4 | from Giraffe_View.function import cmd_shell 5 | import multiprocessing 6 | 7 | def classify_by_chromosome(input_file): 8 | classified_lines = {} 9 | 10 | # Read the input file and classify the lines 11 | with open(input_file, 'r') as file: 12 | for line in file: 13 | # Split the line by tab 14 | fields = line.strip().split('\t') 15 | first_field = fields[0] 16 | 17 | # Add the line to the appropriate list in the dictionary 18 | if first_field not in classified_lines: 19 | classified_lines[first_field] = [] 20 | classified_lines[first_field].append(line) 21 | 22 | # Write the classified lines to separate output files 23 | for key, lines in classified_lines.items(): 24 | output_file = f"Giraffe_Results/3_GC_bias/{key}_gcbias_bin.bed" 25 | with open(output_file, 'w') as file: 26 | file.writelines(lines) 27 | 28 | def get_bin_bed(input_reference, input_binsize): 29 | if not os.path.exists(f"{input_reference}.fai"): 30 | system(f"samtools faidx {input_reference}") 31 | system(f"bedtools makewindows -g {input_reference}.fai -w {input_binsize} > Giraffe_Results/3_GC_bias/bin.bed") 32 | classify_by_chromosome("Giraffe_Results/3_GC_bias/bin.bed") 33 | 34 | def get_bin_GC(args): 35 | input_reference, input_chromosome, path = args 36 | system(f"bedtools nuc -fi {input_reference} -bed {path}/{input_chromosome}_gcbias_bin.bed > {path}/{input_chromosome}_bin_GC.tmp") 37 | 38 | input_file = f"{path}/{input_chromosome}_bin_GC.tmp" 39 | output = f"{path}/{input_chromosome}_bin_GC.txt" 40 | with open(input_file, "r") as ff: 41 | with open(output, "w") as of: 42 | for bin in ff: 43 | if bin[0] != "#": 44 | bin = bin.replace("\n", "") 45 | bin = bin.split() 46 | bin_chrom = bin[0] 47 | bin_start = bin[1] 48 | bin_end = bin[2] 49 | bin_gc = bin[4] 50 | mes = str(bin_chrom) + "\t" + str(bin_start) + "\t" 51 | mes += str(bin_end) + "\t" + str(bin_gc) + "\n" 52 | of.write(mes) 53 | system(f"rm {path}/{input_chromosome}_bin_GC.tmp") 54 | 55 | def manager_GC_content(input_reference, num_cpus): 56 | processes = [] 57 | chromosomes = [] 58 | 59 | with open(f"{input_reference}.fai", "r") as ff: 60 | for l in ff.readlines(): 61 | l = l.replace("\n","").split() 62 | chromosomes.append(l[0]) 63 | ff.close() 64 | 65 | args = [(input_reference, chrom, "Giraffe_Results/3_GC_bias") for chrom in chromosomes] 66 | with multiprocessing.Pool(processes=num_cpus) as pool: 67 | pool.map(get_bin_GC, args) 68 | 69 | path = "Giraffe_Results/3_GC_bias" 70 | system(f"cat {path}/*_bin_GC.txt > {path}/bin_GC.txt") 71 | system(f"rm {path}/*_bin_GC.txt") 72 | 73 | def get_bin_depth(args): 74 | input_sample_ID, input_bam, input_chromosome, path = args 75 | system(f"samtools bedcov {path}/{input_chromosome}_gcbias_bin.bed {input_bam} > {path}/{input_sample_ID}_{input_chromosome}_bin_depth.txt") 76 | 77 | def manager_bin_depth(input_reference,sample_ID, bamfile, num_cpus): 78 | processes 
= [] 79 | chromosomes = [] 80 | 81 | with open(f"{input_reference}.fai", "r") as ff: 82 | for l in ff.readlines(): 83 | l = l.replace("\n","").split() 84 | chromosomes.append(l[0]) 85 | ff.close() 86 | 87 | args = [(sample_ID, bamfile, chrom, "Giraffe_Results/3_GC_bias") for chrom in chromosomes] 88 | with multiprocessing.Pool(processes=num_cpus) as pool: 89 | pool.map(get_bin_depth, args) 90 | 91 | path = "Giraffe_Results/3_GC_bias" 92 | system(f"cat {path}/*_bin_depth.txt > {path}/{sample_ID}.bin_depth.txt") 93 | system(f"rm {path}/*_bin_depth.txt") 94 | 95 | def compute_GC_bias(ref, bamfile, binsize, sample_ID, num_cpu): 96 | path="Giraffe_Results/3_GC_bias" 97 | if os.path.exists(f"{path}/bin.bed") and os.path.exists(f"{path}/bin_GC.txt"): 98 | manager_bin_depth(ref, sample_ID, bamfile, num_cpu) 99 | else: 100 | get_bin_bed(ref, binsize) 101 | manager_GC_content(ref, num_cpu) 102 | manager_bin_depth(ref, sample_ID, bamfile, num_cpu) 103 | 104 | system(f"rm {path}/bin.bed") 105 | system(f"rm {path}/*_bin.bed") 106 | 107 | def merge_GC_content_and_depth(binsize, sample_ID): 108 | data = {} 109 | input_depth = "Giraffe_Results/3_GC_bias/" + str(sample_ID) + ".bin_depth.txt" 110 | with open(input_depth) as f1: 111 | for bins in f1: 112 | bins = bins.replace("\n", "") 113 | bins = bins.split("\t") 114 | if bins[-1] != 0: 115 | KEY = bins[0] + "_" + bins[1] + "_" + bins[2] 116 | data[KEY]= {} 117 | data[KEY]["dp"] = int(bins[3]) / int(binsize) 118 | f1.close() 119 | 120 | with open("Giraffe_Results/3_GC_bias/bin_GC.txt") as f2: 121 | for bins in f2: 122 | bins = bins.replace("\n", "") 123 | bins = bins.split("\t") 124 | KEY = bins[0] + "_" + bins[1] + "_" + bins[2] 125 | if KEY in data.keys(): 126 | data[KEY]["GC"] = float(bins[3]) * 100 127 | else: 128 | continue 129 | f2.close() 130 | 131 | merged_data = {} 132 | merged_data["dp"] = [] 133 | merged_data["GC"] = [] 134 | for i in data.keys(): 135 | tmp_dp = data[i]["dp"] 136 | tmp_gc = data[i]["GC"] 137 | merged_data["dp"].append(tmp_dp) 138 | merged_data["GC"].append(tmp_gc) 139 | merged_data = pd.DataFrame.from_dict(merged_data) 140 | 141 | output_file = "Giraffe_Results/3_GC_bias/" + str(sample_ID) + "_relationship_raw.txt" 142 | ff = open(output_file, "w") 143 | ff.write("GC_content\tDepth\tNumber\tGroup\n") 144 | for i in range(0,101): 145 | tmp = merged_data[(i-0.5 <= merged_data["GC"]) & (merged_data["GC"] < i+0.5)].copy() 146 | if len(tmp) != 0: 147 | ave_dp = tmp["dp"].mean() 148 | else: 149 | ave_dp = 0.0 150 | ff.write(str(i) + "\t" + str(ave_dp) + "\t" + str(len(tmp)) + "\t" + str(sample_ID) + "\n") 151 | ff.close() 152 | 153 | # get the 95% data for downstream normalization 154 | df = pd.read_csv(output_file, sep=r'\s+') 155 | # df = pd.read_csv(output_file, delim_whitespace=True) 156 | max_number = df["Number"].max() 157 | total_number = df["Number"].sum() 158 | porportion = 0.95 159 | tmp = df[df["Number"] == max_number].copy() 160 | 161 | if len(tmp) == 1: 162 | for i in tmp["GC_content"]: 163 | start = i 164 | end = i 165 | 166 | for i in range(1,51): 167 | t1 = df[(start-1 <=df["GC_content"]) & (df["GC_content"] <= end+1)].copy() 168 | if t1["Number"].sum() / total_number >= porportion: 169 | nor_df = t1 170 | break 171 | else: 172 | start -= 1 173 | end += 1 174 | continue 175 | 176 | # normalization 177 | ave_dp = nor_df["Depth"].mean() 178 | nor_df["Normalized_depth"] = nor_df.apply(lambda row: row["Depth"]/ave_dp, axis=1) 179 | output_file_1 = "Giraffe_Results/3_GC_bias/" + str(sample_ID) + "_relationship_tmp.txt" 180 
| nor_df.to_csv(output_file_1, sep="\t", index=False, header=False) 181 | 182 | system("rm Giraffe_Results/3_GC_bias/*bin_depth.txt") 183 | 184 | def merge_files(): 185 | with open("header", "w") as ff: 186 | ff.write("GC_content\tDepth\tNumber\tGroup\tNormalized_depth\n") 187 | ff.close() 188 | system("cat header Giraffe_Results/3_GC_bias/*_relationship_tmp.txt \ 189 | > Giraffe_Results/3_GC_bias/Relationship_normalization.txt") 190 | system("rm header Giraffe_Results/3_GC_bias/*_relationship_tmp.txt") 191 | 192 | def get_bin_number_within_GC_content(): 193 | df = pd.read_table("Giraffe_Results/3_GC_bias/bin_GC.txt", header=None) 194 | with open("Giraffe_Results/3_GC_bias/Bin_distribution.txt", "w") as ff: 195 | ff.write("GC_content\tNumber\n") 196 | df[3] = df[3] * 100 197 | for i in range(0,101): 198 | tmp = df[(i-0.5 <= df[3]) & (df[3] < i+0.5)].copy() 199 | ff.write(str(i) + "\t" + str(len(tmp)) + "\n") 200 | ff.close() 201 | system("rm Giraffe_Results/3_GC_bias/bin_GC.txt") 202 | -------------------------------------------------------------------------------- /Giraffe_View/giraffe_plot: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | import pandas as pd 6 | import numpy as np 7 | import sys 8 | from Giraffe_View.function import * 9 | 10 | def plot_estimate_acc(input_file, x_min, x_max, x_gap): 11 | df = process_in_chunks(input_file) 12 | df = pd.DataFrame(df) 13 | df["Accuracy"] = df["Accuracy"] * 100 14 | 15 | plt.figure(figsize=(8, 6)) 16 | ax = sns.kdeplot(data=df, x="Accuracy", hue="Group", fill=True, 17 | alpha=0.6, palette="Set2", common_norm=False) 18 | sns.move_legend(ax, "upper left") 19 | ax 20 | acc_scale = [x_min, x_max] 21 | acc_breaks = [i for i in range(x_min, x_max+1, x_gap)] 22 | 23 | plt.xlabel("Estimated read accuracy (%)") 24 | plt.ylabel("Probability Density Function") 25 | plt.xlim(acc_scale) 26 | plt.xticks(acc_breaks) 27 | plt.tight_layout() 28 | plt.savefig("New_read_estimate_accuracy.svg", format="svg", dpi=300) 29 | plt.close() 30 | 31 | def plot_observe_acc(input_file, x_min, x_max, x_gap): 32 | df = process_in_chunks(input_file) 33 | df = pd.DataFrame(df) 34 | df["Acc"] = df["Acc"] * 100 35 | 36 | plt.figure(figsize=(8, 6)) 37 | ax = sns.kdeplot(data=df, x="Acc", hue="Group", fill=True, 38 | common_norm=False, alpha=0.6, palette="Set2") 39 | sns.move_legend(ax, "upper left") 40 | ax 41 | 42 | acc_scale = [x_min, x_max] 43 | acc_breaks = [i for i in range(x_min, x_max+1, x_gap)] 44 | 45 | plt.xlabel("Observed read accuracy (%)") 46 | plt.ylabel("Probability Density Function") 47 | plt.xlim(acc_scale) 48 | plt.xticks(acc_breaks) 49 | plt.tight_layout() 50 | plt.savefig("New_read_observe_accuracy.svg", format="svg", dpi=300) 51 | plt.close() 52 | 53 | def plot_observe_mismatch(input_file, y_max, y_gap): 54 | df = process_in_chunks(input_file) 55 | df = pd.DataFrame(df) 56 | 57 | df["p_ins"] = 100 * df["Ins"] / (df["Ins"] + df["Del"] + df["Sub"] + df["Mat"]) 58 | df["p_del"] = 100 * df["Del"] / (df["Ins"] + df["Del"] + df["Sub"] + df["Mat"]) 59 | df["p_sub"] = 100 * df["Sub"] / (df["Ins"] + df["Del"] + df["Sub"] + df["Mat"]) 60 | 61 | df = df.melt(id_vars=["Group"], value_vars=["p_ins", "p_del", "p_sub"], 62 | var_name="Mismatch Type", value_name="Mismatch Proportion") 63 | 64 | plt.figure(figsize=(8, 6)) 65 | sns.boxplot(data=df, x="Mismatch Type", y="Mismatch Proportion", hue="Group", 66 | showfliers=False, 
width=0.5, saturation=0.6, palette="Set2") 67 | 68 | mis_scale = [0, y_max] 69 | mis_breaks = [i for i in range(0, y_max+1, y_gap)] 70 | 71 | plt.ylabel("Mismatch proportion (%)") 72 | plt.ylim(mis_scale) 73 | plt.yticks(mis_breaks) 74 | plt.xticks(ticks=[0, 1, 2], labels=["Insertion", "Deletion", "Substitution"]) 75 | plt.xlabel("") 76 | 77 | plt.legend(title='Group') 78 | plt.tight_layout() 79 | plt.savefig("New_observed_mismatch_proportion.svg", format="svg", dpi=300) 80 | plt.close() 81 | 82 | def plot_observe_homo(input_file, y_min, y_max, y_gap): 83 | df = process_in_chunks(input_file) 84 | df = pd.DataFrame(df) 85 | df["Accuracy"] = df["Accuracy"] * 100 86 | 87 | plt.figure(figsize=(8, 6)) 88 | sns.lineplot(data=df, x='Base', y='Accuracy', hue='Group', linewidth=1.5, 89 | markers=True, dashes=False, palette="Set2", alpha=0.6, legend=False) 90 | sns.scatterplot(data=df, x='Base', y='Accuracy', hue='Group', 91 | palette="Set2", s=50, edgecolor="black") 92 | 93 | homo_scale = [y_min, y_max] 94 | homo_breaks = [i for i in range(y_min, y_max+1, y_gap)] 95 | 96 | plt.ylim(homo_scale) 97 | plt.yticks(homo_breaks) 98 | plt.ylabel('Accuracy of homopolymer identification (%)') 99 | plt.xlabel('Base') 100 | 101 | plt.legend(title='Group') 102 | plt.tight_layout() 103 | plt.savefig("New_homoploymer_summary.svg", format="svg", dpi=300, bbox_inches='tight') 104 | plt.close() 105 | 106 | def plot_GC_bias(input_file, x_min, x_max, x_gap): 107 | df = process_in_chunks(input_file) 108 | df = pd.DataFrame(df) 109 | 110 | plt.figure(figsize=(8, 5)) 111 | sns.lineplot(data=df, x="GC_content", y="Normalized_depth", hue="Group", 112 | palette="Set2", linewidth=1.5, alpha=0.6) 113 | sns.scatterplot(data=df, x="GC_content", y="Normalized_depth", hue="Group", 114 | palette="Set2", edgecolor="black", s=20, legend=False) 115 | plt.axhline(1, color="grey", linestyle="dotted") 116 | plt.ylim(0, 2) 117 | # depth_breaks = [i for i in range(0, 2.1, 0.2)] 118 | depth_breaks = [i * 0.2 for i in range(11)] 119 | 120 | plt.yticks(depth_breaks) 121 | 122 | plt.xlim([x_min, x_max]) 123 | plt.xticks([i for i in range(x_min, x_max+1, x_gap)]) 124 | plt.xlabel("GC content (%)") 125 | plt.ylabel("Normalized depth") 126 | plt.grid(False) 127 | plt.tight_layout() 128 | plt.savefig("New_relationship_normalization.svg", format="svg", dpi=300) 129 | plt.close() 130 | 131 | if __name__ == '__main__': 132 | version = "0.2.3" 133 | parser = argparse.ArgumentParser(description="", 134 | usage="\n # Users can replot the figures by rescaling the regions along the x-axis or y-axis.\n" 135 | "\n %(prog)s estimate_acc --input Estimated_information.txt --x_min 50 --x_max 100 --x_gap 10 # For estimated read accuracy!" 136 | "\n %(prog)s observe_acc --input Observed_information.txt --x_min 50 --x_max 100 --x_gap 10 # For observed read accuracy!" 137 | "\n %(prog)s observe_mismatch --input Observed_information.txt --y_max 5 --y_gap 1 # For mismatch proportion!" 138 | "\n %(prog)s observe_homo --input Homoploymer_summary.txt --y_min 90 --y_max 100 --y_gap 2 # For homopolymer accuracy!" 139 | "\n %(prog)s gcbias --input Relationship_normalization.txt --x_min 20 --x_max 50 --x_gap 2 # For relationship between normalized depth and GC content!" 
140 |         "\n\nversion: " + str(version) + "\n"
141 |         "For more details, please refer to the documentation: https://giraffe-documentation.readthedocs.io/en/latest.")
142 |
143 |     subparsers = parser.add_subparsers(dest='function', help=None, description=None, prog="giraffe_plot", metavar=" subcommand and function")
144 |
145 |     plot_estimate_acc_parser = subparsers.add_parser('estimate_acc', help='Replot estimated read accuracy')
146 |     plot_estimate_acc_parser.add_argument("--input", type=str, metavar="", required=True, help="the result generated from giraffe (Estimated_information.txt)")
147 |     plot_estimate_acc_parser.add_argument("--x_min", type=int, metavar="", required=True, help="the smallest cutoff for estimated read accuracy")
148 |     plot_estimate_acc_parser.add_argument("--x_max", type=int, metavar="", required=True, help="the largest cutoff for estimated read accuracy")
149 |     plot_estimate_acc_parser.add_argument("--x_gap", type=int, metavar="", required=True, help="the interval between two values on the x-axis")
150 |
151 |     plot_observe_acc_parser = subparsers.add_parser('observe_acc', help='Replot observed read accuracy')
152 |     plot_observe_acc_parser.add_argument("--input", type=str, metavar="", required=True, help="the result generated from giraffe (Observed_information.txt)")
153 |     plot_observe_acc_parser.add_argument("--x_min", type=int, metavar="", required=True, help="the smallest cutoff for observed read accuracy")
154 |     plot_observe_acc_parser.add_argument("--x_max", type=int, metavar="", required=True, help="the largest cutoff for observed read accuracy")
155 |     plot_observe_acc_parser.add_argument("--x_gap", type=int, metavar="", required=True, help="the interval between two values on the x-axis")
156 |
157 |     plot_observe_mismatch_parser = subparsers.add_parser('observe_mismatch', help='Replot observed mismatch proportion')
158 |     plot_observe_mismatch_parser.add_argument("--input", type=str, metavar="", required=True, help="the result generated from giraffe (Observed_information.txt)")
159 |     plot_observe_mismatch_parser.add_argument("--y_max", type=int, metavar="", required=True, help="the largest cutoff for mismatch proportion")
160 |     plot_observe_mismatch_parser.add_argument("--y_gap", type=int, metavar="", required=True, help="the interval between two values on the y-axis")
161 |
162 |     plot_observe_homo_parser = subparsers.add_parser('observe_homo', help='Replot homopolymer identification accuracy')
163 |     plot_observe_homo_parser.add_argument("--input", type=str, metavar="", required=True, help="the result generated from giraffe (Homoploymer_summary.txt)")
164 |     plot_observe_homo_parser.add_argument("--y_min", type=int, metavar="", required=True, help="the smallest cutoff for homopolymer accuracy")
165 |     plot_observe_homo_parser.add_argument("--y_max", type=int, metavar="", required=True, help="the largest cutoff for homopolymer accuracy")
166 |     plot_observe_homo_parser.add_argument("--y_gap", type=int, metavar="", required=True, help="the interval between two values on the y-axis")
167 |
168 |     plot_GC_bias_parser = subparsers.add_parser('gcbias', help='Replot the GC bias (normalized depth vs. GC content) relationship')
169 |     plot_GC_bias_parser.add_argument("--input", type=str, metavar="", required=True, help="the result generated from giraffe (Relationship_normalization.txt)")
170 |     plot_GC_bias_parser.add_argument("--x_min", type=int, metavar="", required=True, help="the smallest cutoff for GC content")
171 |     plot_GC_bias_parser.add_argument("--x_max", type=int, metavar="", required=True, help="the largest cutoff for GC content")
172 | plot_GC_bias_parser.add_argument("--x_gap", type=int, metavar="", required=True, help="the interval between two values on an x-axis") 173 | 174 | args = parser.parse_args() 175 | 176 | if len(sys.argv) == 1: 177 | parser.print_help(sys.stderr) 178 | sys.exit(1) 179 | 180 | if args.function == "estimate_acc": 181 | if len(sys.argv) == 2: 182 | plot_estimate_acc_parser.print_help(sys.stderr) 183 | sys.exit(1) 184 | else: 185 | plot_estimate_acc(args.input, args.x_min, args.x_max, args.x_gap) 186 | 187 | elif args.function == "observe_acc": 188 | if len(sys.argv) == 2: 189 | plot_observe_acc_parser.print_help(sys.stderr) 190 | sys.exit(1) 191 | else: 192 | plot_observe_acc(args.input, args.x_min, args.x_max, args.x_gap) 193 | 194 | elif args.function == "observe_mismatch": 195 | if len(sys.argv) == 2: 196 | plot_observe_mismatch_parser.print_help(sys.stderr) 197 | sys.exit(1) 198 | else: 199 | plot_observe_mismatch(args.input, args.y_max, args.y_gap) 200 | 201 | elif args.function == "observe_homo": 202 | if len(sys.argv) == 2: 203 | plot_observe_homo_parser.print_help(sys.stderr) 204 | sys.exit(1) 205 | else: 206 | plot_observe_homo(args.input, args.y_min, args.y_max, args.y_gap) 207 | 208 | elif args.function == "gcbias": 209 | if len(sys.argv) == 2: 210 | plot_GC_bias_parser.print_help(sys.stderr) 211 | sys.exit(1) 212 | else: 213 | plot_GC_bias(args.input, args.x_min, args.x_max, args.x_gap) 214 | 215 | 216 | -------------------------------------------------------------------------------- /Giraffe_View/plot.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | import warnings 5 | import os 6 | from Giraffe_View.function import process_in_chunks 7 | 8 | warnings.filterwarnings('ignore') 9 | 10 | def plot_estimate(format='svg', path='Giraffe_Results/1_Estimated_quality'): 11 | df = process_in_chunks("Giraffe_Results/1_Estimated_quality/Estimated_information.txt") 12 | df = pd.DataFrame(df) 13 | df["Accuracy"] = df["Accuracy"] * 100 14 | df["GC_content"] = df["GC_content"] * 100 15 | df["Length"] = df["Length"] / 1000 16 | 17 | min_1 = df["Accuracy"].min() 18 | if min_1 >= 95: 19 | acc_scale = [95, 100] 20 | acc_breaks = [i for i in range(95, 101, 1)] 21 | elif min_1 >= 90: 22 | acc_scale = [90, 100] 23 | acc_breaks = [i for i in range(90, 101, 1)] 24 | elif min_1 >= 80: 25 | acc_scale = [80, 100] 26 | acc_breaks = [i for i in range(80, 101, 2)] 27 | elif min_1 >= 70: 28 | acc_scale = [70, 100] 29 | acc_breaks = [i for i in range(70, 101, 5)] 30 | elif min_1 >= 60: 31 | acc_scale = [60, 100] 32 | acc_breaks = [i for i in range(60, 101, 5)] 33 | elif min_1 >= 50: 34 | acc_scale = [50, 100] 35 | acc_breaks = [i for i in range(50, 101, 5)] 36 | elif min_1 >= 40: 37 | acc_scale = [40, 100] 38 | acc_breaks = [i for i in range(40, 101, 10)] 39 | elif min_1 >= 30: 40 | acc_scale = [30, 100] 41 | acc_breaks = [i for i in range(30, 101, 10)] 42 | elif min_1 >= 20: 43 | acc_scale = [20, 100] 44 | acc_breaks = [i for i in range(20, 101, 10)] 45 | elif min_1 >= 10: 46 | acc_scale = [10, 100] 47 | acc_breaks = [i for i in range(10, 101, 10)] 48 | else: 49 | acc_scale = [0, 100] 50 | acc_breaks = [i for i in range(0, 101, 5)] 51 | 52 | # plot 53 | plt.figure(figsize=(8, 6)) 54 | ax = sns.kdeplot(data=df, x="Accuracy", hue="Group", fill=True, 55 | alpha=0.6, palette = "Set2", common_norm=False) 56 | sns.move_legend(ax, "upper left") 57 | ax 58 | plt.xlabel("Estimated read accuracy 
(%)") 59 | plt.ylabel("Probability Density Function") 60 | plt.xlim(acc_scale) 61 | plt.xticks(acc_breaks) 62 | 63 | plt.tight_layout() 64 | plt.savefig(f"{path}/1_Read_estimate_accuracy.{format}", format=format, dpi=300) 65 | plt.close() 66 | 67 | plt.figure(figsize=(8, 4)) 68 | sns.boxplot(data=df, y="Group", x="GC_content", hue="Group", 69 | palette="Set2", dodge=False, showfliers=False) 70 | 71 | plt.xlabel("GC content (%)") 72 | plt.xlim(0, 101) 73 | plt.xticks(range(0, 101, 10)) 74 | 75 | plt.yticks() 76 | plt.legend([],[], frameon=False) # Hide the legend 77 | plt.tight_layout() 78 | plt.savefig(f"{path}/2_Read_GC_content.{format}", format=format, dpi=300) 79 | plt.close() 80 | 81 | ave = df["Length"].mean() 82 | if ave <= 1: 83 | len_scale = [0, 5] 84 | len_breaks = [i for i in range(0, 6, 1)] 85 | elif ave <= 5: 86 | len_scale = [0, 10] 87 | len_breaks = [i for i in range(0, 11, 1)] 88 | elif ave <= 10: 89 | len_scale = [0, 20] 90 | len_breaks = [i for i in range(0, 21, 2)] 91 | elif ave <= 20: 92 | len_scale = [0, 30] 93 | len_breaks = [i for i in range(0, 31, 5)] 94 | elif ave <= 30: 95 | len_scale = [0, 50] 96 | len_breaks = [i for i in range(0, 51, 5)] 97 | else: 98 | len_scale = [0, 100] 99 | len_breaks = [i for i in range(0, 101, 10)] 100 | 101 | plt.figure(figsize=(8, 6)) 102 | sns.kdeplot(data=df, x="Length", hue="Group", 103 | fill=True, common_norm=False, alpha=0.6, 104 | palette="Set2") 105 | 106 | plt.xlabel("Read length (Kb)") 107 | plt.ylabel("Probability Density Function") 108 | plt.xlim(len_scale) 109 | plt.xticks(len_breaks) 110 | plt.tight_layout() 111 | plt.savefig(f"{path}/3_Read_length.{format}", format=format, dpi=300) 112 | plt.close() 113 | 114 | def plot_observe_acc(format='svg', path='Giraffe_Results/2_Observed_quality'): 115 | # color_set = "Set2" 116 | df = process_in_chunks("Giraffe_Results/2_Observed_quality/Observed_information.txt") 117 | df = pd.DataFrame(df) 118 | 119 | df["Acc"] = df["Acc"] * 100 120 | min_1 = df["Acc"].min() 121 | 122 | min_1 = df["Acc"].min() 123 | 124 | if min_1 >= 95: 125 | acc_scale = [95, 100] 126 | acc_breaks = [i for i in range(95, 101, 1)] 127 | elif min_1 >= 90: 128 | acc_scale = [90, 100] 129 | acc_breaks = [i for i in range(90, 101, 1)] 130 | elif min_1 >= 80: 131 | acc_scale = [80, 100] 132 | acc_breaks = [i for i in range(80, 101, 2)] 133 | elif min_1 >= 70: 134 | acc_scale = [70, 100] 135 | acc_breaks = [i for i in range(70, 101, 5)] 136 | elif min_1 >= 60: 137 | acc_scale = [60, 100] 138 | acc_breaks = [i for i in range(60, 101, 5)] 139 | elif min_1 >= 50: 140 | acc_scale = [50, 100] 141 | acc_breaks = [i for i in range(50, 101, 5)] 142 | elif min_1 >= 40: 143 | acc_scale = [40, 100] 144 | acc_breaks = [i for i in range(40, 101, 10)] 145 | elif min_1 >= 30: 146 | acc_scale = [30, 100] 147 | acc_breaks = [i for i in range(30, 101, 10)] 148 | elif min_1 >= 20: 149 | acc_scale = [20, 100] 150 | acc_breaks = [i for i in range(20, 101, 10)] 151 | elif min_1 >= 10: 152 | acc_scale = [10, 100] 153 | acc_breaks = [i for i in range(10, 101, 10)] 154 | else: 155 | acc_scale = [0, 100] 156 | acc_breaks = [i for i in range(0, 101, 5)] 157 | 158 | # Plot density plot for observed read accuracy 159 | # sns.set(style='darkgrid') 160 | plt.figure(figsize=(8, 6)) 161 | 162 | ax = sns.kdeplot(data=df, x="Acc", hue="Group", fill=True, 163 | common_norm=False, alpha=0.6, palette = "Set2") 164 | sns.move_legend(ax, "upper left") 165 | ax 166 | 167 | plt.xlabel("Observed read accuracy (%)") 168 | plt.ylabel("Probability Density 
Function") 169 | plt.xlim(acc_scale) 170 | plt.xticks(acc_breaks) 171 | 172 | plt.tight_layout() 173 | plt.savefig(f"{path}/1_Observed_read_accuracy.{format}", format=format, dpi=300) 174 | plt.close() 175 | 176 | # Compute mismatch proportions 177 | df["p_ins"] = 100 * df["Ins"] / (df["Ins"] + df["Del"] + df["Sub"] + df["Mat"]) 178 | df["p_del"] = 100 * df["Del"] / (df["Ins"] + df["Del"] + df["Sub"] + df["Mat"]) 179 | df["p_sub"] = 100 * df["Sub"] / (df["Ins"] + df["Del"] + df["Sub"] + df["Mat"]) 180 | 181 | # Melt the dataframe for mismatch proportions 182 | df1 = pd.melt(df, id_vars=['Group'], value_vars=['p_ins', 'p_del', 'p_sub']) 183 | max_1 = df["p_ins"].max() 184 | max_2 = df["p_del"].max() 185 | max_3 = df["p_sub"].max() 186 | max_4 = max(max_1, max_2, max_3) 187 | 188 | if max_4 <= 5: 189 | mis_scale = [0, 5] 190 | mis_breaks = [i for i in range(0, 6, 1)] 191 | elif max_4 <= 10: 192 | mis_scale = [0, 10] 193 | mis_breaks = [i for i in range(0, 11, 1)] 194 | elif max_4 <= 20: 195 | mis_scale = [0, 20] 196 | mis_breaks = [i for i in range(0, 21, 2)] 197 | elif max_4 <= 30: 198 | mis_scale = [0, 30] 199 | mis_breaks = [i for i in range(0, 31, 5)] 200 | elif max_4 <= 40: 201 | mis_scale = [0, 40] 202 | mis_breaks = [i for i in range(0, 41, 5)] 203 | elif max_4 <= 50: 204 | mis_scale = [0, 50] 205 | mis_breaks = [i for i in range(0, 51, 5)] 206 | elif max_4 <= 60: 207 | mis_scale = [0, 60] 208 | mis_breaks = [i for i in range(0, 61, 10)] 209 | elif max_4 <= 70: 210 | mis_scale = [0, 70] 211 | mis_breaks = [i for i in range(0, 71, 10)] 212 | elif max_4 <= 80: 213 | mis_scale = [0, 80] 214 | mis_breaks = [i for i in range(0, 81, 10)] 215 | elif max_4 <= 90: 216 | mis_scale = [0, 90] 217 | mis_breaks = [i for i in range(0, 91, 10)] 218 | else: 219 | mis_scale = [0, 100] 220 | mis_breaks = [i for i in range(0, 101, 10)] 221 | 222 | # Plot boxplot for mismatch proportions 223 | plt.figure(figsize=(8, 6)) 224 | sns.boxplot(data=df1, x="variable", y="value", hue="Group", 225 | showfliers=False, width=0.5, gap=0.1, saturation=0.6, 226 | palette = "Set2", linecolor="black") 227 | 228 | plt.ylabel("Mismatch proportion (%)") 229 | plt.ylim(mis_scale) 230 | plt.yticks(mis_breaks) 231 | plt.xticks(ticks=[0, 1, 2], labels=["Deletion", "Insertion", "Substitution"]) 232 | plt.xlabel("") 233 | 234 | # Ensure the legend is created correctly 235 | handles, labels = plt.gca().get_legend_handles_labels() 236 | if not handles: 237 | for group in df["Group"].unique(): 238 | handle = plt.Line2D([0], [0], color=sns.color_palette("pastel")[0], lw=2) 239 | handles.append(handle) 240 | labels.append(group) 241 | 242 | plt.legend(handles=handles, labels=labels, title='Group') 243 | plt.tight_layout() 244 | plt.savefig(f"{path}/2_Observed_mismatch_proportion.{format}", format=format, dpi=300) 245 | plt.close() 246 | 247 | def plot_observe_homo(format='svg', path='Giraffe_Results/2_Observed_quality'): 248 | # Load data 249 | df = process_in_chunks("Giraffe_Results/2_Observed_quality/Homoploymer_summary.txt") 250 | df["Accuracy"] = df["Accuracy"] * 100 251 | 252 | # Determine scale and breaks for y-axis based on minimum accuracy 253 | min_acc = df["Accuracy"].min() 254 | max_acc = df["Accuracy"].max() 255 | 256 | min_value = int((min_acc//10) * 10) 257 | max_value = int((max_acc//10) * 10 + 10) 258 | homo_scale = [min_value, max_value] 259 | 260 | dif = max_value - min_value 261 | 262 | if dif <= 10: 263 | homo_breaks = [i for i in range(min_value, max_value+1, 1)] 264 | elif dif <= 20: 265 | homo_breaks = [i for i 
in range(min_value, max_value+1, 2)] 266 | elif dif <= 30: 267 | homo_breaks = [i for i in range(min_value, max_value+1, 5)] 268 | elif dif <= 40: 269 | homo_breaks = [i for i in range(min_value, max_value+1, 5)] 270 | elif dif <= 50: 271 | homo_breaks = [i for i in range(min_value, max_value+1, 5)] 272 | elif dif <= 60: 273 | homo_breaks = [i for i in range(min_value, max_value+1, 5)] 274 | elif dif <= 70: 275 | homo_breaks = [i for i in range(min_value, max_value+1, 5)] 276 | elif dif <= 80: 277 | homo_breaks = [i for i in range(min_value, max_value+1, 5)] 278 | elif dif <= 90: 279 | homo_breaks = [i for i in range(min_value, max_value+1, 5)] 280 | else: 281 | homo_breaks = [i for i in range(min_value, max_value+1, 10)] 282 | 283 | # Create the plot 284 | plt.figure(figsize=(8, 6)) 285 | sns.lineplot(data=df, x='Base', y='Accuracy', hue='Group', linewidth=1.5, 286 | markers=True, dashes=False, palette = "Set2", alpha=0.6, legend=False) 287 | sns.scatterplot(data=df, x='Base',y='Accuracy', hue='Group', 288 | palette = "Set2", s=50, edgecolor="black") 289 | 290 | # Customize plot 291 | plt.ylim(homo_scale) 292 | plt.yticks(homo_breaks) 293 | plt.ylabel('Accuracy of homopolymer identification (%)') 294 | plt.xlabel('Base') 295 | 296 | # Save plot 297 | output_path = f"{path}/3_Homoploymer_summary.{format}" 298 | plt.savefig(output_path, format=format, dpi=300, bbox_inches='tight') 299 | plt.close() 300 | 301 | def plot_GC_bias(input_binsize, format='svg', path='Giraffe_Results/3_GC_bias'): 302 | # sns.set_style("whitegrid") 303 | # Load the first dataset 304 | df = pd.read_csv("Giraffe_Results/3_GC_bias/Bin_distribution.txt", sep="\t") 305 | accuracy_scale = [0, 100] 306 | accuracy_breaks = [i for i in range(0, 101, 10)] 307 | 308 | 309 | # Plot distribution length 310 | plt.figure(figsize=(8, 5)) 311 | 312 | sns.lineplot(data=df, x="GC_content", y="Number", color="#96D1E8", linewidth=1.5, alpha=0.3) 313 | sns.scatterplot(data=df, x="GC_content", y="Number", color="#96D1E8", edgecolor="black", s=15) 314 | 315 | plt.xlim(accuracy_scale) 316 | plt.xticks(accuracy_breaks) 317 | plt.xlabel("GC content (%)") 318 | plt.ylabel(f"Number of bins (bin size = {input_binsize} bp)") 319 | plt.grid(False) 320 | plt.savefig(f"{path}/1_Bin_distribution.{format}", dpi=300) 321 | plt.close() 322 | 323 | # Load the second dataset 324 | df1 = pd.read_csv("Giraffe_Results/3_GC_bias/Relationship_normalization.txt", sep="\t") 325 | 326 | # Plot GC bias 327 | plt.figure(figsize=(8, 5)) 328 | sns.lineplot(data=df1, x="GC_content", y="Normalized_depth", hue="Group", 329 | palette = "Set2", linewidth=1.5, alpha=.6) 330 | sns.scatterplot(data=df1, x="GC_content", y="Normalized_depth", hue="Group", 331 | palette = "Set2", edgecolor="black", s=20, legend=False) 332 | 333 | plt.axhline(1, color="grey", linestyle="dotted") 334 | plt.ylim(0, 2) 335 | 336 | depth_breaks = (0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0) 337 | plt.yticks(depth_breaks) 338 | 339 | plt.xlabel("GC content (%)") 340 | plt.ylabel("Normalized depth") 341 | plt.grid(False) 342 | plt.savefig(f"{path}/2_Relationship_normalization.{format}", dpi=300) 343 | plt.close() 344 | 345 | def plot_modi_bin(format="svg", path="Giraffe_Results/4_Regional_modification"): 346 | 347 | df = pd.read_csv("Giraffe_Results/4_Regional_modification/Regional_methylation_proportion.txt", sep="\t", names=["ID", "Value", "Group"]) 348 | df["Value"] = df["Value"] * 100 349 | 350 | # sns.set(style="whitegrid") 351 | plt.figure(figsize=(20, 5)) 352 | 353 | # Violin plot 354 
| sns.violinplot(data=df, y="Group", x="Value", 355 | width=0.5, alpha=0.7, inner="box", split=True, 356 | inner_kws=dict(box_width=8, whis_width=2, color=".8")) 357 | 358 | methyl_scale = (0,100) 359 | methyl_breaks = [i for i in range(0, 101, 10)] 360 | 361 | plt.xlim(methyl_scale) 362 | plt.xticks(methyl_breaks) 363 | plt.xlabel("Methylation proportion (%)") 364 | plt.ylabel("") 365 | plt.yticks(fontsize=12, color='black') 366 | plt.legend([],[], frameon=False) # Remove legend 367 | 368 | plt.savefig(f"{path}/1_Regional_modification.{format}", dpi=300) 369 | plt.close() 370 | -------------------------------------------------------------------------------- /Giraffe_View/summary_html.py: -------------------------------------------------------------------------------- 1 | from Giraffe_View.function import * 2 | import pandas as pd 3 | import re 4 | import os 5 | import pathlib 6 | 7 | def generate_giraffe_data(input_table): 8 | # Initialize the dictionary structure 9 | giraffe_data = { 10 | "samples": [], 11 | "metrics": { 12 | "Estimate (average)": { 13 | "Estimated read accuracy": [], 14 | "Read length": [], 15 | "Read GC content": [] 16 | }, 17 | "Observed (average)": { 18 | "Observed read accuracy": [], 19 | "Observed read identification": [], 20 | "Substitution proportion": [], 21 | "Insertion proportion": [], 22 | "Deletion proportion": [], 23 | "Homopolymer accuracy (A)": [], 24 | "Homopolymer accuracy (T)": [], 25 | "Homopolymer accuracy (G)": [], 26 | "Homopolymer accuracy (C)": [] 27 | } 28 | } 29 | } 30 | 31 | # Read sample IDs from the input table 32 | with open(input_table, "r") as ff: 33 | giraffe_data["samples"] = [line.strip().split()[0] for line in ff] 34 | 35 | # Process the Estimated results 36 | estimated_file = "Giraffe_Results/1_Estimated_quality/Estimated_information.txt" 37 | df_estimated = process_in_chunks(estimated_file) 38 | df_estimated = pd.DataFrame(df_estimated) 39 | 40 | for sample in giraffe_data["samples"]: 41 | sample_df = df_estimated[df_estimated['Group'] == sample] 42 | giraffe_data["metrics"]["Estimate (average)"]["Estimated read accuracy"].append(round(100*sample_df["Accuracy"].mean(), 2)) 43 | giraffe_data["metrics"]["Estimate (average)"]["Read length"].append(round(sample_df["Length"].mean(), 2)) 44 | giraffe_data["metrics"]["Estimate (average)"]["Read GC content"].append(round(100*sample_df["GC_content"].mean(), 2)) 45 | 46 | # Process the Observed results 47 | observed_file = "Giraffe_Results/2_Observed_quality/Observed_information.txt" 48 | df_observed = process_in_chunks(observed_file) 49 | df_observed = pd.DataFrame(df_observed) 50 | 51 | # Calculate proportions 52 | df_observed["p_ins"] = 100 * df_observed["Ins"] / (df_observed["Ins"] + df_observed["Del"] + df_observed["Sub"] + df_observed["Mat"]) 53 | df_observed["p_del"] = 100 * df_observed["Del"] / (df_observed["Ins"] + df_observed["Del"] + df_observed["Sub"] + df_observed["Mat"]) 54 | df_observed["p_sub"] = 100 * df_observed["Sub"] / (df_observed["Ins"] + df_observed["Del"] + df_observed["Sub"] + df_observed["Mat"]) 55 | 56 | for sample in giraffe_data["samples"]: 57 | sample_df = df_observed[df_observed['Group'] == sample] 58 | giraffe_data["metrics"]["Observed (average)"]["Observed read accuracy"].append(round(100*sample_df["Acc"].mean(), 2)) 59 | giraffe_data["metrics"]["Observed (average)"]["Observed read identification"].append(round(100*sample_df["Iden"].mean(), 2)) 60 | giraffe_data["metrics"]["Observed (average)"]["Substitution proportion"].append(round(sample_df["p_sub"].mean(),
2)) 61 | giraffe_data["metrics"]["Observed (average)"]["Deletion proportion"].append(round(sample_df["p_del"].mean(), 2)) 62 | giraffe_data["metrics"]["Observed (average)"]["Insertion proportion"].append(round(sample_df["p_ins"].mean(), 2)) 63 | 64 | # Process the Homopolymer results 65 | homopolymer_file = "Giraffe_Results/2_Observed_quality/Homoploymer_summary.txt" 66 | df_homopolymer = process_in_chunks(homopolymer_file) 67 | df_homopolymer = pd.DataFrame(df_homopolymer) 68 | 69 | for sample in giraffe_data["samples"]: 70 | sample_df = df_homopolymer[df_homopolymer['Group'] == sample] 71 | for base in ["A", "T", "G", "C"]: 72 | accuracy = sample_df[sample_df['Base'] == base]["Accuracy"].mean() if not sample_df[sample_df['Base'] == base].empty else float('nan') 73 | giraffe_data["metrics"]["Observed (average)"][f"Homopolymer accuracy ({base})"].append(round(100*accuracy, 2) if not pd.isna(accuracy) else float('nan')) 74 | 75 | return giraffe_data 76 | 77 | def generate_giraffe_html(giraffe_data, summary_figures, output_file): 78 | with open(output_file, 'w') as f: 79 | f.write("\n\n\n") 80 | f.write("\n") 94 | f.write("\n\n") 95 | f.write("
\n

Giraffe Report

\n
\n") 96 | 97 | # Navigation Index 98 | f.write("\n") 148 | 149 | 150 | # Main content area 151 | f.write("
\n") 152 | 153 | # Create table headers 154 | headers = "Metric" 155 | for sample in giraffe_data['samples']: 156 | headers += f"{sample}" 157 | headers += "\n" 158 | 159 | # Generate table rows 160 | rows = "" 161 | for group, metrics in giraffe_data['metrics'].items(): 162 | # Add group header 163 | rows += f"{group}\n" 164 | for metric, values in metrics.items(): 165 | rows += f"{metric}" 166 | for value in values: 167 | rows += f"{value:.2f}" 168 | rows += "\n" 169 | 170 | # Combine headers and rows into a table 171 | f.write("
\n

Statistics

\n") 172 | f.write("
\n") # Center the table 173 | f.write("\n") 174 | f.write(headers) 175 | f.write(rows) 176 | f.write("
\n") 177 | f.write("
\n") # End of centering div 178 | f.write("
\n") 179 | 180 | # Summary Section with Figures 181 | # f.write("
\n

Figures

\n") 182 | for category, figures in summary_figures.items(): 183 | f.write(f"

{category}

\n") 184 | for figure in figures: 185 | if figure == "Summary_html/1_Read_estimate_accuracy.png": 186 | figure_title = "Estimated accuracy" 187 | f.write(f"
\n") 188 | f.write(f"{figure_title}\n") 189 | f.write(f"

Note: If the scale of accuracy is not suitable, please use the giraffe_plot function to replot.

\n") 190 | f.write(f"

giraffe_plot estimate_acc --input Estimated_information.txt --x_min 95 --x_max 100 --x_gap 1

\n") 191 | f.write(f"
\n") 192 | 193 | elif figure == "Summary_html/2_Read_GC_content.png": 194 | figure_title = "Read GC content" 195 | f.write(f"
\n") 196 | f.write(f"{figure_title}\n") 197 | # f.write(f"

This is a description!

\n") 198 | f.write(f"
\n") 199 | 200 | elif figure == "Summary_html/3_Read_length.png": 201 | figure_title = "Read length" 202 | f.write(f"
\n") 203 | f.write(f"{figure_title}\n") 204 | # f.write(f"

This is a description!

\n") 205 | f.write(f"
\n") 206 | 207 | elif figure == "Summary_html/1_Observed_read_accuracy.png": 208 | figure_title = "Observed accuracy" 209 | f.write(f"
\n") 210 | f.write(f"{figure_title}\n") 211 | f.write(f"

Note: If the scale of accuracy is not suitable, please use the giraffe_plot function to replot.

\n") 212 | f.write(f"

giraffe_plot observe_acc --input Observed_information.txt --x_min 95 --x_max 100 --x_gap 1

\n") 213 | f.write(f"
\n") 214 | 215 | elif figure == "Summary_html/2_Observed_mismatch_proportion.png": 216 | figure_title = "Mismatch proportion" 217 | f.write(f"
\n") 218 | f.write(f"{figure_title}\n") 219 | f.write(f"

Note: If the scale of proportion is not suitable, please use the giraffe_plot function to replot.

\n") 220 | f.write(f"

giraffe_plot observe_mismatch --input Observed_information.txt --y_max 5 --y_gap 1

\n") 221 | f.write(f"
\n") 222 | 223 | elif figure == "Summary_html/3_Homoploymer_summary.png": 224 | figure_title = "Homopolymer identification" 225 | f.write(f"
\n") 226 | f.write(f"{figure_title}\n") 227 | f.write(f"

Note: If the scale of accuracy is not suitable, please use the giraffe_plot function to replot.

\n") 228 | f.write(f"

giraffe_plot observe_homo --input Homoploymer_summary.txt --y_min 90 --y_max 100 --y_gap 2

\n") 229 | f.write(f"
\n") 230 | 231 | elif figure == "Summary_html/1_Bin_distribution.png": 232 | figure_title = "Bin distribution" 233 | f.write(f"
\n") 234 | f.write(f"{figure_title}\n") 235 | # f.write(f"

This is a description!

\n") 236 | f.write(f"
\n") 237 | 238 | elif figure == "Summary_html/2_Relationship_normalization.png": 239 | figure_title = "Relationship (depth and GC conetent)" 240 | f.write(f"
\n") 241 | f.write(f"{figure_title}\n") 242 | f.write(f"

Note: If the scale of GC content is not suitable, please use the renormalization_sequencing_bias script for renormalization and giraffe_plot for replotting.

\n") 243 | f.write(f"

renormalization_sequencing_bias -i S1_distribution.txt -l 30 -r 60 -o S1.txt

\n") 244 | f.write(f"

giraffe_plot gcbias --input new_gcbias.txt --x_min 30 --x_max 60 --x_gap 2\n") 245 | f.write(f"

\n") 246 | 247 | else: 248 | continue 249 | 250 | 251 | # f.write(f"
\n") 252 | # # f.write(f"

{figure_title}

\n") 253 | # # f.write(f"

Description for {figure_title}.

\n") 254 | # f.write(f"{figure_title}\n") 255 | # f.write(f"
\n") 256 | 257 | # for figure in figures: 258 | # if figure == "Summary_html/1_Read_estimate_accuracy.png": 259 | # figure_title = "Estimated accuracy" 260 | # f.write(f"
\n") 261 | # f.write(f"{figure_title}\n") 262 | # f.write(f"

If the scale of accuracy is not suitable, please use giraffe_plot to replot.

\n") 263 | # f.write(f"

giraffe_plot estimate_acc --input Estimated_information.txt --x_min 50 --x_max 100 --x_gap 10

\n") 264 | # f.write(f"
\n") 265 | 266 | # elif figure == "Summary_html/2_Read_GC_content.png": 267 | # figure_title = "Read GC content" 268 | # f.write(f"
  • {figure_title}
  • \n") 269 | 270 | # elif figure == "Summary_html/3_Read_length.png": 271 | # figure_title = "Read length" 272 | # f.write(f"
  • {figure_title}
  • \n") 273 | 274 | # elif figure == "Summary_html/1_Observed_read_accuracy.png": 275 | # figure_title = "Observed accuracy" 276 | # f.write(f"
  • {figure_title}
  • \n") 277 | 278 | # elif figure == "Summary_html/2_Observed_mismatch_proportion.png": 279 | # figure_title = "Mismatch proportion" 280 | # f.write(f"
  • {figure_title}
  • \n") 281 | 282 | # elif figure == "Summary_html/3_Homoploymer_summary.png": 283 | # figure_title = "Homopolymer identification" 284 | # f.write(f"
  • {figure_title}
  • \n") 285 | 286 | 287 | # elif figure == "Summary_html/1_Bin_distribution.png": 288 | # figure_title = "Bin distribution" 289 | # f.write(f"
  • {figure_title}
  • \n") 290 | 291 | # elif figure == "Summary_html/2_Relationship_normalization.png": 292 | # figure_title = "Relationship (depth and GC conetent)" 293 | # f.write(f"
  • {figure_title}
  • \n") 294 | 295 | # else: 296 | # continue 297 | 298 | # for category, figures in summary_figures.items(): 299 | # f.write(f"

    {category}

    \n") 300 | # for figure in figures: 301 | # figure_title = os.path.splitext(os.path.basename(figure))[0].replace('_', ' ').title() 302 | # f.write(f"
    \n") 303 | # f.write(f"

    {figure_title}

    \n") 304 | # # f.write(f"

    Description for {figure_title}.

    \n") 305 | # f.write(f"{figure_title}\n") 306 | # f.write(f"
    \n") 307 | 308 | f.write("
    \n") 309 | f.write("
    \n") 310 | f.write("\n\n") 311 | 312 | def summarize_giraffe_results(input_data): 313 | path = "Summary_html" 314 | giraffe_data = generate_giraffe_data(input_data) 315 | 316 | summary_figures = { 317 | "Estimate": [ 318 | f"{path}/1_Read_estimate_accuracy.png", 319 | f"{path}/2_Read_GC_content.png", 320 | f"{path}/3_Read_length.png" 321 | ], 322 | "Observe": [ 323 | f"{path}/1_Observed_read_accuracy.png", 324 | f"{path}/2_Observed_mismatch_proportion.png", 325 | f"{path}/3_Homoploymer_summary.png" 326 | ], 327 | "GC bias": [ 328 | f"{path}/1_Bin_distribution.png", 329 | f"{path}/2_Relationship_normalization.png" 330 | ]} 331 | 332 | generate_giraffe_html(giraffe_data, summary_figures, "Giraffe_Results/giraffe_report.html") 333 | -------------------------------------------------------------------------------- /Giraffe_View/giraffe: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import datetime 3 | import sys 4 | import argparse 5 | import pathlib 6 | from os import system 7 | from Giraffe_View.function import * 8 | from Giraffe_View.homopolymer import * 9 | from Giraffe_View.observed_read_accuracy import * 10 | from Giraffe_View.gc_bias import * 11 | from Giraffe_View.estimated_read_accuracy import * 12 | from Giraffe_View.regional_modification import * 13 | from Giraffe_View.plot import * 14 | from Giraffe_View.summary_html import * 15 | 16 | working_path = pathlib.Path().resolve() 17 | 18 | def estimated(args): 19 | mkdir_d("1_Estimated_quality") 20 | if args.read: 21 | input_dataset = loading_dataset(args.read) 22 | for data in input_dataset.keys(): 23 | now = datetime.datetime.now() 24 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start estimated read accuracy analysis!") 25 | # if args.less_memory: 26 | calculate_estimated_accuracy_slow(data, input_dataset[data]["path"], args.cpu) 27 | # else: 28 | # calculate_estimated_accuracy(data, input_dataset[data]["path"], args.cpu) 29 | 30 | elif args.unaligned: 31 | input_dataset = loading_dataset(args.unaligned) 32 | for data in input_dataset.keys(): 33 | now = datetime.datetime.now() 34 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start estimated read accuracy analysis!") 35 | bam2fastq(input_dataset[data]["path"], args.cpu) 36 | 37 | system("bash bam2fq.sh") 38 | # if args.less_memory: 39 | calculate_estimated_accuracy_slow(data, "giraffe_tmp.fastq", args.cpu) 40 | # else: 41 | # calculate_estimated_accuracy(data, "giraffe_tmp.fastq", args.cpu) 42 | system("rm bam2fq.sh giraffe_tmp.fastq") 43 | 44 | merge_results() 45 | 46 | if args.plot: 47 | now = datetime.datetime.now() 48 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start plotting!") 49 | plot_estimate() 50 | 51 | now = datetime.datetime.now() 52 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis finished!") 53 | else: 54 | now = datetime.datetime.now() 55 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis finished!") 56 | 57 | mes = "The results are available at " + str(working_path) + "/Giraffe_Results/1_Estimated_quality!" 
58 | now = datetime.datetime.now() 59 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(mes)) 60 | 61 | def observed(args): 62 | now = datetime.datetime.now() 63 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start data processing!") 64 | mkdir_d("2_Observed_quality") 65 | 66 | if args.read: 67 | if not args.ref: 68 | error_with_color("Please provide a reference!") 69 | 70 | input_dataset = loading_dataset(args.read) 71 | for data in input_dataset.keys(): 72 | now = datetime.datetime.now() 73 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start read mapping!") 74 | data_process(data, input_dataset[data]["type"], input_dataset[data]["path"], args.ref, args.cpu) 75 | bamfile = "Giraffe_Results/2_Observed_quality/" + str(data) + ".bam" 76 | 77 | now = datetime.datetime.now() 78 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start observed read accuracy analysis!") 79 | run_observed_accuracy(bamfile, data, args.cpu) 80 | 81 | temp_out = "Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt" 82 | with open("merge_supplementary.sh", "w") as ff: 83 | mes = "cat Giraffe_Results/2_Observed_quality/*_supplementary_*.txt > " + str(temp_out) 84 | ff.write(mes + "\n") 85 | ff.close() 86 | 87 | system("bash merge_supplementary.sh") 88 | supplementary_read_processing(data) 89 | 90 | system("rm merge_supplementary.sh Giraffe_Results/2_Observed_quality/*_supplementary_*.txt") 91 | system("rm Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt") 92 | 93 | now = datetime.datetime.now() 94 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start homopolymer analysis!") 95 | run_homopolymer_from_bam(bamfile, data, args.cpu) 96 | 97 | of = open("header", "w") 98 | of.write("pos\tnum_of_mat\tdepth\ttype\tGroup\n") 99 | of.close() 100 | 101 | output ="Giraffe_Results/2_Observed_quality/" + str(data) + ".homopolymer_in_reference.txt" 102 | ff = open("merge_homopolymer.sh", "w") 103 | ff.write("cat header Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt > " + str(output)) 104 | ff.close() 105 | 106 | system("bash merge_homopolymer.sh") 107 | system("rm header") 108 | system("rm merge_homopolymer.sh ") 109 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt") 110 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_detail_*.txt ") 111 | 112 | now = datetime.datetime.now() 113 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start summarizing the homopolymer results!") 114 | homopolymer_summary_2(data) 115 | 116 | elif args.aligned: 117 | input_dataset = loading_dataset(args.aligned) 118 | for data in input_dataset.keys(): 119 | bamfile = input_dataset[data]["path"] 120 | 121 | now = datetime.datetime.now() 122 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start observed read accuracy analysis!") 123 | 124 | if not os.path.exists(bamfile+".bai"): 125 | system("samtools index -@ " + str(args.cpu) + " " + bamfile) 126 | 127 | run_observed_accuracy(bamfile, data, args.cpu) 128 | 129 | temp_out = "Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt" 130 | with open("merge_supplementary.sh", "w") as ff: 131 | mes = "cat Giraffe_Results/2_Observed_quality/*_supplementary_*.txt > " + str(temp_out) 132 | ff.write(mes + "\n") 133 | ff.close() 134 | 135 | system("bash merge_supplementary.sh") 136 | supplementary_read_processing(data)
137 | 138 | system("rm merge_supplementary.sh Giraffe_Results/2_Observed_quality/*_supplementary_*.txt") 139 | system("rm Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt") 140 | 141 | now = datetime.datetime.now() 142 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start homopolymer analysis!") 143 | run_homopolymer_from_bam(bamfile, data, args.cpu) 144 | 145 | of = open("header", "w") 146 | of.write("pos\tnum_of_mat\tdepth\ttype\tGroup\n") 147 | of.close() 148 | 149 | output ="Giraffe_Results/2_Observed_quality/" + str(data) + ".homopolymer_in_reference.txt" 150 | ff = open("merge_homopolymer.sh", "w") 151 | ff.write("cat header Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt > " + str(output)) 152 | ff.close() 153 | 154 | system("bash merge_homopolymer.sh") 155 | system("rm header") 156 | system("rm merge_homopolymer.sh ") 157 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt") 158 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_detail_*.txt ") 159 | 160 | now = datetime.datetime.now() 161 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start summarizing the homopolymer results!") 162 | homopolymer_summary_2(data) 163 | 164 | elif args.unaligned: 165 | if not args.ref: 166 | error_with_color("Please provide a reference!") 167 | input_dataset = loading_dataset(args.unaligned) 168 | for data in input_dataset.keys(): 169 | now = datetime.datetime.now() 170 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start read mapping!") 171 | bam2fastq(input_dataset[data]["path"], args.cpu) 172 | system("bash bam2fq.sh") 173 | data_process(data, input_dataset[data]["type"], "giraffe_tmp.fastq", args.ref, args.cpu) 174 | system("rm bam2fq.sh giraffe_tmp.fastq") 175 | bamfile = "Giraffe_Results/2_Observed_quality/" + str(data) + ".bam" 176 | 177 | now = datetime.datetime.now() 178 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start observed read accuracy analysis!") 179 | run_observed_accuracy(bamfile, data, args.cpu) 180 | 181 | temp_out = "Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt" 182 | with open("merge_supplementary.sh", "w") as ff: 183 | mes = "cat Giraffe_Results/2_Observed_quality/*_supplementary_*.txt > " + str(temp_out) 184 | ff.write(mes + "\n") 185 | ff.close() 186 | 187 | system("bash merge_supplementary.sh") 188 | supplementary_read_processing(data) 189 | 190 | system("rm merge_supplementary.sh Giraffe_Results/2_Observed_quality/*_supplementary_*.txt") 191 | system("rm Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt") 192 | 193 | now = datetime.datetime.now() 194 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start homopolymer analysis!") 195 | run_homopolymer_from_bam(bamfile, data, args.cpu) 196 | 197 | of = open("header", "w") 198 | of.write("pos\tnum_of_mat\tdepth\ttype\tGroup\n") 199 | of.close() 200 | 201 | output ="Giraffe_Results/2_Observed_quality/" + str(data) + ".homopolymer_in_reference.txt" 202 | ff = open("merge_homopolymer.sh", "w") 203 | ff.write("cat header Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt > " + str(output)) 204 | ff.close() 205 | 206 | system("bash merge_homopolymer.sh") 207 | system("rm header") 208 | system("rm merge_homopolymer.sh ")
209 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt") 210 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_detail_*.txt ") 211 | 212 | now = datetime.datetime.now() 213 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start summarizing the homopolymer results!") 214 | homopolymer_summary_2(data) 215 | 216 | now = datetime.datetime.now() 217 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start merging the observed quality results!") 218 | merge_results_observed_acc() 219 | merge_results_observed_homopolymer() 220 | 221 | if args.plot: 222 | now = datetime.datetime.now() 223 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start plotting!") 224 | plot_observe_acc() 225 | plot_observe_homo() 226 | else: 227 | pass 228 | 229 | mes = "The results are available at " + str(working_path) + "/Giraffe_Results/2_Observed_quality!" 230 | now = datetime.datetime.now() 231 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(mes)) 232 | 233 | def GC_bias(args): 234 | mkdir_d("3_GC_bias") 235 | input_dataset = loading_dataset(args.aligned) 236 | for data in input_dataset.keys(): 237 | now = datetime.datetime.now() 238 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start GC bias analysis!") 239 | compute_GC_bias(args.ref, input_dataset[data]["path"], args.binsize, data, args.cpu) 240 | merge_GC_content_and_depth(args.binsize, data) 241 | 242 | merge_files() 243 | get_bin_number_within_GC_content() 244 | 245 | if args.plot: 246 | now = datetime.datetime.now() 247 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start plotting!") 248 | plot_GC_bias(input_binsize=str(args.binsize)) 249 | now = datetime.datetime.now() 250 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis finished!") 251 | else: 252 | now = datetime.datetime.now() 253 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis finished!") 254 | 255 | mes = "The results are available at " + str(working_path) + "/Giraffe_Results/3_GC_bias!" 256 | now = datetime.datetime.now() 257 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(mes)) 258 |
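# Note: the "write a .sh file, cat the partial results, rm the temp files"
# sequence above recurs in every branch of observed() and total(). A
# pure-Python helper along these lines could replace it; this is an
# illustrative sketch only and is not called anywhere in this script:
def merge_partial_files(pattern, output, header=None):
    import glob, os, shutil
    # Concatenate every file matching `pattern` into `output`, optionally
    # writing a header line first, then delete the merged parts.
    parts = sorted(glob.glob(pattern))
    with open(output, "w") as out:
        if header:
            out.write(header)
        for part in parts:
            with open(part) as fh:
                shutil.copyfileobj(fh, out)
            os.remove(part)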
259 | def methylation(args): 260 | now = datetime.datetime.now() 261 | mkdir_d("4_Regional_modification") 262 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis start!") 263 | 264 | 265 | input_dataset = loading_dataset(args.methyl) 266 | for data in input_dataset.keys(): 267 | now = datetime.datetime.now() 268 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + f"{data}" + ": Start regional modification analysis!") 269 | run_regional_methylation(input_dataset[data]["path"], args.region, data, args.cpu) 270 | 271 | now = datetime.datetime.now() 272 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Summarizing results!") 273 | system("cat Giraffe_Results/4_Regional_modification/Temp_methy_* > Giraffe_Results/4_Regional_modification/Regional_methylation_proportion.txt") 274 | system("rm Giraffe_Results/4_Regional_modification/Temp_methy_*") 275 | 276 | if args.plot: 277 | now = datetime.datetime.now() 278 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start plotting!") 279 | plot_modi_bin() 280 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis finished!") 281 | else: 282 | now = datetime.datetime.now() 283 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis finished!") 284 | 285 | now = datetime.datetime.now() 286 | mes = "[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] The results are available at " + str(working_path) + "/Giraffe_Results/4_Regional_modification!" 287 | print_with_color(str(mes)) 288 |
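# For reference, the --region file consumed by methylation() above is
# tab-separated with columns Chromosome, Start, End, Region_name (per the
# --region help text in the argument definitions); the values here are
# illustrative only:
#   contig_A    0       1000    promoter_1
#   contig_A    5000    6500    promoter_2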
Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt") 332 | 333 | now = datetime.datetime.now() 334 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start homopolymer analysis!") 335 | run_homopolymer_from_bam(bamfile, data, args.cpu) 336 | 337 | of = open("header", "w") 338 | of.write("pos\tnum_of_mat\tdepth\ttype\tGroup\n") 339 | of.close() 340 | 341 | output ="Giraffe_Results/2_Observed_quality/" + str(data) + ".homopolymer_in_reference.txt" 342 | ff = open("merge_homopolymer.sh", "w") 343 | ff.write("cat header Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt > " + str(output)) 344 | ff.close() 345 | 346 | system("bash merge_homopolymer.sh") 347 | system("rm header") 348 | system("rm merge_homopolymer.sh ") 349 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt") 350 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_detail_*.txt ") 351 | 352 | now = datetime.datetime.now() 353 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start summarize the homopolymer results!") 354 | homopolymer_summary_2(data) 355 | 356 | # gc bias 357 | now = datetime.datetime.now() 358 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start GC bias analysis!") 359 | compute_GC_bias(args.ref, bamfile, args.binsize, data, args.cpu) 360 | merge_GC_content_and_depth(args.binsize, data) 361 | 362 | elif args.unaligned: 363 | input_dataset = loading_dataset(args.unaligned) 364 | data_table = args.unaligned 365 | for data in input_dataset.keys(): 366 | 367 | # estimate 368 | now = datetime.datetime.now() 369 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start estimated read accuracy analysis!") 370 | bam2fastq(input_dataset[data]["path"], args.cpu) 371 | system("bash bam2fq.sh") 372 | # if args.less_memory: 373 | calculate_estimated_accuracy_slow(data, "giraffe_tmp.fastq", args.cpu) 374 | # else: 375 | # calculate_estimated_accuracy(data, "giraffe_tmp.fastq", args.cpu) 376 | 377 | # observe 378 | now = datetime.datetime.now() 379 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start read mapping!") 380 | data_process(data, input_dataset[data]["type"], "giraffe_tmp.fastq", args.ref, args.cpu) 381 | system("rm bam2fq.sh giraffe_tmp.fastq") 382 | bamfile = "Giraffe_Results/2_Observed_quality/" + str(data) + ".bam" 383 | 384 | now = datetime.datetime.now() 385 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start observed read accuracy analysis!") 386 | run_observed_accuracy(bamfile, data, args.cpu) 387 | 388 | temp_out = "Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt" 389 | with open("merge_supplementary.sh", "w") as ff: 390 | mes = "cat Giraffe_Results/2_Observed_quality/*_supplementary_*.txt > " + str(temp_out) 391 | ff.write(mes + "\n") 392 | ff.close() 393 | 394 | system("bash merge_supplementary.sh") 395 | supplementary_read_processing(data) 396 | 397 | system("rm merge_supplementary.sh Giraffe_Results/2_Observed_quality/*_supplementary_*.txt") 398 | system("rm Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt") 399 | 400 | now = datetime.datetime.now() 401 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start homopolymer analysis!") 402 | run_homopolymer_from_bam(bamfile, data, args.cpu) 403 | 404 | of = open("header", "w") 405 | 
of.write("pos\tnum_of_mat\tdepth\ttype\tGroup\n") 406 | of.close() 407 | 408 | output ="Giraffe_Results/2_Observed_quality/" + str(data) + ".homopolymer_in_reference.txt" 409 | ff = open("merge_homopolymer.sh", "w") 410 | ff.write("cat header Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt > " + str(output)) 411 | ff.close() 412 | 413 | system("bash merge_homopolymer.sh") 414 | system("rm header") 415 | system("rm merge_homopolymer.sh ") 416 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt") 417 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_detail_*.txt ") 418 | 419 | now = datetime.datetime.now() 420 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start summarize the homopolymer results!") 421 | homopolymer_summary_2(data) 422 | 423 | # gc bias 424 | now = datetime.datetime.now() 425 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start GC bias analysis!") 426 | compute_GC_bias(args.ref, bamfile, args.binsize, data, args.cpu) 427 | merge_GC_content_and_depth(args.binsize, data) 428 | 429 | # merge results 430 | now = datetime.datetime.now() 431 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start merge the results!") 432 | # estimate 433 | merge_results() 434 | # observe 435 | merge_results_observed_acc() 436 | merge_results_observed_homopolymer() 437 | # gc_bias 438 | merge_files() 439 | get_bin_number_within_GC_content() 440 | 441 | 442 | # plot 443 | now = datetime.datetime.now() 444 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start plotting!") 445 | 446 | plot_estimate() 447 | plot_estimate("png","Giraffe_Results/Summary_html") 448 | plot_observe_acc() 449 | plot_observe_acc("png","Giraffe_Results/Summary_html") 450 | plot_observe_homo() 451 | plot_observe_homo("png","Giraffe_Results/Summary_html") 452 | plot_GC_bias(input_binsize=str(args.binsize)) 453 | plot_GC_bias(input_binsize=str(args.binsize), format="png", path="Giraffe_Results/Summary_html") 454 | 455 | now = datetime.datetime.now() 456 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start summarizing!") 457 | summarize_giraffe_results(data_table) 458 | 459 | now = datetime.datetime.now() 460 | mes = "The results are available at " + str(working_path) + "/Giraffe_Results!" 461 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis finished!") 462 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(mes)) 463 | 464 | if __name__ == '__main__': 465 | 466 | version = "0.2.3" 467 | 468 | parser = argparse.ArgumentParser(description="", 469 | usage="\n %(prog)s [subcommands] [options] # Users can execute subcommands as needed to perform specific tasks." 470 | "\n %(prog)s --read --ref --cpu # Running function of estimate, observe, and gcbias with FASTQ reads." 471 | "\n %(prog)s --read --ref --cpu # Running function of estimate, observe, and gcbias with unaligned SAM/BAM reads." 472 | "\n\nexample for table (sample_ID data_type file_path):\n" 473 | " sample_A ONT /home/user/data/S1.fastq\n" 474 | " sample_B ONT /home/user/data/S2.fastq\n" 475 | " sample_C ONT /home/user/data/S3.fastq\n" 476 | " ..." 
477 | "\n\nnote:\n" 478 | " version: " + str(version) + "\n" 479 | " data_type: ONT, ONT_RNA, or Pacbio\n" 480 | " For more details, please refer to the documentation: https://giraffe-documentation.readthedocs.io/en/latest.") 481 | 482 | parser.add_argument("--read", type=str, metavar="", required=False, help="table of FASTQ read files") 483 | parser.add_argument("--unaligned", type=str, metavar="", required=False, help="table of the unaligned SAM/BAM files") 484 | parser.add_argument("--ref", type=str, metavar="", required=False, help="reference file") 485 | parser.add_argument("--cpu", type=int, metavar="", required=False, help="number of processes or threads (recommend to set this equal to the number of chromosomes, default:10)", default=10) 486 | parser.add_argument("--binsize", type=int, metavar="", required=False, help="reference will be split into bins of the specified size (default:1000)", default=1000) 487 | # parser.add_argument("--plot", required=False, help="results visualization", action='store_true') 488 | # parser.add_argument("--less_memory", required=False, help="using less memory but takes more time to complete the estimated analysis.", action='store_true') 489 | 490 | 491 | 492 | # Define subparsers 493 | subparsers = parser.add_subparsers(dest='function', help=None, description=None, prog="giraffe", metavar=" subcommand and function") 494 | 495 | estimated_parser = subparsers.add_parser('estimate', help='Estimated accuracy, length, and GC content.', 496 | usage='\n %(prog)s --read # For the FASTQ reads.\n' 497 | ' %(prog)s --unaligned # For the unaligned SAM/BAM files.' 498 | "\n\nexample for table (sample_ID data_type file_path):\n" 499 | " sample_A ONT /home/user/data/S1.fastq\n" 500 | " sample_B ONT /home/user/data/S2.fastq\n" 501 | " sample_C ONT /home/user/data/S3.fastq\n" 502 | " ..." 503 | "\n\nnote:\n" 504 | " version: " + str(version) + "\n" 505 | " data_type: ONT, ONT_RNA, or Pacbio\n" 506 | " For more details, please refer to the documentation: https://giraffe-documentation.readthedocs.io/en/latest.") 507 | 508 | estimated_parser.add_argument("--read", type=str, metavar="", required=False, help="table of FASTQ read files") 509 | estimated_parser.add_argument("--unaligned", type=str, metavar="", required=False, help="table of the unaligned SAM/BAM files") 510 | estimated_parser.add_argument("--cpu", type=int, metavar="", required=False, help="number of processes or threads (default:10)", default=10) 511 | estimated_parser.add_argument("--plot", required=False, help="results visualization", action='store_true') 512 | # estimated_parser.add_argument("--less_memory", required=False, help="using less memory but takes more time to complete the task", action='store_true') 513 | 514 | observed_parser = subparsers.add_parser('observe', help='Observed accuracy, mismatch proportion, and homopolymer identification.', 515 | usage="\n %(prog)s --aligned \t\t\t\t# For aligned SAM/BAM files. Please remove the secondary alignment (--secondary=no) and add MD tag (--MD) during mapping!\n" 516 | " %(prog)s --read --ref \t\t\t# For FASTQ reads.\n" 517 | " %(prog)s --unaligned --ref \t# For unaligned SAM/BAM files." 
518 | "\n\nexample for table (sample_ID data_type file_path):\n" 519 | " sample_A ONT /home/user/data/S1.fastq\n" 520 | " sample_B ONT /home/user/data/S2.fastq\n" 521 | " sample_C ONT /home/user/data/S3.fastq\n" 522 | "\n\nnote:\n" 523 | " version: " + str(version) + "\n" 524 | " data_type: ONT, ONT_RNA, or Pacbio\n" 525 | " For more details, please refer to the documentation: https://giraffe-documentation.readthedocs.io/en/latest.") 526 | 527 | observed_parser.add_argument("--read", type=str, metavar="", required=False, help="table of the FASTQ read files") 528 | observed_parser.add_argument("--aligned", type=str, metavar="", required=False, help="table of the aligned SAM/BAM files") 529 | observed_parser.add_argument("--unaligned", type=str, metavar="", required=False, help="table of the unaligned SAM/BAM files") 530 | observed_parser.add_argument("--ref", type=str, metavar="", required=False, help="reference file") 531 | observed_parser.add_argument("--cpu", type=int, metavar="", required=False, help="number of processes or threads (recommend to set this equal to the number of chromosomes, default:10)", default=10) 532 | observed_parser.add_argument("--plot", required=False, help="results visualization", action='store_true') 533 | 534 | GC_bias_parser = subparsers.add_parser('gcbias', help='Relationship between GC content and sequencing depth.', 535 | usage="\n %(prog)s --ref --aligned --binsize 5000 --cpu 24\n\n" 536 | "example for table (sample_ID data_type file_path):\n" 537 | " sample_A ONT /home/user/data/S1.sort.bam\n" 538 | " sample_B ONT /home/user/data/S2.sort.bam\n" 539 | " sample_C ONT /home/user/data/S3.sort.bam\n" 540 | " ..." 541 | "\n\nnote:\n" 542 | " version: " + str(version) + "\n" 543 | " data_type: ONT, ONT_RNA, or Pacbio\n" 544 | " For more details, please refer to the documentation: https://giraffe-documentation.readthedocs.io/en/latest.") 545 | GC_bias_parser.add_argument("--ref", type=str, metavar="", required=True, help="reference file") 546 | GC_bias_parser.add_argument("--aligned", type=str, metavar="", required=True, help="table of sorted SAM/BAM files") 547 | GC_bias_parser.add_argument("--binsize", type=int, metavar="", required=False, help="reference will be split into bins of the specified size (default:1000)", default=1000) 548 | GC_bias_parser.add_argument("--plot", required=False, help="results visualization", action='store_true') 549 | GC_bias_parser.add_argument("--cpu", type=int, metavar="", required=False, help="number of processes or threads (recommend to set this equal to the number of chromosomes, default:10)", default=10) 550 | 551 | methylation_parser = subparsers.add_parser('modbin', help='Average modification proportion at regional level.', 552 | usage="\n %(prog)s --methyl --region \n\n" 553 | "example for table (sample_ID data_type file_path):\n" 554 | " sample_A ONT /home/user/data/S1_5mC.txt\n" 555 | " sample_B ONT /home/user/data/S2_5mC.txt\n" 556 | " sample_C ONT /home/user/data/S3_5mC.txt\n" 557 | " ..." 558 | "\n\nexample for methylation file (Chrom Start End Value):\n" 559 | " contig_A\t132\t133\t0.92\n" 560 | " contig_A\t255\t256\t0.27\n" 561 | " contig_A\t954\t955\t0.52\n" 562 | " ..." 
563 | "\n\nnote:\n" 564 | " version: " + str(version) + "\n" 565 | " data_type: ONT, ONT_RNA, or Pacbio\n" 566 | " For more details, please refer to the documentation: https://giraffe-documentation.readthedocs.io/en/latest.") 567 | 568 | 569 | methylation_parser.add_argument("--methyl", type=str, metavar="", required=True, help="table of methylation files") 570 | methylation_parser.add_argument("--region", type=str, metavar="", required=True, help="target region file (Chromosome\tStart\tEnd\tRegion_name)") 571 | methylation_parser.add_argument("--cpu", type=int, metavar="", required=False, help="number of processes or threads (recommend to set this equal to the number of chromosomes, default:10)", default=10) 572 | methylation_parser.add_argument("--plot", required=False, help="results visualization", action='store_true') 573 | args = parser.parse_args() 574 | 575 | # Add function to print help if no arguments are provided 576 | if len(sys.argv) == 1: 577 | parser.print_help(sys.stderr) 578 | sys.exit(1) 579 | 580 | # Call the appropriate function based on the subparser used 581 | if args.function == "observe": 582 | if len(sys.argv) == 2: 583 | observed_parser.print_help(sys.stderr) 584 | sys.exit(1) 585 | else: 586 | observed(args) 587 | 588 | elif args.function == "modbin": 589 | methylation(args) 590 | 591 | elif args.function == "gcbias": 592 | GC_bias(args) 593 | 594 | elif args.function == "estimate": 595 | if len(sys.argv) == 2: 596 | estimated_parser.print_help(sys.stderr) 597 | sys.exit(1) 598 | else: 599 | estimated(args) 600 | else: 601 | total(args) 602 | #create a summary in html 603 | --------------------------------------------------------------------------------