├── Results
│   ├── readme
│   ├── workflow.png
│   └── giraffe_logo.png
├── Giraffe_View
│   ├── __init__.py
│   ├── giraffe_run_demo
│   ├── regional_modification.py
│   ├── function.py
│   ├── estimated_read_accuracy.py
│   ├── homopolymer.py
│   ├── observed_read_accuracy.py
│   ├── gc_bias.py
│   ├── giraffe_plot
│   ├── plot.py
│   ├── summary_html.py
│   └── giraffe
├── scripts
│   ├── renormalization_sequencing_bias
│   ├── homopolymer_count
│   └── replot_sequencing_bias
├── LICENSE
├── setup.py
└── README.md
/Results/readme:
--------------------------------------------------------------------------------
1 | The results of the demo dataset!
2 |
--------------------------------------------------------------------------------
/Giraffe_View/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals, absolute_import
2 |
--------------------------------------------------------------------------------
/Results/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lrslab/Giraffe_View/HEAD/Results/workflow.png
--------------------------------------------------------------------------------
/Results/giraffe_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lrslab/Giraffe_View/HEAD/Results/giraffe_logo.png
--------------------------------------------------------------------------------
/scripts/renormalization_sequencing_bias:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import argparse
4 | import pandas as pd
5 |
6 | if __name__ == "__main__":
7 |     parser = argparse.ArgumentParser(description="A script that renormalizes the sequencing depth over a user-specified GC-content range.")
8 |     parser.add_argument("-i", "--input", type=str, metavar="", required=True, help="the GC-bias relationship table produced by giraffe")
9 |     parser.add_argument("-l", "--left", type=int, metavar="", required=True, help="the lower bound of the GC-content range (in percent)")
10 |     parser.add_argument("-r", "--right", type=int, metavar="", required=True, help="the upper bound of the GC-content range (in percent)")
11 |     parser.add_argument("-o", "--out", type=str, metavar="", required=True, help="name of the output file")
12 |     args = parser.parse_args()
13 |
14 |     df = pd.read_csv(args.input, sep=r"\s+")
15 |     nor_df = df[(args.left <= df["GC_content"]) & (df["GC_content"] <= args.right)].copy()
16 |
17 |     ave_dp = nor_df["Depth"].mean()
18 |     nor_df["Normalized_depth"] = nor_df.apply(lambda row: row["Depth"]/ave_dp, axis=1)
19 |     nor_df.to_csv(args.out, sep="\t", index=False, header=True)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Raymond
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="Giraffe_View", 8 | version="0.2.3", 9 | author="Xudong Liu", 10 | author_email="xudongliu98@gmail.com", 11 | description="Giraffe_View is specially designed to provide a comprehensive assessment of the accuracy of long-read sequencing datasets obtained from both the PacBio and Nanopore platforms.", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/lxd98/Giraffe_View", 15 | packages=setuptools.find_packages(), 16 | classifiers=[ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ], 21 | python_requires = '>=3', 22 | install_requires=[ 23 | 'pysam >= 0.17.0', 24 | 'numpy >= 1.7.0', 25 | 'pandas >= 1.5.0', 26 | 'seaborn >= 0.13.2', 27 | 'termcolor >= 2.0.0', 28 | 'biopython >= 1.6.2' 29 | ], 30 | scripts = ["Giraffe_View/giraffe","Giraffe_View/giraffe_run_demo", "Giraffe_View/giraffe_plot", "scripts/homopolymer_count", "scripts/renormalization_sequencing_bias"] 31 | ) 32 | -------------------------------------------------------------------------------- /Giraffe_View/giraffe_run_demo: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from os import system 3 | 4 | # get the table list 5 | system("wget https://figshare.com/ndownloader/files/44967445 -O read.txt") 6 | system("wget https://figshare.com/ndownloader/files/44967442 -O methyl.txt") 7 | system("wget https://figshare.com/ndownloader/files/44967499 -O aligned.txt") 8 | 9 | # get the reference and ONT reads (R10.4.1 and R9.4.1) of E.coli 10 | system("wget https://figshare.com/ndownloader/files/44967436 -O Read.tar.gz") 11 | 12 | # The 5mC methylation files of zebrafish blood and kidney samples. 13 | # The bed file is the gene promoter position in chromosome 1. 
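14 | # Note: after the sed step below, zf_promoter.db becomes a tab-separated target
15 | # file in the four-column layout that regional_modification.py expects
16 | # (CHROM, START, END, ID); an illustrative row (hypothetical coordinates):
17 | #   chr1    12035   13035   promoter_geneA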
14 | system("wget https://figshare.com/ndownloader/files/44967427 -O Methylation.tar.gz") 15 | 16 | system("tar -xzvf Read.tar.gz") 17 | system("tar -xzvf Methylation.tar.gz") 18 | system("sed -i 's/,/\t/g' Methylation/zf_promoter.db") 19 | system("rm Read.tar.gz Methylation.tar.gz") 20 | 21 | system("giraffe --read read.txt --ref Read/ecoli_chrom.fa --cpu 2") 22 | system("giraffe estimate --read read.txt --plot --cpu 2") 23 | system("giraffe observe --read read.txt --ref Read/ecoli_chrom.fa --plot --cpu 2") 24 | system("giraffe observe --aligned aligned.txt --plot --cpu 2") 25 | system("giraffe gcbias --ref Read/ecoli_chrom.fa --aligned aligned.txt --plot --cpu 2") 26 | system("giraffe modbin --methyl methyl.txt --region Methylation/zf_promoter.db --plot --cpu 2") -------------------------------------------------------------------------------- /scripts/homopolymer_count: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import argparse 5 | 6 | if __name__ == '__main__': 7 | parser = argparse.ArgumentParser(description="A script to count the position and type of homopolymer in your reference.") 8 | parser.add_argument("--ref", type=str, metavar="", required=True, help="Input reference (FASTA)") 9 | args = parser.parse_args() 10 | 11 | database = {} 12 | for read in SeqIO.parse(args.ref, "fasta"): 13 | # read.id 14 | # read.seq 15 | database[read.id] = {} 16 | 17 | count = 0 18 | number = 1 19 | tmp = [] 20 | frame = "" 21 | 22 | for base in read.seq: 23 | if len(tmp) == 0 and frame == "": 24 | frame = base.upper() 25 | tmp.append(frame) 26 | count += 1 27 | 28 | else: 29 | if frame == base.upper(): 30 | tmp.append(frame) 31 | count += 1 32 | 33 | else: 34 | if len(tmp) >= 3: 35 | database[read.id][str(number)] = {} 36 | database[read.id][str(number)]["length"] = len(tmp) 37 | database[read.id][str(number)]["position"] = count 38 | database[read.id][str(number)]["base"] = tmp[0] 39 | number += 1 40 | 41 | tmp =[] 42 | frame = base.upper() 43 | tmp.append(frame) 44 | count += 1 45 | 46 | for k in database.keys(): 47 | # ref = string(k) 48 | for n in database[k].keys(): 49 | start = database[k][n]["position"] - database[k][n]["length"] + 1 50 | end = database[k][n]["position"] 51 | basetype = database[k][n]["base"] 52 | feature = str(database[k][n]["length"]) + str(database[k][n]["base"]) 53 | mes = str(k) + "\t" + str(start) + "\t" + str(end) + "\t" + str(basetype) + "\t" + str(feature) 54 | print(mes) 55 | -------------------------------------------------------------------------------- /scripts/replot_sequencing_bias: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from plotnine import * 3 | import argparse 4 | import pandas as pd 5 | import numpy as np 6 | import warnings 7 | warnings.filterwarnings('ignore') 8 | 9 | def plot_GC_bias(input_file, output_name): 10 | df1=pd.read_csv(input_file, sep="\t") 11 | 12 | dif = df1["GC_content"].max() - df1["GC_content"].min() 13 | if dif <= 15: 14 | gc_breaks = [i for i in range(0, 101, 1)] 15 | elif dif <= 30: 16 | gc_breaks = [i for i in range(0, 101, 2)] 17 | elif dif <= 50: 18 | gc_breaks = [i for i in range(0, 101, 5)] 19 | else: 20 | gc_breaks = [i for i in range(0, 101, 10)] 21 | 22 | GC_bias=( 23 | ggplot(df1, aes(x="GC_content", y="Normalized_depth", 24 | group="Group", fill="Group", color="Group")) + 25 | geom_hline(aes(yintercept=1), color="grey", linetype="dotted") + 26 | 
geom_line(size=1.5, alpha=.3) + 27 | geom_point(size=1.5,color="black") + 28 | scale_y_continuous(name="Normalized depth", 29 | limits=[0, 2], breaks=np.arange(0, 2.1, 0.2)) + 30 | theme_classic() + 31 | scale_x_continuous(name="GC content (%)", breaks=gc_breaks) + 32 | theme(axis_text=element_text(size=12, color="black"), 33 | axis_title=element_text(size=12, color="black"), 34 | legend_title = element_blank(), 35 | legend_text = element_text(size=12, color="black"), 36 | legend_position = "bottom" 37 | ) 38 | ) 39 | 40 | GC_bias.save(filename = output_name+".pdf", width=8, height=5, dpi=300, path="./") 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser(description="") 44 | parser.add_argument("-i", "--input", type=str, metavar="", required=True, help="") 45 | parser.add_argument("-o", "--out", type=str, metavar="", required=True, help="") 46 | args = parser.parse_args() 47 | 48 | plot_GC_bias(args.input, args.out) 49 | -------------------------------------------------------------------------------- /Giraffe_View/regional_modification.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import multiprocessing 3 | 4 | def run_regional_methylation(input_methyl, input_target, sample_ID, num_processes): 5 | # Read methylation and target data from the input files 6 | methyl = pd.read_csv(input_methyl, sep='\t', header=None, names=["CHROM", "START", "END", "VALUE"]) 7 | target = pd.read_csv(input_target, sep='\t', header=None, names=["CHROM", "START", "END", "ID"]) 8 | 9 | # Determine if VALUE is in range 0-1 or 0-100 10 | max_value = methyl['VALUE'].max() 11 | value_scale = 100 if max_value > 1 else 1 12 | 13 | # Get the unique chromosomes from the target data 14 | unique_chromosomes = set(target['CHROM']) 15 | 16 | # Create a multiprocessing pool with the specified number of processes 17 | with multiprocessing.Pool(processes=num_processes) as pool: 18 | jobs = [] 19 | for chromosome in unique_chromosomes: 20 | # Filter the methylation and target data for the current chromosome 21 | sub_methyl = methyl[methyl["CHROM"] == chromosome][["START", "END", "VALUE"]] 22 | sub_target = target[target["CHROM"] == chromosome][["START", "END", "ID"]] 23 | # Create an asynchronous job for processing the data 24 | jobs.append(pool.apply_async(regional_methylation_bed_worker, (sub_methyl, sub_target, str(sample_ID), chromosome, value_scale))) 25 | 26 | # Wait for all jobs to complete 27 | for job in jobs: 28 | job.get() 29 | 30 | def regional_methylation_bed_worker(input_methyl, input_target, sample_ID, chromosome, value_scale): 31 | # Open the output file for writing 32 | with open(f"Giraffe_Results/4_Regional_modification/Temp_methy_{sample_ID}_{chromosome}.txt", "w") as ff: 33 | # Iterate over each row in the target data 34 | for row in input_target.itertuples(index=True, name='Pandas'): 35 | start = row.START 36 | end = row.END 37 | target_ID = row.ID 38 | 39 | # Filter the methylation data for the current region 40 | target_data = input_methyl[(start <= input_methyl["START"]) & (input_methyl["END"] <= end)] 41 | 42 | # Calculate the mean methylation value and scale it 43 | mean_methylation = target_data["VALUE"].mean() / value_scale 44 | 45 | # Write the result to the output file 46 | ff.write(f"{target_ID}\t{mean_methylation:.4f}\t{sample_ID}\n") 47 | -------------------------------------------------------------------------------- /Giraffe_View/function.py: 
--------------------------------------------------------------------------------
1 | import subprocess
2 | import re
3 | import os
4 | from termcolor import colored
5 | from subprocess import Popen, PIPE
6 | import pandas as pd
7 |
8 | def print_with_color(input_string):
9 |     print(colored(input_string, "green"))
10 |
11 | def error_with_color(input_string):
12 |     print(colored(input_string, "red"))
13 |
14 | def loading_dataset(input_file):
15 |     dataset = {}
16 |     with open(input_file) as ff:
17 |         for l in ff:
18 |             l = l.replace("\n", "")
19 |             l = l.split()
20 |
21 |             # check that the data file exists
22 |             if os.path.exists(l[2]):
23 |                 dataset[l[0]] = {}
24 |                 dataset[l[0]]["type"] = l[1]
25 |                 dataset[l[0]]["path"] = l[2]
26 |
27 |             else:
28 |                 error_with_color("Please check the path of " + str(l[0]) + "!")
29 |
30 |     return dataset
31 |
32 | def cmd_shell(commands, string):
33 |     process = Popen(commands.split(' '), stdout=subprocess.DEVNULL, universal_newlines=True)
34 |     process.wait()
35 |     err = process.communicate()
36 |
37 |     if process.returncode == 0:
38 |         # print('{} SUCCESS'.format(string))
39 |         pass
40 |     else:
41 |         # print('{} FAILED'.format(string))
42 |         error_with_color(err)
43 |
44 | def mkdir_d(input_name):
45 |     mes = "Giraffe_Results/" + str(input_name)
46 |     cmd = ["mkdir", "-p", str(mes)]
47 |     subprocess.run(cmd, check=True)
48 |
49 | def count_indel_and_snv(align_str):
50 |     counts = {}
51 |     for i in align_str:
52 |         counts[i] = counts.get(i, 0) + 1
53 |     return counts
54 |
55 |
56 | def bam2fastq(input_bam, CPU):
57 |     with open("bam2fq.sh", "w") as ff:
58 |         ff.write("samtools fastq " + str(input_bam) + " -@ " + str(CPU) + " > giraffe_tmp.fastq")
59 |     ff.close()
60 |
61 | # remove the insertions (I) at the tail of the string
62 | def remove_I(string):
63 |     while string[-1] == "I":
64 |         string = string[:-1]
65 |     return(string)
66 |
67 | # remove soft (S) and hard (H) clips from the CIGAR and return the matched pairs
68 | def remove_clip_list(input_cigar, input_pairs, input_ID):
69 |     remove_cigarstring = re.findall(r"\d+[SH]", input_cigar)
70 |     # HH & 0H & H0 & 00
71 |     if ((len(remove_cigarstring) == 2) and (remove_cigarstring[0][-1] == remove_cigarstring[1][-1] == "H")) or ((len(remove_cigarstring) == 1) and (remove_cigarstring[-1][-1] == "H")) or (len(remove_cigarstring) == 0):
72 |         valid_pairs = input_pairs
73 |     # SS
74 |     elif (len(remove_cigarstring) == 2) and (remove_cigarstring[0][-1] == remove_cigarstring[1][-1] == "S"):
75 |         remove_start_site = int(remove_cigarstring[0][:-1])
76 |         tmp_pairs = input_pairs[remove_start_site:]
77 |         remove_end_site = int(remove_cigarstring[1][:-1])
78 |         valid_pairs = tmp_pairs[:len(tmp_pairs)-remove_end_site]
79 |     # 0S & HS
80 |     elif ((len(remove_cigarstring) == 1) and (input_cigar[-1] == "S")) or (len(remove_cigarstring) == 2) and (remove_cigarstring[0][-1] == "H") and ((remove_cigarstring[1][-1] == "S")):
81 |         remove_end_site = int(remove_cigarstring[-1][:-1])
82 |         valid_pairs = input_pairs[:len(input_pairs)-remove_end_site]
83 |     # S0 & SH
84 |     elif (len(remove_cigarstring) == 1) and (input_cigar[-1] != "S") or ((len(remove_cigarstring) == 2) and (remove_cigarstring[0][-1] == "S") and (remove_cigarstring[1][-1] == "H")):
85 |         remove_start_site = int(remove_cigarstring[0][:-1])
86 |         valid_pairs = input_pairs[remove_start_site:]
87 |     else:
88 |         print(str(input_ID) + ", please recheck this CIGAR and MD!")
89 |     return(valid_pairs)
90 |
91 | """
92 | only for base A T G C
93 | (read_position, ref_position, "ref_base")
94 | none    √       √               Deletion(D)
95 | √       none    none            Insertion(I)
96 | √       √       N(A,T,G,C)      Match(M)
97
| √ √ n(a,t,g,c) Substitution(S) 98 | """ 99 | def get_base_alignment(input_list): 100 | map_list = ["A", "T", "G", "C"] 101 | result = "" 102 | if input_list[0] == None: 103 | result = "D" # D = deletion 104 | else: 105 | if input_list[1] == None: 106 | result = "I" # I = insertion 107 | else: 108 | if input_list[2] in map_list: 109 | result = "M" # M = match 110 | else: 111 | result = "S" # S = substitution 112 | return result 113 | 114 | def process_in_chunks(file_path, chunk_size=10000): 115 | chunks = pd.read_csv(file_path, chunksize=chunk_size, sep="\t") 116 | results = [] 117 | for chunk in chunks: 118 | results.append(chunk) 119 | return pd.concat(results) -------------------------------------------------------------------------------- /Giraffe_View/estimated_read_accuracy.py: -------------------------------------------------------------------------------- 1 | from os import system 2 | import numpy as np 3 | import math 4 | import multiprocessing 5 | import gzip 6 | 7 | def GC_content(string): 8 | read = str(string).upper() 9 | length = len(read) 10 | c = read.count("C") 11 | g = read.count("G") 12 | GC = (c+g)/length 13 | return[length, GC] 14 | 15 | def Qvalue_to_accuracy(string): 16 | error_list = [] 17 | for base_value in string: 18 | ascii_value = ord(base_value) - 33 19 | error_proporation = math.pow(10, (-1) * int(ascii_value) / 10) 20 | error_list.append(error_proporation) 21 | error_mean = np.mean(error_list) 22 | return [1 - error_mean, error_mean, (-10) * math.log10(error_mean)] 23 | 24 | def process_chunk(chunk): 25 | results = [] 26 | for line in chunk: 27 | read_id, sequence, quality = line 28 | GC = GC_content(sequence) 29 | quality = Qvalue_to_accuracy(quality) 30 | results.append([read_id, quality[0], quality[1], quality[2], GC[0], GC[1]]) 31 | return results 32 | 33 | def calculate_estimated_accuracy(input_type, input_file, num_processes, chunk_size=1000): 34 | pool = multiprocessing.Pool(processes=num_processes) 35 | results = [] 36 | whether_compressed = "" 37 | 38 | 39 | # judge whether input file is compressed (.gz) or not 40 | if input_file.endswith('.gz'): 41 | open_func = gzip.open 42 | whether_compressed = "yes" 43 | 44 | else: 45 | open_func = open 46 | whether_compressed = "no" 47 | 48 | with open_func(input_file, "r") as input_file: 49 | count = 1 50 | chunk = [] 51 | for line in input_file: 52 | line = line.strip() 53 | 54 | if whether_compressed == "yes": 55 | line = line.decode('ascii') 56 | 57 | if count % 4 == 1: 58 | read_id = line.split(" ")[0] 59 | elif count % 4 == 2: 60 | sequence = line 61 | elif count % 4 == 0: 62 | chunk.append((read_id, sequence, line)) 63 | 64 | count += 1 65 | 66 | if len(chunk) == chunk_size: 67 | results.append(pool.apply_async(process_chunk, (chunk,))) 68 | chunk = [] 69 | 70 | if len(chunk) > 0: 71 | results.append(pool.apply_async(process_chunk, (chunk,))) 72 | pool.close() 73 | pool.join() 74 | input_file.close() 75 | 76 | file = "Giraffe_Results/1_Estimated_quality/" + str(input_type) + ".tmp" 77 | with open(file, "w") as output_file: 78 | for result in results: 79 | for line in result.get(): 80 | message = f"{line[0]}\t{line[1]:.4f}\t{line[2]:.4f}\t{line[3]:.4f}" 81 | message += f"\t{line[4]}\t{line[5]:.4f}\t{input_type}" 82 | output_file.write(message + "\n") 83 | output_file.close() 84 | output_file.close() 85 | 86 | def process_chunk_slow(queue, output_files, input_type): 87 | while True: 88 | chunk = queue.get() 89 | if chunk is None: 90 | break 91 | output_file = output_files[chunk['process_id']] 92 | with 
open(output_file, "a") as f: 93 | for line in chunk['data']: 94 | read_id, sequence, quality = line 95 | GC = GC_content(sequence) 96 | quality = Qvalue_to_accuracy(quality) 97 | message = (f"{read_id}\t{quality[0]:.4f}\t{quality[1]:.4f}\t{quality[2]:.4f}\t" 98 | f"{GC[0]}\t{GC[1]:.4f}\t{input_type}") 99 | f.write(message + "\n") 100 | 101 | def calculate_estimated_accuracy_slow(input_type, input_file, num_processes, chunk_size=1000): 102 | output_dir = "Giraffe_Results/1_Estimated_quality/" 103 | output_files = [f"{output_dir}{input_type}_{i}.tmp" for i in range(num_processes)] 104 | queue = multiprocessing.Queue(maxsize=num_processes * 2) 105 | 106 | if input_file.endswith('.gz'): 107 | open_func = gzip.open 108 | else: 109 | open_func = open 110 | 111 | workers = [] 112 | for process_id in range(num_processes): 113 | worker = multiprocessing.Process(target=process_chunk_slow, args=(queue, output_files, input_type)) 114 | worker.start() 115 | workers.append(worker) 116 | 117 | with open_func(input_file, "rt") as input_file_handle: 118 | chunk = [] 119 | chunk_counter = 0 120 | for count, line in enumerate(input_file_handle, 1): 121 | line = line.strip() 122 | if count % 4 == 1: 123 | read_id = line.split(" ")[0] 124 | elif count % 4 == 2: 125 | sequence = line 126 | elif count % 4 == 0: 127 | chunk.append((read_id, sequence, line)) 128 | 129 | if len(chunk) == chunk_size: 130 | queue.put({'process_id': chunk_counter % num_processes, 'data': chunk}) 131 | chunk = [] 132 | chunk_counter += 1 133 | 134 | if chunk: 135 | queue.put({'process_id': chunk_counter % num_processes, 'data': chunk}) 136 | 137 | for _ in range(num_processes): 138 | queue.put(None) 139 | 140 | for worker in workers: 141 | worker.join() 142 | 143 | def merge_results(): 144 | with open("Giraffe_Results/1_Estimated_quality/header", "a") as ff: 145 | ff.write("ReadID\tAccuracy\tError\tQ_value\tLength\tGC_content\tGroup\n") 146 | ff.close() 147 | system("cat Giraffe_Results/1_Estimated_quality/header \ 148 | Giraffe_Results/1_Estimated_quality/*tmp > \ 149 | Giraffe_Results/1_Estimated_quality/Estimated_information.txt") 150 | system("rm Giraffe_Results/1_Estimated_quality/*tmp Giraffe_Results/1_Estimated_quality/header") 151 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Giraffe 2 | ![PyPI](https://img.shields.io/pypi/v/Giraffe-View?color=green) ![License](https://img.shields.io/pypi/l/nanoCEM?color=orange) 3 | 4 | **Giraffe** is specially designed to provide a comprehensive assessment of the accuracy of long-read sequencing datasets obtained from both the Pacific Biosciences (PacBio) and Oxford Nanopore Technologies (ONT) platforms, offering four distinct functions. 5 | 6 | 7 | 8 | `estimate` Calculation of estimated read accuracy (Q score), length, and GC content. 9 | 10 | `observe` Calculation of observed read accuracy, mismatch proportion, and homopolymer identification (e.g. AAAA). 11 | 12 | `gcbias` Calculation of the relationship between GC content and sequencing depth. 13 | 14 | `modbin` Calculation of the distribution of modification (e.g. 5mC or 6mA methylation) at the regional level. 
15 |
16 |
17 |
18 | # Installation
19 |
20 | ## Installation by [Conda](https://conda.io/projects/conda/en/latest/index.html)
21 |
22 | ```shell
23 | # install in the current environment
24 | conda install -c raymond_liu giraffe_view -y
25 |
26 | # install in a new environment
27 | conda create -n giraffe -c raymond_liu giraffe_view -y
28 | ```
29 |
30 |
31 |
32 | ## Installation by [PyPI](https://pypi.org/)
33 |
34 | Before using this tool, you need to install additional dependencies for read processing: [samtools](https://www.htslib.org/), [minimap2](https://github.com/lh3/minimap2), and [bedtools](https://github.com/arq5x/bedtools2). The following commands install both the software package and its dependencies.
35 |
36 | ```shell
37 | # Tested versions
38 | # samtools 1.17
39 | # minimap2 2.17-r941
40 | # bedtools 2.30.0
41 |
42 | # install in the current environment
43 | conda install -c bioconda -c conda-forge samtools minimap2 bedtools -y
44 |
45 | # install in a new environment
46 | conda create -n giraffe -c bioconda -c conda-forge python==3.9 samtools==1.17 minimap2==2.17 bedtools==2.30.0 -y && conda activate giraffe
47 | ```
48 |
49 | To install this tool, please use the following command.
50 | ```shell
51 | pip install Giraffe-View
52 | ```
53 |
54 |
55 |
56 | # Quick usage
57 |
58 | **Giraffe** can be run with a one-button command or by executing individual functions.
59 |
60 | ## ONE-button pattern
61 |
62 | ```shell
63 | # Run the "estimate", "observe", and "gcbias" functions with FASTQ files
64 | giraffe --read <read table> --ref <reference FASTA> --cpu <number of CPUs>
65 |
66 | # Run the "estimate", "observe", and "gcbias" functions with unaligned SAM/BAM files
67 | giraffe --unaligned <read table> --ref <reference FASTA> --cpu <number of CPUs>
68 |
69 | # Example for input table (sample_ID data_type file_path)
70 | sample_A ONT /home/user/data/S1.fastq
71 | sample_B ONT /home/user/data/S2.fastq
72 | sample_C ONT /home/user/data/S3.fastq
73 | ...
74 | ```
75 |
76 | Here the data_type can be ONT DNA reads (ONT), ONT direct RNA sequencing reads (ONT_RNA), or PacBio DNA reads (Pacbio).
77 |
78 |
79 |
80 | ## Estimate function
81 |
82 | ```shell
83 | # For FASTQ reads
84 | giraffe estimate --read <read table>
85 |
86 | # For unaligned SAM/BAM files
87 | giraffe estimate --unaligned <read table>
88 | ```
89 |
90 |
91 |
92 | ## Observe function
93 |
94 | ```shell
95 | # For FASTQ reads
96 | giraffe observe --read <read table> --ref <reference FASTA>
97 |
98 | # For unaligned SAM/BAM files
99 | giraffe observe --unaligned <read table> --ref <reference FASTA>
100 |
101 | # For aligned SAM/BAM files
102 | giraffe observe --aligned <aligned read table>
103 | ```
104 |
105 | **Note:** If you use aligned SAM/BAM files as input, please discard secondary alignments (**--secondary=no**) and add the MD tag (**--MD**) when mapping.
106 |
107 |
108 |
109 | ## GCbias function
110 |
111 | ```shell
112 | giraffe gcbias --ref <reference FASTA> --aligned <aligned read table>
113 | ```
114 |
115 |
116 |
117 | ## Modbin function
118 |
119 | ```shell
120 | giraffe modbin --methyl <methylation table> --region <target region file>
121 |
122 | # Example for methylation file (Chrom Start End Value):
123 | contig_A 132 133 0.92
124 | contig_A 255 256 0.27
125 | contig_A 954 955 0.52
126 | ...
127 | ```
128 |
129 |
130 |
131 | # Example
132 |
133 | Here, we provide demo datasets for testing **Giraffe**. The following command downloads them and runs the demo.
134 |
135 | ```shell
136 | giraffe_run_demo
137 | ```
138 |
139 | The demo data comprise an *E. coli* set (a 4.2 MB reference, 79 MB of R10.4.1 reads, and 121 MB of R9.4.1 reads) plus two 5mC methylation files from zebrafish blood (23 MB) and kidney (19 KB). The demo takes about 7 minutes and 20 seconds with a maximum memory of 391 MB, and exercises both the one-command pattern and the four individual functions.
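140 |
141 | Once the demo finishes, the main result tables land under `Giraffe_Results/`. The sketch below is assembled from the output paths and column headers in the package source; exact contents depend on the run.
142 |
143 | ```shell
144 | Giraffe_Results/
145 | ├── 1_Estimated_quality/Estimated_information.txt  # ReadID, Accuracy, Error, Q_value, Length, GC_content, Group
146 | ├── 2_Observed_quality/Observed_information.txt    # ID, Ins, Del, Sub, Mat, Iden, Acc, Group
147 | ├── 2_Observed_quality/Homoploymer_summary.txt     # Base, Accuracy, Group
148 | ├── 3_GC_bias/Relationship_normalization.txt       # GC_content, Depth, Number, Group, Normalized_depth
149 | └── 4_Regional_modification/                       # per-region mean modification values
150 | ```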
151 |
152 |
153 |
154 | # Tool showcase
155 |
156 | The one-command pattern generates a summary in [HTML](https://lxd98.github.io/giraffe.github.io) format. If the scale of the X/Y-axis is not reasonable, the `giraffe_plot` script can be used to replot the figures.
157 |
158 | # Documentation
159 |
160 | For more details about the usage of Giraffe and the profiling of its results, please refer to the [documentation](https://giraffe-documentation.readthedocs.io/en/latest).
161 |
162 | # Citation
163 |
164 | Liu, X., Shao, Y., Guo, Z., Ni, Y., Sun, X., Leung, A. Y. H., & Li, R. (2024). Giraffe: A tool for comprehensive processing and visualization of multiple long-read sequencing data. *Computational and Structural Biotechnology Journal, 23,* 3241-3246. https://doi.org/10.1016/j.csbj.2024.08.003
--------------------------------------------------------------------------------
/Giraffe_View/homopolymer.py:
--------------------------------------------------------------------------------
1 | from os import system
2 | import pandas as pd
3 | import pysam
4 | import re
5 | from Giraffe_View.function import *
6 | import multiprocessing
7 |
8 | def homopolymer_summary_1(input_file, sample_ID, chromosome):
9 |     data = {}
10 |     with open(input_file) as ff:
11 |         for line in ff:
12 |             line = line.replace("\n", "")
13 |             line = line.split("\t")
14 |             position = line[0] + "_" + line[1] + "_" + line[2]
15 |
16 |             if position not in data.keys():
17 |                 data[position] = {}
18 |                 data[position]["type"] = str(line[3]) + line[4]
19 |                 data[position]["depth"] = 1
20 |                 data[position]["mat"] = 0
21 |
22 |                 if int(line[3]) == int(line[5]):
23 |                     data[position]["mat"] += 1
24 |
25 |             elif position in data.keys():
26 |                 data[position]["depth"] += 1
27 |                 if int(line[3]) == int(line[5]):
28 |                     data[position]["mat"] += 1
29 |     ff.close()
30 |
31 |     output_1 = f"Giraffe_Results/2_Observed_quality/{sample_ID}_homopolymer_in_reference_{chromosome}.txt"
32 |     with open(output_1, "w") as ff:
33 |         # ff.write("pos\tnum_of_mat\tdepth\ttype\tGroup\n")
34 |         for i in data.keys():
35 |             mes = str(i) + "\t" + str(data[i]["mat"]) + "\t" + str(data[i]["depth"]) + "\t" + data[i]["type"]
36 |             ff.write(mes + "\t" + str(sample_ID) + "\n")
37 |     ff.close()
38 |
39 | def homopolymer_summary_2(sample_ID):
40 |     input_file = "Giraffe_Results/2_Observed_quality/" + str(sample_ID) + ".homopolymer_in_reference.txt"
41 |     output_file = "Giraffe_Results/2_Observed_quality/" + str(sample_ID) + ".homo_tmp"
42 |     data = pd.read_table(input_file, sep="\t")
43 |     valid = data[data["depth"] >= 3].copy()
44 |
45 |     if len(valid) != 0:
46 |         ff = open(output_file, "w")
47 |         valid["rate"] = valid["num_of_mat"] / valid["depth"]
48 |
49 |         def Abase(x):
50 |             if re.search(".*A", x):
51 |                 return(True)
52 |             else:
53 |                 return(False)
54 |
55 |         def Tbase(x):
56 |             if re.search(".*T", x):
57 |                 return(True)
58 |             else:
59 |                 return(False)
60 |
61 |         def Gbase(x):
62 |             if re.search(".*G", x):
63 |                 return(True)
64 |             else:
65 |                 return(False)
66 |
67 |         def Cbase(x):
68 |             if re.search(".*C", x):
69 |                 return(True)
70 |             else:
71 |                 return(False)
72 |
73 |         T_acc = valid[valid["type"].apply(Tbase)]["rate"].mean()
74 |         G_acc = valid[valid["type"].apply(Gbase)]["rate"].mean()
75 |         C_acc = valid[valid["type"].apply(Cbase)]["rate"].mean()
76 |         A_acc =
valid[valid["type"].apply(Abase)]["rate"].mean() 77 | 78 | ff.write(f"A\t{A_acc:.4f}\t{sample_ID}\n") 79 | ff.write(f"T\t{T_acc:.4f}\t{sample_ID}\n") 80 | ff.write(f"C\t{C_acc:.4f}\t{sample_ID}\n") 81 | ff.write(f"G\t{G_acc:.4f}\t{sample_ID}\n") 82 | ff.close() 83 | else: 84 | error_with_color("The read coverage of data was too shallow to conduct the homopolymer analysis!!!") 85 | 86 | def merge_results_observed_homopolymer(): 87 | with open("Giraffe_Results/2_Observed_quality/header", "a") as ff: 88 | ff.write("Base\tAccuracy\tGroup\n") 89 | ff.close() 90 | 91 | system("cat Giraffe_Results/2_Observed_quality/header \ 92 | Giraffe_Results/2_Observed_quality/*.homo_tmp > \ 93 | Giraffe_Results/2_Observed_quality/Homoploymer_summary.txt") 94 | 95 | system("rm Giraffe_Results/2_Observed_quality/*homo_tmp \ 96 | Giraffe_Results/2_Observed_quality/header" 97 | ) 98 | 99 | def homopolymer_from_bam_worker(input_bamfile, sample_ID, chromosome): 100 | bamfile = pysam.AlignmentFile(input_bamfile, "rb") 101 | output = f"Giraffe_Results/2_Observed_quality/{sample_ID}_homopolymer_detail_{chromosome}.txt" 102 | 103 | with open(output, "w") as ff: 104 | for read in bamfile.fetch(chromosome): 105 | read_ID = read.query_name 106 | read_pair = read.get_aligned_pairs(matches_only=False, with_seq=True) 107 | read_cigar = read.cigarstring 108 | read_ref_id = read.reference_name 109 | read_valid_pair = remove_clip_list(read_cigar, read_pair, read_ID) 110 | 111 | homopolymer_ref = "" 112 | homopolymer_read = "" 113 | homopolymer_ref_pos = [] 114 | count = 1 115 | 116 | for base in read_valid_pair: 117 | base_alignment = get_base_alignment(base) 118 | if homopolymer_ref == "": 119 | if base_alignment != "I": 120 | homopolymer_ref = str(base[2]).upper() 121 | homopolymer_read = str(base_alignment) 122 | homopolymer_ref_pos.append(base[1]) 123 | else: 124 | if base[2] is None: 125 | homopolymer_read += str(base_alignment) 126 | else: 127 | base_ref = str(base[2]).upper() 128 | if base_ref == homopolymer_ref[0]: 129 | homopolymer_ref += base_ref 130 | homopolymer_read += str(base_alignment) 131 | homopolymer_ref_pos.append(base[1]) 132 | else: 133 | if len(homopolymer_ref) >= 4: 134 | homopolymer_ref_pos = [ 135 | str(len(homopolymer_ref)) + homopolymer_ref[0], 136 | str(remove_I(homopolymer_read)), 137 | str(read_ref_id), 138 | *homopolymer_ref_pos 139 | ] 140 | stat_info = count_indel_and_snv(homopolymer_ref_pos[1]) 141 | stat_info = {k: stat_info.get(k, 0) for k in ['M', 'D', 'S', 'I']} 142 | 143 | mes = ( 144 | f"{homopolymer_ref_pos[2]}\t{homopolymer_ref_pos[3]}\t{homopolymer_ref_pos[-1]}\t" 145 | f"{homopolymer_ref_pos[0][:-1]}\t{homopolymer_ref_pos[0][-1]}\t" 146 | f"{stat_info['M']}\t{stat_info['D']}\t{stat_info['I']}\t{stat_info['S']}\t" 147 | f"{read_ID}\t{sample_ID}") 148 | 149 | ff.write(mes + "\n") 150 | count += 1 151 | 152 | homopolymer_ref = base_ref 153 | homopolymer_read = str(base_alignment) 154 | homopolymer_ref_pos = [base[1]] 155 | bamfile.close() 156 | ff.close() 157 | homopolymer_summary_1(output, sample_ID, chromosome) 158 | 159 | def run_homopolymer_from_bam(input_bamfile, sample_ID, num_processes=10): 160 | bamfile = pysam.AlignmentFile(input_bamfile, "rb") 161 | chromosomes = bamfile.references 162 | 163 | with multiprocessing.Pool(processes=num_processes) as pool: 164 | jobs = [] 165 | for chromosome in chromosomes: 166 | jobs.append(pool.apply_async(homopolymer_from_bam_worker, (input_bamfile, sample_ID, chromosome))) 167 | 168 | for job in jobs: 169 | job.get() 170 | 
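# A minimal usage sketch (hypothetical file names): the BAM must be sorted and
# indexed, mapped with the MD tag, and Giraffe_Results/2_Observed_quality/ must
# already exist (e.g. created via mkdir_d from Giraffe_View.function):
#
#   from Giraffe_View.homopolymer import run_homopolymer_from_bam
#   run_homopolymer_from_bam("sample_A.bam", "sample_A", num_processes=4)
#
# This writes {sample}_homopolymer_detail_{chrom}.txt and, via
# homopolymer_summary_1, {sample}_homopolymer_in_reference_{chrom}.txt for each
# chromosome under Giraffe_Results/2_Observed_quality/.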
-------------------------------------------------------------------------------- /Giraffe_View/observed_read_accuracy.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import re 3 | from os import system 4 | from Giraffe_View.function import * 5 | import multiprocessing 6 | 7 | def data_process(sample_ID, data_type, data_path, ref, threads=10): 8 | output = "Giraffe_Results/2_Observed_quality/" + str(sample_ID) + ".bam" 9 | if data_type == "ONT": 10 | cmd1 = ["minimap2", "-ax", "map-ont", "-o", "Giraffe_Results/2_Observed_quality/tmp.sam", "--MD", \ 11 | "--secondary=no", "-L", "-t", str(threads), ref, data_path] 12 | 13 | elif data_type == "ONT_RNA": 14 | cmd1 = ["minimap2", "-ax", "splice", "-uf", "-k14", "-o", "Giraffe_Results/2_Observed_quality/tmp.sam", "--MD", \ 15 | "--secondary=no", "-L", "-t", str(threads), ref, data_path] 16 | 17 | elif data_type == "Pacbio": 18 | cmd1 = ["minimap2", "-ax", "map-pb", "-o", "Giraffe_Results/2_Observed_quality/tmp.sam", "--MD", \ 19 | "--secondary=no", "-L", "-t", str(threads), ref, data_path] 20 | 21 | else: 22 | error_with_color("Please check your data type!!! [ONT, Pacbio, ONT_RNA]") 23 | 24 | cmd2 = ["samtools", "view", "-bS", "-F4", "-@", str(threads), "-o", "Giraffe_Results/2_Observed_quality/tmp.bam", "Giraffe_Results/2_Observed_quality/tmp.sam"] 25 | cmd3 = ["samtools", "sort", "-@", str(threads), "-o", output, "Giraffe_Results/2_Observed_quality/tmp.bam"] 26 | cmd4 = ["samtools", "index", "-@", str(threads), output] 27 | cmd5 = ["rm", "-rf", "Giraffe_Results/2_Observed_quality/tmp.bam", "Giraffe_Results/2_Observed_quality/tmp.sam"] 28 | 29 | # Run each command and check the return code 30 | for i, cmd in enumerate([cmd1, cmd2, cmd3, cmd4, cmd5]): 31 | try: 32 | subprocess.run(cmd, check=True) 33 | # print("Command {} succeeded".format(i + 1)) 34 | except subprocess.CalledProcessError as e: 35 | print("Command {} failed with error code {}".format(i + 1, e.returncode)) 36 | print(e.output) 37 | # Raise an exception to indicate that processing failed 38 | raise Exception("Data processing failed") 39 | 40 | def identify_match(cigar): 41 | # Identifies the number of matching bases in a read from its CIGAR string. 42 | cigar_mat = re.findall(r"\d+M", cigar) 43 | base_num_mat = sum(int(i[:-1]) for i in cigar_mat) 44 | return base_num_mat 45 | 46 | def identify_insertion(cigar): 47 | # Identifies the number of inserted bases in a read from its CIGAR string. 48 | cigar_ins = re.findall(r"\d+I", cigar) 49 | base_num_ins = sum(int(i[:-1]) for i in cigar_ins) 50 | return base_num_ins 51 | 52 | def identify_deletion(cigar): 53 | # Identifies the number of deleted bases in a read from its CIGAR string. 54 | cigar_del = re.findall(r"\d+D", cigar) 55 | base_num_del = sum(int(i[:-1]) for i in cigar_del) 56 | return base_num_del 57 | 58 | def identify_substitution(md): 59 | # Identifies the number of substitutions in a read from its MD tag. 
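# e.g. for MD:Z:10A5^AC6 this counts one substitution (the "10A"); the deleted
# bases after "^" are not preceded by digits, so r"\d+[ATCG]" skips them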
60 | return len(re.findall(r"\d+[ATCG]", md)) 61 | 62 | def merge_results_observed_acc(): 63 | with open("Giraffe_Results/2_Observed_quality/header", "a") as ff: 64 | ff.write("ID\tIns\tDel\tSub\tMat\tIden\tAcc\tGroup\n") 65 | ff.close() 66 | 67 | system("cat Giraffe_Results/2_Observed_quality/header \ 68 | Giraffe_Results/2_Observed_quality/*_primary_* \ 69 | Giraffe_Results/2_Observed_quality/*_supplementary.total.txt > \ 70 | Giraffe_Results/2_Observed_quality/Observed_information.txt") 71 | 72 | system("rm Giraffe_Results/2_Observed_quality/*_primary_* \ 73 | Giraffe_Results/2_Observed_quality/header \ 74 | Giraffe_Results/2_Observed_quality/*_supplementary.total.txt") 75 | 76 | def observed_accuracy_worker(bam_file, sample_ID, chromosome): 77 | output_1 = f"Giraffe_Results/2_Observed_quality/{sample_ID}_primary_{chromosome}.txt" 78 | output_2 = f"Giraffe_Results/2_Observed_quality/{sample_ID}_supplementary_{chromosome}.txt" 79 | bamfile= pysam.AlignmentFile(bam_file, 'rb') 80 | 81 | with open(output_1, "w") as pri_f: 82 | with open(output_2, "w") as sup_f: 83 | for read in bamfile.fetch(chromosome): 84 | # filter the unmapped reads 85 | if read.flag == 4: 86 | continue 87 | else: 88 | read_ID = read.query_name 89 | read_cigar = read.cigarstring 90 | read_md = read.get_tag("MD") 91 | 92 | # count the number of matched and mismatched base 93 | Ins = identify_insertion(read_cigar) 94 | Del = identify_deletion(read_cigar) 95 | Sub = identify_substitution(read_md) 96 | Mat = identify_match(read_cigar) - Sub 97 | 98 | # check the presence of supplementary reads 99 | if read.has_tag("SA"): 100 | sup_f.write(f"{read_ID}\t{Ins}\t{Del}\t{Sub}\t{Mat}\t{sample_ID}\n") 101 | else: 102 | # calculate the observed accuracy and identification 103 | total = Ins + Del + Sub + Mat 104 | Acc = Mat / total if total > 0 else 0 105 | Iden = Mat / (Mat + Sub) if (Mat + Sub) > 0 else 0 106 | pri_f.write(f"{read_ID}\t{Ins}\t{Del}\t{Sub}\t{Mat}\t{Iden:.4f}\t{Acc:.4f}\t{sample_ID}\n") 107 | pri_f.close() 108 | sup_f.close() 109 | bamfile.close() 110 | 111 | def run_observed_accuracy(input_bamfile, sample_ID, num_processes=10): 112 | bamfile = pysam.AlignmentFile(input_bamfile, "rb") 113 | chromosomes = bamfile.references 114 | 115 | with multiprocessing.Pool(processes=num_processes) as pool: 116 | jobs = [] 117 | for chromosome in chromosomes: 118 | jobs.append(pool.apply_async(observed_accuracy_worker, (input_bamfile, sample_ID, chromosome))) 119 | 120 | for job in jobs: 121 | job.get() 122 | 123 | def supplementary_read_processing(sample_ID): 124 | total = {} 125 | 126 | with open("Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt", "r") as ff: 127 | for line in ff.readlines(): 128 | line = line.rstrip("\n") 129 | data = line.split("\t") 130 | 131 | read_ID = str(data[0]) 132 | Ins = int(data[1]) 133 | Del = int(data[2]) 134 | Sub = int(data[3]) 135 | Mat = int(data[4]) 136 | sample_ID = str(data[5]) 137 | 138 | if read_ID not in total: 139 | total[read_ID] = {} 140 | total[read_ID]["Ins"] = Ins 141 | total[read_ID]["Del"] = Del 142 | total[read_ID]["Sub"] = Sub 143 | total[read_ID]["Mat"] = Mat 144 | total[read_ID]["sample"] = sample_ID 145 | else: 146 | total[read_ID]["Ins"] += Ins 147 | total[read_ID]["Del"] += Del 148 | total[read_ID]["Sub"] += Sub 149 | total[read_ID]["Mat"] += Mat 150 | ff.close() 151 | 152 | output = f"Giraffe_Results/2_Observed_quality/{sample_ID}_supplementary.total.txt" 153 | with open(output, "w") as ff: 154 | for read_key, read_data in total.items(): 155 | read_ID = 
read_key 156 | Ins = read_data["Ins"] 157 | Del = read_data["Del"] 158 | Sub = read_data["Sub"] 159 | Mat = read_data["Mat"] 160 | sample_ID = read_data["sample"] 161 | 162 | All = Ins + Del + Sub + Mat 163 | Acc = Mat / All if All > 0 else 0 164 | Iden = Mat / (Mat + Sub) if (Mat + Sub) > 0 else 0 165 | ff.write(f"{read_ID}\t{Ins}\t{Del}\t{Sub}\t{Mat}\t{Iden:.4f}\t{Acc:.4f}\t{sample_ID}\n") 166 | ff.close() -------------------------------------------------------------------------------- /Giraffe_View/gc_bias.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os import system 3 | import pandas as pd 4 | from Giraffe_View.function import cmd_shell 5 | import multiprocessing 6 | 7 | def classify_by_chromosome(input_file): 8 | classified_lines = {} 9 | 10 | # Read the input file and classify the lines 11 | with open(input_file, 'r') as file: 12 | for line in file: 13 | # Split the line by tab 14 | fields = line.strip().split('\t') 15 | first_field = fields[0] 16 | 17 | # Add the line to the appropriate list in the dictionary 18 | if first_field not in classified_lines: 19 | classified_lines[first_field] = [] 20 | classified_lines[first_field].append(line) 21 | 22 | # Write the classified lines to separate output files 23 | for key, lines in classified_lines.items(): 24 | output_file = f"Giraffe_Results/3_GC_bias/{key}_gcbias_bin.bed" 25 | with open(output_file, 'w') as file: 26 | file.writelines(lines) 27 | 28 | def get_bin_bed(input_reference, input_binsize): 29 | if not os.path.exists(f"{input_reference}.fai"): 30 | system(f"samtools faidx {input_reference}") 31 | system(f"bedtools makewindows -g {input_reference}.fai -w {input_binsize} > Giraffe_Results/3_GC_bias/bin.bed") 32 | classify_by_chromosome("Giraffe_Results/3_GC_bias/bin.bed") 33 | 34 | def get_bin_GC(args): 35 | input_reference, input_chromosome, path = args 36 | system(f"bedtools nuc -fi {input_reference} -bed {path}/{input_chromosome}_gcbias_bin.bed > {path}/{input_chromosome}_bin_GC.tmp") 37 | 38 | input_file = f"{path}/{input_chromosome}_bin_GC.tmp" 39 | output = f"{path}/{input_chromosome}_bin_GC.txt" 40 | with open(input_file, "r") as ff: 41 | with open(output, "w") as of: 42 | for bin in ff: 43 | if bin[0] != "#": 44 | bin = bin.replace("\n", "") 45 | bin = bin.split() 46 | bin_chrom = bin[0] 47 | bin_start = bin[1] 48 | bin_end = bin[2] 49 | bin_gc = bin[4] 50 | mes = str(bin_chrom) + "\t" + str(bin_start) + "\t" 51 | mes += str(bin_end) + "\t" + str(bin_gc) + "\n" 52 | of.write(mes) 53 | system(f"rm {path}/{input_chromosome}_bin_GC.tmp") 54 | 55 | def manager_GC_content(input_reference, num_cpus): 56 | processes = [] 57 | chromosomes = [] 58 | 59 | with open(f"{input_reference}.fai", "r") as ff: 60 | for l in ff.readlines(): 61 | l = l.replace("\n","").split() 62 | chromosomes.append(l[0]) 63 | ff.close() 64 | 65 | args = [(input_reference, chrom, "Giraffe_Results/3_GC_bias") for chrom in chromosomes] 66 | with multiprocessing.Pool(processes=num_cpus) as pool: 67 | pool.map(get_bin_GC, args) 68 | 69 | path = "Giraffe_Results/3_GC_bias" 70 | system(f"cat {path}/*_bin_GC.txt > {path}/bin_GC.txt") 71 | system(f"rm {path}/*_bin_GC.txt") 72 | 73 | def get_bin_depth(args): 74 | input_sample_ID, input_bam, input_chromosome, path = args 75 | system(f"samtools bedcov {path}/{input_chromosome}_gcbias_bin.bed {input_bam} > {path}/{input_sample_ID}_{input_chromosome}_bin_depth.txt") 76 | 77 | def manager_bin_depth(input_reference,sample_ID, bamfile, num_cpus): 78 | processes 
= [] 79 | chromosomes = [] 80 | 81 | with open(f"{input_reference}.fai", "r") as ff: 82 | for l in ff.readlines(): 83 | l = l.replace("\n","").split() 84 | chromosomes.append(l[0]) 85 | ff.close() 86 | 87 | args = [(sample_ID, bamfile, chrom, "Giraffe_Results/3_GC_bias") for chrom in chromosomes] 88 | with multiprocessing.Pool(processes=num_cpus) as pool: 89 | pool.map(get_bin_depth, args) 90 | 91 | path = "Giraffe_Results/3_GC_bias" 92 | system(f"cat {path}/*_bin_depth.txt > {path}/{sample_ID}.bin_depth.txt") 93 | system(f"rm {path}/*_bin_depth.txt") 94 | 95 | def compute_GC_bias(ref, bamfile, binsize, sample_ID, num_cpu): 96 | path="Giraffe_Results/3_GC_bias" 97 | if os.path.exists(f"{path}/bin.bed") and os.path.exists(f"{path}/bin_GC.txt"): 98 | manager_bin_depth(ref, sample_ID, bamfile, num_cpu) 99 | else: 100 | get_bin_bed(ref, binsize) 101 | manager_GC_content(ref, num_cpu) 102 | manager_bin_depth(ref, sample_ID, bamfile, num_cpu) 103 | 104 | system(f"rm {path}/bin.bed") 105 | system(f"rm {path}/*_bin.bed") 106 | 107 | def merge_GC_content_and_depth(binsize, sample_ID): 108 | data = {} 109 | input_depth = "Giraffe_Results/3_GC_bias/" + str(sample_ID) + ".bin_depth.txt" 110 | with open(input_depth) as f1: 111 | for bins in f1: 112 | bins = bins.replace("\n", "") 113 | bins = bins.split("\t") 114 | if bins[-1] != 0: 115 | KEY = bins[0] + "_" + bins[1] + "_" + bins[2] 116 | data[KEY]= {} 117 | data[KEY]["dp"] = int(bins[3]) / int(binsize) 118 | f1.close() 119 | 120 | with open("Giraffe_Results/3_GC_bias/bin_GC.txt") as f2: 121 | for bins in f2: 122 | bins = bins.replace("\n", "") 123 | bins = bins.split("\t") 124 | KEY = bins[0] + "_" + bins[1] + "_" + bins[2] 125 | if KEY in data.keys(): 126 | data[KEY]["GC"] = float(bins[3]) * 100 127 | else: 128 | continue 129 | f2.close() 130 | 131 | merged_data = {} 132 | merged_data["dp"] = [] 133 | merged_data["GC"] = [] 134 | for i in data.keys(): 135 | tmp_dp = data[i]["dp"] 136 | tmp_gc = data[i]["GC"] 137 | merged_data["dp"].append(tmp_dp) 138 | merged_data["GC"].append(tmp_gc) 139 | merged_data = pd.DataFrame.from_dict(merged_data) 140 | 141 | output_file = "Giraffe_Results/3_GC_bias/" + str(sample_ID) + "_relationship_raw.txt" 142 | ff = open(output_file, "w") 143 | ff.write("GC_content\tDepth\tNumber\tGroup\n") 144 | for i in range(0,101): 145 | tmp = merged_data[(i-0.5 <= merged_data["GC"]) & (merged_data["GC"] < i+0.5)].copy() 146 | if len(tmp) != 0: 147 | ave_dp = tmp["dp"].mean() 148 | else: 149 | ave_dp = 0.0 150 | ff.write(str(i) + "\t" + str(ave_dp) + "\t" + str(len(tmp)) + "\t" + str(sample_ID) + "\n") 151 | ff.close() 152 | 153 | # get the 95% data for downstream normalization 154 | df = pd.read_csv(output_file, sep=r'\s+') 155 | # df = pd.read_csv(output_file, delim_whitespace=True) 156 | max_number = df["Number"].max() 157 | total_number = df["Number"].sum() 158 | porportion = 0.95 159 | tmp = df[df["Number"] == max_number].copy() 160 | 161 | if len(tmp) == 1: 162 | for i in tmp["GC_content"]: 163 | start = i 164 | end = i 165 | 166 | for i in range(1,51): 167 | t1 = df[(start-1 <=df["GC_content"]) & (df["GC_content"] <= end+1)].copy() 168 | if t1["Number"].sum() / total_number >= porportion: 169 | nor_df = t1 170 | break 171 | else: 172 | start -= 1 173 | end += 1 174 | continue 175 | 176 | # normalization 177 | ave_dp = nor_df["Depth"].mean() 178 | nor_df["Normalized_depth"] = nor_df.apply(lambda row: row["Depth"]/ave_dp, axis=1) 179 | output_file_1 = "Giraffe_Results/3_GC_bias/" + str(sample_ID) + "_relationship_tmp.txt" 180 
| nor_df.to_csv(output_file_1, sep="\t", index=False, header=False) 181 | 182 | system("rm Giraffe_Results/3_GC_bias/*bin_depth.txt") 183 | 184 | def merge_files(): 185 | with open("header", "w") as ff: 186 | ff.write("GC_content\tDepth\tNumber\tGroup\tNormalized_depth\n") 187 | ff.close() 188 | system("cat header Giraffe_Results/3_GC_bias/*_relationship_tmp.txt \ 189 | > Giraffe_Results/3_GC_bias/Relationship_normalization.txt") 190 | system("rm header Giraffe_Results/3_GC_bias/*_relationship_tmp.txt") 191 | 192 | def get_bin_number_within_GC_content(): 193 | df = pd.read_table("Giraffe_Results/3_GC_bias/bin_GC.txt", header=None) 194 | with open("Giraffe_Results/3_GC_bias/Bin_distribution.txt", "w") as ff: 195 | ff.write("GC_content\tNumber\n") 196 | df[3] = df[3] * 100 197 | for i in range(0,101): 198 | tmp = df[(i-0.5 <= df[3]) & (df[3] < i+0.5)].copy() 199 | ff.write(str(i) + "\t" + str(len(tmp)) + "\n") 200 | ff.close() 201 | system("rm Giraffe_Results/3_GC_bias/bin_GC.txt") 202 | -------------------------------------------------------------------------------- /Giraffe_View/giraffe_plot: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | import pandas as pd 6 | import numpy as np 7 | import sys 8 | from Giraffe_View.function import * 9 | 10 | def plot_estimate_acc(input_file, x_min, x_max, x_gap): 11 | df = process_in_chunks(input_file) 12 | df = pd.DataFrame(df) 13 | df["Accuracy"] = df["Accuracy"] * 100 14 | 15 | plt.figure(figsize=(8, 6)) 16 | ax = sns.kdeplot(data=df, x="Accuracy", hue="Group", fill=True, 17 | alpha=0.6, palette="Set2", common_norm=False) 18 | sns.move_legend(ax, "upper left") 19 | ax 20 | acc_scale = [x_min, x_max] 21 | acc_breaks = [i for i in range(x_min, x_max+1, x_gap)] 22 | 23 | plt.xlabel("Estimated read accuracy (%)") 24 | plt.ylabel("Probability Density Function") 25 | plt.xlim(acc_scale) 26 | plt.xticks(acc_breaks) 27 | plt.tight_layout() 28 | plt.savefig("New_read_estimate_accuracy.svg", format="svg", dpi=300) 29 | plt.close() 30 | 31 | def plot_observe_acc(input_file, x_min, x_max, x_gap): 32 | df = process_in_chunks(input_file) 33 | df = pd.DataFrame(df) 34 | df["Acc"] = df["Acc"] * 100 35 | 36 | plt.figure(figsize=(8, 6)) 37 | ax = sns.kdeplot(data=df, x="Acc", hue="Group", fill=True, 38 | common_norm=False, alpha=0.6, palette="Set2") 39 | sns.move_legend(ax, "upper left") 40 | ax 41 | 42 | acc_scale = [x_min, x_max] 43 | acc_breaks = [i for i in range(x_min, x_max+1, x_gap)] 44 | 45 | plt.xlabel("Observed read accuracy (%)") 46 | plt.ylabel("Probability Density Function") 47 | plt.xlim(acc_scale) 48 | plt.xticks(acc_breaks) 49 | plt.tight_layout() 50 | plt.savefig("New_read_observe_accuracy.svg", format="svg", dpi=300) 51 | plt.close() 52 | 53 | def plot_observe_mismatch(input_file, y_max, y_gap): 54 | df = process_in_chunks(input_file) 55 | df = pd.DataFrame(df) 56 | 57 | df["p_ins"] = 100 * df["Ins"] / (df["Ins"] + df["Del"] + df["Sub"] + df["Mat"]) 58 | df["p_del"] = 100 * df["Del"] / (df["Ins"] + df["Del"] + df["Sub"] + df["Mat"]) 59 | df["p_sub"] = 100 * df["Sub"] / (df["Ins"] + df["Del"] + df["Sub"] + df["Mat"]) 60 | 61 | df = df.melt(id_vars=["Group"], value_vars=["p_ins", "p_del", "p_sub"], 62 | var_name="Mismatch Type", value_name="Mismatch Proportion") 63 | 64 | plt.figure(figsize=(8, 6)) 65 | sns.boxplot(data=df, x="Mismatch Type", y="Mismatch Proportion", hue="Group", 66 | showfliers=False, 
width=0.5, saturation=0.6, palette="Set2") 67 | 68 | mis_scale = [0, y_max] 69 | mis_breaks = [i for i in range(0, y_max+1, y_gap)] 70 | 71 | plt.ylabel("Mismatch proportion (%)") 72 | plt.ylim(mis_scale) 73 | plt.yticks(mis_breaks) 74 | plt.xticks(ticks=[0, 1, 2], labels=["Insertion", "Deletion", "Substitution"]) 75 | plt.xlabel("") 76 | 77 | plt.legend(title='Group') 78 | plt.tight_layout() 79 | plt.savefig("New_observed_mismatch_proportion.svg", format="svg", dpi=300) 80 | plt.close() 81 | 82 | def plot_observe_homo(input_file, y_min, y_max, y_gap): 83 | df = process_in_chunks(input_file) 84 | df = pd.DataFrame(df) 85 | df["Accuracy"] = df["Accuracy"] * 100 86 | 87 | plt.figure(figsize=(8, 6)) 88 | sns.lineplot(data=df, x='Base', y='Accuracy', hue='Group', linewidth=1.5, 89 | markers=True, dashes=False, palette="Set2", alpha=0.6, legend=False) 90 | sns.scatterplot(data=df, x='Base', y='Accuracy', hue='Group', 91 | palette="Set2", s=50, edgecolor="black") 92 | 93 | homo_scale = [y_min, y_max] 94 | homo_breaks = [i for i in range(y_min, y_max+1, y_gap)] 95 | 96 | plt.ylim(homo_scale) 97 | plt.yticks(homo_breaks) 98 | plt.ylabel('Accuracy of homopolymer identification (%)') 99 | plt.xlabel('Base') 100 | 101 | plt.legend(title='Group') 102 | plt.tight_layout() 103 | plt.savefig("New_homoploymer_summary.svg", format="svg", dpi=300, bbox_inches='tight') 104 | plt.close() 105 | 106 | def plot_GC_bias(input_file, x_min, x_max, x_gap): 107 | df = process_in_chunks(input_file) 108 | df = pd.DataFrame(df) 109 | 110 | plt.figure(figsize=(8, 5)) 111 | sns.lineplot(data=df, x="GC_content", y="Normalized_depth", hue="Group", 112 | palette="Set2", linewidth=1.5, alpha=0.6) 113 | sns.scatterplot(data=df, x="GC_content", y="Normalized_depth", hue="Group", 114 | palette="Set2", edgecolor="black", s=20, legend=False) 115 | plt.axhline(1, color="grey", linestyle="dotted") 116 | plt.ylim(0, 2) 117 | # depth_breaks = [i for i in range(0, 2.1, 0.2)] 118 | depth_breaks = [i * 0.2 for i in range(11)] 119 | 120 | plt.yticks(depth_breaks) 121 | 122 | plt.xlim([x_min, x_max]) 123 | plt.xticks([i for i in range(x_min, x_max+1, x_gap)]) 124 | plt.xlabel("GC content (%)") 125 | plt.ylabel("Normalized depth") 126 | plt.grid(False) 127 | plt.tight_layout() 128 | plt.savefig("New_relationship_normalization.svg", format="svg", dpi=300) 129 | plt.close() 130 | 131 | if __name__ == '__main__': 132 | version = "0.2.3" 133 | parser = argparse.ArgumentParser(description="", 134 | usage="\n # Users can replot the figures by rescaling the regions along the x-axis or y-axis.\n" 135 | "\n %(prog)s estimate_acc --input Estimated_information.txt --x_min 50 --x_max 100 --x_gap 10 # For estimated read accuracy!" 136 | "\n %(prog)s observe_acc --input Observed_information.txt --x_min 50 --x_max 100 --x_gap 10 # For observed read accuracy!" 137 | "\n %(prog)s observe_mismatch --input Observed_information.txt --y_max 5 --y_gap 1 # For mismatch proportion!" 138 | "\n %(prog)s observe_homo --input Homoploymer_summary.txt --y_min 90 --y_max 100 --y_gap 2 # For homopolymer accuracy!" 139 | "\n %(prog)s gcbias --input Relationship_normalization.txt --x_min 20 --x_max 50 --x_gap 2 # For relationship between normalized depth and GC content!" 
140 |         "\n\nversion: " + str(version) + "\n"
141 |         "For more details, please refer to the documentation: https://giraffe-documentation.readthedocs.io/en/latest.")
142 |
143 |     subparsers = parser.add_subparsers(dest='function', help=None, description=None, prog="giraffe_plot", metavar=" subcommand and function")
144 |
145 |     plot_estimate_acc_parser = subparsers.add_parser('estimate_acc', help='Replot estimated read accuracy')
146 |     plot_estimate_acc_parser.add_argument("--input", type=str, metavar="", required=True, help="the result generated from giraffe (Estimated_information.txt)")
147 |     plot_estimate_acc_parser.add_argument("--x_min", type=int, metavar="", required=True, help="the smallest cutoff for estimated read accuracy")
148 |     plot_estimate_acc_parser.add_argument("--x_max", type=int, metavar="", required=True, help="the largest cutoff for estimated read accuracy")
149 |     plot_estimate_acc_parser.add_argument("--x_gap", type=int, metavar="", required=True, help="the interval between two values on the x-axis")
150 |
151 |     plot_observe_acc_parser = subparsers.add_parser('observe_acc', help='Replot observed read accuracy')
152 |     plot_observe_acc_parser.add_argument("--input", type=str, metavar="", required=True, help="the result generated from giraffe (Observed_information.txt)")
153 |     plot_observe_acc_parser.add_argument("--x_min", type=int, metavar="", required=True, help="the smallest cutoff for observed read accuracy")
154 |     plot_observe_acc_parser.add_argument("--x_max", type=int, metavar="", required=True, help="the largest cutoff for observed read accuracy")
155 |     plot_observe_acc_parser.add_argument("--x_gap", type=int, metavar="", required=True, help="the interval between two values on the x-axis")
156 |
157 |     plot_observe_mismatch_parser = subparsers.add_parser('observe_mismatch', help='Replot observed mismatch proportion')
158 |     plot_observe_mismatch_parser.add_argument("--input", type=str, metavar="", required=True, help="the result generated from giraffe (Observed_information.txt)")
159 |     plot_observe_mismatch_parser.add_argument("--y_max", type=int, metavar="", required=True, help="the largest cutoff for mismatch proportion")
160 |     plot_observe_mismatch_parser.add_argument("--y_gap", type=int, metavar="", required=True, help="the interval between two values on the y-axis")
161 |
162 |     plot_observe_homo_parser = subparsers.add_parser('observe_homo', help='Replot homopolymer identification accuracy')
163 |     plot_observe_homo_parser.add_argument("--input", type=str, metavar="", required=True, help="the result generated from giraffe (Homoploymer_summary.txt)")
164 |     plot_observe_homo_parser.add_argument("--y_min", type=int, metavar="", required=True, help="the smallest cutoff for homopolymer accuracy")
165 |     plot_observe_homo_parser.add_argument("--y_max", type=int, metavar="", required=True, help="the largest cutoff for homopolymer accuracy")
166 |     plot_observe_homo_parser.add_argument("--y_gap", type=int, metavar="", required=True, help="the interval between two values on the y-axis")
167 |
168 |     plot_GC_bias_parser = subparsers.add_parser('gcbias', help='Replot the GC bias (normalized depth vs. GC content) relationship')
169 |     plot_GC_bias_parser.add_argument("--input", type=str, metavar="", required=True, help="the result generated from giraffe (Relationship_normalization.txt)")
170 |     plot_GC_bias_parser.add_argument("--x_min", type=int, metavar="", required=True, help="the smallest cutoff for GC content")
171 |     plot_GC_bias_parser.add_argument("--x_max", type=int, metavar="", required=True, help="the largest cutoff for GC content")
172 | plot_GC_bias_parser.add_argument("--x_gap", type=int, metavar="", required=True, help="the interval between two values on an x-axis") 173 | 174 | args = parser.parse_args() 175 | 176 | if len(sys.argv) == 1: 177 | parser.print_help(sys.stderr) 178 | sys.exit(1) 179 | 180 | if args.function == "estimate_acc": 181 | if len(sys.argv) == 2: 182 | plot_estimate_acc_parser.print_help(sys.stderr) 183 | sys.exit(1) 184 | else: 185 | plot_estimate_acc(args.input, args.x_min, args.x_max, args.x_gap) 186 | 187 | elif args.function == "observe_acc": 188 | if len(sys.argv) == 2: 189 | plot_observe_acc_parser.print_help(sys.stderr) 190 | sys.exit(1) 191 | else: 192 | plot_observe_acc(args.input, args.x_min, args.x_max, args.x_gap) 193 | 194 | elif args.function == "observe_mismatch": 195 | if len(sys.argv) == 2: 196 | plot_observe_mismatch_parser.print_help(sys.stderr) 197 | sys.exit(1) 198 | else: 199 | plot_observe_mismatch(args.input, args.y_max, args.y_gap) 200 | 201 | elif args.function == "observe_homo": 202 | if len(sys.argv) == 2: 203 | plot_observe_homo_parser.print_help(sys.stderr) 204 | sys.exit(1) 205 | else: 206 | plot_observe_homo(args.input, args.y_min, args.y_max, args.y_gap) 207 | 208 | elif args.function == "gcbias": 209 | if len(sys.argv) == 2: 210 | plot_GC_bias_parser.print_help(sys.stderr) 211 | sys.exit(1) 212 | else: 213 | plot_GC_bias(args.input, args.x_min, args.x_max, args.x_gap) 214 | 215 | 216 | -------------------------------------------------------------------------------- /Giraffe_View/plot.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | import warnings 5 | import os 6 | from Giraffe_View.function import process_in_chunks 7 | 8 | warnings.filterwarnings('ignore') 9 | 10 | def plot_estimate(format='svg', path='Giraffe_Results/1_Estimated_quality'): 11 | df = process_in_chunks("Giraffe_Results/1_Estimated_quality/Estimated_information.txt") 12 | df = pd.DataFrame(df) 13 | df["Accuracy"] = df["Accuracy"] * 100 14 | df["GC_content"] = df["GC_content"] * 100 15 | df["Length"] = df["Length"] / 1000 16 | 17 | min_1 = df["Accuracy"].min() 18 | if min_1 >= 95: 19 | acc_scale = [95, 100] 20 | acc_breaks = [i for i in range(95, 101, 1)] 21 | elif min_1 >= 90: 22 | acc_scale = [90, 100] 23 | acc_breaks = [i for i in range(90, 101, 1)] 24 | elif min_1 >= 80: 25 | acc_scale = [80, 100] 26 | acc_breaks = [i for i in range(80, 101, 2)] 27 | elif min_1 >= 70: 28 | acc_scale = [70, 100] 29 | acc_breaks = [i for i in range(70, 101, 5)] 30 | elif min_1 >= 60: 31 | acc_scale = [60, 100] 32 | acc_breaks = [i for i in range(60, 101, 5)] 33 | elif min_1 >= 50: 34 | acc_scale = [50, 100] 35 | acc_breaks = [i for i in range(50, 101, 5)] 36 | elif min_1 >= 40: 37 | acc_scale = [40, 100] 38 | acc_breaks = [i for i in range(40, 101, 10)] 39 | elif min_1 >= 30: 40 | acc_scale = [30, 100] 41 | acc_breaks = [i for i in range(30, 101, 10)] 42 | elif min_1 >= 20: 43 | acc_scale = [20, 100] 44 | acc_breaks = [i for i in range(20, 101, 10)] 45 | elif min_1 >= 10: 46 | acc_scale = [10, 100] 47 | acc_breaks = [i for i in range(10, 101, 10)] 48 | else: 49 | acc_scale = [0, 100] 50 | acc_breaks = [i for i in range(0, 101, 5)] 51 | 52 | # plot 53 | plt.figure(figsize=(8, 6)) 54 | ax = sns.kdeplot(data=df, x="Accuracy", hue="Group", fill=True, 55 | alpha=0.6, palette = "Set2", common_norm=False) 56 | sns.move_legend(ax, "upper left") 57 | ax 58 | plt.xlabel("Estimated read accuracy 
(%)") 59 | plt.ylabel("Probability Density Function") 60 | plt.xlim(acc_scale) 61 | plt.xticks(acc_breaks) 62 | 63 | plt.tight_layout() 64 | plt.savefig(f"{path}/1_Read_estimate_accuracy.{format}", format=format, dpi=300) 65 | plt.close() 66 | 67 | plt.figure(figsize=(8, 4)) 68 | sns.boxplot(data=df, y="Group", x="GC_content", hue="Group", 69 | palette="Set2", dodge=False, showfliers=False) 70 | 71 | plt.xlabel("GC content (%)") 72 | plt.xlim(0, 101) 73 | plt.xticks(range(0, 101, 10)) 74 | 75 | plt.yticks() 76 | plt.legend([],[], frameon=False) # Hide the legend 77 | plt.tight_layout() 78 | plt.savefig(f"{path}/2_Read_GC_content.{format}", format=format, dpi=300) 79 | plt.close() 80 | 81 | ave = df["Length"].mean() 82 | if ave <= 1: 83 | len_scale = [0, 5] 84 | len_breaks = [i for i in range(0, 6, 1)] 85 | elif ave <= 5: 86 | len_scale = [0, 10] 87 | len_breaks = [i for i in range(0, 11, 1)] 88 | elif ave <= 10: 89 | len_scale = [0, 20] 90 | len_breaks = [i for i in range(0, 21, 2)] 91 | elif ave <= 20: 92 | len_scale = [0, 30] 93 | len_breaks = [i for i in range(0, 31, 5)] 94 | elif ave <= 30: 95 | len_scale = [0, 50] 96 | len_breaks = [i for i in range(0, 51, 5)] 97 | else: 98 | len_scale = [0, 100] 99 | len_breaks = [i for i in range(0, 101, 10)] 100 | 101 | plt.figure(figsize=(8, 6)) 102 | sns.kdeplot(data=df, x="Length", hue="Group", 103 | fill=True, common_norm=False, alpha=0.6, 104 | palette="Set2") 105 | 106 | plt.xlabel("Read length (Kb)") 107 | plt.ylabel("Probability Density Function") 108 | plt.xlim(len_scale) 109 | plt.xticks(len_breaks) 110 | plt.tight_layout() 111 | plt.savefig(f"{path}/3_Read_length.{format}", format=format, dpi=300) 112 | plt.close() 113 | 114 | def plot_observe_acc(format='svg', path='Giraffe_Results/2_Observed_quality'): 115 | # color_set = "Set2" 116 | df = process_in_chunks("Giraffe_Results/2_Observed_quality/Observed_information.txt") 117 | df = pd.DataFrame(df) 118 | 119 | df["Acc"] = df["Acc"] * 100 120 | min_1 = df["Acc"].min() 121 | 122 | min_1 = df["Acc"].min() 123 | 124 | if min_1 >= 95: 125 | acc_scale = [95, 100] 126 | acc_breaks = [i for i in range(95, 101, 1)] 127 | elif min_1 >= 90: 128 | acc_scale = [90, 100] 129 | acc_breaks = [i for i in range(90, 101, 1)] 130 | elif min_1 >= 80: 131 | acc_scale = [80, 100] 132 | acc_breaks = [i for i in range(80, 101, 2)] 133 | elif min_1 >= 70: 134 | acc_scale = [70, 100] 135 | acc_breaks = [i for i in range(70, 101, 5)] 136 | elif min_1 >= 60: 137 | acc_scale = [60, 100] 138 | acc_breaks = [i for i in range(60, 101, 5)] 139 | elif min_1 >= 50: 140 | acc_scale = [50, 100] 141 | acc_breaks = [i for i in range(50, 101, 5)] 142 | elif min_1 >= 40: 143 | acc_scale = [40, 100] 144 | acc_breaks = [i for i in range(40, 101, 10)] 145 | elif min_1 >= 30: 146 | acc_scale = [30, 100] 147 | acc_breaks = [i for i in range(30, 101, 10)] 148 | elif min_1 >= 20: 149 | acc_scale = [20, 100] 150 | acc_breaks = [i for i in range(20, 101, 10)] 151 | elif min_1 >= 10: 152 | acc_scale = [10, 100] 153 | acc_breaks = [i for i in range(10, 101, 10)] 154 | else: 155 | acc_scale = [0, 100] 156 | acc_breaks = [i for i in range(0, 101, 5)] 157 | 158 | # Plot density plot for observed read accuracy 159 | # sns.set(style='darkgrid') 160 | plt.figure(figsize=(8, 6)) 161 | 162 | ax = sns.kdeplot(data=df, x="Acc", hue="Group", fill=True, 163 | common_norm=False, alpha=0.6, palette = "Set2") 164 | sns.move_legend(ax, "upper left") 165 | ax 166 | 167 | plt.xlabel("Observed read accuracy (%)") 168 | plt.ylabel("Probability Density 
Function") 169 | plt.xlim(acc_scale) 170 | plt.xticks(acc_breaks) 171 | 172 | plt.tight_layout() 173 | plt.savefig(f"{path}/1_Observed_read_accuracy.{format}", format=format, dpi=300) 174 | plt.close() 175 | 176 | # Compute mismatch proportions 177 | df["p_ins"] = 100 * df["Ins"] / (df["Ins"] + df["Del"] + df["Sub"] + df["Mat"]) 178 | df["p_del"] = 100 * df["Del"] / (df["Ins"] + df["Del"] + df["Sub"] + df["Mat"]) 179 | df["p_sub"] = 100 * df["Sub"] / (df["Ins"] + df["Del"] + df["Sub"] + df["Mat"]) 180 | 181 | # Melt the dataframe for mismatch proportions 182 | df1 = pd.melt(df, id_vars=['Group'], value_vars=['p_ins', 'p_del', 'p_sub']) 183 | max_1 = df["p_ins"].max() 184 | max_2 = df["p_del"].max() 185 | max_3 = df["p_sub"].max() 186 | max_4 = max(max_1, max_2, max_3) 187 | 188 | if max_4 <= 5: 189 | mis_scale = [0, 5] 190 | mis_breaks = [i for i in range(0, 6, 1)] 191 | elif max_4 <= 10: 192 | mis_scale = [0, 10] 193 | mis_breaks = [i for i in range(0, 11, 1)] 194 | elif max_4 <= 20: 195 | mis_scale = [0, 20] 196 | mis_breaks = [i for i in range(0, 21, 2)] 197 | elif max_4 <= 30: 198 | mis_scale = [0, 30] 199 | mis_breaks = [i for i in range(0, 31, 5)] 200 | elif max_4 <= 40: 201 | mis_scale = [0, 40] 202 | mis_breaks = [i for i in range(0, 41, 5)] 203 | elif max_4 <= 50: 204 | mis_scale = [0, 50] 205 | mis_breaks = [i for i in range(0, 51, 5)] 206 | elif max_4 <= 60: 207 | mis_scale = [0, 60] 208 | mis_breaks = [i for i in range(0, 61, 10)] 209 | elif max_4 <= 70: 210 | mis_scale = [0, 70] 211 | mis_breaks = [i for i in range(0, 71, 10)] 212 | elif max_4 <= 80: 213 | mis_scale = [0, 80] 214 | mis_breaks = [i for i in range(0, 81, 10)] 215 | elif max_4 <= 90: 216 | mis_scale = [0, 90] 217 | mis_breaks = [i for i in range(0, 91, 10)] 218 | else: 219 | mis_scale = [0, 100] 220 | mis_breaks = [i for i in range(0, 101, 10)] 221 | 222 | # Plot boxplot for mismatch proportions 223 | plt.figure(figsize=(8, 6)) 224 | sns.boxplot(data=df1, x="variable", y="value", hue="Group", 225 | showfliers=False, width=0.5, gap=0.1, saturation=0.6, 226 | palette = "Set2", linecolor="black") 227 | 228 | plt.ylabel("Mismatch proportion (%)") 229 | plt.ylim(mis_scale) 230 | plt.yticks(mis_breaks) 231 | plt.xticks(ticks=[0, 1, 2], labels=["Deletion", "Insertion", "Substitution"]) 232 | plt.xlabel("") 233 | 234 | # Ensure the legend is created correctly 235 | handles, labels = plt.gca().get_legend_handles_labels() 236 | if not handles: 237 | for group in df["Group"].unique(): 238 | handle = plt.Line2D([0], [0], color=sns.color_palette("pastel")[0], lw=2) 239 | handles.append(handle) 240 | labels.append(group) 241 | 242 | plt.legend(handles=handles, labels=labels, title='Group') 243 | plt.tight_layout() 244 | plt.savefig(f"{path}/2_Observed_mismatch_proportion.{format}", format=format, dpi=300) 245 | plt.close() 246 | 247 | def plot_observe_homo(format='svg', path='Giraffe_Results/2_Observed_quality'): 248 | # Load data 249 | df = process_in_chunks("Giraffe_Results/2_Observed_quality/Homoploymer_summary.txt") 250 | df["Accuracy"] = df["Accuracy"] * 100 251 | 252 | # Determine scale and breaks for y-axis based on minimum accuracy 253 | min_acc = df["Accuracy"].min() 254 | max_acc = df["Accuracy"].max() 255 | 256 | min_value = int((min_acc//10) * 10) 257 | max_value = int((max_acc//10) * 10 + 10) 258 | homo_scale = [min_value, max_value] 259 | 260 | dif = max_value - min_value 261 | 262 | if dif <= 10: 263 | homo_breaks = [i for i in range(min_value, max_value+1, 1)] 264 | elif dif <= 20: 265 | homo_breaks = [i for i 
in range(min_value, max_value+1, 2)] 266 | elif dif <= 30: 267 | homo_breaks = [i for i in range(min_value, max_value+1, 5)] 268 | elif dif <= 40: 269 | homo_breaks = [i for i in range(min_value, max_value+1, 5)] 270 | elif dif <= 50: 271 | homo_breaks = [i for i in range(min_value, max_value+1, 5)] 272 | elif dif <= 60: 273 | homo_breaks = [i for i in range(min_value, max_value+1, 5)] 274 | elif dif <= 70: 275 | homo_breaks = [i for i in range(min_value, max_value+1, 5)] 276 | elif dif <= 80: 277 | homo_breaks = [i for i in range(min_value, max_value+1, 5)] 278 | elif dif <= 90: 279 | homo_breaks = [i for i in range(min_value, max_value+1, 5)] 280 | else: 281 | homo_breaks = [i for i in range(min_value, max_value+1, 10)] 282 | 283 | # Create the plot 284 | plt.figure(figsize=(8, 6)) 285 | sns.lineplot(data=df, x='Base', y='Accuracy', hue='Group', linewidth=1.5, 286 | markers=True, dashes=False, palette = "Set2", alpha=0.6, legend=False) 287 | sns.scatterplot(data=df, x='Base',y='Accuracy', hue='Group', 288 | palette = "Set2", s=50, edgecolor="black") 289 | 290 | # Customize plot 291 | plt.ylim(homo_scale) 292 | plt.yticks(homo_breaks) 293 | plt.ylabel('Accuracy of homopolymer identification (%)') 294 | plt.xlabel('Base') 295 | 296 | # Save plot 297 | output_path = f"{path}/3_Homoploymer_summary.{format}" 298 | plt.savefig(output_path, format=format, dpi=300, bbox_inches='tight') 299 | plt.close() 300 | 301 | def plot_GC_bias(input_binsize, format='svg', path='Giraffe_Results/3_GC_bias'): 302 | # sns.set_style("whitegrid") 303 | # Load the first dataset 304 | df = pd.read_csv("Giraffe_Results/3_GC_bias/Bin_distribution.txt", sep="\t") 305 | accuracy_scale = [0, 100] 306 | accuracy_breaks = [i for i in range(0, 101, 10)] 307 | 308 | 309 | # Plot distribution length 310 | plt.figure(figsize=(8, 5)) 311 | 312 | sns.lineplot(data=df, x="GC_content", y="Number", color="#96D1E8", linewidth=1.5, alpha=0.3) 313 | sns.scatterplot(data=df, x="GC_content", y="Number", color="#96D1E8", edgecolor="black", s=15) 314 | 315 | plt.xlim(accuracy_scale) 316 | plt.xticks(accuracy_breaks) 317 | plt.xlabel("GC content (%)") 318 | plt.ylabel(f"Number of bins (bin size = {input_binsize} bp)") 319 | plt.grid(False) 320 | plt.savefig(f"{path}/1_Bin_distribution.{format}", dpi=300) 321 | plt.close() 322 | 323 | # Load the second dataset 324 | df1 = pd.read_csv("Giraffe_Results/3_GC_bias/Relationship_normalization.txt", sep="\t") 325 | 326 | # Plot GC bias 327 | plt.figure(figsize=(8, 5)) 328 | sns.lineplot(data=df1, x="GC_content", y="Normalized_depth", hue="Group", 329 | palette = "Set2", linewidth=1.5, alpha=.6) 330 | sns.scatterplot(data=df1, x="GC_content", y="Normalized_depth", hue="Group", 331 | palette = "Set2", edgecolor="black", s=20, legend=False) 332 | 333 | plt.axhline(1, color="grey", linestyle="dotted") 334 | plt.ylim(0, 2) 335 | 336 | depth_breaks = (0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0) 337 | plt.yticks(depth_breaks) 338 | 339 | plt.xlabel("GC content (%)") 340 | plt.ylabel("Normalized depth") 341 | plt.grid(False) 342 | plt.savefig(f"{path}/2_Relationship_normalization.{format}", dpi=300) 343 | plt.close() 344 | 345 | def plot_modi_bin(format="svg", path="Giraffe_Results/4_Regional_modification"): 346 | 347 | df = pd.read_csv("Giraffe_Results/4_Regional_modification/Regional_methylation_proportion.txt", sep="\t", names=["ID", "Value", "Group"]) 348 | df["Value"] = df["Value"] * 100 349 | 350 | # sns.set(style="whitegrid") 351 | plt.figure(figsize=(20, 5)) 352 | 353 | # Violin plot 354 
| sns.violinplot(data=df, y="Group", x="Value", 355 | width=0.5, alpha=0.7, inner="box", split=True, 356 | inner_kws=dict(box_width=8, whis_width=2, color=".8")) 357 | 358 | methyl_scale = (0,100) 359 | methyl_breaks = [i for i in range(0, 101, 10)] 360 | 361 | plt.xlim(methyl_scale) 362 | plt.xticks(methyl_breaks) 363 | plt.xlabel("Methylation proportion (%)") 364 | plt.ylabel("") 365 | plt.yticks(fontsize=12, color='black') 366 | plt.legend([],[], frameon=False) # Remove legend 367 | 368 | plt.savefig(f"{path}/1_Regional_modification.{format}", dpi=300) 369 | plt.close() 370 | -------------------------------------------------------------------------------- /Giraffe_View/summary_html.py: -------------------------------------------------------------------------------- 1 | from Giraffe_View.function import * 2 | import pandas as pd 3 | import re 4 | import os 5 | import pathlib 6 | 7 | def generate_giraffe_data(input_table): 8 | # Initialize the dictionary structure 9 | giraffe_data = { 10 | "samples": [], 11 | "metrics": { 12 | "Estimate (average)": { 13 | "Estimated read accuracy": [], 14 | "Read length": [], 15 | "Read GC content": [] 16 | }, 17 | "Observed (average)": { 18 | "Observed read accuracy": [], 19 | "Observed read identification": [], 20 | "Substitution proportion": [], 21 | "Insertion proportion": [], 22 | "Deletion proportion": [], 23 | "Homopolymer accuracy (A)": [], 24 | "Homopolymer accuracy (T)": [], 25 | "Homopolymer accuracy (G)": [], 26 | "Homopolymer accuracy (C)": [] 27 | } 28 | } 29 | } 30 | 31 | # Read sample IDs from the input table 32 | with open(input_table, "r") as ff: 33 | giraffe_data["samples"] = [line.strip().split()[0] for line in ff] 34 | 35 | # Process the Estimated results 36 | estimated_file = "Giraffe_Results/1_Estimated_quality/Estimated_information.txt" 37 | df_estimated = process_in_chunks(estimated_file) 38 | df_estimated = pd.DataFrame(df_estimated) 39 | 40 | for sample in giraffe_data["samples"]: 41 | sample_df = df_estimated[df_estimated['Group'] == sample] 42 | giraffe_data["metrics"]["Estimate (average)"]["Estimated read accuracy"].append(round(100*sample_df["Accuracy"].mean(), 2)) 43 | giraffe_data["metrics"]["Estimate (average)"]["Read length"].append(round(sample_df["Length"].mean(), 2)) 44 | giraffe_data["metrics"]["Estimate (average)"]["Read GC content"].append(round(100*sample_df["GC_content"].mean(), 2)) 45 | 46 | # Process the Observed results 47 | observed_file = "Giraffe_Results/2_Observed_quality/Observed_information.txt" 48 | df_observed = process_in_chunks(observed_file) 49 | df_observed = pd.DataFrame(df_observed) 50 | 51 | # Calculate proportions 52 | df_observed["p_ins"] = 100 * df_observed["Ins"] / (df_observed["Ins"] + df_observed["Del"] + df_observed["Sub"] + df_observed["Mat"]) 53 | df_observed["p_del"] = 100 * df_observed["Del"] / (df_observed["Ins"] + df_observed["Del"] + df_observed["Sub"] + df_observed["Mat"]) 54 | df_observed["p_sub"] = 100 * df_observed["Sub"] / (df_observed["Ins"] + df_observed["Del"] + df_observed["Sub"] + df_observed["Mat"]) 55 | 56 | for sample in giraffe_data["samples"]: 57 | sample_df = df_observed[df_observed['Group'] == sample] 58 | giraffe_data["metrics"]["Observed (average)"]["Observed read accuracy"].append(round(100*sample_df["Acc"].mean(), 2)) 59 | giraffe_data["metrics"]["Observed (average)"]["Observed read identification"].append(round(100*sample_df["Iden"].mean(), 2)) 60 | giraffe_data["metrics"]["Observed (average)"]["Substitution proportion"].append(round(sample_df["p_sub"].mean(),
2)) 61 | giraffe_data["metrics"]["Observed (average)"]["Deletion proportion"].append(round(sample_df["p_del"].mean(), 2)) 62 | giraffe_data["metrics"]["Observed (average)"]["Insertion proportion"].append(round(sample_df["p_ins"].mean(), 2)) 63 | 64 | # Process the Homopolymer results 65 | homopolymer_file = "Giraffe_Results/2_Observed_quality/Homoploymer_summary.txt" 66 | df_homopolymer = process_in_chunks(homopolymer_file) 67 | df_homopolymer = pd.DataFrame(df_homopolymer) 68 | 69 | for sample in giraffe_data["samples"]: 70 | sample_df = df_homopolymer[df_homopolymer['Group'] == sample] 71 | for base in ["A", "T", "G", "C"]: 72 | accuracy = sample_df[sample_df['Base'] == base]["Accuracy"].mean() if not sample_df[sample_df['Base'] == base].empty else float('nan') 73 | giraffe_data["metrics"]["Observed (average)"][f"Homopolymer accuracy ({base})"].append(round(100*accuracy, 2) if not pd.isna(accuracy) else float('nan')) 74 | 75 | return giraffe_data 76 | 77 | def generate_giraffe_html(giraffe_data, summary_figures, output_file): 78 | with open(output_file, 'w') as f: 79 | f.write("\n\n\n") 80 | f.write("\n") 94 | f.write("\n\n") 95 | f.write("
\n

Giraffe Report

\n
\n") 96 | 97 | # Navigation Index 98 | f.write("\n") 148 | 149 | 150 | # Main content area 151 | f.write("
\n") 152 | 153 | # Create table headers 154 | headers = "Metric" 155 | for sample in giraffe_data['samples']: 156 | headers += f"{sample}" 157 | headers += "\n" 158 | 159 | # Generate table rows 160 | rows = "" 161 | for group, metrics in giraffe_data['metrics'].items(): 162 | # Add group header 163 | rows += f"{group}\n" 164 | for metric, values in metrics.items(): 165 | rows += f"{metric}" 166 | for value in values: 167 | rows += f"{value:.2f}" 168 | rows += "\n" 169 | 170 | # Combine headers and rows into a table 171 | f.write("
\n

Statistics

\n") 172 | f.write("
\n") # Center the table 173 | f.write("\n") 174 | f.write(headers) 175 | f.write(rows) 176 | f.write("
\n") 177 | f.write("
\n") # End of centering div 178 | f.write("
\n") 179 | 180 | # Summary Section with Figures 181 | # f.write("
\n

Figures

\n") 182 | for category, figures in summary_figures.items(): 183 | f.write(f"

{category}

\n") 184 | for figure in figures: 185 | if figure == "Summary_html/1_Read_estimate_accuracy.png": 186 | figure_title = "Estimated accuracy" 187 | f.write(f"
\n") 188 | f.write(f"{figure_title}\n") 189 | f.write(f"

Note: If the scale of accuracy is not suitable, please use the giraffe_plot function to replot.

\n") 190 | f.write(f"

giraffe_plot estimate_acc --input Estimated_information.txt --x_min 95 --x_max 100 --x_gap 1

\n") 191 | f.write(f"
\n") 192 | 193 | elif figure == "Summary_html/2_Read_GC_content.png": 194 | figure_title = "Read GC content" 195 | f.write(f"
\n") 196 | f.write(f"{figure_title}\n") 197 | # f.write(f"

This is a description!

\n") 198 | f.write(f"
\n") 199 | 200 | elif figure == "Summary_html/3_Read_length.png": 201 | figure_title = "Read length" 202 | f.write(f"
\n") 203 | f.write(f"{figure_title}\n") 204 | # f.write(f"

This is a description!

\n") 205 | f.write(f"
\n") 206 | 207 | elif figure == "Summary_html/1_Observed_read_accuracy.png": 208 | figure_title = "Observed accuracy" 209 | f.write(f"
\n") 210 | f.write(f"{figure_title}\n") 211 | f.write(f"

Note: If the scale of accuracy is not suitable, please use the giraffe_plot function to replot.

\n") 212 | f.write(f"

giraffe_plot observe_acc --input Observed_information.txt --x_min 95 --x_max 100 --x_gap 1

\n") 213 | f.write(f"
\n") 214 | 215 | elif figure == "Summary_html/2_Observed_mismatch_proportion.png": 216 | figure_title = "Mismatch proportion" 217 | f.write(f"
\n") 218 | f.write(f"{figure_title}\n") 219 | f.write(f"

Note: If the scale of proportion is not suitable, please use the giraffe_plot function to replot.

\n") 220 | f.write(f"

giraffe_plot observe_mismatch --input Observed_information.txt --y_max 5 --y_gap 1

\n") 221 | f.write(f"
\n") 222 | 223 | elif figure == "Summary_html/3_Homoploymer_summary.png": 224 | figure_title = "Homopolymer identification" 225 | f.write(f"
\n") 226 | f.write(f"{figure_title}\n") 227 | f.write(f"

Note: If the scale of accuracy is not suitable, please use the giraffe_plot function to replot.

\n") 228 | f.write(f"

giraffe_plot observe_homo --input Homoploymer_summary.txt --y_min 90 --y_max 100 --y_gap 2

\n") 229 | f.write(f"
\n") 230 | 231 | elif figure == "Summary_html/1_Bin_distribution.png": 232 | figure_title = "Bin distribution" 233 | f.write(f"
\n") 234 | f.write(f"{figure_title}\n") 235 | # f.write(f"

This is a description!

\n") 236 | f.write(f"
\n") 237 | 238 | elif figure == "Summary_html/2_Relationship_normalization.png": 239 | figure_title = "Relationship (depth and GC conetent)" 240 | f.write(f"
\n") 241 | f.write(f"{figure_title}\n") 242 | f.write(f"

Note: If the scale of GC content is not suitable, please use the renormalization_sequencing_bias script for renormalization and giraffe_plot for replotting.

\n") 243 | f.write(f"

renormalization_sequencing_bias -i S1_distribution.txt -l 30 -r 60 -o S1.txt

\n") 244 | f.write(f"

giraffe_plot gcbias --input new_gcbias.txt --x_min 30 --x_max 60 --x_gap 2\n") 245 | f.write(f"

\n") 246 | 247 | else: 248 | continue 249 | 250 | 251 | # f.write(f"
\n") 252 | # # f.write(f"

{figure_title}

\n") 253 | # # f.write(f"

Description for {figure_title}.

\n") 254 | # f.write(f"{figure_title}\n") 255 | # f.write(f"
\n") 256 | 257 | # for figure in figures: 258 | # if figure == "Summary_html/1_Read_estimate_accuracy.png": 259 | # figure_title = "Estimated accuracy" 260 | # f.write(f"
\n") 261 | # f.write(f"{figure_title}\n") 262 | # f.write(f"

If the scale of accuracy is not suitable, please use giraffe_plot to replot.

\n") 263 | # f.write(f"

giraffe_plot estimate_acc --input Estimated_information.txt --x_min 50 --x_max 100 --x_gap 10

\n") 264 | # f.write(f"
\n") 265 | 266 | # elif figure == "Summary_html/2_Read_GC_content.png": 267 | # figure_title = "Read GC content" 268 | # f.write(f"
  • {figure_title}
  • \n") 269 | 270 | # elif figure == "Summary_html/3_Read_length.png": 271 | # figure_title = "Read length" 272 | # f.write(f"
  • {figure_title}
  • \n") 273 | 274 | # elif figure == "Summary_html/1_Observed_read_accuracy.png": 275 | # figure_title = "Observed accuracy" 276 | # f.write(f"
  • {figure_title}
  • \n") 277 | 278 | # elif figure == "Summary_html/2_Observed_mismatch_proportion.png": 279 | # figure_title = "Mismatch proportion" 280 | # f.write(f"
  • {figure_title}
  • \n") 281 | 282 | # elif figure == "Summary_html/3_Homoploymer_summary.png": 283 | # figure_title = "Homopolymer identification" 284 | # f.write(f"
  • {figure_title}
  • \n") 285 | 286 | 287 | # elif figure == "Summary_html/1_Bin_distribution.png": 288 | # figure_title = "Bin distribution" 289 | # f.write(f"
  • {figure_title}
  • \n") 290 | 291 | # elif figure == "Summary_html/2_Relationship_normalization.png": 292 | # figure_title = "Relationship (depth and GC conetent)" 293 | # f.write(f"
  • {figure_title}
  • \n") 294 | 295 | # else: 296 | # continue 297 | 298 | # for category, figures in summary_figures.items(): 299 | # f.write(f"

    {category}

    \n") 300 | # for figure in figures: 301 | # figure_title = os.path.splitext(os.path.basename(figure))[0].replace('_', ' ').title() 302 | # f.write(f"
    \n") 303 | # f.write(f"

    {figure_title}

    \n") 304 | # # f.write(f"

    Description for {figure_title}.

    \n") 305 | # f.write(f"{figure_title}\n") 306 | # f.write(f"
    \n") 307 | 308 | f.write("
    \n") 309 | f.write("
    \n") 310 | f.write("\n\n") 311 | 312 | def summarize_giraffe_results(input_data): 313 | path = "Summary_html" 314 | giraffe_data = generate_giraffe_data(input_data) 315 | 316 | summary_figures = { 317 | "Estimate": [ 318 | f"{path}/1_Read_estimate_accuracy.png", 319 | f"{path}/2_Read_GC_content.png", 320 | f"{path}/3_Read_length.png" 321 | ], 322 | "Observe": [ 323 | f"{path}/1_Observed_read_accuracy.png", 324 | f"{path}/2_Observed_mismatch_proportion.png", 325 | f"{path}/3_Homoploymer_summary.png" 326 | ], 327 | "GC bias": [ 328 | f"{path}/1_Bin_distribution.png", 329 | f"{path}/2_Relationship_normalization.png" 330 | ]} 331 | 332 | generate_giraffe_html(giraffe_data, summary_figures, "Giraffe_Results/giraffe_report.html") 333 | -------------------------------------------------------------------------------- /Giraffe_View/giraffe: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import datetime 3 | import sys 4 | import argparse 5 | import pathlib 6 | from os import system 7 | from Giraffe_View.function import * 8 | from Giraffe_View.homopolymer import * 9 | from Giraffe_View.observed_read_accuracy import * 10 | from Giraffe_View.gc_bias import * 11 | from Giraffe_View.estimated_read_accuracy import * 12 | from Giraffe_View.regional_modification import * 13 | from Giraffe_View.plot import * 14 | from Giraffe_View.summary_html import * 15 | 16 | working_path = pathlib.Path().resolve() 17 | 18 | def estimated(args): 19 | mkdir_d("1_Estimated_quality") 20 | if args.read: 21 | input_dataset = loading_dataset(args.read) 22 | for data in input_dataset.keys(): 23 | now = datetime.datetime.now() 24 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start estimated read accuracy analysis!") 25 | # if args.less_memory: 26 | calculate_estimated_accuracy_slow(data, input_dataset[data]["path"], args.cpu) 27 | # else: 28 | # calculate_estimated_accuracy(data, input_dataset[data]["path"], args.cpu) 29 | 30 | elif args.unaligned: 31 | input_dataset = loading_dataset(args.unaligned) 32 | for data in input_dataset.keys(): 33 | now = datetime.datetime.now() 34 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start estimated read accuracy analysis!") 35 | bam2fastq(input_dataset[data]["path"], args.cpu) 36 | 37 | system("bash bam2fq.sh") 38 | # if args.less_memory: 39 | calculate_estimated_accuracy_slow(data, "giraffe_tmp.fastq", args.cpu) 40 | # else: 41 | # calculate_estimated_accuracy(data, "giraffe_tmp.fastq", args.cpu) 42 | system("rm bam2fq.sh giraffe_tmp.fastq") 43 | 44 | merge_results() 45 | 46 | if args.plot: 47 | now = datetime.datetime.now() 48 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start plotting!") 49 | plot_estimate() 50 | 51 | now = datetime.datetime.now() 52 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis finished!") 53 | else: 54 | now = datetime.datetime.now() 55 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis finished!") 56 | 57 | mes = "The results are available at " + str(working_path) + "/Giraffe_Results/1_Estimated_quality!" 
58 | now = datetime.datetime.now() 59 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(mes)) 60 | 61 | def observed(args): 62 | now = datetime.datetime.now() 63 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start data processing!") 64 | mkdir_d("2_Observed_quality") 65 | 66 | if args.read: 67 | if not args.ref: 68 | error_with_color("Please provide a reference!") 69 | 70 | input_dataset = loading_dataset(args.read) 71 | for data in input_dataset.keys(): 72 | now = datetime.datetime.now() 73 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start read mapping!") 74 | data_process(data, input_dataset[data]["type"], input_dataset[data]["path"], args.ref, args.cpu) 75 | bamfile = "Giraffe_Results/2_Observed_quality/" + str(data) + ".bam" 76 | 77 | now = datetime.datetime.now() 78 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start observed read accuracy analysis!") 79 | run_observed_accuracy(bamfile, data, args.cpu) 80 | 81 | temp_out = "Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt" 82 | with open("merge_supplementary.sh", "w") as ff: 83 | mes = "cat Giraffe_Results/2_Observed_quality/*_supplementary_*.txt > " + str(temp_out) 84 | ff.write(mes + "\n") 85 | ff.close() 86 | 87 | system("bash merge_supplementary.sh") 88 | supplementary_read_processing(data) 89 | 90 | system("rm merge_supplementary.sh Giraffe_Results/2_Observed_quality/*_supplementary_*.txt") 91 | system("rm Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt") 92 | 93 | now = datetime.datetime.now() 94 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start homopolymer analysis!") 95 | run_homopolymer_from_bam(bamfile, data, args.cpu) 96 | 97 | of = open("header", "w") 98 | of.write("pos\tnum_of_mat\tdepth\ttype\tGroup\n") 99 | of.close() 100 | 101 | output ="Giraffe_Results/2_Observed_quality/" + str(data) + ".homopolymer_in_reference.txt" 102 | ff = open("merge_homopolymer.sh", "w") 103 | ff.write("cat header Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt > " + str(output)) 104 | ff.close() 105 | 106 | system("bash merge_homopolymer.sh") 107 | system("rm header") 108 | system("rm merge_homopolymer.sh ") 109 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt") 110 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_detail_*.txt ") 111 | 112 | now = datetime.datetime.now() 113 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start summarizing the homopolymer results!") 114 | homopolymer_summary_2(data) 115 | 116 | elif args.aligned: 117 | input_dataset = loading_dataset(args.aligned) 118 | for data in input_dataset.keys(): 119 | bamfile = input_dataset[data]["path"] 120 | 121 | now = datetime.datetime.now() 122 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start observed read accuracy analysis!") 123 | 124 | if not os.path.exists(bamfile+".bai"): 125 | system("samtools index -@ " + str(args.cpu) + " " + bamfile) 126 | 127 | run_observed_accuracy(bamfile, data, args.cpu) 128 | 129 | temp_out = "Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt" 130 | with open("merge_supplementary.sh", "w") as ff: 131 | mes = "cat Giraffe_Results/2_Observed_quality/*_supplementary_*.txt > " + str(temp_out) 132 | ff.write(mes + "\n") 133 | ff.close() 134 | 135 | system("bash merge_supplementary.sh") 136 | supplementary_read_processing(data)
137 | 138 | system("rm merge_supplementary.sh Giraffe_Results/2_Observed_quality/*_supplementary_*.txt") 139 | system("rm Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt") 140 | 141 | now = datetime.datetime.now() 142 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start homopolymer analysis!") 143 | run_homopolymer_from_bam(bamfile, data, args.cpu) 144 | 145 | of = open("header", "w") 146 | of.write("pos\tnum_of_mat\tdepth\ttype\tGroup\n") 147 | of.close() 148 | 149 | output ="Giraffe_Results/2_Observed_quality/" + str(data) + ".homopolymer_in_reference.txt" 150 | ff = open("merge_homopolymer.sh", "w") 151 | ff.write("cat header Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt > " + str(output)) 152 | ff.close() 153 | 154 | system("bash merge_homopolymer.sh") 155 | system("rm header") 156 | system("rm merge_homopolymer.sh ") 157 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt") 158 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_detail_*.txt ") 159 | 160 | now = datetime.datetime.now() 161 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start summarizing the homopolymer results!") 162 | homopolymer_summary_2(data) 163 | 164 | elif args.unaligned: 165 | if not args.ref: 166 | error_with_color("Please provide a reference!") 167 | input_dataset = loading_dataset(args.unaligned) 168 | for data in input_dataset.keys(): 169 | now = datetime.datetime.now() 170 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start read mapping!") 171 | bam2fastq(input_dataset[data]["path"], args.cpu) 172 | system("bash bam2fq.sh") 173 | data_process(data, input_dataset[data]["type"], "giraffe_tmp.fastq", args.ref, args.cpu) 174 | system("rm bam2fq.sh giraffe_tmp.fastq") 175 | bamfile = "Giraffe_Results/2_Observed_quality/" + str(data) + ".bam" 176 | 177 | now = datetime.datetime.now() 178 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start observed read accuracy analysis!") 179 | run_observed_accuracy(bamfile, data, args.cpu) 180 | 181 | temp_out = "Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt" 182 | with open("merge_supplementary.sh", "w") as ff: 183 | mes = "cat Giraffe_Results/2_Observed_quality/*_supplementary_*.txt > " + str(temp_out) 184 | ff.write(mes + "\n") 185 | ff.close() 186 | 187 | system("bash merge_supplementary.sh") 188 | supplementary_read_processing(data) 189 | 190 | system("rm merge_supplementary.sh Giraffe_Results/2_Observed_quality/*_supplementary_*.txt") 191 | system("rm Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt") 192 | 193 | now = datetime.datetime.now() 194 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start homopolymer analysis!") 195 | run_homopolymer_from_bam(bamfile, data, args.cpu) 196 | 197 | of = open("header", "w") 198 | of.write("pos\tnum_of_mat\tdepth\ttype\tGroup\n") 199 | of.close() 200 | 201 | output ="Giraffe_Results/2_Observed_quality/" + str(data) + ".homopolymer_in_reference.txt" 202 | ff = open("merge_homopolymer.sh", "w") 203 | ff.write("cat header Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt > " + str(output)) 204 | ff.close() 205 | 206 | system("bash merge_homopolymer.sh") 207 | system("rm header") 208 | system("rm merge_homopolymer.sh ")
209 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt") 210 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_detail_*.txt ") 211 | 212 | now = datetime.datetime.now() 213 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start summarizing the homopolymer results!") 214 | homopolymer_summary_2(data) 215 | 216 | now = datetime.datetime.now() 217 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start merging the observed quality results!") 218 | merge_results_observed_acc() 219 | merge_results_observed_homopolymer() 220 | 221 | if args.plot: 222 | now = datetime.datetime.now() 223 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start plotting!") 224 | plot_observe_acc() 225 | plot_observe_homo() 226 | else: 227 | pass 228 | 229 | mes = "The results are available at " + str(working_path) + "/Giraffe_Results/2_Observed_quality!" 230 | now = datetime.datetime.now() 231 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(mes)) 232 | 233 | def GC_bias(args): 234 | mkdir_d("3_GC_bias") 235 | input_dataset = loading_dataset(args.aligned) 236 | for data in input_dataset.keys(): 237 | now = datetime.datetime.now() 238 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start GC bias analysis!") 239 | compute_GC_bias(args.ref, input_dataset[data]["path"], args.binsize, data, args.cpu) 240 | merge_GC_content_and_depth(args.binsize, data) 241 | 242 | merge_files() 243 | get_bin_number_within_GC_content() 244 | 245 | if args.plot: 246 | now = datetime.datetime.now() 247 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start plotting!") 248 | plot_GC_bias(input_binsize=str(args.binsize)) 249 | now = datetime.datetime.now() 250 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis finished!") 251 | else: 252 | now = datetime.datetime.now() 253 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis finished!") 254 | 255 | mes = "The results are available at " + str(working_path) + "/Giraffe_Results/3_GC_bias!" 256 | now = datetime.datetime.now() 257 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(mes)) 258 |
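# Note: the "write a .sh file, cat the partial results, rm the temp files"
# sequence above recurs in every branch of observed() and total(). A
# pure-Python helper along these lines could replace it; this is an
# illustrative sketch only and is not called anywhere in this script:
def merge_partial_files(pattern, output, header=None):
    import glob, os, shutil
    # Concatenate every file matching `pattern` into `output`, optionally
    # writing a header line first, then delete the merged parts.
    parts = sorted(glob.glob(pattern))
    with open(output, "w") as out:
        if header:
            out.write(header)
        for part in parts:
            with open(part) as fh:
                shutil.copyfileobj(fh, out)
            os.remove(part)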
259 | def methylation(args): 260 | now = datetime.datetime.now() 261 | mkdir_d("4_Regional_modification") 262 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis start!") 263 | 264 | 265 | input_dataset = loading_dataset(args.methyl) 266 | for data in input_dataset.keys(): 267 | now = datetime.datetime.now() 268 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + f"{data}" + ": Start regional modification analysis!") 269 | run_regional_methylation(input_dataset[data]["path"], args.region, data, args.cpu) 270 | 271 | now = datetime.datetime.now() 272 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Summarizing results!") 273 | system("cat Giraffe_Results/4_Regional_modification/Temp_methy_* > Giraffe_Results/4_Regional_modification/Regional_methylation_proportion.txt") 274 | system("rm Giraffe_Results/4_Regional_modification/Temp_methy_*") 275 | 276 | if args.plot: 277 | now = datetime.datetime.now() 278 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start plotting!") 279 | plot_modi_bin() 280 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis finished!") 281 | else: 282 | now = datetime.datetime.now() 283 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis finished!") 284 | 285 | now = datetime.datetime.now() 286 | mes = "[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] The results are available at " + str(working_path) + "/Giraffe_Results/4_Regional_modification!" 287 | print_with_color(str(mes)) 288 |
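# For reference, the --region file consumed by methylation() above is
# tab-separated with columns Chromosome, Start, End, Region_name (per the
# --region help text in the argument definitions); the values here are
# illustrative only:
#   contig_A    0       1000    promoter_1
#   contig_A    5000    6500    promoter_2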
Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt") 332 | 333 | now = datetime.datetime.now() 334 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start homopolymer analysis!") 335 | run_homopolymer_from_bam(bamfile, data, args.cpu) 336 | 337 | of = open("header", "w") 338 | of.write("pos\tnum_of_mat\tdepth\ttype\tGroup\n") 339 | of.close() 340 | 341 | output ="Giraffe_Results/2_Observed_quality/" + str(data) + ".homopolymer_in_reference.txt" 342 | ff = open("merge_homopolymer.sh", "w") 343 | ff.write("cat header Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt > " + str(output)) 344 | ff.close() 345 | 346 | system("bash merge_homopolymer.sh") 347 | system("rm header") 348 | system("rm merge_homopolymer.sh ") 349 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt") 350 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_detail_*.txt ") 351 | 352 | now = datetime.datetime.now() 353 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start summarize the homopolymer results!") 354 | homopolymer_summary_2(data) 355 | 356 | # gc bias 357 | now = datetime.datetime.now() 358 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start GC bias analysis!") 359 | compute_GC_bias(args.ref, bamfile, args.binsize, data, args.cpu) 360 | merge_GC_content_and_depth(args.binsize, data) 361 | 362 | elif args.unaligned: 363 | input_dataset = loading_dataset(args.unaligned) 364 | data_table = args.unaligned 365 | for data in input_dataset.keys(): 366 | 367 | # estimate 368 | now = datetime.datetime.now() 369 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start estimated read accuracy analysis!") 370 | bam2fastq(input_dataset[data]["path"], args.cpu) 371 | system("bash bam2fq.sh") 372 | # if args.less_memory: 373 | calculate_estimated_accuracy_slow(data, "giraffe_tmp.fastq", args.cpu) 374 | # else: 375 | # calculate_estimated_accuracy(data, "giraffe_tmp.fastq", args.cpu) 376 | 377 | # observe 378 | now = datetime.datetime.now() 379 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start read mapping!") 380 | data_process(data, input_dataset[data]["type"], "giraffe_tmp.fastq", args.ref, args.cpu) 381 | system("rm bam2fq.sh giraffe_tmp.fastq") 382 | bamfile = "Giraffe_Results/2_Observed_quality/" + str(data) + ".bam" 383 | 384 | now = datetime.datetime.now() 385 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start observed read accuracy analysis!") 386 | run_observed_accuracy(bamfile, data, args.cpu) 387 | 388 | temp_out = "Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt" 389 | with open("merge_supplementary.sh", "w") as ff: 390 | mes = "cat Giraffe_Results/2_Observed_quality/*_supplementary_*.txt > " + str(temp_out) 391 | ff.write(mes + "\n") 392 | ff.close() 393 | 394 | system("bash merge_supplementary.sh") 395 | supplementary_read_processing(data) 396 | 397 | system("rm merge_supplementary.sh Giraffe_Results/2_Observed_quality/*_supplementary_*.txt") 398 | system("rm Giraffe_Results/2_Observed_quality/giraffe_supplementary.temp.txt") 399 | 400 | now = datetime.datetime.now() 401 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start homopolymer analysis!") 402 | run_homopolymer_from_bam(bamfile, data, args.cpu) 403 | 404 | of = open("header", "w") 405 | 
of.write("pos\tnum_of_mat\tdepth\ttype\tGroup\n") 406 | of.close() 407 | 408 | output ="Giraffe_Results/2_Observed_quality/" + str(data) + ".homopolymer_in_reference.txt" 409 | ff = open("merge_homopolymer.sh", "w") 410 | ff.write("cat header Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt > " + str(output)) 411 | ff.close() 412 | 413 | system("bash merge_homopolymer.sh") 414 | system("rm header") 415 | system("rm merge_homopolymer.sh ") 416 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_in_reference_*.txt") 417 | system("rm Giraffe_Results/2_Observed_quality/*_homopolymer_detail_*.txt ") 418 | 419 | now = datetime.datetime.now() 420 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start summarize the homopolymer results!") 421 | homopolymer_summary_2(data) 422 | 423 | # gc bias 424 | now = datetime.datetime.now() 425 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(data) + ": Start GC bias analysis!") 426 | compute_GC_bias(args.ref, bamfile, args.binsize, data, args.cpu) 427 | merge_GC_content_and_depth(args.binsize, data) 428 | 429 | # merge results 430 | now = datetime.datetime.now() 431 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start merge the results!") 432 | # estimate 433 | merge_results() 434 | # observe 435 | merge_results_observed_acc() 436 | merge_results_observed_homopolymer() 437 | # gc_bias 438 | merge_files() 439 | get_bin_number_within_GC_content() 440 | 441 | 442 | # plot 443 | now = datetime.datetime.now() 444 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start plotting!") 445 | 446 | plot_estimate() 447 | plot_estimate("png","Giraffe_Results/Summary_html") 448 | plot_observe_acc() 449 | plot_observe_acc("png","Giraffe_Results/Summary_html") 450 | plot_observe_homo() 451 | plot_observe_homo("png","Giraffe_Results/Summary_html") 452 | plot_GC_bias(input_binsize=str(args.binsize)) 453 | plot_GC_bias(input_binsize=str(args.binsize), format="png", path="Giraffe_Results/Summary_html") 454 | 455 | now = datetime.datetime.now() 456 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Start summarizing!") 457 | summarize_giraffe_results(data_table) 458 | 459 | now = datetime.datetime.now() 460 | mes = "The results are available at " + str(working_path) + "/Giraffe_Results!" 461 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] Analysis finished!") 462 | print_with_color("[" + now.strftime("%Y-%m-%d %H:%M:%S") + "] " + str(mes)) 463 | 464 | if __name__ == '__main__': 465 | 466 | version = "0.2.3" 467 | 468 | parser = argparse.ArgumentParser(description="", 469 | usage="\n %(prog)s [subcommands] [options] # Users can execute subcommands as needed to perform specific tasks." 470 | "\n %(prog)s --read --ref --cpu # Running function of estimate, observe, and gcbias with FASTQ reads." 471 | "\n %(prog)s --read --ref --cpu # Running function of estimate, observe, and gcbias with unaligned SAM/BAM reads." 472 | "\n\nexample for table (sample_ID data_type file_path):\n" 473 | " sample_A ONT /home/user/data/S1.fastq\n" 474 | " sample_B ONT /home/user/data/S2.fastq\n" 475 | " sample_C ONT /home/user/data/S3.fastq\n" 476 | " ..." 
477 | "\n\nnote:\n" 478 | " version: " + str(version) + "\n" 479 | " data_type: ONT, ONT_RNA, or Pacbio\n" 480 | " For more details, please refer to the documentation: https://giraffe-documentation.readthedocs.io/en/latest.") 481 | 482 | parser.add_argument("--read", type=str, metavar="", required=False, help="table of FASTQ read files") 483 | parser.add_argument("--unaligned", type=str, metavar="", required=False, help="table of the unaligned SAM/BAM files") 484 | parser.add_argument("--ref", type=str, metavar="", required=False, help="reference file") 485 | parser.add_argument("--cpu", type=int, metavar="", required=False, help="number of processes or threads (recommend to set this equal to the number of chromosomes, default:10)", default=10) 486 | parser.add_argument("--binsize", type=int, metavar="", required=False, help="reference will be split into bins of the specified size (default:1000)", default=1000) 487 | # parser.add_argument("--plot", required=False, help="results visualization", action='store_true') 488 | # parser.add_argument("--less_memory", required=False, help="using less memory but takes more time to complete the estimated analysis.", action='store_true') 489 | 490 | 491 | 492 | # Define subparsers 493 | subparsers = parser.add_subparsers(dest='function', help=None, description=None, prog="giraffe", metavar=" subcommand and function") 494 | 495 | estimated_parser = subparsers.add_parser('estimate', help='Estimated accuracy, length, and GC content.', 496 | usage='\n %(prog)s --read # For the FASTQ reads.\n' 497 | ' %(prog)s --unaligned # For the unaligned SAM/BAM files.' 498 | "\n\nexample for table (sample_ID data_type file_path):\n" 499 | " sample_A ONT /home/user/data/S1.fastq\n" 500 | " sample_B ONT /home/user/data/S2.fastq\n" 501 | " sample_C ONT /home/user/data/S3.fastq\n" 502 | " ..." 503 | "\n\nnote:\n" 504 | " version: " + str(version) + "\n" 505 | " data_type: ONT, ONT_RNA, or Pacbio\n" 506 | " For more details, please refer to the documentation: https://giraffe-documentation.readthedocs.io/en/latest.") 507 | 508 | estimated_parser.add_argument("--read", type=str, metavar="", required=False, help="table of FASTQ read files") 509 | estimated_parser.add_argument("--unaligned", type=str, metavar="", required=False, help="table of the unaligned SAM/BAM files") 510 | estimated_parser.add_argument("--cpu", type=int, metavar="", required=False, help="number of processes or threads (default:10)", default=10) 511 | estimated_parser.add_argument("--plot", required=False, help="results visualization", action='store_true') 512 | # estimated_parser.add_argument("--less_memory", required=False, help="using less memory but takes more time to complete the task", action='store_true') 513 | 514 | observed_parser = subparsers.add_parser('observe', help='Observed accuracy, mismatch proportion, and homopolymer identification.', 515 | usage="\n %(prog)s --aligned \t\t\t\t# For aligned SAM/BAM files. Please remove the secondary alignment (--secondary=no) and add MD tag (--MD) during mapping!\n" 516 | " %(prog)s --read --ref \t\t\t# For FASTQ reads.\n" 517 | " %(prog)s --unaligned --ref \t# For unaligned SAM/BAM files." 
518 | "\n\nexample for table (sample_ID data_type file_path):\n" 519 | " sample_A ONT /home/user/data/S1.fastq\n" 520 | " sample_B ONT /home/user/data/S2.fastq\n" 521 | " sample_C ONT /home/user/data/S3.fastq\n" 522 | "\n\nnote:\n" 523 | " version: " + str(version) + "\n" 524 | " data_type: ONT, ONT_RNA, or Pacbio\n" 525 | " For more details, please refer to the documentation: https://giraffe-documentation.readthedocs.io/en/latest.") 526 | 527 | observed_parser.add_argument("--read", type=str, metavar="", required=False, help="table of the FASTQ read files") 528 | observed_parser.add_argument("--aligned", type=str, metavar="", required=False, help="table of the aligned SAM/BAM files") 529 | observed_parser.add_argument("--unaligned", type=str, metavar="", required=False, help="table of the unaligned SAM/BAM files") 530 | observed_parser.add_argument("--ref", type=str, metavar="", required=False, help="reference file") 531 | observed_parser.add_argument("--cpu", type=int, metavar="", required=False, help="number of processes or threads (recommend to set this equal to the number of chromosomes, default:10)", default=10) 532 | observed_parser.add_argument("--plot", required=False, help="results visualization", action='store_true') 533 | 534 | GC_bias_parser = subparsers.add_parser('gcbias', help='Relationship between GC content and sequencing depth.', 535 | usage="\n %(prog)s --ref --aligned --binsize 5000 --cpu 24\n\n" 536 | "example for table (sample_ID data_type file_path):\n" 537 | " sample_A ONT /home/user/data/S1.sort.bam\n" 538 | " sample_B ONT /home/user/data/S2.sort.bam\n" 539 | " sample_C ONT /home/user/data/S3.sort.bam\n" 540 | " ..." 541 | "\n\nnote:\n" 542 | " version: " + str(version) + "\n" 543 | " data_type: ONT, ONT_RNA, or Pacbio\n" 544 | " For more details, please refer to the documentation: https://giraffe-documentation.readthedocs.io/en/latest.") 545 | GC_bias_parser.add_argument("--ref", type=str, metavar="", required=True, help="reference file") 546 | GC_bias_parser.add_argument("--aligned", type=str, metavar="", required=True, help="table of sorted SAM/BAM files") 547 | GC_bias_parser.add_argument("--binsize", type=int, metavar="", required=False, help="reference will be split into bins of the specified size (default:1000)", default=1000) 548 | GC_bias_parser.add_argument("--plot", required=False, help="results visualization", action='store_true') 549 | GC_bias_parser.add_argument("--cpu", type=int, metavar="", required=False, help="number of processes or threads (recommend to set this equal to the number of chromosomes, default:10)", default=10) 550 | 551 | methylation_parser = subparsers.add_parser('modbin', help='Average modification proportion at regional level.', 552 | usage="\n %(prog)s --methyl --region \n\n" 553 | "example for table (sample_ID data_type file_path):\n" 554 | " sample_A ONT /home/user/data/S1_5mC.txt\n" 555 | " sample_B ONT /home/user/data/S2_5mC.txt\n" 556 | " sample_C ONT /home/user/data/S3_5mC.txt\n" 557 | " ..." 558 | "\n\nexample for methylation file (Chrom Start End Value):\n" 559 | " contig_A\t132\t133\t0.92\n" 560 | " contig_A\t255\t256\t0.27\n" 561 | " contig_A\t954\t955\t0.52\n" 562 | " ..." 
563 | "\n\nnote:\n" 564 | " version: " + str(version) + "\n" 565 | " data_type: ONT, ONT_RNA, or Pacbio\n" 566 | " For more details, please refer to the documentation: https://giraffe-documentation.readthedocs.io/en/latest.") 567 | 568 | 569 | methylation_parser.add_argument("--methyl", type=str, metavar="", required=True, help="table of methylation files") 570 | methylation_parser.add_argument("--region", type=str, metavar="", required=True, help="target region file (Chromosome\tStart\tEnd\tRegion_name)") 571 | methylation_parser.add_argument("--cpu", type=int, metavar="", required=False, help="number of processes or threads (recommend to set this equal to the number of chromosomes, default:10)", default=10) 572 | methylation_parser.add_argument("--plot", required=False, help="results visualization", action='store_true') 573 | args = parser.parse_args() 574 | 575 | # Add function to print help if no arguments are provided 576 | if len(sys.argv) == 1: 577 | parser.print_help(sys.stderr) 578 | sys.exit(1) 579 | 580 | # Call the appropriate function based on the subparser used 581 | if args.function == "observe": 582 | if len(sys.argv) == 2: 583 | observed_parser.print_help(sys.stderr) 584 | sys.exit(1) 585 | else: 586 | observed(args) 587 | 588 | elif args.function == "modbin": 589 | methylation(args) 590 | 591 | elif args.function == "gcbias": 592 | GC_bias(args) 593 | 594 | elif args.function == "estimate": 595 | if len(sys.argv) == 2: 596 | estimated_parser.print_help(sys.stderr) 597 | sys.exit(1) 598 | else: 599 | estimated(args) 600 | else: 601 | total(args) 602 | #create a summary in html 603 | --------------------------------------------------------------------------------