├── AY179745.fasta ├── Bos_taurus_Chr1.bed ├── Bos_taurus_Chr1_GTF.gtf ├── D10845.fasta ├── D10847.fasta ├── Downloading_FASTA_for_retrieved_accession_id_from_NCBI.ipynb ├── Extracting_primary_structure_from_PDB_file.ipynb ├── GC_content_multi_fasta.ipynb ├── GC_content_single_fasta.ipynb ├── GO_enrichment_analysis.py ├── README.md ├── Retrieving_secondary_structure_from_PDB_file_using_DSSP.ipynb ├── adjacency_list.txt ├── append.py ├── average_read_length_fastq.py ├── basic_ops_biopython.py ├── biopython_parse_fasta.py ├── biopython_parse_genbank.py ├── biopython_read_fasta.py ├── biopython_read_genbank.py ├── bismark.txt ├── check_pallindrome.ipynb ├── cnt_ATGC_biopython.py ├── code_request_001.py ├── count_ATGC.py ├── count_headers.py ├── count_kmers.ipynb ├── dataset_294_4.txt ├── demo.pdb ├── downloaded_fasta01.fasta ├── extract_identical_headers.ipynb ├── extracting_only_gene_ids.ipynb ├── fasta_converted_1.fasta ├── fasta_converted_2.fasta ├── fastq_to_fasta_1.ipynb ├── fastq_to_fasta_2.ipynb ├── file1.fasta ├── file2.fasta ├── file3.fasta ├── find_GC_by_position.py ├── gene_ids.txt ├── generate_kmers.ipynb ├── generate_pallindrome.ipynb ├── getting_sequence_in_one_line.py ├── gtf_to_bed.py ├── headers.txt ├── histogram_of_qualities.py ├── input_fasta.fasta ├── mean_methylation_rate.py ├── merged.fasta ├── merging_fasta_files.py ├── merging_fasta_files_modified.py ├── metagenomic.fasta ├── metagenomic_out.fasta ├── multi_GC_input.fasta ├── multi_fasta.fasta ├── noname_fastq.fastq ├── out_file_0.fasta ├── out_file_1.fasta ├── output.fasta ├── output_bismark.txt ├── pairwise_local_seq_align.ipynb ├── pairwise_seq_align.ipynb ├── parsing_pdb.ipynb ├── pdb1crn.dssp ├── pdb1crn.ent ├── pdb5h7a.ent ├── phred_to_Q.py ├── primary_str_from_pdb.fasta ├── python_for_bioinformatics.pdf ├── query_cover.py ├── query_cover_filter.py ├── read1.py ├── read2.py ├── read3.py ├── reading_fastq.py ├── remove_seq.py ├── renaming_fastq_headers.ipynb ├── rev_comp_multi_fasta.fasta 
├── rev_comp_multi_fasta.py ├── rev_complement.py ├── sample.fastq ├── sample_fastq.fastq ├── sample_multi_fasta.fasta ├── seq_from_fasta_using_id_1.py ├── seq_from_fasta_using_id_2.py ├── sequence_length_distribution_from_Fastq.ipynb ├── single_GC_input.fasta ├── total_reads_fastq.py ├── trie_construction.ipynb ├── withname_fastq.fastq └── write.py /AY179745.fasta: -------------------------------------------------------------------------------- 1 | >AY179745.1 Ursus maritimus isolate PBX2818 SRY (SRY) gene, partial cds 2 | AACGCATTCATGGTGTGGTCTCGTGATCAAAGGCGCAAGGTGGCTCTAGAGAATCCCCAA 3 | ATGCAAAACTCAGAGATCAGCAAGCAGCTGGGGTA 4 | -------------------------------------------------------------------------------- /D10845.fasta: -------------------------------------------------------------------------------- 1 | >D10845.1 Capra hircus SRY gene, conserved motif, partial sequence 2 | CTCGTGAACGAAGACGAAAGGTGGCTCTAGAGAATCCCAAATTGCAAAACTCAGAGATCA 3 | GCAAGCAGCTGGGATACGAGTGGAAAAGGCTTACAGATGCTGAAAAGCGCCCATTCTTTG 4 | AGGAGGCACAGAGACTACTAGCTATA 5 | -------------------------------------------------------------------------------- /D10847.fasta: -------------------------------------------------------------------------------- 1 | >D10847.1 Ovis ovis SRY gene, conserved motif, partial sequence 2 | CTCGTGAACGAAGACGAAAGGTGGCTCTAGAGAATCCCAAACTGCAAAACTCAGAGATCA 3 | GCAAGCAGCTGGGATACGAGTGGAAAAGGCTTACAGATGCTGAAAAGCGCCCATTCTTTG 4 | AGGAGGCACAGAGACTACTAGCTATA 5 | -------------------------------------------------------------------------------- /Downloading_FASTA_for_retrieved_accession_id_from_NCBI.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from Bio import Entrez\n", 10 | "from Bio import SeqIO\n", 11 | "Entrez.email = \"763936021a@gmail.com\" \n", 12 | "\n", 13 | "##### fetching accession ids from ncbi ####\n", 
14 | "database = \"nucleotide\"\n", 15 | "search_term = \"myoglobin homo sapiens\"\n", 16 | "id_type = \"acc\"\n", 17 | "ret_max = 10\n", 18 | "\n", 19 | "acc_handle = Entrez.esearch(db=database, term=search_term, retmax=ret_max, idtype = id_type)\n", 20 | "record = Entrez.read(acc_handle)\n", 21 | "\n", 22 | "retrieved_ids = record[\"IdList\"]\n", 23 | "print(\"Retrieved accesion ids: \",retrieved_ids)\n", 24 | "##### fetching accession ids from ncbi ####" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "#### downloading fasta for the fetched ids ####\n", 34 | "out_file = r\"D:\\Pycharm_project\\ML_Lab\\downloaded_fasta.fasta\"\n", 35 | "\n", 36 | "fasta_handle = Entrez.efetch(db=\"nucleotide\", id=retrieved_ids, rettype=\"fasta\", retmode=\"text\")\n", 37 | "\n", 38 | "# writing fasta records to out file \n", 39 | "with open(out_file,\"w\") as fw:\n", 40 | " fw.write(fasta_handle.read())" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# reading fasta records from out file\n", 50 | "fasta_records = SeqIO.parse(out_file, \"fasta\")\n", 51 | "\n", 52 | "for fasta_record in fasta_records:\n", 53 | " print(fasta_record.description)\n", 54 | " " 55 | ] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Python 3", 61 | "language": "python", 62 | "name": "python3" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 3 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython3", 74 | "version": "3.6.4" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 2 79 | } 80 | -------------------------------------------------------------------------------- /Extracting_primary_structure_from_PDB_file.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# DOWNLOADING PDB FILE \n", 10 | "# EXTRACTING PRIMARY STRUCTURE\n", 11 | "# SAVING IT TO A FASTA FILE\n", 12 | "\n", 13 | "from pathlib import Path\n", 14 | "from Bio.PDB import PDBList\n", 15 | "from Bio import SeqIO\n", 16 | "from Bio.SeqRecord import SeqRecord\n", 17 | "\n", 18 | "out_dir = Path(r\"D:\\Pycharm_project\\ML_Lab\") # destination path for our PDB file\n", 19 | "out_fasta_path = out_dir/\"fasta_from_pdb.fasta\" # destination path for our fasta file\n", 20 | "pdb_code = \"5H7A\" # PDB ID of our desired protein\n", 21 | "file_name = \"pdb\"+pdb_code+\".ent\"\n", 22 | "pdb_file_path = out_dir/file_name\n", 23 | "\n", 24 | "# try and except block to prevent re-downloading the same file on multiple executions of the code\n", 25 | "try:\n", 26 | " # trying to open the file\n", 27 | " # will print the given message if the PDB file already exists\n", 28 | " fr = open(pdb_file_path,\"r\")\n", 29 | " fr.close()\n", 30 | " print(\"pdb file already exist\")\t\n", 31 | "\n", 32 | "except FileNotFoundError:\n", 33 | " # will download the file only if it does not already exist; other errors (e.g. permissions) are not hidden\n", 34 | " pdbl = PDBList()\n", 35 | " pdbl.retrieve_pdb_file(pdb_code, pdir = out_dir, file_format = \"pdb\")" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# Parsing primary structure from the PDB file (read from out_dir, where it was downloaded)\n", 45 | "record_list = SeqIO.parse(pdb_file_path,\"pdb-atom\")\n", 46 | "fasta_record_list = []\n", 47 | "for record in record_list:\n", 48 | " header = record.id # SeqIO.write adds the leading '>' itself\n", 49 | " sequence = record.seq\n", 50 | " fasta_record_list.append(SeqRecord(id = header, seq = sequence))\n", 51 | "\n", 52 | "# writing the primary structures to fasta file\n", 53 | 
"SeqIO.write(fasta_record_list,out_fasta_path,\"fasta\")\n", 54 | "print(\"Fasta file successfully created at: \", out_fasta_path )" 55 | ] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Python 3", 61 | "language": "python", 62 | "name": "python3" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 3 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython3", 74 | "version": "3.6.4" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 2 79 | } 80 | -------------------------------------------------------------------------------- /GC_content_multi_fasta.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | ">header1\n", 13 | "Content of C: 22.31 %\n", 14 | "Content of G: 28.46 %\n", 15 | "\n", 16 | ">header2\n", 17 | "Content of C: 23.44 %\n", 18 | "Content of G: 29.26 %\n", 19 | "\n", 20 | ">header3\n", 21 | "Content of C: 46.11 %\n", 22 | "Content of G: 18.33 %\n", 23 | "\n", 24 | ">header4\n", 25 | "Content of C: 34.04 %\n", 26 | "Content of G: 22.46 %\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "# Finding G and C content for a multi fasta file\n", 32 | "\n", 33 | "# reading the input multi fasta file\n", 34 | "fr=open('multi_GC_input.fasta','r')\n", 35 | "\n", 36 | "seq = \"\" # this will store the sequence of each fasta \n", 37 | "\n", 38 | "for line in fr: # looping over each line in our fasta file\n", 39 | " if \">\" in line: # check if it is a header line\n", 40 | " if seq != \"\": # check if the seq variable is an empty string or not\n", 41 | " length = len(seq) # this will store the length of a fasta sequence\n", 42 | " print('Content of C: ', 
round((seq.count(\"C\")/length)*100,2),'%') # finding and printing % of C\n", 43 | " print('Content of G: ', round((seq.count(\"G\")/length)*100,2),'%') # finding and printing % of G\n", 44 | " print(\"\")\n", 45 | " seq = \"\" # again making seq as empty string to store the sequence of the next fasta sequence\n", 46 | " print(line, end=\"\") # this line prints the header\n", 47 | " \n", 48 | " else: # if it is not a header line\n", 49 | " line = line.rstrip(\"\\n\") # remove '\\n' from the end of the line\n", 50 | " seq = seq+line # concatenate it to seq\n", 51 | " \n", 52 | "# closing the input file\n", 53 | "fr.close()\n", 54 | "length = len(seq) # this will store the length of last fasta sequence\n", 55 | "print('Content of C: ', round((seq.count(\"C\")/length)*100,2),'%') # finding and printing % of C\n", 56 | "print('Content of G: ', round((seq.count(\"G\")/length)*100,2),'%') # finding and printing % of G" 57 | ] 58 | } 59 | ], 60 | "metadata": { 61 | "kernelspec": { 62 | "display_name": "Python 3", 63 | "language": "python", 64 | "name": "python3" 65 | }, 66 | "language_info": { 67 | "codemirror_mode": { 68 | "name": "ipython", 69 | "version": 3 70 | }, 71 | "file_extension": ".py", 72 | "mimetype": "text/x-python", 73 | "name": "python", 74 | "nbconvert_exporter": "python", 75 | "pygments_lexer": "ipython3", 76 | "version": "3.6.4" 77 | } 78 | }, 79 | "nbformat": 4, 80 | "nbformat_minor": 2 81 | } 82 | -------------------------------------------------------------------------------- /GC_content_single_fasta.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Content of C: 21.62 %\n", 13 | "Content of G: 26.68 %\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "# Finding G and C content for a single fasta file\n", 19 | "\n", 20 | "# 
reading the input single fasta file\n", 21 | "fr=open('single_GC_input.fasta','r')\n", 22 | "\n", 23 | "seq = \"\" # this will store the entire fasta as a single string\n", 24 | "\n", 25 | "# looping over each line in our fasta file\n", 26 | "for line in fr:\n", 27 | " if \">\" in line: # check if it a header line\n", 28 | " continue\n", 29 | " else:\n", 30 | " line = line.rstrip(\"\\n\")\n", 31 | " seq = seq+line\n", 32 | " \n", 33 | "# closing the input file\n", 34 | "fr.close()\n", 35 | "\n", 36 | "total_len = len(seq) # this will store total number of bases in the sequence\n", 37 | "print('Content of C: ', round((seq.count(\"C\")/total_len)*100,2),'%') # finding and printing % of C\n", 38 | "print('Content of G: ', round((seq.count(\"G\")/total_len)*100,2),'%') # finding and printing % of G" 39 | ] 40 | } 41 | ], 42 | "metadata": { 43 | "kernelspec": { 44 | "display_name": "Python 3", 45 | "language": "python", 46 | "name": "python3" 47 | }, 48 | "language_info": { 49 | "codemirror_mode": { 50 | "name": "ipython", 51 | "version": 3 52 | }, 53 | "file_extension": ".py", 54 | "mimetype": "text/x-python", 55 | "name": "python", 56 | "nbconvert_exporter": "python", 57 | "pygments_lexer": "ipython3", 58 | "version": "3.6.4" 59 | } 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 2 63 | } 64 | -------------------------------------------------------------------------------- /GO_enrichment_analysis.py: -------------------------------------------------------------------------------- 1 | 2 | # GO (Gene Ontology) enrichment analysis using python 3 | 4 | import pandas as pd 5 | import xlsxwriter 6 | import os 7 | 8 | in_path = "enter path of your input excel file" 9 | # e.g: in_path = "C:\Users\dell\Desktop\testing_go_en\GO_enrichment_input.xlsx" 10 | 11 | sheet_name = "enter sheet name" 12 | # e.g: sheet_name = "UpRegulated" 13 | 14 | sheet_col_name_list = [enter the names of the three GO columns] 15 | # e.g: sheet_col_name_list = ["GO Biological Process Term","GO 
Cellular Component Term","GO Molecular Function Term"] 16 | 17 | def Go_En(in_path,sheet_name,sheet_col_name_list): 18 | #reading our input excel file and specifying the sheet name and converting it to dataframe: 19 | 20 | df=pd.read_excel(in_path,sheet_name) 21 | 22 | in_path=in_path.strip("GO_enrichment_input.xlsx") 23 | out_path=in_path+sheet_name #sheet_name is same as our folder name and out_path is where the folder is created 24 | try: 25 | os.mkdir(out_path) 26 | except: 27 | print("it will overwrite") 28 | workbook_path=out_path+"/"+sheet_name+"_"+"GO_Enrichment_Analysis.xlsx" 29 | workbook = xlsxwriter.Workbook(workbook_path) 30 | 31 | for sheet_col_name in sheet_col_name_list: 32 | #exracting the GO term we want to work on from the dataframe 33 | working_col=df[sheet_col_name] #working_col is a series 34 | working_col.dropna(inplace=True) #removing NAN values from our series 35 | 36 | gene_sym_GO = df[["Gene Symbol", sheet_col_name]] 37 | gene_sym_GO = gene_sym_GO.dropna() 38 | #print(gene_sym_GO) 39 | #for each string present in each row of our series 40 | #we replace the ",GO" with # and split the strings with "#" 41 | working_col=working_col.str.replace(",GO","#").str.split("#") 42 | 43 | #now each row of our series has a list instead of a string 44 | 45 | list1=[] 46 | for row in working_col: #looping over each row i.e a list, in our series 47 | 48 | for item in row: #looping over each term in our list 49 | # splitting each string of a list of a row by "~" 50 | #this will again create a list 51 | #extract the string at the index 1 of our generated list and remove "," from its end 52 | item=item.split("~")[1].strip(",") 53 | 54 | #appending the extracted string to a list 55 | list1.append(item) 56 | #print(list1) 57 | 58 | 59 | #assigning our list to a set to remove duplicates 60 | s1=set(list1) 61 | #print(len(s1)) 62 | #print(len(list1)) 63 | 64 | list2=[] 65 | for te in s1: 66 | x=() 67 | #x is a tuple 68 | x=(te,list1.count(te)) 69 | 70 | 
list2.append(x) 71 | 72 | 73 | df1=pd.DataFrame(list2) 74 | 75 | 76 | 77 | 78 | #FINDING CORRESPONDNG GENE SYMBOL FOR RESPECTIVE GENE ONTOLOGY TERM 79 | #taking all the GO terms in 0th column of df1 in the variable term 80 | #df1[0] returns a series which is stored in term 81 | term = df1[0] 82 | #print(term) 83 | gene_symbol = [] #empty list to contain all the gene symbols for respective GO in the term series 84 | 85 | #looping over each GO term in term 86 | for item1 in term: 87 | counter=0 #counter variable 88 | gene_sym_str="" #stores the gene symbol for each GO term 89 | 90 | #looping over each row in our dataframe gene_sym_GO 91 | #it contains the columns of Gene Symbol and Gene Ontology(input) 92 | 93 | i=0 # i will store the no.of rows in gene_sym_GO 94 | for index, row in gene_sym_GO.iterrows(): 95 | 96 | i+=1 #incrementing i for each run of loop 97 | 98 | 99 | #converting our series 'row' to list which has gene symbol at 0th index 100 | #and Gene Ontology Term(input) at 1st index 101 | #i.e row_list[0]=gene symbol and row_list[1]=gene ontology term(input) 102 | row_list = row.tolist() 103 | 104 | 105 | #comparing each GO term(output) to GO term(input) for each row 106 | if item1 in row_list[1]: 107 | 108 | counter+=1 109 | if counter > 1: #will execute if item1 matches to more than one rows 110 | gene_sym_str += ","+row_list[0] 111 | 112 | 113 | else: #will execute ony if item1 matches to only one row 114 | gene_sym_str=row_list[0] 115 | 116 | if i==gene_sym_GO.shape[0]: #will execute only on the last iteration of the gene_sym_GO for loop 117 | # appending the final result in gene_symbol after 118 | gene_symbol.append(gene_sym_str) 119 | 120 | 121 | #print(gene_symbol) 122 | #print(len(gene_symbol)) 123 | 124 | 125 | 126 | #Adding gene_symbol to df1 by converting it to a dataframe 127 | # ERROR WAS IN INSERTING gene_symbol list INTO DATAFRAME 128 | 129 | 130 | gene_symbol_df=pd.DataFrame(gene_symbol) 131 | #print(gene_symbol_df) 132 | #print(df1[0]) 133 | 
concat_df=pd.concat([gene_symbol_df, df1],axis=1) 134 | concat_df.columns=[0,1,2] 135 | concat_df.sort_values(by=2,inplace=True,ascending=False) 136 | #print(concat_df[[0,1]]) 137 | 138 | 139 | 140 | # Writing Data To Xlsx worksheets : Gene Symbol, GO term, Count 141 | 142 | worksheet = workbook.add_worksheet(sheet_col_name) 143 | bold = workbook.add_format({'bold': 1}) 144 | 145 | headings = ['Gene Symbol','Term', 'Count'] 146 | 147 | worksheet.write_row('A1', headings, bold) 148 | worksheet.write_column('A2', concat_df[0]) 149 | worksheet.write_column('B2', concat_df[1]) 150 | worksheet.write_column('C2', concat_df[2]) 151 | 152 | #creating a chart object 153 | chart1=workbook.add_chart({'type': 'pie'}) 154 | chart1.add_series({ 155 | 'categories': [sheet_col_name,1,1,20,1], 156 | 'values': [sheet_col_name,1,2,20,2], 157 | 'data_labels':{'value':True}, 158 | }) 159 | 160 | # Add a title. 161 | chart1.set_title({'name': sheet_col_name}) 162 | 163 | # Set an Excel chart style. Colors with white outline and shadow. 164 | chart1.set_style(10) 165 | 166 | 167 | # Insert the chart into the worksheet (with an offset). 
168 | worksheet.insert_chart('E2', chart1, {'x_scale': 2.5, 'y_scale':2.5}) 169 | 170 | 171 | workbook.close() 172 | return workbook_path 173 | 174 | output_path = Go_En(in_path,sheet_name,sheet_col_name_list) 175 | 176 | print("file successfully generated at: ",output_path) 177 | 178 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 179 | ##your personal queries are also invited 180 | 181 | 182 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-for-Bioinformatics 2 | Python for Bioinformatics consists of a variety of Python programmes frequently used by bioinformaticians and biotechnologists while handling biological data, especially raw NGS data, and for solving problems related to genomics. 3 | -------------------------------------------------------------------------------- /Retrieving_secondary_structure_from_PDB_file_using_DSSP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# make sure you have 'dssp-2.0.4-win32.exe' file in your working directory \n", 10 | "# 'dssp-2.0.4-win32.exe' can be downloaded from https://github.com/ecapriotti/lb1-2/blob/master/dssp/dssp-2.0.4-win32.exe\n", 11 | "# Ubuntu and Mac users can refer to the following link for installing DSSP: https://ssbio.readthedocs.io/en/latest/instructions/dssp.html\n", 12 | "# dssp sample command: D:\\Pycharm_project\\ML_Lab\\PDB\\dssp-2.0.4-win32.exe -i 1crn.pdb -o 1crn.dssp\n", 13 | "\n", 14 | "from Bio.PDB import PDBList\n", 15 | "from pathlib import Path\n", 16 | "import os\n", 17 | "structure_id = \"1crn\"\n", 18 | "out_dir = Path(r\"D:\\Pycharm_project\\ML_Lab\") # destination path for our PDB file\n", 19 | "\n", 20 | "# 
downloading the PDB file\n", 21 | "pdbl = PDBList()\n", 22 | "pdbl.retrieve_pdb_file(structure_id, pdir = out_dir, file_format = \"pdb\")\n", 23 | "\n", 24 | "filename = \"pdb1crn.ent\"\n", 25 | "out_file = filename.split(\".\")[0]+\".dssp\"\n", 26 | "\n", 27 | "# command for parsing the secondary structure using DSSP\n", 28 | "command = r\"D:\\Pycharm_project\\ML_Lab\\PDB\\dssp-2.0.4-win32.exe -i \" + filename + \" -o \" + out_file\n", 29 | "\n", 30 | "os.system(command)\n", 31 | "print(\"Output dssp file successfully created at: \", out_dir/out_file)" 32 | ] 33 | } 34 | ], 35 | "metadata": { 36 | "kernelspec": { 37 | "display_name": "Python 3", 38 | "language": "python", 39 | "name": "python3" 40 | }, 41 | "language_info": { 42 | "codemirror_mode": { 43 | "name": "ipython", 44 | "version": 3 45 | }, 46 | "file_extension": ".py", 47 | "mimetype": "text/x-python", 48 | "name": "python", 49 | "nbconvert_exporter": "python", 50 | "pygments_lexer": "ipython3", 51 | "version": "3.6.4" 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 2 56 | } 57 | -------------------------------------------------------------------------------- /append.py: -------------------------------------------------------------------------------- 1 | 2 | # appending contents to a file 3 | 4 | fr = open("path of the file in which you want to append","a") 5 | 6 | fr.write("Welcome to my fb page, enjoy learning python" + "\n") 7 | fr.close() 8 | 9 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 10 | ##your personal queries are also invited 11 | 12 | 13 | -------------------------------------------------------------------------------- /average_read_length_fastq.py: -------------------------------------------------------------------------------- 1 | # Finding the average length of the reads in a fastq file using biopython 2 | 3 | # Type "python average_read_length_fastq.py -i Path of your fastq file" for running the code 4 | # 
Example: python average_read_length_fastq.py -i C:\Users\dell\Desktop\test\sample_fastq.fastq 5 | 6 | # Type "python average_read_length_fastq.py -h" for help/usage description 7 | 8 | # Output: Average length of the reads in a fastq file 9 | 10 | # Sample input file: sample_fastq.fastq 11 | 12 | from Bio import SeqIO 13 | import argparse 14 | 15 | parser=argparse.ArgumentParser(description="Finding the average length of the reads in a fastq file , USAGE: python average_read_length_fastq.py -i path/to/fastq/file ") 16 | parser.add_argument("-i", help="ENTER FULL PATH OF THE FASTQ FILE") 17 | args = parser.parse_args() 18 | 19 | fastq_path = args.i 20 | fastq_records = SeqIO.parse(fastq_path,"fastq") 21 | total_read_count = 0 22 | total_read_length = 0 23 | for fastq_record in fastq_records: 24 | total_read_count += 1 25 | total_read_length += len(fastq_record.seq) 26 | 27 | average_read_length = total_read_length//total_read_count 28 | print("Average read length in fastq file is: ",average_read_length) 29 | 30 | ## please leave a message if you face any issues while using the code 31 | ## your personal queries are also invited -------------------------------------------------------------------------------- /basic_ops_biopython.py: -------------------------------------------------------------------------------- 1 | 2 | # some basic operations on a sequence using biopython 3 | 4 | from Bio.Seq import Seq 5 | my_seq = Seq("AGAACCGGTAGCTGACGT") 6 | print(my_seq) 7 | print(len(my_seq)) 8 | print(my_seq[::-1] 9 | print(my_seq[0:2]) 10 | print(my_seq.count("A")) 11 | print(my_seq.count("T")) 12 | print(my_seq.count("G")) 13 | print(my_seq.count("C")) 14 | print(my_seq.reverse_complement()) 15 | print(my_seq.transcribe()) 16 | print(my_seq.translate()) 17 | 18 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 19 | ##your personal queries are also invited 20 | 21 | 22 | 
-------------------------------------------------------------------------------- /biopython_parse_fasta.py: -------------------------------------------------------------------------------- 1 | 2 | # reading a fasta file having multiple sequences using biopython 3 | 4 | from Bio.SeqIO import parse 5 | for seq_record in parse("path of your fasta file","fasta"): 6 | print(seq_record) 7 | print(seq_record.seq) 8 | print(seq_record.id) 9 | print(seq_record.name) 10 | print(seq_record.description) 11 | 12 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 13 | ##your personal queries are also invited 14 | 15 | -------------------------------------------------------------------------------- /biopython_parse_genbank.py: -------------------------------------------------------------------------------- 1 | 2 | # reading a genbank file having multiple sequences using biopython 3 | 4 | from Bio.SeqIO import parse 5 | for seq_record in parse("path of your genbank file","genbank"): 6 | print(seq_record) 7 | print(seq_record.seq) 8 | print(seq_record.id) 9 | print(seq_record.name) 10 | print(seq_record.description) 11 | 12 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 13 | ##your personal queries are also invited 14 | 15 | 16 | -------------------------------------------------------------------------------- /biopython_read_fasta.py: -------------------------------------------------------------------------------- 1 | 2 | # reading a fasta file having a single sequence using biopython 3 | 4 | from Bio.SeqIO import read 5 | seq_record = SeqIO.read("path of your single fasta file","fasta") 6 | print(seq_record) 7 | print(seq_record.seq) 8 | print(seq_record.id) 9 | print(seq_record.name) 10 | print(seq_record.description) 11 | 12 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 13 | ##your 
personal queries are also invited 14 | 15 | 16 | -------------------------------------------------------------------------------- /biopython_read_genbank.py: -------------------------------------------------------------------------------- 1 | 2 | # reading a genbank file having a single sequence using biopython 3 | 4 | from Bio.SeqIO import read 5 | seq_record = SeqIO.read("path of your single genbank file","genbank") 6 | print(seq_record) 7 | print(seq_record.seq) 8 | print(seq_record.id) 9 | print(seq_record.name) 10 | print(seq_record.description) 11 | 12 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 13 | ##your personal queries are also invited 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /bismark.txt: -------------------------------------------------------------------------------- 1 | chr1 136388 136388 0.777778 NR_039983_6 2 | chr1 136409 136409 0 NR_039983_6 3 | chr1 136423 136423 1 NR_039983_6 4 | chr1 136425 136425 0 NR_039983_6 5 | chr1 136473 136473 1 NR_039983_6 6 | chr1 661865 661865 0 NR_028327_4 7 | chr1 661900 661900 0 NR_028327_4 8 | chr1 661905 661905 0 NR_028327_4 9 | chr1 661925 661925 0 NR_028327_4 10 | chr1 762972 762972 0 NR_047526_1 11 | chr1 762972 762972 0 NR_015368_1 12 | chr1 762972 762972 0 NR_047519_1 13 | chr1 762972 762972 0 NR_047520_1 14 | chr1 762972 762972 0 NR_047521_1 15 | chr1 762972 762972 0 NR_047522_1 16 | chr1 762972 762972 0 NR_047523_1 17 | chr1 762972 762972 0 NR_047524_1 18 | chr1 805475 805475 0 NR_027055_5 19 | chr1 805478 805478 0 NR_027055_5 20 | chr1 805480 805480 0 NR_027055_5 21 | chr1 805485 805485 0 NR_027055_5 22 | chr1 805487 805487 0 NR_027055_5 23 | chr1 805499 805499 0 NR_027055_5 24 | chr1 805501 805501 0 NR_027055_5 25 | chr1 805520 805520 1 NR_027055_5 26 | chr1 805523 805523 0 NR_027055_5 27 | chr1 861218 861218 0 NM_152486_1 28 | chr1 861241 861241 0 NM_152486_1 29 | chr1 861250 
861250 1 NM_152486_1 30 | chr1 861252 861252 1 NM_152486_1 31 | chr1 861260 861260 1 NM_152486_1 32 | chr1 861265 861265 1 NM_152486_1 33 | chr1 861287 861287 0.4 NM_152486_1 34 | chr1 861297 861297 0.4 NM_152486_1 35 | chr1 861317 861317 0.4 NM_152486_1 36 | chr1 861353 861353 0.6 NM_152486_1 37 | chr1 861360 861360 0.6 NM_152486_1 38 | chr1 863508 863508 1 NM_152486_3 39 | chr1 863540 863540 1 NM_152486_3 40 | chr1 863545 863545 1 NM_152486_3 41 | chr1 863549 863549 1 NM_152486_3 42 | chr1 863556 863556 0 NM_152486_3 43 | chr1 864803 864803 1 NM_152486_4 44 | chr1 864819 864819 1 NM_152486_4 45 | chr1 865539 865539 0 NM_152486_5 46 | chr1 867870 867870 1 NM_152486_8 47 | chr1 867891 867891 1 NM_152486_8 48 | chr1 867913 867913 1 NM_152486_8 49 | chr1 868602 868602 1 NM_152486_8 50 | chr1 868611 868611 1 NM_152486_8 51 | chr1 868684 868684 1 NM_152486_9 52 | chr1 872512 872512 1 NM_152486_13 53 | chr1 872519 872519 0.5 NM_152486_13 54 | chr1 874638 874638 1 NM_152486_15 55 | chr1 874658 874658 0 NM_152486_15 56 | chr1 874670 874670 1 NM_152486_15 57 | chr1 874672 874672 1 NM_152486_15 58 | chr1 875282 875282 0 NM_152486_16 59 | chr1 875310 875310 0 NM_152486_16 60 | chr1 875315 875315 0 NM_152486_16 61 | chr1 875346 875346 0 NM_152486_16 62 | chr1 875365 875365 1 NM_152486_16 63 | chr1 875636 875636 0 NM_152486_16 64 | chr1 875640 875640 0 NM_152486_16 65 | chr1 875644 875644 0 NM_152486_16 66 | chr1 875650 875650 0 NM_152486_16 67 | chr1 875651 875651 0 NM_152486_16 68 | chr1 875652 875652 0 NM_152486_16 69 | chr1 875653 875653 0 NM_152486_16 70 | chr1 875667 875667 0 NM_152486_16 71 | chr1 875669 875669 0 NM_152486_16 72 | chr1 875670 875670 0 NM_152486_16 73 | chr1 875672 875672 0 NM_152486_16 74 | chr1 875673 875673 0 NM_152486_16 75 | chr1 875675 875675 0 NM_152486_16 76 | chr1 875688 875688 0 NM_152486_16 77 | chr1 875689 875689 0 NM_152486_16 78 | chr1 875693 875693 0 NM_152486_16 79 | chr1 875694 875694 0 NM_152486_16 80 | chr1 875696 875696 0 NM_152486_16 
81 | chr1 875697 875697 0 NM_152486_16 82 | chr1 875698 875698 0 NM_152486_16 83 | chr1 875699 875699 0 NM_152486_16 84 | chr1 875705 875705 0 NM_152486_16 85 | chr1 875736 875736 0 NM_152486_16 86 | chr1 875742 875742 0 NM_152486_16 87 | chr1 875744 875744 0 NM_152486_16 88 | chr1 875750 875750 0 NM_152486_16 89 | chr1 875760 875760 0 NM_152486_16 90 | chr1 875769 875769 0 NM_152486_16 91 | chr1 875774 875774 0 NM_152486_16 92 | chr1 875778 875778 0 NM_152486_16 93 | chr1 875782 875782 0 NM_152486_16 94 | chr1 875784 875784 0 NM_152486_16 95 | chr1 875787 875787 0 NM_152486_16 96 | chr1 876538 876538 0 NM_152486_17 97 | chr1 876551 876551 0 NM_152486_17 98 | chr1 876552 876552 0 NM_152486_17 99 | chr1 876605 876605 0 NM_152486_17 100 | chr1 876606 876606 0 NM_152486_17 101 | chr1 876616 876616 0 NM_152486_17 102 | chr1 876617 876617 0 NM_152486_17 103 | chr1 876619 876619 0 NM_152486_17 104 | chr1 876620 876620 0 NM_152486_17 105 | chr1 876643 876643 0 NM_152486_17 106 | -------------------------------------------------------------------------------- /check_pallindrome.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 10, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "ACCTAGGT is pallindromic\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "# Check whether a sequence is pallindromic or not\n", 18 | "# A dna sequence is pallindromic if it is same as its reverse complement\n", 19 | "from Bio.Seq import Seq\n", 20 | "dna = 'ACCTAGGT' # our input dna sequence as string\n", 21 | "dna = Seq(dna) # converting our input dna sequence to a Seq object\n", 22 | "reverse_complement = dna.reverse_complement() # finding reverse complement of dna\n", 23 | "# checking if dna is same as reverse complement\n", 24 | "if dna == reverse_complement: \n", 25 | " print(f'{dna} is pallindromic')\n", 26 | 
"else:\n", 27 | " print(f'{dna} is not pallindromic')" 28 | ] 29 | } 30 | ], 31 | "metadata": { 32 | "kernelspec": { 33 | "display_name": "Python 3", 34 | "language": "python", 35 | "name": "python3" 36 | }, 37 | "language_info": { 38 | "codemirror_mode": { 39 | "name": "ipython", 40 | "version": 3 41 | }, 42 | "file_extension": ".py", 43 | "mimetype": "text/x-python", 44 | "name": "python", 45 | "nbconvert_exporter": "python", 46 | "pygments_lexer": "ipython3", 47 | "version": "3.6.4" 48 | } 49 | }, 50 | "nbformat": 4, 51 | "nbformat_minor": 2 52 | } 53 | -------------------------------------------------------------------------------- /cnt_ATGC_biopython.py: -------------------------------------------------------------------------------- 1 | # counting ATGC in your multiple fasta file using biopython and generating bar plots for the same using matplotlib 2 | 3 | import numpy as np 4 | from Bio.SeqIO import parse 5 | from matplotlib import pyplot as plt 6 | count={"A":[],"G":[],"C":[],"T":[]} 7 | header=[] 8 | for sec_rec_obj in parse("path of your fasta file","fasta"): 9 | count["A"].append(sec_rec_obj.seq.count("A")) 10 | count["G"].append(sec_rec_obj.seq.count("G")) 11 | count["C"].append(sec_rec_obj.seq.count("C")) 12 | count["T"].append(sec_rec_obj.seq.count("T")) 13 | header.append(sec_rec_obj.name) 14 | 15 | #CODE FOR PLOTTING: 16 | 17 | x_index=np.arange(len(header)) 18 | width=0.2 19 | fig,ax=plt.subplots() 20 | ax.bar(x_index-width, count["A"],width=width,label="Count of A",color="yellow") 21 | ax.bar(x_index, count["G"],width=width,label="Count of G",color="green") 22 | ax.bar(x_index+width, count["C"],width=width,label="Count of C",color="blue") 23 | ax.bar(x_index+(2*width), count["T"],width=width,label="Count of T",color="red") 24 | ax.set_xticks(x_index) 25 | ax.set_xticklabels(header,rotation=45,ha="right") 26 | ax.legend() 27 | ax.set_xlabel('headers of fasta') 28 | ax.set_ylabel('count') 29 | plt.show() 30 | 31 | ##please leave a message if you 
want a text version of this code or if you face any issues while using the code 32 | ##your personal queries are also invited 33 | 34 | 35 | -------------------------------------------------------------------------------- /code_request_001.py: -------------------------------------------------------------------------------- 1 | #### code_request_001 #### 2 | 3 | # Performing cutadapt on a set of fastq files 4 | # This code will perform cutadapt for both single and paired end files, taking one file at a time 5 | # The cutadapt command used in this code is "cutadapt -j 1 --error-rate=0.1 --times=1 --overlap=3 --minimum-length=20 --quality-cutoff=20 -o " + output_file + " " + input_file" 6 | # Input: Path of fastq files(both single or paired end) 7 | # Output: Trimmed output files(generated after cutadapt) of the respective input files 8 | 9 | #### Enter the path of your input fastq files #### 10 | input_file_list = ["path of file1","path of file2"...] 11 | # e.g: input_file_list = ["C:\Users\dell\Desktop\file1.fastq","C:\Users\dell\Desktop\file2.fastq"...] 
#### Enter the path of your input fastq files ####

from subprocess import check_output
import os
import shlex
import subprocess

# Options passed unchanged to every cutadapt invocation.
_CUTADAPT_OPTIONS = (
    "-j", "1",
    "--error-rate=0.1",
    "--times=1",
    "--overlap=3",
    "--minimum-length=20",
    "--quality-cutoff=20",
)


# function fetch_cutadapt for fetching and returning system's cutadapt path
def fetch_cutadapt_path():
    """Return the path of the system's cutadapt executable followed by a space.

    The path is obtained by running ``which cutadapt``; ``check_output``
    raises ``subprocess.CalledProcessError`` when ``which`` exits non-zero.
    The trailing space is kept because callers treat the result as the
    prefix of a command line.
    """
    out = check_output(["which", "cutadapt"])
    return str(out, 'UTF8').strip("\n") + " "


def _trimmed_output_path(file):
    """Return the output path for *file*: ``<name>_trimmed.fq``."""
    suffix = ".fq"
    # NOTE: str.strip(".fq") would remove any of the characters '.', 'f' and
    # 'q' from BOTH ends of the path (e.g. "./fq_reads/a_q.fq" -> "/fq_reads/a_"),
    # so the extension is removed explicitly and only when it is present.
    stem = file[:-len(suffix)] if file.endswith(suffix) else file
    return stem + "_trimmed" + suffix


# function cutadapt for running the cutadapt command on each file
def cutadapt(file, cutadapt_comm_path):
    """Run cutadapt on one fastq file and report the outcome.

    Args:
        file: path of the input fastq file.
        cutadapt_comm_path: command prefix used to start cutadapt, as
            returned by ``fetch_cutadapt_path`` (surrounding whitespace is
            ignored; the prefix is split into words with ``shlex.split``).

    Returns:
        The path of the trimmed output file when the command exits with
        status 0, otherwise ``None``.
    """
    filename = file.split("/")[-1]
    # generating the output file path from input file path
    out_file_path = _trimmed_output_path(file)
    # generating the complete cutadapt command as an argument list so that
    # file paths containing spaces or shell metacharacters are passed intact
    command = shlex.split(cutadapt_comm_path)
    command.extend(_CUTADAPT_OPTIONS)
    command.extend(["-o", out_file_path, file])
    # running the cutadapt command
    try:
        exit_status = subprocess.call(command)
    except OSError as error:
        # the executable could not be started at all
        print("cutadapt failed for {}: {}".format(filename, error))
        return None
    if exit_status != 0:
        # do not claim success when the command reported an error
        print("cutadapt failed for {} (exit status {})".format(filename, exit_status))
        return None
    print("cutadapt successful for {}, output generated at: {}".format(filename,out_file_path))
    return out_file_path


# main function takes input_file_list as input and passes each file
# from the list to the cutadapt function
def main(input_file_list):
    """Run cutadapt on every path in *input_file_list*, one file at a time."""
    # calling fetch_cutadapt_path function and storing result in cutadapt_comm_path
    cutadapt_comm_path = fetch_cutadapt_path()
    # looping over each file path in input_file_list
    for file in input_file_list:
        # calling cutadapt function for each file
        cutadapt(file, cutadapt_comm_path)


# calling main function (only when executed as a script, so that the
# functions above can be imported without side effects)
if __name__ == "__main__":
    main(input_file_list)
    print("cutadapt successfull for all input files")

##please leave a message if you face any issues while using the code
open("path of your multiple fasta file","r") 5 | header = "" 6 | seq = "" 7 | for line in fr: 8 | if ">" in line: 9 | header = line 10 | if seq != "": 11 | print(seq) 12 | print("No. of A: ",seq.count("A")) 13 | print("No. of T: ",seq.count("T")) 14 | print("No. of G: ",seq.count("G")) 15 | print("No. of C: ",seq.count("C")) 16 | seq = "" 17 | print(header) 18 | 19 | else: 20 | line = line.rstrip("\n") 21 | seq = seq+line 22 | print(seq) 23 | print("No. of A: ",seq.count("A")) 24 | print("No. of T: ",seq.count("T")) 25 | print("No. of G: ",seq.count("G")) 26 | print("No. of C: ",seq.count("C")) 27 | 28 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 29 | ##your personal queries are also invited 30 | 31 | 32 | -------------------------------------------------------------------------------- /count_headers.py: -------------------------------------------------------------------------------- 1 | 2 | # counting headers of a multiple fasta file 3 | 4 | fr = open(r"path of your multiple fasta file","r") 5 | count = 0 6 | for line in fr: 7 | if (">" in line): 8 | count = count + 1 9 | fr.close() 10 | 11 | print("number of headers are: ", count) 12 | 13 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 14 | ##your personal queries are also invited 15 | 16 | 17 | -------------------------------------------------------------------------------- /count_kmers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "{'ATG': 2, 'TGA': 2, 'GAC': 3, 'ACG': 4, 'CGT': 1, 'GTG': 1, 'TGC': 1, 'GCA': 1, 'CAG': 1, 'AGT': 1, 'GTA': 1, 'TAG': 2, 'AGA': 3, 'CGA': 3, 'GAT': 3, 'ATA': 2, 'GAG': 1, 'TAA': 1, 'AAC': 1}\n" 13 | ] 14 | } 15 | ], 16 | 
"source": [ 17 | "# Finding the count of all possible kmers in a DNA sequence\n", 18 | "\n", 19 | "dna = \"ATGACGTGCAGTAGACGATAGAGATAACGATGACGA\" # our input dna sequence\n", 20 | "k = 3 # length of the kmer\n", 21 | "\n", 22 | "kmers = [] # this list stores all the generated kmers\n", 23 | "for i in range(len(dna) - k +1): # looping from 0 to length of DNA-k+1\n", 24 | " kmer = dna[i:i+k] # slicing DNA according to our kmer length\n", 25 | " kmers.append(kmer) # appending kmer to our list kmers\n", 26 | "kmer_count = {} # this dictionary will store the count of each of the kmers with kmer as key and count as value\n", 27 | "for i in range(len(kmers)): # looping over the length of our kmers list\n", 28 | " if kmers[i] in kmer_count: # if the kmer is already present in our kmer_count dictionary \n", 29 | " kmer_count[kmers[i]] += 1 # incrementing the count of that kmer by 1\n", 30 | " else: # if the kmer is not already present in our kmer_count dictionary\n", 31 | " kmer_count[kmers[i]] = 1 # initialising the count of that kmer as 1\n", 32 | "\n", 33 | "print(kmer_count)" 34 | ] 35 | } 36 | ], 37 | "metadata": { 38 | "kernelspec": { 39 | "display_name": "Python 3", 40 | "language": "python", 41 | "name": "python3" 42 | }, 43 | "language_info": { 44 | "codemirror_mode": { 45 | "name": "ipython", 46 | "version": 3 47 | }, 48 | "file_extension": ".py", 49 | "mimetype": "text/x-python", 50 | "name": "python", 51 | "nbconvert_exporter": "python", 52 | "pygments_lexer": "ipython3", 53 | "version": "3.6.4" 54 | } 55 | }, 56 | "nbformat": 4, 57 | "nbformat_minor": 2 58 | } 59 | -------------------------------------------------------------------------------- /dataset_294_4.txt: -------------------------------------------------------------------------------- 1 | AGTGATATCGGAGCTGGTTTTGAGTTGTCAGTGTGGTCTCCTGATAGCCCATGGGATCATATATTAGGTTCAGCGTGCTCAG 2 | GAAGCAATGGAGAGAATTACTACAAAATACAGTTGGCTCCTGATCTTAGCGATTGGGCATTGCATGCGCTTGGGTCGTTA 3 | 
CTGGCAGTAATGCATCCCCGCTTATCCTGCTCGGCTAAAGCATCATCCCAACTGACTAACGCTAGCTCCTTATCAGGAGTCCCATTACC 4 | TGTTGTCAAACCCGATCTGCGGTGGCCTATGGAAGGAGGATAAGGTCATACTTCTGAATCCTGACGATATAATAAAAAGTTATG 5 | ACTACTGCGCAAAAATAGACGTACAGATGAAACCTCCGGCACCCCGACAGTGGCTCGACAATTTTACAAAAACGGTGAGTACTTCGACGGAACTCTG 6 | TCGAGTCCATACCTCAACCAGCCTTTCCGCAGACTTTAATGAAGCAGACACCTCGTTCTGGCGAAAACATAGGACACCGGTTAGTTGAGAGCCCCTTG 7 | AGTGCGACAAACATATACCGGGATTCTTCCGCACACGAAACAGATTAGCTCGAACGTCCGCAGGGGGATTCCACCTATCGAGTCTC 8 | GTTGTTGTTCGTGGATGAGTTACACTGCCATTCATGGTCAAGAGGGGTCAATTGAATCAGGACGATCAAATATCTTAGTGCCGAGAAG 9 | ATGACTGATCGATCAAACTAATATTCGACAATGGGAGCCCGGCGGAACATTTAACCATGGGGTAAGTGATGGTGTGCCTA 10 | TGCAAAGGGCCAACTCGAGGTACCAAAACAATAAGAGCAGTAATACGGTCGGGAGGTTGGTCCGACCCGCCTCCATCTTCAACAATAACTTGGGTT 11 | GATTTTTCTTGTACCATCGCGGTGAGTCATAATGCGTATCTCATTATCTACCGGGAATTCTCCTCCAAGGCAAAGTTTTTCGTGCTTTGCAGGC 12 | AAGTACTACCGGTGGCTGCGGCCGGAAGCCCGACCAGCGAAGCTCAAAAGCACGGTCACACTGTTGGAAAATAGGTCCATGGGTGCTTGAACTACCCGA 13 | GGCGACAAGTTTTGAGTAGAGTAGCAAGTACGCAAACCACGATGGCCAGGTCTGTCCATAGCCTTTAGGACAAGCTTGGTCTGCACAG 14 | CCGAGATACCCCTTTTACTATGACATTGTATGGTTGTGGCTCACGGAGTCAACTCTTTCGGCTTCGGCATCTCGCCATGGGCGTGCTCT 15 | ACGGAGACTGCGGGAAACGTACCGGCTGTTGCATTGAAGACATCGGTCCGTAGGGTGCGCAAGACGATACTCCCTTCTCG 16 | AGCACGTCCAGGAGGTTCTAGTTTCGCCATTGTCCTTGCACAGGAAGTGCGAGCACATAAGATTAGAGAGGTCTTAAACCGCTACCAAG 17 | CCGCTTCGAGTTCCAGGACTGCCAGATCCCCAGTTTGGCCCCCCATTGACGTTCCTCCGCGTCGTAGTTCAAGGCAACAAATGAGCGAGG 18 | GCTAGGGAACGTGGCATATGCCACTGGCTTCCATATGAGTTCGGAGTGGTCGGCGTCTCTGATAGTCGTCACGGCTGACTATTAC 19 | TGTCCGCTTGCCCAGGCCTTTTGAGGGACACAGTCAACTTTCCTGTCTCCACACTAATCCGATCTCAGTCTTACTGTCCATATTAGGAAGCT 20 | AGTGTTAAGCAGTTTTTGTCTACAAATAGAATAGTGGACTCAATGCACCGTGTCACCCCCCAACATACATGTCTCACTAAGTGAAGGGGA 21 | CGAGAGAAGAAGGGCCGTAGCGAGCCCCCTGAGTCTTACTTCAGGTAATAATTTCCTGTCGGCTTGTATCAAAAATACGTTTTGAGGG 22 | GTTCAATCACGACGACTTCAACGACCAAGGAAGTTTTGGGCTCGTCCTCCTATCCGTGATAGCTAAGCCCAGACTTAGTTGG 23 | TTTTCCTCCAATACGCTCTTTCCTTTGGAGATCTTTTATACATATTTTATGAGAGAACGACGATTCAAGAGAGCTGGTTAGCCGATGACATTTT 24 | 
AATGCTGTAATGTTAGCGGAACCAGCCAGACCCGACGCTCGGCCCTACACTCTATTGCGAGGGTAAATGGGTGTAGGGAGTGCAGGATT 25 | CTGAGAGGCAGCGCACATACAGAGCGCGATTCACTTAGGGACCGAGGATGTTTTCATAGATCCGAGTGCGTGGCGACTTTTCGCATACC 26 | TGATCTAAAGCTCTCCTTACCGACGACTTCACATTCGCAATCCCCACGTGGAATGCGAATAAAGGAGTTACTCTCCCTACGGGGCACAATC 27 | GAATAGTATTTCCCAACCCACCGTCCGTAGGCTGTAGACAACATAGCTACTGGTGTATAAGCTGATATCGGTCTGGTCATTGA 28 | CTGCAAGACATATCTTGAACGCACGATTTCTCCAGACAACTGCTTCAGTACATCGAATCTTTGCAGAGTTGATTAAGAACTCTGACGACTAGACTACAAT 29 | GCCCGGGCGCCCTGTCAAGTTCTATAAAGCTGTGTTTGTATGGGCATAACCAAACGTCACTGTATGTCAGTTGTCGTTAAGACGTCTT 30 | TCTGCTGACCGTGCTGTGCCTCGGGAAGCGCGCTGTCCCGCCCTAGACGTTAGGTCCTGTTATTTGTTCCGTAGAGCCTCTATTGAAATTGCTCGTC 31 | CGTAGAAGATTCTTAGTACTAACTGTCTCCTCTAAGGCCGCGGCCACGGCAAACGTCTGTGTTCCCGGATGAGAGCACGTCTCTTG 32 | GAGGGTTTCTTGTACTTAGTTGTATCACGGTAGCGATATCAGTTTTATTCGGTATCATTGCGTCTACTTAGTTTCCCTTAAGGAGGCATGGCTCC 33 | ATATTGGACCTGGGCACCGGTATTCCGGAGCTTTAATGACCATTTGACTACTCTGCTATGCTTAAGTCTACTATTACGGCCACCGCGGACTGA 34 | TTACGCCACTCTGATCGACATTACCCTCGTCTGTGCTAGGTGGTTTTGGTGACGAGCGGGGGCGCGCACAAAAGCCCATTAACCCTAACTTCT 35 | GCTTGCCTGTCCCACCCACGCGCGAACAACATCATCTTGCCTGTCCAGTTATGATTGACCCAAGCAGTCTGCCCCCTAAACAGAAACATCC 36 | ATTCGACCGTGCGCCGACAACCACGGAAGCTTGGTCACCAGTATCCCCCCTTCCACCTAGGGGACATGCTATCTATGGCAAACCA 37 | ACCTTAGATACCGTGTCACATCGTCCATCCGTAGCATACTTACAATCCGGACGCAGATTAGTACAGTATAAATCCCTGCGC 38 | CTAAGAAGCCAACTGATTAGCGGCGAGTAGTACCGTTGGAGCGGGTTTGAGATCCCTGCGGATGGCTAACTAACAAGTCAAAAGAATTCCGCCCAAC 39 | AAATCCCATAAGACACATCACGAGTTACATTGCCCCAACTAGAAGGTTTTTGATTCTGTTAGCGCACCTTTAACGCACTTTTCTCAATCT 40 | TGGGCAGTAGGGAAAGAGTTCATTAATTCGCACCCCAGTAATCCATTCATTTACTTGCCCCTGTATCAATTTGGATGTTCGCTC 41 | AACAGATTCAGGGCCAATGCTCGTCCTGGAACTGCGAGCTTCAGGAAAGATTATGGAGTCACACAGCTCATTGGATCGATCCATTGTGGCCTAAGGTA 42 | CGGTTCGGCTCGAGTATGCAACGGAAGTTTGTAGCGCCGCGGGGCTTCTATTGAATAGAACGACGGAGCTGCTTGCAGGATA 43 | TTTGATTAATAAGTCACGCCGGAAGATAAATGTTAGCTGCGCCTCAGACCATGTCTATTTCCCTACATTCTGGAGCAATTGAGTACAATGCATTGTGT 44 | 
GCTCCCTCGTTGCTTTTAGGCGCTACTCCTCCCCCAAACAGCCACCTCCCCTAGACCCCCCCGTTCTAGAGCGCAACAATCATTGCTCTCTACATA 45 | GTTTCTTCGCCGAGTAGATCAAGGGCCCCCCGCAGGGAGCTAAAGGAGAGGGAAACTATTACGGACCAAAGGGCTCGTGAGGA 46 | ATCTAGATTGACTTGCGGGACCTCTAAGAAGTATACGCAAGGATTTGCAGAATAAGCATAACTGATCTGCGTGCACCAGACATAGCTTGAGAGCAC 47 | CATAGAATATAGAAGCTCGGCAATCGCTGGGAGATGTAGCGCGAGTGGTCTATTCGAGTAACCCATGGTGCAAATGCAAACGCTTTCAA 48 | GAGTTAACCTCTGTTACTTGATCCGTAACATGCCTAGGATCGCCAACAAACCGCGATTTGATGAGCAGCTGACACTGCCTCGCACATTTG 49 | CTGATCGCGTCGATGTACATCCTGTACGGCTCGCCAGCATCTGCTAGGCTTAGGTCAGAGCTTTAAAGAGCGTCGCCCCACGCAGTGGC 50 | CGCTAAATCCTCATCTTGGCCTCCTGGAGTTACCACTGCTAGATCCTGGTTTGAAAAGTGGCGGTAACAATGCTTTCTCACCAACTCGGCTGCGGCGAG 51 | GCTCGATAATAATCCTATCAGGACATGAATGAGTTCCCATCTCCCCCAGAATCATATTGCTCAACCTCCCTAGCACGTAAAAAGACCTTCGCGGGTTA 52 | TCCTGGGCAAGCACTACCCTCCCCGCTGAACTGGCACAGATCCTCCAAGACTAGTGGAATTATGGCGGACATGAACGCTTAGTTTTAGTGGGGCGC 53 | CGACTGGAAGGTCGCGGCTACCCCTCGACATCATAGTCCCGCACGGTGTCGCCGGAGAGTGATGGAAACGGAAAGAACTGT 54 | CCCACGACACCCAGGACCGTTAGAACCTCGTCAACTGGGTGAACAGGCCTCGTACCACCTATGAACATACAGCCCCGAGAAAAGAGGCCGAGACCA 55 | CTACGTGTTTAAATTACGTGATCAACTAGCTCTGCTGTCTGTGTTCGATAGAACAAACATGTACTGTCTGAGCGGGAGCATA 56 | AGTACATCCAATGTTCAAAACTGCTACCCTTGACTATCGAATGGCGGGTGTGGCCCGTCATATAGGAAGTTCTGGGCGTGCCTATGCTCCACAT 57 | GGCTTAGAATGGCTAGAGGCGAGGCCGCTCCCTGCATTCGTGGCATTGAGCATCCTGACCTTCTGCGTGCCAATCAGATGCCA 58 | AAGGGCACTCTCCACAGTGCGGCACAAGGGCACGGACCGCTTGTAGAAACGCTACTACTTCATAAAGACGCTCAGCAGTTGCCAGT 59 | TATATGTGAGGACTTCCCCATTGAGACCACCACTGCGACAGCCCAGTCCATCGGTTCCCTCGTTAAGCTGAGTCCACAGACAAGCG 60 | AGCTACCTGCAAAGGGACCAGACGTCAGAGTCTTGGGGTAAATCAAAGGGTTCAATTGGCGTCGACAGAGGCTCCAACCCTAGAAATCCCGGTCGC 61 | AAATATAGTTAGTGAGGGATTACTGCATGTGGATATGGCCCGGTCCGGTTGATCCACCCATCCTCTATGAAGGCCTTCTCT 62 | CCCATTTACGGGTTTGCGACTAGTTGCAGGACTGTATGGATTAAGCTGTCTCGGATCCGTTAATGGATATCCATAAGGACGA 63 | GGGACCAGAGATACCTTGAGTTTTAGCGTGGCTGGTGTTATCTACACTATGGAATAAACACGGATAACTTGGGCTATCATTTCGTCCGGGAACGTAGTAG 64 | 
TTGTACAGTTGTCTCTGCATCCCCACCGCCAGACATTAATGTGCAACGGGTGAAGCGTCAACAACACCATATGCTATACGGGGGAAGT 65 | CCCCTATACGCACGTAACGGCCCTTGATATTGCAGTTCCACTTATGCCTGTTCCGATTGCACATATGTAATTGATCCATAGATCCCTGACAATG 66 | ACTTCATCCGGGGGCGTACTCACCATCTTGAGGATCCAATGGCCGCTAGAGAGACACGTTAGATCAGCTAATGACGGGCCGTGACATTG 67 | ATTTAGCTTGAGGCACCGGTAATGTCGTCCAGACATGATCGGCACGCGATCGGTAATGTATGGCCTTGCGCATCCCCGGATACGCTCGAATCCTGTACC 68 | ACCGACAGCTTTCACTCACGACAGACGACTGAATGTCTCCTTCAAGTCACGCGGCCCGCGTAACGGGAGGTAAGTTCGCTCTACGTCT 69 | CCACGATTTTTCTGTGTGAGGATGGGTCCTGGCGTTCTAATGTAGGAGTCCTAAGGTCGTCATTGCAAACCTAAACACCGAATGACAAGGTGA 70 | AGTTACCTAGACTCAAGAAGTGATCTCCAGTGCGGGGACTCCTCTGGTACTAAGTCAATCCTACACACTAATTTAAACTTGTGCTTAGTCTATTCAC 71 | CCCTGTTCCGGAAACGAAGGGACCCAAAGTAGCCATGAACTATCACCGATTGAGGACGTGTGGTTATTTTCCATCCCGATGTTGGCAT 72 | GGTTCACCCCGGATTTCGACTGGCAAGGTCGTTACGTCGCCATGACTTGAGGGAACCCATCGCAAGGAGTCTCGAGAGAA 73 | CGATCAGGCCCTGCTCCGCCGAATTGCATTGAAATTCTCGAATTCAATAATCTAATTTGGCCGCTACTCCTGATCCGAGTGCTCATCCTCGA 74 | CTGGCACAATCATCAGGGTGAAACGGATTGTCTCTGGCCTACTCAGGTCTACCTGGTACCGGTAGGAAGCGCGGAGCTGAGGAGGATAACTG 75 | TCACCATGGTCGGTCGAGTATTCGGCCTCTATTTACGCGAGGCGTGGCCCTAAATCATACTAGGTGTCTTCATCCTCAGGAGCCGA 76 | CAGCCTGGTGAGTTAAAGTACTCAAATTGTGAGTCGCCGTTTAAATAGTACGCTGGAGCTGATGACCGGACAGCAAGACTG 77 | ACTACTCATCGGTGCGACTAACATTGTTCTAATAGTCCACCAATGGTCAATACTTCAAATGATCTCCCTGGATCCATCCCACT 78 | CATGAATTGGAATACCTGAGTGACTTACGGCTGCCCTTCTCGCCCATTGCGCTCGTTTGCGTTCTCTCTTGTATTGCCGGGGCACCAACA 79 | TGGACTCGTTCCAACTTGCTTTCGGCCGTAGGCTCGAGATCAAATGGGGGAGGCATTAGCCCTATCGCGGGTCCCCTGGTGAGCATCT 80 | ACGCAAGAGTCCTCTCCCGAGTAAGGTGAGCACAGTCTGGTTGAGATTGAACGGTAAGAAAATATGAATCTCGCGGTTCTGC 81 | -------------------------------------------------------------------------------- /downloaded_fasta01.fasta: -------------------------------------------------------------------------------- 1 | >NM_001382809.1 Homo sapiens myoglobin (MB), transcript variant 5, mRNA 2 | GAGCATGTTGGCCTGGTCCTTTGCTAGGTACTGTAGAGCAGGTGAGAGAGTGAGGGGGAAGGACTCCAAA 3 | 
TTAGACCAGTTCTTAGCCATGAAGCAGAGACTCTGAAGCCAGACTACCTGGGTCCCAATCTTGGGCTTGG 4 | TATTTCCTCGCTGTGTGACTCTGGATGGAGGCTCGCTCTGTTGCCAGGCTGGAGTACAGCGATCTCGGCT 5 | CACTGCAACCTCTGCCTCCCGGGTTCAAGTGATTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGACTACAG 6 | ACTGCGCCATGGGGCTCAGCGACGGGGAATGGCAGTTGGTGCTGAACGTCTGGGGGAAGGTGGAGGCTGA 7 | CATCCCAGGCCATGGGCAGGAAGTCCTCATCAGGCTCTTTAAGGGTCACCCAGAGACTCTGGAGAAGTTT 8 | GACAAGTTCAAGCACCTGAAGTCAGAGGACGAGATGAAGGCGTCTGAGGACTTAAAGAAGCATGGTGCCA 9 | CCGTGCTCACCGCCCTGGGTGGCATCCTTAAGAAGAAGGGGCATCATGAGGCAGAGATTAAGCCCCTGGC 10 | ACAGTCGCATGCCACCAAGCACAAGATCCCCGTGAAGTACCTGGAGTTCATCTCGGAATGCATCATCCAG 11 | GTTCTGCAGAGCAAGCATCCCGGGGACTTTGGTGCTGATGCCCAGGGGGCCATGAACAAGGCCCTGGAGC 12 | TGTTCCGGAAGGACATGGCCTCCAACTACAAGGAGCTGGGCTTCCAGGGCTAGGCCCCTGCCGCTCCCAC 13 | CCCCACCCATCTGGGCCCCGGGTTCAAGAGAGAGCGGGGTCTGATCTCGTGTAGCCATATAGAGTTTGCT 14 | TCTGAGTGTCTGCTTTGTTTAGTAGAGGTGGGCAGGAGGAGCTGAGGGGCTGGGGCTGGGGTGTTGAAGT 15 | TGGCTTTGCATGCCCAGCGATGCGCCTCCCTGTGGGATGTCATCACCCTGGGAACCGGGAGTGGCCCTTG 16 | GCTCACTGTGTTCTGCATGGTTTGGATCTGAATTAATTGTCCTTTCTTCTAAATCCCAACCGAACTTCTT 17 | CCAACCTCCAAACTGGCTGTAACCCCAAATCCAAGCCATTAACTACACCTGACAGTAGCAATTGTCTGAT 18 | TAATCACTGGCCCCTTGAAGACAGCAGAATGTCCCTTTGCAATGAGGAGGAGATCTGGGCTGGGCGGGCC 19 | AGCTGGGGAAGCATTTGACTATCTGGAACTTGTGTGTGCCTCCTCAGGTATGGCAGTGACTCACCTGGTT 20 | TTAATAAAACAACCTGCAACATCTCA 21 | 22 | >NM_001382812.1 Homo sapiens myoglobin (MB), transcript variant 8, mRNA 23 | GGCCGCTTGGCTGGAGGCTCTGCGAGGACAGCTGGGGAGAAGGGGAGCTGTGGCTCTTTAAGGGTCACCC 24 | AGAGACTCTGGAGAAGTTTGACAAGTTCAAGCACCTGAAGTCAGAGGACGAGATGAAGGCGTCTGAGGAC 25 | TTAAAGAAGCATGGTGCCACCGTGCTCACCGCCCTGGGTGGCATCCTTAAGAAGAAGGGGCATCATGAGG 26 | CAGAGATTAAGCCCCTGGCACAGTCGCATGCCACCAAGCACAAGATCCCCGTGAAGTACCTGGAGTTCAT 27 | CTCGGAATGCATCATCCAGGTTCTGCAGAGCAAGCATCCCGGGGACTTTGGTGCTGATGCCCAGGGGGCC 28 | ATGAACAAGGCCCTGGAGCTGTTCCGGAAGGACATGGCCTCCAACTACAAGGAGCTGGGCTTCCAGGGCT 29 | AGGCCCCTGCCGCTCCCACCCCCACCCATCTGGGCCCCGGGTTCAAGAGAGAGCGGGGTCTGATCTCGTG 30 | 
TAGCCATATAGAGTTTGCTTCTGAGTGTCTGCTTTGTTTAGTAGAGGTGGGCAGGAGGAGCTGAGGGGCT 31 | GGGGCTGGGGTGTTGAAGTTGGCTTTGCATGCCCAGCGATGCGCCTCCCTGTGGGATGTCATCACCCTGG 32 | GAACCGGGAGTGGCCCTTGGCTCACTGTGTTCTGCATGGTTTGGATCTGAATTAATTGTCCTTTCTTCTA 33 | AATCCCAACCGAACTTCTTCCAACCTCCAAACTGGCTGTAACCCCAAATCCAAGCCATTAACTACACCTG 34 | ACAGTAGCAATTGTCTGATTAATCACTGGCCCCTTGAAGACAGCAGAATGTCCCTTTGCAATGAGGAGGA 35 | GATCTGGGCTGGGCGGGCCAGCTGGGGAAGCATTTGACTATCTGGAACTTGTGTGTGCCTCCTCAGGTAT 36 | GGCAGTGACTCACCTGGTTTTAATAAAACAACCTGCAACATCTCA 37 | 38 | >NM_001382811.1 Homo sapiens myoglobin (MB), transcript variant 7, mRNA 39 | AGAAGGAGGAGGAAGAGAGTAGTGAAGATCGTGGGGAGTCCTGGCTTTGAGGAGCCAGAGTTCTCCTTCT 40 | TGCCACATCTTTGATCTCCCTTGGTGGTGTCTTAAGCCCAGATTTACCAAAGGGAATTGTCAGCTGTCCA 41 | AGGGCTAGCAAATTCCTAGGTCACCTAGATTGGATTTTCTGACCATAAAAACTGTGGGCCAGGTGCACAG 42 | CTGCCTGAGGGGCTCAAACCTGTGCAGACTGCGCCATGGGGCTCAGCGACGGGGAATGGCAGTTGGTGCT 43 | GAACGTCTGGGGGAAGGTGGAGGCTGACATCCCAGGCCATGGGCAGGAAGTCCTCATCAGGCTCTTTAAG 44 | GGTCACCCAGAGACTCTGGAGAAGTTTGACAAGTTCAAGCACCTGAAGTCAGAGGACGAGATGAAGGCGT 45 | CTGAGGACTTAAAGAAGCATGGTGCCACCGTGCTCACCGCCCTGGGTGGCATCCTTAAGAAGAAGGGGCA 46 | TCATGAGGCAGAGATTAAGCCCCTGGCACAGTCGCATGCCACCAAGCACAAGATCCCCGTGAAGTACCTG 47 | GAGTTCATCTCGGAATGCATCATCCAGGTTCTGCAGAGCAAGCATCCCGGGGACTTTGGTGCTGATGCCC 48 | AGGGGGCCATGAACAAGGCCCTGGAGCTGTTCCGGAAGGACATGGCCTCCAACTACAAGGAGCTGGGCTT 49 | CCAGGGCTAGGCCCCTGCCGCTCCCACCCCCACCCATCTGGGCCCCGGGTTCAAGAGAGAGCGGGGTCTG 50 | ATCTCGTGTAGCCATATAGAGTTTGCTTCTGAGTGTCTGCTTTGTTTAGTAGAGGTGGGCAGGAGGAGCT 51 | GAGGGGCTGGGGCTGGGGTGTTGAAGTTGGCTTTGCATGCCCAGCGATGCGCCTCCCTGTGGGATGTCAT 52 | CACCCTGGGAACCGGGAGTGGCCCTTGGCTCACTGTGTTCTGCATGGTTTGGATCTGAATTAATTGTCCT 53 | TTCTTCTAAATCCCAACCGAACTTCTTCCAACCTCCAAACTGGCTGTAACCCCAAATCCAAGCCATTAAC 54 | TACACCTGACAGTAGCAATTGTCTGATTAATCACTGGCCCCTTGAAGACAGCAGAATGTCCCTTTGCAAT 55 | GAGGAGGAGATCTGGGCTGGGCGGGCCAGCTGGGGAAGCATTTGACTATCTGGAACTTGTGTGTGCCTCC 56 | TCAGGTATGGCAGTGACTCACCTGGTTTTAATAAAACAACCTGCAACATCTCA 57 | 58 | >NM_001382813.1 Homo sapiens myoglobin (MB), 
transcript variant 9, mRNA 59 | GAGCATGTTGGCCTGGTCCTTTGCTAGGTACTGTAGAGCAGGTGAGAGAGTGAGGGGGAAGGACTCCAAA 60 | TTAGACCAGTTCTTAGCCATGAAGCAGAGACTCTGAAGCCAGACTACCTGGGTCCCAATCTTGGGCTTGG 61 | TATTTCCTCGCTGTGTGACTCTGGGCTCTTTAAGGGTCACCCAGAGACTCTGGAGAAGTTTGACAAGTTC 62 | AAGCACCTGAAGTCAGAGGACGAGATGAAGGCGTCTGAGGACTTAAAGAAGCATGGTGCCACCGTGCTCA 63 | CCGCCCTGGGTGGCATCCTTAAGAAGAAGGGGCATCATGAGGCAGAGATTAAGCCCCTGGCACAGTCGCA 64 | TGCCACCAAGCACAAGATCCCCGTGAAGTACCTGGAGTTCATCTCGGAATGCATCATCCAGGTTCTGCAG 65 | AGCAAGCATCCCGGGGACTTTGGTGCTGATGCCCAGGGGGCCATGAACAAGGCCCTGGAGCTGTTCCGGA 66 | AGGACATGGCCTCCAACTACAAGGAGCTGGGCTTCCAGGGCTAGGCCCCTGCCGCTCCCACCCCCACCCA 67 | TCTGGGCCCCGGGTTCAAGAGAGAGCGGGGTCTGATCTCGTGTAGCCATATAGAGTTTGCTTCTGAGTGT 68 | CTGCTTTGTTTAGTAGAGGTGGGCAGGAGGAGCTGAGGGGCTGGGGCTGGGGTGTTGAAGTTGGCTTTGC 69 | ATGCCCAGCGATGCGCCTCCCTGTGGGATGTCATCACCCTGGGAACCGGGAGTGGCCCTTGGCTCACTGT 70 | GTTCTGCATGGTTTGGATCTGAATTAATTGTCCTTTCTTCTAAATCCCAACCGAACTTCTTCCAACCTCC 71 | AAACTGGCTGTAACCCCAAATCCAAGCCATTAACTACACCTGACAGTAGCAATTGTCTGATTAATCACTG 72 | GCCCCTTGAAGACAGCAGAATGTCCCTTTGCAATGAGGAGGAGATCTGGGCTGGGCGGGCCAGCTGGGGA 73 | AGCATTTGACTATCTGGAACTTGTGTGTGCCTCCTCAGGTATGGCAGTGACTCACCTGGTTTTAATAAAA 74 | CAACCTGCAACATCTCA 75 | 76 | >NM_001382810.1 Homo sapiens myoglobin (MB), transcript variant 6, mRNA 77 | GGCCGCTTGGCTGGAGGCTCTGCGAGGACAGCTGGGGAGAAGGGGAGCTGTGATGGAGGCTCGCTCTGTT 78 | GCCAGGCTGGAGTACAGCGATCTCGGCTCACTGCAACCTCTGCCTCCCGGGTTCAAGTGATTCTCCTGCC 79 | TCAGCCTCCCAAGTAGCTGGGACTACAGACTGCGCCATGGGGCTCAGCGACGGGGAATGGCAGTTGGTGC 80 | TGAACGTCTGGGGGAAGGTGGAGGCTGACATCCCAGGCCATGGGCAGGAAGTCCTCATCAGGCTCTTTAA 81 | GGGTCACCCAGAGACTCTGGAGAAGTTTGACAAGTTCAAGCACCTGAAGTCAGAGGACGAGATGAAGGCG 82 | TCTGAGGACTTAAAGAAGCATGGTGCCACCGTGCTCACCGCCCTGGGTGGCATCCTTAAGAAGAAGGGGC 83 | ATCATGAGGCAGAGATTAAGCCCCTGGCACAGTCGCATGCCACCAAGCACAAGATCCCCGTGAAGTACCT 84 | GGAGTTCATCTCGGAATGCATCATCCAGGTTCTGCAGAGCAAGCATCCCGGGGACTTTGGTGCTGATGCC 85 | CAGGGGGCCATGAACAAGGCCCTGGAGCTGTTCCGGAAGGACATGGCCTCCAACTACAAGGAGCTGGGCT 86 | 
TCCAGGGCTAGGCCCCTGCCGCTCCCACCCCCACCCATCTGGGCCCCGGGTTCAAGAGAGAGCGGGGTCT 87 | GATCTCGTGTAGCCATATAGAGTTTGCTTCTGAGTGTCTGCTTTGTTTAGTAGAGGTGGGCAGGAGGAGC 88 | TGAGGGGCTGGGGCTGGGGTGTTGAAGTTGGCTTTGCATGCCCAGCGATGCGCCTCCCTGTGGGATGTCA 89 | TCACCCTGGGAACCGGGAGTGGCCCTTGGCTCACTGTGTTCTGCATGGTTTGGATCTGAATTAATTGTCC 90 | TTTCTTCTAAATCCCAACCGAACTTCTTCCAACCTCCAAACTGGCTGTAACCCCAAATCCAAGCCATTAA 91 | CTACACCTGACAGTAGCAATTGTCTGATTAATCACTGGCCCCTTGAAGACAGCAGAATGTCCCTTTGCAA 92 | TGAGGAGGAGATCTGGGCTGGGCGGGCCAGCTGGGGAAGCATTTGACTATCTGGAACTTGTGTGTGCCTC 93 | CTCAGGTATGGCAGTGACTCACCTGGTTTTAATAAAACAACCTGCAACATCTCA 94 | 95 | >NM_001362846.2 Homo sapiens myoglobin (MB), transcript variant 4, mRNA 96 | GGCCGCTTGGCTGGAGGCTCTGCGAGGACAGCTGGGGAGAAGGGGAGCTGTGACTGCGCCATGGGGCTCA 97 | GCGACGGGGAATGGCAGTTGGTGCTGAACGTCTGGGGGAAGGTGGAGGCTGACATCCCAGGCCATGGGCA 98 | GGAAGTCCTCATCAGGCTCTTTAAGGGTCACCCAGAGACTCTGGAGAAGTTTGACAAGTTCAAGCACCTG 99 | AAGTCAGAGGACGAGATGAAGGCGTCTGAGGACTTAAAGAAGCATGGTGCCACCGTGCTCACCGCCCTGG 100 | GTGGCATCCTTAAGAAGAAGGGGCATCATGAGGCAGAGATTAAGCCCCTGGCACAGTCGCATGCCACCAA 101 | GCACAAGATCCCCGTGAAGTACCTGGAGTTCATCTCGGAATGCATCATCCAGGTTCTGCAGAGCAAGCAT 102 | CCCGGGGACTTTGGTGCTGATGCCCAGGGGGCCATGAACAAGGCCCTGGAGCTGTTCCGGAAGGACATGG 103 | CCTCCAACTACAAGGAGCTGGGCTTCCAGGGCTAGGCCCCTGCCGCTCCCACCCCCACCCATCTGGGCCC 104 | CGGGTTCAAGAGAGAGCGGGGTCTGATCTCGTGTAGCCATATAGAGTTTGCTTCTGAGTGTCTGCTTTGT 105 | TTAGTAGAGGTGGGCAGGAGGAGCTGAGGGGCTGGGGCTGGGGTGTTGAAGTTGGCTTTGCATGCCCAGC 106 | GATGCGCCTCCCTGTGGGATGTCATCACCCTGGGAACCGGGAGTGGCCCTTGGCTCACTGTGTTCTGCAT 107 | GGTTTGGATCTGAATTAATTGTCCTTTCTTCTAAATCCCAACCGAACTTCTTCCAACCTCCAAACTGGCT 108 | GTAACCCCAAATCCAAGCCATTAACTACACCTGACAGTAGCAATTGTCTGATTAATCACTGGCCCCTTGA 109 | AGACAGCAGAATGTCCCTTTGCAATGAGGAGGAGATCTGGGCTGGGCGGGCCAGCTGGGGAAGCATTTGA 110 | CTATCTGGAACTTGTGTGTGCCTCCTCAGGTATGGCAGTGACTCACCTGGTTTTAATAAAACAACCTGCA 111 | ACATCTCA 112 | 113 | >NM_005368.3 Homo sapiens myoglobin (MB), transcript variant 1, mRNA 114 | AAACCCCAGCTGTTGGGGCCAGGACACCCAGTGAGCCCATACTTGCTCTTTTTGTCTTCTTCAGACTGCG 115 | 
CCATGGGGCTCAGCGACGGGGAATGGCAGTTGGTGCTGAACGTCTGGGGGAAGGTGGAGGCTGACATCCC 116 | AGGCCATGGGCAGGAAGTCCTCATCAGGCTCTTTAAGGGTCACCCAGAGACTCTGGAGAAGTTTGACAAG 117 | TTCAAGCACCTGAAGTCAGAGGACGAGATGAAGGCGTCTGAGGACTTAAAGAAGCATGGTGCCACCGTGC 118 | TCACCGCCCTGGGTGGCATCCTTAAGAAGAAGGGGCATCATGAGGCAGAGATTAAGCCCCTGGCACAGTC 119 | GCATGCCACCAAGCACAAGATCCCCGTGAAGTACCTGGAGTTCATCTCGGAATGCATCATCCAGGTTCTG 120 | CAGAGCAAGCATCCCGGGGACTTTGGTGCTGATGCCCAGGGGGCCATGAACAAGGCCCTGGAGCTGTTCC 121 | GGAAGGACATGGCCTCCAACTACAAGGAGCTGGGCTTCCAGGGCTAGGCCCCTGCCGCTCCCACCCCCAC 122 | CCATCTGGGCCCCGGGTTCAAGAGAGAGCGGGGTCTGATCTCGTGTAGCCATATAGAGTTTGCTTCTGAG 123 | TGTCTGCTTTGTTTAGTAGAGGTGGGCAGGAGGAGCTGAGGGGCTGGGGCTGGGGTGTTGAAGTTGGCTT 124 | TGCATGCCCAGCGATGCGCCTCCCTGTGGGATGTCATCACCCTGGGAACCGGGAGTGGCCCTTGGCTCAC 125 | TGTGTTCTGCATGGTTTGGATCTGAATTAATTGTCCTTTCTTCTAAATCCCAACCGAACTTCTTCCAACC 126 | TCCAAACTGGCTGTAACCCCAAATCCAAGCCATTAACTACACCTGACAGTAGCAATTGTCTGATTAATCA 127 | CTGGCCCCTTGAAGACAGCAGAATGTCCCTTTGCAATGAGGAGGAGATCTGGGCTGGGCGGGCCAGCTGG 128 | GGAAGCATTTGACTATCTGGAACTTGTGTGTGCCTCCTCAGGTATGGCAGTGACTCACCTGGTTTTAATA 129 | AAACAACCTGCAACATCTCA 130 | 131 | >NM_203378.1 Homo sapiens myoglobin (MB), transcript variant 3, mRNA 132 | AATGGCACCTGCCCTAAAATAGCTTCCCATGTGAGGGCTAGAGAAAGGAAAAGATTAGACCCTCCCTGGA 133 | TGAGAGAGAGAAAGTGAAGGAGGGCAGGGGAGGGGGACAGCGAGCCATTGAGCGATCTTTGTCAAGCATC 134 | CCAGAAGACTGCGCCATGGGGCTCAGCGACGGGGAATGGCAGTTGGTGCTGAACGTCTGGGGGAAGGTGG 135 | AGGCTGACATCCCAGGCCATGGGCAGGAAGTCCTCATCAGGCTCTTTAAGGGTCACCCAGAGACTCTGGA 136 | GAAGTTTGACAAGTTCAAGCACCTGAAGTCAGAGGACGAGATGAAGGCGTCTGAGGACTTAAAGAAGCAT 137 | GGTGCCACCGTGCTCACCGCCCTGGGTGGCATCCTTAAGAAGAAGGGGCATCATGAGGCAGAGATTAAGC 138 | CCCTGGCACAGTCGCATGCCACCAAGCACAAGATCCCCGTGAAGTACCTGGAGTTCATCTCGGAATGCAT 139 | CATCCAGGTTCTGCAGAGCAAGCATCCCGGGGACTTTGGTGCTGATGCCCAGGGGGCCATGAACAAGGCC 140 | CTGGAGCTGTTCCGGAAGGACATGGCCTCCAACTACAAGGAGCTGGGCTTCCAGGGCTAGGCCCCTGCCG 141 | CTCCCACCCCCACCCATCTGGGCCCCGGGTTCAAGAGAGAGCGGGGTCTGATCTCGTGTAGCCATATAGA 142 | 
GTTTGCTTCTGAGTGTCTGCTTTGTTTAGTAGAGGTGGGCAGGAGGAGCTGAGGGGCTGGGGCTGGGGTG 143 | TTGAAGTTGGCTTTGCATGCCCAGCGATGCGCCTCCCTGTGGGATGTCATCACCCTGGGAACCGGGAGTG 144 | GCCCTTGGCTCACTGTGTTCTGCATGGTTTGGATCTGAATTAATTGTCCTTTCTTCTAAATCCCAACCGA 145 | ACTTCTTCCAACCTCCAAACTGGCTGTAACCCCAAATCCAAGCCATTAACTACACCTGACAGTAGCAATT 146 | GTCTGATTAATCACTGGCCCCTTGAAGACAGCAGAATGTCCCTTTGCAATGAGGAGGAGATCTGGGCTGG 147 | GCGGGCCAGCTGGGGAAGCATTTGACTATCTGGAACTTGTGTGTGCCTCCTCAGGTATGGCAGTGACTCA 148 | CCTGGTTTTAATAAAACAACCTGCAACATCTCA 149 | 150 | >NM_203377.1 Homo sapiens myoglobin (MB), transcript variant 2, mRNA 151 | GAGCATGTTGGCCTGGTCCTTTGCTAGGTACTGTAGAGCAGGTGAGAGAGTGAGGGGGAAGGACTCCAAA 152 | TTAGACCAGTTCTTAGCCATGAAGCAGAGACTCTGAAGCCAGACTACCTGGGTCCCAATCTTGGGCTTGG 153 | TATTTCCTCGCTGTGTGACTCTGGACTGCGCCATGGGGCTCAGCGACGGGGAATGGCAGTTGGTGCTGAA 154 | CGTCTGGGGGAAGGTGGAGGCTGACATCCCAGGCCATGGGCAGGAAGTCCTCATCAGGCTCTTTAAGGGT 155 | CACCCAGAGACTCTGGAGAAGTTTGACAAGTTCAAGCACCTGAAGTCAGAGGACGAGATGAAGGCGTCTG 156 | AGGACTTAAAGAAGCATGGTGCCACCGTGCTCACCGCCCTGGGTGGCATCCTTAAGAAGAAGGGGCATCA 157 | TGAGGCAGAGATTAAGCCCCTGGCACAGTCGCATGCCACCAAGCACAAGATCCCCGTGAAGTACCTGGAG 158 | TTCATCTCGGAATGCATCATCCAGGTTCTGCAGAGCAAGCATCCCGGGGACTTTGGTGCTGATGCCCAGG 159 | GGGCCATGAACAAGGCCCTGGAGCTGTTCCGGAAGGACATGGCCTCCAACTACAAGGAGCTGGGCTTCCA 160 | GGGCTAGGCCCCTGCCGCTCCCACCCCCACCCATCTGGGCCCCGGGTTCAAGAGAGAGCGGGGTCTGATC 161 | TCGTGTAGCCATATAGAGTTTGCTTCTGAGTGTCTGCTTTGTTTAGTAGAGGTGGGCAGGAGGAGCTGAG 162 | GGGCTGGGGCTGGGGTGTTGAAGTTGGCTTTGCATGCCCAGCGATGCGCCTCCCTGTGGGATGTCATCAC 163 | CCTGGGAACCGGGAGTGGCCCTTGGCTCACTGTGTTCTGCATGGTTTGGATCTGAATTAATTGTCCTTTC 164 | TTCTAAATCCCAACCGAACTTCTTCCAACCTCCAAACTGGCTGTAACCCCAAATCCAAGCCATTAACTAC 165 | ACCTGACAGTAGCAATTGTCTGATTAATCACTGGCCCCTTGAAGACAGCAGAATGTCCCTTTGCAATGAG 166 | GAGGAGATCTGGGCTGGGCGGGCCAGCTGGGGAAGCATTTGACTATCTGGAACTTGTGTGTGCCTCCTCA 167 | GGTATGGCAGTGACTCACCTGGTTTTAATAAAACAACCTGCAACATCTCA 168 | 169 | >NM_001171660.2 Homo sapiens cytochrome b5 reductase 3 (CYB5R3), transcript variant 5, mRNA 170 | 
ACTTGCCCCAGGGAGTTTCCAGAAGGGACCAGGGGCTGTGAAAAGTCTACCAAGGGCCTGGTTAGCTAGG 171 | GAAGGGCTCATCAGGGAAGATTTGATGTTTGAGCTAGGCTCTGAGGAATGAATAGGAGTTTGCTGGTTGG 172 | ATGCATGCAGAGCAAGGACATTTGGGGCAGAGAGGAGAGCATATGCGAACGCCTGAAGCAAGATGGGCTT 173 | GATGTTGAAAGAGCAGAGAGCTGGGAGTTGGGCCATATGGTGCTCTTCCCAGTCTGGTTCCTGTACAGTC 174 | TGCTCATGAAGCTGTTCCAGCGCTCCACGCCAGCCATCACCCTCGAGAGCCCGGACATCAAGTACCCGCT 175 | GCGGCTCATCGACCGGGAGATCATCAGCCATGACACCCGGCGCTTCCGCTTTGCCCTGCCGTCACCCCAG 176 | CACATCCTGGGCCTCCCTGTCGGCCAGCACATCTACCTCTCGGCTCGAATTGATGGAAACCTGGTCGTCC 177 | GGCCCTATACACCCATCTCCAGCGATGATGACAAGGGCTTCGTGGACCTGGTCATCAAGGTTTACTTCAA 178 | GGACACCCATCCCAAGTTTCCCGCTGGAGGGAAGATGTCTCAGTACCTGGAGAGCATGCAGATTGGAGAC 179 | ACCATTGAGTTCCGGGGCCCCAGTGGGCTGCTGGTCTACCAGGGCAAAGGGAAGTTCGCCATCCGACCTG 180 | ACAAAAAGTCCAACCCTATCATCAGGACAGTGAAGTCTGTGGGCATGATCGCGGGAGGGACAGGCATCAC 181 | CCCGATGCTGCAGGTGATCCGCGCCATCATGAAGGACCCTGATGACCACACTGTGTGCCACCTGCTCTTT 182 | GCCAACCAGACCGAGAAGGACATCCTGCTGCGACCTGAGCTGGAGGAACTCAGGAACAAACATTCTGCAC 183 | GCTTCAAGCTCTGGTACACGCTGGACAGAGCCCCTGAAGCCTGGGACTACGGCCAGGGCTTCGTGAATGA 184 | GGAGATGATCCGGGACCACCTTCCACCCCCAGAGGAGGAGCCGCTGGTGCTGATGTGTGGCCCCCCACCC 185 | ATGATCCAGTACGCCTGCCTTCCCAACCTGGACCACGTGGGCCACCCCACGGAGCGCTGCTTCGTCTTCT 186 | GAGGGCCGGGCACGGTCACACGGCCACCCGCCCCGCGCACCCCACGCCCTGTTCACGCTCACCCAGTCAC 187 | CTCCCCACATCGCACACTGGGGCCCCGGGTTCAGCCTGGCCTGCCCGTGCCCTGGTGAATCACCTGGCTG 188 | AGCAGTTCCCCTGGAGCCCCTTCGGGAGCAGGGCTGTGTCCCAGATGGGCCACGGCTGAGCCTTCAGAGT 189 | ACGTCCTGCCTGGCACTTACTGGTCCTTACCAGAGACGCCCAGCCCCATCCCTGTCCTCATGACCCCTCG 190 | TCCACCCCCCACACACACTATAAGGCTGAGGGCTGCCAGCAGCCCCGTCTGCCCACCATTCCCGGCCGTG 191 | GACCATAGTCGGGATGTCAGCAGACACACATGGGCAGCCCAAAGCTGCAGGTGCCAGGGCCCACCCCAGC 192 | CTCGCCTGTCACCCCCACTCCCGCCTCAGGGCCAGGCCCAGGCCTCACCACCTGACGCTGCATGAGACAT 193 | TGACACCAGAAAGCCCTCTTGGGGGCACTGCTCCCTACCCCAGGGCCCTGGCCAGCCGGGAGCTTGGCTC 194 | TCCTCTGGCTAGAGTGGGAAGAGGGGGCTGGCCATGGGGCCCTCCCAGAACCTCAGCATTTCCTTCCAGC 195 | CCATCCAAACACTGAGGCAGCCTTGGGGAACCCCGAGCTGGGGGGTTGGCAGCCCACTGCACCGCCTCAG 196 
| GGTTTTGGGGTCCTGGGCTGGGGCCACCATCCCTGATGGCAGAACTCCCACAACCACATGTATTTATTCC 197 | TCTGTCCTAAACCGTCCCCTCCTTCCCTCACCCCCAGCACAGGGGGATTCTGAGCAGTGCCTCTTGTCTG 198 | AGGGACATATCAGTGACCTCGACGTTGCCTTTAGACTACAGTTGTGTTAGCCTCTTGCGTATTGGCTTTT 199 | TCAGAGTCATTTATGAGCAGAAAAAAAAAAAGTAAAACTTTGCTAATATTAACCCTTCTCTAGCTCCTCG 200 | AGGGTCTGTGACCTGCAACACAAGGGGTGGGGTCAGGAAAGGGCTGGGGAAGACCTAGCATTTTTTTTTT 201 | CTTTTTTTTTTTTTTTTGAGACGGAGTCTCCCTTTGTCACCCAGGCTGGAGTGGCATGATCTCAGCTCAC 202 | TACAACCTCCACCTCTCGGGTTCAAGCGATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTAAAAGTG 203 | CCCACCACCACACCCAGCTAGTTTTTGTATTTTTTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCG 204 | CCCAGGCTGGAGTGCAGTGGCGGGATCTCGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTC 205 | CTGCCTCAGCCTCCCAAGTAGCTGGGACTACAGGCGCCCGCCACTACGCCCGGCTAATTTTTTGTATTTT 206 | TAGTAGAGACGGGGTTTCACCGTTTTAGCCGGGATGGTCTCGATCTCCTGACCTCGTGATCCGCCCGCCT 207 | CGGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACTGCGCCCGGCTCCAGCATTTATTTCTGATGTA 208 | TCTTTGTGGTAGAAAATTTGGAAAGTGCAGAGAAGTATACACAGGAAGAAAAATTCCCAACCCCCAGAGG 209 | CAAACCAGCTGAAACCACGCAACCCCAGTCACCCCAATGCACCGCGAGGCTGCTGCCTCCTGTCAGGGTC 210 | AGATGAGCCTCGAGGCTCAGGAAAGTCAGAGGATGCCATCTGCATGGTGGTAAATTACAGAGGTGATGAG 211 | GCAAGGTGGGTGTGGGGCTGTTCTTAAAACGGGGCAGCAGGAAGGCCCCAAGGAGATGGATTTGGGCTGG 212 | GACGGGAAGAGAGAGCTGGCCATGCTGGGGTGGGTGGGTGTTCAAATGGTGGAAACAGCAGACGCAAAGG 213 | CCCTGCCGTTGGAACCAGCTTGTGGAATAAACTTTCAGAAACAGA 214 | 215 | -------------------------------------------------------------------------------- /extract_identical_headers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 16, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Output fasta files have been successfully created\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "# EXTRACTING FASTA SEQUECNES WITH IDENTICAL HEADERS HAVING DIFFERENT LENGTHS\n", 18 | "\n", 19 | "# The extracted sequences will be stored in 
def main(input_fasta):
    """Extract all records whose header occurs more than once and write them out.

    The input fasta file is parsed twice: the first pass finds the headers
    that are repeated, the second pass collects the records carrying those
    headers. Each group of records is then indexed and written to its own
    ``out_file_<n>.fasta`` file.

    Args:
        input_fasta: name/path of the input fasta file.
    """
    from Bio import SeqIO

    # First pass: only the repeated headers are kept (as keys of the dict).
    # The parser is lazy, so it must be consumed while the handle is open.
    with open(input_fasta, "r") as fr:
        identical_headers = getting_identical_header(SeqIO.parse(fr, "fasta"))

    # Second pass: attach the complete records to their repeated header.
    with open(input_fasta, "r") as fr:
        identical_headers = getting_identical_header_records(
            SeqIO.parse(fr, "fasta"), identical_headers
        )

    # Adding an index to the headers before writing the output files
    identical_headers = modifying_headers(identical_headers)

    writing_identical_header_records(identical_headers)

    print("Output fasta files have been successfully created")


def modifying_headers(identical_headers):
    """Append ``_<index>`` to the description of every grouped record.

    The index restarts at 0 for each header. Records are modified in place
    and the same dictionary is returned.
    """
    for records in identical_headers.values():
        for index, record in enumerate(records):
            record.description = record.description + "_" + str(index)

    return identical_headers


def writing_identical_header_records(identical_headers):
    """Write each group of records to ``out_file_<n>.fasta``.

    ``n`` is the position of the header in the dictionary's iteration order.
    """
    from Bio import SeqIO

    for file_index, records in enumerate(identical_headers.values()):
        out_file_name = "out_file_" + str(file_index) + ".fasta"
        # 'with' guarantees the handle is closed even if writing fails
        with open(out_file_name, "w") as fw:
            SeqIO.write(records, fw, "fasta")


def getting_identical_header_records(records, identical_headers):
    """Append every record whose description is a key of ``identical_headers``.

    Args:
        records: iterable of records exposing a ``description`` attribute.
        identical_headers: dict mapping a repeated header to a list.

    Returns:
        The same dictionary, with the matching records appended to its lists.
    """
    for record in records:
        header = record.description
        if header in identical_headers:
            identical_headers[header].append(record)

    return identical_headers


def getting_identical_header(records):
    """Return a dict with one empty list per header seen more than once.

    Args:
        records: iterable of records exposing a ``description`` attribute.
    """
    # Counting how many times each header occurs
    header_counts = {}
    for record in records:
        header = record.description
        header_counts[header] = header_counts.get(header, 0) + 1

    # Keeping only the headers that occur more than once
    return {header: [] for header, count in header_counts.items() if count > 1}


if __name__ == "__main__":
    # Enter the name/path of your input fasta file here
    input_fasta = "input_fasta.fasta"

    # Calling main function
    main(input_fasta)
# Extracting all the gene ids from the genbank file of a genome using Biopython
# Writing the extracted gene ids to a output file 'gene_ids.txt'


def extract_gene_ids(features):
    """Return the id of every feature whose type is 'gene'.

    The id is the text between the first and second ':' of the feature's
    first 'db_xref' qualifier. Gene features that carry no 'db_xref'
    qualifier are skipped instead of raising KeyError.

    Args:
        features: iterable of objects exposing ``type`` and ``qualifiers``.

    Returns:
        List of gene id strings, in the order the features were given.
    """
    gene_id_list = []
    for feature in features:
        if feature.type != 'gene':
            continue
        xrefs = feature.qualifiers.get('db_xref')
        if not xrefs:
            # no cross reference available for this gene, nothing to extract
            continue
        # NOTE(review): assumes the first db_xref is the gene id entry -- confirm
        gene_id_list.append(xrefs[0].split(':')[1])
    return gene_id_list


if __name__ == '__main__':
    from Bio import SeqIO

    input_file = 'sample_genome.gb'
    output_file = 'gene_ids.txt'
    record = SeqIO.read(input_file, 'genbank')
    gene_id_list = extract_gene_ids(record.features)

    # writing each gene id to output file 'gene_ids.txt'
    print(f'Extracted {len(gene_id_list)} gene ids from the {input_file}')
    with open(output_file, 'w') as fw:
        fw.writelines(gene_id + '\n' for gene_id in gene_id_list)
    print(f'Output file {output_file} successfully created')
# CONVERTING FASTQ(only phred scores) TO FASTA

# Input: Fastq file
# Output: Fasta file containing only the phred scores(qualities) from the fastq file


def fastq_quals_to_fasta(input_file, output_file):
    """Write the header and quality line of every fastq record to a fasta file.

    The fastq file is read four lines at a time (header, read, separator,
    qualities). Only the leading '@' of the header is turned into '>', so an
    '@' appearing later in the header text is left untouched.

    Args:
        input_file: name/path of the input fastq file.
        output_file: name/path of the output fasta file (overwritten).
    """
    # 'with' closes both handles even if reading or writing fails
    with open(input_file, "r") as fr, open(output_file, "w") as fw:
        while True:
            header = fr.readline()
            if len(header) == 0:  # end of file reached
                break
            fr.readline()  # the read itself, not needed here
            fr.readline()  # the separator line
            qual = fr.readline()
            if header.startswith("@"):
                header = ">" + header[1:]
            fw.write(header)
            fw.write(qual)


if __name__ == "__main__":
    # Enter the name/path of input fastq file
    input_file = "sample.fastq"

    # Enter the name/path of output fasta file
    output_file = "fasta_converted_2.fasta"

    fastq_quals_to_fasta(input_file, output_file)
    # f-string so that the real output file name is reported
    print(f"Fasta file '{output_file}' has been successfully created")
"python", 49 | "name": "python3" 50 | }, 51 | "language_info": { 52 | "codemirror_mode": { 53 | "name": "ipython", 54 | "version": 3 55 | }, 56 | "file_extension": ".py", 57 | "mimetype": "text/x-python", 58 | "name": "python", 59 | "nbconvert_exporter": "python", 60 | "pygments_lexer": "ipython3", 61 | "version": "3.6.4" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 2 66 | } 67 | -------------------------------------------------------------------------------- /file1.fasta: -------------------------------------------------------------------------------- 1 | >NC_000024.10:c2787682-2786855 Homo sapiens chromosome Y, GRCh38.p13 Primary Assembly 2 | AGAAGTGAGTTTTGGATAGTAAAATAAGTTTCGAACTCTGGCACCTTTCAATTTTGTCGCACTCTCCTTG 3 | TTTTTGACAATGCAATCATATGCTTCTGCTATGTTAAGCGTATTCAACAGCGATGATTACAGTCCAGCTG 4 | TGCAAGAGAATATTCCCGCTCTCCGGAGAAGCTCTTCCTTCCTTTGCACTGAAAGCTGTAACTCTAAGTA 5 | TCAGTGTGAAACGGGAGAAAACAGTAAAGGCAACGTCCAGGATAGAGTGAAGCGACCCATGAACGCATTC 6 | ATCGTGTGGTCTCGCGATCAGAGGCGCAAGATGGCTCTAGAGAATCCCAGAATGCGAAACTCAGAGATCA 7 | GCAAGCAGCTGGGATACCAGTGGAAAATGCTTACTGAAGCCGAAAAATGGCCATTCTTCCAGGAGGCACA 8 | GAAATTACAGGCCATGCACAGAGAGAAATACCCGAATTATAAGTATCGACCTCGTCGGAAGGCGAAGATG 9 | CTGCCGAAGAATTGCAGTTTGCTTCCCGCAGATCCCGCTTCGGTACTCTGCAGCGAAGTGCAACTGGACA 10 | ACAGGTTGTACAGGGATGACTGTACGAAAGCCACACACTCAAGAATGGAGCACCAGCTAGGCCACTTACC 11 | GCCCATCAACGCAGCCAGCTCACCGCAGCAACGGGACCGCTACAGCCACTGGACAAAGCTGTAGGACAAT 12 | CGGGTAACATTGGCTACAAAGACCTACCTAGATGCTCCTTTTTACGATAACTTACAGCCCTCACTTTCTT 13 | ATGTTTAGTTTCAATATTGTTTTCTTTTCTCTGGCTAATAAAGGCCTTATTCATTTCA 14 | 15 | -------------------------------------------------------------------------------- /file2.fasta: -------------------------------------------------------------------------------- 1 | >NC_000087.7:c2663658-2662471 Mus musculus strain C57BL/6J chromosome Y, GRCm38.p6 C57BL/6J 2 | ATGGAGGGCCATGTCAAGCGCCCCATGAATGCATTTATGGTGTGGTCCCGTGGTGAGAGGCACAAGTTGG 3 | CCCAGCAGAATCCCAGCATGCAAAATACAGAGATCAGCAAGCAGCTGGGATGCAGGTGGAAAAGCCTTAC 4 | 
AGAAGCCGAAAAAAGGCCCTTTTTCCAGGAGGCACAGAGATTGAAGATCCTACACAGAGAGAAATACCCA 5 | AACTATAAATATCAGCCTCATCGGAGGGCTAAAGTGTCACAGAGGAGTGGCATTTTACAGCCTGCAGTTG 6 | CCTCAACAAAACTGTACAACCTTCTGCAGTGGGACAGGAACCCACATGCCATCACATACAGGCAAGACTG 7 | GAGTAGAGCTGCACACCTGTACTCCAAAAACCAGCAAAGCTTTTATTGGCAGCCTGTTGATATCCCCACT 8 | GGGCACCTGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGTTCCATAACCACCACCAGCAGCAACAGC 9 | AGTTCTATGACCACCACCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGTTCCATGACCACCACCA 10 | GCAGAAGCAGCAGTTTCATGACCACCACCAGCAGCAACAGCAGTTCCATGACCACCACCACCACCACCAG 11 | GAGCAGCAGTTCCATGACCACCACCAGCAGCAACAGCAGTTCCATGACCACCAGCAGCAGCAGCAGCAGC 12 | AGCAGCAGCAGCAGTTCCATGACCACCACCAGCAGAAGCAGCAGTTCCATGACCACCACCACCACCAACA 13 | GCAGCAGCAGTTCCATGACCACCAGCAGCAGCAGCAGCAGTTCCATGACCACCAGCAGCAGCAGCATCAG 14 | TTCCATGACCACCCCCAGCAGAAGCAGCAGTTCCATGACCACCCCCAGCAGCAACAGCAGTTCCATGACC 15 | ACCACCACCAGCAGCAGCAGAAGCAGCAGTTCCATGACCACCACCAGCAGAAGCAGCAGTTCCATGACCA 16 | CCACCAGCAGAAGCAGCAGTTCCATGACCACCACCAGCAGCAACAGCAGTTCCATGACCACCACCAGCAG 17 | CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGTTCCACGACCAGCAGCTTACCTACTTACTAACAGCTG 18 | ACATCACTGGTGAGCATACACCATACCAGGAGCACCTCAGCACAGCCCTGTGGTTGGCAGTCTCATGA 19 | 20 | -------------------------------------------------------------------------------- /file3.fasta: -------------------------------------------------------------------------------- 1 | >NC_024475.1:327176-327685 Rattus norvegicus strain mixed chromosome Y, Rnor_6.0 2 | ATGGAGGGCCATGTCAAGCGCCCCATGAATGCATTTATGGTGTGGTCCCGTGGAGAGAGGCGCAAGTTGG 3 | CTCAACAGAATCCCAGCATGCAGAATTCAGAGATCAGCAAGCATCTGGGATATCAGTGGAAAAGCCTTAC 4 | AGAAGCCGAAAAAAGGCCCTTTTTCCAGGAGGCGCAGAGACTGAAGACCCTACACAGAGAGAAATATCCA 5 | AACTATAAATATCAGCCTCATCGAAGGGTTAAAGTGCCACAGAGGAGTTATACTTTGCAGCGTGAAGTTG 6 | CCTCAACAAAACTGTACAACCTGCTGCAATGGGACAACAACCTACACACTATCATATACGGACAGGACTG 7 | GGCTAGAGCTGCACACCAGTCCTCCAAGAACCAGAAAAGCATTTATTTACAGCCTGTGGACATCCCCACT 8 | GGATACCCACTACAGCAGAAACAGCAGCACCAGCAGCAGCAGCACGTGCACCTGCAGCAGCAGCAGCAGC 9 | AGCAGCACCAGTTCCACTAG 10 | 11 | 
# function findGC_pos for finding GC content in reads of fastq file by position


def findGC_pos(reads, read_length=127):
    """Return the fraction of G/C bases observed at each read position.

    Args:
        reads: iterable of read strings.
        read_length: minimum number of positions to report. Defaults to 127,
            the read length of the fastq file this script was written for.
            The result grows automatically when a read is longer than this.

    Returns:
        List with one entry per position: (#G + #C) / (#bases seen) at that
        position, or 0 for positions that no read covers. Only upper case
        "G" and "C" are counted.
    """
    GC = [0] * read_length
    Total = [0] * read_length
    for read in reads:
        # grow the counters instead of failing on reads longer than expected
        shortfall = len(read) - len(Total)
        if shortfall > 0:
            GC.extend([0] * shortfall)
            Total.extend([0] * shortfall)
        for i, base in enumerate(read):
            if base == "C" or base == "G":
                GC[i] += 1
            Total[i] += 1
    # average over every position, not only those covered by the last read;
    # uncovered positions stay 0 so that no division by zero can occur
    return [gc / total if total else 0 for gc, total in zip(GC, Total)]


if __name__ == "__main__":
    from reading_fastq import read_fastq

    seqs, quals = read_fastq("path of your fastq file")

    GC_content = findGC_pos(seqs)
    print(GC_content)
DNA according to our kmer length\n", 25 | " kmers.add(kmer) # adding kmer to our set kmers\n", 26 | " \n", 27 | "print(kmers)" 28 | ] 29 | } 30 | ], 31 | "metadata": { 32 | "kernelspec": { 33 | "display_name": "Python 3", 34 | "language": "python", 35 | "name": "python3" 36 | }, 37 | "language_info": { 38 | "codemirror_mode": { 39 | "name": "ipython", 40 | "version": 3 41 | }, 42 | "file_extension": ".py", 43 | "mimetype": "text/x-python", 44 | "name": "python", 45 | "nbconvert_exporter": "python", 46 | "pygments_lexer": "ipython3", 47 | "version": "3.6.4" 48 | } 49 | }, 50 | "nbformat": 4, 51 | "nbformat_minor": 2 52 | } 53 | -------------------------------------------------------------------------------- /generate_pallindrome.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 15, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "A random pallindromic sequence of length 10 is: ACGCATGCGT\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "# GENERATE A RANDOM PALLINDROMIC DNA SEQUENCE OF A GIVEN LENGTH\n", 18 | "\n", 19 | "import random\n", 20 | "\n", 21 | "# dictionary that stores complement of each base\n", 22 | "complement = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}\n", 23 | "\n", 24 | "# list that stores our 4 bases\n", 25 | "bases = ['A','T','G','C']\n", 26 | "\n", 27 | "N = 10 # required length of pallindromic sequence (should be even)\n", 28 | "\n", 29 | "pallindrome = [0]*N # this will later store our pallindromic DNA sequence\n", 30 | "\n", 31 | "# looping from 0 to N-1\n", 32 | "for i in range(N):\n", 33 | " r = random.randint(0,3) # generates a random number between 0 and 3 including 0 and 3\n", 34 | " base = bases[r] # fetching the base from the bases list according to r\n", 35 | " pallindrome[i] = base # storing base at index i in the pallindrome list\n", 36 | " pallindrome[N-i-1] = 
# Converting GTF format to BED format (One bed record per whole gene) using BCBIO module
# Type "python gtf_to_bed.py -i Path/of/your/GTF/file -o Output file name with extension" for running the code
# Type "python gtf_to_bed.py -h" for help/usage description
# Output: BED file with chr, start and end of the genes present in the gtf file
# Sample input file: Bos_taurus_Chr1.gtf
# Sample output file: Bos_taurus_Chr1.bed

import argparse
from pathlib import Path


def bed_lines(records):
    """Yield one tab separated BED line per feature of each parsed record.

    Entries of ``records`` that are lists are skipped. For every other record
    its ``id`` is used as the chromosome and the ``location.start`` /
    ``location.end`` of each entry in ``record.features`` as the coordinates.
    """
    for record in records:
        if isinstance(record, list):
            continue
        chrom = record.id
        # looping over each feature within a record
        for feature in record.features:
            start = str(feature.location.start)
            end = str(feature.location.end)
            yield chrom + "\t" + start + "\t" + end + "\n"


def build_parser():
    """Return the command line parser; both -i and -o are mandatory."""
    parser = argparse.ArgumentParser(
        description="Converting GTF format to BED format (One bed record per whole gene)",
        usage="python gtf_to_bed.py -i Path/of/your/GTF/file -o Output file name with extension",
    )
    # required=True: a missing option is reported by argparse instead of
    # failing later with a TypeError while building the paths
    parser.add_argument("-i", required=True, help="ENTER FULL PATH OF THE GTF FILE")
    parser.add_argument("-o", required=True, help="ENTER OUTPUT FILE NAME WITH EXTENSION")
    return parser


def main(argv=None):
    """Parse the command line, convert the GTF file and write the BED file.

    Args:
        argv: argument list; ``None`` lets argparse read ``sys.argv``.
    """
    from BCBio import GFF

    args = build_parser().parse_args(argv)

    GTF_path = Path(args.i)
    # the output bed file is placed next to the input GTF file
    BED_path = GTF_path.parent / args.o

    # parsing records from the GTF file
    records = GFF.parse(GTF_path)

    # 'with' closes the output file even if parsing fails half way
    with open(BED_path, "w") as fw:
        fw.writelines(bed_lines(records))
    print(f"Output BED file successfully generated at {BED_path}")


if __name__ == "__main__":
    main()

## please contact us if you face any issues while using the code
## your personal queries are also invited
# function create_hist for creating histogram for qualities of the reads in your fastq file
# function visualise for visualizing the same using matplotlib


def create_hist(qualities, to_q=None):
    """Count how often each quality value occurs in the given quality strings.

    Args:
        qualities: iterable of quality strings, one per read.
        to_q: callable turning one quality character into an integer quality
            value. Defaults to ``phredtoQ`` from the ``QtoPhred`` module.

    Returns:
        List where index ``q`` holds the number of characters decoded to
        quality ``q``. It has at least 50 entries and grows when a larger
        quality value is met.
    """
    if to_q is None:
        from QtoPhred import phredtoQ as to_q

    hist = [0] * 50
    for seq in qualities:
        for phred_score in seq:
            q = to_q(phred_score)
            # NOTE(review): a negative q would index from the end of the list;
            # presumably the decoder never returns one -- confirm
            if q >= len(hist):
                # grow instead of failing with IndexError on high qualities
                hist.extend([0] * (q - len(hist) + 1))
            hist[q] += 1

    return hist


def visualise(hist):
    """Show ``hist`` as a bar chart, one bar per quality value."""
    import matplotlib.pyplot as plt
    plt.bar(range(len(hist)), hist)
    plt.show()


if __name__ == "__main__":
    from reading_fastq import read_fastq

    seqs, quals = read_fastq("path of your fastq file")

    h = create_hist(quals)
    print(h)

    visualise(h)
GGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAATGCTTGCCCGGCATACAGCC 9 | AGGCCGGCGTGGTGCGGATGTGAAAGATTGGCCCCTTGTGCCTAGGTGCGGCGGGTCCAAGAGCTGGTGT 10 | TTTGATGGCCCGGAACCCGGCAAGAGGTGGACGGATGCTGGCAGCAGCTGCCGTGCGAATCCCCCATGTT 11 | GTCGTGCTTGTCGGACAGGCAGGAGAACCCTTCCGAACCCCAATGGAGGGCGGTTGACCGCCATTCGGAT 12 | GTGACCCCAGGTCAGGCGGGGGCACCCGCTGAGTTTACGC 13 | 14 | >gi|2765654|emb|Z78529.1|CLZ78529 C.lichiangense 5.8S rRNA gene and ITS1 and ITS2 DNA 15 | ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAATCTGGAGGACTTGTGG 16 | TTATTTGGCTCGCTAGGGATTTCCTTTTGTGGTGACCATGATTTGTCATTGGGCCTCATTGAGAGCTTTC 17 | ATGGCGGGTTTGAACCTCTAGCACGGTGCAGTTTGCACCAAGGTATATAAAGAATCACCGATGAATGACA 18 | TTATTGTCAAAAAAGTCGGAGGTGTGGTGTGTTATTGGTCATGCCAATGAATTGTTGATGACTCTCGCCG 19 | AGGGATATCTTGGCTCTTGCATCGATGAAGAATCCCACCGAAATGTGATAAGTGGTGTGAATTGCAGAAT 20 | CCCGTGAACCATCGAGTCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCTAAGGGCACGCCTGCCTGG 21 | GCGTCGTATGTTTTATCTCTCCTTCCAATGCTTGTCCAGCATATAGCTAGGCCATCATTGTGTGGATGTG 22 | AAAGATTGGCCCCTTGTGCTTAGGTGCGGTGGGTCTAAGGATATGTGTTTTGATGGTCTGAAACTTGGCA 23 | AGAGGTGGAGGATGCTGGCAGCCGCAAGGCTATTGTTTGAATCCCCCATGTTGTCATATTTGTTGGGCCT 24 | ATAGAACAACTTGTTTGGACCCTAATTAAGGCA 25 | 26 | >gi|2765657|emb|Z78532.1|CCZ78532 C.californicum 5.8S rRNA gene and ITS1 and ITS2 DNA 27 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAGAATATATGATCGAGTG 28 | AATCTGGAGGACCTGTGGTAACTCAGCTCGTCGTGGCACTGCTTTTGTCGTGACCCTGCTTTGTTGTTGG 29 | GCCTCCTCAAGAGCTTTCATGGCAGGTTTGAACTTTAGTACGGTGCAGTTTGCGCCAAGTCATATAAAGC 30 | ATCACTGATGAATGACATTATTGTCAGAAAAAATCAGAGGGGCAGTATGCTACTGAGCATGCCAGTGAAT 31 | TTTTATGACTCTCGCAACGGATATCTTGGCTCTAACATCGATGAAGAACGCAGCTAAATGCGATAAGTGG 32 | TGTGAATTGCAGAATCCCGTGAACCATCGAGTCTTTGAACGCAAGTTGCGCTCGAGGCCATCAGGCTAAG 33 | GGCACGCCTGCCTGGGCGTCGTGTGTTGCGTCTCTCCTACCAATGCTTGCTTGGCATATCGCTAAGCTGG 34 | CATTATACGGATGTGAATGATTGGCCCCTTGTGCCTAGGTGCGGTGGGTCTAAGGATTGTTGCTTTGATG 35 | GGTAGGAATGTGGCACGAGGTGGAGAATGCTAACAGTCATAAGGCTGCTATTTGAATCCCCCATGTTGTT 36 | 
GTATTTTTTCGAACCTACACAAGAACCTAATTGAACCCCAATGGAGCTAAAATAACCATTGGGCAGTTGA 37 | TTTCCATTCAGATGCGACCCCAGGTCAGGCGGGGCCACCCGCTGAGTTGAGGC 38 | 39 | >gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA 40 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTG 41 | AATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGTGACCCTGATTTGTTGTTGGG 42 | CCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCCCGGCGCAGTTTGGGCGCCAAGCCATATGAA 43 | AGCATCACCGGCGAATGGCATTGTCTTCCCCAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGA 44 | ATTTTGATGACTCTCGCAAACGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGAT 45 | AAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA 46 | GGCTAAGGGCACGCCTGCTTGGGCGTCGC 47 | 48 | >gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA 49 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTG 50 | AATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGTGACCCTGATTTGTTGTTGGG 51 | CCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCCCGGCGCAGTTTGGGCGCCAAGCCATATGAA 52 | AGCATCACCGGCGAATGGCATTGTCTTCCCCAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGA 53 | ATTTTGATGACTCTCGCAAACGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGAT 54 | AAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA 55 | GGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAATGCTTGCCCGGCATACAGCC 56 | AGGCCGGCGTGGTGCGGATGTGAAAGATTGGCCCCTTGTGCCTAGGTGCGGCGGGTCCAAGAGCTGGTGT 57 | TTTGATGGCCCGGAACCCGGCAAGAGGTGGACGGATG 58 | 59 | >gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA 60 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTG 61 | AATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGTGACCCTGATTTGTTGTTGGG 62 | CCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCCCGGCGCAGTTTGGGCGCCAAGCCATATGAA 63 | AGCATCACCGGCGAATGGCATTGTCTTCCCCAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGA 64 | ATTTTGATGACTCTCGCAAACGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGAT 65 | AAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAA 66 | 67 | 
>gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA 68 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTG 69 | AATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGTGACCCTGATTTGTTGTTGGG 70 | CCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCCCGGCGCAGTTTGGGCGCCAAGCCATATGAA 71 | AGCATCACCGGCGAATGGCATTGTCTTCCCCAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGA 72 | ATTTTGATGACTCTCGCAAACGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGAT 73 | AAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA 74 | GGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAAT 75 | 76 | >gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA 77 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTG 78 | AATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGTGACCCTGATTTGTTGTTGGG 79 | CCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCCCGGCGCAGTTTGGGCGCCAAGCCATATGAA 80 | AGCATCACCGGCGAATGGCATTGTCTTCCCCAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGA 81 | ATTTTGATGACTCTCGCAAACGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAG 82 | 83 | >gi|2765656|emb|Z78531.1|CFZ78531 C.fasciculatum 5.8S rRNA gene and ITS1 and ITS2 DNA 84 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAGAACATACGATCGAGTG 85 | AATCCGGAGGACCCGTGGTTACACGGCTCACCGTGGCTTTGCTCTCGTGGTGAACCCGGTTTGCGACCGG 86 | GCCGCCTCGGGAACTTTCATGGCGGGTTTGAACGTCTAGCGCGGCGCAGTTTGCGCCAAGTCATATGGAG 87 | CGTCACCGATGGATGGCATTTTTGTCAAGAAAAACTCGGAGGGGCGGCGTCTGTTGCGCGTGCCAATGAA 88 | TTTATGACGACTCTCGGCAACGGGATATCTGGCTCTTGCATCGATGAAGAACGCAGCGAAATGCGATAAG 89 | TGGTGTGAATTGCAGAATCCCGCGAACCATCGAGTCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCT 90 | AAGGGCACGCCTGCCTGGGCGTCGTGTGCTGCGTCTCTCCTGATAATGCTTGATTGGCATGCGGCTAGTC 91 | TGTCATTGTGAGGACGTGAAAGATTGGCCCCTTGCGCCTAGGTGCGGCGGGTCTAAGCATCGGTGTTCTG 92 | ATGGCCCGGAACTTGGCAGTAGGTGGAGGATGCTGGCAGCCGCAAGGCTGCCGTTCGAATCCCCCGTGTT 93 | GTCGTACTCGTCAGGCCTACAGAAGAACCTGTTTGAACCCCCAGTGGACGCAAAACCGCCCTCGGGCGGT 94 | GATTTCCATTCAGATGCGACCCCAGTCAGGCGGGCCACCCGTGAGTAA 95 | 96 | 
>gi|2765654|emb|Z78529.1|CLZ78529 C.lichiangense 5.8S rRNA gene and ITS1 and ITS2 DNA 97 | ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAATCTGGAGGACTTGTGG 98 | TTATTTGGCTCGCTAGGGATTTCCTTTTGTGGTGACCATGATTTGTCATTGGGCCTCATTGAGAGCTTTC 99 | ATGGCGGGTTTGAACCTCTAGCACGGTGCAGTTTGCACCAAGGTATATAAAGAATCACCGATGAATGACA 100 | TTATTGTCAAAAAAGTCGGAGGTGTGGTGTGTTATTGGTCATGCCAATGAATTGTTGATGACTCTCGCCG 101 | AGGGATATCTTGGCTCTTGCATCGATGAAGAATCCCACCGAAATGTGATAAGTGGTGTGAATTGCAGAAT 102 | CCCGTGAACCATCGAGTCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCTAAGGGCACGCCTGCCTGG 103 | GCGTCGTATGTTTTATCTCTCCTTCCAATGCTTGTCCAGCATATAGCTAGGCCATCATTGTGTGGATGTG 104 | AAAGATTGGCCCCTTGTGCTT 105 | 106 | >gi|2765655|emb|Z78530.1|CMZ78530 C.margaritaceum 5.8S rRNA gene and ITS1 and ITS2 DNA 107 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACATAATAAACGATTGAGTG 108 | AATCTGGAGGACTTGTGGTAATTTGGCTCGCTAGGGATATCCTTTTGTGGTGACCATGATTTGTCATTGG 109 | GCCTCATTGAGAGCTTTCATGGCGGGTTTGAACCTCTAGCACGGTCCAGTTTGCACCAAGGTATATAAAG 110 | AATCACCGATGAATGACATTATTGCCCCACACAACGTCGGAGGTGTGGTGTGTTAATGTTCATTCCAATG 111 | AATTTTGATGACTCTCGGCAGACGGATATCTTGACTCTTGCATCGATGAAGAACGCACCGAAATGTGATA 112 | AGTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGG 113 | CTAAGGGCACGCCTGCCTGGGCGTCGTATGTTTTATCTCTCCTTCCAATGCTTGTCCAGCATATAGCTAG 114 | GCCATCATTGTGTGGATGTGAAAGATTGGCCCCTTGTGCTTAGGTGCGGTGGGTCTAAGGATATGTGTTT 115 | TGATGGTCTGAAACTTGGCAAGAGGTGGAGGATGCTGGCAGCCGCAAGGCTATTGTTTGAATCCCCCATG 116 | TTGTCATGTTTGTTGGGCCTATAGAACAACTTGTTTGGACCCTAATTAAGGCAAAACAATCCTTGGGTGG 117 | TTGATTTCCAATCAGATGCGACCCCAGTCAGGGGGCCACCCCAT 118 | 119 | >gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA 120 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTG 121 | AATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGTGACCCTGATTTGTTGTTGGG 122 | CCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCCCGGCGCAGTTTGGGCGCCAAGCCATATGAA 123 | AGCATCACCGGCGAATGGCATTGTCTTCCCCAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGA 124 | 
ATTTTGATGACTCTCGCAAACGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGAT 125 | AAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA 126 | GGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAATGCTTGCCCGGCATACAGCC 127 | AGGCCGGCGTGGTGCGGATGTGAAAGATTGGCCCCTTGTGCCTAGGTGCGGCGGGTCCAAGAGCTGGTGT 128 | TTTGATGGCCCGGAACCCGGCAAGAGGTGGACGGATGCTGGCAGCAGCTGCCGTGCGAATCCCCCATGTT 129 | GTCGTGCTTGTCGGAC 130 | 131 | >gi|2765654|emb|Z78529.1|CLZ78529 C.lichiangense 5.8S rRNA gene and ITS1 and ITS2 DNA 132 | ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAATCTGGAGGACTTGTGG 133 | TTATTTGGCTCGCTAGGGATTTCCTTTTGTGGTGACCATGATTTGTCATTGGGCCTCATTGAGAGCTTTC 134 | ATGGCGGGTTTGAACCTCTAGCACGGTGCAGTTTGCACCAAGGTATATAAAGAATCACCGATGAATGACA 135 | TTATTGTCAAAAAAGTCGGAGGTGTGGTGTGTTATTGGTCATGCCAATGAATTGTTGATGACTCTCGCCG 136 | AGGGATATCTTGGCTCTTGCATCGATGAAGAATCCCACCGAAATGTGATAAGTGGTGTGAATTGCAGAAT 137 | CCCGTGAACCATCGAGTCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCTAAGGGCACGCCTGCCTGG 138 | GCGTCGTATGTTTTATCTCTCCTTCCAATGCTTGTCCAGCATATAGCTAGGCCATCATTGTGTGGATGTG 139 | AAAGATTGGCCCCTTGTGCTTAGGTGCGGTGGGTCTAAGGATATGTGTTTTGATGGTCTGAAACTTGGCA 140 | AGAGGTGGAGGATGCTGGCAGCCGCAAGGCTATTGTTTGAATCCCCCATGTTGTCATATTTGTTGGGCCT 141 | ATAGAACAACTTGTTTGGACCCTAATTAAGGCAAAACAATCCTTGGGTGGTTGATTTCCAATCAGATGCG 142 | ACCCCAGTCAGCGGGCCACCAGCTGAGCTAAAA 143 | 144 | >gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA 145 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTG 146 | AATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGTGACCCTGATTTGTTGTTGGG 147 | CCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCCCGGCGCAGTTTGGGCGCCAAGCCATATGAA 148 | AGCATCACCGGCGAATGGCATTGTCTTCCCCAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGA 149 | ATTTTGATGACTCTCGCAAACGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGAT 150 | AAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACG 151 | 152 | >gi|2765654|emb|Z78529.1|CLZ78529 C.lichiangense 5.8S rRNA gene and ITS1 and ITS2 DNA 153 | 
#### code_request_002 ####

# Mean normalised methylation rate per gene part, computed with pandas.
#
# Input : path of a headerless, tab-delimited file (sample: bismark.txt)
# Output: "output_<input file name>" written next to the input file
#         (sample: output_bismark.txt), also headerless and tab-delimited.
#
# Column indexes of the input file:
#   column 0 -> chromosome number
#   column 1 -> start coordinate
#   column 2 -> end coordinate
#   column 3 -> methylation rate
#   column 4 -> gene_part (e.g. NR_039983_6 refers to the 6th part of the gene)

import pandas as pd
from pathlib import Path

in_path = r"Enter the path of your input tab delimited file"
# e.g: in_path = r"C:\Users\dell\Desktop\code_request_002\bismark.txt"


def summarise_gene_parts(df):
    """Collapse the per-row table into one row per gene part.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with integer column labels 0-4 laid out as described in the
        header comment of this script.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of column 4, in order of first
        appearance, with columns
        0 -> chromosome of the first row of the gene part,
        1 -> smallest start coordinate,
        2 -> largest end coordinate,
        3 -> mean methylation rate,
        4 -> gene part label.
    """
    # sort=False keeps the gene parts in the order they first occur in the
    # input, matching the order produced by Series.unique().  Rows whose
    # gene part is missing are left out by groupby.
    grouped = df.groupby(4, sort=False)

    summary = pd.DataFrame({
        0: grouped[0].first(),
        1: grouped[1].min(),
        # The end of the region comes from the end-coordinate column
        # (column 2); the start column would under-report the extent
        # whenever a row spans more than one position.
        2: grouped[2].max(),
        3: grouped[3].mean(),
    })
    summary[4] = summary.index
    return summary.reset_index(drop=True)


def main():
    """Read the TSV at `in_path` and write the per-gene-part summary."""
    src = Path(in_path)
    # the output file lives next to the input file
    dst = src.parent / ("output_" + src.parts[-1])

    # the input file has no header row
    df = pd.read_csv(src, sep="\t", header=None)

    summarise_gene_parts(df).to_csv(dst, sep="\t", index=False, header=False)


if __name__ == "__main__":
    main()

##please leave a message if you face any issues while using the code
TGCAAGAGAATATTCCCGCTCTCCGGAGAAGCTCTTCCTTCCTTTGCACTGAAAGCTGTAACTCTAAGTA 5 | TCAGTGTGAAACGGGAGAAAACAGTAAAGGCAACGTCCAGGATAGAGTGAAGCGACCCATGAACGCATTC 6 | ATCGTGTGGTCTCGCGATCAGAGGCGCAAGATGGCTCTAGAGAATCCCAGAATGCGAAACTCAGAGATCA 7 | GCAAGCAGCTGGGATACCAGTGGAAAATGCTTACTGAAGCCGAAAAATGGCCATTCTTCCAGGAGGCACA 8 | GAAATTACAGGCCATGCACAGAGAGAAATACCCGAATTATAAGTATCGACCTCGTCGGAAGGCGAAGATG 9 | CTGCCGAAGAATTGCAGTTTGCTTCCCGCAGATCCCGCTTCGGTACTCTGCAGCGAAGTGCAACTGGACA 10 | ACAGGTTGTACAGGGATGACTGTACGAAAGCCACACACTCAAGAATGGAGCACCAGCTAGGCCACTTACC 11 | GCCCATCAACGCAGCCAGCTCACCGCAGCAACGGGACCGCTACAGCCACTGGACAAAGCTGTAGGACAAT 12 | CGGGTAACATTGGCTACAAAGACCTACCTAGATGCTCCTTTTTACGATAACTTACAGCCCTCACTTTCTT 13 | ATGTTTAGTTTCAATATTGTTTTCTTTTCTCTGGCTAATAAAGGCCTTATTCATTTCA 14 | 15 | >NC_000087.7:c2663658-2662471 Mus musculus strain C57BL/6J chromosome Y, GRCm38.p6 C57BL/6J 16 | ATGGAGGGCCATGTCAAGCGCCCCATGAATGCATTTATGGTGTGGTCCCGTGGTGAGAGGCACAAGTTGG 17 | CCCAGCAGAATCCCAGCATGCAAAATACAGAGATCAGCAAGCAGCTGGGATGCAGGTGGAAAAGCCTTAC 18 | AGAAGCCGAAAAAAGGCCCTTTTTCCAGGAGGCACAGAGATTGAAGATCCTACACAGAGAGAAATACCCA 19 | AACTATAAATATCAGCCTCATCGGAGGGCTAAAGTGTCACAGAGGAGTGGCATTTTACAGCCTGCAGTTG 20 | CCTCAACAAAACTGTACAACCTTCTGCAGTGGGACAGGAACCCACATGCCATCACATACAGGCAAGACTG 21 | GAGTAGAGCTGCACACCTGTACTCCAAAAACCAGCAAAGCTTTTATTGGCAGCCTGTTGATATCCCCACT 22 | GGGCACCTGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGTTCCATAACCACCACCAGCAGCAACAGC 23 | AGTTCTATGACCACCACCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGTTCCATGACCACCACCA 24 | GCAGAAGCAGCAGTTTCATGACCACCACCAGCAGCAACAGCAGTTCCATGACCACCACCACCACCACCAG 25 | GAGCAGCAGTTCCATGACCACCACCAGCAGCAACAGCAGTTCCATGACCACCAGCAGCAGCAGCAGCAGC 26 | AGCAGCAGCAGCAGTTCCATGACCACCACCAGCAGAAGCAGCAGTTCCATGACCACCACCACCACCAACA 27 | GCAGCAGCAGTTCCATGACCACCAGCAGCAGCAGCAGCAGTTCCATGACCACCAGCAGCAGCAGCATCAG 28 | TTCCATGACCACCCCCAGCAGAAGCAGCAGTTCCATGACCACCCCCAGCAGCAACAGCAGTTCCATGACC 29 | ACCACCACCAGCAGCAGCAGAAGCAGCAGTTCCATGACCACCACCAGCAGAAGCAGCAGTTCCATGACCA 30 | CCACCAGCAGAAGCAGCAGTTCCATGACCACCACCAGCAGCAACAGCAGTTCCATGACCACCACCAGCAG 31 | 
CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGTTCCACGACCAGCAGCTTACCTACTTACTAACAGCTG 32 | ACATCACTGGTGAGCATACACCATACCAGGAGCACCTCAGCACAGCCCTGTGGTTGGCAGTCTCATGA 33 | 34 | >NC_024475.1:327176-327685 Rattus norvegicus strain mixed chromosome Y, Rnor_6.0 35 | ATGGAGGGCCATGTCAAGCGCCCCATGAATGCATTTATGGTGTGGTCCCGTGGAGAGAGGCGCAAGTTGG 36 | CTCAACAGAATCCCAGCATGCAGAATTCAGAGATCAGCAAGCATCTGGGATATCAGTGGAAAAGCCTTAC 37 | AGAAGCCGAAAAAAGGCCCTTTTTCCAGGAGGCGCAGAGACTGAAGACCCTACACAGAGAGAAATATCCA 38 | AACTATAAATATCAGCCTCATCGAAGGGTTAAAGTGCCACAGAGGAGTTATACTTTGCAGCGTGAAGTTG 39 | CCTCAACAAAACTGTACAACCTGCTGCAATGGGACAACAACCTACACACTATCATATACGGACAGGACTG 40 | GGCTAGAGCTGCACACCAGTCCTCCAAGAACCAGAAAAGCATTTATTTACAGCCTGTGGACATCCCCACT 41 | GGATACCCACTACAGCAGAAACAGCAGCACCAGCAGCAGCAGCACGTGCACCTGCAGCAGCAGCAGCAGC 42 | AGCAGCACCAGTTCCACTAG 43 | 44 | -------------------------------------------------------------------------------- /merging_fasta_files.py: -------------------------------------------------------------------------------- 1 | # Merging multiple fasta files (any number of fasta files can be merged) 2 | 3 | # Input: Path of the Fasta files that have to be merged 4 | # Output: A single fasta file containing sequences from all the individual fasta files 5 | # Sample Input Files: file1.fasta,file2.fasta,file3.fasta 6 | # Sample Output file: merged.fasta 7 | 8 | #### Enter path of fasta files within quotes, seperated by commas #### 9 | in_paths = [r"path of 1st fasta file", r"path of second fasta file"... 
# function MergeFasta for merging the fasta files
def MergeFasta(in_paths):
    """Concatenate several FASTA files into one 'merged.fasta' file.

    The output file is created in the directory of the first input file.
    Each input is copied line by line; if the last line of an input is not
    already a blank line, a newline is written after it so the next file
    starts on a fresh line.  Empty input files contribute nothing.

    Parameters
    ----------
    in_paths : sequence of str or path-like
        Paths of the FASTA files to merge, in the order they should appear.

    Returns
    -------
    pathlib.Path
        Path of the generated 'merged.fasta' file.

    Raises
    ------
    IndexError
        If `in_paths` is empty (there is no first file to derive the
        output directory from).
    """
    from pathlib import Path
    # generating path for our output file 'merged.fasta', using the path
    # of our first input file
    out_path = Path(in_paths[0]).parent / "merged.fasta"

    # 'with' closes every handle, including on errors; the earlier version
    # only ever closed the last input file.
    with open(out_path, "w") as fw:
        for path in in_paths:
            last_line = None  # stays None when the input file is empty
            with open(Path(path), "r") as fr:
                for line in fr:
                    fw.write(line)
                    last_line = line
            # separate this file from the next one unless it already
            # ended with a blank line
            if last_line is not None and last_line != "\n":
                fw.write("\n")

    return out_path
# function MergeFasta for merging the fasta files
def MergeFasta(in_paths):
    """Concatenate several FASTA files into one 'merged.fasta' file.

    The output file is created in the directory of the first input file.
    Each input is read in full and copied; if its last line is not already
    a blank line, a newline is written after it so the next file starts on
    a fresh line.  Empty input files are skipped.

    Parameters
    ----------
    in_paths : sequence of str or path-like
        Paths of the FASTA files to merge, in the order they should appear.

    Returns
    -------
    pathlib.Path
        Path of the generated 'merged.fasta' file.

    Raises
    ------
    IndexError
        If `in_paths` is empty (there is no first file to derive the
        output directory from).
    """
    from pathlib import Path
    # generating path for our output file 'merged.fasta', using the path
    # of our first input file
    out_path = Path(in_paths[0]).parent / "merged.fasta"

    with open(out_path, "w") as fw:
        for path in in_paths:
            with open(Path(path), "r") as fr:
                temp = fr.readlines()
            # an empty file has no last line to inspect; indexing temp[-1]
            # on it would raise IndexError
            if not temp:
                continue
            fw.writelines(temp)
            if temp[-1] != "\n":
                fw.write("\n")

    return out_path
AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 10 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 11 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 12 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 13 | 14 | >header 2 15 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 16 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 17 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 18 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 19 | 20 | >header 3 21 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 22 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 23 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 24 | 25 | >header 4 26 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 27 | 28 | >header 5 29 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGG 30 | 31 | >header 6 32 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 33 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 34 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 35 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 36 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 37 | 
AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 38 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 39 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 40 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 41 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG -------------------------------------------------------------------------------- /metagenomic_out.fasta: -------------------------------------------------------------------------------- 1 | >header 1 2 | ATAGATAGATAGATAGCAGACGACGACGACGATGATAGAGCAGACGACGATGGACGATGA 3 | CGCAGCGAGTGTAGCAGCGCAGTGGTAGTGATAGGTAGAGTACATTAGACGATGACGATA 4 | GGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCG 5 | AGGGATGACGGATATGAGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGG 6 | GGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATGAGTACA 7 | TTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGT 8 | GTGTGTGGAGCAGGCGAGGGATGACGGATATGAGTACATTAGACGATGACGATAGGACGA 9 | TAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGAT 10 | GACGGATATGAGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGG 11 | GGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATGAGTACATTAGAC 12 | GATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGT 13 | GGAGCAGGCGAGGGATGACGGATATGAGTACATTAGACGATGACGATAGGACGATAGCAG 14 | AGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGA 15 | TATGAGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGG 16 | GGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATGAGTACATTAGACGATGAC 17 | GATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCA 18 | GGCGAGGGATGACGGATATGAGTACATTAGACGATGACGATAGGACGATAGCAGAGACGA 19 | CGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATG 20 | 21 | >header 2 22 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGG 23 | 
GGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATGAGTACATTAGACGATGACGATA 24 | GGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCG 25 | AGGGATGACGGATATGAGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGG 26 | GGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATGAGTACA 27 | TTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGT 28 | GTGTGTGGAGCAGGCGAGGGATGACGGATATG 29 | 30 | >header 6 31 | AGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGG 32 | GGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATGAGTACATTAGACGATGACGATA 33 | GGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCG 34 | AGGGATGACGGATATGAGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGG 35 | GGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATGAGTACA 36 | TTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGT 37 | GTGTGTGGAGCAGGCGAGGGATGACGGATATGAGTACATTAGACGATGACGATAGGACGA 38 | TAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGAT 39 | GACGGATATGAGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGG 40 | GGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATGAGTACATTAGAC 41 | GATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGT 42 | GGAGCAGGCGAGGGATGACGGATATGAGTACATTAGACGATGACGATAGGACGATAGCAG 43 | AGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGA 44 | TATGAGTACATTAGACGATGACGATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGG 45 | GGGGGGGGGTGTGTGTGGAGCAGGCGAGGGATGACGGATATGAGTACATTAGACGATGAC 46 | GATAGGACGATAGCAGAGACGACGGGGGGGGGGGGGGGGGGGGGGGGTGTGTGTGGAGCA 47 | GGCGAGGGATGACGGATATG 48 | 49 | -------------------------------------------------------------------------------- /multi_GC_input.fasta: -------------------------------------------------------------------------------- 1 | >header1 2 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 3 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 4 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 5 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 6 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 7 
| GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAACCCCCCCCCCC 8 | CCCCCCCCCCCCCCCCTTTTTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGGAAAACCCC 9 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 10 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 11 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 12 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 13 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 14 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 15 | 16 | >header2 17 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 18 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 19 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 20 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 21 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 22 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 23 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 24 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 25 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 26 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 27 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 28 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 29 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 30 | GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCATAGCA 31 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 32 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 33 | ATGCGATGCGGACGCCCAGTAGCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC 34 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 35 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 36 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 37 | ATGCGATGCGGACGCCCAGT 38 | 39 | >header3 40 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 41 | 
ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 42 | ATGCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC 43 | 44 | >header4 45 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 46 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 47 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 48 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 49 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 50 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 51 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 52 | CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC 53 | CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC 54 | CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC 55 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 56 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 57 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 58 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 59 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 60 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 61 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 62 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 63 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA -------------------------------------------------------------------------------- /multi_fasta.fasta: -------------------------------------------------------------------------------- 1 | >D10844.1 Cervus nippon SRY gene, conserved motif, partial sequence 2 | CTCGTGAACGAAGACGAAAGGTGGCTCTAGAGAATCCCCAAATGCAAAACTCAGAGATCAGCAAGCAGCT 3 | GGGGTATGAGTGGAAAAGGCTTACAGATGCTGAAAAGCGACCATTCTTTGAGGAGGCACAGAGACTACTA 4 | GCCGTA 5 | 6 | >D10845.1 Capra hircus SRY gene, conserved motif, partial sequence 7 | CTCGTGAACGAAGACGAAAGGTGGCTCTAGAGAATCCCAAATTGCAAAACTCAGAGATCAGCAAGCAGCT 8 | 
GGGATACGAGTGGAAAAGGCTTACAGATGCTGAAAAGCGCCCATTCTTTGAGGAGGCACAGAGACTACTA 9 | GCTATA 10 | 11 | >D10846.1 Sus scrofa SRY gene, conserved motif, partial sequence 12 | CTCGTGATCAAAGGAGAAAAGTGGCTCTAGAGAACCCTCAAATGCAAAACTCAGAGATCAGCAAGTGGCT 13 | GGGATGCAAGTGGAAAATGCTTACAGAAGCCGAAAAGCGCCCATTCTTCGAGGAGGCACAGAGGCTACAG 14 | GCGGTG 15 | 16 | >D10847.1 Ovis ovis SRY gene, conserved motif, partial sequence 17 | CTCGTGAACGAAGACGAAAGGTGGCTCTAGAGAATCCCAAACTGCAAAACTCAGAGATCAGCAAGCAGCT 18 | GGGATACGAGTGGAAAAGGCTTACAGATGCTGAAAAGCGCCCATTCTTTGAGGAGGCACAGAGACTACTA 19 | GCTATA 20 | 21 | >AF338391.1 Saguinus midas midas SRY (SRY) gene, partial cds 22 | CAGTTGGAGGTAGGCTGGTTGGGCGGGGTTGAGAGGGGTGCTGGGGGCGGAGAAACGTAAGTTGCAATAC 23 | AAAAGTTAAGGTAACAACGAATTTGGTAGATGTAAGTTTGGGATAGTAAAATGAGTTTCCCACTGTGTCA 24 | TCTTGGAATTTTGTCGTACCCTACTTGTTTTTGACAATGCAGTCTTATGCTTCCGCTATGTTGAGAGTAT 25 | TTAACAGTAATGAATACAATCCAGCTGCGCAACAGAATATCCCTGATTCCGGGAAAAGCTCTTCCGTCAT 26 | TTGGACTGACAACTCTAGTTCAAAGGATCAGTGCCAAACAAGAGAAAACAGTAAAGGCAGCGTCCAGAAC 27 | AGAGTGAAGCGACCCATGAACGCTTTTATTGTGTGGTCTCGTGATCAAAGGCGCAAGATGGCTGTAGAGA 28 | ATCCCCAAATGCGAAACTCAGAGATCAGCAAGAGGCTGGGATACCAGTGGAAATTGCTTACTGAAGCCGA 29 | AAAATGGCCATTCTTCCAGGAGGCACAGAAACTACAGGCCATGCACAGAGAGAAATACCCGAATTATAAG 30 | TATCGACCTCGTCGGAAGGCCAACATGCTGCAGAACAACAGTTTGCTTACCGCCGATCCCTCTTCAGAAC 31 | TCTATGGTGAAATGCAAGTAGAGGACAGGTTGTACACCTTCTCATACAGTGATAACTGTACGAAATCCAC 32 | CGAATCAACAATGGAGCACCCGCTAGGCCTCTCACCGCCAGTCAACCTAGACAGCTCACCGCAGCGGCGG 33 | GACCGCTACAGCCACTCGACAAACCTGCAGGACAAT 34 | 35 | >AF338392.1 Callithrix aurita SRY (SRY) gene, partial cds 36 | CAGTTGGATGTAGGCTGGTTGGGCGGGGTTGAGAGGGTGCTGGGGGGCGGAGAAAGGTAAGTTGCATTAC 37 | AAAAGTTAAGGTAACAACGAATTTGGTAAAAGTAAGTTTGGGATAGTAAAATGAGTTTCCCACTGTGTCA 38 | TCTTGGAATTTTGTCGTACCCTACTTGTTTTTGAAAATGCAGTCTTATGCTTCCGCTATGTTGAGAGTAT 39 | TTAACAGTGATGAATACAATCCAGCTGCGCTACAGAATATCCCTGATTCCGGGAAAAGCTCTTCCGTCAT 40 | TTGGACTGACAACTGTAGCTCAAAGGATCCGTGGCAAACAGGAGAAAACAGTAAAGGCAGCGTCCAGAAC 41 | 
AGAGTGAAGCGACCCATGAACGCTTTCATTGTGTGGTCTCGTGATCAAAGGCGCAAGATGGCTGTAGAGA 42 | ATCCCCAAATGCGAAATTCAGAGATCAGCAAGCGGCTGGGATACCAGTGGAAATTGCTTACTGAAGCCGA 43 | AAAATGGCCATTCTTCCAGGAGGCACAGAAACTACAGGCCATGCACAGAGAGAAATACCCGAATTATAAG 44 | TATCGACCTCGTCGGAAGGCCAATATGCTGCAGAACAATGACAGTTTGCTTACCGCCGATCCATCTTCAG 45 | AAACTCTGCAAGCAGAGGACAGGTTGTACACCTTCTCATACAGTGATAACAGTAAGAAATCCACCCAATC 46 | AACAATGGAACACCCGCTAGGCCTCTCACTGCCAGTCAACCCAGACAGCTCACCGCAGCAGCGGGACCGC 47 | TGCAGCCACTCGACAAACCTGCAGGACAATCGGGTAACATTGACTACAAAGAT 48 | 49 | >AY179745.1 Ursus maritimus isolate PBX2818 SRY (SRY) gene, partial cds 50 | AACGCATTCATGGTGTGGTCTCGTGATCAAAGGCGCAAGGTGGCTCTAGAGAATCCCCAAATGCAAAACT 51 | CAGAGATCAGCAAGCAGCTGGGGTA 52 | 53 | >KT356227.1 Macaca fuscata SRY (SRY) gene, partial cds 54 | CTGGTGGGCGGAGTTGAGAGGGGTGTTGGGGGCGGAGAAATGAAAGTTTCATTACAAAAGTTAAGGTAAC 55 | AAAGAATCTGGTAGAAGTAAGTTGTGGATAGTAAATTAAGTTTCAAAATCTGGCACCTTTCAGTTTTGTC 56 | GCAGCCTCCTTGTTTTTGACAATGCAATCATATGCTTCTGCCATGTTAAGCGTATTTAACACTGATGGTT 57 | ACAGTCCAGCTGCGCAACAGAATATTCCTGCTCTCCGGAGAAGCTCTTCCTTCATTTGCACTGAAAGCTG 58 | TAGCTCTAAGTATCAGTGTGAAGCAGGAGAAAACAGTAAAGGCAGCGTCCAGGATAAAGTGAAGCGACCC 59 | ATGAACGCATTCATTGTGTGGTCTCGCGATCAGAGGCGCAAGATGGCTCTAGAGAATCCCAAAATGCGAA 60 | ACTCAGAGATCAGCAAGCAGCTGGGATACCAGTGGAAAATGCTTACCGAAGCCGATAAATGGCCATTCTT 61 | CCAGGAGGCACAGAAACTACAGGCCATGCATAGAGAGAAATACCCGAATTATAAGTATCGACCTCGTCGG 62 | AAGGCGAAGATGCTGCAAAACAGTTGCAGTTTGCTTCCGGCAGATCCCTCTTCGGTACCCTGCAGAGAAG 63 | TGTACAACAACAGGTTGTACAGGGATGACTGTACCAAAGCCACGCACTCAAGAATGCAGCACCAGTTAGG 64 | CCACTTACCGCCCATCAACACAGCCAGCTCACCGCAGCAACGGGACCGCTACAGCCACTCGACAAAGCT 65 | 66 | -------------------------------------------------------------------------------- /noname_fastq.fastq: -------------------------------------------------------------------------------- 1 | @No name 2 | GCACCTACCGATTGAATGATTCGGTGAAACTTTCGGACCGTGACTTAGCGTCCTTCGGGGCACTTCGTCGTGGGAAGTTATTTAAACCTCATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC 3 | + 4 | 
JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ 5 | @No name 6 | GCACCTACCGATTGAATGATTCGGTGAAACTTTCGGACCGTGACTTAGCGTCCTTCGGGGCACTTCGTCGTGGGAAGTTATTTAAACCTCATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC 7 | + 8 | JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ 9 | @No name 10 | GCACCTACCGATTGAATGGTCCGGTGAAACCTTCGGACTGTGGCAACGTTGCTTCATTGGAGCGTCGCCGTGGGAAGTTGTTTAAACCTTACCATTTAGAGGAAGGTGTAGTCGTAACAAGGTTTCC 11 | + 12 | JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ 13 | @No name 14 | GCACCTACCGATTGAATGATTCGGTGAAACTTTCGGACCGTGACTTAGCGTCCTTCGGGGCGCTTCGTCGTGGGAAGTTATTTAAACCTCATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCA 15 | + 16 | JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ6H+JJJJJJJJJJCJ$JJJJJJ4JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ7JJJJJJJJJJJJJJJJJJ 17 | @No name 18 | GCACCTACCGATTGAATGATTCGGTGAAAATCTCGGACTGTGGCTCGGACGCCCTCGGGCGACCTTGCTGTAGGAAGTTATTTAAACCTCATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC 19 | + 20 | JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ 21 | @No name 22 | GCACCTACCGATTGAATGATTCGGTGAAACTTTCGGACCGTGACTTAGCGTCCTTCGGGGCACTTCGTCGTGGGAAGTTATTTAAACCTCATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC 23 | + 24 | JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ 25 | @No name 26 | GCACCTACCGATTGAATGATTCGGTGAAAATCTAGGACTAGAGCGAAGACGCCCTCGGGCGACTTTGCTTTGGGAATTCATTTAAACCTCATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC 27 | + 28 | JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ -------------------------------------------------------------------------------- /out_file_0.fasta: 
-------------------------------------------------------------------------------- 1 | >gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA_0 2 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAA 3 | CGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGT 4 | GACCCTGATTTGTTGTTGGGCCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCC 5 | CGGCGCAGTTTGGGCGCCAAGCCATATGAAAGCATCACCGGCGAATGGCATTGTCTTCCC 6 | CAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGAATTTTGATGACTCTCGCAAA 7 | CGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGATAAGTGGTGTG 8 | AATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA 9 | GGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAATGCTTGCCCG 10 | GCATACAGCCAGGCCGGCGTGGTGCGGATGTGAAAGATTGGCCCCTTGTGCCTAGGTGCG 11 | GCGGGTCCAAGAGCTGGTGTTTTGATGGCCCGGAACCCGGCAAGAGGTGGACGGATGCTG 12 | GCAGCAGCTGCCGTGCGAATCCCCCATGTTGTCGTGCTTGTCGGACAGGCAGGAGAACCC 13 | TTCCGAACCCCAATGGAGGGCGGTTGACCGCCATTCGGATGTGACCCCAGGTCAGGCGGG 14 | GGCACCCGCTGAGTTTACGC 15 | >gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA_1 16 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAA 17 | CGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGT 18 | GACCCTGATTTGTTGTTGGGCCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCC 19 | CGGCGCAGTTTGGGCGCCAAGCCATATGAAAGCATCACCGGCGAATGGCATTGTCTTCCC 20 | CAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGAATTTTGATGACTCTCGCAAA 21 | CGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGATAAGTGGTGTG 22 | AATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA 23 | GGCTAAGGGCACGCCTGCTTGGGCGTCGC 24 | >gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA_2 25 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAA 26 | CGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGT 27 | GACCCTGATTTGTTGTTGGGCCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCC 28 | CGGCGCAGTTTGGGCGCCAAGCCATATGAAAGCATCACCGGCGAATGGCATTGTCTTCCC 29 | CAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGAATTTTGATGACTCTCGCAAA 30 | 
CGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGATAAGTGGTGTG 31 | AATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA 32 | GGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAATGCTTGCCCG 33 | GCATACAGCCAGGCCGGCGTGGTGCGGATGTGAAAGATTGGCCCCTTGTGCCTAGGTGCG 34 | GCGGGTCCAAGAGCTGGTGTTTTGATGGCCCGGAACCCGGCAAGAGGTGGACGGATG 35 | >gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA_3 36 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAA 37 | CGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGT 38 | GACCCTGATTTGTTGTTGGGCCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCC 39 | CGGCGCAGTTTGGGCGCCAAGCCATATGAAAGCATCACCGGCGAATGGCATTGTCTTCCC 40 | CAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGAATTTTGATGACTCTCGCAAA 41 | CGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGATAAGTGGTGTG 42 | AATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAA 43 | >gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA_4 44 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAA 45 | CGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGT 46 | GACCCTGATTTGTTGTTGGGCCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCC 47 | CGGCGCAGTTTGGGCGCCAAGCCATATGAAAGCATCACCGGCGAATGGCATTGTCTTCCC 48 | CAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGAATTTTGATGACTCTCGCAAA 49 | CGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGATAAGTGGTGTG 50 | AATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA 51 | GGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAAT 52 | >gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA_5 53 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAA 54 | CGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGT 55 | GACCCTGATTTGTTGTTGGGCCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCC 56 | CGGCGCAGTTTGGGCGCCAAGCCATATGAAAGCATCACCGGCGAATGGCATTGTCTTCCC 57 | CAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGAATTTTGATGACTCTCGCAAA 58 | CGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAG 59 | >gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and 
ITS1 and ITS2 DNA_6 60 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAA 61 | CGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGT 62 | GACCCTGATTTGTTGTTGGGCCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCC 63 | CGGCGCAGTTTGGGCGCCAAGCCATATGAAAGCATCACCGGCGAATGGCATTGTCTTCCC 64 | CAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGAATTTTGATGACTCTCGCAAA 65 | CGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGATAAGTGGTGTG 66 | AATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA 67 | GGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAATGCTTGCCCG 68 | GCATACAGCCAGGCCGGCGTGGTGCGGATGTGAAAGATTGGCCCCTTGTGCCTAGGTGCG 69 | GCGGGTCCAAGAGCTGGTGTTTTGATGGCCCGGAACCCGGCAAGAGGTGGACGGATGCTG 70 | GCAGCAGCTGCCGTGCGAATCCCCCATGTTGTCGTGCTTGTCGGAC 71 | >gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA_7 72 | CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAA 73 | CGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGT 74 | GACCCTGATTTGTTGTTGGGCCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCC 75 | CGGCGCAGTTTGGGCGCCAAGCCATATGAAAGCATCACCGGCGAATGGCATTGTCTTCCC 76 | CAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGAATTTTGATGACTCTCGCAAA 77 | CGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGATAAGTGGTGTG 78 | AATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACG 79 | -------------------------------------------------------------------------------- /out_file_1.fasta: -------------------------------------------------------------------------------- 1 | >gi|2765654|emb|Z78529.1|CLZ78529 C.lichiangense 5.8S rRNA gene and ITS1 and ITS2 DNA_0 2 | ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAATCTGGA 3 | GGACTTGTGGTTATTTGGCTCGCTAGGGATTTCCTTTTGTGGTGACCATGATTTGTCATT 4 | GGGCCTCATTGAGAGCTTTCATGGCGGGTTTGAACCTCTAGCACGGTGCAGTTTGCACCA 5 | AGGTATATAAAGAATCACCGATGAATGACATTATTGTCAAAAAAGTCGGAGGTGTGGTGT 6 | GTTATTGGTCATGCCAATGAATTGTTGATGACTCTCGCCGAGGGATATCTTGGCTCTTGC 7 | ATCGATGAAGAATCCCACCGAAATGTGATAAGTGGTGTGAATTGCAGAATCCCGTGAACC 8 | ATCGAGTCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCTAAGGGCACGCCTGCCTGG 9 | 
GCGTCGTATGTTTTATCTCTCCTTCCAATGCTTGTCCAGCATATAGCTAGGCCATCATTG 10 | TGTGGATGTGAAAGATTGGCCCCTTGTGCTTAGGTGCGGTGGGTCTAAGGATATGTGTTT 11 | TGATGGTCTGAAACTTGGCAAGAGGTGGAGGATGCTGGCAGCCGCAAGGCTATTGTTTGA 12 | ATCCCCCATGTTGTCATATTTGTTGGGCCTATAGAACAACTTGTTTGGACCCTAATTAAG 13 | GCA 14 | >gi|2765654|emb|Z78529.1|CLZ78529 C.lichiangense 5.8S rRNA gene and ITS1 and ITS2 DNA_1 15 | ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAATCTGGA 16 | GGACTTGTGGTTATTTGGCTCGCTAGGGATTTCCTTTTGTGGTGACCATGATTTGTCATT 17 | GGGCCTCATTGAGAGCTTTCATGGCGGGTTTGAACCTCTAGCACGGTGCAGTTTGCACCA 18 | AGGTATATAAAGAATCACCGATGAATGACATTATTGTCAAAAAAGTCGGAGGTGTGGTGT 19 | GTTATTGGTCATGCCAATGAATTGTTGATGACTCTCGCCGAGGGATATCTTGGCTCTTGC 20 | ATCGATGAAGAATCCCACCGAAATGTGATAAGTGGTGTGAATTGCAGAATCCCGTGAACC 21 | ATCGAGTCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCTAAGGGCACGCCTGCCTGG 22 | GCGTCGTATGTTTTATCTCTCCTTCCAATGCTTGTCCAGCATATAGCTAGGCCATCATTG 23 | TGTGGATGTGAAAGATTGGCCCCTTGTGCTT 24 | >gi|2765654|emb|Z78529.1|CLZ78529 C.lichiangense 5.8S rRNA gene and ITS1 and ITS2 DNA_2 25 | ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAATCTGGA 26 | GGACTTGTGGTTATTTGGCTCGCTAGGGATTTCCTTTTGTGGTGACCATGATTTGTCATT 27 | GGGCCTCATTGAGAGCTTTCATGGCGGGTTTGAACCTCTAGCACGGTGCAGTTTGCACCA 28 | AGGTATATAAAGAATCACCGATGAATGACATTATTGTCAAAAAAGTCGGAGGTGTGGTGT 29 | GTTATTGGTCATGCCAATGAATTGTTGATGACTCTCGCCGAGGGATATCTTGGCTCTTGC 30 | ATCGATGAAGAATCCCACCGAAATGTGATAAGTGGTGTGAATTGCAGAATCCCGTGAACC 31 | ATCGAGTCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCTAAGGGCACGCCTGCCTGG 32 | GCGTCGTATGTTTTATCTCTCCTTCCAATGCTTGTCCAGCATATAGCTAGGCCATCATTG 33 | TGTGGATGTGAAAGATTGGCCCCTTGTGCTTAGGTGCGGTGGGTCTAAGGATATGTGTTT 34 | TGATGGTCTGAAACTTGGCAAGAGGTGGAGGATGCTGGCAGCCGCAAGGCTATTGTTTGA 35 | ATCCCCCATGTTGTCATATTTGTTGGGCCTATAGAACAACTTGTTTGGACCCTAATTAAG 36 | GCAAAACAATCCTTGGGTGGTTGATTTCCAATCAGATGCGACCCCAGTCAGCGGGCCACC 37 | AGCTGAGCTAAAA 38 | >gi|2765654|emb|Z78529.1|CLZ78529 C.lichiangense 5.8S rRNA gene and ITS1 and ITS2 DNA_3 39 | ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAATCTGGA 40 | 
GGACTTGTGGTTATTTGGCTCGCTAGGGATTTCCTTTTGTGGTGACCATGATTTGTCATT 41 | GGGCCTCATTGAGAGCTTTCATGGCGGGTTTGAACCTCTAGCACGGTGCAGTTTGCACCA 42 | AGGTATATAAAGAATCACCGATGAATGACATTATTGTCAAAAAAGTCGGAGGTGTGGTGT 43 | GTTATTGGTCATGCCAATGAATTGTTGATGACTCTCGCCGAGGGATATCTTGGCTCTTGC 44 | ATCGATGAAGAATCCCACCGAAATGTGATAAGTGGTGTGAATTGCAGAATCCCGTGAACC 45 | ATCGAGTCTTTGAACGCAAGTTGCGCCCGAGG 46 | -------------------------------------------------------------------------------- /output.fasta: -------------------------------------------------------------------------------- 1 | >D10845.1 Capra hircus SRY gene, conserved motif, partial sequence 2 | CTCGTGAACGAAGACGAAAGGTGGCTCTAGAGAATCCCAAATTGCAAAACTCAGAGATCAGCAAGCAGCTGGGATACGAGTGGAAAAGGCTTACAGATGCTGAAAAGCGCCCATTCTTTGAGGAGGCACAGAGACTACTAGCTATA 3 | 4 | >D10847.1 Ovis ovis SRY gene, conserved motif, partial sequence 5 | CTCGTGAACGAAGACGAAAGGTGGCTCTAGAGAATCCCAAACTGCAAAACTCAGAGATCAGCAAGCAGCTGGGATACGAGTGGAAAAGGCTTACAGATGCTGAAAAGCGCCCATTCTTTGAGGAGGCACAGAGACTACTAGCTATA 6 | 7 | >AY179745.1 Ursus maritimus isolate PBX2818 SRY (SRY) gene, partial cds 8 | AACGCATTCATGGTGTGGTCTCGTGATCAAAGGCGCAAGGTGGCTCTAGAGAATCCCCAAATGCAAAACTCAGAGATCAGCAAGCAGCTGGGGTA 9 | 10 | -------------------------------------------------------------------------------- /output_bismark.txt: -------------------------------------------------------------------------------- 1 | chr1 136388 136473 0.5555556 NR_039983_6 2 | chr1 661865 661925 0.0 NR_028327_4 3 | chr1 762972 762972 0.0 NR_047526_1 4 | chr1 762972 762972 0.0 NR_015368_1 5 | chr1 762972 762972 0.0 NR_047519_1 6 | chr1 762972 762972 0.0 NR_047520_1 7 | chr1 762972 762972 0.0 NR_047521_1 8 | chr1 762972 762972 0.0 NR_047522_1 9 | chr1 762972 762972 0.0 NR_047523_1 10 | chr1 762972 762972 0.0 NR_047524_1 11 | chr1 805475 805523 0.1111111111111111 NR_027055_5 12 | chr1 861218 861360 0.5818181818181818 NM_152486_1 13 | chr1 863508 863556 0.8 NM_152486_3 14 | chr1 864803 864819 1.0 NM_152486_4 15 | chr1 865539 865539 0.0 NM_152486_5 16 | chr1 
867870 868611 1.0 NM_152486_8 17 | chr1 868684 868684 1.0 NM_152486_9 18 | chr1 872512 872519 0.75 NM_152486_13 19 | chr1 874638 874672 0.75 NM_152486_15 20 | chr1 875282 875787 0.02631578947368421 NM_152486_16 21 | chr1 876538 876643 0.0 NM_152486_17 22 | -------------------------------------------------------------------------------- /pairwise_local_seq_align.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 16, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "1 ATGTGACGATGCGA\n", 13 | " | | | |||||\n", 14 | "1 A----A-G-TGCGA\n", 15 | " Score=8\n", 16 | "\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "# Performing local pairwise sequence alignment using Biopython \n", 22 | "\n", 23 | "from Bio import pairwise2\n", 24 | "from Bio.pairwise2 import format_alignment\n", 25 | "\n", 26 | "# sequences to be aligned\n", 27 | "seq_1 = \"ATGTGACGATGCGAT\"\n", 28 | "seq_2 = \"AAGTGCGA\"\n", 29 | "\n", 30 | "# The names of the alignment functions in this module follow the convention <alignment type>XX \n", 31 | "# where <alignment type> is “local” here \n", 32 | "# and XX is a 2 character code indicating the parameters it takes \n", 33 | "alignments = pairwise2.align.localxx(seq_1, seq_2) \n", 34 | "\n", 35 | "# pairwise2.align.localxx(seq_1, seq_2) returns the list of all local alignments between the 2 sequences\n", 36 | "# to view the alignments in a proper format we use format_alignment method\n", 37 | "# we are just viewing the first alignment from the list of all the alignments\n", 38 | "# for local alignment, format_alignment method will only show the aligned parts of the sequences\n", 39 | "print(format_alignment(*alignments[0])) # using * to unpack alignment before passing it to format_alignment" 40 | ] 41 | } 42 | ], 43 | "metadata": { 44 | "kernelspec": { 45 | "display_name": "Python 3", 46 | "language": "python", 47 | "name": 
"python3" 48 | }, 49 | "language_info": { 50 | "codemirror_mode": { 51 | "name": "ipython", 52 | "version": 3 53 | }, 54 | "file_extension": ".py", 55 | "mimetype": "text/x-python", 56 | "name": "python", 57 | "nbconvert_exporter": "python", 58 | "pygments_lexer": "ipython3", 59 | "version": "3.6.4" 60 | } 61 | }, 62 | "nbformat": 4, 63 | "nbformat_minor": 2 64 | } 65 | -------------------------------------------------------------------------------- /pairwise_seq_align.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "AT-GTGAC-GATGC-GAT\n", 13 | "| ||| | |||| |||\n", 14 | "A-AGTG-CCGATG-TGAT\n", 15 | " Score=12\n", 16 | "\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "# Performing global pairwise sequence aligment using Biopython \n", 22 | "\n", 23 | "from Bio import pairwise2\n", 24 | "from Bio.pairwise2 import format_alignment\n", 25 | "\n", 26 | "# sequences to be aligned\n", 27 | "seq_1 = \"ATGTGACGATGCGAT\"\n", 28 | "seq_2 = \"AAGTGCCGATGTGAT\"\n", 29 | "\n", 30 | "# The names of the alignment functions in this module follow the convention XX \n", 31 | "# here is either “global” \n", 32 | "# and XX is a 2 character code indicating the parameters it takes \n", 33 | "alignments = pairwise2.align.globalxx(seq_1, seq_2) \n", 34 | "\n", 35 | "# pairwise2.align.globalxx(seq_1, seq_2) returns the list of all global alignments between the 2 sequences\n", 36 | "# to view the alignments in a proper format we use format_alignment method\n", 37 | "# we are just viewing the first alignment from the list of all the alignments\n", 38 | "# it is important to unpack each alignment using * before passing it to format_alignment method\n", 39 | "print(format_alignment(*alignments[0]))" 40 | ] 41 | } 42 | ], 43 | "metadata": { 44 | "kernelspec": { 45 
| "display_name": "Python 3", 46 | "language": "python", 47 | "name": "python3" 48 | }, 49 | "language_info": { 50 | "codemirror_mode": { 51 | "name": "ipython", 52 | "version": 3 53 | }, 54 | "file_extension": ".py", 55 | "mimetype": "text/x-python", 56 | "name": "python", 57 | "nbconvert_exporter": "python", 58 | "pygments_lexer": "ipython3", 59 | "version": "3.6.4" 60 | } 61 | }, 62 | "nbformat": 4, 63 | "nbformat_minor": 2 64 | } 65 | -------------------------------------------------------------------------------- /parsing_pdb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Extracting all amino acids with atom counts from PDB file\n", 10 | "\n", 11 | "amino_acids = {} # this dictionary will store the amino acids with their atom counts\n", 12 | "fr = open('demo.pdb','r') # opening PDB file in read mode\n", 13 | "# looping over each line in PDB file\n", 14 | "for line in fr:\n", 15 | " line = line.split() # separating each element in a single line\n", 16 | " id = line[0] # fetching the id\n", 17 | " if id == 'ATOM': # check if we are reading the atoms\n", 18 | " amino_acid = line[3] # fetching the amino acid\n", 19 | " atom = line[-1] # fetching the atom\n", 20 | " if amino_acid in amino_acids: # will execute if amino acid already present in amino_acids\n", 21 | " if atom in amino_acids[amino_acid]: # will excute if atom already in amino_acids[amino_acid]\n", 22 | " amino_acids[amino_acid][atom] += 1 # incrementing the count of atom by 1\n", 23 | " else: # this will excute when we encounter a new atom for same amino acid\n", 24 | " amino_acids[amino_acid][atom] = 1 # initialising the count of atom as 1 \n", 25 | " else: # this will execute when we encounter a new amino acid\n", 26 | " amino_acids[amino_acid] = {} # creating an empty dictionary inside our amino_acids dictionary\n", 27 
| " amino_acids[amino_acid][atom] = 1 # initialising the count of atom as 1\n", 28 | "print(amino_acids) " 29 | ] 30 | } 31 | ], 32 | "metadata": { 33 | "kernelspec": { 34 | "display_name": "Python 3", 35 | "language": "python", 36 | "name": "python3" 37 | }, 38 | "language_info": { 39 | "codemirror_mode": { 40 | "name": "ipython", 41 | "version": 3 42 | }, 43 | "file_extension": ".py", 44 | "mimetype": "text/x-python", 45 | "name": "python", 46 | "nbconvert_exporter": "python", 47 | "pygments_lexer": "ipython3", 48 | "version": "3.6.4" 49 | } 50 | }, 51 | "nbformat": 4, 52 | "nbformat_minor": 2 53 | } 54 | -------------------------------------------------------------------------------- /pdb1crn.dssp: -------------------------------------------------------------------------------- 1 | ==== Secondary Structure Definition by the program DSSP, CMBI version by M.L. Hekkelman/2010-10-21 ==== DATE=2020-12-14 . 2 | REFERENCE W. KABSCH AND C.SANDER, BIOPOLYMERS 22 (1983) 2577-2637 . 3 | HEADER PLANT PROTEIN 30-APR-81 1CRN . 4 | COMPND 2 MOLECULE: CRAMBIN; . 5 | SOURCE 2 ORGANISM_SCIENTIFIC: CRAMBE HISPANICA SUBSP. ABYSSINICA; . 6 | AUTHOR W.A.HENDRICKSON,M.M.TEETER . 7 | 46 1 3 3 0 TOTAL NUMBER OF RESIDUES, NUMBER OF CHAINS, NUMBER OF SS-BRIDGES(TOTAL,INTRACHAIN,INTERCHAIN) . 8 | 3007.3 ACCESSIBLE SURFACE OF PROTEIN (ANGSTROM**2) . 9 | 27 58.7 TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(J) , SAME NUMBER PER 100 RESIDUES . 10 | 0 0.0 TOTAL NUMBER OF HYDROGEN BONDS IN PARALLEL BRIDGES, SAME NUMBER PER 100 RESIDUES . 11 | 3 6.5 TOTAL NUMBER OF HYDROGEN BONDS IN ANTIPARALLEL BRIDGES, SAME NUMBER PER 100 RESIDUES . 12 | 0 0.0 TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I-5), SAME NUMBER PER 100 RESIDUES . 13 | 0 0.0 TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I-4), SAME NUMBER PER 100 RESIDUES . 14 | 0 0.0 TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I-3), SAME NUMBER PER 100 RESIDUES . 
15 | 0 0.0 TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I-2), SAME NUMBER PER 100 RESIDUES . 16 | 0 0.0 TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I-1), SAME NUMBER PER 100 RESIDUES . 17 | 0 0.0 TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I+0), SAME NUMBER PER 100 RESIDUES . 18 | 0 0.0 TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I+1), SAME NUMBER PER 100 RESIDUES . 19 | 1 2.2 TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I+2), SAME NUMBER PER 100 RESIDUES . 20 | 5 10.9 TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I+3), SAME NUMBER PER 100 RESIDUES . 21 | 15 32.6 TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I+4), SAME NUMBER PER 100 RESIDUES . 22 | 1 2.2 TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I+5), SAME NUMBER PER 100 RESIDUES . 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 *** HISTOGRAMS OF *** . 24 | 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 RESIDUES PER ALPHA HELIX . 25 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 PARALLEL BRIDGES PER LADDER . 26 | 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ANTIPARALLEL BRIDGES PER LADDER . 27 | 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 LADDERS PER SHEET . 
28 | # RESIDUE AA STRUCTURE BP1 BP2 ACC N-H-->O O-->H-N N-H-->O O-->H-N TCO KAPPA ALPHA PHI PSI X-CA Y-CA Z-CA 29 | 1 1 A T 0 0 75 0, 0.0 34,-2.8 0, 0.0 2,-0.4 0.000 360.0 360.0 360.0 147.7 17.0 12.8 4.3 30 | 2 2 A T E -A 34 0A 21 32,-0.2 2,-0.4 36,-0.1 32,-0.2 -0.848 360.0-172.4-107.8 144.3 13.9 11.5 6.1 31 | 3 3 A a E -A 33 0A 0 30,-2.4 30,-2.8 -2,-0.4 42,-0.2 -0.997 8.4-157.3-131.2 133.3 13.7 10.7 9.8 32 | 4 4 A b - 0 0 0 42,-2.5 42,-2.6 -2,-0.4 28,-0.2 -0.889 18.9-136.5-118.9 151.2 10.6 9.0 11.4 33 | 5 5 A P S S+ 0 0 54 0, 0.0 2,-0.3 0, 0.0 40,-0.3 0.623 84.6 3.5 -76.2 -19.0 9.4 9.0 15.0 34 | 6 6 A S S > S- 0 0 49 38,-0.1 4,-1.9 1,-0.1 5,-0.1 -0.954 76.2-102.4-157.9 166.0 8.7 5.3 15.3 35 | 7 7 A I H > S+ 0 0 120 -2,-0.3 4,-2.6 2,-0.2 5,-0.2 0.901 121.0 54.0 -63.6 -42.1 8.9 2.1 13.3 36 | 8 8 A V H > S+ 0 0 98 1,-0.2 4,-2.5 2,-0.2 5,-0.2 0.898 107.5 51.3 -55.9 -44.6 5.1 2.2 12.5 37 | 9 9 A A H > S+ 0 0 7 1,-0.2 4,-2.3 2,-0.2 -1,-0.2 0.910 111.0 47.6 -61.4 -43.8 5.6 5.8 11.1 38 | 10 10 A R H X S+ 0 0 56 -4,-1.9 4,-2.5 2,-0.2 -1,-0.2 0.906 111.1 50.4 -63.2 -43.3 8.5 4.6 8.8 39 | 11 11 A S H X S+ 0 0 64 -4,-2.6 4,-1.7 2,-0.2 -2,-0.2 0.911 112.8 47.1 -61.2 -42.4 6.5 1.6 7.6 40 | 12 12 A N H X S+ 0 0 82 -4,-2.5 4,-2.5 1,-0.2 -1,-0.2 0.877 110.2 53.6 -64.9 -39.5 3.5 3.9 6.8 41 | 13 13 A F H X S+ 0 0 5 -4,-2.3 4,-2.0 1,-0.2 -1,-0.2 0.908 106.5 51.2 -59.1 -47.2 5.9 6.4 5.1 42 | 14 14 A N H X S+ 0 0 87 -4,-2.5 4,-0.6 1,-0.2 -1,-0.2 0.870 111.8 47.2 -62.8 -35.2 7.3 3.6 2.8 43 | 15 15 A V H >< S+ 0 0 93 -4,-1.7 3,-1.3 2,-0.2 -2,-0.2 0.927 108.8 54.9 -69.2 -41.2 3.8 2.6 1.7 44 | 16 16 A c H 3<>S+ 0 0 19 -4,-2.5 5,-0.8 1,-0.3 -2,-0.2 0.862 107.5 51.8 -56.7 -36.0 2.9 6.3 1.1 45 | 17 17 A R H ><5S+ 0 0 94 -4,-2.0 3,-1.6 1,-0.2 -1,-0.3 0.673 84.9 88.5 -77.1 -16.1 5.9 6.5 -1.2 46 | 18 18 A L T <<5S+ 0 0 143 -3,-1.3 -1,-0.2 -4,-0.6 -2,-0.2 0.881 91.3 43.1 -53.2 -46.2 4.9 3.4 -3.3 47 | 19 19 A P T 3 5S- 0 0 109 0, 0.0 -1,-0.3 0, 0.0 -2,-0.1 0.550 126.2 -97.4 -77.2 -7.6 2.8 5.4 -5.8 
48 | 20 20 A G T < 5 + 0 0 52 -3,-1.6 -3,-0.2 1,-0.2 -2,-0.1 0.547 61.2 169.9 106.3 7.3 5.4 8.2 -6.0 49 | 21 21 A T < - 0 0 39 -5,-0.8 -1,-0.2 1,-0.1 5,-0.1 -0.260 44.7 -97.9 -52.7 136.3 3.8 10.6 -3.5 50 | 22 22 A P >> - 0 0 83 0, 0.0 4,-2.2 0, 0.0 3,-0.7 -0.194 26.6-120.9 -57.0 146.6 6.1 13.5 -2.7 51 | 23 23 A E H 3> S+ 0 0 69 1,-0.2 4,-2.5 2,-0.2 5,-0.1 0.872 110.6 61.4 -56.4 -36.2 8.1 13.1 0.5 52 | 24 24 A A H 3> S+ 0 0 62 1,-0.2 4,-1.7 2,-0.2 -1,-0.2 0.876 106.9 44.1 -63.4 -34.9 6.6 16.3 1.9 53 | 25 25 A I H <> S+ 0 0 99 -3,-0.7 4,-1.8 2,-0.2 -1,-0.2 0.907 112.8 51.0 -74.8 -37.9 3.1 14.9 1.8 54 | 26 26 A c H X S+ 0 0 0 -4,-2.2 4,-1.9 2,-0.2 6,-0.4 0.848 106.4 58.4 -64.9 -31.7 4.2 11.5 3.2 55 | 27 27 A A H X S+ 0 0 12 -4,-2.5 4,-2.7 -5,-0.2 5,-0.5 0.957 110.0 39.9 -62.0 -54.0 5.9 13.5 6.0 56 | 28 28 A T H < S+ 0 0 120 -4,-1.7 -1,-0.2 1,-0.2 -2,-0.2 0.830 113.9 56.1 -68.8 -25.5 2.7 15.2 7.2 57 | 29 29 A Y H < S+ 0 0 173 -4,-1.8 -1,-0.2 -5,-0.2 -2,-0.2 0.894 123.2 22.6 -67.6 -36.3 0.7 12.0 6.7 58 | 30 30 A T H < S- 0 0 22 -4,-1.9 -2,-0.2 -3,-0.2 -3,-0.2 0.628 101.8-110.7-109.0 -18.5 3.0 10.0 9.0 59 | 31 31 A G S < S+ 0 0 37 -4,-2.7 -3,-0.2 1,-0.4 -4,-0.1 0.338 74.1 133.2 91.8 -3.1 4.8 12.3 11.4 60 | 32 32 A b - 0 0 5 -5,-0.5 -1,-0.4 -6,-0.4 2,-0.3 -0.337 44.9-142.4 -69.5 164.4 8.1 11.7 9.6 61 | 33 33 A I E -A 3 0A 51 -30,-2.8 -30,-2.4 -3,-0.1 2,-0.5 -0.885 6.3-134.8-129.8 157.0 10.3 14.8 8.8 62 | 34 34 A I E -A 2 0A 78 -2,-0.3 -32,-0.2 -32,-0.2 3,-0.0 -0.975 25.5-178.3-111.6 129.6 12.6 15.9 6.0 63 | 35 35 A I - 0 0 35 -34,-2.8 5,-0.0 -2,-0.5 -2,-0.0 -0.898 32.4-131.9-124.1 158.1 15.9 17.5 6.9 64 | 36 36 A P S S+ 0 0 141 0, 0.0 -1,-0.1 0, 0.0 -34,-0.1 0.804 89.0 59.3 -78.2 -24.4 18.6 18.9 4.7 65 | 37 37 A G S S- 0 0 41 2,-0.1 -2,-0.2 -36,-0.1 3,-0.0 0.011 85.1-118.1 -89.7-161.7 21.5 17.0 6.5 66 | 38 38 A A S S+ 0 0 81 1,-0.1 2,-0.7 2,-0.0 -36,-0.1 0.404 83.1 88.8-120.8 1.2 22.0 13.2 7.0 67 | 39 39 A T - 0 0 127 -38,-0.1 -2,-0.1 -36,-0.0 -1,-0.1 -0.889 
62.5-154.0-114.6 104.5 21.9 12.9 10.8 68 | 40 40 A a - 0 0 44 -2,-0.7 -38,-0.0 -38,-0.1 -2,-0.0 -0.513 23.2-112.6 -75.5 145.4 18.5 12.3 12.3 69 | 41 41 A P > - 0 0 56 0, 0.0 3,-2.1 0, 0.0 -1,-0.1 -0.214 29.4-103.2 -71.2 162.7 17.9 13.4 15.9 70 | 42 42 A G G > S+ 0 0 76 1,-0.3 3,-0.5 2,-0.1 -2,-0.1 0.710 119.9 55.0 -61.9 -23.1 17.3 11.0 18.7 71 | 43 43 A D G 3 S+ 0 0 116 1,-0.2 -1,-0.3 2,-0.1 3,-0.1 0.477 111.7 45.3 -89.7 -2.0 13.6 11.6 18.8 72 | 44 44 A Y G < S+ 0 0 68 -3,-2.1 -40,-0.3 1,-0.1 -1,-0.2 -0.513 75.2 140.6-130.3 60.0 13.3 10.7 15.1 73 | 45 45 A A < 0 0 71 -3,-0.5 -3,-0.1 -40,-0.3 -1,-0.1 0.452 360.0 360.0 -88.8 -2.5 15.4 7.7 15.2 74 | 46 46 A N 0 0 76 -42,-2.6 -42,-2.5 -3,-0.1 -39,-0.1 -0.256 360.0 360.0-112.9 360.0 13.5 5.4 12.9 75 | -------------------------------------------------------------------------------- /phred_to_Q.py: -------------------------------------------------------------------------------- 1 | 2 | # function Qtophred to convert Q value to phred score 3 | # function phredtoQ to convert phred score to Q value 4 | 5 | def Qtophred(Q): 6 | return chr(Q+33) 7 | 8 | print(Qtophred(10)) 9 | 10 | 11 | def phredtoQ(phred): 12 | return (ord(phred)-33) 13 | 14 | print(phredtoQ("+")) 15 | 16 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 17 | ##your personal queries are also invited 18 | 19 | 20 | -------------------------------------------------------------------------------- /primary_str_from_pdb.fasta: -------------------------------------------------------------------------------- 1 | >>5H7A:A 2 | KFNKEQQNAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILS 3 | LPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQS 4 | LKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEA 5 | KKLNDAQA 6 | >>5H7A:B 7 | MKFNKEQQNAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEIL 8 | SLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQ 9 | 
SLKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAE 10 | AKKLNDAQA 11 | >>5H7A:C 12 | MKFNKEQQNAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEIL 13 | SLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQ 14 | SLKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAE 15 | AKKLNDAQA 16 | >>5H7A:D 17 | KFNKEQQNAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILS 18 | LPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQS 19 | LKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEA 20 | KKLNDAQA 21 | >>5H7A:E 22 | FNKEQQNAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILSL 23 | PNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSL 24 | KDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAK 25 | KLNDAQA 26 | >>5H7A:F 27 | KFNKEQQNAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILS 28 | LPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQS 29 | LKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEA 30 | KKLNDAQA 31 | >>5H7A:G 32 | KFNKEQQNAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILS 33 | LPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQS 34 | LKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEA 35 | KKLNDAQA 36 | >>5H7A:H 37 | FNKEQQNAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILSL 38 | PNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSL 39 | KDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAK 40 | KLNDAQA 41 | >>5H7A:I 42 | KFNKEQQNAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILS 43 | LPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQS 44 | LKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEA 45 | KKLNDAQA 46 | >>5H7A:J 47 | FNKEQQNAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILSL 48 | PNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSL 49 | KDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAK 50 | KLNDAQA 51 | >>5H7A:K 52 | FNKEQQNAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILSL 53 | 
PNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSL 54 | KDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAK 55 | KLNDAQA 56 | >>5H7A:L 57 | NKEQQNAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILSLP 58 | NLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSLK 59 | DDPSQSANLLAEAKKLNEQQAAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKK 60 | LNDAQA 61 | -------------------------------------------------------------------------------- /python_for_bioinformatics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ashish438/Python-for-Bioinformatics/09f641aa75c924027a1f8e1b3672e13b42612e96/python_for_bioinformatics.pdf -------------------------------------------------------------------------------- /query_cover.py: -------------------------------------------------------------------------------- 1 | # finding query coverage from the output file of offline blast and writing the output of this code to a different file 2 | 3 | input_file="path of your blast output file" 4 | 5 | l1=input_file.split("/") 6 | file_name=l1[len(l1)-1] 7 | 8 | out_path=input_file.strip(file_name) 9 | out_file_path=out_path+"output_"+file_name 10 | 11 | fr=open(input_file,"r") 12 | 13 | fw=open(out_file_path,"w") 14 | 15 | for line in fr: 16 | col=line.split("\t") 17 | EML=int(col[3])-int(col[4])-int(col[5]) 18 | QL=int(col[12]) 19 | QC=(EML/QL)*100 20 | #print(QC) 21 | line=line.strip("\n") 22 | nl=line+"\t"+str(QC)+"\n" 23 | #print(nl) 24 | fw.write(nl) 25 | 26 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 27 | ##your personal queries are also invited 28 | 29 | 30 | -------------------------------------------------------------------------------- /query_cover_filter.py: -------------------------------------------------------------------------------- 1 | 2 | # finding query coverage with filter from the output file of offline blast and writing the 
output of this code to a different file 3 | 4 | input_path="path of your blast output file" 5 | 6 | input_path_list=input_path.split("/") 7 | input_filename=input_path_list[-1] 8 | 9 | choice = input("Please enter your choice as 1 or 2: ") 10 | if choice == 1: 11 | out_file_path=input_path.strip(i_filename)+"eq_100"+"output_"+input_filename 12 | 13 | elif choice == 2: 14 | out_file_path = input_path.strip(i_filename) + "less_than_100_"+"output_" + input_filename 15 | 16 | else: 17 | out_file_path = input_path.strip(i_filename) + "output_" + input_filename 18 | 19 | o_file=open(out_file_path,"w") 20 | 21 | for line in i_file: 22 | col=line.split("\t") 23 | x=int(col[3])-int(col[4])-int(col[5]) 24 | y=int(col[12]) 25 | qc=(x/y)*100 26 | qc_line = (line.strip("\n")) + "\t" + str(qc) + "\n" 27 | if choice == 1: 28 | if qc == 100: 29 | o_file.write(qc_line) 30 | elif choice ==2: 31 | if qc < 100: 32 | o_file.write(qc_line) 33 | else : 34 | o_file.write(qc_line) 35 | 36 | i_file.close() 37 | o_file.close() 38 | 39 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 40 | ##your personal queries are also invited 41 | 42 | 43 | -------------------------------------------------------------------------------- /read1.py: -------------------------------------------------------------------------------- 1 | #reading entire fasta file at once using read() 2 | 3 | fr = open("path of your fasta file here","r") 4 | entire_file = fr.read() 5 | print(entire_file) 6 | fr.close() 7 | 8 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 9 | ##your personal queries are also invited 10 | -------------------------------------------------------------------------------- /read2.py: -------------------------------------------------------------------------------- 1 | # reading a fasta file line by line using readline() 2 | fr = open("path of your fasta file here"."r") 3 | 
first_line=fr.readline() 4 | sec_line=fr.readline() 5 | third_line=fr.readline() 6 | print(first_line) 7 | print(sec_line) 8 | print(third_line) 9 | fr.close() 10 | 11 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 12 | ##your personal queries are also invited 13 | -------------------------------------------------------------------------------- /read3.py: -------------------------------------------------------------------------------- 1 | # reading a fasta file line by line using for: 2 | 3 | 4 | fr = open("path of your fasta file here","r") 5 | 6 | for line in fr: 7 | print(line) 8 | 9 | fr.close() 10 | 11 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 12 | ##your personal queries are also invited 13 | 14 | -------------------------------------------------------------------------------- /reading_fastq.py: -------------------------------------------------------------------------------- 1 | # function read_fastq for fetching the reads and phred scores from a fastq file into two separate lists 2 | 3 | def read_fastq(input): 4 | sequences = [] 5 | qualities = [] 6 | fr = open(input,"r") 7 | while(True): 8 | fr.readline() 9 | seq = fr.readline() 10 | fr.readline() 11 | qual = fr.readline() 12 | if len(seq) == 0: 13 | break 14 | sequences.append(seq) 15 | qualities.append(qual) 16 | return sequences,qualities 17 | 18 | seqs,quals = read_fastq("path of your fastq file") 19 | print(seqs[:5]) 20 | print(quals[:5]) 21 | 22 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 23 | ##your personal queries are also invited 24 | 25 | 26 | -------------------------------------------------------------------------------- /remove_seq.py: -------------------------------------------------------------------------------- 1 | # Removing all sequences(contigs) less than a particular length from a fasta 
file derived from metagenomic data 2 | # Here the length of the sequences to be removed is taken as 300 bp 3 | # Type "python remove_seq.py -i Path/of/your/fasta/file -l Cut off length of the sequences to be removed 4 | # - o Output file name with extension" for running the code 5 | # Example: python remove_seq.py -i C:\Users\dell\PycharmProject\fb_page\metagenomic.fasta -l 300 6 | # -o metagenomic_out.fasta 7 | # Type "python remove_seq.py -h" for help/usage description 8 | # Output: Fasta file with only those sequences which are greater than or equal to the specified length(e.g. 300) 9 | # Sample input file: metagenomic.fasta 10 | # Sample output file: metagenomic_out.fasta 11 | 12 | import argparse 13 | from pathlib import Path 14 | from Bio import SeqIO 15 | 16 | parser = argparse.ArgumentParser(description="Removing all sequences(contigs) less than a particular length from a fasta file", 17 | usage= "remove_seq.py -i path/to/fasta/file -l Cut off length of the sequences to be removed -o Output filename with extension") 18 | 19 | parser.add_argument("-i", help="ENTER FULL PATH OF THE FASTA FILE") 20 | parser.add_argument("-l", help="ENTER CUT OFF LENGTH OF THE SEQUENCES TO BE REMOVED", type=int) 21 | parser.add_argument("-o", help="ENTER OUTPUT FILE NAME WITH EXTENSION") 22 | args = parser.parse_args() 23 | 24 | fasta_path = Path(args.i) 25 | cut_off_length = args.l 26 | 27 | # generating out file path using the fasta path 28 | out_filepath = fasta_path.parent/args.o 29 | 30 | # parsing all the fasta sequences as seq records in the variable 'fasta_records' 31 | fasta_records = SeqIO.parse(fasta_path,"fasta") 32 | 33 | # an empty list to store the names of all the removed sequences 34 | removed_seq = [] 35 | 36 | with open(out_filepath, "w") as fw: 37 | # looping over each fasta record stored inside 'fasta_records' 38 | for record in fasta_records: 39 | print(len(record.seq)) 40 | # checking for length of each fasta sequence 41 | if len(record.seq) >= 
cut_off_length: 42 | # writing the required fasta sequences to output file 43 | SeqIO.write(record, fw, format="fasta") 44 | fw.write("\n") 45 | else: 46 | removed_seq.append(record.description) 47 | 48 | print("The following sequences have been removed: ") 49 | for seq_name in removed_seq: 50 | print(seq_name) 51 | 52 | print(f"Output file successfully created at {out_filepath}") 53 | 54 | ## please contact us if you face any issues while using the code 55 | ## your personal queries are also invited 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /renaming_fastq_headers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# RENAMING THE HEADERS OF A FASTQ FILE\n", 10 | "# THE FOLLOWING ARE THE SAMPLE FIRST TWO SEQUENCES OF THE FASTQ FILE:\n", 11 | "# @No name\n", 12 | "# CAAGCGTGTCACCTATACCCCTCCGCCGGGGCAAAA\n", 13 | "# +\n", 14 | "# ????????DDBDDDDDFFFFF9CEEHCECHHHBFHFH\n", 15 | "# @No name\n", 16 | "# CCAACTGCTGTTCACACGGAACCTTTCCCCACTTCAG\n", 17 | "# +\n", 18 | "# BBBDDBDDDDDFFFFEFIIIIHIIIHHIIHHHHIIIF\n", 19 | "\n", 20 | "# THE SEQUENCES WILL BE RENAMED AS FOLLOWS GIVING A UNIQUE IDENTITY TO EACH READ:\n", 21 | "# @1_1\n", 22 | "# CAAGCGTGTCACCTATACCCCTCCGCCGGGGCAAAA\n", 23 | "# +\n", 24 | "# ????????DDBDDDDDFFFFF9CEEHCECHHHBFHFH\n", 25 | "# @1_2\n", 26 | "# CCAACTGCTGTTCACACGGAACCTTTCCCCACTTCAG\n", 27 | "# +\n", 28 | "# BBBDDBDDDDDFFFFEFIIIIHIIIHHIIHHHHIIIF" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 49, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# ENTER THE NAME/PATH OF YOUR INPUT FASTQ FILE \n", 38 | "input_fastq_file = r\"D:\\Python_for_Bioinformatics\\Facebook_Linkedin_content\\py\\noname_fastq.fastq\"\n", 39 | "\n", 40 | "# ENTER THE NAME/PATH OF YOUR 
OUTPUT FASTQ FILE\n", 41 | "output_fastq_file = r\"D:\\Python_for_Bioinformatics\\Facebook_Linkedin_content\\py\\withname_fastq.fastq\"\n", 42 | "\n", 43 | "# The generator function takes in the input fastq file, reads all the sequences as SeqRecord Objects\n", 44 | "# and modifies these attributes \"id\", \"name\", \"description\" to the desired header\n", 45 | "def generator_func(input_fastq_file):\n", 46 | " from Bio import SeqIO \n", 47 | " fr = open(input_fastq_file,\"r\")\n", 48 | " seq_records = SeqIO.parse(fr,\"fastq\")\n", 49 | " i,j = 1,1\n", 50 | " for seq_record in seq_records:\n", 51 | " header = str(i)+\"_\"+str(j) # this will be equal to 1_1 for first sequence, 1_2 for second sequence and so on \n", 52 | " seq_record.id = header\n", 53 | " seq_record.name = header\n", 54 | " seq_record.description = header\n", 55 | " j += 1\n", 56 | " yield seq_record \n", 57 | " fr.close()\n", 58 | "\n", 59 | "fw = open(output_fastq_file,\"w\")\n", 60 | "modified_seq_records = generator_func(input_fastq_file) # function call\n", 61 | "\n", 62 | "#WRITING THE MODIFIED SEQUENCE HEADERS OBJECTS TO OUR OUT FILE\n", 63 | "for modified_seq_record in modified_seq_records:\n", 64 | " SeqIO.write(modified_seq_record, fw, \"fastq\")\n", 65 | "fw.close()\n" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.6.4" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 2 90 | } 91 | -------------------------------------------------------------------------------- /rev_comp_multi_fasta.fasta: -------------------------------------------------------------------------------- 1 | 
>NC_000024.10:c2787682-2786855 Homo sapiens chromosome Y, GRCh38.p13 Primary Assembly(reverse_complement) 2 | TGAAATGAATAAGGCCTTTATTAGCCAGAGAAAAGAAAACAATATTGAAACTAAACATAA 3 | GAAAGTGAGGGCTGTAAGTTATCGTAAAAAGGAGCATCTAGGTAGGTCTTTGTAGCCAAT 4 | GTTACCCGATTGTCCTACAGCTTTGTCCAGTGGCTGTAGCGGTCCCGTTGCTGCGGTGAG 5 | CTGGCTGCGTTGATGGGCGGTAAGTGGCCTAGCTGGTGCTCCATTCTTGAGTGTGTGGCT 6 | TTCGTACAGTCATCCCTGTACAACCTGTTGTCCAGTTGCACTTCGCTGCAGAGTACCGAA 7 | GCGGGATCTGCGGGAAGCAAACTGCAATTCTTCGGCAGCATCTTCGCCTTCCGACGAGGT 8 | CGATACTTATAATTCGGGTATTTCTCTCTGTGCATGGCCTGTAATTTCTGTGCCTCCTGG 9 | AAGAATGGCCATTTTTCGGCTTCAGTAAGCATTTTCCACTGGTATCCCAGCTGCTTGCTG 10 | ATCTCTGAGTTTCGCATTCTGGGATTCTCTAGAGCCATCTTGCGCCTCTGATCGCGAGAC 11 | CACACGATGAATGCGTTCATGGGTCGCTTCACTCTATCCTGGACGTTGCCTTTACTGTTT 12 | TCTCCCGTTTCACACTGATACTTAGAGTTACAGCTTTCAGTGCAAAGGAAGGAAGAGCTT 13 | CTCCGGAGAGCGGGAATATTCTCTTGCACAGCTGGACTGTAATCATCGCTGTTGAATACG 14 | CTTAACATAGCAGAAGCATATGATTGCATTGTCAAAAACAAGGAGAGTGCGACAAAATTG 15 | AAAGGTGCCAGAGTTCGAAACTTATTTTACTATCCAAAACTCACTTCT 16 | >NC_000087.7:c2663658-2662471 Mus musculus strain C57BL/6J chromosome Y, GRCm38.p6 C57BL/6J(reverse_complement) 17 | TCATGAGACTGCCAACCACAGGGCTGTGCTGAGGTGCTCCTGGTATGGTGTATGCTCACC 18 | AGTGATGTCAGCTGTTAGTAAGTAGGTAAGCTGCTGGTCGTGGAACTGCTGCTGCTGCTG 19 | CTGCTGCTGCTGCTGCTGCTGCTGGTGGTGGTCATGGAACTGCTGTTGCTGCTGGTGGTG 20 | GTCATGGAACTGCTGCTTCTGCTGGTGGTGGTCATGGAACTGCTGCTTCTGCTGGTGGTG 21 | GTCATGGAACTGCTGCTTCTGCTGCTGCTGGTGGTGGTGGTCATGGAACTGCTGTTGCTG 22 | CTGGGGGTGGTCATGGAACTGCTGCTTCTGCTGGGGGTGGTCATGGAACTGATGCTGCTG 23 | CTGCTGGTGGTCATGGAACTGCTGCTGCTGCTGCTGGTGGTCATGGAACTGCTGCTGCTG 24 | TTGGTGGTGGTGGTGGTCATGGAACTGCTGCTTCTGCTGGTGGTGGTCATGGAACTGCTG 25 | CTGCTGCTGCTGCTGCTGCTGCTGCTGGTGGTCATGGAACTGCTGTTGCTGCTGGTGGTG 26 | GTCATGGAACTGCTGCTCCTGGTGGTGGTGGTGGTGGTCATGGAACTGCTGTTGCTGCTG 27 | GTGGTGGTCATGAAACTGCTGCTTCTGCTGGTGGTGGTCATGGAACTGCTGCTGCTGCTG 28 | CTGCTGCTGCTGCTGCTGCTGGTGGTGGTCATAGAACTGCTGTTGCTGCTGGTGGTGGTT 29 | ATGGAACTGCTGCTGCTGCTGCTGCTGCTGCTGCTGCTGCAGGTGCCCAGTGGGGATATC 30 | 
AACAGGCTGCCAATAAAAGCTTTGCTGGTTTTTGGAGTACAGGTGTGCAGCTCTACTCCA 31 | GTCTTGCCTGTATGTGATGGCATGTGGGTTCCTGTCCCACTGCAGAAGGTTGTACAGTTT 32 | TGTTGAGGCAACTGCAGGCTGTAAAATGCCACTCCTCTGTGACACTTTAGCCCTCCGATG 33 | AGGCTGATATTTATAGTTTGGGTATTTCTCTCTGTGTAGGATCTTCAATCTCTGTGCCTC 34 | CTGGAAAAAGGGCCTTTTTTCGGCTTCTGTAAGGCTTTTCCACCTGCATCCCAGCTGCTT 35 | GCTGATCTCTGTATTTTGCATGCTGGGATTCTGCTGGGCCAACTTGTGCCTCTCACCACG 36 | GGACCACACCATAAATGCATTCATGGGGCGCTTGACATGGCCCTCCAT 37 | >NC_024475.1:327176-327685 Rattus norvegicus strain mixed chromosome Y, Rnor_6.0(reverse_complement) 38 | CTAGTGGAACTGGTGCTGCTGCTGCTGCTGCTGCTGCAGGTGCACGTGCTGCTGCTGCTG 39 | GTGCTGCTGTTTCTGCTGTAGTGGGTATCCAGTGGGGATGTCCACAGGCTGTAAATAAAT 40 | GCTTTTCTGGTTCTTGGAGGACTGGTGTGCAGCTCTAGCCCAGTCCTGTCCGTATATGAT 41 | AGTGTGTAGGTTGTTGTCCCATTGCAGCAGGTTGTACAGTTTTGTTGAGGCAACTTCACG 42 | CTGCAAAGTATAACTCCTCTGTGGCACTTTAACCCTTCGATGAGGCTGATATTTATAGTT 43 | TGGATATTTCTCTCTGTGTAGGGTCTTCAGTCTCTGCGCCTCCTGGAAAAAGGGCCTTTT 44 | TTCGGCTTCTGTAAGGCTTTTCCACTGATATCCCAGATGCTTGCTGATCTCTGAATTCTG 45 | CATGCTGGGATTCTGTTGAGCCAACTTGCGCCTCTCTCCACGGGACCACACCATAAATGC 46 | ATTCATGGGGCGCTTGACATGGCCCTCCAT 47 | -------------------------------------------------------------------------------- /rev_comp_multi_fasta.py: -------------------------------------------------------------------------------- 1 | # Finding reverse complement of all the sequences in a multi fasta using biopython 2 | 3 | # Type: "python rev_comp_multi_fasta.py -i Path of your multi fasta file -o Output file name with extension" for running the code 4 | # Example: python rev_comp_multi_fasta.py -i C:\Users\dell\Desktop\test\sample_multi_fasta.fasta -o rev_comp_multi_fasta.fasta 5 | 6 | # Type: "python rev_comp_multi_fasta.py -h" for help/usage description 7 | 8 | # Output: A fasta file having reverse complement of all the sequences 9 | 10 | # Sample input file: sample_multi_fasta.fasta 11 | # Sample output file: rev_comp_multi_fasta.fasta 12 | 13 | import argparse 14 | from Bio import SeqIO 15 | from 
pathlib import Path 16 | 17 | parser=argparse.ArgumentParser(description="Finding reverse complement of all the sequences in a multi fasta, USAGE: python rev_comp_multi_fasta.py -i Path/to/multi/fasta/file -o Output.fasta") 18 | parser.add_argument("-i", help="ENTER FULL PATH OF THE MULTI FASTA FILE") 19 | parser.add_argument("-o", help="OUTPUT FILE NAME WITH EXTENSION") 20 | args = parser.parse_args() 21 | 22 | input_path = Path(args.i) 23 | out_path = input_path.parent/args.o 24 | 25 | fasta_records = SeqIO.parse(input_path,"fasta") 26 | new_records = [] 27 | 28 | for fasta_record in fasta_records: 29 | fasta_record.description = fasta_record.description + "(reverse_complement)" 30 | fasta_record.seq = fasta_record.seq.reverse_complement() 31 | new_records.append(fasta_record) 32 | 33 | with open(out_path,"w") as fw: 34 | SeqIO.write(new_records,fw,"fasta") 35 | 36 | print("Output file successfully generated at: ",out_path) 37 | 38 | ## please leave a message if you face any issues while using the code 39 | ## your personal queries are also invited 40 | 41 | -------------------------------------------------------------------------------- /rev_complement.py: -------------------------------------------------------------------------------- 1 | 2 | # program to find the reverse complement of a dna sequence 3 | 4 | str1 = "ATCGATCGATCGATCG" 5 | complement = "" 6 | for i in range(len(str1)): 7 | if (str1[i] == "A"): 8 | complement += "T" 9 | elif (str1[i] == "G"): 10 | complement += "C" 11 | elif (str1[i] == "C"): 12 | complement += "G" 13 | elif (str1[i] == "T"): 14 | complement += "A" 15 | rev_complement = complement[::-1] 16 | print(rev_complement) 17 | 18 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 19 | ##your personal queries are also invited 20 | 21 | 22 | -------------------------------------------------------------------------------- /sample_multi_fasta.fasta: 
-------------------------------------------------------------------------------- 1 | >NC_000024.10:c2787682-2786855 Homo sapiens chromosome Y, GRCh38.p13 Primary Assembly 2 | AGAAGTGAGTTTTGGATAGTAAAATAAGTTTCGAACTCTGGCACCTTTCAATTTTGTCGCACTCTCCTTG 3 | TTTTTGACAATGCAATCATATGCTTCTGCTATGTTAAGCGTATTCAACAGCGATGATTACAGTCCAGCTG 4 | TGCAAGAGAATATTCCCGCTCTCCGGAGAAGCTCTTCCTTCCTTTGCACTGAAAGCTGTAACTCTAAGTA 5 | TCAGTGTGAAACGGGAGAAAACAGTAAAGGCAACGTCCAGGATAGAGTGAAGCGACCCATGAACGCATTC 6 | ATCGTGTGGTCTCGCGATCAGAGGCGCAAGATGGCTCTAGAGAATCCCAGAATGCGAAACTCAGAGATCA 7 | GCAAGCAGCTGGGATACCAGTGGAAAATGCTTACTGAAGCCGAAAAATGGCCATTCTTCCAGGAGGCACA 8 | GAAATTACAGGCCATGCACAGAGAGAAATACCCGAATTATAAGTATCGACCTCGTCGGAAGGCGAAGATG 9 | CTGCCGAAGAATTGCAGTTTGCTTCCCGCAGATCCCGCTTCGGTACTCTGCAGCGAAGTGCAACTGGACA 10 | ACAGGTTGTACAGGGATGACTGTACGAAAGCCACACACTCAAGAATGGAGCACCAGCTAGGCCACTTACC 11 | GCCCATCAACGCAGCCAGCTCACCGCAGCAACGGGACCGCTACAGCCACTGGACAAAGCTGTAGGACAAT 12 | CGGGTAACATTGGCTACAAAGACCTACCTAGATGCTCCTTTTTACGATAACTTACAGCCCTCACTTTCTT 13 | ATGTTTAGTTTCAATATTGTTTTCTTTTCTCTGGCTAATAAAGGCCTTATTCATTTCA 14 | 15 | >NC_000087.7:c2663658-2662471 Mus musculus strain C57BL/6J chromosome Y, GRCm38.p6 C57BL/6J 16 | ATGGAGGGCCATGTCAAGCGCCCCATGAATGCATTTATGGTGTGGTCCCGTGGTGAGAGGCACAAGTTGG 17 | CCCAGCAGAATCCCAGCATGCAAAATACAGAGATCAGCAAGCAGCTGGGATGCAGGTGGAAAAGCCTTAC 18 | AGAAGCCGAAAAAAGGCCCTTTTTCCAGGAGGCACAGAGATTGAAGATCCTACACAGAGAGAAATACCCA 19 | AACTATAAATATCAGCCTCATCGGAGGGCTAAAGTGTCACAGAGGAGTGGCATTTTACAGCCTGCAGTTG 20 | CCTCAACAAAACTGTACAACCTTCTGCAGTGGGACAGGAACCCACATGCCATCACATACAGGCAAGACTG 21 | GAGTAGAGCTGCACACCTGTACTCCAAAAACCAGCAAAGCTTTTATTGGCAGCCTGTTGATATCCCCACT 22 | GGGCACCTGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGTTCCATAACCACCACCAGCAGCAACAGC 23 | AGTTCTATGACCACCACCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGTTCCATGACCACCACCA 24 | GCAGAAGCAGCAGTTTCATGACCACCACCAGCAGCAACAGCAGTTCCATGACCACCACCACCACCACCAG 25 | GAGCAGCAGTTCCATGACCACCACCAGCAGCAACAGCAGTTCCATGACCACCAGCAGCAGCAGCAGCAGC 26 | 
AGCAGCAGCAGCAGTTCCATGACCACCACCAGCAGAAGCAGCAGTTCCATGACCACCACCACCACCAACA 27 | GCAGCAGCAGTTCCATGACCACCAGCAGCAGCAGCAGCAGTTCCATGACCACCAGCAGCAGCAGCATCAG 28 | TTCCATGACCACCCCCAGCAGAAGCAGCAGTTCCATGACCACCCCCAGCAGCAACAGCAGTTCCATGACC 29 | ACCACCACCAGCAGCAGCAGAAGCAGCAGTTCCATGACCACCACCAGCAGAAGCAGCAGTTCCATGACCA 30 | CCACCAGCAGAAGCAGCAGTTCCATGACCACCACCAGCAGCAACAGCAGTTCCATGACCACCACCAGCAG 31 | CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGTTCCACGACCAGCAGCTTACCTACTTACTAACAGCTG 32 | ACATCACTGGTGAGCATACACCATACCAGGAGCACCTCAGCACAGCCCTGTGGTTGGCAGTCTCATGA 33 | 34 | >NC_024475.1:327176-327685 Rattus norvegicus strain mixed chromosome Y, Rnor_6.0 35 | ATGGAGGGCCATGTCAAGCGCCCCATGAATGCATTTATGGTGTGGTCCCGTGGAGAGAGGCGCAAGTTGG 36 | CTCAACAGAATCCCAGCATGCAGAATTCAGAGATCAGCAAGCATCTGGGATATCAGTGGAAAAGCCTTAC 37 | AGAAGCCGAAAAAAGGCCCTTTTTCCAGGAGGCGCAGAGACTGAAGACCCTACACAGAGAGAAATATCCA 38 | AACTATAAATATCAGCCTCATCGAAGGGTTAAAGTGCCACAGAGGAGTTATACTTTGCAGCGTGAAGTTG 39 | CCTCAACAAAACTGTACAACCTGCTGCAATGGGACAACAACCTACACACTATCATATACGGACAGGACTG 40 | GGCTAGAGCTGCACACCAGTCCTCCAAGAACCAGAAAAGCATTTATTTACAGCCTGTGGACATCCCCACT 41 | GGATACCCACTACAGCAGAAACAGCAGCACCAGCAGCAGCAGCACGTGCACCTGCAGCAGCAGCAGCAGC 42 | AGCAGCACCAGTTCCACTAG 43 | 44 | -------------------------------------------------------------------------------- /seq_from_fasta_using_id_1.py: -------------------------------------------------------------------------------- 1 | # Extracting sequences of our choice from a multi fasta file using headers 2 | # All the required sequences will be extracted in a single output file 3 | 4 | # Input: 5 | # 1.) Path of multi fasta file from which specific sequences have to be extracted 6 | # 2.) 
# Path of headers.txt which is a text file containing the headers of our required sequences
# Output: A fasta file named as output.fasta, containing the sequences of our choice
# Sample Input Files: multi_fasta.fasta, headers.txt
# Sample Output File: output.fasta

#### Enter the path of required files ####
multi_fasta_path = "Enter the path of your multi fasta file"
# e.g: multi_fasta_path = r"C:\Users\dell\Desktop\NEGENOME_Python\advanced\seq_from_fasta_using_id\multi_fasta.fasta"

headers_txt_path = "Enter the path of your headers.txt file"
# e.g: headers_txt_path = r"C:\Users\dell\Desktop\NEGENOME_Python\advanced\seq_from_fasta_using_id\headers.txt"
#### Enter the path of required files ####


def ExtractSeqFromFasta(multi_fasta_path,headers_txt_path):
    """Copy the records named in a headers file out of a multi fasta file.

    Every record of ``multi_fasta_path`` whose header line matches an entry
    of ``headers_txt_path`` is written to ``output.fasta`` in the directory
    of the multi fasta file: the header line as it appears in the input,
    then the record's sequence joined onto a single line, then a blank line.

    Args:
        multi_fasta_path: path (str or Path) of the multi fasta file.
        headers_txt_path: path (str or Path) of a text file with one wanted
            header per line. An entry may be written with or without the
            leading ">"; surrounding whitespace and blank lines are ignored.
            An entry listed once selects at most one record, so list it
            again to pick up a second record with the same header.

    Returns:
        Path of the generated ``output.fasta``.

    Raises:
        OSError: if an input file cannot be opened or the output file cannot
            be created (for example ``FileNotFoundError``).
    """
    from collections import Counter
    from pathlib import Path

    multi_fasta_path = Path(multi_fasta_path)
    headers_txt_path = Path(headers_txt_path)
    out_path = multi_fasta_path.parent/"output.fasta"

    def _header_key(text):
        # Comparison form of a header: no surrounding whitespace and no
        # leading ">", so both ways of writing an entry compare equal.
        text = text.strip()
        if text.startswith(">"):
            text = text[1:].strip()
        return text

    # How many records are still wanted for each header.
    with open(headers_txt_path,"r") as fr2:
        remaining = Counter(_header_key(line) for line in fr2)
    # blank lines in headers.txt do not name any record
    remaining.pop("", None)

    with open(multi_fasta_path,"r") as fr1, open(out_path,"w") as fw:
        keep = False    # True while the current record is a wanted one
        seq_parts = []  # sequence lines collected for the current record

        def _flush_sequence():
            # Write the collected sequence (if any) followed by a blank line.
            seq = "".join(seq_parts)
            if seq:
                fw.write(seq+"\n\n")
            del seq_parts[:]

        for line in fr1:
            line = line.rstrip("\n")
            if line.startswith(">"):
                # A new record starts: finish the previous wanted one first.
                _flush_sequence()
                key = _header_key(line)
                keep = remaining[key] > 0
                if keep:
                    remaining[key] -= 1
                    fw.write(line+"\n")
            elif keep:
                seq_parts.append(line)

        # The last record has no following header to trigger the write,
        # so its sequence must be flushed here.
        _flush_sequence()

    return out_path


if __name__ == "__main__":
    out_path = ExtractSeqFromFasta(multi_fasta_path,headers_txt_path)
    print("Output file successfully generated at: ", out_path)

##please leave a message if you face any issues while using the code
##your personal queries are also invited

# -------------------------------------------------------------------------------- /seq_from_fasta_using_id_2.py: --------------------------------------------------------------------------------
# Extracting required sequences from a multi fasta file using headers(using Biopython).
# Each required sequences will be extracted in a different output file.

# Input:
# 1.) Path of multi fasta file from which specific sequences have to be extracted.
# 2.) Path of headers.txt which is a text file containing the headers of our required sequences.

# Output: Different fasta files, each named after the headers of our required sequences.
# Sample Input Files: multi_fasta.fasta, headers.txt
# Sample Output Files: AY179745.fasta, D10845.fasta, D10847.fasta

# multi_fasta_path = r"Enter the path of your multi fasta file"
multi_fasta_path = r"C:\Users\dell\Desktop\NEGENOME_Python\advanced\seq_from_fasta_using_id\multi_fasta.fasta"

# headers_txt_path = r"Enter the path of your headers.txt file"
headers_txt_path = r"C:\Users\dell\Desktop\NEGENOME_Python\advanced\seq_from_fasta_using_id\headers.txt"


def ExtractSeqFromFasta(multi_fasta_path,headers_txt_path):
    """Write each requested record of a multi fasta file to its own file.

    A record is requested when its description matches an entry of
    ``headers_txt_path``. Each requested record is written, in fasta format,
    to a file named after the record id in the directory of the multi fasta
    file.

    Args:
        multi_fasta_path: path (str or Path) of the multi fasta file.
        headers_txt_path: path (str or Path) of a text file with one wanted
            header per line. An entry may be written with or without the
            leading ">"; surrounding whitespace and blank lines are ignored.

    Returns:
        List of the Paths of the files that were written, in the order the
        records appear in the multi fasta file.
    """
    from pathlib import Path
    from Bio import SeqIO

    # converting our string paths to Path objects
    multi_fasta_path = Path(multi_fasta_path)
    headers_txt_path = Path(headers_txt_path)

    # Collect the wanted headers in a set for constant-time lookups.
    # Only the leading ">" is dropped, so a ">" inside the header text
    # (e.g. "A->B") is kept and still matches the record description.
    wanted_headers = set()
    with open(headers_txt_path,"r") as fr2:
        for raw_header in fr2:
            header = raw_header.strip()
            if header.startswith(">"):
                header = header[1:].strip()
            if header:
                wanted_headers.add(header)

    out_path_list = []  # for storing paths of our output files

    # looping over each fasta record of the multi fasta file
    for fasta_record in SeqIO.parse(multi_fasta_path,'fasta'):
        # compare whitespace-trimmed text on both sides
        if fasta_record.description.strip() in wanted_headers:
            # NOTE(review): the output name is the bare record id, without a
            # ".fasta" suffix, although the sample outputs listed above end
            # in ".fasta" -- confirm which naming is intended.
            out_path = multi_fasta_path.parent/fasta_record.id
            # writing fasta record to its output file
            SeqIO.write(fasta_record,out_path,"fasta")
            out_path_list.append(out_path)

    return out_path_list

| out_path_list = ExtractSeqFromFasta(multi_fasta_path,headers_txt_path) 53 | print("Output files successfully generated at: ", out_path_list) 54 | 55 | ##please leave a message if you face any issues while using the code 56 | ##your personal queries are also invited 57 | 58 | 59 | -------------------------------------------------------------------------------- /sequence_length_distribution_from_Fastq.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 71, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEGCAYAAABPdROvAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAgAElEQVR4nO3deVxU59n/8c/FroCobCqigOCuEcVdE6PRWJvN7EsTk5iYmL1Jnixt0/Z5mmZrG7M1MVFjzWI2s7ZZXaIxxqjgrqCCoIgLKIoogiz37485/EotyiDMnBnmer9e85qZM2f5zhG5OOc+577FGINSSil1tvzsDqCUUsq7aSFRSinVJFpIlFJKNYkWEqWUUk2ihUQppVSTBNgdwN2ioqJMQkKC3TGUUsqrZGRkHDTGRNf3mc8VkoSEBNLT0+2OoZRSXkVEdp3uMz21pZRSqkm0kCillGoSLSRKKaWaRAuJUkqpJtFCopRSqkm0kCillGoSLSRKKaWaxOfuI1FKKU9UdrKKrP2lZB84xuGyk5yorCYsOIDo8GB6dWxDt+gw/P3E7pj10kKilFI2OVpeyefrCvhmy35W5xZTWX368aHahAQwvncHJqfGMTI5EhHPKSpaSJRSys0Kj5bz6tIcPliTz4nKapJjwrh1ZCKDurajR4dwosKCCQn051hFFftLytlcUMKKnIN8t3U/H6/dQ++ObXhwfHfG9YrxiIIivjZCYlpamtEuUpRSdjhZVcMbP+TwyvfZVFYbJqfGcdPwrvSLi3CqIFRUVfPZugJmLttJ7sHjnN8jmj9d1pfO7Vq7PLuIZBhj0ur9TAuJUkq53rrdh3n0441sP3CMSf068MiFPUmICj2rdVVW1/CPFXm8sGg7/n7Cc1eew8S+HZo58X86UyHRq7aUUsqFjDHMXr6TK2eupLS8itk3pfHqDYPOuogABPr7cfu5SXx532i6RoZy5zsZzFi4HbsODLSNRCmlXOR4RRX/s2ADX23az4Tesfz16nNoExLYbOtPiArl4+kj+M2nm3hx8Q72lZzg6cv7u/3qLi0kSinlAoeOVXDrP9awqaCE30zqye2jk1zSMB4U4MdfruxPp7ateGnxDqpr4C9X9sfPjcXEZYVEROKBt4AOQA3whjHmRRFpD3wAJAB5wNXGmMMiMgR4o3Zx4I/GmE+tdS0FOgInrM8nGGMKRSTY2sYg4BBwjTEmz1XfSSmlnLH7UBk3vbmKfSXlvHFjGhf0jnXp9kSEB8d3J8BPeH7hdoIC/Hhqcl+3XdHlyiOSKuAhY8xaEQkHMkRkIXAzsNgY84yIPAY8BjwKbAbSjDFVItIR
2CAi/zTGVFnru8EYc2or+VTgsDEmWUSuBZ4FrnHhd1JKqTPafaiMa95YyYnKaubfPpRBXdu7bdv3jUuhvLKaV5fm0KV9a6aP6eaW7bqssd0Ys88Ys9Z6XQpkAnHApcA8a7Z5wGXWPGV1ikYI4EyrUd11LQDGiSdcVK2U8kl7Dpdx3ayfOVFZzXu3D3NrEan1Pxf24JJzOvHsN1l8s3mfW7bplqu2RCQBSAVWAbHGmH3gKDZATJ35horIFmATcGedwgIwV0TWi8gTdYpFHJBvrasKKAEi69n+NBFJF5H0oqKiZv9+SilVWFrO9bNWUVpeyTtTh9KrYxtbcogIz13Zn9QubXnoww3sLDrm8m26vJCISBjwMfCAMebomeY1xqwyxvQBBgOPi0iI9dENxph+wGjrcWPt6utbTT3rfcMYk2aMSYuOrnfseqWUOmtlJ6u4bV46RaUVzLt1CH3jImzNExLoz9+vH0hQgB93z19HeWW1S7fn0kIiIoE4isi7xphPrMkHrDYQrOfCU5czxmQCx4G+1vsC67kUmA8MsWbdA8Rb6woAIoBiV30fpZQ6VXWN4f7317O5oISXr0sltUs7uyMB0KltK56/egCZ+47y9FeZLt2WywqJdfppDpBpjHm+zkdfAFOs11OAz635E61igIh0BXoAeSISICJR1vRA4CIcDfOnrutKYInxtVv1lVK2evLLrSzceoDfX9Tb5VdnNdb5PWO4ZWQC81bu4qecgy7bjiuPSEbiOAU11mrbWC8ik4BngPEisgMYb70HGIXjSq31wKfAXcaYg0Aw8K2IbATWAwXALGuZOUCkiGQDD+K4AkwppdziwzX5zF2Rxy0jE7h5ZKLdcer1yIU9SYwK5ZEFGzlWUdXwAmdB+9pSSqmzsLmghMtf+4nBCe1469ahHjtWCEDGrmKunLmS/7mwB3eNST6rdZypry29s10ppRqppKyS6e9mEBkaxEvXpnp0EQEY1LU9828bxpBE11yOrIVEKaUaoabG8NBH69lfUs4HdwwnMizY7khOGd7tv+6MaDba+69SSjXC3J/yWJRZyG8n9WKgh1yhZTctJEop5aSs/Ud59pssLugVy5QRCXbH8RhaSJRSygnlldU88P562oQE8uwV/TxiiFtPoW0kSinlhL9+u42s/aXMvXmw17SLuIsekSilVANW5hxi9o+53DisK+f3jGl4AR+jhUQppc6gvLKaxz/ZSJf2rfnNpF52x/FIempLKaXOYMai7eQdKmP+bUNpFeRvdxyPpEckSil1GpsLSpi9PJdr0uIZkRxldxyPpYVEKaXqUVldwyMLNtI+NEhPaTVAT20ppVQ9Zi3fydZ9R5n5q4FEtA60O45H0yMSpZQ6RX5xGS8u2sHEPh2Y2Lej3XE8nhYSpZQ6xf/9ayv+fsIfLultdxSvoIVEKaXq+H5bIQu3HuDesSl0jGhldxyvoIVEKaUsFVXV/N8/t5IUFcqtoxLsjuM1tLFdKaUsc37MJffgcebdOoTgAL1nxFl6RKKUUsDeIyd4eXE2F/aJ5bzu0XbH8SpaSJRSCnjm6yxqjOF3v9QG9sbSQqKU8nnr84/wxYa9TDs3ifj2re2O43W0kCilfJoxhqe+zCQqLIg7zutmdxyvpIVEKeXTFm49wOq8Yn49vjthwXr90dnQQqKU8lmV1TU883UWyTFhXJMWb3ccr6Xl14MZYzh47CQlJyopr6wmLDiAyLAgwkO03x+lmsN7q3ez8+Bx5kxJI8Bf/64+W1pIPMy+khN8s3k/S7IK2VxQwuGyyv+aJ65tK86Jj2BMjxgu6BVL+9AgG5Iq5d2OllfywqIdDE+KZKyOetgkWkg8xKY9JcxclsM3W/ZTXWNIjgljYt8OpMSEExkWREigP8crqth/tJysfaWszi3mq037CfL3Y2LfDtw2OpH+ndva/TWU8hozl+ZQfPwkv/1lL0TE7jheTQuJzYpKK3j2mywWZOyhTUgAt41O5NrBXUiMCj3j
csYYtuw9yoKMPXyydg9fbNjLuJ4xPD6pF8kxYW5Kr5R3OnC0nDk/5jI5NY6+cRF2x/F6WkhstDjzAI8s2EhpeRV3nJfE3ecn08bJ9g8RoW9cBH3jInhoQnfeWrmL15flMOnF5dx5XhJ3nZ9MSKB28aBUfV5Zkk11jeHXF3S3O0qLoIXEBjU1hr9+t41Xl+bQq2Mb3ps2gO6x4We9vvCQQO4+P5mr0+L585dbeWlJNl9v3s/fbxjYpPUq1RLlF5fx/prdXDM4ni6RevNhc9DLFNysvLKae95by6tLc7huSDyf3T2i2X7ZR4cH88K1qfzjlsEcLjvJpa+sYEHGnmZZt1ItxYuLd+Anwr1jU+yO0mJoIXGj8spq7nwng6827ee3k3rx1OR+LulhdEyPGL66bzTnxEfw8EcbePJfW6mpMc2+HaW8TXZhKZ+s3cONw7rSISLE7jgthhYSN6msrmH6Oxks3VbEM5f34/Zzk1x6pUhMmxDevW0YN49IYPaPudw9fy3lldUu255S3mDGwh20CvRn+hjtCqU5aSFxA2MMj3+yie+3FfH05f24dkgXt2zX30/4w8W9+d0ve/HNlv1MeXM1xyuq3LJtpTzN5oISvty0j6mjEokMC7Y7TouihcQNXl6SzYKMPTxwQQrXuamI1BIRbhudxAvXDGBNXjG3zF3DMS0mygc9v3A7Ea0Cue3cJLujtDhaSFxs6bZCnl+4nctT47h/nH2Ne5cOiOOl61LJ2H2YKW+upuykFhPlO9btPsySrELuOC/J6UvslfO0kLjQ3iMn+PUH6+nZIZynLu9n+92zF/XvxEvXprJu92HufnctldU1tuZRyl1eXLyD9qFBTBmeYHeUFsllhURE4kXkexHJFJEtInK/Nb29iCwUkR3Wcztr+hARWW89NojI5DrrGiQim0QkW0ReEus3sogEi8gH1vRVIpLgqu/TWJXVNdz73jpOVtXw6g0DPebmwF/278iTl/Xj+21FPPbxJozRq7lUy7Yh/whLtxVx2+hEQrWbeJdw5RFJFfCQMaYXMAy4W0R6A48Bi40xKcBi6z3AZiDNGDMAmAi8LiK1/+qvAdOAFOsx0Zo+FThsjEkGZgDPuvD7NMprS3PI2HWYp6/oT1K0Z3VZcv3QLjxwQQofr93Dc99uszuOUi710uIdtG0dyE16NOIyLiskxph9xpi11utSIBOIAy4F5lmzzQMus+YpM8bUnrgPAQyAiHQE2hhjVhrHn89v1S5zyroWAOPE7vNHwPYDpby8ZAcXn9OJS87pZHecet0/LoXrh3bhtaU5fJieb3ccpVxic0EJi7MKmToyUQetciG3tJFYp5xSgVVArDFmHziKDRBTZ76hIrIF2ATcaRWWOKDu7dl7rGlYz/nWuqqAEiDSld+lIdU1hkcWbCQ8JJA/XtzbzihnJCL83yV9GJUcxe8+3cy63YftjqRUs3tp8Q7ahAQwZWSC3VFaNJcXEhEJAz4GHjDGHD3TvMaYVcaYPsBg4HERCQHqO8KoPbF/ps/qZpgmIukikl5UVNS4L9BIc1fksj7/CH+4uLfHX6se4O/Hy9elEhsRzJ3vZFB4tNzuSEo1m617j/Ld1gPcOipRr9RyMZcWEhEJxFFE3jXGfGJNPmCdrqo9bVV46nLGmEzgONAXxxFI5zofdwb2Wq/3APHWugKACKC4nvW9YYxJM8akRUdHN8dXq1dRaQUvLNrB2J4xHntK61TtQoOYdVOaowfidzKoqNK731XL8PKSHYQHB3DLyES7o7R4rrxqS4A5QKYx5vk6H30BTLFeTwE+t+ZPrG1cF5GuQA8gzzr9VSoiw6x13lS7zCnruhJYYmy8DOlv322joqqaJy7qbfulvo3Rs0Mb/nbVOazbfYRnv9bGd+X9svYf5evN+7llZAIRrfRoxNVceUQyErgRGFvnst5JwDPAeBHZAYy33gOMAjaIyHrgU+AuY8xB67PpwGwgG8gBvramzwEiRSQbeJB/XwHm
dpsLSvggPZ+bRyQ0OCiVJ/pFv47cPCKBN1fksmjrAbvjKNUkLy/JJiw4gFtH6dGIO7jsMgZjzI/U34YBMK6e+d8G3j7NutJxnOY6dXo5cFUTYjYLYwz/96+ttGsdxD1e3DX145N6siavmIcXbOCr+0bTqW0ruyMp1WjZhcf4atM+pp/Xjbatg+yO4xP0zvZm8P22QlbnFvPg+O5efRgdHODPK9cPpLKqhvvfX0eV3vmuvNDry3IIDvBjqh6NuI0WkiYyxvD8wu10ad+aawbH2x2nyRKjQnnq8n6syTvMzGU5dsdRqlH2HjnBp+sKuCYt3uOvmmxJtJA00cKtB9hccJT7xqUQ6N8yduelA+K4qH9HXly8gy17S+yOo5TTZi/PBeB27eHXrVrGbz6b1NQYZizaQWJUKJcN8I7LfZ31p0v70rZ1EA99uEEvCVZeofj4Sd5bvZtLBnSiczsdi92dtJA0wbdb9pO57yj3j0shoIUcjdRqFxrEM5f3I2t/KS8u2mF3HKUa9I+f8jhRWc2d5+noh+7Wsn77uZExhr8vzSYpKpSLveTmw8Ya1yuWq9M6M3NZDmu1CxXlwY5XVDHvpzzG946le2y43XF8jhaSs7Ry5yE2Fxzl9nOT8PfznpsPG+uJi3rTMaIVD3+4Qcd8Vx7rvdW7KTlRyV06FrsttJCcpTd+2ElUWBCTU+MantmLhYcE8swV/dh58Divfp9tdxyl/ktFVTWzlu9keFIkqV3a2R3HJ2khOQvb9peydFsRU4YneMyAVa40OiWayalxvLYsh+0HSu2Oo9R/+GxdAQeOVjBdj0Zso4XkLMxavpNWgf78alhXu6O4ze9+2YvQ4AAe/2QTNTU6qqLyDNU1hpnLdtI3rg2jU6LsjuOztJA0UlFpBZ+vL+CqtM60C/Wd7hciw4L53S97k7HrMPNX77Y7jlIAfLN5P7kHj3PXmGSv6ii1pdFC0kgfpudTWW18ctjOKwbGMaJbJM9+ncUBHbtE2cwYw2vLHFdOXting91xfJoWkkaorjHMX7WbEd0iSY7xrHHY3UFEeGpyPyqqa/jTv7baHUf5uJ9yHFdOTmvhV056Ay0kjbB0WyEFR074VNvIqRKiQpl+Xjf+tXEfK3MO2R1H+TDHlZPBTB7Ysq+c9AZOFRIR+a8u3H3ROz/vIiY8mPG9Y+2OYqvpY7rRuV0r/vjFFiq1h2Blg237S1m2vYhbRiYQHNDyr5z0dM4ekcwUkdUicpeItHVpIg+VX1zG0u1FXDs4vsV0zni2QgL9eeKi3mw7UMrbK3fZHUf5oNnWlZM3DO1idxSFk4XEGDMKuAHH+OjpIjJfRMa7NJmH+TA9HwGuHaI/uAATesdybvdoZizcTlFphd1xlA8pPFrOZ+sLuDqtsw5c5SGc/tPaGLMD+B3wKHAe8JKIZInI5a4K50nuGpPMvFuH6KiBFhHhDxf3pryqmue+ybI7jvIh//gpj+oao8PoehBn20j6i8gMIBMYC1xsjOllvZ7hwnweo1WQP6NTou2O4VG6RYcxdVQSH2Xs0U4dlVscr6jinZ93cWGfDnSNDLU7jrI4e0TyCrAWOMcYc7cxZi2AMWYvjqMU5aPuHZtMbJtg/vjFFr3jXbncR+n5HC2v0oGrPIyzhWQSMN8YcwJARPxEpDWAMeZtV4VTni80OIBHJ/Zk454Svtiw1+44qgWrqq5hzopc0rq2Y6B2zuhRnC0ki4C6jQOtrWlKcdmAOPrFRfDcN1na1bxymW+3HCC/+AS3jdajEU/jbCEJMcYcq31jvdaxLBUAfn7Cb3/Zi70l5cz5MdfuOKoFMsbwxvKdJES29vn7uDyRs4XkuIgMrH0jIoOAE66JpLzRsKRIJvSO5dXvsyks1X64VPNK33WYDflHmDpau0PxRM4WkgeAj0RkuYgsBz4A7nFdLOWNHvtFTyqqapixUMd4V83rjR920q51IFcO7Gx3FFUPZ29IXAP0BKYDdwG9jDEZrgymvE9S
dBi/GtaVD9bsZtt+HQBLNY+dRcdYlHmAG4d1pVWQdofiiRrT18dgoD+QClwnIje5JpLyZvePSyEsOIA/f5VpdxTVQsz5MZdAfz9u9MGhG7yFszckvg38FRiFo6AMBtJcmEt5qXahQdw3LoUfthexbHuR3XGUlzt0rIIFGXu4YmAc0eHBdsdRpxHg5HxpQG9jjN5xphp04/CuvLVyF09/lcno5Cj8tHFUnaW3f95FRVUNU0fpJb+ezNlTW5sBHYJMOSU4wJ+HJnQna38pn28osDuO8lLlldW8vXIX43rG+ORAct7E2UISBWwVkW9F5IvahyuDKe92cf9O9OnUhr99t52TVTpmiWq8T9YWcOj4Se0OxQs4e2rrj64MoVoePz/hkYk9mfLmauav2sXNI7WnVuW8mhrD7OU76RcXwdDE9nbHUQ1w9vLfZUAeEGi9XoOjE0elTuvclCiGJbXn5SXZHKuosjuO8iKLswrZefA4t5+bhIi2sXk6Z6/auh1YALxuTYoDPnNVKNUyiAiPTuzJoeMnmbNcu05Rzpu1fCdxbVsxqa82zXoDZ9tI7gZGAkfh/w9yFeOqUKrlSO3Sjol9OvDGDzkcOqYjKaqGrc8/wurcYm4ZmUCAjw9r7S2c/VeqMMacrH0jIgGAXgqsnPLwhd05UVnNK99n2x1FeYFZy3cSHhKgw1p7EWcLyTIR+Q3Qyhqr/SPgn66LpVqS5JhwrhoUz7s/7ya/uMzuOMqD5ReX8fWmfVw/tAthwc5eC6Ts5mwheQwoAjYBdwBf0cDIiCISLyLfi0imiGwRkfut6e1FZKGI7LCe21nTx4tIhohssp7H1lnXUhHZJiLrrUeMNT1YRD4QkWwRWSUiCY3dAco97r8gBQRmLNpudxTlwd5ckYufCDePSLA7imoEZ6/aqjHGzDLGXGWMudJ63dCprSrgIWts92HA3SLSG0dRWmyMSQEWW+8BDuIYC74fMAU4deTFG4wxA6xHoTVtKnDYGJOMY+z4Z535Psr9OrVtxc0jEvh0XYF26KjqVVJWyQdr8rnknE50jGjV8ALKYzh71VauiOw89XGmZYwx++qM7V4KZOK42utSYJ412zzgMmueddYY8ABbgBARaahznbrrWgCME71W0GNNP68boUEBzFioRyXqv81fvZuyk9U6AqIXakxfW7VCgKsAp+8Ssk45pQKrgFhjzD5wFJva01SnuAJYZ4ype5nPXBGpBj4GnrSOiOKAfGtdVSJSAkTiOLqpu/1pwDSALl20Ac8u7UKDmDoqkRcX72BzQQl94yLsjqQ8xMmqGuauyGVUchS9O7WxO45qJGdPbR2q8ygwxrwAjG1wQUBEwnD88n/AGHPUifn74DhFdUedyTdYp7xGW48ba2evL249+d8wxqQZY9Kio6Odia1cZOroRCJaBfK8HpWoOr7YsJfC0grtDsVLOXtqa2CdR5qI3AmEO7FcII4i8q4x5hNr8gER6Wh93hEorDN/Z+BT4CZjTE7tdGNMgfVcCswHhlgf7QHirWUDgAig2JnvpOzRJiSQO85LYklWIRm7DtsdR3kAYxzdofSIDefclCi746iz4OxVW3+r83gaGARcfaYFrLaKOUCmMeb5Oh99gaMxHev5c2v+tsCXwOPGmBV11hMgIlHW60DgIhy9EZ+6riuBJdrVveebMjyByNAgnl+4ze4oygMs33GQrP2l2h2KF3OqjcQYc/5ZrHskjlNQm0RkvTXtN8AzwIciMhXYjaO9BRxjwCcDT4jIE9a0CcBx4FuriPgDi4BZ1udzgLdFJBvHkci1Z5FTuVlocADTx3TjyS8z+SnnICO66V+hvmzW8p3EtgnmknM62R1FnSWnComIPHimz0854qid9iP1t2EAjKtn/ieBJ08z/6DTbLecfxci5UV+Nawrs5bv5PnvtjP8zkj9S9RHbd17lOU7DvLIxB4EBWh3KN7K2X+5NGA6jquk4oA7gd442kkabCtR6lQhgf7cMzaF9F2H
dUheHzZ7+U5aB/lzw5CudkdRTdCYga0GGmMeMsY8hOMIobMx5n+NMf/runiqJbsmLZ64tq14fuF2tGnL9+wrOcEXG/ZyzeB4IloH2h1HNYGzhaQLcLLO+5NAQrOnUT4lKMCP+y9IYeOeEhZuPWB3HOVm/1iRR40x3KqDnnk9ZwvJ28BqEfmjiPwBx42Fb7kulvIVl6fGkRgVyvMLt1NTo0clvqK0vJL5q3YzqV9H4tu3tjuOaiJnb0j8M3ALcBg4AtxijHnKlcGUbwjw9+OBC1LI2l/Kl5v22R1HuckHa/Iprahimt6A2CI05jKJ1sBRY8yLwB4R0eNR1Swu7t+JHrHhzFi0narqGrvjKBerrK5h7oo8hiS2p3/ntnbHUc3A2Tvb/wA8CjxuTQoE3nFVKOVb/PyEX4/vzs6i43y2fm/DCyiv9tWmfRQcOcE07ZyxxXD2iGQycAmOmwOxeunVy35Vs7mwTyx949rw4uLtnKzSo5KWyhjDrOU7SYoOZWxPHa27pXC2kJy0uh4xACIS6rpIyheJCA9N6EF+8QkWZOyxO45ykZ93FrO54Ci3j07Cz09vQm0pnC0kH4rI60BbEbmd/+ymRKlmMaZ7NAO7tOXlJTsor6y2O45ygVnLdxIVFsTk1Di7o6hm5OxVW3/FMXDUx0AP4PfGmJddGUz5HhHh4Qk92FdSznurd9sdRzWzHQdKWZJVyI3DEggJ9Lc7jmpGDfa1JSL+wLfGmAuAha6PpHzZiOQohidF8vfvc7h2cBdaBekvnJZi9vJcggP8uHG4dofS0jR4RGKMqQbKRESHs1Nu8dCE7hw8VsFbK/PsjqKaSWFpOZ+uK+CqtM60Dw2yO45qZs4OtVuOozv4hVhXbgEYY+5zSSrl09IS2nNe92hmLsvh+qFdCA/Rfpi83dsrd1FZU8PUUXrJb0vkbGP7l8ATwA9ARp2HUi7x0ITuHC6rZO6KPLujqCYqO1nF2z/vYkLvWBKj9ILPluiMRyQi0sUYs9sYM89dgZQC6N+5LRN6xzJr+U6mDE/Q3mG92Pur8zlSVqndobRgDR2RfFb7QkQ+dnEWpf7Dr8d3p7S8ilnLd9odRZ2lyuoaZi/fyZCE9gzq2t7uOMpFGiokde8Y0j8nlFv16tiGi/p35M0VuRw6VmF3HHUWvli/l70l5Uwf083uKMqFGiok5jSvlXKLBy7oTnllNTOX5dgdRTVSTY3h9R9y6NkhnDE9ou2Oo1yooUJyjogcFZFSoL/1+qiIlIrIUXcEVL4tOSaMy1LjeGvlLgqPltsdRzXCkqxCth84xp3ndUNEu0Npyc5YSIwx/saYNsaYcGNMgPW69n0bd4VUvu3+cSlU1xj+/n223VFUI8xclkNc21Zc1L+j3VGUizVmPBKlbNE1MpSr0uKZv3o3ew6X2R1HOWFNXjHpuw4z7dwkAvz110xLp//CyivcOzYZQXhliR6VeIPXlubQPjSIq9Pi7Y6i3EALifIKndq24vqhXfgoYw95B483vICyTdb+oyzJKuTmEQnaV5qP0EKivMZdY7oR6C+8tHiH3VHUGby+bCetg/y5STtn9BlaSJTXiGkTwk3DE/h0fQHZhaV2x1H12HO4jC827OW6IV1o21o7Z/QVWkiUV7nj3CRaB/ozY5EelXii2ctzEWDqqES7oyg30kKivEpkWDC3jkrky4372LpXb2XyJIeOVfD+mt1clhpHp7at7I6j3EgLifI6t41Ook1IAM8v3G53FFXHnB9zqaiq4c7ztNkMiSwAABQeSURBVDclX6OFRHmdiFaB3D46iUWZB1iff8TuOAooKavkrZW7mNS3I8kx4XbHUW6mhUR5pVtGJdKudaAelXiIuT/lcqyiinvGJtsdRdlAC4nySmHBAUwf040fthexOrfY7jg+rbS8kjd/zGV871h6ddSek3yRFhLltW4clkB0eDB/+24bxmjn1HZ5a+UujpZXca8ejfgsLSTKa7UK8ufuMd1YlVvMiuxD
dsfxSWUnq5jzYy7ndY+mf+e2dsdRNtFCorzadUO70CkihOe+zdKjEhvMX7Wb4uMnuW+cHo34Mi0kyqsFB/jz4IQebNxTwleb9tsdx6eUV1bz+g87GdEtUofR9XFaSJTXm5waR4/YcP7ybRaV1TV2x/EZH6zJp6i0gnvHptgdRdnMZYVEROJF5HsRyRSRLSJyvzW9vYgsFJEd1nM7a/p4EckQkU3W89g66xpkTc8WkZfEGm5NRIJF5ANr+ioRSXDV91Gey99PePQXPcg7VMb7a/LtjuMTKqocwx8PTmjHsCQ9GvF1rjwiqQIeMsb0AoYBd4tIb+AxYLExJgVYbL0HOAhcbIzpB0wB3q6zrteAaUCK9ZhoTZ8KHDbGJAMzgGdd+H2UBzu/RwxDEtvz4qIdHK+osjtOi/fBmnz2lZRz37gUHUZXua6QGGP2GWPWWq9LgUwgDrgUmGfNNg+4zJpnnTFmrzV9CxBiHXF0BNoYY1YaR2vqW7XLnLKuBcA40Z9qnyQiPPaLnhw8VsGcH3PtjtOilVdW88qSbIYktGdUcpTdcZQHcEsbiXXKKRVYBcQaY/aBo9gAMfUscgWwzhhTgaP47Knz2R5rGtZzvrWuKqAEiKxn+9NEJF1E0ouKiprjKykPNLBLOy7sE8vry3I4dKzC7jgt1js/76KwtIIHJ3TXoxEFuKGQiEgY8DHwgDGmwe5aRaQPjlNUd9ROqmc248Rn/55gzBvGmDRjTFp0dLRzwZVX+p8Le3KispqXdUhelyg7WcXMZTmMTI5kWNJ//c2mfJRLC4mIBOIoIu8aYz6xJh+wTldhPRfWmb8z8ClwkzEmx5q8B+hcZ7Wdgb11Pou3lg0AIgDtL8OHJceEcc3geN5dtYvdh8rsjtPizPtpFwePneTB8T3sjqI8iCuv2hJgDpBpjHm+zkdf4GhMx3r+3Jq/LfAl8LgxZkXtzNbpr1IRGWat86baZU5Z15XAEqN3pfm8+8d1x99P+Ot32+yO0qKUllfy+g85jOkRzaCu7eyOozyIK49IRgI3AmNFZL31mAQ8A4wXkR3AeOs9wD1AMvBEnflr20+mA7OBbCAH+NqaPgeIFJFs4EH+fQWY8mEdIkK4dWQiX2zYywbtZr7ZzF2Rx5GySh4c393uKMrDiK/9AZ+WlmbS09PtjqFcrLS8kvP/upSEyFA+unO4Ngo3UUlZJaOeW8LQxEhmT0mzO46ygYhkGGPq/cfXO9tVixQeEshDE3qQvuuwdp3SDGb+kENpeZUejah6aSFRLdbVafH07BDO019nUl5ZbXccr7Wv5ARv/pjLZQM60buTjjei/psWEtVi+fsJv7+oN3sOn+DNFXqT4tmasXA7xsBDE/RKLVU/LSSqRRuRHMUFvWJ59fscCkvL7Y7jdbYfKGVBxh5uHN6V+Pat7Y6jPJQWEtXi/WZST8orq3n+Ox3fvbGe+yaL0KAA7jlfxxtRp6eFRLV4SdFhTBmRwAfp+WzZW2J3HK+xOreYRZmF3DmmG+1Cg+yOozyYFhLlE+4bm0LbVoH86V9bdSRFJxhjePrrTDq0cdyTo9SZaCFRPiGidSAPju/OzzuL+efGfXbH8XjfbN7Put1H+PX4FFoF+dsdR3k4LSTKZ1w/tCt949rw5L+2ckzHLDmt8spqnv46i+6xYVwxsHPDCyifp4VE+Qx/P+FPl/al6FgFLyzUhvfTeXNFLruLy/j9RX0I8NdfEaph+lOifEpql3ZcOzieuT/lkbW/wVENfM6Bo+W8siSb8b1jGZWig1Yp52ghUT7nkQt70iYkgN9/tkUb3k/x7DdZVFUbfvfLXnZHUV5EC4nyOe1Cg3h0Yk9W5xXz6boCu+N4jHW7D/PJ2gKmjk6ka2So3XGUF9FConzS1WnxDIhvy1NfZVJSVml3HNvV1Bj++M+txIQHc7fefKgaSQuJ8kl+fsKTl/XlcFklT32VaXcc232Ukc+G/CM8OrEnYcEBdsdRXkYLifJZfeMiuH10
Eh+k57Mi+6DdcWxz8FgFT32VxZDE9lw+MM7uOMoLaSFRPu2BC1JIjArl8U82UXbSN+8t+fOXmZSdrOKpyX11ADB1VrSQKJ8WEujPM5f3Y3dxmU926vjjjoN8uq6A6ed1Izkm3O44yktpIVE+b2hSJL8a1oU3V+Sybvdhu+O4TXllNb/7bBMJka25SxvYVRNoIVEKeHRiT2LbhPDIgo0+M5ri37/PJu9QGX+e3I+QQO1PS509LSRK4Rjj/dkr+rOj8BjPfpNldxyX21xQwmtLc7g8NY6RyXoHu2oaLSRKWc7tHs3NIxKYuyKP5TuK7I7jMhVV1Tz80QbahQbx+4t72x1HtQBaSJSq47Ff9CQlJoyHP9rA4eMn7Y7jEi8u2kHW/lKevaIfbVvrgFWq6bSQKFVHSKA/M64ZQPHxk/zm000tri+udbsPM3NZDlendWZsz1i746gWQguJUqfoGxfBg+N78PXm/XyUvsfuOM2m7GQVD324gY4RrXjiIj2lpZqPFhKl6jHt3CRGdIvkic83s3Vvy+hu/vefbyH30HH+clV/wkMC7Y6jWhAtJErVw99PePHaVCJaBXL3/LWUlnt3x46frN3Dgow93Ds2hRHd9Cot1by0kCh1GtHhwbxy/UB2F5fx6Mcbvba9JKfoGL/7bDNDEttz31i98VA1Py0kSp3BkMT2PDqxB19t2s/s5bl2x2m0EyeruWf+OoID/Hjp2lQdOle5hP5UKdWA20cnMalfB576OpNFWw/YHcdpxhgeXrCBrP1HmXHNADpEhNgdSbVQWkiUaoCI8LerBtAvLoL73l/nNY3vryzJ5suN+3hsYk/G9IixO45qwbSQKOWEVkH+zL4pjYhWgUydt4bCo+V2Rzqjbzbv528Lt3N5ahzTzk2yO45q4bSQKOWkmDYhzJ6SRsmJSqbMXeOxQ/Suzz/Cgx+u55z4tjx1eT8dY0S5nBYSpRqhT6cIZv5qEDmFx5gydzXHKjxrMKzswlJumbuayLAgZt04SHv1VW6hhUSpRjq3ezQvX5/KpoISbp+X7jHdzu89coKb5qzG38+Pt28dSkwbbVxX7qGFRKmzcGGfDvz1qv78nHuI299Kt32Y3vziMq55YyWl5VXMu3UwCVGhtuZRvsVlhURE4kXkexHJFJEtInK/Nb29iCwUkR3WcztreqQ1/zEReeWUdS0VkW0ist56xFjTg0XkAxHJFpFVIpLgqu+j1Kkmp3bmuSv6syL7IDfMXsWRMnt6C847eJxrXl9JSVkl79w2lD6dImzJoXyXK49IqoCHjDG9gGHA3SLSG3gMWGyMSQEWW+8ByoEngIdPs74bjDEDrEehNW0qcNgYkwzMAJ510XdRql5XpcXz6g2D2FJwlKtfX0l+cZlbt78h/whXvb6SE5XVvDdtGOfEt3Xr9pUCFxYSY8w+Y8xa63UpkAnEAZcC86zZ5gGXWfMcN8b8iKOgOKvuuhYA40QvUVFuNrFvB+beMph9JeVc8sqP/JRz0C3b/XLjPq5+fSXBAX58eMdwPRJRtnFLG4l1yikVWAXEGmP2gaPYAM7eKTXXOq31RJ1iEQfkW+uqAkqAyHq2P01E0kUkvaio5Y58p+wzMjmKL+4ZRWRYMDfOWc2rS7OprnFN31wnq2p4+qtM7p6/lr5xEXx290hSYsNdsi2lnOHyQiIiYcDHwAPGmLO9JfgGY0w/YLT1uLF29fXM+1//e40xbxhj0owxadHR0WcZQakzS4wK5dO7RjCxTwee+2YbV7++kryDx5t1G9mFpUx+dQWv/7CT64d24d3bhhIVFtys21CqsVxaSEQkEEcRedcY84k1+YCIdLQ+7wgUnm75WsaYAuu5FJgPDLE+2gPEW+sKACKA4ub8Dko1RnhIIK9cn8oL1wxg+4FSJrzwA898ncXRJnZDf6yiime+zuIXLy5nX0k5s25K46nJ/fQ+EeURAly1Yuv00xwg0xjzfJ2PvgCmAM9Yz583sJ4AoK0x5qBVmC4C
Fp2yrpXAlcAS4619fasWQ0S4LDWOYUmRPPdtFq//kMMHa3Zz84hErh/ahehw548gio+f5N2fd/HmilwOl1Vy5aDOPDqxZ6PWoZSriat+74rIKGA5sAmosSb/Bkc7yYdAF2A3cJUxpthaJg9oAwQBR4AJwC7gByAQ8MdRRB40xlSLSAjwNo72l2LgWmPMzjPlSktLM+np6c33RZVqwOaCEv723Ta+31ZEoL8wolsUE/rEMrBLO1Jiwv6ja/eq6hp2F5exKreYJVmFfJ9VSFWNYVzPGO4dl8IAvSpL2UREMowxafV+5mt/wGshUXbJKTrGh2vy+XrzfnZblwn7CUSFBRMS6E9ldQ1FpRVUWY30sW2CueScTlyVFk93bUxXNtNCUocWEmU3Ywx5h8pYn3+YnMLjFJaWc7KqhgB/P2LCg0mIDGVg13Z0iw7VDheVxzhTIXFZG4lSqn4iQmJUKInajYlqIbSvLaWUUk2ihUQppVSTaCFRSinVJFpIlFJKNYkWEqWUUk2ihUQppVSTaCFRSinVJFpIlFJKNYnP3dkuIkU4+u86G1GAe0YtahzN1Tiaq/E8NZvmapym5OpqjKl3HA6fKyRNISLpp+siwE6aq3E0V+N5ajbN1TiuyqWntpRSSjWJFhKllFJNooWkcd6wO8BpaK7G0VyN56nZNFfjuCSXtpEopZRqEj0iUUop1SRaSJRSSjWJFhJAREJEZLWIbBCRLSLyv9b0ASLys4isF5F0ERlymuUnisg2EckWkcc8KFeeiGyqnc8Nuc4RkZXWNv8pIm1Os7y795ezuVyyv+qs319E1onIv6z37UVkoYjssJ7bnWY5l+yvZsjl7v11lfXvWiMip72E1Yb95Wwud++vv4hIlohsFJFPRaTtaZZr+v4yxvj8AxAgzHodCKwChgHfAb+wpk8CltazrD+QAyQBQcAGoLfduazP8oAoN+6vNcB51vRbgT95yP5qMJcr91ed9T8IzAf+Zb1/DnjMev0Y8Kw791dTctm0v3oBPYClQNpplrFjfzWYy6b9NQEIsF4/68qfLz0iAYzDMettoPUw1qP2r9cIYG89iw8Bso0xO40xJ4H3gUs9IJfLnCFXD+AHa/pC4Ip6FrdjfzmTy6VEpDPwS2B2ncmXAvOs1/OAy+pZ1GX7q4m5XKq+XMaYTGPMtgYWdfv+cjKXS50m13fGmCrr7c9A53oWbZb9pYXEYh0WrgcKgYXGmFXAA8BfRCQf+CvweD2LxgH5dd7vsabZnQscv0S/E5EMEZnWXJnOkGszcIk1y1VAfD2L2rG/nMkFLtxfwAvAI0BNnWmxxph9ANZzTD3LuXR/NSEXuH9/OcOO/eUsO/fXrcDX9Uxvlv2lhcRijKk2xgzAUbWHiEhfYDrwa2NMPPBrYE49i0p9q/OAXAAjjTEDgV8Ad4vIuS7Odau1nQwgHDhZz6J27C9ncoGL9peIXAQUGmMyzmbxeqY1y/5qYi7Q/dVYtuwvEfktUAW8W9/H9Uxr9P7SQnIKY8wRHOc6JwJTgE+sjz7CcRh4qj3851+4nXHBqaazyIUxZq/1XAh8err5miuXMSbLGDPBGDMIeA/HuddTuX1/OZnLlftrJHCJiOThOHUwVkTeAQ6ISEcA67mwnmVdub+aksuO/eUMO/aXU+zYXyIyBbgIuMFYjSKnaJ795aqGH296ANFAW+t1K2C5tfMzgTHW9HFARj3LBgA7gUT+3VjVxwNyhQLhdV7/hOOXqitzxVjT/IC3gFs9ZH85k8tl++uU7Yzh342hf+E/G7Wfc+f+amIut++vOtOWcvrGdrfvLydz2fHzNRHYCkSfYf5m2V/N+iW89QH0B9YBG3GcT/+9NX0UkGHt3FXAIGt6J+CrOstPArbj+Ev3t56QC8dVGBusxxY35brf2g/bgWf4d88Jdu+vBnO5cn+dkrHuf/RIYDGww3pu78791ZRcNu2vyTj+gq4ADgDfesj+ajCXTfsrG0f7x3rrMdNV+0u7
SFFKKdUk2kailFKqSbSQKKWUahItJEoppZpEC4lSSqkm0UKilFKqSbSQKNVMRORYw3M1af03i0inOu/zRCTKldtUyhlaSJTyHjfjuAdAKY8SYHcApVoyEYkGZgJdrEkPGGNWiMgfrWlJ1vMLxpiXrGWeAG7AcTPZQRw3n+YBacC7InICGG6t714RuRhHT8dXGWOy3PG9lKpLj0iUcq0XgRnGmME4uq+v2117T+BCHH0u/UFEAq2Bka4AUoHLcRQPjDELgHQcfSYNMMacsNZx0Dg6AnwNeNgdX0ipU+kRiVKudQHQW+T/d7LaRkTCrddfGmMqgAoRKQRicXR/83ltoRCRfzaw/trOOzNwFB6l3E4LiVKu5QcMr3MEAYBVWCrqTKrG8f+xvm69z6R2HbXLK+V2empLKdf6Drin9o2IDGhg/h+Bi8Ux/nwYjlHvapXiGE9FKY+if8Eo1Xxai8ieOu+fB+4D/i4iG3H8f/sBuPN0KzDGrBGRL3D0ErsLR7tIifXxP4CZpzS2K2U77f1XKQ8jImHGmGMi0hpH4ZlmjFlrdy6lTkePSJTyPG+ISG8gBJinRUR5Oj0iUUop1STa2K6UUqpJtJAopZRqEi0kSimlmkQLiVJKqSbRQqKUUqpJ/h+CLqnbIV6cVgAAAABJRU5ErkJggg==\n", 11 | "text/plain": [ 12 | "
" 13 | ] 14 | }, 15 | "metadata": { 16 | "needs_background": "light" 17 | }, 18 | "output_type": "display_data" 19 | } 20 | ], 21 | "source": [ 22 | "# GETTING SEQUENCE LENGTH DISTRIBUTION FROM A FASTQ FILE\n", 23 | "\n", 24 | "from Bio import SeqIO\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "import numpy as np\n", 27 | "from scipy.interpolate import make_interp_spline\n", 28 | "\n", 29 | "# Enter the path of input fastq file here\n", 30 | "input_fastq_path = r\"C:\\Users\\dell\\Desktop\\Seq_len_dist\\seq_len_dist.fastq\"\n", 31 | "records = SeqIO.parse(input_fastq_path,\"fastq\")\n", 32 | "\n", 33 | "len_frequency = {} # stores the frequecy of sequence of a particular length\n", 34 | "\n", 35 | "for record in records:\n", 36 | " if len(record.seq) not in len_frequency.keys():\n", 37 | " len_frequency[len(record.seq)] = 1\n", 38 | " else:\n", 39 | " len_frequency[len(record.seq)] += 1\n", 40 | " \n", 41 | "\n", 42 | "# Sorting dictionary len_frequency on keys\n", 43 | "len_frequency = sorted(len_frequency.items())\n", 44 | "\n", 45 | "x = np.array([]) \n", 46 | "y = np.array([])\n", 47 | "\n", 48 | "for key,value in len_frequency:\n", 49 | " x = np.append(x,key)\n", 50 | " y = np.append(y,value)\n", 51 | " \n", 52 | "# Plotting a smooth graph\n", 53 | "X_Y_Spline = make_interp_spline(x, y)\n", 54 | " \n", 55 | "# Returns evenly spaced numbers\n", 56 | "# over a specified interval.\n", 57 | "X_ = np.linspace(x.min(), x.max(), 500)\n", 58 | "Y_ = X_Y_Spline(X_)\n", 59 | " \n", 60 | "# Plotting the Graph\n", 61 | "plt.plot(X_, Y_)\n", 62 | "plt.xlabel(\"Length\")\n", 63 | "plt.ylabel(\"Frequency\")\n", 64 | "plt.show()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": 
"ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.6.4" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 2 96 | } 97 | -------------------------------------------------------------------------------- /single_GC_input.fasta: -------------------------------------------------------------------------------- 1 | >header1 2 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 3 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 4 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 5 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 6 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 7 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 8 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 9 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 10 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 11 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 12 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 13 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCA 14 | ATGCGATGCGGACGCCCAGTAGCAGTAAGCAGATAACAGACTAGACAGTAAAGCATAGCATGACGATCGAT -------------------------------------------------------------------------------- /total_reads_fastq.py: -------------------------------------------------------------------------------- 1 | # Finding total number of reads in a fastq file 2 | # Type "python total_reads_fastq.py -i Path of your fastq file" for running the code 3 | # Example: python total_reads_fastq.py -i C:\Users\dell\Desktop\test\sample_fastq.fastq 4 | # Type "python total_reads_fastq.py for help/usage description 5 | # Output: Total number of the reads in a fastq file 6 | # Sample input file: sample_fastq.fastq 7 | 8 | import argparse 9 | 10 | parser = 
argparse.ArgumentParser(description="Finding total number of reads in a fastq file, " 11 | "USAGE: python total_reads_fastq.py -i path/to/fastq/file") 12 | 13 | parser.add_argument("-i", help = "ENTER FULL PATH OF THE FASTQ FILE") 14 | args = parser.parse_args() 15 | fastq_path = args.i 16 | 17 | total_lines = 0 18 | with open(fastq_path,"r") as fr: 19 | for line in fr: 20 | total_lines += 1 21 | 22 | total_reads = total_lines//4 23 | print(f"Total number of reads in fastq file are {total_reads}") 24 | 25 | -------------------------------------------------------------------------------- /trie_construction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "szfg\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "# CONSTRUCTING A TRIE FROM A GIVEN SET OF PATTERNS (ALONG WITH THE PSEUDO CODE)\n", 18 | "# The following code is the solution for a code challenge from Coursera's course:\n", 19 | "# \"Finding Mutations in DNA and Proteins (Bioinformatics VI)Week 1\" \n", 20 | "# For details regarding the Trie Construction Problem please visit https://www.coursera.org/learn/dna-mutations \n", 21 | "\n", 22 | "#Code Challenge: Solve the Trie Construction Problem.\n", 23 | "\n", 24 | "#Input: A collection of strings Patterns.\n", 25 | "#Output: The adjacency list corresponding to Trie(Patterns), in the following format. If Trie(Patterns) has n nodes, \n", 26 | "#first label the root with 0 and then label the remaining nodes with the integers 1 through n - 1 in any order you like. 
\n", 27 | "#Each edge of the adjacency list of Trie(Patterns) will be encoded by a triple: the first two members of the triple must be \n", 28 | "#the integers labeling the initial and terminal nodes of the edge, respectively; the third member of the triple must be the symbol labeling the edge.\n", 29 | "\n", 30 | "######PSEUDO CODE START######\n", 31 | "\"\"\" \n", 32 | "TrieConstruction(Patterns)\n", 33 | " Trie ← a graph consisting of a single node root\n", 34 | " for each string Pattern in Patterns\n", 35 | " currentNode ← root\n", 36 | " for i ← 0 to |Pattern| - 1\n", 37 | " currentSymbol ← Pattern[i]\n", 38 | " if there is an outgoing edge from currentNode with label currentSymbol\n", 39 | " currentNode ← ending node of this edge\n", 40 | " else\n", 41 | " add a new node newNode to Trie\n", 42 | " add a new edge from currentNode to newNode with label currentSymbol\n", 43 | " currentNode ← newNode\n", 44 | " return Trie\n", 45 | "\"\"\"\n", 46 | "######PSEUDO CODE END######\n", 47 | "\n", 48 | "def trie_construction(patterns):\n", 49 | " # initialising the trie with a root node as 0\n", 50 | " # we are representing the trie as dictionary of dictionaries\n", 51 | " trie = {0:{}}\n", 52 | " # looping over each pattern in the input list of patterns\n", 53 | " for each_pattern in patterns:\n", 54 | " # for every pattern we start with the root node\n", 55 | " current_node = 0\n", 56 | " # looping over every character of the pattern\n", 57 | " for i in range(len(each_pattern)):\n", 58 | " current_symbol = each_pattern[i]\n", 59 | " # checking if our current node has an edge labelled with our current symbol\n", 60 | " # if true, we move to the connecting node for this edge\n", 61 | " if current_symbol in trie[current_node]:\n", 62 | " current_node = trie[current_node][current_symbol] \n", 63 | " \n", 64 | " # if our current node does not have an edge labelled with our current symbol, then\n", 65 | " # create a edge labelled with our current symbol, connecting our 
current node to a new node\n", 66 | " # and then move to this new node\n", 67 | " else:\n", 68 | " node = len(trie)\n", 69 | " trie[current_node][current_symbol] = node\n", 70 | " trie[node] = {} \n", 71 | " current_node = node\n", 72 | " \n", 73 | " return trie\n", 74 | "\n", 75 | "\n", 76 | "input_file_path = r\"D:\\Python_for_Bioinformatics\\PFB_clients\\Custom_Script__Development\\Ekta_dadlani\\cc_1\\dataset_294_4.txt\"\n", 77 | "out_file_path = r\"D:\\Python_for_Bioinformatics\\PFB_clients\\Custom_Script__Development\\Ekta_dadlani\\cc_1\\adjacency_list.txt\"\n", 78 | "\n", 79 | "# reading our input file and creating a list for the patterns given in the file\n", 80 | "fr = open(input_file_path,\"r\")\n", 81 | "patterns = fr.read().split(\"\\n\")\n", 82 | "fr.close()\n", 83 | "\n", 84 | "# calling the trie_construction(patterns) function\n", 85 | "trie = trie_construction(patterns)\n", 86 | "\n", 87 | "# writing the adjacency list to a file\n", 88 | "fw = open(out_file_path,\"w\")\n", 89 | "adjacency_list = []\n", 90 | "for node in trie:\n", 91 | " for edge_label in trie[node]: \n", 92 | " line = str(node) + \"->\" + str(trie[node][edge_label]) + \":\" + edge_label\n", 93 | " adjacency_list.append(line)\n", 94 | "\n", 95 | "fw.write(\"\\n\".join(adjacency_list))\n", 96 | "fw.close()\n", 97 | "\n" 98 | ] 99 | } 100 | ], 101 | "metadata": { 102 | "kernelspec": { 103 | "display_name": "Python 3", 104 | "language": "python", 105 | "name": "python3" 106 | }, 107 | "language_info": { 108 | "codemirror_mode": { 109 | "name": "ipython", 110 | "version": 3 111 | }, 112 | "file_extension": ".py", 113 | "mimetype": "text/x-python", 114 | "name": "python", 115 | "nbconvert_exporter": "python", 116 | "pygments_lexer": "ipython3", 117 | "version": "3.6.4" 118 | } 119 | }, 120 | "nbformat": 4, 121 | "nbformat_minor": 2 122 | } 123 | -------------------------------------------------------------------------------- /withname_fastq.fastq:
-------------------------------------------------------------------------------- 1 | @1_1 2 | GCACCTACCGATTGAATGATTCGGTGAAACTTTCGGACCGTGACTTAGCGTCCTTCGGGGCACTTCGTCGTGGGAAGTTATTTAAACCTCATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC 3 | + 4 | JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ 5 | @1_2 6 | GCACCTACCGATTGAATGATTCGGTGAAACTTTCGGACCGTGACTTAGCGTCCTTCGGGGCACTTCGTCGTGGGAAGTTATTTAAACCTCATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC 7 | + 8 | JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ 9 | @1_3 10 | GCACCTACCGATTGAATGGTCCGGTGAAACCTTCGGACTGTGGCAACGTTGCTTCATTGGAGCGTCGCCGTGGGAAGTTGTTTAAACCTTACCATTTAGAGGAAGGTGTAGTCGTAACAAGGTTTCC 11 | + 12 | JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ 13 | @1_4 14 | GCACCTACCGATTGAATGATTCGGTGAAACTTTCGGACCGTGACTTAGCGTCCTTCGGGGCGCTTCGTCGTGGGAAGTTATTTAAACCTCATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCA 15 | + 16 | JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ6H+JJJJJJJJJJCJ$JJJJJJ4JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ7JJJJJJJJJJJJJJJJJJ 17 | @1_5 18 | GCACCTACCGATTGAATGATTCGGTGAAAATCTCGGACTGTGGCTCGGACGCCCTCGGGCGACCTTGCTGTAGGAAGTTATTTAAACCTCATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC 19 | + 20 | JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ 21 | @1_6 22 | GCACCTACCGATTGAATGATTCGGTGAAACTTTCGGACCGTGACTTAGCGTCCTTCGGGGCACTTCGTCGTGGGAAGTTATTTAAACCTCATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC 23 | + 24 | JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ 25 | @1_7 26 | GCACCTACCGATTGAATGATTCGGTGAAAATCTAGGACTAGAGCGAAGACGCCCTCGGGCGACTTTGCTTTGGGAATTCATTTAAACCTCATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC 27 | + 28 | 
JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ 29 | -------------------------------------------------------------------------------- /write.py: -------------------------------------------------------------------------------- 1 | # writing contents to a file 2 | 3 | fr = open("path of the from file ","r") 4 | fw = open("path of the to file","w") 5 | 6 | for line in fr: 7 | fw.write(line) 8 | 9 | fw.close() 10 | fr.close() 11 | 12 | ##please leave a message if you want a text version of this code or if you face any issues while using the code 13 | ##your personal queries are also invited 14 | 15 | 16 | 17 | --------------------------------------------------------------------------------