├── examples ├── human.pdf ├── human.png └── human.org ├── LICENSE ├── common.py ├── makedb.py ├── download_organism.py ├── README.md ├── download_ko.py ├── get_ranks.py ├── download_proteins.py ├── make_keg.py ├── process_proteins.py ├── FastaReader.py └── plot_keg.py /examples/human.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlyPythons/KEGGTools/HEAD/examples/human.pdf -------------------------------------------------------------------------------- /examples/human.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlyPythons/KEGGTools/HEAD/examples/human.png -------------------------------------------------------------------------------- /examples/human.org: -------------------------------------------------------------------------------- 1 | hsa 9606 Eukaryota Metazoa Chordata Mammalia Primates Hominidae Homo Homo sapiens -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Junpeng Fan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
def read_org(file):
    """
    read .org file created by download_organism.py

    Blank lines are skipped; every other line is stripped and split on tabs.
    Because the line is stripped before splitting, a record whose trailing
    url column is empty comes back with fewer fields.

    :param file: path of the .org file
    :return: a list of records, each a list of fields, normally [org.abbr, name, url]
    """

    LOG.info("get organism infomation from %r" % file)

    r = []

    # the with-block closes the handle even when reading fails; the previous
    # bare open() left it to the garbage collector
    with open(file) as fh:
        for line in fh:
            line = line.strip()

            if line:
                r.append(line.split("\t"))

    LOG.info("get %s records" % len(r))

    return r
def cat_proteins(org, pep, keg, out):
    """
    concatenate the proteins and KO tables of the selected organisms

    For every distinct organism having both "<pep>/<org>.pep.fasta" and
    "<keg>/<org>00001.keg", the proteins are appended to "<out>.pep.fasta"
    and one "protein<TAB>KOs<TAB>pathways" row per protein is appended to
    "<out>.pep2ko.txt". Organisms missing either file are reported and skipped.

    :param org: list of KEGG organism abbr. (duplicates are processed once)
    :param pep: directory contains {org}.pep.fasta
    :param keg: directory contains {org}00001.keg
    :param out: output prefix
    :return: 0
    """

    # de-duplicate before counting so the "n/num" progress matches the loop,
    # and sort so the output order is reproducible between runs
    orgs = sorted(set(org))
    num = len(orgs)

    with open(out+".pep.fasta", "w") as pep_out, open(out+".pep2ko.txt", "w") as ko_out:

        for n, o in enumerate(orgs):
            LOG.info("%s/%s process %s" % (n+1, num, o))
            pep_file = os.path.join(pep, "%s.pep.fasta" % o)
            keg_file = os.path.join(keg, "%s00001.keg" % o)

            if not (os.path.exists(pep_file) and os.path.exists(keg_file)):
                # the organism name was missing from this message before, so
                # the log showed a literal "%r"
                LOG.warning("%r has no .keg or .pep.fasta" % o)
                continue

            with open(pep_file) as fh:
                pep_out.write(fh.read())

            for k, v in read_org_ko(keg_file).items():
                # proteins without a KO may carry an empty KO string; drop it
                # so that they are written as "-" rather than ""
                kos = [i for i in v["ko"] if i]
                ko = ";".join(kos) if kos else "-"

                ko_out.write("%s\t%s\t%s\n" % (k, ko, ";".join(v["path"])))

    return 0
def html2org(url="http://www.kegg.jp/kegg/catalog/org_list.html"):
    """
    request KEGG organism url and get the organism abbr., name and download url

    The page is scanned line by line: a line containing "show_organism?org="
    starts a new organism, the line right after it holds the name, and the
    first following line containing "ftp://" holds the download link.

    :param url: the url of KEGG organism, default is http://www.kegg.jp/kegg/catalog/org_list.html
    :return: dict {org.abbr: [name, url]}, url is "" when no ftp link was found
    """

    r = {}

    LOG.info("open url %r to get KEGG org list" % url)

    org = name = link = ""
    # parser state: 0 = waiting for an organism row, 1 = next line is the
    # organism name, 2 = waiting for the ftp link of the current organism
    n = 0

    # the with-block closes the connection once the page has been read
    with urllib.request.urlopen(url) as file:

        for line in file:
            line = line.decode("utf-8").strip()

            if "show_organism?org=" in line:

                if org:
                    r[org] = [name, link]

                # NOTE(review): the separator was an empty string, which makes
                # str.split raise ValueError; the text of an anchor ends at
                # its closing tag, so "</a>" is assumed to be the intended one
                org = line.split("</a>")[0].split("'>")[-1]
                name = link = ""
                n = 1
                continue

            if n == 1:
                name = line.split("</a>")[0].split("'>")[-1]
                n = 2
                continue

            if n == 2:
                if "ftp://" not in line:
                    continue
                link = line.split("href='")[-1].split("'>")[0]
                n = 0

    # the last organism has no following row to flush it
    if org:
        r[org] = [name, link]

    LOG.info("get %s records from KEGG org" % len(r))

    return r
67 | 68 | version: %s 69 | contact: %s <%s>\ 70 | """ % (__version__, " ".join(__author__), __email__)) 71 | 72 | args.add_argument("--url", default="http://www.kegg.jp/kegg/catalog/org_list.html", 73 | help="KEGG organism url (default: http://www.kegg.jp/kegg/catalog/org_list.html)") 74 | args.add_argument("--out", metavar="FILE", 75 | default="KEGG.org", help="output filename (default: KEGG.org)") 76 | 77 | return args.parse_args() 78 | 79 | 80 | def main(): 81 | 82 | logging.basicConfig( 83 | stream=sys.stderr, 84 | level=logging.INFO, 85 | format="[%(levelname)s] %(message)s" 86 | ) 87 | 88 | args = set_args() 89 | org_dict = html2org(args.url) 90 | 91 | LOG.info("output records to %s" % args.out) 92 | 93 | with open(args.out, "w") as fh: 94 | for k, v in sorted(org_dict.items()): 95 | fh.write("%s\t%s\t%s\n" % (k, v[0], v[1])) 96 | 97 | 98 | if __name__ == "__main__": 99 | main() 100 | 101 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # KEGGTools 2 | A collection of tools to download and process KEGG database 3 | ## Introduction 4 | KEGG is an important database to research on the function of proteins and metabolic pathways of certain organism, but the sequences in KEGG databases can not be downloaded free. KEGGTools is developed to solve this problem. 5 | ### Requirements 6 | * Python 3.5+ 7 | ### Install 8 | to git 9 | ``` 10 | git clone https://github.com/FlyPythons/KEGGTools.git 11 | ``` 12 | or download 13 | ``` 14 | wget https://github.com/FlyPythons/KEGGTools/archive/master.zip 15 | unzip mater.zip 16 | ``` 17 | ## Examples 18 | ### Download KEGG database 19 | 1. Download information of KEGG organisms from KEGG 20 | ``` 21 | python3 download_organism.py --url http://www.kegg.jp/kegg/catalog/org_list.html --out KEGG.org 22 | ``` 23 | This will get 5426 KEGG organisms from KEGG-Genome. 24 | 25 | 2. 
Download protein sequences from NCBI by the urls in 'KEGG.org' 26 | ``` 27 | python3 download_proteins.py --org KEGG.org --out NCBI-proteins --concurrent 2 28 | ``` 29 | This will get 5419 gzip formatted protein sequences of KEGG organisms from NCBI. The other 7 organisms are not from NCBI, they are "bpg dosa lem lja pfd pfh smin" 30 | 3. Download KO information from KEGG 31 | ``` 32 | python3 download_ko.py --org KEGG.org --out KEGG-KO --concurrent 10 33 | ``` 34 | This will get 5394 keg formatted files containing the KO information of KEGG organisms. The other 32 organisms have no KO information in KEGG, they are "ebc pcd apor pgz vta cola haf mii aea nmj bgm aon kso zpa afq amih ypac mee msao dpc rhq dlu cgrn sfk actt pbf kst vbh fmo ful pbp dod " 35 | 4. Get the proteins included in the KO files from the proteins downloaded from NCBI 36 | ``` 37 | python3 process_proteins.py --org KEGG.org --keg KEGG-KO --pep NCBI-proteins --out KO-proteins 38 | ``` 39 | This will get protein sequences of 5381 organisms. 13 organisms have no matching ids between KEGG-KO and NCBI-proteins, they are "agl cpor pary smiz pshi tng vrm dpl dco hlc ecor nwe xph"; 32 organisms have no KO information in KEGG, they are listed in step 3. 40 | So finally, we have a KEGG database consisting of 5381 organisms, which we can use to do KEGG annotation. 41 | ### Process the downloaded KEGG database 42 | * Get the NCBI Taxonomy ranks of KEGG organisms 43 | ``` 44 | python3 get_ranks.py --keg br08610.keg --taxon taxonomy.ranks --out KEGG.ranks 45 | ``` 46 | This will find 4715 Bacteria, 442 Eukaryota, 269 Archaea in KEGG organisms. 47 | * Extract the information of the KEGG organisms you want to build a database from 48 | ``` 49 | python3 makedb.py --org human.org --keg KEGG-KO --pep KO-proteins --out human 50 | ``` 51 | This will create 2 files: a protein fasta file ("human.pep.fasta") and a table of each protein's KO and pathway IDs ("human.pep2ko.txt"). 
def download(org, status, output_dir, delay=2,
             url="http://www.kegg.jp/kegg-bin/download_htext?htext=%s&format=htext&filedir="):
    """
    download .keg file contains KO information from KEGG
    :param org: KEGG organism abbr.
    :param status: the download status
    :param output_dir: output directory
    :param delay: seconds to wait before sending the request (default: 2)
    :param url: url template, "%s" is replaced by "{org}00001.keg"
    :return: 0 if the file was saved or already exists, org if KEGG returned nothing
    """

    LOG.info("%s processing %s" % (status, org))
    keg_name = org + "00001.keg"
    out_file = os.path.join(output_dir, keg_name)

    if os.path.exists(out_file):
        LOG.info("%s has been downloaded before, skip" % out_file)
        return 0

    # wait only when a request is really going to be sent
    time.sleep(delay)

    # read the response once: a second read() on the same response returns
    # nothing, which is why the saved files used to be empty
    with urllib.request.urlopen(url % keg_name) as response:
        content = response.read()

    if not content:
        LOG.warning("%s has no KO file" % org)
        return org

    with open(out_file, "wb") as out:
        out.write(content)

    return 0


def download_ko(orgs, output_dir, concurrent=1):
    """
    download the .keg files of all organisms with a pool of processes
    :param orgs: list of KEGG organism abbr.
    :param output_dir: output directory, created if missing
    :param concurrent: number of download processes
    :return: 0
    """

    os.makedirs(output_dir, exist_ok=True)

    pool = Pool(processes=concurrent)
    num = len(orgs)

    results = [
        pool.apply_async(download, (org, "%s/%s" % (n + 1, num), output_dir))
        for n, org in enumerate(orgs)
    ]

    pool.close()
    pool.join()

    # download() returns 0 on success and the organism abbr. on failure
    fail = [i for i in (result.get() for result in results) if i]
    # report the number of failures; the number of successes was shown here
    LOG.info("%s records, %s failed! Here are they!" % (len(orgs), len(fail)))

    print("\n".join(fail))

    return 0
def read_kegg_org(file):
    """
    read br08610.keg

    A line containing "TAX:" sets the current taxon id and remembers its
    level letter. A following line exactly one level deeper is taken as an
    organism of that taxon; any other line without "TAX:" clears the current
    taxon.

    :param file: the htext file of KEGG Organisms in the NCBI Taxonomy
    :return: dict {org.abbr: taxon_id}
    """
    r = {}

    taxon = ""
    level = "A"
    levels = {k: n for n, k in enumerate(string.ascii_uppercase)}

    with open(file) as fh:

        for line in fh:
            line = line.strip()

            # blank lines have no tag character to look at
            if not line:
                continue

            tag = line[0]

            if tag not in levels:
                continue

            if "TAX:" in line:
                taxon = line.split("TAX:")[-1].split("]")[0]
                level = tag
                continue

            if levels[tag] - levels[level] == 1:
                fields = line.split()

                # a line holding only its tag has no organism column
                if taxon and len(fields) > 1:
                    org = fields[1]

                    if not org.isdigit():
                        r[org] = taxon
            else:
                taxon = ""

    return r


def read_taxon(file):
    """
    read NCBI taxonomy ranks
    :param file: text file, taxon_id in the first column, lines start with "#" are comments
    :return: dict {taxon_id: whole line without the trailing newline}
    """

    r = {}

    with open(file) as fh:

        for line in fh:

            if line.startswith("#"):
                continue

            line = line.rstrip("\n")
            fields = line.split()

            # blank lines carry no taxon id
            if not fields:
                continue

            r[fields[0]] = line

    return r


def org2taxon(org, taxon):
    """
    get the NCBI taxonomy ranks of KEGG organisms
    :param org: br08610.keg, see read_kegg_org
    :param taxon: NCBI taxonomy ranks file, see read_taxon
    :return: dict {org.abbr: rank line}, organisms whose taxon id is not in the ranks file are left out
    """

    r = {}

    LOG.info("reading KEGG Organisms taxon from %r" % org)
    org = read_kegg_org(org)
    LOG.info("reading NCBI taxon ranks from %r" % taxon)
    taxon = read_taxon(taxon)

    LOG.info("process KEGG Organisms ranks")
    for o, t in org.items():

        if t in taxon:
            r[o] = taxon[t]
        else:
            LOG.info("taxon_id %r not in taxon file" % t)

    return r
def download(org, status, output_dir, delay=2):
    """
    download proteins from NCBI according to KEGG organism url
    :param org: [organism abbr., organism name, url]
    :param status: download status, m/n
    :param output_dir: output directory
    :param delay: seconds to wait before sending the request (default: 2)
    :return: 0 if the file was saved or already exists, organism abbr. if the download failed
    """

    o, name, url = org
    out_file = os.path.join(output_dir, "%s.pep.fasta.gz" % o)

    # check first: the file used to be fetched even when it was not going to
    # be written
    if os.path.exists(out_file):
        LOG.info("%s %s has been downloaded before, skip" % (status, o))
        return 0

    time.sleep(delay)

    LOG.info("%s get %s proteins from %r" % (status, o, url))
    # a trailing "/" would leave the assembly name empty
    base = url.rstrip("/")
    url = "%s/%s_translated_cds.faa.gz" % (base, base.split("/")[-1])

    try:
        with urllib.request.urlopen(url) as response:
            content = response.read()
    except OSError as e:
        # urllib's URLError/HTTPError are OSError subclasses; report the
        # organism as failed instead of letting the exception escape the worker
        LOG.warning("%s failed to download %r: %s" % (o, url, e))
        return o

    # written only after the whole file has been received
    with open(out_file, "wb") as out:
        out.write(content)

    return 0


def get_proteins(orgs, output_dir, concurrent=1):
    """
    download proteins from NCBI use multiprocessing
    :param orgs: org list read from .org
    :param output_dir: output directory, created if missing
    :param concurrent: max concurrent process to download
    :return: 0
    """
    os.makedirs(output_dir, exist_ok=True)

    pool = Pool(processes=concurrent)
    num = len(orgs)

    results = [
        pool.apply_async(download, (org, "%s/%s" % (n + 1, num), output_dir))
        for n, org in enumerate(orgs)
    ]

    pool.close()
    pool.join()

    returns = [result.get() for result in results]

    # download() returns 0 on success and the organism abbr. on failure
    fail = [i for i in returns if i != 0]
    LOG.info("%s success, %s failed" % (len(orgs)-len(fail), len(fail)))

    if fail:
        LOG.warning("failed organisms: %s" % " ".join(fail))

    return 0
def output_keg(keg, path_dict, output):
    """
    output .keg by kegg annotation result

    Lines of the reference file are copied as they are, except "D" lines:
    a "D" line is replaced by one line per annotated protein of its KO in the
    current pathway, and dropped when the pathway or the KO has no protein.

    :param keg: ko00001.keg
    :param path_dict: see function cluster_protein
    :param output: output file
    :return: 0
    """
    path_id = ""

    LOG.info("output kegg map to '%r'" % output)

    # both handles are closed (and the output flushed) when the block exits
    with open(keg) as reference, open(output, "w") as fh:

        for line in reference:
            line = line.strip()

            if not line:
                continue

            tag = line[0]

            if tag == "C":
                # the pathway of the "D" lines that follow
                path_id = "ko" + line.split()[1]
                fh.write("%s\n" % line)
            elif tag == "D":

                if path_id not in path_dict:
                    continue

                mess = line.split()
                ko = mess[1]
                name = " ".join(mess[2:])

                for p in path_dict[path_id].get(ko, ()):

                    if ko == "-":
                        fh.write("D %s\t\n" % p)
                    else:
                        fh.write("D %s\t%s %s\n" % (p, ko, name))
            else:
                fh.write("%s\n" % line)

    return 0
open_fasta 11 | 12 | 13 | LOG = logging.getLogger(__name__) 14 | 15 | __all__ = [] 16 | 17 | 18 | def process_protein(org, keg, pep, out): 19 | """ 20 | get the protein seq of with ko 21 | :param org: the organism abbr. 22 | :param keg: directory contains org.keg 23 | :param pep: directory contains org.pep.fasta.gz 24 | :param out: output directory 25 | :return: 26 | """ 27 | 28 | r = [] 29 | keg_name = os.path.join(keg, "%s00001.keg" % org) 30 | pep_name = os.path.join(pep, "%s.pep.fasta.gz" % org) 31 | 32 | if not os.path.exists(keg_name): 33 | LOG.info("keg %r not exists, skip" % keg_name) 34 | return "%s\tno keg" % org 35 | 36 | if not os.path.exists(pep_name): 37 | LOG.info("pep %r not exists, skip" % pep_name) 38 | return "%s\tno protein" % org 39 | 40 | gene_dict = read_org_ko(keg_name) 41 | 42 | if not gene_dict: 43 | LOG.info("keg %r is empty, skip" % keg_name) 44 | return "%s\t keg is empty" % org 45 | 46 | records = [] 47 | 48 | for record in open_fasta(pep_name): 49 | name = record.name 50 | 51 | # gene_id is in db_xref or locus_tag 52 | 53 | if "locus_tag=" in name: 54 | id = name.split("locus_tag=")[1].split("]")[0] 55 | elif "db_xref=GeneID:" in name: 56 | id = name.split("db_xref=GeneID:")[1].split("]")[0] 57 | elif "protein_id=" in name: 58 | id = name.split("protein_id=")[1].split("]")[0].split(".")[0] 59 | else: 60 | continue 61 | 62 | if id in gene_dict: 63 | r.append(id) 64 | 65 | records.append(">%s:%s\n%s\n" % (org, id, record.seq)) 66 | 67 | if r: 68 | with open(os.path.join(out, "%s.pep.fasta" % org), "w") as fh: 69 | fh.write("".join(records)) 70 | return 0 71 | else: 72 | return "%s\tpep not match with keg" % org 73 | 74 | 75 | def process_proteins(orgs, keg, pep, out): 76 | """ 77 | get the protein seq of ids in keg from pep 78 | :param keg: 79 | :param pep: 80 | :param out: 81 | :return: 82 | """ 83 | 84 | num = len(orgs) 85 | 86 | for n, org in enumerate(orgs): 87 | LOG.info("%s/%s process %s" % (n+1, num, org)) 88 | 
def split_header(name):
    """
    split fasta header to id and description
    :param name: the header line without the leading ">"
    :return: [id, description], description is "" when the header has no whitespace
    """
    parts = name.split(None, 1)

    if len(parts) == 1:
        parts.append("")

    return parts


class FastaRecord(object):
    """
    object to process a fasta record
    """
    DELIMITER = ">"

    def __init__(self, name, seq):
        # checked with explicit raises: assert statements disappear when
        # python runs with -O
        if "\n" in name or "\n" in seq or self.DELIMITER in seq:
            raise ValueError("Invalid FASTA record data")

        self._name = name
        self._seq = seq
        self._id, self._description = split_header(name)

    @property
    def name(self):
        """
        the name of the seq, strings after ">"
        """
        return self._name

    @property
    def id(self):
        """
        The id of the seq, equal to the FASTA header
        up to the first whitespace.
        """
        return self._id

    @property
    def description(self):
        """
        The description of the seq in the FASTA file, equal to
        the contents of the FASTA header following the first whitespace
        """
        return self._description

    @property
    def seq(self):
        """
        The seq of the record

        """
        return self._seq

    @property
    def length(self):
        """
        the length of the seq
        """
        return len(self._seq)

    @classmethod
    def from_string(cls, string):
        """
        Interprets a string as a FASTA record. Does not make any
        assumptions about wrapping of the seq string.

        :raises ValueError: if the string is not a ">" header line followed by
            at least one seq line
        """
        lines = string.strip().splitlines()

        if len(lines) < 2 or not lines[0].startswith(cls.DELIMITER):
            raise ValueError("String not recognized as a valid FASTA record")

        return cls(lines[0][1:], "".join(lines[1:]))

    def __str__(self):
        """
        str conversion
        :return: the record as ">name" and the seq on one line
        """
        return ">%s\n%s" % (self.name, self.seq)


def check_format(filename):
    """
    check the format of file
    :param filename: file name, must end with .fa, .fasta, .fa.gz or .fasta.gz
    :return: 0
    :raises ValueError: if the file name has another extension
    """
    allowed_format = [".fa", ".fasta", ".fa.gz", ".fasta.gz"]

    if filename.endswith(tuple(allowed_format)):
        return 0

    raise ValueError("file format is not in %s" % allowed_format)


def yield_fasta_records(stream):
    """
    yield fasta records from stream
    :param stream: an iterable of lines, text or utf-8 encoded bytes
    :return: generator of FastaRecord
    """
    lines = []

    for line in stream:

        # binary streams (e.g. gzip opened without "t") give bytes
        if isinstance(line, bytes):
            line = line.decode("utf-8")

        line = line.strip()

        if not line:
            continue

        # a new header closes the record collected so far
        if lines and line.startswith(">"):
            yield FastaRecord.from_string("\n".join(lines))
            lines = []

        lines.append(line)

    if lines:
        yield FastaRecord.from_string("\n".join(lines))


def _close_after(stream):
    """
    yield the fasta records of stream and close it when the iteration ends
    """
    with stream:
        for record in yield_fasta_records(stream):
            yield record


def open_fasta(filename):
    """
    read fasta file and return fasta records
    :param filename: fasta file, gzip compressed if it ends with ".gz"
    :return: generator of FastaRecord
    :raises ValueError: if the file extension is not allowed (see check_format)
    """
    check_format(filename)
    filename = abspath(expanduser(filename))

    # text mode for both: gzip.open defaults to binary, whose bytes lines
    # cannot be compared with ">"
    if filename.endswith(".gz"):
        stream = gzip.open(filename, "rt")
    else:
        stream = open(filename, "r")

    return _close_after(stream)


import argparse
import logging
import sys
from collections import OrderedDict

from common import __author__, __email__, __version__

LOG = logging.getLogger(__name__)

__all__ = []

# Colours for the top-level (A) categories, in order of appearance.  The
# palette is reused cyclically so that a .keg file with more top-level
# categories than colours still plots instead of raising IndexError.
_PALETTE = ("blue", "green", "red", "purple", "skyblue", "orange", "gray")


def stat_keg(keg):
    """
    read pathway information from .keg
    :param keg: .keg file
    :return: an OrderedDict {pathway_A: {pathway_B: [proteins]}}, keys in file order
    """

    r = OrderedDict()
    LOG.info("reading kegg map from %r", keg)

    path1 = ""
    path2 = ""

    # "with" guarantees the handle is closed even if a malformed line raises.
    with open(keg) as handle:
        for line in handle:
            line = line.strip()

            if not line:
                continue

            tag = line[0]

            # NOTE(review): the fixed offsets below presumably strip
            # "<b>"/"</b>" markup around the category names -- confirm
            # against the script that writes the .keg file.
            if tag == "A":
                path1 = line[4:-4]
                r[path1] = OrderedDict()
            elif tag == "B":
                path2 = line[6:-4]
                r[path1][path2] = []
            elif tag == "D":
                # second whitespace-separated token is the protein/gene id
                r[path1][path2].append(line.split()[1])

    return r


def _layout_rows(keg_dict):
    """
    flatten the nested pathway dict into one entry per bar of the plot
    :param keg_dict: see stat_keg
    :return: list of (label, gene_count, bar_color, text_color, is_header)
    """
    rows = []

    for lv, path1 in enumerate(keg_dict):
        shade = _PALETTE[lv % len(_PALETTE)]
        # a top-level category is an invisible (white, zero-length) bar that
        # only reserves a row for its coloured label
        rows.append((path1, 0, "white", shade, True))

        for path2 in keg_dict[path1]:
            num = len(set(keg_dict[path1][path2]))

            # second-level categories without any gene are not drawn
            if not num:
                continue

            rows.append((path2, num, shade, shade, False))

    return rows


def plot_keg(keg_dict, out):
    """
    plot function
    :param keg_dict: see stat_keg
    :param out: output filename
    :return: 0
    """
    rows = _layout_rows(keg_dict)

    x = list(range(1, len(rows) + 1))
    y = [row[1] for row in rows]
    colors = [row[2] for row in rows]

    # An empty dict (or one holding only empty categories) must not crash
    # max() or produce a zero-width x axis, so fall back to a unit range.
    peak = max(y) if y else 0
    y_max = (peak or 1) * 1.1

    if not rows:
        LOG.warning("no pathway found, the plot will be empty")

    LOG.info("plot KEGG annotation result to %r", out)
    # imported lazily so the module can be imported without matplotlib
    from matplotlib import pyplot as plt

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)

    for n, (label, num, _, text_color, is_header) in zip(x, rows):

        if is_header:
            ax.text(y_max / -1.7, n, label, fontsize=8, verticalalignment='center',
                    horizontalalignment='left', family="Arial", color=text_color)
            continue

        # gene count at the tip of the bar, category name in the left margin
        ax.text(num, n, str(num), fontsize=8, verticalalignment='center', family="Arial",)
        ax.text(y_max / -1.8, n, label, fontsize=8, verticalalignment='center',
                horizontalalignment='left', family="Arial", color=text_color)

    if rows:
        ax.barh(x, y, color=colors, alpha=0.5)

    ax.set_xlim([y_max / -100, y_max])
    ax.set_ylim([0, ax.get_ylim()[1]])
    plt.xticks(fontsize=8, family="Arial",)
    ax.set_yticks([])
    plt.subplots_adjust(top=0.95, left=0.35, right=0.95, bottom=0.05)
    plt.xlabel("Number of Genes", fontsize=10, family="Arial",)

    # first row at the top of the figure
    ax.invert_yaxis()

    try:
        plt.savefig(out)
    finally:
        # release the figure; pyplot otherwise keeps it alive
        plt.close(fig)

    return 0


def set_args():
    """
    build the command line parser and parse sys.argv
    :return: the parsed arguments (attributes: keg, out)
    """

    args = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                   description="""
plot kegg annotation from .keg

version: %s
contact: %s <%s>\
""" % (__version__, " ".join(__author__), __email__))

    args.add_argument("--keg", metavar="FILE", required=True,
                      help="KO file named '*.keg', can be make by make_keg.py")
    args.add_argument("--out", metavar="STR", default="out", help="output prefix (default: out)")

    return args.parse_args()


def main():
    """
    command line entry point: read the .keg file and write <out>.pdf
    """

    logging.basicConfig(
        stream=sys.stderr,
        level=logging.INFO,
        format="[%(levelname)s] %(message)s"
    )

    args = set_args()
    keg_dict = stat_keg(args.keg)
    plot_keg(keg_dict, args.out+".pdf")


if __name__ == "__main__":
    main()