├── examples
    ├── human.pdf
    ├── human.png
    └── human.org
├── LICENSE
├── common.py
├── makedb.py
├── download_organism.py
├── README.md
├── download_ko.py
├── get_ranks.py
├── download_proteins.py
├── make_keg.py
├── process_proteins.py
├── FastaReader.py
└── plot_keg.py


/examples/human.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FlyPythons/KEGGTools/HEAD/examples/human.pdf


--------------------------------------------------------------------------------
/examples/human.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FlyPythons/KEGGTools/HEAD/examples/human.png


--------------------------------------------------------------------------------
/examples/human.org:
--------------------------------------------------------------------------------
1 | hsa 9606	Eukaryota	Metazoa	Chordata	Mammalia	Primates	Hominidae	Homo	Homo sapiens


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Junpeng Fan
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/common.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import logging
 3 | 
 4 | 
# module-level logger used by the reader helpers defined in this file
LOG = logging.getLogger(__name__)

# package metadata; the command line scripts import these names to build
# the "version" / "contact" lines of their help text
__version__ = "0.1.0"
__author__ = ("Junpeng Fan",)
__email__ = "jpfan@whu.edu.cn"
10 | 
11 | 
def read_org(file):
    """
    Read a tab-separated .org file (e.g. the one written by
    download_organism.py).

    Blank lines are skipped; every other line is stripped and split on tabs.

    :param file: path of the .org file
    :return: a list with one entry per non-empty line, each entry being the
             list of tab-separated fields, e.g. [org abbr., name, url]
    """

    LOG.info("get organism infomation from %r" % file)

    # the context manager closes the handle even when reading fails
    with open(file) as fh:
        stripped = (line.strip() for line in fh)
        r = [line.split("\t") for line in stripped if line]

    LOG.info("get %s records" % len(r))

    return r
31 | 
32 | 
def read_org_ko(file):
    """
    Read a KEGG organism KO .keg file (such as hsa00001.keg).

    Lines tagged "C" set the current pathway id, lines tagged "D" describe a
    gene; the text after the tab of a "D" line starts with the KO of the gene.
    All other lines are ignored.

    :param file: file name
    :return: dict {gene_id: {"ko": [...], "path": [...]}}; a gene that has
             no KO gets an empty "ko" list (callers print it as "-")
    """

    r = {}

    path_id = ""

    # the context manager closes the handle even when parsing fails
    with open(file) as fh:
        for n, line in enumerate(fh, 1):
            line = line.strip()

            if not line:
                continue

            tag = line[0]

            if tag == "C":
                # takes the 5 characters before the closing "]" of the line;
                # assumes the line ends with "[PATH:xxxNNNNN]" - TODO confirm
                path_id = "ko" + line[-6:-1]
                continue

            if tag != "D":
                continue

            tmp = line.split("\t")
            fields = tmp[0].split()

            if len(fields) < 2:
                # a bare "D" line carries no gene id, nothing to record
                continue

            gene = fields[1]

            if len(tmp) == 2:
                ko = tmp[1].split()[0]
            else:
                LOG.warning("line %s: %r has no ko" % (n, line))
                ko = ""

            entry = r.setdefault(gene, {"ko": [], "path": []})

            # an empty KO is not stored: [""] is truthy and would hide the
            # "gene has no KO" case from callers testing the list
            if ko and ko not in entry["ko"]:
                entry["ko"].append(ko)

            if path_id not in entry["path"]:
                entry["path"].append(path_id)

    return r
78 | 


--------------------------------------------------------------------------------
/makedb.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import os.path
 5 | import sys
 6 | import argparse
 7 | import logging
 8 | 
 9 | from common import read_org, read_org_ko, __email__, __version__, __author__
10 | 
11 | 
12 | LOG = logging.getLogger(__name__)
13 | 
14 | 
15 | __all__ = []
16 | 
17 | 
def cat_proteins(org, pep, keg, out):
    """
    Concatenate the proteins and KO tables of the requested organisms.

    For every organism that has both "{org}.pep.fasta" in `pep` and
    "{org}00001.keg" in `keg`, the fasta content is appended to
    "{out}.pep.fasta" and one "gene<TAB>ko<TAB>path" line per gene is
    appended to "{out}.pep2ko.txt". Several KOs or pathways are joined with
    ";", a gene without KO gets "-".

    :param org: list of KEGG organism abbr. (duplicates are processed once)
    :param pep: directory contains {org}.pep.fasta
    :param keg: directory contains {org}00001.keg
    :param out: output prefix
    :return: 0
    """

    # drop duplicates but keep the input order, so the output is reproducible
    # and the progress total matches the number of organisms really processed
    seen = set()
    orgs = []

    for o in org:
        if o not in seen:
            seen.add(o)
            orgs.append(o)

    num = len(orgs)

    with open(out+".pep.fasta", "w") as pep_out, open(out+".pep2ko.txt", "w") as ko_out:

        for n, o in enumerate(orgs):
            LOG.info("%s/%s process %s" % (n+1, num, o))
            pep_file = os.path.join(pep, "%s.pep.fasta" % o)
            keg_file = os.path.join(keg, "%s00001.keg" % o)

            if not (os.path.exists(pep_file) and os.path.exists(keg_file)):
                LOG.warning("%r has no .keg or .pep.fasta" % o)
                continue

            # stream the fasta instead of loading the whole file in memory
            with open(pep_file) as fh:
                pep_out.writelines(fh)

            for k, v in read_org_ko(keg_file).items():
                # ignore empty KO entries so that "no KO" is written as "-"
                kos = [i for i in v["ko"] if i]
                ko = ";".join(kos) if kos else "-"

                ko_out.write("%s\t%s\t%s\n" % (k, ko, ";".join(v["path"])))

    return 0
46 | 
47 | 
def set_args():
    """
    Define and parse the command line options of makedb.py.

    :return: the parsed options with attributes org, keg, pep and out
    """

    args = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                   description="""
extract proteins of KEGG organisms you wanted to make database

version: %s
contact: %s <%s>\
    """ % (__version__, " ".join(__author__), __email__))

    args.add_argument("--org", metavar="FILE", required=True,
                      help="a list of KEGG organism abbr. at the first column")
    args.add_argument("--keg", metavar="DIR", required=True,
                      help="directory contains {org}00001.keg")
    # cat_proteins() opens the uncompressed "{org}.pep.fasta", so the help
    # must not advertise the gzip files downloaded from NCBI
    args.add_argument("--pep", metavar="DIR", required=True,
                      help="directory contains {org}.pep.fasta")
    args.add_argument("--out", metavar="STR", default="kegg", help="output prefix (default: kegg)")

    return args.parse_args()
67 | 
68 | 
def main():
    """
    Entry point of makedb.py: configure logging, parse the options, read the
    organism list and build the concatenated protein / KO files.
    """

    logging.basicConfig(
        format="[%(levelname)s] %(message)s",
        level=logging.INFO,
        stream=sys.stderr
    )

    options = set_args()

    # only the first column (organism abbr.) of every record is needed
    abbrs = [record[0] for record in read_org(options.org)]

    cat_proteins(abbrs, options.pep, options.keg, options.out)
82 | 
83 | 
84 | if __name__ == "__main__":
85 |     main()
86 | 
87 | 


--------------------------------------------------------------------------------
/download_organism.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import sys
  5 | import logging
  6 | import urllib.request
  7 | import argparse
  8 | 
  9 | from common import __author__, __version__, __email__
 10 | 
 11 | 
 12 | LOG = logging.getLogger(__name__)
 13 | 
 14 | 
 15 | def html2org(url="http://www.kegg.jp/kegg/catalog/org_list.html"):
 16 |     """
 17 |     request KEGG organism url and get the organism abb. name and download url
 18 |     :param url: the url of KEGG organism, default is http://www.kegg.jp/kegg/catalog/org_list.html
 19 |     :param out: the output
 20 |     :return: dict contain org information
 21 |     """
 22 | 
 23 |     r = {}
 24 | 
 25 |     LOG.info("open url %r to get KEGG org list" % url)
 26 |     file = urllib.request.urlopen(url)
 27 | 
 28 |     org = name = link = ""
 29 |     n = 0
 30 | 
 31 |     for line in file:
 32 |         line = line.decode("utf-8").strip()
 33 | 
 34 |         if "show_organism?org=" in line:
 35 | 
 36 |             if org:
 37 |                 r[org] = [name, link]
 38 | 
 39 |             org = line.split("</a>")[0].split("'>")[-1]
 40 |             name = link = ""
 41 |             n = 1
 42 |             continue
 43 | 
 44 |         if n == 1:
 45 |             name = line.split("</a>")[0].split("'>")[-1]
 46 |             n = 2
 47 |             continue
 48 |         if n == 2:
 49 |             if "ftp://" not in line:
 50 |                 continue
 51 |             link = line.split("href='")[-1].split("'>")[0]
 52 |             n = 0
 53 | 
 54 |     if org:
 55 |         r[org] = [name, link]
 56 | 
 57 |     LOG.info("get %s records from KEGG org" % len(r))
 58 | 
 59 |     return r
 60 | 
 61 | 
 62 | def set_args():
 63 | 
 64 |     args = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
 65 |                                    description="""
 66 | download KEGG organism information from KEGG website.
 67 | 
 68 | version: %s
 69 | contact: %s <%s>\
 70 | """ % (__version__, " ".join(__author__), __email__))
 71 | 
 72 |     args.add_argument("--url", default="http://www.kegg.jp/kegg/catalog/org_list.html",
 73 |                       help="KEGG organism url (default: http://www.kegg.jp/kegg/catalog/org_list.html)")
 74 |     args.add_argument("--out", metavar="FILE",
 75 |                       default="KEGG.org", help="output filename (default: KEGG.org)")
 76 | 
 77 |     return args.parse_args()
 78 | 
 79 | 
 80 | def main():
 81 | 
 82 |     logging.basicConfig(
 83 |         stream=sys.stderr,
 84 |         level=logging.INFO,
 85 |         format="[%(levelname)s] %(message)s"
 86 |     )
 87 | 
 88 |     args = set_args()
 89 |     org_dict = html2org(args.url)
 90 | 
 91 |     LOG.info("output records to %s" % args.out)
 92 | 
 93 |     with open(args.out, "w") as fh:
 94 |         for k, v in sorted(org_dict.items()):
 95 |             fh.write("%s\t%s\t%s\n" % (k, v[0], v[1]))
 96 | 
 97 | 
 98 | if __name__ == "__main__":
 99 |     main()
100 | 
101 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # KEGGTools
 2 | A collection of tools to download and process KEGG database
 3 | ## Introduction
 4 | KEGG is an important database for research on the functions of proteins and the metabolic pathways of a given organism, but the sequences in the KEGG databases cannot be downloaded for free. KEGGTools was developed to solve this problem.
 5 | ### Requirements
 6 | * Python 3.5+
 7 | ### Install
 8 | to git  
 9 | ```
10 | git clone https://github.com/FlyPythons/KEGGTools.git
11 | ```
12 | or download
13 | ```
14 | wget https://github.com/FlyPythons/KEGGTools/archive/master.zip
15 | unzip master.zip
16 | ```
17 | ## Examples
18 | ### Download KEGG database
19 | 1. Download information of KEGG organisms from KEGG   
20 | ```
21 | python3 download_organism.py --url http://www.kegg.jp/kegg/catalog/org_list.html --out KEGG.org
22 | ```
23 | This will get 5426 KEGG organisms from KEGG-Genome.  
24 | 
25 | 2. Download protein sequences from NCBI by the urls in 'KEGG.org'
26 | ```
27 | python3 download_proteins.py --org KEGG.org --out NCBI-proteins --concurrent 2
28 | ```
29 | This will get 5419 gzip-formatted protein sequence files of KEGG organisms from NCBI. The other 7 organisms are not from NCBI; they are "bpg dosa lem lja pfd pfh smin".  
30 | 3. Download KO information from KEGG
31 | ```
32 | python3 download_ko.py --org KEGG.org --out KEGG-KO --concurrent 10
33 | ```
34 | This will get 5394 keg-formatted files containing the KO information of KEGG organisms. The other 32 organisms have no KO information in KEGG; they are "ebc pcd apor pgz vta cola haf mii aea nmj bgm aon kso zpa afq amih ypac mee msao dpc rhq dlu cgrn sfk actt pbf kst vbh fmo ful pbp dod".  
35 | 4. Get proteins included in KO files from NCBI download proteins  
36 | ```
37 | python3 process_proteins.py --org KEGG.org --keg KEGG-KO --pep NCBI-proteins --out KO-proteins
38 | ```
39 | This will get the protein sequences of 5381 organisms. 13 organisms have no matching ids between KEGG-KO and NCBI-proteins; they are "agl cpor pary smiz pshi tng vrm dpl dco hlc ecor nwe xph". 32 organisms have no KO information in KEGG; they are listed in step 3.  
40 | So finally, we have a KEGG database consisting of 5381 organisms, which we can use for KEGG annotation. 
41 | ### Process KEGG database downloaded
42 | * Get the NCBI Taxonomy ranks of KEGG organisms
43 | ```
44 | python3 get_ranks.py --keg br08610.keg --taxon taxonomy.ranks --out KEGG.ranks
45 | ```
46 | This will find 4715 Bacteria, 442 Eukaryota, 269 Archaea in KEGG organisms.
47 | * Extract the information of KEGG organisms you wanted to make db
48 | ```
49 | python3 makedb.py --org human.org --keg KEGG-KO --pep KO-proteins --out human
50 | ```
51 | This will create 2 files: a protein fasta file ("human.pep.fasta") and a table of the KO and pathway IDs related to each protein ("human.pep2ko.txt").
52 | ### Plot KEGG annotation result
53 | Make a KEGG annotation result like "human.pep2ko.txt"  
54 | * Create KEGG pathway file ".keg"
55 | ```
56 | python3 make_keg.py --keg ko00001.keg --in human.pep2ko.txt --out human
57 | ```
58 | This will create a keg file named "human.keg"
59 | * Plot KEGG pathway file
60 | ```
61 | python3 plot_keg.py --keg human.keg --out human
62 | ```
63 | This will create a pdf file named "human.pdf"  
64 | ![image](https://github.com/FlyPythons/KEGGTools/raw/master/examples/human.png)


--------------------------------------------------------------------------------
/download_ko.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import urllib.request
  5 | import argparse
  6 | import os.path
  7 | import logging
  8 | import sys
  9 | import time
 10 | from multiprocessing import Pool
 11 | 
 12 | from common import read_org, __author__, __email__, __version__
 13 | 
 14 | 
 15 | LOG = logging.getLogger(__name__)
 16 | 
 17 | 
 18 | def download(org, status, output_dir):
 19 |     """
 20 |     download .keg file contains KO information from KEGG
 21 |     :param org: KEGG organism abbr.
 22 |     :param status: the download status
 23 |     :param output_dir: output directory
 24 |     :return: 0
 25 |     """
 26 | 
 27 |     time.sleep(2)
 28 | 
 29 |     LOG.info("%s processing %s" % (status, org))
 30 |     id = org + "00001.keg"
 31 |     out_file = os.path.join(output_dir, id)
 32 | 
 33 |     if os.path.exists(out_file):
 34 |         LOG.info("%s has been downloaded before, skip")
 35 |         return 0
 36 | 
 37 |     file = urllib.request.urlopen("http://www.kegg.jp/kegg-bin/download_htext?htext=%s&format=htext&filedir=" % id)
 38 | 
 39 |     if not file.read():
 40 |         LOG.warning("%s has no KO file" % org)
 41 |         return org
 42 |     else:
 43 |         with open(out_file, "wb") as out:
 44 |             out.write(file.read())
 45 | 
 46 |     return 0
 47 | 
 48 | 
 49 | def download_ko(orgs, output_dir, concurrent=1):
 50 | 
 51 |     if not os.path.exists(output_dir):
 52 |         os.makedirs(output_dir)
 53 | 
 54 |     pool = Pool(processes=concurrent)
 55 |     results = []
 56 |     num = len(orgs)
 57 | 
 58 |     for n, org in enumerate(orgs):
 59 | 
 60 |         index = "%s/%s" % (n + 1, num)
 61 |         results.append(pool.apply_async(download, (org, index, output_dir)))
 62 | 
 63 |     pool.close()
 64 |     pool.join()
 65 | 
 66 |     returns = []
 67 | 
 68 |     for i, result in enumerate(results):
 69 |         returns.append(result.get())
 70 | 
 71 |     fail = [i for i in returns if i]
 72 |     LOG.info("%s records, %s failed! Here are they!" % (len(orgs), len(orgs)-len(fail)))
 73 | 
 74 |     print("\n".join(fail))
 75 | 
 76 |     return 0
 77 | 
 78 | 
 79 | def set_args():
 80 | 
 81 |     args = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
 82 |                                    description="""
 83 | download all KEGG Orthology (KO) of KEGG organisms.
 84 | 
 85 | version: %s
 86 | contact: %s <%s>\
 87 |     """ % (__version__, " ".join(__author__), __email__))
 88 | 
 89 |     args.add_argument("--org", metavar="FILE", required=True,
 90 |                       help="a list of KEGG organism abbr. at the first column")
 91 |     args.add_argument("--out", metavar="DIR",
 92 |                       default=".", help="output directory (default: current directory)")
 93 |     args.add_argument("--concurrent", metavar="INT", type=int,
 94 |                       default=5, help="number of processes concurrent (default: 1)")
 95 | 
 96 |     return args.parse_args()
 97 | 
 98 | 
 99 | def main():
100 | 
101 |     logging.basicConfig(
102 |         stream=sys.stderr,
103 |         level=logging.INFO,
104 |         format="[%(levelname)s] %(message)s"
105 |     )
106 |     args = set_args()
107 | 
108 |     orgs = read_org(args.org)
109 |     LOG.info("download .keg from KEGG")
110 |     download_ko([i[0] for i in orgs], args.out, args.concurrent)
111 | 
112 | 
113 | if __name__ == "__main__":
114 |     main()
115 | 
116 | 


--------------------------------------------------------------------------------
/get_ranks.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import argparse
  5 | import logging
  6 | import string
  7 | import sys
  8 | 
  9 | from common import __version__, __email__, __author__
 10 | 
 11 | 
 12 | LOG = logging.getLogger(__name__)
 13 | __all__ = []
 14 | 
 15 | 
 16 | def read_kegg_org(file):
 17 |     """
 18 |     read br08610.keg
 19 |     :param file:
 20 |     :return:
 21 |     """
 22 |     r = {}
 23 | 
 24 |     taxon = ""
 25 |     level = "A"
 26 |     levels = {k: n for n, k in enumerate(string.ascii_uppercase)}
 27 | 
 28 |     for line in open(file):
 29 |         line = line.strip()
 30 | 
 31 |         tag = line[0]
 32 | 
 33 |         if tag not in levels:
 34 |             continue
 35 | 
 36 |         if "TAX:" in line:
 37 |             taxon = line.split("TAX:")[-1].split("]")[0]
 38 |             level = tag
 39 |             continue
 40 | 
 41 |         if levels[tag] - levels[level] == 1:
 42 |             if taxon:
 43 |                 # print("%s\t %s" % (line.split()[1], taxon))
 44 |                 org = line.split()[1]
 45 | 
 46 |                 if not org.isdigit():
 47 |                     r[org] = taxon
 48 |         else:
 49 |             taxon = ""
 50 | 
 51 |     return r
 52 | 
 53 | 
 54 | def read_taxon(file):
 55 | 
 56 |     r = {}
 57 | 
 58 |     for line in open(file):
 59 | 
 60 |         if line.startswith("#"):
 61 |             continue
 62 | 
 63 |         line = line.rstrip("\n")
 64 |         taxon_id = line.split()[0]
 65 |         r[taxon_id] = line
 66 | 
 67 |     return r
 68 | 
 69 | 
 70 | def org2taxon(org, taxon):
 71 | 
 72 |     r = {}
 73 | 
 74 |     LOG.info("reading KEGG Organisms taxon from %r" % org)
 75 |     org = read_kegg_org(org)
 76 |     LOG.info("reading NCBI taxon ranks from %r" % taxon)
 77 |     taxon = read_taxon(taxon)
 78 | 
 79 |     LOG.info("process KEGG Organisms ranks")
 80 |     for o, t in org.items():
 81 | 
 82 |         if t in taxon:
 83 |             r[o] = taxon[t]
 84 |         else:
 85 |             LOG.info("taxon_id %r not in taxon file" % t)
 86 | 
 87 |     return r
 88 | 
 89 | 
 90 | def set_args():
 91 |     args = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
 92 |                                    description="""
 93 | get KEGG organism classification by taxon id
 94 | 
 95 | version: %s
 96 | contact: %s <%s>\
 97 |     """ % (__version__, " ".join(__author__), __email__))
 98 | 
 99 |     args.add_argument("--keg", metavar="FILE", required=True,
100 |                       help="The htex file of KEGG Organisms in the NCBI Taxonomy, usually named as 'br08610.keg'")
101 |     args.add_argument("--taxon", metavar="FILE", required=True,
102 |                       help="NCBI Taxonomy file, taxon_id, rank information separated with tab")
103 |     args.add_argument("--out", metavar="FILE", default="KEGG.ranks", help="output file (default: KEGG.ranks)")
104 | 
105 |     return args.parse_args()
106 | 
107 | 
def main():
    """
    Entry point of get_ranks.py: join KEGG organisms with their ranks and
    write "abbr<TAB>ranks line" records sorted by abbr.
    """

    logging.basicConfig(
        format="[%(levelname)s] %(message)s",
        level=logging.INFO,
        stream=sys.stderr
    )

    options = set_args()

    ranks = org2taxon(options.keg, options.taxon)

    LOG.info("output result to %r" % options.out)

    with open(options.out, "w") as fh:
        fh.writelines(
            "%s\t%s\n" % (abbr, line) for abbr, line in sorted(ranks.items())
        )
126 | 
127 | 
128 | if __name__ == "__main__":
129 |     main()
130 | 
131 | 


--------------------------------------------------------------------------------
/download_proteins.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import urllib.request
  5 | import argparse
  6 | import os.path
  7 | import logging
  8 | import sys
  9 | import time
 10 | from multiprocessing import Pool
 11 | 
 12 | from common import read_org, __author__, __email__, __version__
 13 | 
 14 | 
 15 | LOG = logging.getLogger(__name__)
 16 | 
 17 | 
 18 | def download(org, status, output_dir):
 19 |     """
 20 |     download proteins from NCBI according to KEGG organism url
 21 |     :param org: [organism abbr., organism name, url]
 22 |     :param status: download status, m/n
 23 |     :param output_dir: output directory
 24 |     :return: 0
 25 |     """
 26 | 
 27 |     time.sleep(2)
 28 | 
 29 |     o, name, url = org
 30 |     LOG.info("%s get %s proteins from %r" % (status, o, url))
 31 |     url = "%s/%s_translated_cds.faa.gz" % (url, url.split("/")[-1])
 32 |     file = urllib.request.urlopen(url)
 33 |     out_file = os.path.join(output_dir, "%s.pep.fasta.gz" % o)
 34 | 
 35 |     if not os.path.exists(out_file):
 36 | 
 37 |         with open(out_file, "wb") as out:
 38 |             out.write(file.read())
 39 | 
 40 |     return 0
 41 | 
 42 | 
 43 | def get_proteins(orgs, output_dir, concurrent=1):
 44 |     """
 45 |     download proteins from NCBI use multiprocessing
 46 |     :param orgs: org list read from .org
 47 |     :param output_dir: output directory
 48 |     :param concurrent: max concurrent process to download
 49 |     :return: 0
 50 |     """
 51 |     if not os.path.exists(output_dir):
 52 |         os.makedirs(output_dir)
 53 | 
 54 |     pool = Pool(processes=concurrent)
 55 |     results = []
 56 |     num = len(orgs)
 57 | 
 58 |     for n, org in enumerate(orgs):
 59 | 
 60 |         index = "%s/%s" % (n + 1, num)
 61 |         results.append(pool.apply_async(download, (org, index, output_dir)))
 62 | 
 63 |     pool.close()
 64 |     pool.join()
 65 | 
 66 |     returns = []
 67 | 
 68 |     for i, result in enumerate(results):
 69 |         returns.append(result.get())
 70 | 
 71 |     fail = [i for i in returns if i != 0]
 72 |     LOG.info("%s success, %s failed" % (len(orgs)-len(fail), len(fail)))
 73 | 
 74 |     return 0
 75 | 
 76 | 
 77 | def set_args():
 78 | 
 79 |     args = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
 80 |                                    description="""
 81 | download protein sequences of KEGG organism from NCBI.
 82 | 
 83 | version: %s
 84 | contact: %s <%s>\
 85 |     """ % (__version__, " ".join(__author__), __email__))
 86 | 
 87 |     args.add_argument("--org", metavar="FILE", required=True, help=".org file created by download_organism.py")
 88 |     args.add_argument("--out", metavar="DIR", default=".", help="output directory (default: current directory)")
 89 |     args.add_argument("--concurrent", metavar="INT", type=int,
 90 |                       default=1, help="number of download processes concurrent (default: 1)")
 91 | 
 92 |     return args.parse_args()
 93 | 
 94 | 
 95 | def main():
 96 | 
 97 |     logging.basicConfig(
 98 |         stream=sys.stderr,
 99 |         level=logging.INFO,
100 |         format="[%(levelname)s] %(message)s"
101 |     )
102 |     args = set_args()
103 | 
104 |     orgs = read_org(args.org)
105 | 
106 |     allowed_orgs = []
107 | 
108 |     for org in orgs:
109 | 
110 |         if len(org) != 3:
111 |             LOG.info("record %r has no url, skip" % org)
112 |             continue
113 | 
114 |         o, name, url = org
115 | 
116 |         if "ftp.ncbi.nlm.nih.gov" not in url:
117 |             LOG.info("record %r not in NCBI, skip" % org)
118 |             continue
119 | 
120 |         allowed_orgs.append(org)
121 | 
122 |     LOG.info("%s records pass, downloading..." % len(allowed_orgs))
123 |     get_proteins(allowed_orgs, args.out, args.concurrent)
124 | 
125 | 
126 | if __name__ == "__main__":
127 |     main()
128 | 
129 | 


--------------------------------------------------------------------------------
/make_keg.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import sys
  5 | import argparse
  6 | import logging
  7 | 
  8 | from common import __author__, __email__, __version__
  9 | 
 10 | LOG = logging.getLogger(__name__)
 11 | 
 12 | __all__ = []
 13 | 
 14 | 
 15 | def read_tbl(file):
 16 |     """
 17 |     read table
 18 |     :param file:
 19 |     :return:
 20 |     """
 21 | 
 22 |     for line in open(file):
 23 |         line = line.strip()
 24 | 
 25 |         if line.startswith("#") or not line:
 26 |             continue
 27 | 
 28 |         yield line.split("\t")
 29 | 
 30 | 
 31 | def cluster_protein(file):
 32 |     """
 33 |     cluster protein by pathway and ko
 34 |     :param file: kegg annotation result consist protein id, ko and pathways joined with "\t"
 35 |     :return: dict {pathway: {ko: [proteins]}}
 36 |     """
 37 |     path_dict = {}
 38 |     LOG.info("reading kegg result from '%r'" % file)
 39 | 
 40 |     for protein, ko, pathway in read_tbl(file):
 41 |         paths = pathway.split(";")
 42 | 
 43 |         for path in paths:
 44 |             if path not in path_dict:
 45 |                 path_dict[path] = {}
 46 | 
 47 |             if ko not in path_dict[path]:
 48 |                 path_dict[path][ko] = []
 49 | 
 50 |             path_dict[path][ko].append(protein)
 51 | 
 52 |     return path_dict
 53 | 
 54 | 
 55 | def output_keg(keg, path_dict, output):
 56 |     """
 57 |     output .keg by kegg annotation result
 58 |     :param keg: ko00001.keg
 59 |     :param path_dict: see function cluster_protein
 60 |     :param output: output file
 61 |     :return: 0
 62 |     """
 63 |     path_id = ""
 64 | 
 65 |     LOG.info("output kegg map to '%r'" % output)
 66 |     fh = open(output, "w")
 67 | 
 68 |     for line in open(keg):
 69 |         line = line.strip()
 70 | 
 71 |         if not line:
 72 |             continue
 73 | 
 74 |         tag = line[0]
 75 | 
 76 |         if tag == "C":
 77 |             path_id = "ko" + line.split()[1]
 78 |             fh.write("%s\n" % line)
 79 |             continue
 80 |         elif tag == "D":
 81 | 
 82 |             if path_id not in path_dict:
 83 |                 continue
 84 | 
 85 |             mess = line.split()
 86 |             ko = mess[1]
 87 |             name = " ".join(mess[2:])
 88 | 
 89 |             if ko not in path_dict[path_id]:
 90 |                 continue
 91 | 
 92 |             for p in path_dict[path_id][ko]:
 93 | 
 94 |                 if ko == "-":
 95 |                     fh.write("D      %s\t\n" % p)
 96 |                 else:
 97 |                     fh.write("D      %s\t%s %s\n" % (p, ko, name))
 98 |         else:
 99 |             fh.write("%s\n" % line)
100 | 
101 |     return 0
102 | 
103 | 
def set_args():
    """
    Define and parse the command line options of make_keg.py.

    :return: the parsed options with attributes keg, input and out
    """

    description = """
create .keg file from kegg annotation result

version: %s
contact: %s <%s>\
    """ % (__version__, " ".join(__author__), __email__)

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument(
        "--keg",
        metavar="FILE",
        required=True,
        help="KO file downloaded from KEGG, usually named 'ko00001.keg'"
    )
    # "in" is a Python keyword, so the value is stored as "input"
    parser.add_argument(
        "--in",
        metavar="FILE",
        dest="input",
        required=True,
        help="KEGG annotation result consist protein id, KO, pathway joined with '\t'"
    )
    parser.add_argument(
        "--out",
        metavar="STR",
        default="out",
        help="output prefix (default: out)"
    )

    return parser.parse_args()
121 | 
122 | 
def main():
    """
    Entry point of make_keg.py: cluster the annotation result and write it
    as "{out}.keg".
    """

    logging.basicConfig(
        format="[%(levelname)s] %(message)s",
        level=logging.INFO,
        stream=sys.stderr
    )

    options = set_args()

    clusters = cluster_protein(options.input)
    keg_output = options.out+".keg"
    output_keg(options.keg, clusters, keg_output)
135 | 
136 | 
137 | if __name__ == "__main__":
138 |     main()
139 | 
140 | 


--------------------------------------------------------------------------------
/process_proteins.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import sys
  5 | import os
  6 | import argparse
  7 | import logging
  8 | 
  9 | from common import read_org_ko, read_org, __email__, __version__, __author__
 10 | from FastaReader import open_fasta
 11 | 
 12 | 
 13 | LOG = logging.getLogger(__name__)
 14 | 
 15 | __all__ = []
 16 | 
 17 | 
 18 | def process_protein(org, keg, pep, out):
 19 |     """
 20 |     get the protein seq of with ko
 21 |     :param org: the organism abbr.
 22 |     :param keg: directory contains org.keg
 23 |     :param pep: directory contains org.pep.fasta.gz
 24 |     :param out: output directory
 25 |     :return:
 26 |     """
 27 | 
 28 |     r = []
 29 |     keg_name = os.path.join(keg, "%s00001.keg" % org)
 30 |     pep_name = os.path.join(pep, "%s.pep.fasta.gz" % org)
 31 | 
 32 |     if not os.path.exists(keg_name):
 33 |         LOG.info("keg %r not exists, skip" % keg_name)
 34 |         return "%s\tno keg" % org
 35 | 
 36 |     if not os.path.exists(pep_name):
 37 |         LOG.info("pep %r not exists, skip" % pep_name)
 38 |         return "%s\tno protein" % org
 39 | 
 40 |     gene_dict = read_org_ko(keg_name)
 41 | 
 42 |     if not gene_dict:
 43 |         LOG.info("keg %r is empty, skip" % keg_name)
 44 |         return "%s\t keg is empty" % org
 45 | 
 46 |     records = []
 47 | 
 48 |     for record in open_fasta(pep_name):
 49 |         name = record.name
 50 | 
 51 |         # gene_id is in db_xref or locus_tag
 52 | 
 53 |         if "locus_tag=" in name:
 54 |             id = name.split("locus_tag=")[1].split("]")[0]
 55 |         elif "db_xref=GeneID:" in name:
 56 |             id = name.split("db_xref=GeneID:")[1].split("]")[0]
 57 |         elif "protein_id=" in name:
 58 |             id = name.split("protein_id=")[1].split("]")[0].split(".")[0]
 59 |         else:
 60 |             continue
 61 | 
 62 |         if id in gene_dict:
 63 |             r.append(id)
 64 | 
 65 |         records.append(">%s:%s\n%s\n" % (org, id, record.seq))
 66 | 
 67 |     if r:
 68 |         with open(os.path.join(out, "%s.pep.fasta" % org), "w") as fh:
 69 |             fh.write("".join(records))
 70 |         return 0
 71 |     else:
 72 |         return "%s\tpep not match with keg" % org
 73 | 
 74 | 
 75 | def process_proteins(orgs, keg, pep, out):
 76 |     """
 77 |     get the protein seq of ids in keg from pep
 78 |     :param keg:
 79 |     :param pep:
 80 |     :param out:
 81 |     :return:
 82 |     """
 83 | 
 84 |     num = len(orgs)
 85 | 
 86 |     for n, org in enumerate(orgs):
 87 |         LOG.info("%s/%s process %s" % (n+1, num, org))
 88 |         process_protein(org, keg, pep, out)
 89 | 
 90 |     return 0
 91 | 
 92 | 
 93 | def set_args():
 94 | 
 95 |     args = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
 96 |                                    description="""
 97 | extract proteins in KO from NCBI download files
 98 | 
 99 | version: %s
100 | contact: %s <%s>\
101 |     """ % (__version__, " ".join(__author__), __email__))
102 | 
103 |     args.add_argument("--org", metavar="FILE", required=True,
104 |                       help="a list of KEGG organism abbr. at the first column")
105 |     args.add_argument("--keg", metavar="DIR", required=True,
106 |                       help="directory contains {org}00001.keg")
107 |     args.add_argument("--pep", metavar="DIR", required=True,
108 |                       help="directory contains {org}.pep.fasta.gz from NCBI")
109 |     args.add_argument("--out", metavar="DIR", default=".", help="output directory (default: current directory)")
110 | 
111 |     return args.parse_args()
112 | 
113 | 
def main():
    """
    script entry: configure logging, read the organism list and process it
    """

    logging.basicConfig(stream=sys.stderr, level=logging.INFO, format="[%(levelname)s] %(message)s")

    args = set_args()

    # the organism abbr. is the first field of every entry returned by read_org
    org_abbrs = [entry[0] for entry in read_org(args.org)]
    process_proteins(org_abbrs, args.keg, args.pep, args.out)


if __name__ == "__main__":
    main()
130 | 
131 | 


--------------------------------------------------------------------------------
/FastaReader.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from __future__ import absolute_import
  3 | 
  4 | import gzip
  5 | from os.path import abspath, expanduser
  6 | 
  7 | 
  8 | def split_header(name):
  9 |     """
 10 |     split fasta header to id and description
 11 |     :param name:
 12 |     :return:
 13 |     """
 14 |     parts = name.split(None, 1)
 15 | 
 16 |     if len(parts) == 1:
 17 |         parts.append("")
 18 | 
 19 |     return parts
 20 | 
 21 | 
 22 | class FastaRecord(object):
 23 |     """
 24 |     object to process a fasta record
 25 |     """
 26 |     DELIMITER = ">"
 27 | 
 28 |     def __init__(self, name, seq):
 29 |         try:
 30 |             assert "\n" not in name
 31 |             assert "\n" not in seq
 32 |             assert self.DELIMITER not in seq
 33 |             self._name = name
 34 |             self._seq = seq
 35 |             self._id, self._description = split_header(name)
 36 |         except AssertionError:
 37 |             raise ValueError("Invalid FASTA record data")
 38 | 
 39 |     @property
 40 |     def name(self):
 41 |         """
 42 |         the name of the seq, strings after ">"
 43 |         """
 44 |         return self._name
 45 | 
 46 |     @property
 47 |     def id(self):
 48 |         """
 49 |         The id of the seq, equal to the FASTA header
 50 |         up to the first whitespace.
 51 |         """
 52 |         return self._id
 53 | 
 54 |     @property
 55 |     def description(self):
 56 |         """
 57 |         The description of the seq in the FASTA file, equal to
 58 |         the contents of the FASTA header following the first whitespace
 59 |         """
 60 |         return self._description
 61 | 
 62 |     @property
 63 |     def seq(self):
 64 |         """
 65 |         The seq of the record
 66 | 
 67 |         """
 68 |         return self._seq
 69 | 
 70 |     @property
 71 |     def length(self):
 72 |         """
 73 |         the length of the seq
 74 |         """
 75 |         return len(self._seq)
 76 | 
 77 |     @classmethod
 78 |     def from_string(cls, string):
 79 |         """
 80 |         Interprets a string as a FASTA record.  Does not make any
 81 |         assumptions about wrapping of the seq string.
 82 |         """
 83 |         string = string.strip()
 84 | 
 85 |         try:
 86 |             lines = string.splitlines()
 87 |             assert len(lines) > 1
 88 |             assert lines[0][0] == cls.DELIMITER
 89 |             name = lines[0][1:]
 90 |             seq = "".join(lines[1:])
 91 |             return FastaRecord(name, seq)
 92 |         except AssertionError:
 93 |             raise ValueError("String not recognized as a valid FASTA record")
 94 | 
 95 |     def __str__(self):
 96 |         """
 97 |         str conversion
 98 |         :return:
 99 |         """
100 |         return ">%s\n%s" % (self.name, self.seq)
101 | 
102 | 
def check_format(filename):
    """
    make sure the file name carries a supported fasta extension
    :param filename: the file name to check
    :return: 0 if the extension is supported
    :raises Exception: if the extension is not supported
    """
    allowed_format = [".fa", ".fasta", ".fa.gz", ".fasta.gz"]

    # str.endswith accepts a tuple of candidate suffixes
    if not filename.endswith(tuple(allowed_format)):
        raise Exception("file format is not in %s" % allowed_format)

    return 0
116 | 
117 | 
def yield_fasta_records(stream):
    """
    yield fasta records from stream
    :param stream: an iterable of text lines, e.g. an open file
    :return: a generator of FastaRecord
    """
    pending = []

    for raw in stream:
        text = raw.strip()

        # blank lines carry no information
        if not text:
            continue

        # a new header closes the record collected so far
        if pending and text.startswith(">"):
            yield FastaRecord.from_string("".join(pending))
            pending = []

        pending.append("%s\n" % text)

    # the last record has no following header to close it
    if pending:
        yield FastaRecord.from_string("".join(pending))
140 | 
141 | 
def _iter_then_close(stream):
    """
    yield the fasta records of stream and close it when iteration ends
    """
    try:
        for record in yield_fasta_records(stream):
            yield record
    finally:
        stream.close()


def open_fasta(filename):
    """
    read fasta file and return fasta records
    :param filename: a file ending with .fa, .fasta, .fa.gz or .fasta.gz
    :return: a generator of the records in the file; the file is closed
             once the generator is exhausted or closed
    """
    check_format(filename)
    filename = abspath(expanduser(filename))

    # text mode is required: on Python 3 gzip.open(..., "r") is binary and
    # yields bytes, which breaks the str based parsing of the records
    mode = "rt"

    if filename.endswith(".gz"):
        stream = gzip.open(filename, mode)
    else:
        stream = open(filename, mode)

    return _iter_then_close(stream)
158 | 


--------------------------------------------------------------------------------
/plot_keg.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | 
  5 | import argparse
  6 | import logging
  7 | import sys
  8 | from collections import OrderedDict
  9 | 
 10 | from common import __author__, __email__, __version__
 11 | 
 12 | LOG = logging.getLogger(__name__)
 13 | 
 14 | __all__ = []
 15 | 
 16 | 
 17 | def stat_keg(keg):
 18 |     """
 19 |     read pathway information from .keg
 20 |     :param keg: .keg file
 21 |     :return: a dict {pathway_A: {pathway_B: [proteins]}}
 22 |     """
 23 | 
 24 |     r = OrderedDict()
 25 |     LOG.info("reading kegg map from %r" % keg)
 26 | 
 27 |     path1 = ""
 28 |     path2 = ""
 29 | 
 30 |     for line in open(keg):
 31 |         line = line.strip()
 32 | 
 33 |         if not line:
 34 |             continue
 35 | 
 36 |         tag = line[0]
 37 | 
 38 |         if tag == "A" and "<b>" in line:
 39 |             path1 = line[4:-4]
 40 |             r[path1] = OrderedDict()
 41 |             continue
 42 | 
 43 |         if tag == "B" and "<b>" in line:
 44 |             path2 = line[6:-4]
 45 |             r[path1][path2] = []
 46 |             continue
 47 | 
 48 |         if tag == "D":
 49 |             r[path1][path2].append(line.split()[1])
 50 | 
 51 |     return r
 52 | 
 53 | 
 54 | def plot_keg(keg_dict, out):
 55 |     """
 56 |     plot function
 57 |     :param keg_dict: see stat_keg
 58 |     :param out: output filename
 59 |     :return: 0
 60 |     """
 61 |     x = []
 62 |     y = []
 63 |     n = 1
 64 | 
 65 |     for path1 in keg_dict:
 66 |         x.append(n)
 67 |         y.append(0)
 68 |         n += 1
 69 | 
 70 |         for path2 in keg_dict[path1]:
 71 |             num = len(set(keg_dict[path1][path2]))
 72 | 
 73 |             if not num:
 74 |                 continue
 75 | 
 76 |             x.append(n)
 77 |             y.append(num)
 78 |             n += 1
 79 | 
 80 |     y_max = max(y) * 1.1
 81 | 
 82 |     colors = []
 83 |     color = ["", "blue", "green", "red", "purple", "skyblue", "orange", "gray"]
 84 |     lv = 0
 85 |     n = 1
 86 | 
 87 |     LOG.info("plot KEGG annotation result to %r" % out)
 88 |     from matplotlib import pyplot as plt
 89 | 
 90 |     fig = plt.figure(figsize=(8, 8))
 91 |     ax = fig.add_subplot(111, )
 92 | 
 93 |     for path1 in keg_dict:
 94 | 
 95 |         colors.append("white")
 96 |         lv += 1
 97 |         ax.text(y_max / -1.7, n, path1, fontsize=8, verticalalignment='center', horizontalalignment='left', family="Arial",
 98 |                 color=color[lv])
 99 |         n += 1
100 | 
101 |         for path2 in keg_dict[path1]:
102 | 
103 |             num = len(set(keg_dict[path1][path2]))
104 | 
105 |             if not num:
106 |                 continue
107 | 
108 |             ax.text(num, n, num, fontsize=8, verticalalignment='center', family="Arial",)
109 |             ax.text(y_max / -1.8, n, path2, fontsize=8, verticalalignment='center', horizontalalignment='left', family="Arial",
110 |                     color=color[lv])
111 |             colors.append(color[lv])
112 |             n += 1
113 | 
114 |     ax.barh(x, y, color=colors, alpha=0.5)
115 |     ax.set_xlim([y_max / -100, y_max])
116 |     ax.set_ylim([0, ax.get_ylim()[1]])
117 |     plt.xticks(fontsize=8, family="Arial",)
118 |     ax.set_yticks([])
119 |     plt.subplots_adjust(top=0.95, left=0.35, right=0.95, bottom=0.05)
120 |     plt.xlabel("Number of Genes", fontsize=10, family="Arial",)
121 | 
122 |     ax.invert_yaxis()
123 |     plt.savefig(out)
124 | 
125 |     return 0
126 | 
127 | 
def set_args():
    """
    build the command line parser and parse the command line
    :return: the parsed arguments (keg, out)
    """

    description = """
plot kegg annotation from .keg

version: %s
contact: %s <%s>\
    """ % (__version__, " ".join(__author__), __email__)

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=description
    )

    parser.add_argument(
        "--keg", metavar="FILE", required=True,
        help="KO file named '*.keg', can be make by make_keg.py"
    )
    parser.add_argument(
        "--out", metavar="STR", default="out",
        help="output prefix (default: out)"
    )

    return parser.parse_args()
143 | 
144 | 
def main():
    """
    script entry: configure logging, read the .keg file and plot it to {out}.pdf
    """

    logging.basicConfig(stream=sys.stderr, level=logging.INFO, format="[%(levelname)s] %(message)s")

    args = set_args()
    pdf_name = args.out + ".pdf"
    plot_keg(stat_keg(args.keg), pdf_name)


if __name__ == "__main__":
    main()
160 | 
161 | 


--------------------------------------------------------------------------------