├── src ├── __init__.py ├── get_all_data.py ├── search_save_herbs.py ├── dev │ └── uniprot.py └── tcmsp.py ├── herb_list.txt ├── requirements.txt ├── data ├── sample_data │ ├── herbs_data.xlsx │ ├── diseases_data.xlsx │ ├── targets_data.xlsx │ └── ingredients_data.xlsx └── spider_data │ ├── Baizhu_disease.xlsx │ ├── Baizhu_targets.xlsx │ ├── Chenpi_disease.xlsx │ ├── Chenpi_targets.xlsx │ ├── Mahuang_disease.xlsx │ ├── Mahuang_targets.xlsx │ ├── Baizhu_ingredients.xlsx │ ├── Chenpi_ingredients.xlsx │ ├── Mahuanggen_disease.xlsx │ ├── Mahuanggen_targets.xlsx │ ├── Mahuang_ingredients.xlsx │ └── Mahuanggen_ingredients.xlsx ├── .gitignore ├── LICENSE └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /herb_list.txt: -------------------------------------------------------------------------------- 1 | 麻黄 2 | Baizhu 3 | Citrus Reticulata 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.11.2 2 | lxml==4.9.2 3 | pandas==1.5.2 4 | requests==2.28.1 5 | -------------------------------------------------------------------------------- /data/sample_data/herbs_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/sample_data/herbs_data.xlsx -------------------------------------------------------------------------------- /data/sample_data/diseases_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/sample_data/diseases_data.xlsx -------------------------------------------------------------------------------- /data/sample_data/targets_data.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/sample_data/targets_data.xlsx -------------------------------------------------------------------------------- /data/sample_data/ingredients_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/sample_data/ingredients_data.xlsx -------------------------------------------------------------------------------- /data/spider_data/Baizhu_disease.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Baizhu_disease.xlsx -------------------------------------------------------------------------------- /data/spider_data/Baizhu_targets.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Baizhu_targets.xlsx -------------------------------------------------------------------------------- /data/spider_data/Chenpi_disease.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Chenpi_disease.xlsx -------------------------------------------------------------------------------- /data/spider_data/Chenpi_targets.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Chenpi_targets.xlsx -------------------------------------------------------------------------------- /data/spider_data/Mahuang_disease.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Mahuang_disease.xlsx 
-------------------------------------------------------------------------------- /data/spider_data/Mahuang_targets.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Mahuang_targets.xlsx -------------------------------------------------------------------------------- /data/spider_data/Baizhu_ingredients.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Baizhu_ingredients.xlsx -------------------------------------------------------------------------------- /data/spider_data/Chenpi_ingredients.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Chenpi_ingredients.xlsx -------------------------------------------------------------------------------- /data/spider_data/Mahuanggen_disease.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Mahuanggen_disease.xlsx -------------------------------------------------------------------------------- /data/spider_data/Mahuanggen_targets.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Mahuanggen_targets.xlsx -------------------------------------------------------------------------------- /data/spider_data/Mahuang_ingredients.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Mahuang_ingredients.xlsx -------------------------------------------------------------------------------- /data/spider_data/Mahuanggen_ingredients.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Mahuanggen_ingredients.xlsx -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | */.DS_Store 3 | 4 | __pycache__/ 5 | *.pyc 6 | *.egg-info/ 7 | dist/ 8 | build/ 9 | .tox/ 10 | .env 11 | *.log 12 | -------------------------------------------------------------------------------- /src/get_all_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- encoding: utf-8 -*- 3 | 4 | from tcmsp import TcmspSpider 5 | 6 | 7 | def get_data(type): 8 | 9 | tcmsp = TcmspSpider() 10 | url = f"https://tcmsp-e.com/browse.php?qc={type}" 11 | 12 | # 获取页面 13 | html = tcmsp.get_response(url) 14 | data = tcmsp.get_json_data(html, num=8, pattern="grid") 15 | 16 | # 保存数据 17 | tcmsp.text_to_excel( 18 | data, 19 | file_path=f"{tcmsp.sample_file_path}", 20 | file_name=f"{type}_data", 21 | index=False 22 | ) 23 | 24 | 25 | if __name__ == '__main__': 26 | type_list = ["herbs", "ingredients", "targets", "diseases"] 27 | for type in type_list: 28 | print(f"正在下载:{type}") 29 | get_data(type) 30 | -------------------------------------------------------------------------------- /src/search_save_herbs.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- encoding: utf-8 -*- 3 | 4 | from tcmsp import TcmspSpider 5 | 6 | 7 | def get_herb_data(): 8 | """ 9 | Search for herbs to be queried and download data. 
10 | :return: None 11 | """ 12 | tcmsp = TcmspSpider() 13 | 14 | # 构建药物列表 15 | herb_list = [] 16 | with open("./herb_list.txt", "r", encoding="utf-8") as f: 17 | for line in f: 18 | herb_list.append(line.strip()) 19 | 20 | print(f"共有{len(herb_list)}个药物需要查询!\n") 21 | 22 | tcmsp.token = tcmsp.get_token() 23 | 24 | # 遍历需要查询的药物 25 | for herb in herb_list: 26 | if herb == "": 27 | continue 28 | 29 | herb_three_names = tcmsp.get_herb_name(herb) 30 | 31 | # 如果查询到多个药物,逐一下载 32 | for name in herb_three_names: 33 | herb_cn_name = name["herb_cn_name"] 34 | herb_en_name = name["herb_en_name"] 35 | herb_pinyin_name = name["herb_pinyin"] 36 | tcmsp.get_herb_data(herb_cn_name, herb_en_name, herb_pinyin_name) 37 | 38 | 39 | if __name__ == "__main__": 40 | get_herb_data() 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 shujuecn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/dev/uniprot.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- encoding: utf-8 -*- 3 | ''' 4 | @Brief : Uniprot数据库爬虫 5 | @Time : 2023/02/12 22:10:08 6 | @Author : https://github.com/shujuecn 7 | ''' 8 | 9 | import requests 10 | 11 | 12 | class UniProtAPI: 13 | def __init__(self): 14 | self.headers = { 15 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; NCE-AL10 Build/HUAWEINCE-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Mobile Safari/537.36" 16 | } 17 | self.fields = "accession,reviewed,id,protein_name,gene_names,organism_name,length" 18 | 19 | def search(self, keyword, data_format): 20 | if data_format not in ['xlsx', 'json']: 21 | raise ValueError(f"Invalid data format: {data_format}. Supported formats are 'xlsx' and 'json'.") 22 | 23 | url = f"https://rest.uniprot.org/uniprotkb/stream?fields={self.fields}&format={data_format}&query=%28{keyword}%29" 24 | 25 | response = requests.get(url) 26 | return response.content 27 | 28 | 29 | uniprot = UniProtAPI() 30 | data = uniprot.search("Muscarinic acetylcholine receptor M1", "json") 31 | print(data) 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TCMSP-Spider 2 | 3 | TCMSP-Spider is a Python tool for extracting data from [TCMSP](https://www.tcmsp-e.com) (Traditional Chinese Medicine Systems Pharmacology Database and Analysis Platform) website. 
It allows you to search for a specific drug and retrieve its related ingredients, targets, and diseases. Additionally, you can download "all" data of drugs, ingredients, targets, and diseases. The tool can be easily configured to query and download a list of drugs, eliminating the need to manually pass `token` parameters. 4 | 5 | ## Installation 6 | 7 | 1. Clone the repository and navigate to the project directory: 8 | 9 | ``` 10 | git clone https://github.com/shujuecn/TCMSP-Spider.git 11 | cd TCMSP-Spider 12 | ``` 13 | 14 | 2. Install the required dependencies: 15 | 16 | ``` 17 | pip3 install -r requirements.txt 18 | ``` 19 | 20 | ## Usage 21 | ### Searching data by drug name 22 | 23 | 1. Add the names of the drugs you want to search for in `herb_list.txt`. You can add multiple drugs, and the names can be written in Chinese, Pinyin or Latin, for example: 24 | 25 | ``` 26 | 麻黄 27 | Baizhu 28 | Citrus Reticulata 29 | ``` 30 | 31 | 2. Run the following command to start the search process: 32 | 33 | ``` 34 | python3 src/search_save_herbs.py 35 | ``` 36 | 37 | The program will automatically obtain the `token` value and query all the drugs specified in `herb_list.txt`. Because a single Chinese or Pinyin name may correspond to multiple drugs, the program will download the ingredients, targets, and diseases of each drug, and save them in an Excel (.xlsx) file in the `data/spider_data` folder. 38 | 39 | ``` 40 | 麻黄 -> 麻黄、麻黄根 41 | fuzi -> Baifuzi、Difuzi、Fuzi、Laifuzi 42 | ``` 43 | ### Downloading "all" data 44 | 45 | On the [TCMSP Browse Database](https://tcmsp-e.com/browse.php?qc=herbs) page, the website provides four types of data, including "all" drugs, ingredients, targets, and diseases. You can use the following command to download these data and save them in an Excel (.xlsx) file in the `data/sample_data` folder. 
46 | 
47 | ```
48 | python3 src/get_all_data.py
49 | ```
50 | 
51 | ### Querying relationships
52 | 
53 | Using the data downloaded with "Get all data," you can use the program to query the relationships between drugs, ingredients, targets, and diseases. For example:
54 | 
55 | ```
56 | Target ID: TAR00006
57 | 
58 | Related diseases: Chronic inflammatory diseases...
59 | Related ingredients: cyanidol...
60 | Related herbs: Asteris Radix Et Rhizoma...
61 | ```
62 | 
63 | This feature is not yet implemented in the current version. In a future update, the program may be able to use the data downloaded with "Get all data" to query relationships between different elements, such as finding all the ingredients related to a certain disease or target.
64 | 
65 | ## LICENSE
66 | 
67 | This project is released under the MIT open source license. If you have any suggestions or feedback, please feel free to submit an issue or pull request.
68 | 
69 | ## Changelog
70 | 
71 | * 2023/02/09: Initial commit. Completed the search function and data download function.
72 | * 2023/02/10: Refactored the project structure and added the "download all data" function.
73 | 74 | -------------------------------------------------------------------------------- /src/tcmsp.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- encoding: utf-8 -*- 3 | ''' 4 | @Brief : TCMSP数据库爬虫 5 | @Time : 2023/02/09 19:39:55 6 | @Author : https://github.com/shujuecn 7 | ''' 8 | 9 | import os 10 | import re 11 | import json 12 | import requests 13 | import pandas as pd 14 | from bs4 import BeautifulSoup as bs 15 | import lxml.html 16 | 17 | 18 | class TcmspSpider: 19 | def __init__(self): 20 | 21 | self.root_url = "https://www.tcmsp-e.com/tcmspsearch.php" 22 | self.headers = { 23 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; NCE-AL10 Build/HUAWEINCE-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Mobile Safari/537.36", 24 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 25 | "Accept-Encoding": "gzip, deflate, br", 26 | } 27 | 28 | # self.file_path = "./data/" 29 | 30 | self.spider_file_path = "./data/spider_data/" 31 | self.sample_file_path = "./data/sample_data/" 32 | self.create_folder(self.spider_file_path) 33 | self.create_folder(self.sample_file_path) 34 | 35 | self.token = None 36 | 37 | def create_folder(self, path): 38 | """ 39 | Create folder 40 | :return: None 41 | """ 42 | 43 | if not os.path.exists(path): 44 | os.makedirs(path) 45 | 46 | def get_response(self, url): 47 | """ 48 | Get response from url 49 | :param url: url 50 | :return: html 51 | """ 52 | try: 53 | response = requests.get(url=url, headers=self.headers) 54 | response.raise_for_status() 55 | html = response.content.decode("utf-8") 56 | return html 57 | except requests.exceptions.RequestException as e: 58 | print(e) 59 | return 60 | 61 | def get_token(self): 62 | """ 63 | Get token 64 | :return: token 65 | """ 66 | 67 | html = self.get_response(self.root_url) 68 | root = 
lxml.html.fromstring(html) 69 | token = root.xpath('//form[@id="SearchForm"]//input[@name="token"]/@value') 70 | if token: 71 | print("token获取成功!\n") 72 | return token[0] 73 | else: 74 | print("token获取失败!\n") 75 | return 76 | 77 | def get_herb_name(self, herb_name): 78 | """ 79 | Get herb's English name 80 | :param herb_name: herb's Chinese name 81 | :return: herb's English name 82 | """ 83 | 84 | print(f"正在查询: {herb_name}...\n") 85 | 86 | url = f"{self.root_url}?qs=herb_all_name&q={herb_name}&token={self.token}" 87 | html = self.get_response(url) 88 | 89 | if html: 90 | soup = bs(html, "html.parser") 91 | script = soup.findAll("script")[8].__str__() 92 | # 解析药物的名称 93 | herb_three_names = re.findall(r"\n.*data:\s(.*),", script)[0] 94 | 95 | if herb_three_names != "[]": 96 | herb_three_names = json.loads(herb_three_names) 97 | return herb_three_names 98 | 99 | else: 100 | print(f"未查询到{herb_name}的信息!") 101 | return None 102 | else: 103 | pass 104 | 105 | def get_herb_data(self, cn_name, en_name, pinyin_name): 106 | """ 107 | Get herb's data 108 | :param cn_name: herb's Chinese name 109 | :param en_name: herb's Latin name 110 | :param pinyin_name: herb's pinyin name 111 | :return: None 112 | """ 113 | 114 | # Construction request link 115 | en_name = en_name.replace(" ", "%20") 116 | url = f"{self.root_url}?qr={en_name}&qsr=herb_en_name&token={self.token}" 117 | 118 | print(f"正在下载: {cn_name}...") 119 | html = self.get_response(url) 120 | if html: 121 | 122 | # 提取json数据 123 | # data = self.get_json_data(html) 124 | 125 | # 导出 Ingredients 126 | ingredients_pattern = "grid" 127 | ingredients_data = self.get_json_data(html, 11, ingredients_pattern) 128 | self.text_to_excel( 129 | ingredients_data, 130 | file_path=f"{self.spider_file_path}", 131 | file_name=f"{pinyin_name}_ingredients", 132 | index="MOL_ID" 133 | ) 134 | 135 | # 导出 Targets 136 | targets_pattern = "grid2" 137 | targets_data = self.get_json_data(html, 11, targets_pattern) 138 | self.text_to_excel( 139 | 
targets_data, 140 | file_path=f"{self.spider_file_path}", 141 | file_name=f"{pinyin_name}_targets", 142 | index="MOL_ID" 143 | ) 144 | 145 | # 导出 Disease 146 | # INDEX参数为False,因为Disease表格中没有MOL_ID 147 | disease_pattern = "grid3" 148 | disease_data = self.get_json_data(html, 11, disease_pattern) 149 | self.text_to_excel( 150 | disease_data, 151 | file_path=f"{self.spider_file_path}", 152 | file_name=f"{pinyin_name}_disease", 153 | index=False 154 | ) 155 | 156 | print(f"{cn_name}下载完成!\n") 157 | 158 | def get_json_data(self, html, num, pattern): 159 | """ 160 | Get json text 161 | :param html: html 162 | :param num: script number() 163 | :param pattern: regular expression 164 | :return: json text 165 | """ 166 | soup = bs(html, "html.parser") 167 | scripts = soup.findAll("script") 168 | 169 | # The serial number of data in different pages is different 170 | text = scripts[num].__str__() 171 | 172 | pattern = rf"\$\(\"\#{pattern}\".*\n.*\n.*data\:\s(\[.*\])" 173 | match = re.compile(pattern).search(text) 174 | result = match.group(1) 175 | data = json.loads(result) 176 | 177 | return data 178 | 179 | def text_to_excel(self, data, file_path, file_name, index): 180 | """ 181 | Regular expression extracts json data and converts to excel 182 | :param text: text 183 | :param pattern: regular expression 184 | :param file_name: file name 185 | :return: None 186 | """ 187 | 188 | # Regular expression extracts json data 189 | if data: 190 | df = pd.DataFrame(data) 191 | 192 | # Custom index columns 193 | if index: 194 | df.set_index(index, inplace=True) 195 | df.to_excel(f"{file_path}{file_name}.xlsx", index=True) 196 | else: 197 | df.to_excel(f"{file_path}{file_name}.xlsx", index=False) 198 | 199 | print(f"已保存:{file_name}.xlsx") 200 | 201 | else: 202 | print(f"未查询到{file_name}的信息!") 203 | --------------------------------------------------------------------------------