├── src ├── __init__.py ├── get_all_data.py ├── search_save_herbs.py ├── dev │ └── uniprot.py └── tcmsp.py ├── herb_list.txt ├── requirements.txt ├── data ├── sample_data │ ├── herbs_data.xlsx │ ├── diseases_data.xlsx │ ├── targets_data.xlsx │ └── ingredients_data.xlsx └── spider_data │ ├── Baizhu_disease.xlsx │ ├── Baizhu_targets.xlsx │ ├── Chenpi_disease.xlsx │ ├── Chenpi_targets.xlsx │ ├── Mahuang_disease.xlsx │ ├── Mahuang_targets.xlsx │ ├── Baizhu_ingredients.xlsx │ ├── Chenpi_ingredients.xlsx │ ├── Mahuanggen_disease.xlsx │ ├── Mahuanggen_targets.xlsx │ ├── Mahuang_ingredients.xlsx │ └── Mahuanggen_ingredients.xlsx ├── .gitignore ├── LICENSE └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /herb_list.txt: -------------------------------------------------------------------------------- 1 | 麻黄 2 | Baizhu 3 | Citrus Reticulata 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.11.2 2 | lxml==4.9.2 3 | pandas==1.5.2 4 | requests==2.28.1 5 | -------------------------------------------------------------------------------- /data/sample_data/herbs_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/sample_data/herbs_data.xlsx -------------------------------------------------------------------------------- /data/sample_data/diseases_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/sample_data/diseases_data.xlsx -------------------------------------------------------------------------------- /data/sample_data/targets_data.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/sample_data/targets_data.xlsx -------------------------------------------------------------------------------- /data/sample_data/ingredients_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/sample_data/ingredients_data.xlsx -------------------------------------------------------------------------------- /data/spider_data/Baizhu_disease.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Baizhu_disease.xlsx -------------------------------------------------------------------------------- /data/spider_data/Baizhu_targets.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Baizhu_targets.xlsx -------------------------------------------------------------------------------- /data/spider_data/Chenpi_disease.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Chenpi_disease.xlsx -------------------------------------------------------------------------------- /data/spider_data/Chenpi_targets.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Chenpi_targets.xlsx -------------------------------------------------------------------------------- /data/spider_data/Mahuang_disease.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Mahuang_disease.xlsx 
-------------------------------------------------------------------------------- /data/spider_data/Mahuang_targets.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Mahuang_targets.xlsx -------------------------------------------------------------------------------- /data/spider_data/Baizhu_ingredients.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Baizhu_ingredients.xlsx -------------------------------------------------------------------------------- /data/spider_data/Chenpi_ingredients.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Chenpi_ingredients.xlsx -------------------------------------------------------------------------------- /data/spider_data/Mahuanggen_disease.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Mahuanggen_disease.xlsx -------------------------------------------------------------------------------- /data/spider_data/Mahuanggen_targets.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Mahuanggen_targets.xlsx -------------------------------------------------------------------------------- /data/spider_data/Mahuang_ingredients.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Mahuang_ingredients.xlsx -------------------------------------------------------------------------------- /data/spider_data/Mahuanggen_ingredients.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shujuecn/TCMSP-Spider/HEAD/data/spider_data/Mahuanggen_ingredients.xlsx -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | */.DS_Store 3 | 4 | __pycache__/ 5 | *.pyc 6 | *.egg-info/ 7 | dist/ 8 | build/ 9 | .tox/ 10 | .env 11 | *.log 12 | -------------------------------------------------------------------------------- /src/get_all_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- encoding: utf-8 -*- 3 | 4 | from tcmsp import TcmspSpider 5 | 6 | 7 | def get_data(type): 8 | 9 | tcmsp = TcmspSpider() 10 | url = f"https://tcmsp-e.com/browse.php?qc={type}" 11 | 12 | # 获取页面 13 | html = tcmsp.get_response(url) 14 | data = tcmsp.get_json_data(html, num=8, pattern="grid") 15 | 16 | # 保存数据 17 | tcmsp.text_to_excel( 18 | data, 19 | file_path=f"{tcmsp.sample_file_path}", 20 | file_name=f"{type}_data", 21 | index=False 22 | ) 23 | 24 | 25 | if __name__ == '__main__': 26 | type_list = ["herbs", "ingredients", "targets", "diseases"] 27 | for type in type_list: 28 | print(f"正在下载:{type}") 29 | get_data(type) 30 | -------------------------------------------------------------------------------- /src/search_save_herbs.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- encoding: utf-8 -*- 3 | 4 | from tcmsp import TcmspSpider 5 | 6 | 7 | def get_herb_data(): 8 | """ 9 | Search for herbs to be queried and download data. 
10 | :return: None 11 | """ 12 | tcmsp = TcmspSpider() 13 | 14 | # 构建药物列表 15 | herb_list = [] 16 | with open("./herb_list.txt", "r", encoding="utf-8") as f: 17 | for line in f: 18 | herb_list.append(line.strip()) 19 | 20 | print(f"共有{len(herb_list)}个药物需要查询!\n") 21 | 22 | tcmsp.token = tcmsp.get_token() 23 | 24 | # 遍历需要查询的药物 25 | for herb in herb_list: 26 | if herb == "": 27 | continue 28 | 29 | herb_three_names = tcmsp.get_herb_name(herb) 30 | 31 | # 如果查询到多个药物,逐一下载 32 | for name in herb_three_names: 33 | herb_cn_name = name["herb_cn_name"] 34 | herb_en_name = name["herb_en_name"] 35 | herb_pinyin_name = name["herb_pinyin"] 36 | tcmsp.get_herb_data(herb_cn_name, herb_en_name, herb_pinyin_name) 37 | 38 | 39 | if __name__ == "__main__": 40 | get_herb_data() 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 shujuecn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/dev/uniprot.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- encoding: utf-8 -*- 3 | ''' 4 | @Brief : Uniprot数据库爬虫 5 | @Time : 2023/02/12 22:10:08 6 | @Author : https://github.com/shujuecn 7 | ''' 8 | 9 | import requests 10 | 11 | 12 | class UniProtAPI: 13 | def __init__(self): 14 | self.headers = { 15 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; NCE-AL10 Build/HUAWEINCE-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Mobile Safari/537.36" 16 | } 17 | self.fields = "accession,reviewed,id,protein_name,gene_names,organism_name,length" 18 | 19 | def search(self, keyword, data_format): 20 | if data_format not in ['xlsx', 'json']: 21 | raise ValueError(f"Invalid data format: {data_format}. Supported formats are 'xlsx' and 'json'.") 22 | 23 | url = f"https://rest.uniprot.org/uniprotkb/stream?fields={self.fields}&format={data_format}&query=%28{keyword}%29" 24 | 25 | response = requests.get(url) 26 | return response.content 27 | 28 | 29 | uniprot = UniProtAPI() 30 | data = uniprot.search("Muscarinic acetylcholine receptor M1", "json") 31 | print(data) 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TCMSP-Spider 2 | 3 | TCMSP-Spider is a Python tool for extracting data from [TCMSP](https://www.tcmsp-e.com) (Traditional Chinese Medicine Systems Pharmacology Database and Analysis Platform) website. 
It allows you to search for a specific drug and retrieve its related ingredients, targets, and diseases. Additionally, you can download "all" data of drugs, ingredients, targets, and diseases. The tool can be easily configured to query and download a list of drugs, eliminating the need to manually pass `token` parameters. 4 | 5 | ## Installation 6 | 7 | 1. Clone the repository and navigate to the project directory: 8 | 9 | ``` 10 | git clone https://github.com/shujuecn/TCMSP-Spider.git 11 | cd TCMSP-Spider 12 | ``` 13 | 14 | 2. Install the required dependencies: 15 | 16 | ``` 17 | pip3 install -r requirements.txt 18 | ``` 19 | 20 | ## Usage 21 | ### Searching data by drug name 22 | 23 | 1. Add the names of the drugs you want to search for in `herb_list.txt`. You can add multiple drugs, and the names can be written in Chinese, Pinyin or Latin, for example: 24 | 25 | ``` 26 | 麻黄 27 | Baizhu 28 | Citrus Reticulata 29 | ``` 30 | 31 | 2. Run the following command to start the search process: 32 | 33 | ``` 34 | python3 src/search_save_herbs.py 35 | ``` 36 | 37 | The program will automatically obtain the `token` value and query all the drugs specified in `herb_list.txt`. Because a single Chinese or Pinyin name may correspond to multiple drugs, the program will download the ingredients, targets, and diseases of each drug, and save them in an Excel (.xlsx) file in the `data/spider_data` folder. 38 | 39 | ``` 40 | 麻黄 -> 麻黄、麻黄根 41 | fuzi -> Baifuzi、Difuzi、Fuzi、Laifuzi 42 | ``` 43 | ### Downloading "all" data 44 | 45 | On the [TCMSP Browse Database](https://tcmsp-e.com/browse.php?qc=herbs) page, the website provides four types of data, including "all" drugs, ingredients, targets, and diseases. You can use the following command to download these data and save them in an Excel (.xlsx) file in the `data/sample_data` folder. 
46 | 
47 | ```
48 | python3 src/get_all_data.py
49 | ```
50 | 
51 | ### Querying relationships
52 | 
53 | Using the data downloaded with "Get all data," you can use the program to query the relationships between drugs, ingredients, targets, and diseases. For example:
54 | 
55 | ```
56 | Target ID: TAR00006
57 | 
58 | Related diseases: Chronic inflammatory diseases...
59 | Related ingredients: cyanidol...
60 | Related herbs: Asteris Radix Et Rhizoma...
61 | ```
62 | 
63 | This feature is not yet implemented in the current version. In a future update, the program may be able to use the data downloaded with "Get all data" to query relationships between different elements, such as finding all the ingredients related to a certain disease or target.
64 | 
65 | ## LICENSE
66 | 
67 | This project is released under the MIT open source license. If you have any suggestions or feedback, please feel free to submit an issue or pull request.
68 | 
69 | ## Changelog
70 | 
71 | * 2023/02/09: Initial commit. Completed the search function and data download function.
72 | * 2023/02/10: Refactored the project structure and added the "download all data" function.
73 | 74 | -------------------------------------------------------------------------------- /src/tcmsp.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- encoding: utf-8 -*- 3 | ''' 4 | @Brief : TCMSP数据库爬虫 5 | @Time : 2023/02/09 19:39:55 6 | @Author : https://github.com/shujuecn 7 | ''' 8 | 9 | import os 10 | import re 11 | import json 12 | import requests 13 | import pandas as pd 14 | from bs4 import BeautifulSoup as bs 15 | import lxml.html 16 | 17 | 18 | class TcmspSpider: 19 | def __init__(self): 20 | 21 | self.root_url = "https://www.tcmsp-e.com/tcmspsearch.php" 22 | self.headers = { 23 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; NCE-AL10 Build/HUAWEINCE-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Mobile Safari/537.36", 24 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 25 | "Accept-Encoding": "gzip, deflate, br", 26 | } 27 | 28 | # self.file_path = "./data/" 29 | 30 | self.spider_file_path = "./data/spider_data/" 31 | self.sample_file_path = "./data/sample_data/" 32 | self.create_folder(self.spider_file_path) 33 | self.create_folder(self.sample_file_path) 34 | 35 | self.token = None 36 | 37 | def create_folder(self, path): 38 | """ 39 | Create folder 40 | :return: None 41 | """ 42 | 43 | if not os.path.exists(path): 44 | os.makedirs(path) 45 | 46 | def get_response(self, url): 47 | """ 48 | Get response from url 49 | :param url: url 50 | :return: html 51 | """ 52 | try: 53 | response = requests.get(url=url, headers=self.headers) 54 | response.raise_for_status() 55 | html = response.content.decode("utf-8") 56 | return html 57 | except requests.exceptions.RequestException as e: 58 | print(e) 59 | return 60 | 61 | def get_token(self): 62 | """ 63 | Get token 64 | :return: token 65 | """ 66 | 67 | html = self.get_response(self.root_url) 68 | root = 
lxml.html.fromstring(html) 69 | token = root.xpath('//form[@id="SearchForm"]//input[@name="token"]/@value') 70 | if token: 71 | print("token获取成功!\n") 72 | return token[0] 73 | else: 74 | print("token获取失败!\n") 75 | return 76 | 77 | def get_herb_name(self, herb_name): 78 | """ 79 | Get herb's English name 80 | :param herb_name: herb's Chinese name 81 | :return: herb's English name 82 | """ 83 | 84 | print(f"正在查询: {herb_name}...\n") 85 | 86 | url = f"{self.root_url}?qs=herb_all_name&q={herb_name}&token={self.token}" 87 | html = self.get_response(url) 88 | 89 | if html: 90 | soup = bs(html, "html.parser") 91 | script = soup.findAll("script")[8].__str__() 92 | # 解析药物的名称 93 | herb_three_names = re.findall(r"\n.*data:\s(.*),", script)[0] 94 | 95 | if herb_three_names != "[]": 96 | herb_three_names = json.loads(herb_three_names) 97 | return herb_three_names 98 | 99 | else: 100 | print(f"未查询到{herb_name}的信息!") 101 | return None 102 | else: 103 | pass 104 | 105 | def get_herb_data(self, cn_name, en_name, pinyin_name): 106 | """ 107 | Get herb's data 108 | :param cn_name: herb's Chinese name 109 | :param en_name: herb's Latin name 110 | :param pinyin_name: herb's pinyin name 111 | :return: None 112 | """ 113 | 114 | # Construction request link 115 | en_name = en_name.replace(" ", "%20") 116 | url = f"{self.root_url}?qr={en_name}&qsr=herb_en_name&token={self.token}" 117 | 118 | print(f"正在下载: {cn_name}...") 119 | html = self.get_response(url) 120 | if html: 121 | 122 | # 提取json数据 123 | # data = self.get_json_data(html) 124 | 125 | # 导出 Ingredients 126 | ingredients_pattern = "grid" 127 | ingredients_data = self.get_json_data(html, 11, ingredients_pattern) 128 | self.text_to_excel( 129 | ingredients_data, 130 | file_path=f"{self.spider_file_path}", 131 | file_name=f"{pinyin_name}_ingredients", 132 | index="MOL_ID" 133 | ) 134 | 135 | # 导出 Targets 136 | targets_pattern = "grid2" 137 | targets_data = self.get_json_data(html, 11, targets_pattern) 138 | self.text_to_excel( 139 | 
targets_data, 140 | file_path=f"{self.spider_file_path}", 141 | file_name=f"{pinyin_name}_targets", 142 | index="MOL_ID" 143 | ) 144 | 145 | # 导出 Disease 146 | # INDEX参数为False,因为Disease表格中没有MOL_ID 147 | disease_pattern = "grid3" 148 | disease_data = self.get_json_data(html, 11, disease_pattern) 149 | self.text_to_excel( 150 | disease_data, 151 | file_path=f"{self.spider_file_path}", 152 | file_name=f"{pinyin_name}_disease", 153 | index=False 154 | ) 155 | 156 | print(f"{cn_name}下载完成!\n") 157 | 158 | def get_json_data(self, html, num, pattern): 159 | """ 160 | Get json text 161 | :param html: html 162 | :param num: script number() 163 | :param pattern: regular expression 164 | :return: json text 165 | """ 166 | soup = bs(html, "html.parser") 167 | scripts = soup.findAll("script") 168 | 169 | # The serial number of data in different pages is different 170 | text = scripts[num].__str__() 171 | 172 | pattern = rf"\$\(\"\#{pattern}\".*\n.*\n.*data\:\s(\[.*\])" 173 | match = re.compile(pattern).search(text) 174 | result = match.group(1) 175 | data = json.loads(result) 176 | 177 | return data 178 | 179 | def text_to_excel(self, data, file_path, file_name, index): 180 | """ 181 | Regular expression extracts json data and converts to excel 182 | :param text: text 183 | :param pattern: regular expression 184 | :param file_name: file name 185 | :return: None 186 | """ 187 | 188 | # Regular expression extracts json data 189 | if data: 190 | df = pd.DataFrame(data) 191 | 192 | # Custom index columns 193 | if index: 194 | df.set_index(index, inplace=True) 195 | df.to_excel(f"{file_path}{file_name}.xlsx", index=True) 196 | else: 197 | df.to_excel(f"{file_path}{file_name}.xlsx", index=False) 198 | 199 | print(f"已保存:{file_name}.xlsx") 200 | 201 | else: 202 | print(f"未查询到{file_name}的信息!") 203 | --------------------------------------------------------------------------------