├── README.md
├── WOS 爬虫总结.pdf
├── __parameters__.py
└── wos_spyder.py


/README.md:
--------------------------------------------------------------------------------
## WOS Crawler Summary

### Purpose

* Quickly obtain the background of a paper: publication year, times cited, authors, DOI, document type, and the references it cites (a sketch for parsing the exported file is given at the end of this README)
* Obtain download links for the cited references, enabling batch download of the literature

### Prerequisites

* Your institution or university must have **a WOS subscription**, and the crawler must be **run from inside the campus network**.
* Downloading the full texts later additionally requires a **subscription** to the **databases** hosting those papers.
* Off-campus access via username and password may be added later, time permitting.

### Usage

##### 1. Required Python 3+ packages

```bash
pip install requests
pip install lxml
pip install bs4
```

##### 2. Test examples (the program is not packaged, so download the source and run it directly)

* Export all search results

```python
test = 'TS=LN AND PY=(2018-2020)'  # the search expression must contain at least two conditions
test_start = 1                     # index of the first record to export
test_end = 501                     # index of the last record to export
file_name = 'LNOI'                 # name of the output file; it is always saved as .txt, other formats are not supported
file_type = 'fieldtagged'
demo = export_paper(search_expression=test, export_start=test_start,
                    export_end=test_end, file_name=file_name, file_type=file_type)
demo.save()
```

* Export all references of a paper

```python
# Two txt files are produced: 'file_name.txt' holds the exported references, and 'no_doi.txt' stores the references that have no DOI
aim = 'TS=LNOI AND PY=2020 AND DO=10.1515/nanoph-2020-0013'  # searching by DOI is recommended so that the result is unique
file_name = 'LNOI'
aim_paper = get_references(search_expression=aim, file_name=file_name)
aim_paper.get_main()  # the interface differs slightly from the previous one; the two crawling flows are not quite the same
```

* WOS search syntax reference

```python
# ''' Advanced search reference:
# Boolean operators: AND, OR, NOT, SAME, NEAR
# Field tags:
# TS= Topic
# TI= Title
# AU= Author [index]
# AI= Author identifier
# GP= Group author [index]
# ED= Editor
# AB= Abstract
# AK= Author keywords
# KP= Keyword Plus ®
# SO= Publication name [index]
# DO= DOI
# PY= Publication year
# AD= Address
# SU= Research area
# IS= ISSN/ISBN
# '''
```

### Implementation notes

#### References

* The approach mainly follows this [blog post](https://blog.csdn.net/tomleung1996/article/details/86627443); the author's code lives in this [repository](https://github.com/tomleung1996/wos_crawler)

#### Background knowledge for the crawler

* [How crawlers work: cookies and sessions](https://blog.csdn.net/hfutzhouyonghang/article/details/81009760)
* Python [Requests basics](https://blog.csdn.net/shanzhizi/article/details/50903748)
* [Asynchronous loading and asynchronous transfer](https://blog.csdn.net/liaoningxinmin/article/details/80794774)
* [Handling 302 redirects in crawlers](https://blog.csdn.net/xc_zhou/article/details/80952208)
* [BeautifulSoup usage guide](https://www.jianshu.com/p/2b783f7914c6)
* [Python regular expressions (re)](https://www.cnblogs.com/CYHISTW/p/11363209.html)

#### Handy tools while building the crawler

* [Online HTML/JS formatter](https://www.sojson.com/jshtml.html)
* A blank txt file for dumping responses into

#### Crawling logic

* See the summary PDF in this repository; embedding images on GitHub is tedious

### Odd problems encountered

1. With `print(response.text)` the console output (my IDE is VS Code) differed from the page source shown in the browser, probably because VS Code truncates long output. Save the response to a txt file or parse it with bs4 to check whether the wanted page was actually retrieved.
2. The retrieved source contained no Chinese; every Chinese string appeared as `...`. After a long, fruitless search I tweaked the `headers` entry to `'Accept-Language': 'zh-CN,zh;q=0.9'` on a whim and it suddenly worked. A very happy moment!
3. Exporting all references failed at first because the submitted `form_data` was malformed, so check the form data carefully!
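
### Parsing the exported file (optional sketch)

The exported `fieldtagged` file is WOS plain text: each line starts with a two-letter tag (for example `AU` authors, `TI` title, `SO` source, `DI` DOI, `PY` year, `TC` times cited, `CR` cited references), indented lines continue the previous tag, and `ER` closes a record. The snippet below is a minimal, untested sketch of turning such a file into a list of dicts; the function name `parse_fieldtagged` and the tag meanings are illustrative assumptions about the export format, not something this crawler guarantees.

```python
# Minimal sketch for reading a field-tagged export such as LNOI.txt (assumed layout).
def parse_fieldtagged(path):
    records, current, tag = [], {}, None
    with open(path, encoding='utf-8-sig') as f:   # utf-8-sig strips a BOM if the export has one
        for line in f:
            line = line.rstrip('\n')
            if line.startswith('ER'):             # end of one record
                if current:
                    records.append(current)
                current, tag = {}, None
            elif line.startswith('   ') and tag:  # indented line continues the previous field
                current[tag].append(line.strip())
            elif len(line) >= 3 and line[2] == ' ':
                tag = line[:2]                    # a new two-letter field tag
                current.setdefault(tag, []).append(line[3:].strip())
    return records


papers = parse_fieldtagged('LNOI.txt')
for p in papers:
    print(p.get('PY', ['?'])[0], p.get('TC', ['?'])[0], p.get('DI', [''])[0])
```

Each record comes back as a dict mapping a tag to a list of lines, so the times-cited count would be `record['TC'][0]` and the cited references the list under `CR`, assuming those tags are present in your export.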

--------------------------------------------------------------------------------
/WOS 爬虫总结.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Guogeda/WOS/9b4af967b9ce28da4451527b0687a472c5650652/WOS 爬虫总结.pdf
--------------------------------------------------------------------------------
/__parameters__.py:
--------------------------------------------------------------------------------
import time


get_sid_url = 'http://www.webofknowledge.com/'

search_url = 'http://apps.webofknowledge.com/UA_AdvancedSearch.do'

search_url_redict = 'http://apps.webofknowledge.com/UA_AdvancedSearch_input.do;jsessionid={jse}?product=UA&search_mode=AdvancedSearch&replaceSetId=&goToPageLoc=SearchHistoryTableBanner&SID={sid}&errorQid={qid}'

entry_url = 'http://apps.webofknowledge.com/summary.do;jsessionid={jse}?product=UA&doc=1&qid={qid}&SID={sid}&search_mode=AdvancedSearch&update_back2search_link_param=yes'

search_header = {
    'Origin': 'https://apps.webofknowledge.com',
    'Connection': 'keep-alive',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',  # see README: tweaking this header fixed Chinese text showing up as '...'
    'Upgrade-insecure-requests': str(1),
    # 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 Edg/85.0.564.68',
    'Content-Type': 'application/x-www-form-urlencoded',
}

search_data = {
    'action': 'search',
    'product': 'UA',
    'search_mode': 'AdvancedSearch',
    'input_invalid_notice': r'检索错误: 请输入检索词。',
    'input_invalid_notice_limits': r'注意: 滚动框中显示的字段必须至少与一个其他检索字段相组配。',
    'SID': None,
    'formUpdated': 'true',
    'replaceSetId': '',
    'goToPageLoc': 'SearchHistoryTableBanner',
    'value(input1)': None,
    "value(searchOp)": "search",
    'limitStatus': 'collapsed',
    'ss_lemmatization': 'On',
    'ss_spellchecking': 'Suggest',
    'SinceLastVisit_UTC': '',
    'SinceLastVisit_DATE': '',
    'period': 'Range Selection',
    'range': 'ALL',
    'startYear': '1900',
    'endYear': time.strftime('%Y'),
    # 'editions': ['CCR', 'SCI', 'ISTP', 'IC'],
    'editions': ['WOS.CCR','WOS.SCI','WOS.ISTP','WOS.IC','CSCD.CSCD','CCC.CCCB','CCC.CCCA','CCC.CCCY','CCC.CCCT','CCC.CCCBC','CCC.CCCS','CCC.CCCEC','CCC.CCCP','CCC.CCCC','DIIDW.EDerwent','DIIDW.MDerwent','DIIDW.CDerwent','KJD.KJD','MEDLINE.MEDLINE','RSCI.RSCI','SCIELO.SCIELO'],
    'collections': ['MEDLINE','SCIELO','WOS','CSCD','CCC','KJD','RSCI','SCIELO'],
    'update_back2search_link_param': 'yes',
    'ssStatus': 'display:none',
    'ss_showsuggestions': 'ON',
    'ss_query_language': 'auto',
    'rs_sort_by': 'PY.D;LD.D;SO.A;VL.D;PG.A;AU.A'
}


export_url = 'http://apps.webofknowledge.com//OutboundService.do?action=go&&'

export_data = {
    "selectedIds": None,  # select specific records to download
    "displayCitedRefs": "true",
    "displayTimesCited": "true",
    "displayUsageInfo": "true",
    "viewType": "summary",
    "product": "UA",
    "rurl": None,
    "mark_id": "UDB",
    "search_mode": "AdvancedSearch",  # overridden in get_references.export()
    "locale": "zh_CN",
    "view_name": "UA-summary",  # overridden in get_references.export()
    "sortBy": "PY.D;LD.D;SO.A;VL.D;PG.A;AU.A",
    "mode": "OpenOutputService",  # overridden in get_references.export()
    "qid": None,
    "SID": None,
    "format": "saveToFile",
    "filters": "AUTHORSIDENTIFIERS ISSN_ISBN CITTIMES ABSTRACT SOURCE TITLE AUTHORS",
    "mark_to": None,
    "mark_from": None,
    "queryNatural": None,
    "count_new_items_marked": "0",
    "use_two_ets": "false",
    "IncitesEntitled": "no",
    "value(record_select_type)": "range",  # overridden in get_references.export()
    "markFrom": None,
    "markTo": None,
    "fields_selection": "AUTHORSIDENTIFIERS ISSN_ISBN CITTIMES ABSTRACT SOURCE TITLE AUTHORS",
    "save_options": None,
}

paper_url = 'http://apps.webofknowledge.com/full_record.do;jsessionid={jse}?product=UA&search_mode=AdvancedSearch&qid={qid}&SID={sid}&page={page}&doc={doc}'

reference_url = 'http://apps.webofknowledge.com/summary.do?product=UA&parentProduct=UA&search_mode=CitedRefList&parentQid={parent_qid}&parentDoc={doc}&qid={qid}&SID={SID}&colName=WOS&page={page}'

base_url = 'http://apps.webofknowledge.com/'
--------------------------------------------------------------------------------
/wos_spyder.py:
--------------------------------------------------------------------------------
import requests
from lxml import etree
import re
from bs4 import BeautifulSoup
import time
import os

from __parameters__ import *


class enter_wos():
    def __init__(self, search_expression):
        super().__init__()
        self._init_session()
        self.sid = self._get_sid(self.request.url)
        self.jse = self._get_jessionid(self.request.headers['Set-Cookie'])

        self.search_data = search_data
        self.search_expression = search_expression

        self.AdvancedSearch()

    def AdvancedSearch(self):

        self.search_data['SID'] = self.sid
        self.search_data['value(input1)'] = self.search_expression

        adsearch_response = self.session.post(url=search_url, data=self.search_data, headers=search_header, allow_redirects=False)
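        # note: the search POST answers with a 302 redirect; allow_redirects=False keeps that
        # response so the Location header, which carries the qid assigned to this query,
        # can be read by _get_qid() below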

        self.qid = self._get_qid(adsearch_response.headers['Location'])

        full_entry_url = entry_url.format(jse=self.jse, qid=self.qid, sid=self.sid)
        # print(full_entry_url)

        self.entry_response = self.session.get(url=full_entry_url, headers=search_header)
        self.entry_response.encoding = self.entry_response.apparent_encoding
        with open('entry_response.txt', 'w', encoding='utf-8') as file:
            file.write(self.entry_response.text)

    def _init_session(self):
        self.session = requests.Session()
        self.request = self.session.get(url=get_sid_url, headers=search_header)
        self.cookie = requests.utils.dict_from_cookiejar(self.request.cookies)

    def _get_sid(self, sid_str):
        sid_pattern = r'SID=(\w+)&'
        return re.findall(sid_pattern, sid_str)[0].replace('SID=', '').replace('&', '')

    def _get_db(self):
        soup = BeautifulSoup(self.request.text, 'lxml')
        db_str = str(soup.find('select', attrs={'id': 'ss_showsuggestions'}).get('onchange'))
        db_pattern = r'WOS\.(\w+)'
        pattern = re.compile(db_pattern)
        result = pattern.findall(db_str)
        if result:
            print('Subscribed databases:', result)
            self.db_list = result

    def _get_qid(self, qid_str):
        qid_pattern = r'Qid=(\d+)'
        return re.findall(qid_pattern, qid_str)[0]

    def _get_jessionid(self, jsessionid_str):
        jsessionid_pattern = r'JSESSIONID=(\w+)'
        return re.findall(jsessionid_pattern, jsessionid_str)[0]


class export_paper(enter_wos):

    def __init__(self, search_expression, export_start, export_end, file_name, file_type='fieldtagged', select_id=''):
        super().__init__(search_expression=search_expression)
        self.export_start = export_start
        self.export_end = export_end
        self.file_name = file_name
        self.file_type = file_type
        self.export_data = export_data
        self.select_id = select_id

    def download(self):
        soup = BeautifulSoup(self.entry_response.text, 'lxml')
        self.paper_num = int(soup.find('span', attrs={'id': 'footer_formatted_count'}).get_text().replace(',', ''))
        print('found {paper_num} articles'.format(paper_num=self.paper_num))

        self.export_data['selectedIds'] = self.select_id
        self.export_data['rurl'] = self.entry_response.url
        self.export_data['qid'] = str(self.qid)
        self.export_data['SID'] = str(self.sid)
        self.export_data['queryNatural'] = self.search_expression

        self.export_end = self.export_end if self.export_end < self.paper_num else self.paper_num

        # records are exported in batches of at most 500 per request
        span = 500
        iter_num = self.paper_num // span + 1
        start_index = self.export_start // 500
        end_index = self.export_end // 500

        start = self.export_start

        for i in range(iter_num):
            if start_index == i and end_index >= i:
                end = self.export_end if self.export_end < (i+1) * span else (i+1) * span
                print('exporting records {start} to {end}, file type is {filetype}'.format(
                    start=start, end=end, filetype=self.file_type
                ))
                self.export_data['mark_to'] = end
                self.export_data['mark_from'] = start
                self.export_data['markFrom'] = start
                self.export_data['markTo'] = end
                self.export_data['save_options'] = self.file_type

                export_response = self.session.post(data=self.export_data, url=export_url, headers=search_header, allow_redirects=False)
                export_response.encoding = export_response.apparent_encoding
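                # the export POST is also answered with a redirect; its Location header
                # points at the generated file, which is fetched with a plain GET below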

                download_url = export_response.headers['Location']
                download_response = self.session.get(url=download_url, headers=search_header)

                start_index += 1
                start = (i+1) * span + 1

                yield download_response.text
                time.sleep(10)  # wait 10 s before the next batch, to be polite to the server

    def save(self):
        contents = self.download()
        file_name = '{}.txt'.format(self.file_name)
        with open(file_name, 'w', encoding='utf-8') as f:
            for content in contents:
                f.write(content)
                f.write('\n')
        print('save ok')


class get_references(enter_wos):
    def __init__(self, search_expression, file_name, file_type='fieldtagged'):
        super().__init__(search_expression=search_expression)
        if 'no_doi.txt' in os.listdir('.'):
            os.remove('no_doi.txt')
        self.file_name = file_name
        self.file_type = file_type
        self.export_data = export_data
        self.no_doi = []

    def get_main(self):
        # open the full-record page of the target paper
        exact_url = paper_url.format(jse=self.jse, qid=self.qid, sid=self.sid, page=str(1), doc=str(1))
        paper_response = self.session.get(url=exact_url, headers=search_header)
        paper_soup = BeautifulSoup(paper_response.text, 'lxml')

        self.num = self._get_nums(paper_soup)                                 # number of cited references
        self.name = self._get_paper_name(paper_soup)                          # title of the paper
        self.page = int(self.num) // 30 + 1                                   # number of reference pages (30 per page)
        self.base_references_url = self._get_all_references_url(paper_soup)   # base url of the reference pages

        self.save()  # export and save

    def export(self):
        print('need to export {num} references'.format(num=self.num))
        print('paper title: {}'.format(self.name))

        self.export_data['SID'] = str(self.sid)
        self.export_data['queryNatural'] = '从: ' + self.name
        self.export_data['mark_to'] = ''
        self.export_data['mark_from'] = ''
        self.export_data['markFrom'] = ''
        self.export_data['markTo'] = ''
        self.export_data['colName'] = 'WOS'
        self.export_data['search_mode'] = 'CitedRefList'
        self.export_data['view_name'] = 'UA-CitedRefList-summary'
        self.export_data['mode'] = 'CitedRefList-OpenOutputService'
        self.export_data['value(record_select_type)'] = 'pagerecords'
        self.export_data['sortBy'] = 'CAU.A;CW.A;CY.D;CV.D;CG.A'
        self.export_data['save_options'] = self.file_type

        for i in range(self.page):

            print('exporting page {} of {}'.format(i+1, self.page))

            # open the "cited references" page; each page lists 30 references
            references_url = self.base_references_url + str(i+1)
            reference_response = self.session.get(url=references_url, headers=search_header)

            # qid of the cited-reference result set
            self.son_qid = self._get_son_qid(reference_response.url)
            reference_soup = BeautifulSoup(reference_response.text, 'lxml')

            self.select_id = self._filter(reference_soup, i)  # some references cannot be exported
            self.export_data['qid'] = str(self.son_qid)
            self.export_data['selectedIds'] = ';'.join(i for i in self.select_id)
            self.export_data['rurl'] = reference_response.url
            # post to the export url with the marked records
            new_export_url = export_url
            export_response = self.session.post(data=self.export_data, url=new_export_url, headers=search_header, allow_redirects=False)
            # get the download address of the 'txt' file
            download_url = export_response.headers['Location']
            download_response = self.session.get(url=download_url, headers=search_header)

            yield download_response.text

    def save(self):
        contents = self.export()
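        # export() is a generator: each page of references is fetched and written out
        # one at a time rather than being accumulated in memory first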
        file_name = '{}.txt'.format(self.file_name)
        with open(file_name, 'w', encoding='utf-8') as f:
            for content in contents:
                f.write(content)
                f.write('\n')
        print('references without a DOI: {}'.format(self.no_doi))
        print('save ok')

    def _filter(self, soup, i):
        has_doi = []
        check_boxs = soup.findAll(name='div', attrs={'class': 'search-results-checkbox-align'})
        for index, check_box in enumerate(check_boxs):
            try:
                has_doi.append(check_box.input['value'])
            except KeyError as e:
                paper_id = str(index + 1 + 30 * i)
                self.no_doi.append(paper_id)
                self._save_no_doi(soup, index, paper_id)
        return has_doi

    def _save_no_doi(self, soup, index, paper_id):
        data_info = soup.findAll(name='div', attrs={'class': 'reference-item-non-ar'})[index]
        try:
            title, press, *page_num, year = [i.get_text() for i in data_info.findAll(name='value')]
        except ValueError as e:
            title = 'no title'
            press, *page_num, year = [i.get_text() for i in data_info.findAll(name='value')]
        try:
            author = data_info.a.get_text()
        except AttributeError as e:
            author = 'no author'

        with open('no_doi.txt', 'a', encoding='utf-8') as f:
            f.write('{paper_id},{title},{author},{press},{year}'.format(
                paper_id=paper_id, title=title, author=author, press=press, year=year
            ))
            f.write('\n')

    def _get_son_qid(self, son_qid_str):
        son_qid_pattern = r'qid=(\d+)'
        return re.findall(son_qid_pattern, son_qid_str)[0]

    def _get_paper_name(self, soup):
        name = soup.find(name='div', attrs={'class': 'title'}).value.get_text()
        return name

    def _get_nums(self, soup):
        num_str = soup.find(name='div', attrs={'class': 'cited-ref-separator'}).h2.get_text()
        num_pattern = r'\d+'
        return re.findall(num_pattern, num_str)[0]

    def _get_all_references_url(self, soup):
        url_str = soup.find(name='div', attrs={'class': 'cited-ref-separator'}).a['href']
        # return base_url + url_str[:10] + ';jsessionid={}'.format(self.jse) + url_str[10:-1]
        return base_url + url_str[:-1]


if __name__ == "__main__":
    # ''' Advanced search reference:
    # Boolean operators: AND, OR, NOT, SAME, NEAR
    # Field tags:
    # TS= Topic
    # TI= Title
    # AU= Author [index]
    # AI= Author identifier
    # GP= Group author [index]
    # ED= Editor
    # AB= Abstract
    # AK= Author keywords
    # KP= Keyword Plus ®
    # SO= Publication name [index]
    # DO= DOI
    # PY= Publication year
    # AD= Address
    # SU= Research area
    # IS= ISSN/ISBN
    # '''

    # export all references of one paper
    aim = 'TS=LNOI AND PY=2020 AND DO=10.1515/nanoph-2020-0013'
    file_name = 'LNOI'
    file_type = 'fieldtagged'
    aim_paper = get_references(search_expression=aim, file_name=file_name, file_type=file_type)
    aim_paper.get_main()

    # export all search results to txt
    # test = 'TS=LN AND PY=(2018-2020)'  # the search expression must contain at least two conditions
    # test_start = 1
    # test_end = 501
    # file_type = 'fieldtagged'
    # demo = export_paper(search_expression=test, export_start=test_start,
    #                     export_end=test_end, file_name='LNOI', file_type=file_type)
    # demo.save()

--------------------------------------------------------------------------------