├── 58tongcheng ├── test1.py └── test2.py ├── 91porn.py ├── README.md ├── download-citation ├── 2018-12-13-22-18-39.ris ├── 2018-12-13.ris ├── pa.py ├── pa1.py ├── ro.ris ├── ros.html └── springer.py ├── download_biao_qing_win.py ├── huaban.py ├── ip_pachong.py ├── login.py ├── login2.py ├── meizitu3.py ├── meizitu_pro.py ├── meizitu_pro2.py ├── my_blog ├── article │ └── templatetags │ │ ├── __init__.py │ │ └── custom_markdown.py └── templates │ ├── aboutme.html │ ├── archives.html │ ├── base.html │ ├── home.html │ ├── post.html │ ├── tag.html │ └── test.html ├── paqubiaoqing.py ├── porn ├── down_video.py └── test1.py ├── requests1.py ├── requests2.py ├── requests3.py ├── scraping_ajax.py ├── selenium ├── test1.py ├── test2.py ├── test3.py └── test4.py ├── some ├── aj.py ├── pa.py ├── pa1.py ├── springer.py ├── xuanke.py ├── xuanke2.py ├── zhihu.py ├── zhihu2.py └── zhihu3.py └── zhihu └── denglu.py /58tongcheng/test1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/58tongcheng/test1.py -------------------------------------------------------------------------------- /58tongcheng/test2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/58tongcheng/test2.py -------------------------------------------------------------------------------- /91porn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import requests, re, random, time, os, csv 5 | from bs4 import BeautifulSoup as bs 6 | from parsel import Selector 7 | 8 | headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 9 | 'Accept-Encoding':'gzip, deflate, sdch', 10 | 'Accept-Language':'zh-CN,zh;q=0.8', 11 | 'Cache-Control':'max-age=0', 12 | 'Connection':'keep-alive', 13 | 'DNT':'1', 14 | 'Host':'email.91dizhi.at.gmail.com.8h9.space', 15 | 'Upgrade-Insecure-Requests':'1', 16 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'} 17 | 18 | def download_urls(url): 19 | r = requests.get(url, headers=headers, timeout=30) 20 | r.encoding = 'utf-8' 21 | html = r.text 22 | obj = bs(html, 'html.parser') 23 | lists = obj.find_all('div', {'class': re.compile('imagechannel.*?')}) 24 | for i in lists: 25 | try: 26 | a = i.find('a') 27 | video_url = a.attrs['href'] 28 | img_url = a.find('img').attrs['src'] 29 | title = a.find('img').attrs['title'] 30 | print(video_url, img_url, title) 31 | 32 | with open('91porn_all.csv', 'a', newline='', encoding='utf_8_sig') as csvfile: 33 | ww = csv.writer(csvfile, dialect='excel') 34 | ww.writerow([title, img_url, video_url]) 35 | except: 36 | continue 37 | 38 | def crawl_urls(n): 39 | for i in range(1,n+1): 40 | url = 'http://email.91dizhi.at.gmail.com.8h9.space/v.php?category=mf&viewtype=basic&page=' + str(i) 41 | try: # 尝试三次,如果3次请求仍然不能成功,则跳过该页,继续爬取下一页 42 | download_urls(url) 43 | except: 44 | try: 45 | download_urls(url) 46 | except: 47 | try: 48 | download_urls(url) 49 | except: 50 | continue 51 | time.sleep(0.001) 52 | 53 | n = 3526 # 总页数 54 | crawl_urls(n) 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-sipder 2 | ## 
python爬虫学习教程 3 | 4 | ### 爬取妹子图爬虫 5 | 6 | By [Jim-Bin](https://github.com/Jim-bin). 7 | 8 | #### Description 9 | 10 | 实现的爬取[妹子图](http://www.meizitu.com/) 11 | 12 | #### 下载meizitu3.py 13 | #### Installation 14 | 15 | > pip install bs4 16 | 17 |    > pip install requests 18 | 19 | #### Usage 20 | 21 | * 妹子图:`python meizitu3.py` 22 | -------------------------------------------------------------------------------- /download-citation/pa.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ''' 4 | 在以下环境测试通过: 5 | python 2.7.15或者3.7.0 6 | win10或者lubuntu 7 | ''' 8 | 9 | # 导入模块 10 | import time 11 | import requests, re, random, os 12 | from bs4 import BeautifulSoup 13 | from requests import Session 14 | 15 | session = Session() 16 | 17 | 18 | ''' 19 | 给定页数,爬取每页所有图片的url,通过此url可以打开图片所在的网页 20 | 所有url存在一个列表中 21 | ''' 22 | 23 | 24 | def scrapy_img_urls(nums): 25 | lss = [] 26 | for num in range(1, nums + 1): 27 | url = 'http://www.doutula.com/photo/list/?page=' + str(num) 28 | html = requests.get(url, headers=headers) 29 | html.encoding = 'utf-8' 30 | 31 | text = html.text 32 | bsop = BeautifulSoup(text, 'html.parser') 33 | ass = bsop.find('div', {'class': 'page-content'}).find('div').findAll('a') 34 | 35 | for a in ass: 36 | # print(a.attrs['href']) 37 | lss.append(a.attrs['href']) 38 | time.sleep(1) 39 | return lss 40 | 41 | 42 | ''' 43 | 接收每个图片的url,打开此url,找到图片真实的地址,通过此地址可以下载图片 44 | 找到图片真实的url和名字之后调用download_url函数可以下载图片 45 | ''' 46 | 47 | 48 | def download_img_url(url): 49 | html = requests.get(url, headers=headers) 50 | html.encoding = 'utf-8' 51 | 52 | text = html.text 53 | bsop = BeautifulSoup(text, 'html.parser') 54 | img = bsop.find('div', {'class': 'col-xs-12 col-sm-12 artile_des'}) 55 | img_url = img.find('img').attrs['src'] 56 | img_title = img.find('img').attrs['alt'] 57 | print(img_url + " " + img_title) 58 | 59 | download_img(img_url, img_title) 60 | 61 | 62 | ''' 63 | 下载图片,该函数接收两个参数,一个是图片的真实地址,一个是图片的名字 64 | 名字中如果有特殊字符则需要处理,不然windows下可能无法保存,处理名字调用format_name函数 65 | 打开指定文件夹保存图片,如果没有则创建。 66 | ''' 67 | 68 | 69 | def download_img(img_url, img_title): 70 | img_title = format_name(img_title) # 如果图册名字有特殊字符需要处理。不然在windows下保存不了文件夹 71 | if not os.path.exists(file_path): 72 | os.makedirs(file_path) 73 | os.chdir(file_path) 74 | 75 | # 图片保存到本地 76 | exists = os.path.exists(img_title) 77 | if not exists: 78 | img_html = requests.get(img_url, headers=headers, stream=True, timeout=20, verify=True) 79 | img_html.encoding = 'utf-8' 80 | with open(img_title + ".gif", 'wb') as f: 81 | f.write(img_html.content) 82 | f.close() 83 | 84 | 85 | def format_name(img_title): 86 | ''' 87 | 对名字进行处理,如果包含下属字符,则直接剔除该字符 88 | :param img_title: 89 | :return: 90 | ''' 91 | for i in ['\\', '/', ':', '*', '?', '"', '<', '>', '!', '|']: 92 | while i in img_title: 93 | img_title = img_title.strip().replace(i, '') 94 | return img_title 95 | 96 | 97 | def royal(url): 98 | html = requests.get(url, headers=headers, stream=True, timeout=20, verify=True) 99 | html.encoding = 'utf-8' 100 | text = html.text 101 | bsop = BeautifulSoup(text, 'html.parser') 102 | timeofissued = bsop.find('meta', {'name':'DC.issued'}).attrs['content'].split('/')[0] 103 | citation_title = bsop.find('meta', {'name':'citation_title'}).attrs['content'] 104 | citation_journal_title = bsop.find('meta', {'name':'citation_journal_title'}).attrs['content'] 105 | citation_journal_abbrev = bsop.find('meta', {'name':'citation_journal_abbrev'}).attrs['content'] 106 | citation_volume = bsop.find('meta', 
{'name':'citation_volume'}).attrs['content'] 107 | citation_issue = bsop.find('meta', {'name':'citation_issue'}).attrs['content'] 108 | citation_firstpage = bsop.find('meta', {'name':'citation_firstpage'}).attrs['content'] 109 | citation_lastpage = bsop.find('meta', {'name':'citation_lastpage'}).attrs['content'] 110 | citation_doi = bsop.find('meta', {'name':'citation_doi'}).attrs['content'] 111 | PB = bsop.find('meta', {'name':'DC.publisher'}).attrs['content'] 112 | M3 = citation_doi 113 | citation_url = 'http://dx.doi.org/' + citation_doi 114 | citation_abstract = bsop.find('meta', {'name':'citation_abstract'}).attrs['content'].strip() 115 | SN = bsop.find('div', {'class':'article-nav__issue autopad--h'}).find('a').attrs['href'].split('=')[-1] 116 | 117 | with open(citation_title + ".ris", 'w') as f: 118 | f.write('TY - JOUR\n') 119 | f.write('T1 - ' + citation_title + '\n') 120 | f.write('Y1 - ' + timeofissued + '\n') 121 | f.write('SP - ' + citation_firstpage + '\n') 122 | f.write('EP - ' + citation_lastpage + '\n') 123 | f.write('JF - ' + citation_journal_title + '\n') 124 | f.write('JO - ' + citation_journal_abbrev + '\n') 125 | f.write('VL - ' + citation_volume + '\n') 126 | f.write('RS - ' + citation_issue + '\n') 127 | f.write('PB - ' + PB + '\n') 128 | f.write('SN - ' + SN + '\n') 129 | f.write('DO - ' + citation_doi + '\n') 130 | f.write('M3 - ' + M3 + '\n') 131 | f.write('UR - ' + citation_url + '\n') 132 | print(citation_url) 133 | f.write('N2 - ' + citation_abstract + '\n') 134 | print(citation_abstract) 135 | 136 | authors = bsop.findAll('span', {'class': 'article__author-link'}) 137 | for author in authors: 138 | author = author.find('a').text.split(' ') 139 | author = author[-1] + ', ' + ' '.join(author[:-1]) 140 | f.write('A1 - ' + author + '\n') 141 | f.write('ER - ' + '\n') 142 | f.close() 143 | 144 | # authors = bsop.findAll('span', {'class':'article__author-link'}) 145 | # for author in authors: 146 | # author = author.find('a').text.split(' ') 147 | # author = author[-1] + ', ' + ' '.join(author[:-1]) 148 | # with open(author + ".ris", 'w') as f: 149 | # f.write('TY - JOUR') 150 | # f.write('T1 - ' + citation_title) 151 | # f.write('T1 - ' + authors) 152 | # f.close() 153 | 154 | # print(author) 155 | # print(timeofissued) 156 | 157 | 158 | 159 | 160 | 161 | # print(authors) 162 | # with open("ro.ris", 'wb') as f: 163 | # f.write(html.content) 164 | # f.close() 165 | 166 | 167 | def scawurls(url): 168 | 169 | headers1 = { 170 | 'Accept':'text/html, */*; q=0.01', 171 | 'Connection': 'keep-alive', 172 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 173 | 'DNT':'1', 174 | 'Host':'pubs.rsc.org', 175 | 'Origin':'https://pubs.rsc.org', 176 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36', 177 | 'X-NewRelic-ID':'VQYFWF9aDBABV1laBgcFUw ==', 178 | 'X-Requested-With':'XMLHttpRequest' 179 | } 180 | 181 | data = { 182 | 'searchterm': 'AAEAAAD/////AQAAAAAAAAAMAgAAAGNSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGwFAQAAADlSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLlNlYXJjaFRlcm0OAAAAGTxDYXRlZ29yeT5rX19CYWNraW5nRmllbGQcPFN1YkNhdGVnb3J5PmtfX0JhY2tpbmdGaWVsZBw8Q29udGVudFR5cGU + a19fQmFja2luZ0ZpZWxkGjxDcml0ZXJpYXM + a19fQmFja2luZ0ZpZWxkFzxGYWNldHM + 
a19fQmFja2luZ0ZpZWxkHDxSZXF1ZXN0VGltZT5rX19CYWNraW5nRmllbGQfPEF1dGhvckNyaXRlcmlhPmtfX0JhY2tpbmdGaWVsZCA8UHVibGljYXRpb25EYXRlPmtfX0JhY2tpbmdGaWVsZBk8RXhjbHVkZXM + a19fQmFja2luZ0ZpZWxkFzxTb3VyY2U + a19fQmFja2luZ0ZpZWxkHzxPdXRwdXRTdGFuZGFyZD5rX19CYWNraW5nRmllbGQePFJlc3VsdHNGb3JtYXQ + a19fQmFja2luZ0ZpZWxkHjxEaXNwbGF5Q291bnRzPmtfX0JhY2tpbmdGaWVsZCA8UHJvZHVjdFBhZ2VTaXplPmtfX0JhY2tpbmdGaWVsZAEBAQMDAAQEAwEBAQEBwgFTeXN0ZW0uQ29sbGVjdGlvbnMuR2VuZXJpYy5MaXN0YDFbW1JTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXcIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0NPVJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guQXV0aG9yQ3JpdGVyaWECAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlAgAAAMIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0CAAAABgMAAAADQWxsCgYEAAAAA0FsbAkFAAAACQYAAAAAAAAAAAAAAAkHAAAACQgAAAAJCQAAAAoKCgoKBAUAAADCAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWUsIFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cywgVmVyc2lvbj0yMDE4LjAuNTQ5LjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49bnVsbF1dAwAAAAZfaXRlbXMFX3NpemUIX3ZlcnNpb24EAAA6UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWVbXQIAAAAICAkKAAAABAAAAAQAAAABBgAAAAUAAAAJCwAAAAAAAAAAAAAABQcAAAA9UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JDcml0ZXJpYQIAAAAgPEJvb2xlYW5PcGVyYXRvcj5rX19CYWNraW5nRmllbGQYPEF1dGhvcnM + a19fQmFja2luZ0ZpZWxkAQPDAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JJbmZvLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXQIAAAAKCgUIAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlBQAAAB88SXNTZWxlY3RlZERhdGU + a19fQmFja2luZ0ZpZWxkGTxEYXRlVHlwZT5rX19CYWNraW5nRmllbGQbPFdpdGhJbkxhc3Q + a19fQmFja2luZ0ZpZWxkGjxEYXRlUmFuZ2U + a19fQmFja2luZ0ZpZWxkHDxEaXNwbGF5RGF0ZT5rX19CYWNraW5nRmllbGQAAQQEAQE5UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5XaXRoSW5MYXN0AgAAADhSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLkRhdGVSYW5nZQIAAAACAAAAAAoKCgoBCQAAAAUAAAAJCwAAAAAAAAAAAAAABwoAAAAAAQAAAAQAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAkMAAAACQ0AAAAJDgAAAAkPAAAABwsAAAAAAQAAAAAAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAUMAAAAOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlBAAAABU8TmFtZT5rX19CYWNraW5nRmllbGQcPERpc3BsYXlOYW1lPmtfX0JhY2tpbmdGaWVsZBY8VmFsdWU + a19fQmFja2luZ0ZpZWxkIDxCb29sZWFuT3BlcmF0b3I + 
a19fQmFja2luZ0ZpZWxkAQEBAQIAAAAGEAAAAAhmcmVldGV4dAoGEQAAAG9kZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtIEFORCBDdSwgT1IgY29wcGVyLCBPUiBlbGVjdHJvbGVzcywgT1IgcHJpbnRpbmcsIE9SIGZsZXhpYmxlLCBPUiBzdWJzdHJhdGUsIE9SIHBsYXN0aWMKAQ0AAAAMAAAABhIAAAAHQWxsVGV4dAoGEwAAABlkZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtCgEOAAAADAAAAAYUAAAAC0F0bGVhc3RUZXh0CgYVAAAAP0N1LCBjb3BwZXIsIGVsZWN0cm9sZXNzLCBwcmludGluZywgZmxleGlibGUsIHN1YnN0cmF0ZSwgcGxhc3RpYwoBDwAAAAwAAAAGFgAAABBPcmlnaW5hbEZyZWVUZXh0CgYXAAAAb2RlcG9zaXRpb24sIHBhdHRlcm4sIGZpbG0gQU5EIEN1LCBPUiBjb3BwZXIsIE9SIGVsZWN0cm9sZXNzLCBPUiBwcmludGluZywgT1IgZmxleGlibGUsIE9SIHN1YnN0cmF0ZSwgT1IgcGxhc3RpYwoL', 183 | 'resultcount': '282607', 184 | 'category': 'all', 185 | 'pageno': '2' 186 | } 187 | 188 | html = requests.post(url, data=data, headers=headers1, stream=True, timeout=20, verify=True) 189 | html.encoding = 'utf-8' 190 | text = html.text 191 | # print(text) 192 | bsop = BeautifulSoup(text, 'html.parser') 193 | divs = bsop.findAll('div', {'class': 'capsule capsule--article '}) 194 | for i in divs: 195 | article_url = 'https://pubs.rsc.org' + i.find('a').attrs['href'] 196 | print(article_url) 197 | # royal(article_url) 198 | 199 | # with open("ros.html", 'wb') as f: 200 | # f.write(html.content) 201 | # f.close() 202 | # print(text) 203 | 204 | # session.head('https://pubs.rsc.org/en/results/all?Category=All&AllText=deposition%2C%20pattern%2C%20film&AtleastText=Cu%2C%20copper%2C%20electroless%2C%20printing%2C%20flexible%2C%20substrate%2C%20plastic&IncludeReference=false&SelectJournal=false&DateRange=false&SelectDate=false&Type=Months&DateFromMonth=Months&DateToMonth=Months&PriceCode=False&OpenAccess=false') 205 | 206 | # 构造headers 207 | UserAgent_List = [ 208 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 209 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 210 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 211 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 212 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 213 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 214 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 215 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 216 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 217 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 218 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 219 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 220 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 221 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 222 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 223 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 224 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 
(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 225 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 226 | ] 227 | headers = {'User-Agent': random.choice(UserAgent_List), 228 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 229 | 'Accept-Encoding': 'gzip', 230 | } 231 | 232 | url = 'https://pubs.rsc.org/en/search/journalresult' 233 | scawurls(url) 234 | 235 | 236 | 237 | # url = 'https://pubs.rsc.org/en/content/articlelanding/2017/tc/c7tc00038c#!divAbstract' 238 | # royal(url) 239 | 240 | # nums = 5 241 | # # 图片存储路径,在linux系统下 242 | # file_path = '/home/zhangyb/downloadfiles/pythonpro/biaoqing' 243 | # # 图片存储路径,在windows系统下 244 | # # file_path = 'E:\downloadfiles\pythonpro\biaoqing' 245 | # urls = scrapy_img_urls(nums) 246 | # for i in urls: 247 | # print(i) 248 | # download_img_url(i) 249 | 250 | 251 | # url = 'https://pubs.rsc.org/en/results/all?Category=All&AllText=deposition%2C%20pattern%2C%20film&AtleastText=Cu%2C%20copper%2C%20electroless%2C%20printing%2C%20flexible%2C%20substrate%2C%20plastic&IncludeReference=false&SelectJournal=false&DateRange=false&SelectDate=false&Type=Months&DateFromMonth=Months&DateToMonth=Months&PriceCode=False&OpenAccess=false' 252 | # r = requests.get(url, headers=headers) 253 | # print(r.text) 254 | -------------------------------------------------------------------------------- /download-citation/pa1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | import random 5 | from bs4 import BeautifulSoup 6 | import time 7 | 8 | download_time = time.strftime("%Y-%m-%d", time.localtime()) 9 | 10 | 11 | def royal(article_urls): 12 | for article_url in article_urls: 13 | # try: 14 | html = requests.get(article_url, headers=headers, stream=True, timeout=20, verify=True) 15 | html.encoding = 'utf-8' 16 | text = html.text 17 | bsop = BeautifulSoup(text, 'html.parser') 18 | try: 19 | timeofissued = bsop.find('meta', {'name':'DC.issued'}).attrs['content'].split('/')[0] 20 | except: 21 | pass 22 | try: 23 | citation_title = bsop.find('meta', {'name':'citation_title'}).attrs['content'] 24 | except: 25 | pass 26 | try: 27 | citation_journal_title = bsop.find('meta', {'name':'citation_journal_title'}).attrs['content'] 28 | except: 29 | pass 30 | try: 31 | citation_journal_abbrev = bsop.find('meta', {'name':'citation_journal_abbrev'}).attrs['content'] 32 | except: 33 | pass 34 | try: 35 | citation_volume = bsop.find('meta', {'name':'citation_volume'}).attrs['content'] 36 | except: 37 | pass 38 | try: 39 | citation_issue = bsop.find('meta', {'name':'citation_issue'}).attrs['content'] 40 | except: 41 | pass 42 | try: 43 | citation_firstpage = bsop.find('meta', {'name':'citation_firstpage'}).attrs['content'] 44 | except: 45 | pass 46 | try: 47 | citation_lastpage = bsop.find('meta', {'name':'citation_lastpage'}).attrs['content'] 48 | except: 49 | pass 50 | try: 51 | citation_doi = bsop.find('meta', {'name':'citation_doi'}).attrs['content'] 52 | except: 53 | pass 54 | try: 55 | PB = bsop.find('meta', {'name':'DC.publisher'}).attrs['content'] 56 | except: 57 | pass 58 | try: 59 | M3 = citation_doi 60 | except: 61 | pass 62 | try: 63 | citation_url = 'http://dx.doi.org/' + citation_doi 64 | except: 65 | pass 66 | try: 67 | citation_abstract = bsop.find('meta', {'name':'citation_abstract'}).attrs['content'].strip() 68 | except: 69 | pass 70 | try: 71 | SN = bsop.find('div', 
{'class':'article-nav__issue autopad--h'}).find('a').attrs['href'].split('=')[-1] 72 | except: 73 | pass 74 | # except: 75 | # print(article_url) 76 | # continue 77 | 78 | with open(download_time + ".ris", 'a', encoding='utf-8') as f: 79 | f.write('TY - JOUR\n') 80 | f.write('T1 - ' + citation_title + '\n') 81 | f.write('Y1 - ' + timeofissued + '\n') 82 | f.write('SP - ' + citation_firstpage + '\n') 83 | f.write('EP - ' + citation_lastpage + '\n') 84 | f.write('JF - ' + citation_journal_title + '\n') 85 | f.write('JO - ' + citation_journal_abbrev + '\n') 86 | f.write('VL - ' + citation_volume + '\n') 87 | f.write('RS - ' + citation_issue + '\n') 88 | f.write('PB - ' + PB + '\n') 89 | f.write('SN - ' + SN + '\n') 90 | f.write('DO - ' + citation_doi + '\n') 91 | f.write('M3 - ' + M3 + '\n') 92 | f.write('UR - ' + citation_url + '\n') 93 | print(citation_url) 94 | f.write('N2 - ' + citation_abstract + '\n') 95 | # print(citation_abstract) 96 | 97 | authors = bsop.findAll('span', {'class': 'article__author-link'}) 98 | for author in authors: 99 | author = author.find('a').text.split(' ') 100 | author = author[-1] + ', ' + ' '.join(author[:-1]) 101 | f.write('A1 - ' + author + '\n') 102 | f.write('ER - ' + '\n\n\n') 103 | f.close() 104 | time.sleep(1) 105 | 106 | 107 | def crawl_article_url(nums): 108 | article_urls = [] 109 | for num in range(1, nums+1): 110 | 111 | url = 'https://pubs.rsc.org/en/search/journalresult' 112 | 113 | headers1 = { 114 | 'Accept':'text/html, */*; q=0.01', 115 | 'Connection': 'keep-alive', 116 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 117 | 'DNT':'1', 118 | 'Host':'pubs.rsc.org', 119 | 'Origin':'https://pubs.rsc.org', 120 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36', 121 | 'X-NewRelic-ID':'VQYFWF9aDBABV1laBgcFUw ==', 122 | 'X-Requested-With':'XMLHttpRequest' 123 | } 124 | 125 | data = { 126 | 'searchterm': 'AAEAAAD/////AQAAAAAAAAAMAgAAAGNSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGwFAQAAADlSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLlNlYXJjaFRlcm0OAAAAGTxDYXRlZ29yeT5rX19CYWNraW5nRmllbGQcPFN1YkNhdGVnb3J5PmtfX0JhY2tpbmdGaWVsZBw8Q29udGVudFR5cGU + a19fQmFja2luZ0ZpZWxkGjxDcml0ZXJpYXM + a19fQmFja2luZ0ZpZWxkFzxGYWNldHM + a19fQmFja2luZ0ZpZWxkHDxSZXF1ZXN0VGltZT5rX19CYWNraW5nRmllbGQfPEF1dGhvckNyaXRlcmlhPmtfX0JhY2tpbmdGaWVsZCA8UHVibGljYXRpb25EYXRlPmtfX0JhY2tpbmdGaWVsZBk8RXhjbHVkZXM + a19fQmFja2luZ0ZpZWxkFzxTb3VyY2U + a19fQmFja2luZ0ZpZWxkHzxPdXRwdXRTdGFuZGFyZD5rX19CYWNraW5nRmllbGQePFJlc3VsdHNGb3JtYXQ + 
a19fQmFja2luZ0ZpZWxkHjxEaXNwbGF5Q291bnRzPmtfX0JhY2tpbmdGaWVsZCA8UHJvZHVjdFBhZ2VTaXplPmtfX0JhY2tpbmdGaWVsZAEBAQMDAAQEAwEBAQEBwgFTeXN0ZW0uQ29sbGVjdGlvbnMuR2VuZXJpYy5MaXN0YDFbW1JTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXcIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0NPVJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guQXV0aG9yQ3JpdGVyaWECAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlAgAAAMIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0CAAAABgMAAAADQWxsCgYEAAAAA0FsbAkFAAAACQYAAAAAAAAAAAAAAAkHAAAACQgAAAAJCQAAAAoKCgoKBAUAAADCAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWUsIFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cywgVmVyc2lvbj0yMDE4LjAuNTQ5LjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49bnVsbF1dAwAAAAZfaXRlbXMFX3NpemUIX3ZlcnNpb24EAAA6UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWVbXQIAAAAICAkKAAAABAAAAAQAAAABBgAAAAUAAAAJCwAAAAAAAAAAAAAABQcAAAA9UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JDcml0ZXJpYQIAAAAgPEJvb2xlYW5PcGVyYXRvcj5rX19CYWNraW5nRmllbGQYPEF1dGhvcnM + a19fQmFja2luZ0ZpZWxkAQPDAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JJbmZvLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXQIAAAAKCgUIAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlBQAAAB88SXNTZWxlY3RlZERhdGU + a19fQmFja2luZ0ZpZWxkGTxEYXRlVHlwZT5rX19CYWNraW5nRmllbGQbPFdpdGhJbkxhc3Q + a19fQmFja2luZ0ZpZWxkGjxEYXRlUmFuZ2U + a19fQmFja2luZ0ZpZWxkHDxEaXNwbGF5RGF0ZT5rX19CYWNraW5nRmllbGQAAQQEAQE5UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5XaXRoSW5MYXN0AgAAADhSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLkRhdGVSYW5nZQIAAAACAAAAAAoKCgoBCQAAAAUAAAAJCwAAAAAAAAAAAAAABwoAAAAAAQAAAAQAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAkMAAAACQ0AAAAJDgAAAAkPAAAABwsAAAAAAQAAAAAAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAUMAAAAOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlBAAAABU8TmFtZT5rX19CYWNraW5nRmllbGQcPERpc3BsYXlOYW1lPmtfX0JhY2tpbmdGaWVsZBY8VmFsdWU + a19fQmFja2luZ0ZpZWxkIDxCb29sZWFuT3BlcmF0b3I + 
a19fQmFja2luZ0ZpZWxkAQEBAQIAAAAGEAAAAAhmcmVldGV4dAoGEQAAAG9kZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtIEFORCBDdSwgT1IgY29wcGVyLCBPUiBlbGVjdHJvbGVzcywgT1IgcHJpbnRpbmcsIE9SIGZsZXhpYmxlLCBPUiBzdWJzdHJhdGUsIE9SIHBsYXN0aWMKAQ0AAAAMAAAABhIAAAAHQWxsVGV4dAoGEwAAABlkZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtCgEOAAAADAAAAAYUAAAAC0F0bGVhc3RUZXh0CgYVAAAAP0N1LCBjb3BwZXIsIGVsZWN0cm9sZXNzLCBwcmludGluZywgZmxleGlibGUsIHN1YnN0cmF0ZSwgcGxhc3RpYwoBDwAAAAwAAAAGFgAAABBPcmlnaW5hbEZyZWVUZXh0CgYXAAAAb2RlcG9zaXRpb24sIHBhdHRlcm4sIGZpbG0gQU5EIEN1LCBPUiBjb3BwZXIsIE9SIGVsZWN0cm9sZXNzLCBPUiBwcmludGluZywgT1IgZmxleGlibGUsIE9SIHN1YnN0cmF0ZSwgT1IgcGxhc3RpYwoL', 127 | 'resultcount': '282607', 128 | 'category': 'all', 129 | 'pageno': str(num) 130 | } 131 | 132 | html = requests.post(url, data=data, headers=headers1, stream=True, timeout=20, verify=True) 133 | html.encoding = 'utf-8' 134 | text = html.text 135 | # print(text) 136 | bsop = BeautifulSoup(text, 'html.parser') 137 | divs = bsop.findAll('div', {'class': 'capsule capsule--article '}) 138 | for i in divs: 139 | article_url = 'https://pubs.rsc.org' + i.find('a').attrs['href'] 140 | # print(article_url) 141 | article_urls.append(article_url) 142 | print("第" + str(num) + "页爬取完毕") 143 | time.sleep(1) 144 | return article_urls 145 | 146 | 147 | # 构造headers 148 | UserAgent_List = [ 149 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 150 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 151 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 152 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 153 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 154 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 155 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 156 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 157 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 158 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 159 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 160 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 161 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 162 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 163 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 164 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 165 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 166 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 167 | ] 168 | headers = {'User-Agent': random.choice(UserAgent_List), 169 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 170 | 'Accept-Encoding': 'gzip', 171 | } 172 | nums = 5 
# 爬取的页数 173 | 174 | article_urls = crawl_article_url(nums) 175 | royal(article_urls) 176 | 177 | 178 | 179 | # url = 'https://pubs.rsc.org/en/content/articlelanding/2017/tc/c7tc00038c#!divAbstract' 180 | # royal(url) 181 | -------------------------------------------------------------------------------- /download-citation/ros.html: -------------------------------------------------------------------------------- 1 |
[download-citation/ros.html — saved copy of an RSC Publishing (pubs.rsc.org) search-results page: "282607 items - Showing page 1 of 11305". Only stripped page text survives here: the previous/next pager widgets and a list of result entries, each originally a <div class="capsule capsule--article "> block carrying the article type (Review Article, Paper, Critical Review, ...), the article title (e.g. "Inkjet printing metals on flexible materials for plastic and paper electronics"), a graphical-abstract thumbnail, journal details and themed-collection links. pa.py and pa1.py select exactly these capsule blocks to collect article URLs; the commented-out code in pa.py shows how this file was written out for inspection.]
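For reference only (not a file in the repository): a saved results page like the one above can be parsed offline with the same BeautifulSoup selector that pa.py and pa1.py apply to the live site. This is a minimal sketch; the local filename `ros.html`, the function name `parse_saved_results`, and printing the URLs are assumptions for illustration.

```python
# -*- coding: utf-8 -*-
# Hypothetical helper: parse a locally saved RSC results page (e.g. ros.html)
# using the same capsule selector that pa.py / pa1.py use against the live site.
from bs4 import BeautifulSoup


def parse_saved_results(path='ros.html'):
    with open(path, encoding='utf-8') as f:
        bsop = BeautifulSoup(f.read(), 'html.parser')
    article_urls = []
    # each search hit sits in a <div class="capsule capsule--article "> block
    for div in bsop.findAll('div', {'class': 'capsule capsule--article '}):
        a = div.find('a')
        if a and a.get('href'):
            article_urls.append('https://pubs.rsc.org' + a['href'])
    return article_urls


if __name__ == '__main__':
    for url in parse_saved_results():
        print(url)
```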
1516 | -------------------------------------------------------------------------------- /download-citation/springer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | import random 5 | from bs4 import BeautifulSoup 6 | import time 7 | 8 | download_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) 9 | 10 | 11 | def royal(article_urls): 12 | for article_url in article_urls: 13 | # try: 14 | html = requests.get(article_url, headers=headers, stream=True, timeout=20, verify=True) 15 | html.encoding = 'utf-8' 16 | text = html.text 17 | bsop = BeautifulSoup(text, 'html.parser') 18 | try: 19 | timeofissued = bsop.find('meta', {'name':'citation_cover_date'}).attrs['content'].split('/')[0] 20 | except: 21 | pass 22 | try: 23 | citation_title = bsop.find('meta', {'name':'citation_title'}).attrs['content'] 24 | except: 25 | pass 26 | try: 27 | citation_journal_title = bsop.find('meta', {'name':'citation_journal_title'}).attrs['content'] 28 | except: 29 | pass 30 | try: 31 | citation_journal_abbrev = bsop.find('meta', {'name':'citation_journal_abbrev'}).attrs['content'] 32 | except: 33 | pass 34 | try: 35 | citation_volume = bsop.find('meta', {'name':'citation_volume'}).attrs['content'] 36 | except: 37 | pass 38 | try: 39 | # citation_issue = bsop.find('meta', {'name':'citation_issue'}).attrs['content'] 40 | citation_issue = bsop.find('span', {'id':'electronic-issn'}).text 41 | except: 42 | pass 43 | try: 44 | citation_firstpage = bsop.find('meta', {'name':'citation_firstpage'}).attrs['content'] 45 | except: 46 | pass 47 | try: 48 | citation_lastpage = bsop.find('meta', {'name':'citation_lastpage'}).attrs['content'] 49 | except: 50 | pass 51 | try: 52 | citation_doi = bsop.find('meta', {'name':'citation_doi'}).attrs['content'] 53 | except: 54 | pass 55 | try: 56 | PB = bsop.find('meta', {'name':'citation_publisher'}).attrs['content'] 57 | except: 58 | pass 59 | try: 60 | M3 = citation_doi 61 | except: 62 | pass 63 | try: 64 | citation_url = 'http://dx.doi.org/' + citation_doi 65 | except: 66 | pass 67 | try: 68 | # citation_abstract = bsop.find('p', {'id':'Par1'}).attrs['content'].strip() 69 | citation_abstract = bsop.find('p', {'id':'Par1'}).text 70 | except: 71 | pass 72 | try: 73 | # SN = bsop.find('div', {'class':'article-nav__issue autopad--h'}).find('a').attrs['href'].split('=')[-1] 74 | SN = bsop.find('span', {'id':'electronic-issn'}).text 75 | except: 76 | pass 77 | # except: 78 | # print(article_url) 79 | # continue 80 | 81 | with open(download_time + ".ris", 'a', encoding='utf-8') as f: 82 | f.write('TY - JOUR\n') 83 | f.write('T1 - ' + citation_title + '\n') 84 | f.write('Y1 - ' + timeofissued + '\n') 85 | f.write('SP - ' + citation_firstpage + '\n') 86 | f.write('EP - ' + citation_lastpage + '\n') 87 | f.write('JF - ' + citation_journal_title + '\n') 88 | f.write('JO - ' + citation_journal_abbrev + '\n') 89 | f.write('VL - ' + citation_volume + '\n') 90 | f.write('RS - ' + citation_issue + '\n') 91 | f.write('PB - ' + PB + '\n') 92 | f.write('SN - ' + SN + '\n') 93 | f.write('DO - ' + citation_doi + '\n') 94 | f.write('M3 - ' + M3 + '\n') 95 | f.write('UR - ' + citation_url + '\n') 96 | print(citation_url) 97 | f.write('N2 - ' + citation_abstract + '\n') 98 | # print(citation_abstract) 99 | 100 | authors = bsop.findAll('meta', {'name': 'citation_author'}) 101 | for author in authors: 102 | # print(author) 103 | author = author.attrs['content'].split(" ") 104 | # print(author) 105 | author = 
author[-1] + ', ' + ' '.join(author[:-1]) 106 | f.write('A1 - ' + author + '\n') 107 | f.write('ER - ' + '\n\n\n') 108 | f.close() 109 | time.sleep(1) 110 | 111 | 112 | def crawl_article_url(nums): 113 | article_urls = [] 114 | for num in range(1, nums+1): 115 | 116 | url = 'https://link.springer.com/search/page/' + str(num) + '?date-facet-mode=between&facet-start-year=2010&facet-language=%22En%22&query=printing%2C+AND+Cu+AND+pattern%2C+AND+film%2C+AND+flexible%2C+AND+plastic%2C+AND+substrate%2C+AND+copper&facet-end-year=2019&showAll=true&facet-content-type=%22Article%22' 117 | 118 | html = requests.get(url, headers=headers, stream=True, timeout=20, verify=True) 119 | html.encoding = 'utf-8' 120 | text = html.text 121 | # print(text) 122 | bsop = BeautifulSoup(text, 'html.parser') 123 | divs = bsop.find('ol', {'id': 'results-list'}).findAll('li') 124 | for i in divs: 125 | # print(i) 126 | article_url = 'https://link.springer.com' + i.find('h2').find('a').attrs['href'] 127 | print(article_url) 128 | article_urls.append(article_url) 129 | print("第" + str(num) + "页爬取完毕") 130 | time.sleep(1) 131 | return article_urls 132 | 133 | 134 | # 构造headers 135 | UserAgent_List = [ 136 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 137 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 138 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 139 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 140 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 141 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 142 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 143 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 144 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 145 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 146 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 147 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 148 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 149 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 150 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 151 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 152 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 153 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 154 | ] 155 | headers = {'User-Agent': random.choice(UserAgent_List), 156 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 157 | 'Accept-Encoding': 'gzip', 158 | } 159 | nums = 1 # 爬取的页数 160 | 161 | article_urls = crawl_article_url(nums) 162 | royal(article_urls) 
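springer.py (like pa1.py above it) repeats a long chain of `try`/`except` blocks and `f.write` calls to pull `citation_*` meta tags and emit RIS fields. A compact sketch of the same idea is shown below — this is not the repository's code; the helper names `meta` and `to_ris` are made up for illustration, and only a subset of the RIS tags written above is included.

```python
# -*- coding: utf-8 -*-
# Hypothetical refactoring sketch: collapse the repeated meta-tag lookups in
# pa1.py / springer.py into one helper plus a RIS-record formatter.
from bs4 import BeautifulSoup


def meta(bsop, name, default=''):
    # return the content of a <meta name="..."> tag, or a default if it is absent
    tag = bsop.find('meta', {'name': name})
    return tag.attrs['content'] if tag else default


def to_ris(bsop):
    doi = meta(bsop, 'citation_doi')
    fields = [
        ('TY', 'JOUR'),
        ('T1', meta(bsop, 'citation_title')),
        ('Y1', meta(bsop, 'citation_cover_date').split('/')[0]),
        ('SP', meta(bsop, 'citation_firstpage')),
        ('EP', meta(bsop, 'citation_lastpage')),
        ('JF', meta(bsop, 'citation_journal_title')),
        ('VL', meta(bsop, 'citation_volume')),
        ('PB', meta(bsop, 'citation_publisher')),
        ('DO', doi),
        ('UR', 'http://dx.doi.org/' + doi),
    ]
    # one A1 line per citation_author meta tag, surname first
    for tag in bsop.findAll('meta', {'name': 'citation_author'}):
        parts = tag.attrs['content'].split(' ')
        fields.append(('A1', parts[-1] + ', ' + ' '.join(parts[:-1])))
    fields.append(('ER', ''))
    return '\n'.join(k + ' - ' + v for k, v in fields) + '\n'
```

A caller could then write `to_ris(bsop)` to the output file in append mode for each article page, as the existing scripts do with their sequence of `f.write` calls.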
-------------------------------------------------------------------------------- /download_biao_qing_win.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ''' 4 | 在以下环境测试通过: 5 | python 2.7.15或者3.7.0 6 | win10或者lubuntu 7 | ''' 8 | 9 | # 导入模块 10 | import time 11 | import requests, re, random, os 12 | from bs4 import BeautifulSoup 13 | 14 | ''' 15 | 给定页数,爬取每页所有图片的url,通过此url可以打开图片所在的网页 16 | 所有url存在一个列表中 17 | ''' 18 | def scrapy_img_urls(nums): 19 | lss = [] 20 | for num in range(1, nums+1): 21 | url = 'http://www.doutula.com/photo/list/?page=' + str(num) 22 | html = requests.get(url, headers=headers) 23 | html.encoding = 'utf-8' 24 | 25 | text = html.text 26 | bsop = BeautifulSoup(text, 'html.parser') 27 | ass = bsop.find('div', {'class': 'page-content'}).find('div').findAll('a') 28 | 29 | for a in ass: 30 | # print(a.attrs['href']) 31 | lss.append(a.attrs['href']) 32 | time.sleep(1) 33 | return lss 34 | 35 | ''' 36 | 接收每个图片的url,打开此url,找到图片真实的地址,通过此地址可以下载图片 37 | 找到图片真实的url和名字之后调用download_url函数可以下载图片 38 | ''' 39 | def download_img_url(url): 40 | html = requests.get(url, headers=headers) 41 | html.encoding = 'utf-8' 42 | 43 | text = html.text 44 | bsop = BeautifulSoup(text, 'html.parser') 45 | img = bsop.find('div', {'class': 'col-xs-12 col-sm-12 artile_des'}) 46 | img_url = img.find('img').attrs['src'] 47 | img_title = img.find('img').attrs['alt'] 48 | print(img_url + " " + img_title) 49 | 50 | download_img(img_url, img_title) 51 | 52 | ''' 53 | 下载图片,该函数接收两个参数,一个是图片的真实地址,一个是图片的名字 54 | 名字中如果有特殊字符则需要处理,不然windows下可能无法保存,处理名字调用format_name函数 55 | 打开指定文件夹保存图片,如果没有则创建。 56 | ''' 57 | def download_img(img_url, img_title): 58 | img_title = format_name(img_title) # 如果图册名字有特殊字符需要处理。不然在windows下保存不了文件夹 59 | if not os.path.exists(file_path): 60 | os.makedirs(file_path) 61 | os.chdir(file_path) 62 | 63 | # 图片保存到本地 64 | exists = os.path.exists(img_title) 65 | if not exists: 66 | img_html = requests.get(img_url, headers=headers, stream=True, timeout=20, verify=True) 67 | img_html.encoding = 'utf-8' 68 | with open(img_title + ".gif", 'wb') as f: 69 | f.write(img_html.content) 70 | f.close() 71 | 72 | 73 | def format_name(img_title): 74 | ''' 75 | 对名字进行处理,如果包含下属字符,则直接剔除该字符 76 | :param img_title: 77 | :return: 78 | ''' 79 | for i in ['\\','/',':','*','?','"','<','>','!','|']: 80 | while i in img_title: 81 | img_title = img_title.strip().replace(i, '') 82 | return img_title 83 | 84 | # 构造headers 85 | UserAgent_List = [ 86 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 87 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 88 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 89 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 90 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 91 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 92 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 93 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 94 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 95 | "Mozilla/5.0 (Macintosh; Intel Mac 
OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 96 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 97 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 98 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 99 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 100 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 101 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 102 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 103 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 104 | ] 105 | headers = {'User-Agent': random.choice(UserAgent_List), 106 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 107 | 'Accept-Encoding': 'gzip', 108 | } 109 | 110 | nums=5 111 | # 图片存储路径,在linux系统下 112 | # file_path = '/home/zhangyb/downloadfiles/pythonpro/biaoqing' 113 | # 图片存储路径,在windows系统下 114 | 115 | file_path = 'E:\downloadfiles\pythonpro\biaoqing' 116 | urls = scrapy_img_urls(nums) 117 | for i in urls: 118 | print(i) 119 | download_img_url(i) 120 | 121 | 122 | # download_img_url('http://www.doutula.com/photo/6437987') 123 | # download_img('https://ws1.sinaimg.cn/large/9150e4e5gy1fx94eo4pdwg203q02g0so.gif', u'好想打死你啊') -------------------------------------------------------------------------------- /huaban.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ''' 4 | python 2.7.12 5 | ''' 6 | 7 | import requests 8 | from parsel import Selector 9 | import time 10 | import re, random, os 11 | 12 | 13 | def scraw_pin_ids(): 14 | 15 | pin_ids = [] 16 | pin_id = '1068018182' 17 | 18 | flag = True 19 | while flag: 20 | try: 21 | url = "http://huaban.com/favorite/beauty/" 22 | headers1 = { 23 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36', 24 | 'Accept':'application/json', 25 | 'X-Request':'JSON', 26 | 'X-Requested-With':'XMLHttpRequest', 27 | } 28 | 29 | params = { 30 | 'j0l4lymf':'', 31 | 'max':pin_id, 32 | 'limit':'20', 33 | 'wfl':'1', 34 | } 35 | 36 | z1 = requests.get(url, params=params, headers=headers1) 37 | 38 | if z1.json()['pins']: 39 | for i in z1.json()['pins']: 40 | pin_ids.append(i['pin_id']) 41 | pin_id = pin_ids[-1] 42 | print i['pin_id'] 43 | # with open("pin_ids.txt",'ab') as f: 44 | # f.write(str(i['pin_id'])+"\n") 45 | # f.close() 46 | time.sleep(0.001) 47 | else: 48 | flag = False 49 | return set(pin_ids) 50 | except: 51 | continue 52 | 53 | def scraw_urls(pin_ids): 54 | 55 | urls = [] 56 | 57 | urlss = ['http://huaban.com/pins/' + str(i) +'/' for i in pin_ids] 58 | for url in urlss: 59 | try: 60 | headers = { 61 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 62 | } 63 | 64 | z3 = requests.get(url, headers=headers) 65 | 66 | text = z3.text 67 | 68 | pattern = re.compile('"key":"(.*?)"', re.S) 69 | items = re.findall(pattern, text) 70 | 71 | urls.extend(items) 72 | print items 73 | print 
'============================================================================================================' 74 | except: 75 | continue 76 | return set(urls) 77 | 78 | def download(urls): 79 | headers1 = { 80 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 81 | } 82 | n = 1 83 | urls = set(urls) 84 | for url in urls: 85 | try: 86 | if not os.path.exists(os.path.join(file_path, "huaban")): 87 | os.makedirs(os.path.join(file_path, "huaban")) 88 | os.chdir(file_path + '\\' + "huaban") 89 | try: 90 | url = 'http://img.hb.aicdn.com/' + url 91 | r = requests.get(url, headers=headers1) 92 | if len(r.content)>40000: 93 | with open(str(n)+".jpg", 'wb') as f: 94 | f.write(r.content) 95 | f.close() 96 | print u"第" + str(n) + u"张图片下载成功" 97 | n+=1 98 | # time.sleep(3) 99 | except: 100 | continue 101 | except: 102 | continue 103 | 104 | # 图片存储路径 105 | file_path = 'E:\selfprogress\programming\project\pa1024\huabannnnnnn' 106 | pin_ids = scraw_pin_ids() 107 | urls = scraw_urls(pin_ids) 108 | download(urls) 109 | -------------------------------------------------------------------------------- /ip_pachong.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ''' 4 | python 3.7.0 5 | ''' 6 | 7 | # 导入模块 8 | import time 9 | import requests, re, random, os 10 | from bs4 import BeautifulSoup 11 | 12 | def ip_test(ip, url_for_test='https://www.baidu.com', set_timeout=10): 13 | ''' 14 | 检测爬取到的ip地址可否使用,能使用返回True,否则返回False,默认去访问百度测试代理 15 | :param ip: 16 | :param url_for_test: 17 | :param set_timeout: 18 | :return: 19 | ''' 20 | try: 21 | r = requests.get(url_for_test, headers=headers, proxies={'http': ip[0]+':'+ip[1]}, timeout=set_timeout) 22 | if r.status_code == 200: 23 | return True 24 | else: 25 | return False 26 | except: 27 | return False 28 | 29 | def scrawl_ip(url, num, url_for_test='https://www.baidu.com'): 30 | ''' 31 | 爬取代理ip地址,代理的url是西祠代理 32 | :param url: 33 | :param num: 34 | :param url_for_test: 35 | :return: 36 | ''' 37 | ip_list = [] 38 | for num_page in range(1, num+1): 39 | url = url + str(num_page) 40 | 41 | response = requests.get(url, headers=headers) 42 | response.encoding = 'utf-8' 43 | content = response.text 44 | 45 | pattern = re.compile('.*?alt="Cn" />.*?.*?(.*?).*?(.*?)', re.S) 46 | items = re.findall(pattern, content) 47 | for ip in items: 48 | if ip_test(ip[1], url_for_test): # 测试爬取到ip是否可用,测试通过则加入ip_list列表之中 49 | print('测试通过,IP地址为' + str(ip[0]) + ':' + str(ip[1])) 50 | ip_list.append(ip[0]+':'+ip[1]) 51 | return ip_list 52 | 53 | time.sleep(5) # 等待5秒爬取下一页 54 | 55 | def get_random_ip(): # 随机获取一个IP 56 | ind = random.randint(0, len(total_ip)-1) 57 | return total_ip[ind] 58 | 59 | 60 | # 爬取代理的url地址,选择的是西祠代理 61 | url_ip = "http://www.xicidaili.com/nt/" 62 | 63 | # 设定等待时间 64 | set_timeout = 10 65 | 66 | # 爬取代理的页数,2表示爬取2页的ip地址 67 | num = 2 68 | 69 | # 代理的使用次数 70 | count_time = 5 71 | 72 | # 构造headers 73 | UserAgent_List = [ 74 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 75 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 76 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 77 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 78 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like 
Gecko) Chrome/19.77.34.5 Safari/537.1", 79 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 80 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 81 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 82 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 83 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 84 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 85 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 86 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 87 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 88 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 89 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 90 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 91 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 92 | ] 93 | 94 | headers = {'User-Agent': random.choice(UserAgent_List), 95 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 96 | 'Accept-Encoding': 'gzip', 97 | } 98 | 99 | 100 | # 爬取IP代理 101 | total_ip = scrawl_ip(url_ip, num) 102 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /login.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/login.py -------------------------------------------------------------------------------- /login2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/login2.py -------------------------------------------------------------------------------- /meizitu3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ''' 4 | python 3.5.2 5 | ''' 6 | 7 | # 导入模块 8 | import time 9 | import requests, re, random, os 10 | from bs4 import BeautifulSoup 11 | 12 | def ip_test(ip, url_for_test='https://www.baidu.com', set_timeout=10): 13 | ''' 14 | 检测爬取到的ip地址可否使用,能使用返回True,否则返回False,默认去访问百度测试代理 15 | :param ip: 16 | :param url_for_test: 17 | :param set_timeout: 18 | :return: 19 | ''' 20 | try: 21 | r = requests.get(url_for_test, headers=headers, proxies={'http': ip[0]+':'+ip[1]}, timeout=set_timeout) 22 | if r.status_code == 200: 23 | return True 24 | else: 25 | return False 26 | except: 27 | return False 28 | 29 | def scrawl_ip(url, num, url_for_test='https://www.baidu.com'): 30 | ''' 31 | 爬取代理ip地址,代理的url是西祠代理 32 | :param url: 33 | :param num: 34 | :param url_for_test: 35 | :return: 36 | ''' 37 | ip_list = [] 38 | for num_page in range(1, num+1): 39 | url = url + str(num_page) 40 | 41 | response = requests.get(url, headers=headers) 42 | response.encoding = 'utf-8' 43 | content = response.text 44 | 
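# Note on the pattern below: it captures two groups from each row of the xici
# proxy table -- group 1 is the IP address and group 2 is the port -- which are
# later joined as ip[0] + ':' + ip[1]. Also note that ip_test() is called with
# ip[1] alone (the port string), so the value actually used as a proxy during
# the test is not the full ip:port pair.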
45 | pattern = re.compile('.*?alt="Cn" />.*?.*?(.*?).*?(.*?)', re.S) 46 | items = re.findall(pattern, content) 47 | for ip in items: 48 | if ip_test(ip[1], url_for_test): # 测试爬取到ip是否可用,测试通过则加入ip_list列表之中 49 | print('测试通过,IP地址为' + str(ip[0]) + ':' + str(ip[1])) 50 | ip_list.append(ip[0]+':'+ip[1]) 51 | return ip_list 52 | 53 | time.sleep(5) # 等待5秒爬取下一页 54 | 55 | def get_random_ip(): # 随机获取一个IP 56 | ind = random.randint(0, len(total_ip)-1) 57 | return total_ip[ind] 58 | 59 | def download_img(img_list, img_title): 60 | ''' 61 | 通过scrawl_url函数获得了单个图册里面所有图片的url列表和图册的名字,就可以下载图片了 62 | 此函数的作用下载单个图册里面的所有图片 63 | 接收参数img_list是单个图册里面所有图片的的url, 64 | 如['http://mm.howkuai.com/wp-content/uploads/2017a/02/07/01.jpg', 65 | 'http://mm.howkuai.com/wp-content/uploads/2017a/02/07/02.jpg',...] 66 | img_title是单个图册的名字,如’香车美女,最完美的黄金搭档‘ 67 | :param img_list: 68 | :param img_title: 69 | :return: 70 | ''' 71 | 72 | img_title = format_name(img_title) # 如果图册名字有特殊字符需要处理。不然在windows下保存不了文件夹 73 | for img_urls in img_list: 74 | img_url = img_urls.attrs['src'] # 单个图片的url地址 75 | print(img_url) 76 | title = img_urls.attrs['alt'] # 单个图片的名字 77 | print(title) 78 | 79 | try: 80 | if not os.path.exists(os.path.join(file_path, img_title)): 81 | os.makedirs(os.path.join(file_path, img_title)) 82 | os.chdir(file_path + '\\' + img_title) 83 | 84 | # 图片保存到本地 85 | exists = os.path.exists(img_title) 86 | if not exists: 87 | try: 88 | img_html = requests.get(img_url, headers=headers, stream=True, timeout=20, verify=True) 89 | with open(title+".jpg", 'wb') as f: 90 | f.write(img_html.content) 91 | f.close() 92 | except: 93 | continue 94 | except: 95 | continue 96 | 97 | def scrawl_list(url_list, proxy_flag=False, try_time=0): 98 | ''' 99 | 此函数的作用是爬取每一页面所有图册的url,一个页面包含10个图册,所有调用一次函数则返回一个包含10个url的列表 100 | 格式如['http://www.meizitu.com/a/list_1_1.html',...] 
101 | :param url_list: 102 | :param proxy_flag: 103 | :param try_time: 104 | :return: 105 | ''' 106 | if not proxy_flag: # 不使用代理 107 | try: 108 | html = requests.get(url_list, headers=headers, timeout=10) 109 | html.encoding = 'gb2312' 110 | text = html.text 111 | 112 | bsop = BeautifulSoup(text, 'html.parser') 113 | 114 | url_imgs = [] 115 | li_list = bsop.find('ul', {'class': 'wp-list clearfix'}).findAll('li', {'class':'wp-item'}) 116 | for i in li_list: 117 | url_img = i.find('h3',{'class':'tit'}).find('a').attrs['href'] 118 | url_imgs.append(url_img) 119 | return url_imgs 120 | except: 121 | return scrawl_list(url_list, proxy_flag=True) # 否则调用自己,使用3次IP代理 122 | else: # 使用代理时 123 | if try_time','!','|']: 231 | while i in img_title: 232 | img_title = img_title.strip().replace(i, '') 233 | return img_title 234 | 235 | def get_total_pages(first_url): 236 | ''' 237 | 获取妹子图所有页面 238 | :param first_url: 239 | :return: 240 | ''' 241 | html = requests.get(first_url, headers=headers, timeout=10) 242 | html.encoding = 'gb2312' 243 | text = html.text 244 | bsop = BeautifulSoup(text, 'html.parser') 245 | lis =bsop.find('div',{'id':'wp_page_numbers'}).find('ul').findAll('li') 246 | pages = lis[-1].find('a').attrs['href'].split('.')[0].split('_')[-1] 247 | pages = int(pages) 248 | return pages 249 | 250 | 251 | # 妹子图的首页,用来获取总的页数 252 | first_url = 'http://www.meizitu.com/a/list_1_1.html' 253 | 254 | # 爬取代理的url地址,选择的是西祠代理 255 | url_ip = "http://www.xicidaili.com/nt/" 256 | 257 | # 设定等待时间 258 | set_timeout = 10 259 | 260 | # 爬取代理的页数,2表示爬取2页的ip地址 261 | num = 2 262 | 263 | # 代理的使用次数 264 | count_time = 5 265 | 266 | # 构造headers 267 | UserAgent_List = [ 268 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 269 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 270 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 271 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 272 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 273 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 274 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 275 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 276 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 277 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 278 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 279 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 280 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 281 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 282 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 283 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 284 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) 
Chrome/19.0.1055.1 Safari/535.24", 285 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 286 | ] 287 | headers = {'User-Agent': random.choice(UserAgent_List), 288 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 289 | 'Accept-Encoding': 'gzip', 290 | } 291 | 292 | # 图片存储路径 293 | file_path = 'E:\selfprogress\programming\project\meizitu' 294 | 295 | # 获取总页数 296 | pages = get_total_pages(first_url) 297 | 298 | # 爬取IP代理 299 | total_ip = scrawl_ip(url_ip, num) 300 | 301 | # 带爬取的url 302 | url_imgss = download_urls(pages) 303 | 304 | for i in url_imgss: 305 | for j in i: 306 | try: 307 | with open('url.txt','a') as f: 308 | f.write(j+"\n") 309 | f.close() 310 | print("写入url.txt文件成功") 311 | except: 312 | print("写入url.txt文件失败") 313 | 314 | for url_imgs in url_imgss: 315 | for url_img in url_imgs: 316 | img_list, img_title = scrawl_url(url_img) 317 | if not img_list: 318 | continue 319 | download_img(img_list, img_title) 320 | 321 | time.sleep(5) 322 | 323 | 324 | 325 | 326 | 327 | -------------------------------------------------------------------------------- /meizitu_pro.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # 导入模块 4 | import time 5 | import requests, re, random, os 6 | from bs4 import BeautifulSoup 7 | 8 | def ip_test(ip, url_for_test='https://www.baidu.com', set_timeout=30): 9 | try: 10 | r = requests.get(url_for_test, headers=headers, proxies={'http': ip[0]+':'+ip[1]}, timeout=set_timeout) 11 | if r.status_code == 200: 12 | return True 13 | else: 14 | return False 15 | except: 16 | return False 17 | 18 | def scrawl_ip(url, num, url_for_test='https://www.baidu.com'): 19 | ip_list = [] 20 | for num_page in range(1, num): 21 | url = url + str(num_page) 22 | 23 | response = requests.get(url, headers=headers) 24 | response.encoding = 'utf-8' 25 | content = response.text 26 | 27 | pattern = re.compile('.*?alt="Cn" />.*?.*?(.*?).*?(.*?)', re.S) 28 | items = re.findall(pattern, content) 29 | for ip in items: 30 | if ip_test(ip[1], url_for_test): # 测试爬取到ip是否可用,测试通过则加入ip_list列表之中 31 | print('测试通过,IP地址为' + str(ip[0]) + ':' + str(ip[1])) 32 | ip_list.append(ip[0]+':'+ip[1]) 33 | return ip_list 34 | 35 | time.sleep(10) # 等待10秒爬取下一页 36 | 37 | def get_random_ip(): # 随机获取一个IP 38 | ind = random.randint(0, len(total_ip)-1) 39 | # print(total_ip[ind]) 40 | return total_ip[ind] 41 | 42 | 43 | def download_img(img_list): 44 | img_title = img_list[0].attrs['alt'] 45 | for img_url in img_list: 46 | img_url = img_url.attrs['src'] 47 | title = img_url.split('/')[-1] 48 | 49 | if not os.path.exists(os.path.join(file_path, img_title)): 50 | os.makedirs(os.path.join(file_path, img_title)) 51 | os.chdir(file_path + '\\' + img_title) 52 | 53 | # 图片保存到本地 54 | exists = os.path.exists( title) 55 | if not exists: 56 | img_html = requests.get(img_url, headers=headers, stream=True, timeout=30, verify=True) 57 | with open(title, 'wb') as f: 58 | f.write(img_html.content) 59 | f.close() 60 | 61 | def scrawl_url(url, proxy_flag=False, try_time=0): 62 | if not proxy_flag: # 不使用代理 63 | try: 64 | html = requests.get(url, headers=headers, timeout=30) 65 | html.encoding = 'gb2312' 66 | 67 | text = html.text 68 | code = html.status_code 69 | print(code) 70 | bsop = BeautifulSoup(text, 'html.parser') 71 | img_list = bsop.find('div', {'class': 'postContent'}).find('p').findAll('img') 72 | 73 | return img_list 74 | 75 | except: 76 | return scrawl_url(url, 
proxy_flag=True) # 否则调用自己,使用3次IP代理 77 | else: # 使用代理时 78 | if try_time 2 | 3 | 4 | 5 | $Title$ 6 | 7 | 8 | $END$ 9 | 10 | -------------------------------------------------------------------------------- /my_blog/templates/archives.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block content %} 4 |
5 | {% for post in post_list %} 6 | 7 | 8 | {{ post.title }} 9 | 10 | 13 | 14 | 15 | {% endfor %} 16 |
17 | {% endblock %} -------------------------------------------------------------------------------- /my_blog/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | $Title$ 6 | 7 | 8 | $END$ 9 | 10 | -------------------------------------------------------------------------------- /my_blog/templates/home.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | $Title$ 6 | 7 | 8 | $END$ 9 | 10 | -------------------------------------------------------------------------------- /my_blog/templates/post.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | $Title$ 6 | 7 | 8 | $END$ 9 | 10 | -------------------------------------------------------------------------------- /my_blog/templates/tag.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | $Title$ 6 | 7 | 8 | $END$ 9 | 10 | -------------------------------------------------------------------------------- /my_blog/templates/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | $Title$ 6 | 7 | 8 | $END$ 9 | 10 | -------------------------------------------------------------------------------- /paqubiaoqing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ''' 4 | 在以下环境测试通过: 5 | python 2.7.15或者3.7.0 6 | win10或者lubuntu 7 | ''' 8 | 9 | # 导入模块 10 | import time 11 | import requests, re, random, os 12 | from bs4 import BeautifulSoup 13 | 14 | ''' 15 | 给定页数,爬取每页所有图片的url,通过此url可以打开图片所在的网页 16 | 所有url存在一个列表中 17 | ''' 18 | def scrapy_img_urls(nums): 19 | lss = [] 20 | for num in range(1, nums+1): 21 | url = 'http://www.doutula.com/photo/list/?page=' + str(num) 22 | html = requests.get(url, headers=headers) 23 | html.encoding = 'utf-8' 24 | 25 | text = html.text 26 | bsop = BeautifulSoup(text, 'html.parser') 27 | ass = bsop.find('div', {'class': 'page-content'}).find('div').findAll('a') 28 | 29 | for a in ass: 30 | # print(a.attrs['href']) 31 | lss.append(a.attrs['href']) 32 | time.sleep(1) 33 | return lss 34 | 35 | ''' 36 | 接收每个图片的url,打开此url,找到图片真实的地址,通过此地址可以下载图片 37 | 找到图片真实的url和名字之后调用download_url函数可以下载图片 38 | ''' 39 | def download_img_url(url): 40 | html = requests.get(url, headers=headers) 41 | html.encoding = 'utf-8' 42 | 43 | text = html.text 44 | bsop = BeautifulSoup(text, 'html.parser') 45 | img = bsop.find('div', {'class': 'col-xs-12 col-sm-12 artile_des'}) 46 | img_url = img.find('img').attrs['src'] 47 | img_title = img.find('img').attrs['alt'] 48 | print(img_url + " " + img_title) 49 | 50 | download_img(img_url, img_title) 51 | 52 | ''' 53 | 下载图片,该函数接收两个参数,一个是图片的真实地址,一个是图片的名字 54 | 名字中如果有特殊字符则需要处理,不然windows下可能无法保存,处理名字调用format_name函数 55 | 打开指定文件夹保存图片,如果没有则创建。 56 | ''' 57 | def download_img(img_url, img_title): 58 | img_title = format_name(img_title) # 如果图册名字有特殊字符需要处理。不然在windows下保存不了文件夹 59 | if not os.path.exists(file_path): 60 | os.makedirs(file_path) 61 | os.chdir(file_path) 62 | 63 | # 图片保存到本地 64 | exists = os.path.exists(img_title) 65 | if not exists: 66 | img_html = requests.get(img_url, headers=headers, stream=True, timeout=20, verify=True) 67 | img_html.encoding = 'utf-8' 68 | with open(img_title + ".gif", 'wb') as f: 69 | f.write(img_html.content) 70 | f.close() 71 | 72 | 73 | def format_name(img_title): 74 | ''' 75 | 对名字进行处理,如果包含下属字符,则直接剔除该字符 76 | :param img_title: 77 | :return: 78 | ''' 79 | 
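# Each str.replace() call already removes every occurrence of the character,
# so the while loop below makes at most one effective pass per character;
# strip() inside the loop also trims surrounding whitespace whenever a
# replacement happens.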
for i in ['\\','/',':','*','?','"','<','>','!','|']: 80 | while i in img_title: 81 | img_title = img_title.strip().replace(i, '') 82 | return img_title 83 | 84 | # 构造headers 85 | UserAgent_List = [ 86 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 87 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 88 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 89 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 90 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 91 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 92 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 93 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 94 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 95 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 96 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 97 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 98 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 99 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 100 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 101 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 102 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 103 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 104 | ] 105 | headers = {'User-Agent': random.choice(UserAgent_List), 106 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 107 | 'Accept-Encoding': 'gzip', 108 | } 109 | 110 | nums=5 111 | # 图片存储路径,在linux系统下 112 | file_path = '/home/zhangyb/downloadfiles/pythonpro/biaoqing' 113 | # 图片存储路径,在windows系统下 114 | # file_path = 'E:\downloadfiles\pythonpro\biaoqing' 115 | urls = scrapy_img_urls(nums) 116 | for i in urls: 117 | print(i) 118 | download_img_url(i) 119 | 120 | 121 | # download_img_url('http://www.doutula.com/photo/6437987') 122 | # download_img('https://ws1.sinaimg.cn/large/9150e4e5gy1fx94eo4pdwg203q02g0so.gif', u'好想打死你啊') -------------------------------------------------------------------------------- /porn/down_video.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/porn/down_video.py -------------------------------------------------------------------------------- /porn/test1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/porn/test1.py 
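A minimal sketch (not part of the repository; save_image and its arguments are illustrative) of how download_biao_qing_win.py / paqubiaoqing.py could take the saved file's extension from the image URL instead of always writing ".gif":

import os
import requests

def save_image(img_url, img_title, folder, headers=None):
    # keep the URL's own suffix (".gif", ".jpg", ...); fall back to ".gif"
    ext = os.path.splitext(img_url.split('?')[0])[1] or '.gif'
    if not os.path.exists(folder):
        os.makedirs(folder)
    target = os.path.join(folder, img_title + ext)
    if not os.path.exists(target):              # skip files already downloaded
        r = requests.get(img_url, headers=headers, stream=True, timeout=20)
        with open(target, 'wb') as f:
            f.write(r.content)

# usage, reusing format_name() from paqubiaoqing.py to clean the title first:
# save_image('https://ws1.sinaimg.cn/large/9150e4e5gy1fx94eo4pdwg203q02g0so.gif',
#            format_name(u'好想打死你啊'), file_path, headers=headers)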
-------------------------------------------------------------------------------- /requests1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import requests 3 | from bs4 import BeautifulSoup 4 | 5 | 6 | url = 'http://tieba.baidu.com/p/4468445702' 7 | html = requests.get(url) 8 | html.encoding = 'utf-8' 9 | 10 | text = html.text 11 | bsop = BeautifulSoup(text,'html.parser') 12 | img_list = bsop.find('div',{'id':'post_content_87286618651'}).findAll('img') 13 | img_src = img_list[0].attrs['src'] 14 | 15 | print(img_src) 16 | img = requests.get(img_src) 17 | with open('a.jpg', 'ab') as f: 18 | f.write(img.content) 19 | f.close() 20 | 21 | 22 | # content = html.content 23 | # print(text) 24 | # print(content) -------------------------------------------------------------------------------- /requests2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import requests 3 | from bs4 import BeautifulSoup 4 | 5 | 6 | url = 'http://tieba.baidu.com/p/4468445702' 7 | html = requests.get(url) 8 | html.encoding = 'utf-8' 9 | 10 | text = html.text 11 | bsop = BeautifulSoup(text,'html.parser') 12 | img_list = bsop.find('div',{'id':'post_content_87286618651'}).findAll('img') 13 | img_src = img_list[0].attrs['src'] 14 | 15 | print(img_src) 16 | img = requests.get(img_src) 17 | with open('a.jpg', 'ab') as f: 18 | f.write(img.content) 19 | f.close() 20 | 21 | 22 | # content = html.content 23 | # print(text) 24 | # print(content) -------------------------------------------------------------------------------- /requests3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/requests3.py -------------------------------------------------------------------------------- /scraping_ajax.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/scraping_ajax.py -------------------------------------------------------------------------------- /selenium/test1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/selenium/test1.py -------------------------------------------------------------------------------- /selenium/test2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/selenium/test2.py -------------------------------------------------------------------------------- /selenium/test3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/selenium/test3.py -------------------------------------------------------------------------------- /selenium/test4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/selenium/test4.py -------------------------------------------------------------------------------- /some/aj.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*_ 2 | 3 | import 
requests 4 | import json 5 | 6 | headers = { 7 | 'Accept': 'application/json', 8 | 'Accept-Encoding': 'gzip, deflate, br', 9 | 'Accept-Language': 'zh-CN,zh;q=0.9', 10 | 'Connection': 'keep-alive', 11 | 'Content-Length': '1919', 12 | 'Content-Type': 'application/json', 13 | 'Cookie': 'bid=FvGxnjrHNYI; gr_user_id=c211a350-d924-429f-9028-afd61661913f; _vwo_uuid_v2=DD2B02C913FD5A4D2EFE19BBBB71F1473|8e6abeedccfd8ccd3b590f121d180376; __utmc=30149280; __utmz=30149280.1545471350.6.6.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); viewed="10756112_5273955_1088168_27069345_26601155_10590856"; _ga=GA1.3.497061328.1543886034; ap_v=0,6.0; __utma=30149280.497061328.1543886034.1545471350.1545887406.7; _gid=GA1.3.452249281.1545887527; _pk_ref.100001.a7dd=%5B%22%22%2C%22%22%2C1545887527%2C%22https%3A%2F%2Fwww.jianshu.com%2Fp%2Fb29375404479%22%5D; _pk_ses.100001.a7dd=*; _pk_id.100001.a7dd=ee586b77c5c08a27.1545487781.2.1545889502.1545488713.', 14 | 'DNT': '1', 15 | 'Host': 'read.douban.com', 16 | 'Origin': 'https://read.douban.com', 17 | 'Referer': 'https://read.douban.com/category/?kind=114', 18 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 19 | 'X-CSRF-Token': 'null', 20 | } 21 | 22 | data = {"sort":"hot","page":1,"kind":114,"query":"\n query getFilterWorksList($works_ids: [ID!], $user_id: ID) {\n worksList(worksIds: $works_ids) {\n \n \n title\n cover\n url\n isBundle\n \n \n url\n title\n \n \n author {\n name\n url\n }\n origAuthor {\n name\n url\n }\n translator {\n name\n url\n }\n \n \n abstract\n editorHighlight\n \n \n isOrigin\n kinds {\n \n name @skip(if: true)\n shortName @include(if: true)\n id\n \n }\n ... on WorksBase @include(if: true) {\n wordCount\n wordCountUnit\n }\n ... on WorksBase @include(if: true) {\n \n isEssay\n \n ... on EssayWorks {\n favorCount\n }\n \n \n isNew\n \n averageRating\n ratingCount\n url\n \n \n \n }\n ... on WorksBase @include(if: false) {\n isColumn\n isEssay\n onSaleTime\n ... on ColumnWorks {\n updateTime\n }\n }\n ... on WorksBase @include(if: true) {\n isColumn\n ... on ColumnWorks {\n isFinished\n }\n }\n ... on EssayWorks {\n essayActivityData {\n \n title\n uri\n tag {\n name\n color\n background\n icon2x\n icon3x\n iconSize {\n height\n }\n iconPosition {\n x y\n }\n }\n \n }\n }\n highlightTags {\n name\n }\n \n ... on WorksBase @include(if: false) {\n \n fixedPrice\n salesPrice\n isRebate\n \n }\n ... on EbookWorks {\n \n fixedPrice\n salesPrice\n isRebate\n \n }\n ... on WorksBase @include(if: true) {\n ... 
on EbookWorks {\n id\n isPurchased(userId: $user_id)\n isInWishlist(userId: $user_id)\n }\n }\n \n id\n isOrigin\n }\n }\n ","variables":{"user_id":""}} 23 | 24 | url = 'https://read.douban.com/j/kind/' 25 | 26 | r = requests.post(url, headers=headers, data=json.dumps(data)) 27 | text = r.text 28 | text = json.loads(text) 29 | total = text["total"] 30 | lists = text["list"] 31 | for i in lists: 32 | title = i.['title'] 33 | cover = i.['cover'] 34 | book_url = 'https://read.douban.com' + i.['book_url'] 35 | book_url = 'https://read.douban.com' + i.['book_url'] 36 | # print(total) 37 | # print(lists) 38 | -------------------------------------------------------------------------------- /some/pa.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ''' 4 | 在以下环境测试通过: 5 | python 2.7.15或者3.7.0 6 | win10或者lubuntu 7 | ''' 8 | 9 | # 导入模块 10 | import time 11 | import requests, re, random, os 12 | from bs4 import BeautifulSoup 13 | from requests import Session 14 | 15 | session = Session() 16 | 17 | 18 | ''' 19 | 给定页数,爬取每页所有图片的url,通过此url可以打开图片所在的网页 20 | 所有url存在一个列表中 21 | ''' 22 | 23 | 24 | def scrapy_img_urls(nums): 25 | lss = [] 26 | for num in range(1, nums + 1): 27 | url = 'http://www.doutula.com/photo/list/?page=' + str(num) 28 | html = requests.get(url, headers=headers) 29 | html.encoding = 'utf-8' 30 | 31 | text = html.text 32 | bsop = BeautifulSoup(text, 'html.parser') 33 | ass = bsop.find('div', {'class': 'page-content'}).find('div').findAll('a') 34 | 35 | for a in ass: 36 | # print(a.attrs['href']) 37 | lss.append(a.attrs['href']) 38 | time.sleep(1) 39 | return lss 40 | 41 | 42 | ''' 43 | 接收每个图片的url,打开此url,找到图片真实的地址,通过此地址可以下载图片 44 | 找到图片真实的url和名字之后调用download_url函数可以下载图片 45 | ''' 46 | 47 | 48 | def download_img_url(url): 49 | html = requests.get(url, headers=headers) 50 | html.encoding = 'utf-8' 51 | 52 | text = html.text 53 | bsop = BeautifulSoup(text, 'html.parser') 54 | img = bsop.find('div', {'class': 'col-xs-12 col-sm-12 artile_des'}) 55 | img_url = img.find('img').attrs['src'] 56 | img_title = img.find('img').attrs['alt'] 57 | print(img_url + " " + img_title) 58 | 59 | download_img(img_url, img_title) 60 | 61 | 62 | ''' 63 | 下载图片,该函数接收两个参数,一个是图片的真实地址,一个是图片的名字 64 | 名字中如果有特殊字符则需要处理,不然windows下可能无法保存,处理名字调用format_name函数 65 | 打开指定文件夹保存图片,如果没有则创建。 66 | ''' 67 | 68 | 69 | def download_img(img_url, img_title): 70 | img_title = format_name(img_title) # 如果图册名字有特殊字符需要处理。不然在windows下保存不了文件夹 71 | if not os.path.exists(file_path): 72 | os.makedirs(file_path) 73 | os.chdir(file_path) 74 | 75 | # 图片保存到本地 76 | exists = os.path.exists(img_title) 77 | if not exists: 78 | img_html = requests.get(img_url, headers=headers, stream=True, timeout=20, verify=True) 79 | img_html.encoding = 'utf-8' 80 | with open(img_title + ".gif", 'wb') as f: 81 | f.write(img_html.content) 82 | f.close() 83 | 84 | 85 | def format_name(img_title): 86 | ''' 87 | 对名字进行处理,如果包含下属字符,则直接剔除该字符 88 | :param img_title: 89 | :return: 90 | ''' 91 | for i in ['\\', '/', ':', '*', '?', '"', '<', '>', '!', '|']: 92 | while i in img_title: 93 | img_title = img_title.strip().replace(i, '') 94 | return img_title 95 | 96 | 97 | def royal(url): 98 | html = requests.get(url, headers=headers, stream=True, timeout=20, verify=True) 99 | html.encoding = 'utf-8' 100 | text = html.text 101 | bsop = BeautifulSoup(text, 'html.parser') 102 | timeofissued = bsop.find('meta', {'name':'DC.issued'}).attrs['content'].split('/')[0] 103 | citation_title = bsop.find('meta', 
{'name':'citation_title'}).attrs['content'] 104 | citation_journal_title = bsop.find('meta', {'name':'citation_journal_title'}).attrs['content'] 105 | citation_journal_abbrev = bsop.find('meta', {'name':'citation_journal_abbrev'}).attrs['content'] 106 | citation_volume = bsop.find('meta', {'name':'citation_volume'}).attrs['content'] 107 | citation_issue = bsop.find('meta', {'name':'citation_issue'}).attrs['content'] 108 | citation_firstpage = bsop.find('meta', {'name':'citation_firstpage'}).attrs['content'] 109 | citation_lastpage = bsop.find('meta', {'name':'citation_lastpage'}).attrs['content'] 110 | citation_doi = bsop.find('meta', {'name':'citation_doi'}).attrs['content'] 111 | PB = bsop.find('meta', {'name':'DC.publisher'}).attrs['content'] 112 | M3 = citation_doi 113 | citation_url = 'http://dx.doi.org/' + citation_doi 114 | citation_abstract = bsop.find('meta', {'name':'citation_abstract'}).attrs['content'].strip() 115 | SN = bsop.find('div', {'class':'article-nav__issue autopad--h'}).find('a').attrs['href'].split('=')[-1] 116 | 117 | with open(citation_title + ".ris", 'w') as f: 118 | f.write('TY - JOUR\n') 119 | f.write('T1 - ' + citation_title + '\n') 120 | f.write('Y1 - ' + timeofissued + '\n') 121 | f.write('SP - ' + citation_firstpage + '\n') 122 | f.write('EP - ' + citation_lastpage + '\n') 123 | f.write('JF - ' + citation_journal_title + '\n') 124 | f.write('JO - ' + citation_journal_abbrev + '\n') 125 | f.write('VL - ' + citation_volume + '\n') 126 | f.write('RS - ' + citation_issue + '\n') 127 | f.write('PB - ' + PB + '\n') 128 | f.write('SN - ' + SN + '\n') 129 | f.write('DO - ' + citation_doi + '\n') 130 | f.write('M3 - ' + M3 + '\n') 131 | f.write('UR - ' + citation_url + '\n') 132 | print(citation_url) 133 | f.write('N2 - ' + citation_abstract + '\n') 134 | print(citation_abstract) 135 | 136 | authors = bsop.findAll('span', {'class': 'article__author-link'}) 137 | for author in authors: 138 | author = author.find('a').text.split(' ') 139 | author = author[-1] + ', ' + ' '.join(author[:-1]) 140 | f.write('A1 - ' + author + '\n') 141 | f.write('ER - ' + '\n') 142 | f.close() 143 | 144 | # authors = bsop.findAll('span', {'class':'article__author-link'}) 145 | # for author in authors: 146 | # author = author.find('a').text.split(' ') 147 | # author = author[-1] + ', ' + ' '.join(author[:-1]) 148 | # with open(author + ".ris", 'w') as f: 149 | # f.write('TY - JOUR') 150 | # f.write('T1 - ' + citation_title) 151 | # f.write('T1 - ' + authors) 152 | # f.close() 153 | 154 | # print(author) 155 | # print(timeofissued) 156 | 157 | 158 | 159 | 160 | 161 | # print(authors) 162 | # with open("ro.ris", 'wb') as f: 163 | # f.write(html.content) 164 | # f.close() 165 | 166 | 167 | def scawurls(url): 168 | 169 | headers1 = { 170 | 'Accept':'text/html, */*; q=0.01', 171 | 'Connection': 'keep-alive', 172 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 173 | 'DNT':'1', 174 | 'Host':'pubs.rsc.org', 175 | 'Origin':'https://pubs.rsc.org', 176 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36', 177 | 'X-NewRelic-ID':'VQYFWF9aDBABV1laBgcFUw ==', 178 | 'X-Requested-With':'XMLHttpRequest' 179 | } 180 | 181 | data = { 182 | 'searchterm': 
'AAEAAAD/////AQAAAAAAAAAMAgAAAGNSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGwFAQAAADlSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLlNlYXJjaFRlcm0OAAAAGTxDYXRlZ29yeT5rX19CYWNraW5nRmllbGQcPFN1YkNhdGVnb3J5PmtfX0JhY2tpbmdGaWVsZBw8Q29udGVudFR5cGU + a19fQmFja2luZ0ZpZWxkGjxDcml0ZXJpYXM + a19fQmFja2luZ0ZpZWxkFzxGYWNldHM + a19fQmFja2luZ0ZpZWxkHDxSZXF1ZXN0VGltZT5rX19CYWNraW5nRmllbGQfPEF1dGhvckNyaXRlcmlhPmtfX0JhY2tpbmdGaWVsZCA8UHVibGljYXRpb25EYXRlPmtfX0JhY2tpbmdGaWVsZBk8RXhjbHVkZXM + a19fQmFja2luZ0ZpZWxkFzxTb3VyY2U + a19fQmFja2luZ0ZpZWxkHzxPdXRwdXRTdGFuZGFyZD5rX19CYWNraW5nRmllbGQePFJlc3VsdHNGb3JtYXQ + a19fQmFja2luZ0ZpZWxkHjxEaXNwbGF5Q291bnRzPmtfX0JhY2tpbmdGaWVsZCA8UHJvZHVjdFBhZ2VTaXplPmtfX0JhY2tpbmdGaWVsZAEBAQMDAAQEAwEBAQEBwgFTeXN0ZW0uQ29sbGVjdGlvbnMuR2VuZXJpYy5MaXN0YDFbW1JTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXcIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0NPVJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guQXV0aG9yQ3JpdGVyaWECAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlAgAAAMIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0CAAAABgMAAAADQWxsCgYEAAAAA0FsbAkFAAAACQYAAAAAAAAAAAAAAAkHAAAACQgAAAAJCQAAAAoKCgoKBAUAAADCAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWUsIFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cywgVmVyc2lvbj0yMDE4LjAuNTQ5LjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49bnVsbF1dAwAAAAZfaXRlbXMFX3NpemUIX3ZlcnNpb24EAAA6UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWVbXQIAAAAICAkKAAAABAAAAAQAAAABBgAAAAUAAAAJCwAAAAAAAAAAAAAABQcAAAA9UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JDcml0ZXJpYQIAAAAgPEJvb2xlYW5PcGVyYXRvcj5rX19CYWNraW5nRmllbGQYPEF1dGhvcnM + a19fQmFja2luZ0ZpZWxkAQPDAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JJbmZvLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXQIAAAAKCgUIAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlBQAAAB88SXNTZWxlY3RlZERhdGU + a19fQmFja2luZ0ZpZWxkGTxEYXRlVHlwZT5rX19CYWNraW5nRmllbGQbPFdpdGhJbkxhc3Q + a19fQmFja2luZ0ZpZWxkGjxEYXRlUmFuZ2U + 
a19fQmFja2luZ0ZpZWxkHDxEaXNwbGF5RGF0ZT5rX19CYWNraW5nRmllbGQAAQQEAQE5UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5XaXRoSW5MYXN0AgAAADhSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLkRhdGVSYW5nZQIAAAACAAAAAAoKCgoBCQAAAAUAAAAJCwAAAAAAAAAAAAAABwoAAAAAAQAAAAQAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAkMAAAACQ0AAAAJDgAAAAkPAAAABwsAAAAAAQAAAAAAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAUMAAAAOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlBAAAABU8TmFtZT5rX19CYWNraW5nRmllbGQcPERpc3BsYXlOYW1lPmtfX0JhY2tpbmdGaWVsZBY8VmFsdWU + a19fQmFja2luZ0ZpZWxkIDxCb29sZWFuT3BlcmF0b3I + a19fQmFja2luZ0ZpZWxkAQEBAQIAAAAGEAAAAAhmcmVldGV4dAoGEQAAAG9kZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtIEFORCBDdSwgT1IgY29wcGVyLCBPUiBlbGVjdHJvbGVzcywgT1IgcHJpbnRpbmcsIE9SIGZsZXhpYmxlLCBPUiBzdWJzdHJhdGUsIE9SIHBsYXN0aWMKAQ0AAAAMAAAABhIAAAAHQWxsVGV4dAoGEwAAABlkZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtCgEOAAAADAAAAAYUAAAAC0F0bGVhc3RUZXh0CgYVAAAAP0N1LCBjb3BwZXIsIGVsZWN0cm9sZXNzLCBwcmludGluZywgZmxleGlibGUsIHN1YnN0cmF0ZSwgcGxhc3RpYwoBDwAAAAwAAAAGFgAAABBPcmlnaW5hbEZyZWVUZXh0CgYXAAAAb2RlcG9zaXRpb24sIHBhdHRlcm4sIGZpbG0gQU5EIEN1LCBPUiBjb3BwZXIsIE9SIGVsZWN0cm9sZXNzLCBPUiBwcmludGluZywgT1IgZmxleGlibGUsIE9SIHN1YnN0cmF0ZSwgT1IgcGxhc3RpYwoL', 183 | 'resultcount': '282607', 184 | 'category': 'all', 185 | 'pageno': '2' 186 | } 187 | 188 | html = requests.post(url, data=data, headers=headers1, stream=True, timeout=20, verify=True) 189 | html.encoding = 'utf-8' 190 | text = html.text 191 | # print(text) 192 | bsop = BeautifulSoup(text, 'html.parser') 193 | divs = bsop.findAll('div', {'class': 'capsule capsule--article '}) 194 | for i in divs: 195 | article_url = 'https://pubs.rsc.org' + i.find('a').attrs['href'] 196 | print(article_url) 197 | # royal(article_url) 198 | 199 | # with open("ros.html", 'wb') as f: 200 | # f.write(html.content) 201 | # f.close() 202 | # print(text) 203 | 204 | # session.head('https://pubs.rsc.org/en/results/all?Category=All&AllText=deposition%2C%20pattern%2C%20film&AtleastText=Cu%2C%20copper%2C%20electroless%2C%20printing%2C%20flexible%2C%20substrate%2C%20plastic&IncludeReference=false&SelectJournal=false&DateRange=false&SelectDate=false&Type=Months&DateFromMonth=Months&DateToMonth=Months&PriceCode=False&OpenAccess=false') 205 | 206 | # 构造headers 207 | UserAgent_List = [ 208 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 209 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 210 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 211 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 212 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 213 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 214 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 215 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 216 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 217 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 218 | 
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 219 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 220 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 221 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 222 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 223 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 224 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 225 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 226 | ] 227 | headers = {'User-Agent': random.choice(UserAgent_List), 228 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 229 | 'Accept-Encoding': 'gzip', 230 | } 231 | 232 | url = 'https://pubs.rsc.org/en/search/journalresult' 233 | scawurls(url) 234 | 235 | 236 | 237 | # url = 'https://pubs.rsc.org/en/content/articlelanding/2017/tc/c7tc00038c#!divAbstract' 238 | # royal(url) 239 | 240 | # nums = 5 241 | # # 图片存储路径,在linux系统下 242 | # file_path = '/home/zhangyb/downloadfiles/pythonpro/biaoqing' 243 | # # 图片存储路径,在windows系统下 244 | # # file_path = 'E:\downloadfiles\pythonpro\biaoqing' 245 | # urls = scrapy_img_urls(nums) 246 | # for i in urls: 247 | # print(i) 248 | # download_img_url(i) 249 | 250 | 251 | # url = 'https://pubs.rsc.org/en/results/all?Category=All&AllText=deposition%2C%20pattern%2C%20film&AtleastText=Cu%2C%20copper%2C%20electroless%2C%20printing%2C%20flexible%2C%20substrate%2C%20plastic&IncludeReference=false&SelectJournal=false&DateRange=false&SelectDate=false&Type=Months&DateFromMonth=Months&DateToMonth=Months&PriceCode=False&OpenAccess=false' 252 | # r = requests.get(url, headers=headers) 253 | # print(r.text) 254 | -------------------------------------------------------------------------------- /some/pa1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | import random 5 | from bs4 import BeautifulSoup 6 | import time 7 | 8 | download_time = time.strftime("%Y-%m-%d", time.localtime()) 9 | 10 | 11 | def royal(article_urls): 12 | for article_url in article_urls: 13 | # try: 14 | html = requests.get(article_url, headers=headers, stream=True, timeout=20, verify=True) 15 | html.encoding = 'utf-8' 16 | text = html.text 17 | bsop = BeautifulSoup(text, 'html.parser') 18 | try: 19 | timeofissued = bsop.find('meta', {'name':'DC.issued'}).attrs['content'].split('/')[0] 20 | except: 21 | pass 22 | try: 23 | citation_title = bsop.find('meta', {'name':'citation_title'}).attrs['content'] 24 | except: 25 | pass 26 | try: 27 | citation_journal_title = bsop.find('meta', {'name':'citation_journal_title'}).attrs['content'] 28 | except: 29 | pass 30 | try: 31 | citation_journal_abbrev = bsop.find('meta', {'name':'citation_journal_abbrev'}).attrs['content'] 32 | except: 33 | pass 34 | try: 35 | citation_volume = bsop.find('meta', {'name':'citation_volume'}).attrs['content'] 36 | except: 37 | pass 38 | try: 39 | citation_issue = bsop.find('meta', {'name':'citation_issue'}).attrs['content'] 40 | except: 41 | pass 42 | try: 43 | citation_firstpage = bsop.find('meta', 
{'name':'citation_firstpage'}).attrs['content'] 44 | except: 45 | pass 46 | try: 47 | citation_lastpage = bsop.find('meta', {'name':'citation_lastpage'}).attrs['content'] 48 | except: 49 | pass 50 | try: 51 | citation_doi = bsop.find('meta', {'name':'citation_doi'}).attrs['content'] 52 | except: 53 | pass 54 | try: 55 | PB = bsop.find('meta', {'name':'DC.publisher'}).attrs['content'] 56 | except: 57 | pass 58 | try: 59 | M3 = citation_doi 60 | except: 61 | pass 62 | try: 63 | citation_url = 'http://dx.doi.org/' + citation_doi 64 | except: 65 | pass 66 | try: 67 | citation_abstract = bsop.find('meta', {'name':'citation_abstract'}).attrs['content'].strip() 68 | except: 69 | pass 70 | try: 71 | SN = bsop.find('div', {'class':'article-nav__issue autopad--h'}).find('a').attrs['href'].split('=')[-1] 72 | except: 73 | pass 74 | # except: 75 | # print(article_url) 76 | # continue 77 | 78 | with open(download_time + ".ris", 'a', encoding='utf-8') as f: 79 | f.write('TY - JOUR\n') 80 | f.write('T1 - ' + citation_title + '\n') 81 | f.write('Y1 - ' + timeofissued + '\n') 82 | f.write('SP - ' + citation_firstpage + '\n') 83 | f.write('EP - ' + citation_lastpage + '\n') 84 | f.write('JF - ' + citation_journal_title + '\n') 85 | f.write('JO - ' + citation_journal_abbrev + '\n') 86 | f.write('VL - ' + citation_volume + '\n') 87 | f.write('RS - ' + citation_issue + '\n') 88 | f.write('PB - ' + PB + '\n') 89 | f.write('SN - ' + SN + '\n') 90 | f.write('DO - ' + citation_doi + '\n') 91 | f.write('M3 - ' + M3 + '\n') 92 | f.write('UR - ' + citation_url + '\n') 93 | print(citation_url) 94 | f.write('N2 - ' + citation_abstract + '\n') 95 | # print(citation_abstract) 96 | 97 | authors = bsop.findAll('span', {'class': 'article__author-link'}) 98 | for author in authors: 99 | author = author.find('a').text.split(' ') 100 | author = author[-1] + ', ' + ' '.join(author[:-1]) 101 | f.write('A1 - ' + author + '\n') 102 | f.write('ER - ' + '\n\n\n') 103 | f.close() 104 | time.sleep(1) 105 | 106 | 107 | def crawl_article_url(nums): 108 | article_urls = [] 109 | for num in range(1, nums+1): 110 | 111 | url = 'https://pubs.rsc.org/en/search/journalresult' 112 | 113 | headers1 = { 114 | 'Accept':'text/html, */*; q=0.01', 115 | 'Connection': 'keep-alive', 116 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 117 | 'DNT':'1', 118 | 'Host':'pubs.rsc.org', 119 | 'Origin':'https://pubs.rsc.org', 120 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36', 121 | 'X-NewRelic-ID':'VQYFWF9aDBABV1laBgcFUw ==', 122 | 'X-Requested-With':'XMLHttpRequest' 123 | } 124 | 125 | data = { 126 | 'searchterm': 'AAEAAAD/////AQAAAAAAAAAMAgAAAGNSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGwFAQAAADlSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLlNlYXJjaFRlcm0OAAAAGTxDYXRlZ29yeT5rX19CYWNraW5nRmllbGQcPFN1YkNhdGVnb3J5PmtfX0JhY2tpbmdGaWVsZBw8Q29udGVudFR5cGU + a19fQmFja2luZ0ZpZWxkGjxDcml0ZXJpYXM + a19fQmFja2luZ0ZpZWxkFzxGYWNldHM + a19fQmFja2luZ0ZpZWxkHDxSZXF1ZXN0VGltZT5rX19CYWNraW5nRmllbGQfPEF1dGhvckNyaXRlcmlhPmtfX0JhY2tpbmdGaWVsZCA8UHVibGljYXRpb25EYXRlPmtfX0JhY2tpbmdGaWVsZBk8RXhjbHVkZXM + a19fQmFja2luZ0ZpZWxkFzxTb3VyY2U + a19fQmFja2luZ0ZpZWxkHzxPdXRwdXRTdGFuZGFyZD5rX19CYWNraW5nRmllbGQePFJlc3VsdHNGb3JtYXQ + 
a19fQmFja2luZ0ZpZWxkHjxEaXNwbGF5Q291bnRzPmtfX0JhY2tpbmdGaWVsZCA8UHJvZHVjdFBhZ2VTaXplPmtfX0JhY2tpbmdGaWVsZAEBAQMDAAQEAwEBAQEBwgFTeXN0ZW0uQ29sbGVjdGlvbnMuR2VuZXJpYy5MaXN0YDFbW1JTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXcIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0NPVJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guQXV0aG9yQ3JpdGVyaWECAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlAgAAAMIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0CAAAABgMAAAADQWxsCgYEAAAAA0FsbAkFAAAACQYAAAAAAAAAAAAAAAkHAAAACQgAAAAJCQAAAAoKCgoKBAUAAADCAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWUsIFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cywgVmVyc2lvbj0yMDE4LjAuNTQ5LjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49bnVsbF1dAwAAAAZfaXRlbXMFX3NpemUIX3ZlcnNpb24EAAA6UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWVbXQIAAAAICAkKAAAABAAAAAQAAAABBgAAAAUAAAAJCwAAAAAAAAAAAAAABQcAAAA9UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JDcml0ZXJpYQIAAAAgPEJvb2xlYW5PcGVyYXRvcj5rX19CYWNraW5nRmllbGQYPEF1dGhvcnM + a19fQmFja2luZ0ZpZWxkAQPDAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JJbmZvLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXQIAAAAKCgUIAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlBQAAAB88SXNTZWxlY3RlZERhdGU + a19fQmFja2luZ0ZpZWxkGTxEYXRlVHlwZT5rX19CYWNraW5nRmllbGQbPFdpdGhJbkxhc3Q + a19fQmFja2luZ0ZpZWxkGjxEYXRlUmFuZ2U + a19fQmFja2luZ0ZpZWxkHDxEaXNwbGF5RGF0ZT5rX19CYWNraW5nRmllbGQAAQQEAQE5UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5XaXRoSW5MYXN0AgAAADhSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLkRhdGVSYW5nZQIAAAACAAAAAAoKCgoBCQAAAAUAAAAJCwAAAAAAAAAAAAAABwoAAAAAAQAAAAQAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAkMAAAACQ0AAAAJDgAAAAkPAAAABwsAAAAAAQAAAAAAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAUMAAAAOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlBAAAABU8TmFtZT5rX19CYWNraW5nRmllbGQcPERpc3BsYXlOYW1lPmtfX0JhY2tpbmdGaWVsZBY8VmFsdWU + a19fQmFja2luZ0ZpZWxkIDxCb29sZWFuT3BlcmF0b3I + 
a19fQmFja2luZ0ZpZWxkAQEBAQIAAAAGEAAAAAhmcmVldGV4dAoGEQAAAG9kZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtIEFORCBDdSwgT1IgY29wcGVyLCBPUiBlbGVjdHJvbGVzcywgT1IgcHJpbnRpbmcsIE9SIGZsZXhpYmxlLCBPUiBzdWJzdHJhdGUsIE9SIHBsYXN0aWMKAQ0AAAAMAAAABhIAAAAHQWxsVGV4dAoGEwAAABlkZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtCgEOAAAADAAAAAYUAAAAC0F0bGVhc3RUZXh0CgYVAAAAP0N1LCBjb3BwZXIsIGVsZWN0cm9sZXNzLCBwcmludGluZywgZmxleGlibGUsIHN1YnN0cmF0ZSwgcGxhc3RpYwoBDwAAAAwAAAAGFgAAABBPcmlnaW5hbEZyZWVUZXh0CgYXAAAAb2RlcG9zaXRpb24sIHBhdHRlcm4sIGZpbG0gQU5EIEN1LCBPUiBjb3BwZXIsIE9SIGVsZWN0cm9sZXNzLCBPUiBwcmludGluZywgT1IgZmxleGlibGUsIE9SIHN1YnN0cmF0ZSwgT1IgcGxhc3RpYwoL', 127 | 'resultcount': '282607', 128 | 'category': 'all', 129 | 'pageno': str(num) 130 | } 131 | 132 | html = requests.post(url, data=data, headers=headers1, stream=True, timeout=20, verify=True) 133 | html.encoding = 'utf-8' 134 | text = html.text 135 | # print(text) 136 | bsop = BeautifulSoup(text, 'html.parser') 137 | divs = bsop.findAll('div', {'class': 'capsule capsule--article '}) 138 | for i in divs: 139 | article_url = 'https://pubs.rsc.org' + i.find('a').attrs['href'] 140 | # print(article_url) 141 | article_urls.append(article_url) 142 | print("第" + str(num) + "页爬取完毕") 143 | time.sleep(1) 144 | return article_urls 145 | 146 | 147 | # 构造headers 148 | UserAgent_List = [ 149 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 150 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 151 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 152 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 153 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 154 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 155 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 156 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 157 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 158 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 159 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 160 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 161 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 162 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 163 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 164 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 165 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 166 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 167 | ] 168 | headers = {'User-Agent': random.choice(UserAgent_List), 169 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 170 | 'Accept-Encoding': 'gzip', 171 | } 172 | nums = 5 
# 爬取的页数 173 | 174 | article_urls = crawl_article_url(nums) 175 | royal(article_urls) 176 | 177 | 178 | 179 | # url = 'https://pubs.rsc.org/en/content/articlelanding/2017/tc/c7tc00038c#!divAbstract' 180 | # royal(url) 181 | -------------------------------------------------------------------------------- /some/springer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | import random 5 | from bs4 import BeautifulSoup 6 | import time 7 | 8 | download_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) 9 | 10 | 11 | def royal(article_urls): 12 | for article_url in article_urls: 13 | # try: 14 | html = requests.get(article_url, headers=headers, stream=True, timeout=20, verify=True) 15 | html.encoding = 'utf-8' 16 | text = html.text 17 | bsop = BeautifulSoup(text, 'html.parser') 18 | try: 19 | timeofissued = bsop.find('meta', {'name':'citation_cover_date'}).attrs['content'].split('/')[0] 20 | except: 21 | pass 22 | try: 23 | citation_title = bsop.find('meta', {'name':'citation_title'}).attrs['content'] 24 | except: 25 | pass 26 | try: 27 | citation_journal_title = bsop.find('meta', {'name':'citation_journal_title'}).attrs['content'] 28 | except: 29 | pass 30 | try: 31 | citation_journal_abbrev = bsop.find('meta', {'name':'citation_journal_abbrev'}).attrs['content'] 32 | except: 33 | pass 34 | try: 35 | citation_volume = bsop.find('meta', {'name':'citation_volume'}).attrs['content'] 36 | except: 37 | pass 38 | try: 39 | # citation_issue = bsop.find('meta', {'name':'citation_issue'}).attrs['content'] 40 | citation_issue = bsop.find('span', {'id':'electronic-issn'}).text 41 | except: 42 | pass 43 | try: 44 | citation_firstpage = bsop.find('meta', {'name':'citation_firstpage'}).attrs['content'] 45 | except: 46 | pass 47 | try: 48 | citation_lastpage = bsop.find('meta', {'name':'citation_lastpage'}).attrs['content'] 49 | except: 50 | pass 51 | try: 52 | citation_doi = bsop.find('meta', {'name':'citation_doi'}).attrs['content'] 53 | except: 54 | pass 55 | try: 56 | PB = bsop.find('meta', {'name':'citation_publisher'}).attrs['content'] 57 | except: 58 | pass 59 | try: 60 | M3 = citation_doi 61 | except: 62 | pass 63 | try: 64 | citation_url = 'http://dx.doi.org/' + citation_doi 65 | except: 66 | pass 67 | try: 68 | # citation_abstract = bsop.find('p', {'id':'Par1'}).attrs['content'].strip() 69 | citation_abstract = bsop.find('p', {'id':'Par1'}).text 70 | except: 71 | pass 72 | try: 73 | # SN = bsop.find('div', {'class':'article-nav__issue autopad--h'}).find('a').attrs['href'].split('=')[-1] 74 | SN = bsop.find('span', {'id':'electronic-issn'}).text 75 | except: 76 | pass 77 | # except: 78 | # print(article_url) 79 | # continue 80 | 81 | with open(download_time + ".ris", 'a', encoding='utf-8') as f: 82 | f.write('TY - JOUR\n') 83 | f.write('T1 - ' + citation_title + '\n') 84 | f.write('Y1 - ' + timeofissued + '\n') 85 | f.write('SP - ' + citation_firstpage + '\n') 86 | f.write('EP - ' + citation_lastpage + '\n') 87 | f.write('JF - ' + citation_journal_title + '\n') 88 | f.write('JO - ' + citation_journal_abbrev + '\n') 89 | f.write('VL - ' + citation_volume + '\n') 90 | f.write('RS - ' + citation_issue + '\n') 91 | f.write('PB - ' + PB + '\n') 92 | f.write('SN - ' + SN + '\n') 93 | f.write('DO - ' + citation_doi + '\n') 94 | f.write('M3 - ' + M3 + '\n') 95 | f.write('UR - ' + citation_url + '\n') 96 | print(citation_url) 97 | f.write('N2 - ' + citation_abstract + '\n') 98 | # print(citation_abstract) 99 | 
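# The citation_author meta tags below hold each author's full name as a single
# string; unlike some/pa.py there is no .split(' ') before the
# author[-1] + ', ' + ' '.join(author[:-1]) reordering, so that expression
# operates on individual characters of the string rather than on name parts.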
100 | authors = bsop.findAll('meta', {'name': 'citation_author'}) 101 | for author in authors: 102 | # print(author) 103 | author = author.attrs['content'] 104 | # print(author) 105 | author = author[-1] + ', ' + ' '.join(author[:-1]) 106 | f.write('A1 - ' + author + '\n') 107 | f.write('ER - ' + '\n\n\n') 108 | f.close() 109 | time.sleep(1) 110 | 111 | 112 | def crawl_article_url(nums): 113 | article_urls = [] 114 | for num in range(1, nums+1): 115 | 116 | url = 'https://link.springer.com/search/page/' + str(num) + '?date-facet-mode=between&facet-start-year=2010&facet-language=%22En%22&query=printing%2C+AND+Cu+AND+pattern%2C+AND+film%2C+AND+flexible%2C+AND+plastic%2C+AND+substrate%2C+AND+copper&facet-end-year=2019&showAll=true&facet-content-type=%22Article%22' 117 | 118 | html = requests.get(url, headers=headers, stream=True, timeout=20, verify=True) 119 | html.encoding = 'utf-8' 120 | text = html.text 121 | # print(text) 122 | bsop = BeautifulSoup(text, 'html.parser') 123 | divs = bsop.find('ol', {'id': 'results-list'}).findAll('li') 124 | for i in divs: 125 | # print(i) 126 | article_url = 'https://link.springer.com' + i.find('h2').find('a').attrs['href'] 127 | print(article_url) 128 | article_urls.append(article_url) 129 | print("第" + str(num) + "页爬取完毕") 130 | time.sleep(1) 131 | return article_urls 132 | 133 | 134 | # 构造headers 135 | UserAgent_List = [ 136 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 137 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 138 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 139 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 140 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 141 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 142 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 143 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 144 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 145 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 146 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 147 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 148 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 149 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 150 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 151 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 152 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 153 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 154 | ] 155 | headers = {'User-Agent': random.choice(UserAgent_List), 156 | 'Accept': 
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 157 | 'Accept-Encoding': 'gzip', 158 | } 159 | nums = 1 # 爬取的页数 160 | 161 | article_urls = crawl_article_url(nums) 162 | royal(article_urls) -------------------------------------------------------------------------------- /some/xuanke.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*_ 2 | 3 | import requests, time 4 | import hmac, json 5 | from bs4 import BeautifulSoup 6 | from hashlib import sha1 7 | 8 | def get_captcha(url): 9 | ''' 处理验证码 ''' 10 | 11 | r = requests.get(url, headers=headers) 12 | text = r.text 13 | obj = BeautifulSoup(text, 'html.parser') 14 | captchaurl = 'http://zhjwxk.cic.tsinghua.edu.cn' + obj.find("img", {"id":"captcha"}).attrs['src'] 15 | rr = requests.get(captchaurl, headers=headers) 16 | textt = rr.content 17 | 18 | with open('captcha.gif', 'wb') as fb: 19 | fb.write(textt) 20 | a = input('captcha:') 21 | print(a) 22 | return a 23 | 24 | 25 | s = requests.Session() 26 | url = 'https://zhjwxk.cic.tsinghua.edu.cn/j_acegi_formlogin_xsxk.do' 27 | 28 | headers = { 29 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 30 | 'Accept-Encoding': 'gzip, deflate, br', 31 | 'Accept-Language': 'zh-CN,zh;q=0.9', 32 | 'Cache-Control': 'max-age=0', 33 | 'Connection': 'keep-alive', 34 | 'Content-Length': '66', 35 | 'Content-Type': 'application/x-www-form-urlencoded', 36 | 'Cookie': 'JSESSIONID=cafgDstvY9fVWd2VutTFw; thuwebcookie=990146470.20480.0000', 37 | 'DNT': '1', 38 | 'Host': 'zhjwxk.cic.tsinghua.edu.cn', 39 | 'Origin': 'http://zhjwxk.cic.tsinghua.edu.cn', 40 | 'Referer': 'http://zhjwxk.cic.tsinghua.edu.cn/xklogin.do', 41 | 'Upgrade-Insecure-Requests': '1', 42 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 43 | } 44 | data = { 45 | 'j_username': 'zhang-yb18', 46 | 'j_password': 'ZHANG2338', 47 | 'captchaflag': 'login1', 48 | '_login_image_': get_captcha(url), 49 | } 50 | 51 | 52 | r = s.post(url, headers=headers, data=data) 53 | text = r.text 54 | print(text) -------------------------------------------------------------------------------- /some/xuanke2.py: -------------------------------------------------------------------------------- 1 | # -*- coding:UTF-8 -*- 2 | 3 | import requests, time 4 | import hmac, json 5 | from bs4 import BeautifulSoup 6 | from hashlib import sha1 7 | 8 | 9 | 10 | 11 | def get_captcha(url): 12 | ''' 处理验证码 ''' 13 | 14 | r = requests.get(url, headers=headers) 15 | text = r.text 16 | obj = BeautifulSoup(text, 'html.parser') 17 | captchaurl = 'http://zhjwxk.cic.tsinghua.edu.cn' + obj.find("img", {"id":"captcha"}).attrs['src'] 18 | rr = requests.get(captchaurl, headers=headers) 19 | textt = rr.content 20 | 21 | with open('captcha.gif', 'wb') as fb: 22 | fb.write(textt) 23 | return input('captcha:') 24 | 25 | 26 | 27 | 28 | 29 | def login(username, password, oncaptcha, sessiona, headers): 30 | ''' 处理登录 ''' 31 | 32 | resp1 = sessiona.get('https://www.zhihu.com/signin', headers=headers) # 拿cookie:_xsrf 33 | resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn', 34 | headers=headers) # 拿cookie:capsion_ticket 35 | need_cap = json.loads(resp2.text)["show_captcha"] # {"show_captcha":false} 表示不用验证码 36 | 37 | grantType = 'password' 38 | clientId = 'c3cef7c66a1843f8b3a9e6a1e3160e20' 39 | source = 'com.zhihu.web' 40 | timestamp = str((time.time() * 1000)).split('.')[0] # 
签名只按这个时间戳变化 41 | 42 | captcha_content = sessiona.get('https://www.zhihu.com/captcha.gif?r=%d&type=login' % (time.time() * 1000), 43 | headers=headers).content 44 | 45 | data = { 46 | "client_id": clientId, 47 | "grant_type": grantType, 48 | "timestamp": timestamp, 49 | "source": source, 50 | "signature": get_signature(grantType, clientId, source, timestamp), # 获取签名 51 | "username": username, 52 | "password": password, 53 | "lang": "cn", 54 | "captcha": oncaptcha(captcha_content, need_cap), # 获取图片验证码 55 | "ref_source": "other_", 56 | "utm_source": "" 57 | } 58 | 59 | print("**2**: " + str(data)) 60 | print("-" * 50) 61 | resp = sessiona.post('https://www.zhihu.com/api/v3/oauth/sign_in', data, headers=headers).content 62 | print(BeautifulSoup(resp, 'html.parser')) 63 | 64 | print("-" * 50) 65 | return resp 66 | 67 | 68 | 69 | headers = { 70 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 71 | 'Accept-Encoding': 'gzip, deflate, br', 72 | 'Accept-Language': 'zh-CN,zh;q=0.9', 73 | 'Cache-Control': 'max-age=0', 74 | 'Connection': 'keep-alive', 75 | 'Content-Length': '66', 76 | 'Content-Type': 'application/x-www-form-urlencoded', 77 | 'Cookie': 'JSESSIONID=cafgDstvY9fVWd2VutTFw; thuwebcookie=990146470.20480.0000', 78 | 'DNT': '1', 79 | 'Host': 'zhjwxk.cic.tsinghua.edu.cn', 80 | 'Origin': 'http://zhjwxk.cic.tsinghua.edu.cn', 81 | 'Referer': 'http://zhjwxk.cic.tsinghua.edu.cn/xklogin.do', 82 | 'Upgrade-Insecure-Requests': '1', 83 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 84 | } 85 | data = { 86 | 'j_username': 'zhang-yb18', 87 | 'j_password': 'ZHANG2338', 88 | 'captchaflag': 'login1', 89 | '_login_image_': get_captcha, 90 | } 91 | 92 | 93 | if __name__ == "__main__": 94 | sessiona = requests.Session() 95 | 96 | login('fendushu@163.com', 'ZHANG2338', get_captcha, sessiona, headers) # 用户名密码换自己的就好了 97 | resp = sessiona.get('https://www.zhihu.com/inbox', headers=headers) # 登录进去了,可以看私信了 98 | print(BeautifulSoup(resp.content, 'html.parser')) 99 | 100 | -------------------------------------------------------------------------------- /some/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*_ 2 | 3 | from selenium import webdriver 4 | 5 | import requests 6 | 7 | from time import sleep 8 | 9 | from bs4 import BeautifulSoup 10 | 11 | browser = webdriver.Chrome(executable_path='F:\\pro\\blog\herokublog\\blogtestgithub\\royal\\chromedriver.exe') 12 | 13 | url= 'https://www.zhihu.com/' 14 | 15 | s = requests.Session() 16 | 17 | s.headers.clear()#清除requests头部中的Python机器人信息,否则登录失败 18 | 19 | browser.get(url) 20 | 21 | browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[2]/div[2]/span').click()#避免屏幕失去焦点 22 | 23 | browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[2]/div[1]/form/div[1]/div[2]/div[1]/input').send_keys('fendushu@163.com') 24 | 25 | browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[2]/div[1]/form/div[2]/div/div[1]/input').send_keys('ZHANG2338') 26 | 27 | try: 28 | 29 | img = browser.find_element_by_xpath('//* [@id="root"]/div/main/div/div/div/div[2]/div[1]/form/div[3]/div/div[2]/img')#验证码图片链接--倒立文字 30 | 31 | sleep(10) 32 | 33 | except: 34 | 35 | img= browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[2]/div[1]/form/div[3]/div/span/div/img').get_attribute("src")#验证码图片链接--字母数字 36 | 37 | sleep(10)#填写验证码 38 | 39 | 
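# The try branch above looks for the inverted-Chinese-character captcha image, and the
# except branch falls back to the letter/digit captcha image URL; in both cases sleep(10)
# leaves time to fill in the captcha manually.
# The else clause below runs only if the try block raised no exception (a no-op here).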
else: 40 | 41 | pass 42 | 43 | browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[2]/div[1]/form/button').submit()#登录 44 | 45 | sleep(5)#等待Cookies加载 46 | 47 | cookies = browser.get_cookies() 48 | 49 | browser.quit() 50 | 51 | for cookie in cookies: 52 | s.cookies.set(cookie['name'],cookie['value'])#为session设置cookies 53 | 54 | html=s.get(url).text 55 | 56 | soup = BeautifulSoup(html) 57 | 58 | items = soup.find_all('a',attrs={'data-za-detail-view-element_name':"Title"})#获取登录后加载出的前几个话题的标题 59 | 60 | for item in items: 61 | print(item.string) 62 | 63 | -------------------------------------------------------------------------------- /some/zhihu2.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*_ 2 | 3 | 4 | __author__ = 'zkqiang' 5 | __zhihu__ = 'https://www.zhihu.com/people/z-kqiang' 6 | __github__ = 'https://github.com/zkqiang/Zhihu-Login' 7 | 8 | import requests 9 | import time 10 | import re 11 | import base64 12 | import hmac 13 | import hashlib 14 | import json 15 | import matplotlib.pyplot as plt 16 | from http import cookiejar 17 | from PIL import Image 18 | 19 | 20 | class ZhihuAccount(object): 21 | 22 | def __init__(self): 23 | self.login_url = 'https://www.zhihu.com/signup' 24 | self.login_api = 'https://www.zhihu.com/api/v3/oauth/sign_in' 25 | self.login_data = { 26 | 'client_id': 'c3cef7c66a1843f8b3a9e6a1e3160e20', 27 | 'grant_type': 'password', 28 | 'source': 'com.zhihu.web', 29 | 'username': '', 30 | 'password': '', 31 | # 传入'cn'是倒立汉字验证码 32 | 'lang': 'en', 33 | 'ref_source': 'homepage', 34 | } 35 | self.session = requests.session() 36 | self.session.headers = { 37 | 'Host': 'www.zhihu.com', 38 | 'Referer': 'https://www.zhihu.com/', 39 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' 40 | '(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36' 41 | } 42 | self.session.cookies = cookiejar.LWPCookieJar(filename='./cookies.txt') 43 | 44 | def login(self, username=None, password=None, captcha_lang='en', load_cookies=True): 45 | """ 46 | 模拟登录知乎 47 | :param username: 登录手机号 48 | :param password: 登录密码 49 | :param captcha_lang: 验证码类型 'en' or 'cn' 50 | :param load_cookies: 是否读取上次保存的 Cookies 51 | :return: bool 52 | """ 53 | if load_cookies and self.load_cookies(): 54 | if self.check_login(): 55 | print('登录成功') 56 | return True 57 | 58 | headers = self.session.headers.copy() 59 | headers.update({ 60 | 'xsrftoken': self._get_xsrf(), 61 | 'x-zse-83': '3_1.1' 62 | }) 63 | self.session.headers = headers['x-udid'] = self._get_udid(headers) 64 | username, password = self._check_user_pass(username, password) 65 | self.login_data.update({ 66 | 'username': username, 67 | 'password': password, 68 | 'captcha_lang': captcha_lang 69 | }) 70 | timestamp = str(int(time.time()*1000)) 71 | self.login_data.update({ 72 | 'captcha': self._get_captcha(self.login_data['lang'], headers), 73 | 'timestamp': timestamp, 74 | 'signature': self._get_signature(timestamp) 75 | }) 76 | 77 | resp = self.session.post(self.login_api, data=self.login_data, headers=headers) 78 | if 'error' in resp.text: 79 | print(json.loads(resp.text)['error']['message']) 80 | if self.check_login(): 81 | print('登录成功') 82 | return True 83 | print('登录失败') 84 | return False 85 | 86 | def load_cookies(self): 87 | """ 88 | 读取 Cookies 文件加载到 Session 89 | :return: bool 90 | """ 91 | try: 92 | self.session.cookies.load(ignore_discard=True) 93 | return True 94 | except FileNotFoundError: 95 | return False 96 | 97 | def check_login(self): 98 | 
""" 99 | 检查登录状态,访问登录页面出现跳转则是已登录, 100 | 如登录成功保存当前 Cookies 101 | :return: bool 102 | """ 103 | resp = self.session.get(self.login_url, allow_redirects=False) 104 | if resp.status_code == 302: 105 | self.session.cookies.save() 106 | return True 107 | return False 108 | 109 | def _get_xsrf(self): 110 | """ 111 | 从登录页面获取 xsrf 112 | :return: str 113 | """ 114 | resp = self.session.get('https://www.zhihu.com/', allow_redirects=False) 115 | xsrf = resp.cookies['_xsrf'] 116 | return xsrf 117 | 118 | def _get_udid(self, headers): 119 | """ 120 | 从uuid接口获得 uuid 121 | :param headers: 带授权信息的请求头部 122 | :return: str 123 | """ 124 | resp = self.session.post('https://www.zhihu.com/udid', headers=headers) 125 | udid = re.search(r'[\w=\-]+', resp.cookies['d_c0'])[0] 126 | return udid 127 | 128 | def _get_captcha(self, lang, headers): 129 | """ 130 | 请求验证码的 API 接口,无论是否需要验证码都需要请求一次 131 | 如果需要验证码会返回图片的 base64 编码 132 | 根据 lang 参数匹配验证码,需要人工输入 133 | :param lang: 返回验证码的语言(en/cn) 134 | :param headers: 带授权信息的请求头部 135 | :return: 验证码的 POST 参数 136 | """ 137 | if lang == 'cn': 138 | api = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=cn' 139 | else: 140 | api = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=en' 141 | resp = self.session.get(api, headers=headers) 142 | show_captcha = re.search(r'true', resp.text) 143 | 144 | if show_captcha: 145 | put_resp = self.session.put(api, headers=headers) 146 | json_data = json.loads(put_resp.text) 147 | img_base64 = json_data['img_base64'].replace(r'\n', '') 148 | with open('./captcha.jpg', 'wb') as f: 149 | f.write(base64.b64decode(img_base64)) 150 | img = Image.open('./captcha.jpg') 151 | if lang == 'cn': 152 | plt.imshow(img) 153 | print('点击所有倒立的汉字,按回车提交') 154 | points = plt.ginput(7) 155 | capt = json.dumps({'img_size': [200, 44], 156 | 'input_points': [[i[0]/2, i[1]/2] for i in points]}) 157 | else: 158 | img.show() 159 | capt = input('请输入图片里的验证码:') 160 | # 这里必须先把参数 POST 验证码接口 161 | self.session.post(api, data={'input_text': capt}, headers=headers) 162 | return capt 163 | return '' 164 | 165 | def _get_signature(self, timestamp): 166 | """ 167 | 通过 Hmac 算法计算返回签名 168 | 实际是几个固定字符串加时间戳 169 | :param timestamp: 时间戳 170 | :return: 签名 171 | """ 172 | ha = hmac.new(b'd1b964811afb40118a12068ff74a12f4', digestmod=hashlib.sha1) 173 | grant_type = self.login_data['grant_type'] 174 | client_id = self.login_data['client_id'] 175 | source = self.login_data['source'] 176 | ha.update(bytes((grant_type + client_id + source + timestamp), 'utf-8')) 177 | return ha.hexdigest() 178 | 179 | def _check_user_pass(self, username, password): 180 | """ 181 | 检查用户名和密码是否已输入,若无则手动输入 182 | """ 183 | if username is None: 184 | username = self.login_data.get('username') 185 | if not username: 186 | username = input('请输入手机号:') 187 | if len(username) == 11 and username.isdigit() and '+86' not in username: 188 | username = '+86' + username 189 | 190 | if password is None: 191 | password = self.login_data.get('password') 192 | if not password: 193 | password = input('请输入密码:') 194 | return username, password 195 | 196 | 197 | if __name__ == '__main__': 198 | account = ZhihuAccount() 199 | account.login(username=None, password=None, captcha_lang='en', load_cookies=True) -------------------------------------------------------------------------------- /some/zhihu3.py: -------------------------------------------------------------------------------- 1 | # -*- coding:UTF-8 -*- 2 | 3 | import requests, time 4 | import hmac, json 5 | from bs4 import BeautifulSoup 6 | from hashlib import sha1 7 | 8 | 9 | def 
get_captcha(data, need_cap):
10 |     ''' Handle the captcha '''
11 |     if need_cap is False:
12 |         return
13 |     with open('captcha.gif', 'wb') as fb:
14 |         fb.write(data)
15 |     return input('captcha:')
16 | 
17 | 
18 | def get_signature(grantType, clientId, source, timestamp):
19 |     ''' Compute the request signature '''
20 | 
21 |     hm = hmac.new(b'd1b964811afb40118a12068ff74a12f4', None, sha1)
22 |     hm.update(str.encode(grantType))
23 |     hm.update(str.encode(clientId))
24 |     hm.update(str.encode(source))
25 |     hm.update(str.encode(timestamp))
26 | 
27 |     return str(hm.hexdigest())
28 | 
29 | 
30 | def login(username, password, oncaptcha, sessiona, headers):
31 |     ''' Handle the login '''
32 | 
33 |     resp1 = sessiona.get('https://www.zhihu.com/signin', headers=headers)  # fetch cookie: _xsrf
34 |     resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',
35 |                          headers=headers)  # fetch cookie: capsion_ticket
36 |     need_cap = json.loads(resp2.text)["show_captcha"]  # {"show_captcha":false} means no captcha is required
37 | 
38 |     grantType = 'password'
39 |     clientId = 'c3cef7c66a1843f8b3a9e6a1e3160e20'
40 |     source = 'com.zhihu.web'
41 |     timestamp = str((time.time() * 1000)).split('.')[0]  # the signature depends only on this timestamp
42 | 
43 |     captcha_content = sessiona.get('https://www.zhihu.com/captcha.gif?r=%d&type=login' % (time.time() * 1000),
44 |                                    headers=headers).content
45 | 
46 |     data = {
47 |         "client_id": clientId,
48 |         "grant_type": grantType,
49 |         "timestamp": timestamp,
50 |         "source": source,
51 |         "signature": get_signature(grantType, clientId, source, timestamp),  # compute the signature
52 |         "username": username,
53 |         "password": password,
54 |         "lang": "cn",
55 |         "captcha": oncaptcha(captcha_content, need_cap),  # get the image captcha
56 |         "ref_source": "other_",
57 |         "utm_source": ""
58 |     }
59 | 
60 |     print("**2**: " + str(data))
61 |     print("-" * 50)
62 |     resp = sessiona.post('https://www.zhihu.com/api/v3/oauth/sign_in', data, headers=headers).content
63 |     print(BeautifulSoup(resp, 'html.parser'))
64 | 
65 |     print("-" * 50)
66 |     return resp
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     sessiona = requests.Session()
71 |     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0',
72 |                'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'}
73 | 
74 |     login('fendushu@163.com', 'ZHANG2338', get_captcha, sessiona, headers)  # swap in your own username and password
75 |     resp = sessiona.get('https://www.zhihu.com/inbox', headers=headers)  # logged in; private messages are now readable
76 |     print(BeautifulSoup(resp.content, 'html.parser'))
77 | 
78 | ### chcp 65001 (switch the cmd code page on Windows)
79 | ### python c:\python34\login_zhihu.py
80 | ### something rather baffling happened here; it looked as if the code had not taken effect
--------------------------------------------------------------------------------
/zhihu/denglu.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/zhihu/denglu.py
--------------------------------------------------------------------------------