├── 58tongcheng
│   ├── test1.py
│   └── test2.py
├── 91porn.py
├── README.md
├── download-citation
│   ├── 2018-12-13-22-18-39.ris
│   ├── 2018-12-13.ris
│   ├── pa.py
│   ├── pa1.py
│   ├── ro.ris
│   ├── ros.html
│   └── springer.py
├── download_biao_qing_win.py
├── huaban.py
├── ip_pachong.py
├── login.py
├── login2.py
├── meizitu3.py
├── meizitu_pro.py
├── meizitu_pro2.py
├── my_blog
│   ├── article
│   │   └── templatetags
│   │       ├── __init__.py
│   │       └── custom_markdown.py
│   └── templates
│       ├── aboutme.html
│       ├── archives.html
│       ├── base.html
│       ├── home.html
│       ├── post.html
│       ├── tag.html
│       └── test.html
├── paqubiaoqing.py
├── porn
│   ├── down_video.py
│   └── test1.py
├── requests1.py
├── requests2.py
├── requests3.py
├── scraping_ajax.py
├── selenium
│   ├── test1.py
│   ├── test2.py
│   ├── test3.py
│   └── test4.py
├── some
│   ├── aj.py
│   ├── pa.py
│   ├── pa1.py
│   ├── springer.py
│   ├── xuanke.py
│   ├── xuanke2.py
│   ├── zhihu.py
│   ├── zhihu2.py
│   └── zhihu3.py
└── zhihu
    └── denglu.py
/58tongcheng/test1.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/58tongcheng/test1.py
--------------------------------------------------------------------------------
/58tongcheng/test2.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/58tongcheng/test2.py
--------------------------------------------------------------------------------
/91porn.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | import requests, re, random, time, os, csv
5 | from bs4 import BeautifulSoup as bs
6 | from parsel import Selector
7 |
8 | headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
9 | 'Accept-Encoding':'gzip, deflate, sdch',
10 | 'Accept-Language':'zh-CN,zh;q=0.8',
11 | 'Cache-Control':'max-age=0',
12 | 'Connection':'keep-alive',
13 | 'DNT':'1',
14 | 'Host':'email.91dizhi.at.gmail.com.8h9.space',
15 | 'Upgrade-Insecure-Requests':'1',
16 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
17 |
18 | def download_urls(url):
19 | r = requests.get(url, headers=headers, timeout=30)
20 | r.encoding = 'utf-8'
21 | html = r.text
22 | obj = bs(html, 'html.parser')
23 | lists = obj.find_all('div', {'class': re.compile('imagechannel.*?')})
24 | for i in lists:
25 | try:
26 | a = i.find('a')
27 | video_url = a.attrs['href']
28 | img_url = a.find('img').attrs['src']
29 | title = a.find('img').attrs['title']
30 | print(video_url, img_url, title)
31 |
32 | with open('91porn_all.csv', 'a', newline='', encoding='utf_8_sig') as csvfile:
33 | ww = csv.writer(csvfile, dialect='excel')
34 | ww.writerow([title, img_url, video_url])
35 | except:
36 | continue
37 |
38 | def crawl_urls(n):
39 | for i in range(1,n+1):
40 | url = 'http://email.91dizhi.at.gmail.com.8h9.space/v.php?category=mf&viewtype=basic&page=' + str(i)
41 | try: # try up to three times; if all three requests fail, skip this page and crawl the next one
42 | download_urls(url)
43 | except:
44 | try:
45 | download_urls(url)
46 | except:
47 | try:
48 | download_urls(url)
49 | except:
50 | continue
51 | time.sleep(0.001)
52 |
53 | n = 3526 # total number of pages
54 | crawl_urls(n)
55 |
--------------------------------------------------------------------------------
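The nested try/except in crawl_urls above encodes "retry up to three times, then skip the page". A minimal sketch of the same behaviour written as a loop; it reuses download_urls from 91porn.py, and the retries parameter is an assumption, not part of the repo:

import time

def crawl_urls(n, retries=3):
    # Sketch only: equivalent of the nested try/except retry in 91porn.py.
    for i in range(1, n + 1):
        url = 'http://email.91dizhi.at.gmail.com.8h9.space/v.php?category=mf&viewtype=basic&page=' + str(i)
        for _ in range(retries):
            try:
                download_urls(url)   # defined earlier in 91porn.py
                break                # success: stop retrying this page
            except Exception:
                continue             # failed attempt; after `retries` failures the page is skipped
        time.sleep(0.001)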
/README.md:
--------------------------------------------------------------------------------
1 | # Python-spider
2 | ## Python web crawler tutorials
3 |
4 | ### Meizitu image crawler
5 |
6 | By [Jim-Bin](https://github.com/Jim-bin).
7 |
8 | #### Description
9 |
10 | A crawler for [Meizitu](http://www.meizitu.com/)
11 |
12 | #### Download meizitu3.py
13 | #### Installation
14 |
15 | > pip install bs4
16 |
17 | > pip install requests
18 |
19 | #### Usage
20 |
21 | * Meizitu: `python meizitu3.py`
22 |
--------------------------------------------------------------------------------
/download-citation/pa.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | '''
4 | Tested in the following environments:
5 | Python 2.7.15 or 3.7.0
6 | Windows 10 or Lubuntu
7 | '''
8 |
9 | # import modules
10 | import time
11 | import requests, re, random, os
12 | from bs4 import BeautifulSoup
13 | from requests import Session
14 |
15 | session = Session()
16 |
17 |
18 | '''
19 | Given a number of pages, crawl the URL of every image on each page; opening such a URL shows the page the image lives on.
20 | All URLs are collected into a list.
21 | '''
22 |
23 |
24 | def scrapy_img_urls(nums):
25 | lss = []
26 | for num in range(1, nums + 1):
27 | url = 'http://www.doutula.com/photo/list/?page=' + str(num)
28 | html = requests.get(url, headers=headers)
29 | html.encoding = 'utf-8'
30 |
31 | text = html.text
32 | bsop = BeautifulSoup(text, 'html.parser')
33 | ass = bsop.find('div', {'class': 'page-content'}).find('div').findAll('a')
34 |
35 | for a in ass:
36 | # print(a.attrs['href'])
37 | lss.append(a.attrs['href'])
38 | time.sleep(1)
39 | return lss
40 |
41 |
42 | '''
43 | Take the URL of an image page, open it and find the real address of the image, which can then be used to download it.
44 | Once the real URL and the name have been found, call download_img to download the image.
45 | '''
46 |
47 |
48 | def download_img_url(url):
49 | html = requests.get(url, headers=headers)
50 | html.encoding = 'utf-8'
51 |
52 | text = html.text
53 | bsop = BeautifulSoup(text, 'html.parser')
54 | img = bsop.find('div', {'class': 'col-xs-12 col-sm-12 artile_des'})
55 | img_url = img.find('img').attrs['src']
56 | img_title = img.find('img').attrs['alt']
57 | print(img_url + " " + img_title)
58 |
59 | download_img(img_url, img_title)
60 |
61 |
62 | '''
63 | Download an image. The function takes two arguments: the real address of the image and its name.
64 | Special characters in the name must be handled (via format_name), otherwise the file may not save on Windows.
65 | Save the image into the target folder, creating it if it does not exist.
66 | '''
67 |
68 |
69 | def download_img(img_url, img_title):
70 | img_title = format_name(img_title) # special characters in the album name must be removed, otherwise the folder cannot be created on Windows
71 | if not os.path.exists(file_path):
72 | os.makedirs(file_path)
73 | os.chdir(file_path)
74 |
75 | # save the image locally
76 | exists = os.path.exists(img_title)
77 | if not exists:
78 | img_html = requests.get(img_url, headers=headers, stream=True, timeout=20, verify=True)
79 | img_html.encoding = 'utf-8'
80 | with open(img_title + ".gif", 'wb') as f:
81 | f.write(img_html.content)
82 | f.close()
83 |
84 |
85 | def format_name(img_title):
86 | '''
87 | Clean up the name: if it contains any of the characters listed below, strip that character out.
88 | :param img_title:
89 | :return:
90 | '''
91 | for i in ['\\', '/', ':', '*', '?', '"', '<', '>', '!', '|']:
92 | while i in img_title:
93 | img_title = img_title.strip().replace(i, '')
94 | return img_title
95 |
96 |
97 | def royal(url):
98 | html = requests.get(url, headers=headers, stream=True, timeout=20, verify=True)
99 | html.encoding = 'utf-8'
100 | text = html.text
101 | bsop = BeautifulSoup(text, 'html.parser')
102 | timeofissued = bsop.find('meta', {'name':'DC.issued'}).attrs['content'].split('/')[0]
103 | citation_title = bsop.find('meta', {'name':'citation_title'}).attrs['content']
104 | citation_journal_title = bsop.find('meta', {'name':'citation_journal_title'}).attrs['content']
105 | citation_journal_abbrev = bsop.find('meta', {'name':'citation_journal_abbrev'}).attrs['content']
106 | citation_volume = bsop.find('meta', {'name':'citation_volume'}).attrs['content']
107 | citation_issue = bsop.find('meta', {'name':'citation_issue'}).attrs['content']
108 | citation_firstpage = bsop.find('meta', {'name':'citation_firstpage'}).attrs['content']
109 | citation_lastpage = bsop.find('meta', {'name':'citation_lastpage'}).attrs['content']
110 | citation_doi = bsop.find('meta', {'name':'citation_doi'}).attrs['content']
111 | PB = bsop.find('meta', {'name':'DC.publisher'}).attrs['content']
112 | M3 = citation_doi
113 | citation_url = 'http://dx.doi.org/' + citation_doi
114 | citation_abstract = bsop.find('meta', {'name':'citation_abstract'}).attrs['content'].strip()
115 | SN = bsop.find('div', {'class':'article-nav__issue autopad--h'}).find('a').attrs['href'].split('=')[-1]
116 |
117 | with open(citation_title + ".ris", 'w') as f:
118 | f.write('TY - JOUR\n')
119 | f.write('T1 - ' + citation_title + '\n')
120 | f.write('Y1 - ' + timeofissued + '\n')
121 | f.write('SP - ' + citation_firstpage + '\n')
122 | f.write('EP - ' + citation_lastpage + '\n')
123 | f.write('JF - ' + citation_journal_title + '\n')
124 | f.write('JO - ' + citation_journal_abbrev + '\n')
125 | f.write('VL - ' + citation_volume + '\n')
126 | f.write('RS - ' + citation_issue + '\n')
127 | f.write('PB - ' + PB + '\n')
128 | f.write('SN - ' + SN + '\n')
129 | f.write('DO - ' + citation_doi + '\n')
130 | f.write('M3 - ' + M3 + '\n')
131 | f.write('UR - ' + citation_url + '\n')
132 | print(citation_url)
133 | f.write('N2 - ' + citation_abstract + '\n')
134 | print(citation_abstract)
135 |
136 | authors = bsop.findAll('span', {'class': 'article__author-link'})
137 | for author in authors:
138 | author = author.find('a').text.split(' ')
139 | author = author[-1] + ', ' + ' '.join(author[:-1])
140 | f.write('A1 - ' + author + '\n')
141 | f.write('ER - ' + '\n')
142 | f.close()
143 |
144 | # authors = bsop.findAll('span', {'class':'article__author-link'})
145 | # for author in authors:
146 | # author = author.find('a').text.split(' ')
147 | # author = author[-1] + ', ' + ' '.join(author[:-1])
148 | # with open(author + ".ris", 'w') as f:
149 | # f.write('TY - JOUR')
150 | # f.write('T1 - ' + citation_title)
151 | # f.write('T1 - ' + authors)
152 | # f.close()
153 |
154 | # print(author)
155 | # print(timeofissued)
156 |
157 |
158 |
159 |
160 |
161 | # print(authors)
162 | # with open("ro.ris", 'wb') as f:
163 | # f.write(html.content)
164 | # f.close()
165 |
166 |
167 | def scawurls(url):
168 |
169 | headers1 = {
170 | 'Accept':'text/html, */*; q=0.01',
171 | 'Connection': 'keep-alive',
172 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
173 | 'DNT':'1',
174 | 'Host':'pubs.rsc.org',
175 | 'Origin':'https://pubs.rsc.org',
176 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
177 | 'X-NewRelic-ID':'VQYFWF9aDBABV1laBgcFUw ==',
178 | 'X-Requested-With':'XMLHttpRequest'
179 | }
180 |
181 | data = {
182 | 'searchterm': 'AAEAAAD/////AQAAAAAAAAAMAgAAAGNSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGwFAQAAADlSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLlNlYXJjaFRlcm0OAAAAGTxDYXRlZ29yeT5rX19CYWNraW5nRmllbGQcPFN1YkNhdGVnb3J5PmtfX0JhY2tpbmdGaWVsZBw8Q29udGVudFR5cGU + a19fQmFja2luZ0ZpZWxkGjxDcml0ZXJpYXM + a19fQmFja2luZ0ZpZWxkFzxGYWNldHM + a19fQmFja2luZ0ZpZWxkHDxSZXF1ZXN0VGltZT5rX19CYWNraW5nRmllbGQfPEF1dGhvckNyaXRlcmlhPmtfX0JhY2tpbmdGaWVsZCA8UHVibGljYXRpb25EYXRlPmtfX0JhY2tpbmdGaWVsZBk8RXhjbHVkZXM + a19fQmFja2luZ0ZpZWxkFzxTb3VyY2U + a19fQmFja2luZ0ZpZWxkHzxPdXRwdXRTdGFuZGFyZD5rX19CYWNraW5nRmllbGQePFJlc3VsdHNGb3JtYXQ + a19fQmFja2luZ0ZpZWxkHjxEaXNwbGF5Q291bnRzPmtfX0JhY2tpbmdGaWVsZCA8UHJvZHVjdFBhZ2VTaXplPmtfX0JhY2tpbmdGaWVsZAEBAQMDAAQEAwEBAQEBwgFTeXN0ZW0uQ29sbGVjdGlvbnMuR2VuZXJpYy5MaXN0YDFbW1JTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXcIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0NPVJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guQXV0aG9yQ3JpdGVyaWECAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlAgAAAMIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0CAAAABgMAAAADQWxsCgYEAAAAA0FsbAkFAAAACQYAAAAAAAAAAAAAAAkHAAAACQgAAAAJCQAAAAoKCgoKBAUAAADCAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWUsIFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cywgVmVyc2lvbj0yMDE4LjAuNTQ5LjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49bnVsbF1dAwAAAAZfaXRlbXMFX3NpemUIX3ZlcnNpb24EAAA6UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWVbXQIAAAAICAkKAAAABAAAAAQAAAABBgAAAAUAAAAJCwAAAAAAAAAAAAAABQcAAAA9UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JDcml0ZXJpYQIAAAAgPEJvb2xlYW5PcGVyYXRvcj5rX19CYWNraW5nRmllbGQYPEF1dGhvcnM + a19fQmFja2luZ0ZpZWxkAQPDAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JJbmZvLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXQIAAAAKCgUIAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlBQAAAB88SXNTZWxlY3RlZERhdGU + a19fQmFja2luZ0ZpZWxkGTxEYXRlVHlwZT5rX19CYWNraW5nRmllbGQbPFdpdGhJbkxhc3Q + a19fQmFja2luZ0ZpZWxkGjxEYXRlUmFuZ2U + 
a19fQmFja2luZ0ZpZWxkHDxEaXNwbGF5RGF0ZT5rX19CYWNraW5nRmllbGQAAQQEAQE5UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5XaXRoSW5MYXN0AgAAADhSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLkRhdGVSYW5nZQIAAAACAAAAAAoKCgoBCQAAAAUAAAAJCwAAAAAAAAAAAAAABwoAAAAAAQAAAAQAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAkMAAAACQ0AAAAJDgAAAAkPAAAABwsAAAAAAQAAAAAAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAUMAAAAOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlBAAAABU8TmFtZT5rX19CYWNraW5nRmllbGQcPERpc3BsYXlOYW1lPmtfX0JhY2tpbmdGaWVsZBY8VmFsdWU + a19fQmFja2luZ0ZpZWxkIDxCb29sZWFuT3BlcmF0b3I + a19fQmFja2luZ0ZpZWxkAQEBAQIAAAAGEAAAAAhmcmVldGV4dAoGEQAAAG9kZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtIEFORCBDdSwgT1IgY29wcGVyLCBPUiBlbGVjdHJvbGVzcywgT1IgcHJpbnRpbmcsIE9SIGZsZXhpYmxlLCBPUiBzdWJzdHJhdGUsIE9SIHBsYXN0aWMKAQ0AAAAMAAAABhIAAAAHQWxsVGV4dAoGEwAAABlkZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtCgEOAAAADAAAAAYUAAAAC0F0bGVhc3RUZXh0CgYVAAAAP0N1LCBjb3BwZXIsIGVsZWN0cm9sZXNzLCBwcmludGluZywgZmxleGlibGUsIHN1YnN0cmF0ZSwgcGxhc3RpYwoBDwAAAAwAAAAGFgAAABBPcmlnaW5hbEZyZWVUZXh0CgYXAAAAb2RlcG9zaXRpb24sIHBhdHRlcm4sIGZpbG0gQU5EIEN1LCBPUiBjb3BwZXIsIE9SIGVsZWN0cm9sZXNzLCBPUiBwcmludGluZywgT1IgZmxleGlibGUsIE9SIHN1YnN0cmF0ZSwgT1IgcGxhc3RpYwoL',
183 | 'resultcount': '282607',
184 | 'category': 'all',
185 | 'pageno': '2'
186 | }
187 |
188 | html = requests.post(url, data=data, headers=headers1, stream=True, timeout=20, verify=True)
189 | html.encoding = 'utf-8'
190 | text = html.text
191 | # print(text)
192 | bsop = BeautifulSoup(text, 'html.parser')
193 | divs = bsop.findAll('div', {'class': 'capsule capsule--article '})
194 | for i in divs:
195 | article_url = 'https://pubs.rsc.org' + i.find('a').attrs['href']
196 | print(article_url)
197 | # royal(article_url)
198 |
199 | # with open("ros.html", 'wb') as f:
200 | # f.write(html.content)
201 | # f.close()
202 | # print(text)
203 |
204 | # session.head('https://pubs.rsc.org/en/results/all?Category=All&AllText=deposition%2C%20pattern%2C%20film&AtleastText=Cu%2C%20copper%2C%20electroless%2C%20printing%2C%20flexible%2C%20substrate%2C%20plastic&IncludeReference=false&SelectJournal=false&DateRange=false&SelectDate=false&Type=Months&DateFromMonth=Months&DateToMonth=Months&PriceCode=False&OpenAccess=false')
205 |
206 | # build headers
207 | UserAgent_List = [
208 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
209 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
210 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
211 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
212 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
213 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
214 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
215 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
216 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
217 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
218 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
219 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
220 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
221 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
222 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
223 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
224 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
225 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
226 | ]
227 | headers = {'User-Agent': random.choice(UserAgent_List),
228 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
229 | 'Accept-Encoding': 'gzip',
230 | }
231 |
232 | url = 'https://pubs.rsc.org/en/search/journalresult'
233 | scawurls(url)
234 |
235 |
236 |
237 | # url = 'https://pubs.rsc.org/en/content/articlelanding/2017/tc/c7tc00038c#!divAbstract'
238 | # royal(url)
239 |
240 | # nums = 5
241 | # # image save path on Linux
242 | # file_path = '/home/zhangyb/downloadfiles/pythonpro/biaoqing'
243 | # # image save path on Windows
244 | # # file_path = 'E:\downloadfiles\pythonpro\biaoqing'
245 | # urls = scrapy_img_urls(nums)
246 | # for i in urls:
247 | # print(i)
248 | # download_img_url(i)
249 |
250 |
251 | # url = 'https://pubs.rsc.org/en/results/all?Category=All&AllText=deposition%2C%20pattern%2C%20film&AtleastText=Cu%2C%20copper%2C%20electroless%2C%20printing%2C%20flexible%2C%20substrate%2C%20plastic&IncludeReference=false&SelectJournal=false&DateRange=false&SelectDate=false&Type=Months&DateFromMonth=Months&DateToMonth=Months&PriceCode=False&OpenAccess=false'
252 | # r = requests.get(url, headers=headers)
253 | # print(r.text)
254 |
--------------------------------------------------------------------------------
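royal() above emits each RIS tag with its own f.write call. A hedged sketch of the same record built from a list of (tag, value) pairs; write_ris and the fields list are illustrative names, not part of pa.py:

def write_ris(f, fields, authors):
    # Sketch only: write one RIS record from (tag, value) pairs.
    f.write('TY - JOUR\n')
    for tag, value in fields:
        f.write(tag + ' - ' + value + '\n')
    for author in authors:            # authors already formatted as "Last, First"
        f.write('A1 - ' + author + '\n')
    f.write('ER - \n')

# fields would be built from the metadata collected in royal(), e.g.
# [('T1', citation_title), ('Y1', timeofissued), ('SP', citation_firstpage),
#  ('EP', citation_lastpage), ('JF', citation_journal_title), ('DO', citation_doi)]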
/download-citation/pa1.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | import random
5 | from bs4 import BeautifulSoup
6 | import time
7 |
8 | download_time = time.strftime("%Y-%m-%d", time.localtime())
9 |
10 |
11 | def royal(article_urls):
12 | for article_url in article_urls:
13 | # try:
14 | html = requests.get(article_url, headers=headers, stream=True, timeout=20, verify=True)
15 | html.encoding = 'utf-8'
16 | text = html.text
17 | bsop = BeautifulSoup(text, 'html.parser')
18 | try:
19 | timeofissued = bsop.find('meta', {'name':'DC.issued'}).attrs['content'].split('/')[0]
20 | except:
21 | pass
22 | try:
23 | citation_title = bsop.find('meta', {'name':'citation_title'}).attrs['content']
24 | except:
25 | pass
26 | try:
27 | citation_journal_title = bsop.find('meta', {'name':'citation_journal_title'}).attrs['content']
28 | except:
29 | pass
30 | try:
31 | citation_journal_abbrev = bsop.find('meta', {'name':'citation_journal_abbrev'}).attrs['content']
32 | except:
33 | pass
34 | try:
35 | citation_volume = bsop.find('meta', {'name':'citation_volume'}).attrs['content']
36 | except:
37 | pass
38 | try:
39 | citation_issue = bsop.find('meta', {'name':'citation_issue'}).attrs['content']
40 | except:
41 | pass
42 | try:
43 | citation_firstpage = bsop.find('meta', {'name':'citation_firstpage'}).attrs['content']
44 | except:
45 | pass
46 | try:
47 | citation_lastpage = bsop.find('meta', {'name':'citation_lastpage'}).attrs['content']
48 | except:
49 | pass
50 | try:
51 | citation_doi = bsop.find('meta', {'name':'citation_doi'}).attrs['content']
52 | except:
53 | pass
54 | try:
55 | PB = bsop.find('meta', {'name':'DC.publisher'}).attrs['content']
56 | except:
57 | pass
58 | try:
59 | M3 = citation_doi
60 | except:
61 | pass
62 | try:
63 | citation_url = 'http://dx.doi.org/' + citation_doi
64 | except:
65 | pass
66 | try:
67 | citation_abstract = bsop.find('meta', {'name':'citation_abstract'}).attrs['content'].strip()
68 | except:
69 | pass
70 | try:
71 | SN = bsop.find('div', {'class':'article-nav__issue autopad--h'}).find('a').attrs['href'].split('=')[-1]
72 | except:
73 | pass
74 | # except:
75 | # print(article_url)
76 | # continue
77 |
78 | with open(download_time + ".ris", 'a', encoding='utf-8') as f:
79 | f.write('TY - JOUR\n')
80 | f.write('T1 - ' + citation_title + '\n')
81 | f.write('Y1 - ' + timeofissued + '\n')
82 | f.write('SP - ' + citation_firstpage + '\n')
83 | f.write('EP - ' + citation_lastpage + '\n')
84 | f.write('JF - ' + citation_journal_title + '\n')
85 | f.write('JO - ' + citation_journal_abbrev + '\n')
86 | f.write('VL - ' + citation_volume + '\n')
87 | f.write('RS - ' + citation_issue + '\n')
88 | f.write('PB - ' + PB + '\n')
89 | f.write('SN - ' + SN + '\n')
90 | f.write('DO - ' + citation_doi + '\n')
91 | f.write('M3 - ' + M3 + '\n')
92 | f.write('UR - ' + citation_url + '\n')
93 | print(citation_url)
94 | f.write('N2 - ' + citation_abstract + '\n')
95 | # print(citation_abstract)
96 |
97 | authors = bsop.findAll('span', {'class': 'article__author-link'})
98 | for author in authors:
99 | author = author.find('a').text.split(' ')
100 | author = author[-1] + ', ' + ' '.join(author[:-1])
101 | f.write('A1 - ' + author + '\n')
102 | f.write('ER - ' + '\n\n\n')
103 | f.close()
104 | time.sleep(1)
105 |
106 |
107 | def crawl_article_url(nums):
108 | article_urls = []
109 | for num in range(1, nums+1):
110 |
111 | url = 'https://pubs.rsc.org/en/search/journalresult'
112 |
113 | headers1 = {
114 | 'Accept':'text/html, */*; q=0.01',
115 | 'Connection': 'keep-alive',
116 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
117 | 'DNT':'1',
118 | 'Host':'pubs.rsc.org',
119 | 'Origin':'https://pubs.rsc.org',
120 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
121 | 'X-NewRelic-ID':'VQYFWF9aDBABV1laBgcFUw ==',
122 | 'X-Requested-With':'XMLHttpRequest'
123 | }
124 |
125 | data = {
126 | 'searchterm': 'AAEAAAD/////AQAAAAAAAAAMAgAAAGNSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGwFAQAAADlSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLlNlYXJjaFRlcm0OAAAAGTxDYXRlZ29yeT5rX19CYWNraW5nRmllbGQcPFN1YkNhdGVnb3J5PmtfX0JhY2tpbmdGaWVsZBw8Q29udGVudFR5cGU + a19fQmFja2luZ0ZpZWxkGjxDcml0ZXJpYXM + a19fQmFja2luZ0ZpZWxkFzxGYWNldHM + a19fQmFja2luZ0ZpZWxkHDxSZXF1ZXN0VGltZT5rX19CYWNraW5nRmllbGQfPEF1dGhvckNyaXRlcmlhPmtfX0JhY2tpbmdGaWVsZCA8UHVibGljYXRpb25EYXRlPmtfX0JhY2tpbmdGaWVsZBk8RXhjbHVkZXM + a19fQmFja2luZ0ZpZWxkFzxTb3VyY2U + a19fQmFja2luZ0ZpZWxkHzxPdXRwdXRTdGFuZGFyZD5rX19CYWNraW5nRmllbGQePFJlc3VsdHNGb3JtYXQ + a19fQmFja2luZ0ZpZWxkHjxEaXNwbGF5Q291bnRzPmtfX0JhY2tpbmdGaWVsZCA8UHJvZHVjdFBhZ2VTaXplPmtfX0JhY2tpbmdGaWVsZAEBAQMDAAQEAwEBAQEBwgFTeXN0ZW0uQ29sbGVjdGlvbnMuR2VuZXJpYy5MaXN0YDFbW1JTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXcIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0NPVJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guQXV0aG9yQ3JpdGVyaWECAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlAgAAAMIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0CAAAABgMAAAADQWxsCgYEAAAAA0FsbAkFAAAACQYAAAAAAAAAAAAAAAkHAAAACQgAAAAJCQAAAAoKCgoKBAUAAADCAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWUsIFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cywgVmVyc2lvbj0yMDE4LjAuNTQ5LjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49bnVsbF1dAwAAAAZfaXRlbXMFX3NpemUIX3ZlcnNpb24EAAA6UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWVbXQIAAAAICAkKAAAABAAAAAQAAAABBgAAAAUAAAAJCwAAAAAAAAAAAAAABQcAAAA9UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JDcml0ZXJpYQIAAAAgPEJvb2xlYW5PcGVyYXRvcj5rX19CYWNraW5nRmllbGQYPEF1dGhvcnM + a19fQmFja2luZ0ZpZWxkAQPDAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JJbmZvLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXQIAAAAKCgUIAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlBQAAAB88SXNTZWxlY3RlZERhdGU + a19fQmFja2luZ0ZpZWxkGTxEYXRlVHlwZT5rX19CYWNraW5nRmllbGQbPFdpdGhJbkxhc3Q + a19fQmFja2luZ0ZpZWxkGjxEYXRlUmFuZ2U + 
a19fQmFja2luZ0ZpZWxkHDxEaXNwbGF5RGF0ZT5rX19CYWNraW5nRmllbGQAAQQEAQE5UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5XaXRoSW5MYXN0AgAAADhSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLkRhdGVSYW5nZQIAAAACAAAAAAoKCgoBCQAAAAUAAAAJCwAAAAAAAAAAAAAABwoAAAAAAQAAAAQAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAkMAAAACQ0AAAAJDgAAAAkPAAAABwsAAAAAAQAAAAAAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAUMAAAAOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlBAAAABU8TmFtZT5rX19CYWNraW5nRmllbGQcPERpc3BsYXlOYW1lPmtfX0JhY2tpbmdGaWVsZBY8VmFsdWU + a19fQmFja2luZ0ZpZWxkIDxCb29sZWFuT3BlcmF0b3I + a19fQmFja2luZ0ZpZWxkAQEBAQIAAAAGEAAAAAhmcmVldGV4dAoGEQAAAG9kZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtIEFORCBDdSwgT1IgY29wcGVyLCBPUiBlbGVjdHJvbGVzcywgT1IgcHJpbnRpbmcsIE9SIGZsZXhpYmxlLCBPUiBzdWJzdHJhdGUsIE9SIHBsYXN0aWMKAQ0AAAAMAAAABhIAAAAHQWxsVGV4dAoGEwAAABlkZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtCgEOAAAADAAAAAYUAAAAC0F0bGVhc3RUZXh0CgYVAAAAP0N1LCBjb3BwZXIsIGVsZWN0cm9sZXNzLCBwcmludGluZywgZmxleGlibGUsIHN1YnN0cmF0ZSwgcGxhc3RpYwoBDwAAAAwAAAAGFgAAABBPcmlnaW5hbEZyZWVUZXh0CgYXAAAAb2RlcG9zaXRpb24sIHBhdHRlcm4sIGZpbG0gQU5EIEN1LCBPUiBjb3BwZXIsIE9SIGVsZWN0cm9sZXNzLCBPUiBwcmludGluZywgT1IgZmxleGlibGUsIE9SIHN1YnN0cmF0ZSwgT1IgcGxhc3RpYwoL',
127 | 'resultcount': '282607',
128 | 'category': 'all',
129 | 'pageno': str(num)
130 | }
131 |
132 | html = requests.post(url, data=data, headers=headers1, stream=True, timeout=20, verify=True)
133 | html.encoding = 'utf-8'
134 | text = html.text
135 | # print(text)
136 | bsop = BeautifulSoup(text, 'html.parser')
137 | divs = bsop.findAll('div', {'class': 'capsule capsule--article '})
138 | for i in divs:
139 | article_url = 'https://pubs.rsc.org' + i.find('a').attrs['href']
140 | # print(article_url)
141 | article_urls.append(article_url)
142 | print("第" + str(num) + "页爬取完毕")
143 | time.sleep(1)
144 | return article_urls
145 |
146 |
147 | # build headers
148 | UserAgent_List = [
149 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
150 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
151 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
152 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
153 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
154 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
155 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
156 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
157 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
158 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
159 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
160 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
161 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
162 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
163 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
164 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
165 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
166 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
167 | ]
168 | headers = {'User-Agent': random.choice(UserAgent_List),
169 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
170 | 'Accept-Encoding': 'gzip',
171 | }
172 | nums = 5 # number of pages to crawl
173 |
174 | article_urls = crawl_article_url(nums)
175 | royal(article_urls)
176 |
177 |
178 |
179 | # url = 'https://pubs.rsc.org/en/content/articlelanding/2017/tc/c7tc00038c#!divAbstract'
180 | # royal(url)
181 |
--------------------------------------------------------------------------------
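Each metadata field in royal() above is read inside its own try/except. A small helper with the same effect is sketched below; meta_content is a hypothetical name, and bsop is the BeautifulSoup object already built in royal():

def meta_content(bsop, name, default=''):
    # Sketch only: return the content attribute of a <meta name="..."> tag, or a default.
    tag = bsop.find('meta', {'name': name})
    return tag.get('content', default) if tag else default

# Example use inside royal():
# citation_title = meta_content(bsop, 'citation_title')
# citation_doi   = meta_content(bsop, 'citation_doi')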
/download-citation/ros.html:
--------------------------------------------------------------------------------
[Saved copy of an RSC search-results page; the export stripped the HTML markup, leaving only the fragments below.]

282607 items - Showing page 1 of 11305

Review Article: Science and technology roadmap for graphene, related two-dimensional crystals, and hybrid systems
Andrea C. Ferrari, Francesco Bonaccorso, Vladimir Fal'ko, Konstantin S. Novoselov, Stephan Roche, Peter Bøggild, Stefano Borini, Frank H. L. Koppens, Vincenzo Palermo, Nicola Pugno, José A. Garrido, Roman Sordan, Alberto Bianco, Laura Ballerini, Maurizio Prato, Elefterios Lidorikis, Jani Kivioja, Claudio Marinelli, Tapani Ryhänen, Alberto Morpurgo, Jonathan N. Coleman, Valeria Nicolosi, Luigi Colombo, Albert Fert, Mar Garcia-Hernandez, Adrian Bachtold, Grégory F. Schneider, Francisco Guinea, Cees Dekker, Matteo Barbone, Zhipei Sun, Costas Galiotis, Alexander N. Grigorenko, Gerasimos Konstantatos, Andras Kis, Mikhail Katsnelson, Lieven Vandersypen, Annick Loiseau, Vittorio Morandi, Daniel Neumaier, Emanuele Treossi, Vittorio Pellegrini, Marco Polini, Alessandro Tredicucci, Gareth M. Williams, Byung Hee Hong, Jong-Hyun Ahn, Jong Min Kim, Herbert Zirath, Bart J. van Wees, Herre van der Zant, Luigi Occhipinti, Andrea Di Matteo, Ian A. Kinloch, Thomas Seyller, Etienne Quesnel, Xinliang Feng, Ken Teo, Nalin Rupesinghe, Pertti Hakonen, Simon R. T. Neil, Quentin Tannock, Tomas Löfwander and Jari Kinaret
We present the science and technology roadmap for graphene, related two-dimensional crystals, and hybrid systems, targeting an evolution in technology, that might lead to impacts and benefits reaching into most areas of society.
--------------------------------------------------------------------------------
/download-citation/springer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | import random
5 | from bs4 import BeautifulSoup
6 | import time
7 |
8 | download_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
9 |
10 |
11 | def royal(article_urls):
12 | for article_url in article_urls:
13 | # try:
14 | html = requests.get(article_url, headers=headers, stream=True, timeout=20, verify=True)
15 | html.encoding = 'utf-8'
16 | text = html.text
17 | bsop = BeautifulSoup(text, 'html.parser')
18 | try:
19 | timeofissued = bsop.find('meta', {'name':'citation_cover_date'}).attrs['content'].split('/')[0]
20 | except:
21 | pass
22 | try:
23 | citation_title = bsop.find('meta', {'name':'citation_title'}).attrs['content']
24 | except:
25 | pass
26 | try:
27 | citation_journal_title = bsop.find('meta', {'name':'citation_journal_title'}).attrs['content']
28 | except:
29 | pass
30 | try:
31 | citation_journal_abbrev = bsop.find('meta', {'name':'citation_journal_abbrev'}).attrs['content']
32 | except:
33 | pass
34 | try:
35 | citation_volume = bsop.find('meta', {'name':'citation_volume'}).attrs['content']
36 | except:
37 | pass
38 | try:
39 | # citation_issue = bsop.find('meta', {'name':'citation_issue'}).attrs['content']
40 | citation_issue = bsop.find('span', {'id':'electronic-issn'}).text
41 | except:
42 | pass
43 | try:
44 | citation_firstpage = bsop.find('meta', {'name':'citation_firstpage'}).attrs['content']
45 | except:
46 | pass
47 | try:
48 | citation_lastpage = bsop.find('meta', {'name':'citation_lastpage'}).attrs['content']
49 | except:
50 | pass
51 | try:
52 | citation_doi = bsop.find('meta', {'name':'citation_doi'}).attrs['content']
53 | except:
54 | pass
55 | try:
56 | PB = bsop.find('meta', {'name':'citation_publisher'}).attrs['content']
57 | except:
58 | pass
59 | try:
60 | M3 = citation_doi
61 | except:
62 | pass
63 | try:
64 | citation_url = 'http://dx.doi.org/' + citation_doi
65 | except:
66 | pass
67 | try:
68 | # citation_abstract = bsop.find('p', {'id':'Par1'}).attrs['content'].strip()
69 | citation_abstract = bsop.find('p', {'id':'Par1'}).text
70 | except:
71 | pass
72 | try:
73 | # SN = bsop.find('div', {'class':'article-nav__issue autopad--h'}).find('a').attrs['href'].split('=')[-1]
74 | SN = bsop.find('span', {'id':'electronic-issn'}).text
75 | except:
76 | pass
77 | # except:
78 | # print(article_url)
79 | # continue
80 |
81 | with open(download_time + ".ris", 'a', encoding='utf-8') as f:
82 | f.write('TY - JOUR\n')
83 | f.write('T1 - ' + citation_title + '\n')
84 | f.write('Y1 - ' + timeofissued + '\n')
85 | f.write('SP - ' + citation_firstpage + '\n')
86 | f.write('EP - ' + citation_lastpage + '\n')
87 | f.write('JF - ' + citation_journal_title + '\n')
88 | f.write('JO - ' + citation_journal_abbrev + '\n')
89 | f.write('VL - ' + citation_volume + '\n')
90 | f.write('RS - ' + citation_issue + '\n')
91 | f.write('PB - ' + PB + '\n')
92 | f.write('SN - ' + SN + '\n')
93 | f.write('DO - ' + citation_doi + '\n')
94 | f.write('M3 - ' + M3 + '\n')
95 | f.write('UR - ' + citation_url + '\n')
96 | print(citation_url)
97 | f.write('N2 - ' + citation_abstract + '\n')
98 | # print(citation_abstract)
99 |
100 | authors = bsop.findAll('meta', {'name': 'citation_author'})
101 | for author in authors:
102 | # print(author)
103 | author = author.attrs['content'].split(" ")
104 | # print(author)
105 | author = author[-1] + ', ' + ' '.join(author[:-1])
106 | f.write('A1 - ' + author + '\n')
107 | f.write('ER - ' + '\n\n\n')
108 | f.close()
109 | time.sleep(1)
110 |
111 |
112 | def crawl_article_url(nums):
113 | article_urls = []
114 | for num in range(1, nums+1):
115 |
116 | url = 'https://link.springer.com/search/page/' + str(num) + '?date-facet-mode=between&facet-start-year=2010&facet-language=%22En%22&query=printing%2C+AND+Cu+AND+pattern%2C+AND+film%2C+AND+flexible%2C+AND+plastic%2C+AND+substrate%2C+AND+copper&facet-end-year=2019&showAll=true&facet-content-type=%22Article%22'
117 |
118 | html = requests.get(url, headers=headers, stream=True, timeout=20, verify=True)
119 | html.encoding = 'utf-8'
120 | text = html.text
121 | # print(text)
122 | bsop = BeautifulSoup(text, 'html.parser')
123 | divs = bsop.find('ol', {'id': 'results-list'}).findAll('li')
124 | for i in divs:
125 | # print(i)
126 | article_url = 'https://link.springer.com' + i.find('h2').find('a').attrs['href']
127 | print(article_url)
128 | article_urls.append(article_url)
129 | print("第" + str(num) + "页爬取完毕")
130 | time.sleep(1)
131 | return article_urls
132 |
133 |
134 | # build headers
135 | UserAgent_List = [
136 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
137 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
138 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
139 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
140 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
141 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
142 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
143 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
144 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
145 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
146 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
147 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
148 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
149 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
150 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
151 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
152 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
153 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
154 | ]
155 | headers = {'User-Agent': random.choice(UserAgent_List),
156 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
157 | 'Accept-Encoding': 'gzip',
158 | }
159 | nums = 1 # number of pages to crawl
160 |
161 | article_urls = crawl_article_url(nums)
162 | royal(article_urls)
--------------------------------------------------------------------------------
/download_biao_qing_win.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | '''
4 | Tested in the following environments:
5 | Python 2.7.15 or 3.7.0
6 | Windows 10 or Lubuntu
7 | '''
8 |
9 | # import modules
10 | import time
11 | import requests, re, random, os
12 | from bs4 import BeautifulSoup
13 |
14 | '''
15 | Given a number of pages, crawl the URL of every image on each page; opening such a URL shows the page the image lives on.
16 | All URLs are collected into a list.
17 | '''
18 | def scrapy_img_urls(nums):
19 | lss = []
20 | for num in range(1, nums+1):
21 | url = 'http://www.doutula.com/photo/list/?page=' + str(num)
22 | html = requests.get(url, headers=headers)
23 | html.encoding = 'utf-8'
24 |
25 | text = html.text
26 | bsop = BeautifulSoup(text, 'html.parser')
27 | ass = bsop.find('div', {'class': 'page-content'}).find('div').findAll('a')
28 |
29 | for a in ass:
30 | # print(a.attrs['href'])
31 | lss.append(a.attrs['href'])
32 | time.sleep(1)
33 | return lss
34 |
35 | '''
36 | Take the URL of an image page, open it and find the real address of the image, which can then be used to download it.
37 | Once the real URL and the name have been found, call download_img to download the image.
38 | '''
39 | def download_img_url(url):
40 | html = requests.get(url, headers=headers)
41 | html.encoding = 'utf-8'
42 |
43 | text = html.text
44 | bsop = BeautifulSoup(text, 'html.parser')
45 | img = bsop.find('div', {'class': 'col-xs-12 col-sm-12 artile_des'})
46 | img_url = img.find('img').attrs['src']
47 | img_title = img.find('img').attrs['alt']
48 | print(img_url + " " + img_title)
49 |
50 | download_img(img_url, img_title)
51 |
52 | '''
53 | Download an image. The function takes two arguments: the real address of the image and its name.
54 | Special characters in the name must be handled (via format_name), otherwise the file may not save on Windows.
55 | Save the image into the target folder, creating it if it does not exist.
56 | '''
57 | def download_img(img_url, img_title):
58 | img_title = format_name(img_title) # special characters in the album name must be removed, otherwise the folder cannot be created on Windows
59 | if not os.path.exists(file_path):
60 | os.makedirs(file_path)
61 | os.chdir(file_path)
62 |
63 | # save the image locally
64 | exists = os.path.exists(img_title)
65 | if not exists:
66 | img_html = requests.get(img_url, headers=headers, stream=True, timeout=20, verify=True)
67 | img_html.encoding = 'utf-8'
68 | with open(img_title + ".gif", 'wb') as f:
69 | f.write(img_html.content)
70 | f.close()
71 |
72 |
73 | def format_name(img_title):
74 | '''
75 | Clean up the name: if it contains any of the characters listed below, strip that character out.
76 | :param img_title:
77 | :return:
78 | '''
79 | for i in ['\\','/',':','*','?','"','<','>','!','|']:
80 | while i in img_title:
81 | img_title = img_title.strip().replace(i, '')
82 | return img_title
83 |
84 | # build headers
85 | UserAgent_List = [
86 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
87 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
88 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
89 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
90 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
91 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
92 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
93 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
94 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
95 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
96 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
97 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
98 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
99 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
100 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
101 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
102 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
103 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
104 | ]
105 | headers = {'User-Agent': random.choice(UserAgent_List),
106 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
107 | 'Accept-Encoding': 'gzip',
108 | }
109 |
110 | nums=5
111 | # image save path on Linux
112 | # file_path = '/home/zhangyb/downloadfiles/pythonpro/biaoqing'
113 | # image save path on Windows
114 |
115 | file_path = r'E:\downloadfiles\pythonpro\biaoqing'  # raw string so backslashes are not treated as escapes
116 | urls = scrapy_img_urls(nums)
117 | for i in urls:
118 | print(i)
119 | download_img_url(i)
120 |
121 |
122 | # download_img_url('http://www.doutula.com/photo/6437987')
123 | # download_img('https://ws1.sinaimg.cn/large/9150e4e5gy1fx94eo4pdwg203q02g0so.gif', u'好想打死你啊')
--------------------------------------------------------------------------------
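format_name above strips the characters Windows forbids in file names by looping over them. An equivalent one-liner with a regular expression, as a sketch only (format_name_re is a hypothetical name, not part of the repo):

import re

def format_name_re(img_title):
    # Sketch only: remove \ / : * ? " < > ! | so the title is a valid Windows file name.
    return re.sub(r'[\\/:*?"<>!|]', '', img_title).strip()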
/huaban.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | '''
4 | python 2.7.12
5 | '''
6 |
7 | import requests
8 | from parsel import Selector
9 | import time
10 | import re, random, os
11 |
12 |
13 | def scraw_pin_ids():
14 |
15 | pin_ids = []
16 | pin_id = '1068018182'
17 |
18 | flag = True
19 | while flag:
20 | try:
21 | url = "http://huaban.com/favorite/beauty/"
22 | headers1 = {
23 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
24 | 'Accept':'application/json',
25 | 'X-Request':'JSON',
26 | 'X-Requested-With':'XMLHttpRequest',
27 | }
28 |
29 | params = {
30 | 'j0l4lymf':'',
31 | 'max':pin_id,
32 | 'limit':'20',
33 | 'wfl':'1',
34 | }
35 |
36 | z1 = requests.get(url, params=params, headers=headers1)
37 |
38 | if z1.json()['pins']:
39 | for i in z1.json()['pins']:
40 | pin_ids.append(i['pin_id'])
41 | pin_id = pin_ids[-1]
42 | print i['pin_id']
43 | # with open("pin_ids.txt",'ab') as f:
44 | # f.write(str(i['pin_id'])+"\n")
45 | # f.close()
46 | time.sleep(0.001)
47 | else:
48 | flag = False
49 | return set(pin_ids)
50 | except:
51 | continue
52 |
53 | def scraw_urls(pin_ids):
54 |
55 | urls = []
56 |
57 | urlss = ['http://huaban.com/pins/' + str(i) +'/' for i in pin_ids]
58 | for url in urlss:
59 | try:
60 | headers = {
61 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
62 | }
63 |
64 | z3 = requests.get(url, headers=headers)
65 |
66 | text = z3.text
67 |
68 | pattern = re.compile('"key":"(.*?)"', re.S)
69 | items = re.findall(pattern, text)
70 |
71 | urls.extend(items)
72 | print items
73 | print '============================================================================================================'
74 | except:
75 | continue
76 | return set(urls)
77 |
78 | def download(urls):
79 | headers1 = {
80 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
81 | }
82 | n = 1
83 | urls = set(urls)
84 | for url in urls:
85 | try:
86 | if not os.path.exists(os.path.join(file_path, "huaban")):
87 | os.makedirs(os.path.join(file_path, "huaban"))
88 | os.chdir(file_path + '\\' + "huaban")
89 | try:
90 | url = 'http://img.hb.aicdn.com/' + url
91 | r = requests.get(url, headers=headers1)
92 | if len(r.content)>40000:
93 | with open(str(n)+".jpg", 'wb') as f:
94 | f.write(r.content)
95 | f.close()
96 | print u"第" + str(n) + u"张图片下载成功"
97 | n+=1
98 | # time.sleep(3)
99 | except:
100 | continue
101 | except:
102 | continue
103 |
104 | # image save path
105 | file_path = r'E:\selfprogress\programming\project\pa1024\huabannnnnnn'
106 | pin_ids = scraw_pin_ids()
107 | urls = scraw_urls(pin_ids)
108 | download(urls)
109 |
--------------------------------------------------------------------------------
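scraw_pin_ids above pages through Huaban's JSON feed by always passing the last pin_id it saw as the max parameter, until an empty pins list comes back. A condensed sketch of that pagination loop; iter_pin_ids is a hypothetical helper, while the URL and parameters are taken from huaban.py:

import requests

def iter_pin_ids(start_pin_id, headers):
    # Sketch only: yield pin ids page by page until the feed runs dry.
    pin_id = start_pin_id
    while True:
        params = {'max': pin_id, 'limit': '20', 'wfl': '1'}
        data = requests.get('http://huaban.com/favorite/beauty/',
                            params=params, headers=headers).json()
        if not data['pins']:
            break
        for pin in data['pins']:
            yield pin['pin_id']
        pin_id = data['pins'][-1]['pin_id']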
/ip_pachong.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | '''
4 | python 3.7.0
5 | '''
6 |
7 | # import modules
8 | import time
9 | import requests, re, random, os
10 | from bs4 import BeautifulSoup
11 |
12 | def ip_test(ip, url_for_test='https://www.baidu.com', set_timeout=10):
13 | '''
14 | Check whether a scraped proxy IP is usable: return True if it works, False otherwise. By default the proxy is tested against Baidu.
15 | :param ip:
16 | :param url_for_test:
17 | :param set_timeout:
18 | :return:
19 | '''
20 | try:
21 | r = requests.get(url_for_test, headers=headers, proxies={'http': ip[0]+':'+ip[1]}, timeout=set_timeout)
22 | if r.status_code == 200:
23 | return True
24 | else:
25 | return False
26 | except:
27 | return False
28 |
29 | def scrawl_ip(url, num, url_for_test='https://www.baidu.com'):
30 | '''
31 | Crawl proxy IP addresses; the source used is the Xici proxy site.
32 | :param url:
33 | :param num:
34 | :param url_for_test:
35 | :return:
36 | '''
37 | ip_list = []
38 | for num_page in range(1, num+1):
39 | url = url + str(num_page)
40 |
41 | response = requests.get(url, headers=headers)
42 | response.encoding = 'utf-8'
43 | content = response.text
44 |
45 | pattern = re.compile('.*?alt="Cn" />.*?</td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>', re.S)
46 | items = re.findall(pattern, content)
47 | for ip in items:
48 | if ip_test(ip, url_for_test): # test whether the scraped IP works; if it passes, add it to ip_list
49 | print('Test passed, proxy address is ' + str(ip[0]) + ':' + str(ip[1]))
50 | ip_list.append(ip[0]+':'+ip[1])
51 | return ip_list
52 |
53 | time.sleep(5) # wait 5 seconds before crawling the next page
54 |
55 | def get_random_ip(): # pick a random proxy IP
56 | ind = random.randint(0, len(total_ip)-1)
57 | return total_ip[ind]
58 |
59 |
60 | # URL to crawl proxies from; the Xici proxy site is used
61 | url_ip = "http://www.xicidaili.com/nt/"
62 |
63 | # set the timeout (seconds)
64 | set_timeout = 10
65 |
66 | # number of proxy-list pages to crawl; 2 means two pages of IP addresses
67 | num = 2
68 |
69 | # number of times each proxy may be used
70 | count_time = 5
71 |
72 | # build headers
73 | UserAgent_List = [
74 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
75 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
76 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
77 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
78 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
79 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
80 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
81 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
82 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
83 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
84 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
85 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
86 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
87 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
88 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
89 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
90 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
91 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
92 | ]
93 |
94 | headers = {'User-Agent': random.choice(UserAgent_List),
95 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
96 | 'Accept-Encoding': 'gzip',
97 | }
98 |
99 |
100 | # crawl the proxy IPs
101 | total_ip = scrawl_ip(url_ip, num)
102 |
103 |
104 |
105 |
106 |
107 |
--------------------------------------------------------------------------------
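ip_pachong.py only collects and verifies proxies; get_random_ip hands one back. A hedged sketch of how the resulting total_ip list (entries like '1.2.3.4:8080') might be used with requests; fetch_with_proxy is a hypothetical helper, not part of the repo:

import random
import requests

def fetch_with_proxy(url, total_ip, headers, timeout=10):
    # Sketch only: pick one of the scraped proxies and route the request through it.
    proxy = random.choice(total_ip)
    return requests.get(url, headers=headers,
                        proxies={'http': 'http://' + proxy},
                        timeout=timeout)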
/login.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/login.py
--------------------------------------------------------------------------------
/login2.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/login2.py
--------------------------------------------------------------------------------
/meizitu3.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | '''
4 | python 3.5.2
5 | '''
6 |
7 | # import modules
8 | import time
9 | import requests, re, random, os
10 | from bs4 import BeautifulSoup
11 |
12 | def ip_test(ip, url_for_test='https://www.baidu.com', set_timeout=10):
13 | '''
14 | Check whether a scraped proxy IP is usable: return True if it works, False otherwise. By default the proxy is tested against Baidu.
15 | :param ip:
16 | :param url_for_test:
17 | :param set_timeout:
18 | :return:
19 | '''
20 | try:
21 | r = requests.get(url_for_test, headers=headers, proxies={'http': ip[0]+':'+ip[1]}, timeout=set_timeout)
22 | if r.status_code == 200:
23 | return True
24 | else:
25 | return False
26 | except:
27 | return False
28 |
29 | def scrawl_ip(url, num, url_for_test='https://www.baidu.com'):
30 | '''
31 | Crawl proxy IP addresses; the source used is the Xici proxy site.
32 | :param url:
33 | :param num:
34 | :param url_for_test:
35 | :return:
36 | '''
37 | ip_list = []
38 | for num_page in range(1, num+1):
39 | url = url + str(num_page)
40 |
41 | response = requests.get(url, headers=headers)
42 | response.encoding = 'utf-8'
43 | content = response.text
44 |
45 | pattern = re.compile('.*?alt="Cn" />.*?</td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>', re.S)
46 | items = re.findall(pattern, content)
47 | for ip in items:
48 | if ip_test(ip, url_for_test): # test whether the scraped IP works; if it passes, add it to ip_list
49 | print('Test passed, proxy address is ' + str(ip[0]) + ':' + str(ip[1]))
50 | ip_list.append(ip[0]+':'+ip[1])
51 | return ip_list
52 |
53 | time.sleep(5) # wait 5 seconds before crawling the next page
54 |
55 | def get_random_ip(): # pick a random proxy IP
56 | ind = random.randint(0, len(total_ip)-1)
57 | return total_ip[ind]
58 |
59 | def download_img(img_list, img_title):
60 | '''
61 | Once scrawl_url has returned the list of image URLs in a single album together with the album name, the images can be downloaded.
62 | This function downloads all images in a single album.
63 | img_list holds the URLs of all images in the album,
64 | e.g. ['http://mm.howkuai.com/wp-content/uploads/2017a/02/07/01.jpg',
65 | 'http://mm.howkuai.com/wp-content/uploads/2017a/02/07/02.jpg', ...]
66 | img_title is the album name, e.g. 'Cars and beauties, the perfect golden pairing'
67 | :param img_list:
68 | :param img_title:
69 | :return:
70 | '''
71 |
72 | img_title = format_name(img_title) # special characters in the album name must be removed, otherwise the folder cannot be created on Windows
73 | for img_urls in img_list:
74 | img_url = img_urls.attrs['src'] # URL of a single image
75 | print(img_url)
76 | title = img_urls.attrs['alt'] # name of a single image
77 | print(title)
78 |
79 | try:
80 | if not os.path.exists(os.path.join(file_path, img_title)):
81 | os.makedirs(os.path.join(file_path, img_title))
82 | os.chdir(file_path + '\\' + img_title)
83 |
84 | # save the image locally
85 | exists = os.path.exists(img_title)
86 | if not exists:
87 | try:
88 | img_html = requests.get(img_url, headers=headers, stream=True, timeout=20, verify=True)
89 | with open(title+".jpg", 'wb') as f:
90 | f.write(img_html.content)
91 | f.close()
92 | except:
93 | continue
94 | except:
95 | continue
96 |
97 | def scrawl_list(url_list, proxy_flag=False, try_time=0):
98 | '''
99 | This function crawls the URLs of all albums on one listing page; each page holds 10 albums, so one call returns a list of 10 URLs,
100 | in the form ['http://www.meizitu.com/a/list_1_1.html', ...]
101 | :param url_list:
102 | :param proxy_flag:
103 | :param try_time:
104 | :return:
105 | '''
106 | if not proxy_flag: # without a proxy
107 | try:
108 | html = requests.get(url_list, headers=headers, timeout=10)
109 | html.encoding = 'gb2312'
110 | text = html.text
111 |
112 | bsop = BeautifulSoup(text, 'html.parser')
113 |
114 | url_imgs = []
115 | li_list = bsop.find('ul', {'class': 'wp-list clearfix'}).findAll('li', {'class':'wp-item'})
116 | for i in li_list:
117 | url_img = i.find('h3',{'class':'tit'}).find('a').attrs['href']
118 | url_imgs.append(url_img)
119 | return url_imgs
120 | except:
121 | return scrawl_list(url_list, proxy_flag=True) # otherwise call itself again, retrying up to 3 times through a proxy
122 | else: # when using a proxy
123 | if try_time','!','|']:
231 | while i in img_title:
232 | img_title = img_title.strip().replace(i, '')
233 | return img_title
234 |
235 | def get_total_pages(first_url):
236 | '''
237 | Get the total number of pages on the Meizitu site
238 | :param first_url:
239 | :return:
240 | '''
241 | html = requests.get(first_url, headers=headers, timeout=10)
242 | html.encoding = 'gb2312'
243 | text = html.text
244 | bsop = BeautifulSoup(text, 'html.parser')
245 | lis =bsop.find('div',{'id':'wp_page_numbers'}).find('ul').findAll('li')
246 | pages = lis[-1].find('a').attrs['href'].split('.')[0].split('_')[-1]
247 | pages = int(pages)
248 | return pages
249 |
250 |
251 | # 妹子图的首页,用来获取总的页数
252 | first_url = 'http://www.meizitu.com/a/list_1_1.html'
253 |
254 | # 爬取代理的url地址,选择的是西祠代理
255 | url_ip = "http://www.xicidaili.com/nt/"
256 |
257 | # 设定等待时间
258 | set_timeout = 10
259 |
260 | # 爬取代理的页数,2表示爬取2页的ip地址
261 | num = 2
262 |
263 | # 代理的使用次数
264 | count_time = 5
265 |
266 | # 构造headers
267 | UserAgent_List = [
268 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
269 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
270 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
271 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
272 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
273 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
274 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
275 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
276 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
277 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
278 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
279 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
280 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
281 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
282 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
283 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
284 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
285 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
286 | ]
287 | headers = {'User-Agent': random.choice(UserAgent_List),
288 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
289 | 'Accept-Encoding': 'gzip',
290 | }
291 |
292 | # 图片存储路径
293 | file_path = r'E:\selfprogress\programming\project\meizitu'  # 使用原始字符串,避免反斜杠被当作转义
294 |
295 | # 获取总页数
296 | pages = get_total_pages(first_url)
297 |
298 | # 爬取IP代理
299 | total_ip = scrawl_ip(url_ip, num)
300 |
301 | # 待爬取的url
302 | url_imgss = download_urls(pages)
303 |
304 | for i in url_imgss:
305 | for j in i:
306 | try:
307 | with open('url.txt','a') as f:
308 | f.write(j+"\n")
309 | f.close()
310 | print("写入url.txt文件成功")
311 | except:
312 | print("写入url.txt文件失败")
313 |
314 | for url_imgs in url_imgss:
315 | for url_img in url_imgs:
316 | img_list, img_title = scrawl_url(url_img)
317 | if not img_list:
318 | continue
319 | download_img(img_list, img_title)
320 |
321 | time.sleep(5)
322 |
323 |
324 |
325 |
326 |
327 |
--------------------------------------------------------------------------------
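The scrawl_list/scrawl_url functions above fall back to a random proxy from total_ip when a direct request fails and retry a few times before giving up. A minimal standalone sketch of that retry pattern, assuming a pool of 'ip:port' strings like the one scrawl_ip returns (function and parameter names here are illustrative, not taken from the repo):

import random
import requests

def get_with_proxy_retry(url, headers, proxy_pool, max_tries=3, timeout=10):
    # Try a direct request first; on failure, retry through random proxies.
    try:
        return requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        pass
    for _ in range(max_tries):
        proxy = random.choice(proxy_pool)            # e.g. '123.57.1.16:8080'
        proxies = {'http': 'http://' + proxy}
        try:
            return requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        except requests.RequestException:
            continue
    return None                                      # caller decides how to skip failures

Returning None instead of recursing keeps the retry count explicit and avoids unbounded recursion when every proxy fails.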
/meizitu_pro.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # 导入模块
4 | import time
5 | import requests, re, random, os
6 | from bs4 import BeautifulSoup
7 |
8 | def ip_test(ip, url_for_test='https://www.baidu.com', set_timeout=30):
9 | try:
10 | r = requests.get(url_for_test, headers=headers, proxies={'http': ip[0]+':'+ip[1]}, timeout=set_timeout)
11 | if r.status_code == 200:
12 | return True
13 | else:
14 | return False
15 | except:
16 | return False
17 |
18 | def scrawl_ip(url, num, url_for_test='https://www.baidu.com'):
19 | ip_list = []
20 | for num_page in range(1, num+1):  # num为要爬取的页数,range需取到num
21 | page_url = url + str(num_page)  # 每页单独拼接页码,避免在同一个url上重复累加
22 | 
23 | response = requests.get(page_url, headers=headers)
24 | response.encoding = 'utf-8'
25 | content = response.text
26 |
27 | pattern = re.compile('.*?alt="Cn" />.*?</td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>', re.S)
28 | items = re.findall(pattern, content)
29 | for ip in items:
30 | if ip_test(ip, url_for_test): # ip是(地址,端口)元组,整体传入测试是否可用,通过则加入ip_list列表之中
31 | print('测试通过,IP地址为' + str(ip[0]) + ':' + str(ip[1]))
32 | ip_list.append(ip[0]+':'+ip[1])
33 | time.sleep(10) # 等待10秒爬取下一页
34 | 
35 | return ip_list  # 所有页面都爬取完毕后再返回
36 |
37 | def get_random_ip(): # 随机获取一个IP
38 | ind = random.randint(0, len(total_ip)-1)
39 | # print(total_ip[ind])
40 | return total_ip[ind]
41 |
42 |
43 | def download_img(img_list):
44 | img_title = img_list[0].attrs['alt']
45 | for img_url in img_list:
46 | img_url = img_url.attrs['src']
47 | title = img_url.split('/')[-1]
48 |
49 | if not os.path.exists(os.path.join(file_path, img_title)):
50 | os.makedirs(os.path.join(file_path, img_title))
51 | os.chdir(file_path + '\\' + img_title)
52 |
53 | # 图片保存到本地
54 | exists = os.path.exists(title)
55 | if not exists:
56 | img_html = requests.get(img_url, headers=headers, stream=True, timeout=30, verify=True)
57 | with open(title, 'wb') as f:
58 | f.write(img_html.content)
59 | f.close()
60 |
61 | def scrawl_url(url, proxy_flag=False, try_time=0):
62 | if not proxy_flag: # 不使用代理
63 | try:
64 | html = requests.get(url, headers=headers, timeout=30)
65 | html.encoding = 'gb2312'
66 |
67 | text = html.text
68 | code = html.status_code
69 | print(code)
70 | bsop = BeautifulSoup(text, 'html.parser')
71 | img_list = bsop.find('div', {'class': 'postContent'}).find('p').findAll('img')
72 |
73 | return img_list
74 |
75 | except:
76 | return scrawl_url(url, proxy_flag=True) # 否则调用自己,使用3次IP代理
77 | else: # 使用代理时
78 | if try_time
--------------------------------------------------------------------------------
/my_blog/templates/aboutme.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 | <meta charset="UTF-8">
5 | <title>$Title$</title>
6 | </head>
7 | <body>
8 | $END$
9 | </body>
10 | </html>
--------------------------------------------------------------------------------
/my_blog/templates/archives.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 | {% block content %}
4 |
5 | {% for post in post_list %}
6 |
15 | {% endfor %}
16 |
17 | {% endblock %}
--------------------------------------------------------------------------------
/my_blog/templates/base.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 | <meta charset="UTF-8">
5 | <title>$Title$</title>
6 | </head>
7 | <body>
8 | $END$
9 | </body>
10 | </html>
--------------------------------------------------------------------------------
/my_blog/templates/home.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 | <meta charset="UTF-8">
5 | <title>$Title$</title>
6 | </head>
7 | <body>
8 | $END$
9 | </body>
10 | </html>
--------------------------------------------------------------------------------
/my_blog/templates/post.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 | <meta charset="UTF-8">
5 | <title>$Title$</title>
6 | </head>
7 | <body>
8 | $END$
9 | </body>
10 | </html>
--------------------------------------------------------------------------------
/my_blog/templates/tag.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 | <meta charset="UTF-8">
5 | <title>$Title$</title>
6 | </head>
7 | <body>
8 | $END$
9 | </body>
10 | </html>
--------------------------------------------------------------------------------
/my_blog/templates/test.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 | <meta charset="UTF-8">
5 | <title>$Title$</title>
6 | </head>
7 | <body>
8 | $END$
9 | </body>
10 | </html>
--------------------------------------------------------------------------------
/paqubiaoqing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | '''
4 | 在以下环境测试通过:
5 | python 2.7.15或者3.7.0
6 | win10或者lubuntu
7 | '''
8 |
9 | # 导入模块
10 | import time
11 | import requests, re, random, os
12 | from bs4 import BeautifulSoup
13 |
14 | '''
15 | 给定页数,爬取每页所有图片的url,通过此url可以打开图片所在的网页
16 | 所有url存在一个列表中
17 | '''
18 | def scrapy_img_urls(nums):
19 | lss = []
20 | for num in range(1, nums+1):
21 | url = 'http://www.doutula.com/photo/list/?page=' + str(num)
22 | html = requests.get(url, headers=headers)
23 | html.encoding = 'utf-8'
24 |
25 | text = html.text
26 | bsop = BeautifulSoup(text, 'html.parser')
27 | ass = bsop.find('div', {'class': 'page-content'}).find('div').findAll('a')
28 |
29 | for a in ass:
30 | # print(a.attrs['href'])
31 | lss.append(a.attrs['href'])
32 | time.sleep(1)
33 | return lss
34 |
35 | '''
36 | 接收每个图片的url,打开此url,找到图片真实的地址,通过此地址可以下载图片
37 | 找到图片真实的url和名字之后调用download_url函数可以下载图片
38 | '''
39 | def download_img_url(url):
40 | html = requests.get(url, headers=headers)
41 | html.encoding = 'utf-8'
42 |
43 | text = html.text
44 | bsop = BeautifulSoup(text, 'html.parser')
45 | img = bsop.find('div', {'class': 'col-xs-12 col-sm-12 artile_des'})
46 | img_url = img.find('img').attrs['src']
47 | img_title = img.find('img').attrs['alt']
48 | print(img_url + " " + img_title)
49 |
50 | download_img(img_url, img_title)
51 |
52 | '''
53 | 下载图片,该函数接收两个参数,一个是图片的真实地址,一个是图片的名字
54 | 名字中如果有特殊字符则需要处理,不然windows下可能无法保存,处理名字调用format_name函数
55 | 打开指定文件夹保存图片,如果没有则创建。
56 | '''
57 | def download_img(img_url, img_title):
58 | img_title = format_name(img_title) # 如果图册名字有特殊字符需要处理。不然在windows下保存不了文件夹
59 | if not os.path.exists(file_path):
60 | os.makedirs(file_path)
61 | os.chdir(file_path)
62 |
63 | # 图片保存到本地
64 | exists = os.path.exists(img_title + ".gif")  # 按实际保存的文件名判断是否已下载
65 | if not exists:
66 | img_html = requests.get(img_url, headers=headers, stream=True, timeout=20, verify=True)
67 | img_html.encoding = 'utf-8'
68 | with open(img_title + ".gif", 'wb') as f:
69 | f.write(img_html.content)
70 | f.close()
71 |
72 |
73 | def format_name(img_title):
74 | '''
75 | 对名字进行处理,如果包含下述字符,则直接剔除该字符
76 | :param img_title:
77 | :return:
78 | '''
79 | for i in ['\\','/',':','*','?','"','<','>','!','|']:
80 | while i in img_title:
81 | img_title = img_title.strip().replace(i, '')
82 | return img_title
83 |
84 | # 构造headers
85 | UserAgent_List = [
86 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
87 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
88 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
89 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
90 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
91 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
92 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
93 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
94 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
95 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
96 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
97 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
98 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
99 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
100 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
101 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
102 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
103 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
104 | ]
105 | headers = {'User-Agent': random.choice(UserAgent_List),
106 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
107 | 'Accept-Encoding': 'gzip',
108 | }
109 |
110 | nums=5
111 | # 图片存储路径,在linux系统下
112 | file_path = '/home/zhangyb/downloadfiles/pythonpro/biaoqing'
113 | # 图片存储路径,在windows系统下
114 | # file_path = 'E:\downloadfiles\pythonpro\biaoqing'
115 | urls = scrapy_img_urls(nums)
116 | for i in urls:
117 | print(i)
118 | download_img_url(i)
119 |
120 |
121 | # download_img_url('http://www.doutula.com/photo/6437987')
122 | # download_img('https://ws1.sinaimg.cn/large/9150e4e5gy1fx94eo4pdwg203q02g0so.gif', u'好想打死你啊')
--------------------------------------------------------------------------------
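download_img above switches the working directory with os.chdir before writing each file. A sketch of the same save step using a full path built with os.path.join, which avoids mutating process-wide state (names are illustrative; file_path and headers stand in for the globals the script already defines):

import os
import requests

def save_image(img_url, img_title, file_path, headers):
    # Build the target path directly instead of chdir-ing into the folder.
    os.makedirs(file_path, exist_ok=True)
    target = os.path.join(file_path, img_title + '.gif')
    if os.path.exists(target):               # skip files already downloaded
        return target
    resp = requests.get(img_url, headers=headers, stream=True, timeout=20)
    with open(target, 'wb') as f:
        f.write(resp.content)
    return target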
/porn/down_video.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/porn/down_video.py
--------------------------------------------------------------------------------
/porn/test1.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/porn/test1.py
--------------------------------------------------------------------------------
/requests1.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import requests
3 | from bs4 import BeautifulSoup
4 |
5 |
6 | url = 'http://tieba.baidu.com/p/4468445702'
7 | html = requests.get(url)
8 | html.encoding = 'utf-8'
9 |
10 | text = html.text
11 | bsop = BeautifulSoup(text,'html.parser')
12 | img_list = bsop.find('div',{'id':'post_content_87286618651'}).findAll('img')
13 | img_src = img_list[0].attrs['src']
14 |
15 | print(img_src)
16 | img = requests.get(img_src)
17 | with open('a.jpg', 'ab') as f:
18 | f.write(img.content)
19 | f.close()
20 |
21 |
22 | # content = html.content
23 | # print(text)
24 | # print(content)
--------------------------------------------------------------------------------
/requests2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import requests
3 | from bs4 import BeautifulSoup
4 |
5 |
6 | url = 'http://tieba.baidu.com/p/4468445702'
7 | html = requests.get(url)
8 | html.encoding = 'utf-8'
9 |
10 | text = html.text
11 | bsop = BeautifulSoup(text,'html.parser')
12 | img_list = bsop.find('div',{'id':'post_content_87286618651'}).findAll('img')
13 | img_src = img_list[0].attrs['src']
14 |
15 | print(img_src)
16 | img = requests.get(img_src)
17 | with open('a.jpg', 'ab') as f:
18 | f.write(img.content)
19 | f.close()
20 |
21 |
22 | # content = html.content
23 | # print(text)
24 | # print(content)
--------------------------------------------------------------------------------
/requests3.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/requests3.py
--------------------------------------------------------------------------------
/scraping_ajax.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/scraping_ajax.py
--------------------------------------------------------------------------------
/selenium/test1.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/selenium/test1.py
--------------------------------------------------------------------------------
/selenium/test2.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/selenium/test2.py
--------------------------------------------------------------------------------
/selenium/test3.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/selenium/test3.py
--------------------------------------------------------------------------------
/selenium/test4.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/selenium/test4.py
--------------------------------------------------------------------------------
/some/aj.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | import json
5 |
6 | headers = {
7 | 'Accept': 'application/json',
8 | 'Accept-Encoding': 'gzip, deflate, br',
9 | 'Accept-Language': 'zh-CN,zh;q=0.9',
10 | 'Connection': 'keep-alive',
11 | 'Content-Length': '1919',
12 | 'Content-Type': 'application/json',
13 | 'Cookie': 'bid=FvGxnjrHNYI; gr_user_id=c211a350-d924-429f-9028-afd61661913f; _vwo_uuid_v2=DD2B02C913FD5A4D2EFE19BBBB71F1473|8e6abeedccfd8ccd3b590f121d180376; __utmc=30149280; __utmz=30149280.1545471350.6.6.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); viewed="10756112_5273955_1088168_27069345_26601155_10590856"; _ga=GA1.3.497061328.1543886034; ap_v=0,6.0; __utma=30149280.497061328.1543886034.1545471350.1545887406.7; _gid=GA1.3.452249281.1545887527; _pk_ref.100001.a7dd=%5B%22%22%2C%22%22%2C1545887527%2C%22https%3A%2F%2Fwww.jianshu.com%2Fp%2Fb29375404479%22%5D; _pk_ses.100001.a7dd=*; _pk_id.100001.a7dd=ee586b77c5c08a27.1545487781.2.1545889502.1545488713.',
14 | 'DNT': '1',
15 | 'Host': 'read.douban.com',
16 | 'Origin': 'https://read.douban.com',
17 | 'Referer': 'https://read.douban.com/category/?kind=114',
18 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
19 | 'X-CSRF-Token': 'null',
20 | }
21 |
22 | data = {"sort":"hot","page":1,"kind":114,"query":"\n query getFilterWorksList($works_ids: [ID!], $user_id: ID) {\n worksList(worksIds: $works_ids) {\n \n \n title\n cover\n url\n isBundle\n \n \n url\n title\n \n \n author {\n name\n url\n }\n origAuthor {\n name\n url\n }\n translator {\n name\n url\n }\n \n \n abstract\n editorHighlight\n \n \n isOrigin\n kinds {\n \n name @skip(if: true)\n shortName @include(if: true)\n id\n \n }\n ... on WorksBase @include(if: true) {\n wordCount\n wordCountUnit\n }\n ... on WorksBase @include(if: true) {\n \n isEssay\n \n ... on EssayWorks {\n favorCount\n }\n \n \n isNew\n \n averageRating\n ratingCount\n url\n \n \n \n }\n ... on WorksBase @include(if: false) {\n isColumn\n isEssay\n onSaleTime\n ... on ColumnWorks {\n updateTime\n }\n }\n ... on WorksBase @include(if: true) {\n isColumn\n ... on ColumnWorks {\n isFinished\n }\n }\n ... on EssayWorks {\n essayActivityData {\n \n title\n uri\n tag {\n name\n color\n background\n icon2x\n icon3x\n iconSize {\n height\n }\n iconPosition {\n x y\n }\n }\n \n }\n }\n highlightTags {\n name\n }\n \n ... on WorksBase @include(if: false) {\n \n fixedPrice\n salesPrice\n isRebate\n \n }\n ... on EbookWorks {\n \n fixedPrice\n salesPrice\n isRebate\n \n }\n ... on WorksBase @include(if: true) {\n ... on EbookWorks {\n id\n isPurchased(userId: $user_id)\n isInWishlist(userId: $user_id)\n }\n }\n \n id\n isOrigin\n }\n }\n ","variables":{"user_id":""}}
23 |
24 | url = 'https://read.douban.com/j/kind/'
25 |
26 | r = requests.post(url, headers=headers, data=json.dumps(data))
27 | text = r.text
28 | text = json.loads(text)
29 | total = text["total"]
30 | lists = text["list"]
31 | for i in lists:
32 | title = i['title']
33 | cover = i['cover']
34 | book_url = 'https://read.douban.com' + i['book_url']
36 | # print(total)
37 | # print(lists)
38 |
--------------------------------------------------------------------------------
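aj.py serializes the payload with json.dumps and then re-parses r.text by hand; requests can do both ends itself through the json= argument and r.json(). A hedged sketch of that shorter form, keeping the 'total', 'list' and 'book_url' field names the script above already reads (they are not verified against the live Douban API):

import requests

def fetch_kind_page(url, headers, payload):
    # requests serializes payload and sets Content-Type: application/json itself
    r = requests.post(url, headers=headers, json=payload, timeout=20)
    r.raise_for_status()
    data = r.json()                           # the script above reads 'total' and 'list'
    books = []
    for item in data.get('list', []):
        books.append({
            'title': item.get('title'),
            'cover': item.get('cover'),
            # the script above takes the relative link from 'book_url'
            'link': 'https://read.douban.com' + (item.get('book_url') or ''),
        })
    return data.get('total'), books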
/some/pa.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | '''
4 | 在以下环境测试通过:
5 | python 2.7.15或者3.7.0
6 | win10或者lubuntu
7 | '''
8 |
9 | # 导入模块
10 | import time
11 | import requests, re, random, os
12 | from bs4 import BeautifulSoup
13 | from requests import Session
14 |
15 | session = Session()
16 |
17 |
18 | '''
19 | 给定页数,爬取每页所有图片的url,通过此url可以打开图片所在的网页
20 | 所有url存在一个列表中
21 | '''
22 |
23 |
24 | def scrapy_img_urls(nums):
25 | lss = []
26 | for num in range(1, nums + 1):
27 | url = 'http://www.doutula.com/photo/list/?page=' + str(num)
28 | html = requests.get(url, headers=headers)
29 | html.encoding = 'utf-8'
30 |
31 | text = html.text
32 | bsop = BeautifulSoup(text, 'html.parser')
33 | ass = bsop.find('div', {'class': 'page-content'}).find('div').findAll('a')
34 |
35 | for a in ass:
36 | # print(a.attrs['href'])
37 | lss.append(a.attrs['href'])
38 | time.sleep(1)
39 | return lss
40 |
41 |
42 | '''
43 | 接收每个图片的url,打开此url,找到图片真实的地址,通过此地址可以下载图片
44 | 找到图片真实的url和名字之后调用download_url函数可以下载图片
45 | '''
46 |
47 |
48 | def download_img_url(url):
49 | html = requests.get(url, headers=headers)
50 | html.encoding = 'utf-8'
51 |
52 | text = html.text
53 | bsop = BeautifulSoup(text, 'html.parser')
54 | img = bsop.find('div', {'class': 'col-xs-12 col-sm-12 artile_des'})
55 | img_url = img.find('img').attrs['src']
56 | img_title = img.find('img').attrs['alt']
57 | print(img_url + " " + img_title)
58 |
59 | download_img(img_url, img_title)
60 |
61 |
62 | '''
63 | 下载图片,该函数接收两个参数,一个是图片的真实地址,一个是图片的名字
64 | 名字中如果有特殊字符则需要处理,不然windows下可能无法保存,处理名字调用format_name函数
65 | 打开指定文件夹保存图片,如果没有则创建。
66 | '''
67 |
68 |
69 | def download_img(img_url, img_title):
70 | img_title = format_name(img_title) # 如果图册名字有特殊字符需要处理。不然在windows下保存不了文件夹
71 | if not os.path.exists(file_path):
72 | os.makedirs(file_path)
73 | os.chdir(file_path)
74 |
75 | # 图片保存到本地
76 | exists = os.path.exists(img_title + ".gif")  # 按实际保存的文件名判断是否已下载
77 | if not exists:
78 | img_html = requests.get(img_url, headers=headers, stream=True, timeout=20, verify=True)
79 | img_html.encoding = 'utf-8'
80 | with open(img_title + ".gif", 'wb') as f:
81 | f.write(img_html.content)
82 | f.close()
83 |
84 |
85 | def format_name(img_title):
86 | '''
87 | 对名字进行处理,如果包含下述字符,则直接剔除该字符
88 | :param img_title:
89 | :return:
90 | '''
91 | for i in ['\\', '/', ':', '*', '?', '"', '<', '>', '!', '|']:
92 | while i in img_title:
93 | img_title = img_title.strip().replace(i, '')
94 | return img_title
95 |
96 |
97 | def royal(url):
98 | html = requests.get(url, headers=headers, stream=True, timeout=20, verify=True)
99 | html.encoding = 'utf-8'
100 | text = html.text
101 | bsop = BeautifulSoup(text, 'html.parser')
102 | timeofissued = bsop.find('meta', {'name':'DC.issued'}).attrs['content'].split('/')[0]
103 | citation_title = bsop.find('meta', {'name':'citation_title'}).attrs['content']
104 | citation_journal_title = bsop.find('meta', {'name':'citation_journal_title'}).attrs['content']
105 | citation_journal_abbrev = bsop.find('meta', {'name':'citation_journal_abbrev'}).attrs['content']
106 | citation_volume = bsop.find('meta', {'name':'citation_volume'}).attrs['content']
107 | citation_issue = bsop.find('meta', {'name':'citation_issue'}).attrs['content']
108 | citation_firstpage = bsop.find('meta', {'name':'citation_firstpage'}).attrs['content']
109 | citation_lastpage = bsop.find('meta', {'name':'citation_lastpage'}).attrs['content']
110 | citation_doi = bsop.find('meta', {'name':'citation_doi'}).attrs['content']
111 | PB = bsop.find('meta', {'name':'DC.publisher'}).attrs['content']
112 | M3 = citation_doi
113 | citation_url = 'http://dx.doi.org/' + citation_doi
114 | citation_abstract = bsop.find('meta', {'name':'citation_abstract'}).attrs['content'].strip()
115 | SN = bsop.find('div', {'class':'article-nav__issue autopad--h'}).find('a').attrs['href'].split('=')[-1]
116 |
117 | with open(citation_title + ".ris", 'w') as f:
118 | f.write('TY - JOUR\n')
119 | f.write('T1 - ' + citation_title + '\n')
120 | f.write('Y1 - ' + timeofissued + '\n')
121 | f.write('SP - ' + citation_firstpage + '\n')
122 | f.write('EP - ' + citation_lastpage + '\n')
123 | f.write('JF - ' + citation_journal_title + '\n')
124 | f.write('JO - ' + citation_journal_abbrev + '\n')
125 | f.write('VL - ' + citation_volume + '\n')
126 | f.write('IS - ' + citation_issue + '\n')  # RIS中期号的标准标签是IS
127 | f.write('PB - ' + PB + '\n')
128 | f.write('SN - ' + SN + '\n')
129 | f.write('DO - ' + citation_doi + '\n')
130 | f.write('M3 - ' + M3 + '\n')
131 | f.write('UR - ' + citation_url + '\n')
132 | print(citation_url)
133 | f.write('N2 - ' + citation_abstract + '\n')
134 | print(citation_abstract)
135 |
136 | authors = bsop.findAll('span', {'class': 'article__author-link'})
137 | for author in authors:
138 | author = author.find('a').text.split(' ')
139 | author = author[-1] + ', ' + ' '.join(author[:-1])
140 | f.write('A1 - ' + author + '\n')
141 | f.write('ER - ' + '\n')
142 | f.close()
143 |
144 | # authors = bsop.findAll('span', {'class':'article__author-link'})
145 | # for author in authors:
146 | # author = author.find('a').text.split(' ')
147 | # author = author[-1] + ', ' + ' '.join(author[:-1])
148 | # with open(author + ".ris", 'w') as f:
149 | # f.write('TY - JOUR')
150 | # f.write('T1 - ' + citation_title)
151 | # f.write('T1 - ' + authors)
152 | # f.close()
153 |
154 | # print(author)
155 | # print(timeofissued)
156 |
157 |
158 |
159 |
160 |
161 | # print(authors)
162 | # with open("ro.ris", 'wb') as f:
163 | # f.write(html.content)
164 | # f.close()
165 |
166 |
167 | def scawurls(url):
168 |
169 | headers1 = {
170 | 'Accept':'text/html, */*; q=0.01',
171 | 'Connection': 'keep-alive',
172 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
173 | 'DNT':'1',
174 | 'Host':'pubs.rsc.org',
175 | 'Origin':'https://pubs.rsc.org',
176 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
177 | 'X-NewRelic-ID':'VQYFWF9aDBABV1laBgcFUw ==',
178 | 'X-Requested-With':'XMLHttpRequest'
179 | }
180 |
181 | data = {
182 | 'searchterm': 'AAEAAAD/////AQAAAAAAAAAMAgAAAGNSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGwFAQAAADlSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLlNlYXJjaFRlcm0OAAAAGTxDYXRlZ29yeT5rX19CYWNraW5nRmllbGQcPFN1YkNhdGVnb3J5PmtfX0JhY2tpbmdGaWVsZBw8Q29udGVudFR5cGU + a19fQmFja2luZ0ZpZWxkGjxDcml0ZXJpYXM + a19fQmFja2luZ0ZpZWxkFzxGYWNldHM + a19fQmFja2luZ0ZpZWxkHDxSZXF1ZXN0VGltZT5rX19CYWNraW5nRmllbGQfPEF1dGhvckNyaXRlcmlhPmtfX0JhY2tpbmdGaWVsZCA8UHVibGljYXRpb25EYXRlPmtfX0JhY2tpbmdGaWVsZBk8RXhjbHVkZXM + a19fQmFja2luZ0ZpZWxkFzxTb3VyY2U + a19fQmFja2luZ0ZpZWxkHzxPdXRwdXRTdGFuZGFyZD5rX19CYWNraW5nRmllbGQePFJlc3VsdHNGb3JtYXQ + a19fQmFja2luZ0ZpZWxkHjxEaXNwbGF5Q291bnRzPmtfX0JhY2tpbmdGaWVsZCA8UHJvZHVjdFBhZ2VTaXplPmtfX0JhY2tpbmdGaWVsZAEBAQMDAAQEAwEBAQEBwgFTeXN0ZW0uQ29sbGVjdGlvbnMuR2VuZXJpYy5MaXN0YDFbW1JTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXcIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0NPVJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guQXV0aG9yQ3JpdGVyaWECAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlAgAAAMIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0CAAAABgMAAAADQWxsCgYEAAAAA0FsbAkFAAAACQYAAAAAAAAAAAAAAAkHAAAACQgAAAAJCQAAAAoKCgoKBAUAAADCAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWUsIFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cywgVmVyc2lvbj0yMDE4LjAuNTQ5LjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49bnVsbF1dAwAAAAZfaXRlbXMFX3NpemUIX3ZlcnNpb24EAAA6UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWVbXQIAAAAICAkKAAAABAAAAAQAAAABBgAAAAUAAAAJCwAAAAAAAAAAAAAABQcAAAA9UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JDcml0ZXJpYQIAAAAgPEJvb2xlYW5PcGVyYXRvcj5rX19CYWNraW5nRmllbGQYPEF1dGhvcnM + a19fQmFja2luZ0ZpZWxkAQPDAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JJbmZvLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXQIAAAAKCgUIAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlBQAAAB88SXNTZWxlY3RlZERhdGU + a19fQmFja2luZ0ZpZWxkGTxEYXRlVHlwZT5rX19CYWNraW5nRmllbGQbPFdpdGhJbkxhc3Q + a19fQmFja2luZ0ZpZWxkGjxEYXRlUmFuZ2U + 
a19fQmFja2luZ0ZpZWxkHDxEaXNwbGF5RGF0ZT5rX19CYWNraW5nRmllbGQAAQQEAQE5UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5XaXRoSW5MYXN0AgAAADhSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLkRhdGVSYW5nZQIAAAACAAAAAAoKCgoBCQAAAAUAAAAJCwAAAAAAAAAAAAAABwoAAAAAAQAAAAQAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAkMAAAACQ0AAAAJDgAAAAkPAAAABwsAAAAAAQAAAAAAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAUMAAAAOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlBAAAABU8TmFtZT5rX19CYWNraW5nRmllbGQcPERpc3BsYXlOYW1lPmtfX0JhY2tpbmdGaWVsZBY8VmFsdWU + a19fQmFja2luZ0ZpZWxkIDxCb29sZWFuT3BlcmF0b3I + a19fQmFja2luZ0ZpZWxkAQEBAQIAAAAGEAAAAAhmcmVldGV4dAoGEQAAAG9kZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtIEFORCBDdSwgT1IgY29wcGVyLCBPUiBlbGVjdHJvbGVzcywgT1IgcHJpbnRpbmcsIE9SIGZsZXhpYmxlLCBPUiBzdWJzdHJhdGUsIE9SIHBsYXN0aWMKAQ0AAAAMAAAABhIAAAAHQWxsVGV4dAoGEwAAABlkZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtCgEOAAAADAAAAAYUAAAAC0F0bGVhc3RUZXh0CgYVAAAAP0N1LCBjb3BwZXIsIGVsZWN0cm9sZXNzLCBwcmludGluZywgZmxleGlibGUsIHN1YnN0cmF0ZSwgcGxhc3RpYwoBDwAAAAwAAAAGFgAAABBPcmlnaW5hbEZyZWVUZXh0CgYXAAAAb2RlcG9zaXRpb24sIHBhdHRlcm4sIGZpbG0gQU5EIEN1LCBPUiBjb3BwZXIsIE9SIGVsZWN0cm9sZXNzLCBPUiBwcmludGluZywgT1IgZmxleGlibGUsIE9SIHN1YnN0cmF0ZSwgT1IgcGxhc3RpYwoL',
183 | 'resultcount': '282607',
184 | 'category': 'all',
185 | 'pageno': '2'
186 | }
187 |
188 | html = requests.post(url, data=data, headers=headers1, stream=True, timeout=20, verify=True)
189 | html.encoding = 'utf-8'
190 | text = html.text
191 | # print(text)
192 | bsop = BeautifulSoup(text, 'html.parser')
193 | divs = bsop.findAll('div', {'class': 'capsule capsule--article '})
194 | for i in divs:
195 | article_url = 'https://pubs.rsc.org' + i.find('a').attrs['href']
196 | print(article_url)
197 | # royal(article_url)
198 |
199 | # with open("ros.html", 'wb') as f:
200 | # f.write(html.content)
201 | # f.close()
202 | # print(text)
203 |
204 | # session.head('https://pubs.rsc.org/en/results/all?Category=All&AllText=deposition%2C%20pattern%2C%20film&AtleastText=Cu%2C%20copper%2C%20electroless%2C%20printing%2C%20flexible%2C%20substrate%2C%20plastic&IncludeReference=false&SelectJournal=false&DateRange=false&SelectDate=false&Type=Months&DateFromMonth=Months&DateToMonth=Months&PriceCode=False&OpenAccess=false')
205 |
206 | # 构造headers
207 | UserAgent_List = [
208 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
209 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
210 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
211 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
212 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
213 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
214 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
215 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
216 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
217 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
218 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
219 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
220 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
221 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
222 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
223 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
224 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
225 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
226 | ]
227 | headers = {'User-Agent': random.choice(UserAgent_List),
228 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
229 | 'Accept-Encoding': 'gzip',
230 | }
231 |
232 | url = 'https://pubs.rsc.org/en/search/journalresult'
233 | scawurls(url)
234 |
235 |
236 |
237 | # url = 'https://pubs.rsc.org/en/content/articlelanding/2017/tc/c7tc00038c#!divAbstract'
238 | # royal(url)
239 |
240 | # nums = 5
241 | # # 图片存储路径,在linux系统下
242 | # file_path = '/home/zhangyb/downloadfiles/pythonpro/biaoqing'
243 | # # 图片存储路径,在windows系统下
244 | # # file_path = 'E:\downloadfiles\pythonpro\biaoqing'
245 | # urls = scrapy_img_urls(nums)
246 | # for i in urls:
247 | # print(i)
248 | # download_img_url(i)
249 |
250 |
251 | # url = 'https://pubs.rsc.org/en/results/all?Category=All&AllText=deposition%2C%20pattern%2C%20film&AtleastText=Cu%2C%20copper%2C%20electroless%2C%20printing%2C%20flexible%2C%20substrate%2C%20plastic&IncludeReference=false&SelectJournal=false&DateRange=false&SelectDate=false&Type=Months&DateFromMonth=Months&DateToMonth=Months&PriceCode=False&OpenAccess=false'
252 | # r = requests.get(url, headers=headers)
253 | # print(r.text)
254 |
--------------------------------------------------------------------------------
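royal() in pa.py reads a dozen citation_* meta tags one by one and writes each as an RIS line. A compact sketch of the same idea driven by a name-to-tag mapping; the meta names are the ones the script already queries, and the mapping shown is only a subset for illustration:

from bs4 import BeautifulSoup

# meta name -> RIS tag, mirroring the fields the script above extracts
META_TO_RIS = {
    'citation_title': 'T1',
    'citation_journal_title': 'JF',
    'citation_journal_abbrev': 'JO',
    'citation_volume': 'VL',
    'citation_firstpage': 'SP',
    'citation_lastpage': 'EP',
    'citation_doi': 'DO',
}

def html_to_ris(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    lines = ['TY - JOUR']
    for meta_name, ris_tag in META_TO_RIS.items():
        tag = soup.find('meta', {'name': meta_name})
        if tag and tag.get('content'):
            lines.append(ris_tag + ' - ' + tag['content'].strip())
    lines.append('ER - ')
    return '\n'.join(lines) + '\n'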
/some/pa1.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | import random
5 | from bs4 import BeautifulSoup
6 | import time
7 |
8 | download_time = time.strftime("%Y-%m-%d", time.localtime())
9 |
10 |
11 | def royal(article_urls):
12 | for article_url in article_urls:
13 | # try:
14 | html = requests.get(article_url, headers=headers, stream=True, timeout=20, verify=True)
15 | html.encoding = 'utf-8'
16 | text = html.text
17 | bsop = BeautifulSoup(text, 'html.parser')
18 | try:
19 | timeofissued = bsop.find('meta', {'name':'DC.issued'}).attrs['content'].split('/')[0]
20 | except:
21 | pass
22 | try:
23 | citation_title = bsop.find('meta', {'name':'citation_title'}).attrs['content']
24 | except:
25 | pass
26 | try:
27 | citation_journal_title = bsop.find('meta', {'name':'citation_journal_title'}).attrs['content']
28 | except:
29 | pass
30 | try:
31 | citation_journal_abbrev = bsop.find('meta', {'name':'citation_journal_abbrev'}).attrs['content']
32 | except:
33 | pass
34 | try:
35 | citation_volume = bsop.find('meta', {'name':'citation_volume'}).attrs['content']
36 | except:
37 | pass
38 | try:
39 | citation_issue = bsop.find('meta', {'name':'citation_issue'}).attrs['content']
40 | except:
41 | pass
42 | try:
43 | citation_firstpage = bsop.find('meta', {'name':'citation_firstpage'}).attrs['content']
44 | except:
45 | pass
46 | try:
47 | citation_lastpage = bsop.find('meta', {'name':'citation_lastpage'}).attrs['content']
48 | except:
49 | pass
50 | try:
51 | citation_doi = bsop.find('meta', {'name':'citation_doi'}).attrs['content']
52 | except:
53 | pass
54 | try:
55 | PB = bsop.find('meta', {'name':'DC.publisher'}).attrs['content']
56 | except:
57 | pass
58 | try:
59 | M3 = citation_doi
60 | except:
61 | pass
62 | try:
63 | citation_url = 'http://dx.doi.org/' + citation_doi
64 | except:
65 | pass
66 | try:
67 | citation_abstract = bsop.find('meta', {'name':'citation_abstract'}).attrs['content'].strip()
68 | except:
69 | pass
70 | try:
71 | SN = bsop.find('div', {'class':'article-nav__issue autopad--h'}).find('a').attrs['href'].split('=')[-1]
72 | except:
73 | pass
74 | # except:
75 | # print(article_url)
76 | # continue
77 |
78 | with open(download_time + ".ris", 'a', encoding='utf-8') as f:
79 | f.write('TY - JOUR\n')
80 | f.write('T1 - ' + citation_title + '\n')
81 | f.write('Y1 - ' + timeofissued + '\n')
82 | f.write('SP - ' + citation_firstpage + '\n')
83 | f.write('EP - ' + citation_lastpage + '\n')
84 | f.write('JF - ' + citation_journal_title + '\n')
85 | f.write('JO - ' + citation_journal_abbrev + '\n')
86 | f.write('VL - ' + citation_volume + '\n')
87 | f.write('IS - ' + citation_issue + '\n')  # RIS中期号的标准标签是IS
88 | f.write('PB - ' + PB + '\n')
89 | f.write('SN - ' + SN + '\n')
90 | f.write('DO - ' + citation_doi + '\n')
91 | f.write('M3 - ' + M3 + '\n')
92 | f.write('UR - ' + citation_url + '\n')
93 | print(citation_url)
94 | f.write('N2 - ' + citation_abstract + '\n')
95 | # print(citation_abstract)
96 |
97 | authors = bsop.findAll('span', {'class': 'article__author-link'})
98 | for author in authors:
99 | author = author.find('a').text.split(' ')
100 | author = author[-1] + ', ' + ' '.join(author[:-1])
101 | f.write('A1 - ' + author + '\n')
102 | f.write('ER - ' + '\n\n\n')
103 | f.close()
104 | time.sleep(1)
105 |
106 |
107 | def crawl_article_url(nums):
108 | article_urls = []
109 | for num in range(1, nums+1):
110 |
111 | url = 'https://pubs.rsc.org/en/search/journalresult'
112 |
113 | headers1 = {
114 | 'Accept':'text/html, */*; q=0.01',
115 | 'Connection': 'keep-alive',
116 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
117 | 'DNT':'1',
118 | 'Host':'pubs.rsc.org',
119 | 'Origin':'https://pubs.rsc.org',
120 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
121 | 'X-NewRelic-ID':'VQYFWF9aDBABV1laBgcFUw ==',
122 | 'X-Requested-With':'XMLHttpRequest'
123 | }
124 |
125 | data = {
126 | 'searchterm': 'AAEAAAD/////AQAAAAAAAAAMAgAAAGNSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGwFAQAAADlSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLlNlYXJjaFRlcm0OAAAAGTxDYXRlZ29yeT5rX19CYWNraW5nRmllbGQcPFN1YkNhdGVnb3J5PmtfX0JhY2tpbmdGaWVsZBw8Q29udGVudFR5cGU + a19fQmFja2luZ0ZpZWxkGjxDcml0ZXJpYXM + a19fQmFja2luZ0ZpZWxkFzxGYWNldHM + a19fQmFja2luZ0ZpZWxkHDxSZXF1ZXN0VGltZT5rX19CYWNraW5nRmllbGQfPEF1dGhvckNyaXRlcmlhPmtfX0JhY2tpbmdGaWVsZCA8UHVibGljYXRpb25EYXRlPmtfX0JhY2tpbmdGaWVsZBk8RXhjbHVkZXM + a19fQmFja2luZ0ZpZWxkFzxTb3VyY2U + a19fQmFja2luZ0ZpZWxkHzxPdXRwdXRTdGFuZGFyZD5rX19CYWNraW5nRmllbGQePFJlc3VsdHNGb3JtYXQ + a19fQmFja2luZ0ZpZWxkHjxEaXNwbGF5Q291bnRzPmtfX0JhY2tpbmdGaWVsZCA8UHJvZHVjdFBhZ2VTaXplPmtfX0JhY2tpbmdGaWVsZAEBAQMDAAQEAwEBAQEBwgFTeXN0ZW0uQ29sbGVjdGlvbnMuR2VuZXJpYy5MaXN0YDFbW1JTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXcIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0NPVJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guQXV0aG9yQ3JpdGVyaWECAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlAgAAAMIBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuRW50aXR5Lk5hbWVWYWx1ZSwgUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLCBWZXJzaW9uPTIwMTguMC41NDkuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1udWxsXV0CAAAABgMAAAADQWxsCgYEAAAAA0FsbAkFAAAACQYAAAAAAAAAAAAAAAkHAAAACQgAAAAJCQAAAAoKCgoKBAUAAADCAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWUsIFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cywgVmVyc2lvbj0yMDE4LjAuNTQ5LjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49bnVsbF1dAwAAAAZfaXRlbXMFX3NpemUIX3ZlcnNpb24EAAA6UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLkVudGl0eS5OYW1lVmFsdWVbXQIAAAAICAkKAAAABAAAAAQAAAABBgAAAAUAAAAJCwAAAAAAAAAAAAAABQcAAAA9UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JDcml0ZXJpYQIAAAAgPEJvb2xlYW5PcGVyYXRvcj5rX19CYWNraW5nRmllbGQYPEF1dGhvcnM + a19fQmFja2luZ0ZpZWxkAQPDAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkxpc3RgMVtbUlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5BdXRob3JJbmZvLCBSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMsIFZlcnNpb249MjAxOC4wLjU0OS4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGxdXQIAAAAKCgUIAAAAPlJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5TZWFyY2guUHVibGljYXRpb25EYXRlBQAAAB88SXNTZWxlY3RlZERhdGU + a19fQmFja2luZ0ZpZWxkGTxEYXRlVHlwZT5rX19CYWNraW5nRmllbGQbPFdpdGhJbkxhc3Q + a19fQmFja2luZ0ZpZWxkGjxEYXRlUmFuZ2U + 
a19fQmFja2luZ0ZpZWxkHDxEaXNwbGF5RGF0ZT5rX19CYWNraW5nRmllbGQAAQQEAQE5UlNDcHVicy5lUGxhdGZvcm0uU2VydmljZS5EYXRhQ29udHJhY3RzLlNlYXJjaC5XaXRoSW5MYXN0AgAAADhSU0NwdWJzLmVQbGF0Zm9ybS5TZXJ2aWNlLkRhdGFDb250cmFjdHMuU2VhcmNoLkRhdGVSYW5nZQIAAAACAAAAAAoKCgoBCQAAAAUAAAAJCwAAAAAAAAAAAAAABwoAAAAAAQAAAAQAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAkMAAAACQ0AAAAJDgAAAAkPAAAABwsAAAAAAQAAAAAAAAAEOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlAgAAAAUMAAAAOFJTQ3B1YnMuZVBsYXRmb3JtLlNlcnZpY2UuRGF0YUNvbnRyYWN0cy5FbnRpdHkuTmFtZVZhbHVlBAAAABU8TmFtZT5rX19CYWNraW5nRmllbGQcPERpc3BsYXlOYW1lPmtfX0JhY2tpbmdGaWVsZBY8VmFsdWU + a19fQmFja2luZ0ZpZWxkIDxCb29sZWFuT3BlcmF0b3I + a19fQmFja2luZ0ZpZWxkAQEBAQIAAAAGEAAAAAhmcmVldGV4dAoGEQAAAG9kZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtIEFORCBDdSwgT1IgY29wcGVyLCBPUiBlbGVjdHJvbGVzcywgT1IgcHJpbnRpbmcsIE9SIGZsZXhpYmxlLCBPUiBzdWJzdHJhdGUsIE9SIHBsYXN0aWMKAQ0AAAAMAAAABhIAAAAHQWxsVGV4dAoGEwAAABlkZXBvc2l0aW9uLCBwYXR0ZXJuLCBmaWxtCgEOAAAADAAAAAYUAAAAC0F0bGVhc3RUZXh0CgYVAAAAP0N1LCBjb3BwZXIsIGVsZWN0cm9sZXNzLCBwcmludGluZywgZmxleGlibGUsIHN1YnN0cmF0ZSwgcGxhc3RpYwoBDwAAAAwAAAAGFgAAABBPcmlnaW5hbEZyZWVUZXh0CgYXAAAAb2RlcG9zaXRpb24sIHBhdHRlcm4sIGZpbG0gQU5EIEN1LCBPUiBjb3BwZXIsIE9SIGVsZWN0cm9sZXNzLCBPUiBwcmludGluZywgT1IgZmxleGlibGUsIE9SIHN1YnN0cmF0ZSwgT1IgcGxhc3RpYwoL',
127 | 'resultcount': '282607',
128 | 'category': 'all',
129 | 'pageno': str(num)
130 | }
131 |
132 | html = requests.post(url, data=data, headers=headers1, stream=True, timeout=20, verify=True)
133 | html.encoding = 'utf-8'
134 | text = html.text
135 | # print(text)
136 | bsop = BeautifulSoup(text, 'html.parser')
137 | divs = bsop.findAll('div', {'class': 'capsule capsule--article '})
138 | for i in divs:
139 | article_url = 'https://pubs.rsc.org' + i.find('a').attrs['href']
140 | # print(article_url)
141 | article_urls.append(article_url)
142 | print("第" + str(num) + "页爬取完毕")
143 | time.sleep(1)
144 | return article_urls
145 |
146 |
147 | # 构造headers
148 | UserAgent_List = [
149 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
150 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
151 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
152 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
153 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
154 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
155 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
156 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
157 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
158 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
159 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
160 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
161 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
162 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
163 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
164 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
165 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
166 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
167 | ]
168 | headers = {'User-Agent': random.choice(UserAgent_List),
169 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
170 | 'Accept-Encoding': 'gzip',
171 | }
172 | nums = 5 # 爬取的页数
173 |
174 | article_urls = crawl_article_url(nums)
175 | royal(article_urls)
176 |
177 |
178 |
179 | # url = 'https://pubs.rsc.org/en/content/articlelanding/2017/tc/c7tc00038c#!divAbstract'
180 | # royal(url)
181 |
--------------------------------------------------------------------------------
/some/springer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | import random
5 | from bs4 import BeautifulSoup
6 | import time
7 |
8 | download_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
9 |
10 |
11 | def royal(article_urls):
12 | for article_url in article_urls:
13 | # try:
14 | html = requests.get(article_url, headers=headers, stream=True, timeout=20, verify=True)
15 | html.encoding = 'utf-8'
16 | text = html.text
17 | bsop = BeautifulSoup(text, 'html.parser')
18 | try:
19 | timeofissued = bsop.find('meta', {'name':'citation_cover_date'}).attrs['content'].split('/')[0]
20 | except:
21 | pass
22 | try:
23 | citation_title = bsop.find('meta', {'name':'citation_title'}).attrs['content']
24 | except:
25 | pass
26 | try:
27 | citation_journal_title = bsop.find('meta', {'name':'citation_journal_title'}).attrs['content']
28 | except:
29 | pass
30 | try:
31 | citation_journal_abbrev = bsop.find('meta', {'name':'citation_journal_abbrev'}).attrs['content']
32 | except:
33 | pass
34 | try:
35 | citation_volume = bsop.find('meta', {'name':'citation_volume'}).attrs['content']
36 | except:
37 | pass
38 | try:
39 | # citation_issue = bsop.find('meta', {'name':'citation_issue'}).attrs['content']
40 | citation_issue = bsop.find('span', {'id':'electronic-issn'}).text
41 | except:
42 | pass
43 | try:
44 | citation_firstpage = bsop.find('meta', {'name':'citation_firstpage'}).attrs['content']
45 | except:
46 | pass
47 | try:
48 | citation_lastpage = bsop.find('meta', {'name':'citation_lastpage'}).attrs['content']
49 | except:
50 | pass
51 | try:
52 | citation_doi = bsop.find('meta', {'name':'citation_doi'}).attrs['content']
53 | except:
54 | pass
55 | try:
56 | PB = bsop.find('meta', {'name':'citation_publisher'}).attrs['content']
57 | except:
58 | pass
59 | try:
60 | M3 = citation_doi
61 | except:
62 | pass
63 | try:
64 | citation_url = 'http://dx.doi.org/' + citation_doi
65 | except:
66 | pass
67 | try:
68 | # citation_abstract = bsop.find('p', {'id':'Par1'}).attrs['content'].strip()
69 | citation_abstract = bsop.find('p', {'id':'Par1'}).text
70 | except:
71 | pass
72 | try:
73 | # SN = bsop.find('div', {'class':'article-nav__issue autopad--h'}).find('a').attrs['href'].split('=')[-1]
74 | SN = bsop.find('span', {'id':'electronic-issn'}).text
75 | except:
76 | pass
77 | # except:
78 | # print(article_url)
79 | # continue
80 |
81 | with open(download_time + ".ris", 'a', encoding='utf-8') as f:
82 | f.write('TY - JOUR\n')
83 | f.write('T1 - ' + citation_title + '\n')
84 | f.write('Y1 - ' + timeofissued + '\n')
85 | f.write('SP - ' + citation_firstpage + '\n')
86 | f.write('EP - ' + citation_lastpage + '\n')
87 | f.write('JF - ' + citation_journal_title + '\n')
88 | f.write('JO - ' + citation_journal_abbrev + '\n')
89 | f.write('VL - ' + citation_volume + '\n')
90 | f.write('RS - ' + citation_issue + '\n')
91 | f.write('PB - ' + PB + '\n')
92 | f.write('SN - ' + SN + '\n')
93 | f.write('DO - ' + citation_doi + '\n')
94 | f.write('M3 - ' + M3 + '\n')
95 | f.write('UR - ' + citation_url + '\n')
96 | print(citation_url)
97 | f.write('N2 - ' + citation_abstract + '\n')
98 | # print(citation_abstract)
99 |
100 | authors = bsop.findAll('meta', {'name': 'citation_author'})
101 | for author in authors:
102 | # print(author)
103 | author = author.attrs['content'].split(' ')  # 先按空格拆成名、姓,否则下一行会按单个字符切分
104 | # print(author)
105 | author = author[-1] + ', ' + ' '.join(author[:-1])
106 | f.write('A1 - ' + author + '\n')
107 | f.write('ER - ' + '\n\n\n')
108 | f.close()
109 | time.sleep(1)
110 |
111 |
112 | def crawl_article_url(nums):
113 | article_urls = []
114 | for num in range(1, nums+1):
115 |
116 | url = 'https://link.springer.com/search/page/' + str(num) + '?date-facet-mode=between&facet-start-year=2010&facet-language=%22En%22&query=printing%2C+AND+Cu+AND+pattern%2C+AND+film%2C+AND+flexible%2C+AND+plastic%2C+AND+substrate%2C+AND+copper&facet-end-year=2019&showAll=true&facet-content-type=%22Article%22'
117 |
118 | html = requests.get(url, headers=headers, stream=True, timeout=20, verify=True)
119 | html.encoding = 'utf-8'
120 | text = html.text
121 | # print(text)
122 | bsop = BeautifulSoup(text, 'html.parser')
123 | divs = bsop.find('ol', {'id': 'results-list'}).findAll('li')
124 | for i in divs:
125 | # print(i)
126 | article_url = 'https://link.springer.com' + i.find('h2').find('a').attrs['href']
127 | print(article_url)
128 | article_urls.append(article_url)
129 | print("第" + str(num) + "页爬取完毕")
130 | time.sleep(1)
131 | return article_urls
132 |
133 |
134 | # 构造headers
135 | UserAgent_List = [
136 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
137 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
138 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
139 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
140 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
141 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
142 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
143 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
144 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
145 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
146 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
147 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
148 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
149 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
150 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
151 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
152 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
153 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
154 | ]
155 | headers = {'User-Agent': random.choice(UserAgent_List),
156 | 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
157 | 'Accept-Encoding': 'gzip',
158 | }
159 | nums = 1 # 爬取的页数
160 |
161 | article_urls = crawl_article_url(nums)
162 | royal(article_urls)
--------------------------------------------------------------------------------
/some/xuanke.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests, time
4 | import hmac, json
5 | from bs4 import BeautifulSoup
6 | from hashlib import sha1
7 |
8 | def get_captcha(url):
9 | ''' 处理验证码 '''
10 |
11 | r = requests.get(url, headers=headers)
12 | text = r.text
13 | obj = BeautifulSoup(text, 'html.parser')
14 | captchaurl = 'http://zhjwxk.cic.tsinghua.edu.cn' + obj.find("img", {"id":"captcha"}).attrs['src']
15 | rr = requests.get(captchaurl, headers=headers)
16 | textt = rr.content
17 |
18 | with open('captcha.gif', 'wb') as fb:
19 | fb.write(textt)
20 | a = input('captcha:')
21 | print(a)
22 | return a
23 |
24 |
25 | s = requests.Session()
26 | url = 'https://zhjwxk.cic.tsinghua.edu.cn/j_acegi_formlogin_xsxk.do'
27 |
28 | headers = {
29 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
30 | 'Accept-Encoding': 'gzip, deflate, br',
31 | 'Accept-Language': 'zh-CN,zh;q=0.9',
32 | 'Cache-Control': 'max-age=0',
33 | 'Connection': 'keep-alive',
34 | 'Content-Length': '66',
35 | 'Content-Type': 'application/x-www-form-urlencoded',
36 | 'Cookie': 'JSESSIONID=cafgDstvY9fVWd2VutTFw; thuwebcookie=990146470.20480.0000',
37 | 'DNT': '1',
38 | 'Host': 'zhjwxk.cic.tsinghua.edu.cn',
39 | 'Origin': 'http://zhjwxk.cic.tsinghua.edu.cn',
40 | 'Referer': 'http://zhjwxk.cic.tsinghua.edu.cn/xklogin.do',
41 | 'Upgrade-Insecure-Requests': '1',
42 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
43 | }
44 | data = {
45 | 'j_username': 'zhang-yb18',
46 | 'j_password': 'ZHANG2338',
47 | 'captchaflag': 'login1',
48 | '_login_image_': get_captcha(url),
49 | }
50 |
51 |
52 | r = s.post(url, headers=headers, data=data)
53 | text = r.text
54 | print(text)
--------------------------------------------------------------------------------
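get_captcha() above downloads the captcha with the module-level requests.get while the login form is posted through the Session s, so the captcha and the login can end up on different cookie jars. A sketch that keeps everything on one Session (the URLs are the ones used in the script; whether the server actually ties the captcha to the session cookie is an assumption):

import requests
from bs4 import BeautifulSoup

def fetch_captcha(session, login_page_url, headers):
    # Fetch the page, the captcha image and (later) the login POST on one Session,
    # so the captcha stays bound to the same cookies the server will check.
    page = session.get(login_page_url, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
    src = soup.find('img', {'id': 'captcha'}).attrs['src']
    img = session.get('http://zhjwxk.cic.tsinghua.edu.cn' + src, headers=headers)
    with open('captcha.gif', 'wb') as fb:
        fb.write(img.content)
    return input('captcha:')

The existing s would be passed in as session and then reused for the final s.post(url, headers=headers, data=data).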
/some/xuanke2.py:
--------------------------------------------------------------------------------
1 | # -*- coding:UTF-8 -*-
2 |
3 | import requests, time
4 | import hmac, json
5 | from bs4 import BeautifulSoup
6 | from hashlib import sha1
7 |
8 |
9 |
10 |
11 | def get_captcha(url):
12 | ''' 处理验证码 '''
13 |
14 | r = requests.get(url, headers=headers)
15 | text = r.text
16 | obj = BeautifulSoup(text, 'html.parser')
17 | captchaurl = 'http://zhjwxk.cic.tsinghua.edu.cn' + obj.find("img", {"id":"captcha"}).attrs['src']
18 | rr = requests.get(captchaurl, headers=headers)
19 | textt = rr.content
20 |
21 | with open('captcha.gif', 'wb') as fb:
22 | fb.write(textt)
23 | return input('captcha:')
24 |
25 |
26 |
27 |
28 |
29 | def login(username, password, oncaptcha, sessiona, headers):
30 | ''' 处理登录 '''
31 |
32 |     resp1 = sessiona.get('https://www.zhihu.com/signin', headers=headers) # get the _xsrf cookie
33 |     resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',
34 |                          headers=headers) # get the capsion_ticket cookie
35 |     need_cap = json.loads(resp2.text)["show_captcha"] # {"show_captcha":false} means no captcha is required
36 |
37 | grantType = 'password'
38 | clientId = 'c3cef7c66a1843f8b3a9e6a1e3160e20'
39 | source = 'com.zhihu.web'
40 |     timestamp = str((time.time() * 1000)).split('.')[0] # the signature only varies with this timestamp
41 |
42 | captcha_content = sessiona.get('https://www.zhihu.com/captcha.gif?r=%d&type=login' % (time.time() * 1000),
43 | headers=headers).content
44 |
45 | data = {
46 | "client_id": clientId,
47 | "grant_type": grantType,
48 | "timestamp": timestamp,
49 | "source": source,
50 |         "signature": get_signature(grantType, clientId, source, timestamp), # compute the signature
51 | "username": username,
52 | "password": password,
53 | "lang": "cn",
54 |         "captcha": oncaptcha(captcha_content, need_cap), # get the image captcha
55 | "ref_source": "other_",
56 | "utm_source": ""
57 | }
58 |
59 | print("**2**: " + str(data))
60 | print("-" * 50)
61 | resp = sessiona.post('https://www.zhihu.com/api/v3/oauth/sign_in', data, headers=headers).content
62 | print(BeautifulSoup(resp, 'html.parser'))
63 |
64 | print("-" * 50)
65 | return resp
66 |
67 |
68 |
69 | headers = {
70 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
71 | 'Accept-Encoding': 'gzip, deflate, br',
72 | 'Accept-Language': 'zh-CN,zh;q=0.9',
73 | 'Cache-Control': 'max-age=0',
74 | 'Connection': 'keep-alive',
75 | 'Content-Length': '66',
76 | 'Content-Type': 'application/x-www-form-urlencoded',
77 | 'Cookie': 'JSESSIONID=cafgDstvY9fVWd2VutTFw; thuwebcookie=990146470.20480.0000',
78 | 'DNT': '1',
79 | 'Host': 'zhjwxk.cic.tsinghua.edu.cn',
80 | 'Origin': 'http://zhjwxk.cic.tsinghua.edu.cn',
81 | 'Referer': 'http://zhjwxk.cic.tsinghua.edu.cn/xklogin.do',
82 | 'Upgrade-Insecure-Requests': '1',
83 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
84 | }
85 | data = {  # note: this dict is carried over from xuanke.py and is never used in this script
86 | 'j_username': 'zhang-yb18',
87 | 'j_password': 'ZHANG2338',
88 | 'captchaflag': 'login1',
89 |     '_login_image_': get_captcha,  # passes the function object itself; xuanke.py calls get_captcha(url) here instead
90 | }
91 |
92 |
93 | if __name__ == "__main__":
94 | sessiona = requests.Session()
95 |
96 |     login('fendushu@163.com', 'ZHANG2338', get_captcha, sessiona, headers) # replace with your own username and password
97 |     resp = sessiona.get('https://www.zhihu.com/inbox', headers=headers) # logged in; private messages are now readable
98 | print(BeautifulSoup(resp.content, 'html.parser'))
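# Note: the headers dict defined above targets zhjwxk.cic.tsinghua.edu.cn (Host, Origin, Referer,
# a hard-coded Cookie), while login() talks to www.zhihu.com, so those values do not match the
# requests actually being sent. For the Zhihu flow, a header set along the lines of /some/zhihu3.py
# would presumably be needed, e.g.:
#
#     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0',
#                'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'}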
99 |
100 |
--------------------------------------------------------------------------------
/some/zhihu.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from selenium import webdriver
4 |
5 | import requests
6 |
7 | from time import sleep
8 |
9 | from bs4 import BeautifulSoup
10 |
11 | browser = webdriver.Chrome(executable_path='F:\\pro\\blog\\herokublog\\blogtestgithub\\royal\\chromedriver.exe')
12 |
13 | url= 'https://www.zhihu.com/'
14 |
15 | s = requests.Session()
16 |
17 | s.headers.clear()  # clear the default python-requests headers (they identify a bot), otherwise the login fails
18 |
19 | browser.get(url)
20 |
21 | browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[2]/div[2]/span').click()  # click first so the page does not lose focus
22 |
23 | browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[2]/div[1]/form/div[1]/div[2]/div[1]/input').send_keys('fendushu@163.com')
24 |
25 | browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[2]/div[1]/form/div[2]/div/div[1]/input').send_keys('ZHANG2338')
26 |
27 | try:
28 |
29 |     img = browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[2]/div[1]/form/div[3]/div/div[2]/img')  # captcha image element -- upside-down Chinese characters
30 |
31 | sleep(10)
32 |
33 | except:
34 |
35 |     img = browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[2]/div[1]/form/div[3]/div/span/div/img').get_attribute("src")  # captcha image URL -- letters and digits
36 |
37 |     sleep(10)  # time to type in the captcha manually
38 |
39 | else:
40 |
41 | pass
42 |
43 | browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[2]/div[1]/form/button').submit()  # log in
44 |
45 | sleep(5)  # wait for the cookies to be set
46 |
47 | cookies = browser.get_cookies()
48 |
49 | browser.quit()
50 |
51 | for cookie in cookies:
52 |     s.cookies.set(cookie['name'], cookie['value'])  # copy the browser cookies into the requests session
53 |
54 | html = s.get(url).text
55 |
56 | soup = BeautifulSoup(html, 'html.parser')
57 |
58 | items = soup.find_all('a', attrs={'data-za-detail-view-element_name': "Title"})  # titles of the first few feed entries loaded after login
59 |
60 | for item in items:
61 | print(item.string)
62 |
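# The fixed sleep() calls above are fragile when the page loads slowly. Selenium's explicit
# waits poll for a condition instead; a minimal sketch (the XPath below is illustrative, not
# taken from the script above, and this would have to run before browser.quit()):
#
#     from selenium.webdriver.common.by import By
#     from selenium.webdriver.support.ui import WebDriverWait
#     from selenium.webdriver.support import expected_conditions as EC
#
#     wait = WebDriverWait(browser, 20)
#     form = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="root"]//form')))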
63 |
--------------------------------------------------------------------------------
/some/zhihu2.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 |
4 | __author__ = 'zkqiang'
5 | __zhihu__ = 'https://www.zhihu.com/people/z-kqiang'
6 | __github__ = 'https://github.com/zkqiang/Zhihu-Login'
7 |
8 | import requests
9 | import time
10 | import re
11 | import base64
12 | import hmac
13 | import hashlib
14 | import json
15 | import matplotlib.pyplot as plt
16 | from http import cookiejar
17 | from PIL import Image
18 |
19 |
20 | class ZhihuAccount(object):
21 |
22 | def __init__(self):
23 | self.login_url = 'https://www.zhihu.com/signup'
24 | self.login_api = 'https://www.zhihu.com/api/v3/oauth/sign_in'
25 | self.login_data = {
26 | 'client_id': 'c3cef7c66a1843f8b3a9e6a1e3160e20',
27 | 'grant_type': 'password',
28 | 'source': 'com.zhihu.web',
29 | 'username': '',
30 | 'password': '',
31 |             # passing 'cn' selects the upside-down Chinese-character captcha
32 | 'lang': 'en',
33 | 'ref_source': 'homepage',
34 | }
35 | self.session = requests.session()
36 | self.session.headers = {
37 | 'Host': 'www.zhihu.com',
38 | 'Referer': 'https://www.zhihu.com/',
39 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
40 | '(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
41 | }
42 | self.session.cookies = cookiejar.LWPCookieJar(filename='./cookies.txt')
43 |
44 | def login(self, username=None, password=None, captcha_lang='en', load_cookies=True):
45 | """
46 |         Simulate logging in to Zhihu
47 |         :param username: phone number used to log in
48 |         :param password: login password
49 |         :param captcha_lang: captcha type, 'en' or 'cn'
50 |         :param load_cookies: whether to load the cookies saved last time
51 | :return: bool
52 | """
53 | if load_cookies and self.load_cookies():
54 | if self.check_login():
55 |                 print('Login successful')
56 | return True
57 |
58 | headers = self.session.headers.copy()
59 | headers.update({
60 |             'x-xsrftoken': self._get_xsrf(),
61 | 'x-zse-83': '3_1.1'
62 | })
63 |         headers['x-udid'] = self._get_udid(headers)
64 | username, password = self._check_user_pass(username, password)
65 | self.login_data.update({
66 | 'username': username,
67 | 'password': password,
68 | 'captcha_lang': captcha_lang
69 | })
70 | timestamp = str(int(time.time()*1000))
71 | self.login_data.update({
72 | 'captcha': self._get_captcha(self.login_data['lang'], headers),
73 | 'timestamp': timestamp,
74 | 'signature': self._get_signature(timestamp)
75 | })
76 |
77 | resp = self.session.post(self.login_api, data=self.login_data, headers=headers)
78 | if 'error' in resp.text:
79 | print(json.loads(resp.text)['error']['message'])
80 | if self.check_login():
81 |             print('Login successful')
82 |             return True
83 |         print('Login failed')
84 | return False
85 |
86 | def load_cookies(self):
87 | """
88 |         Load the cookies file into the session
89 | :return: bool
90 | """
91 | try:
92 | self.session.cookies.load(ignore_discard=True)
93 | return True
94 | except FileNotFoundError:
95 | return False
96 |
97 | def check_login(self):
98 | """
99 |         Check the login state: a redirect when requesting the signup page means we are already logged in;
100 |         if so, save the current cookies
101 | :return: bool
102 | """
103 | resp = self.session.get(self.login_url, allow_redirects=False)
104 | if resp.status_code == 302:
105 | self.session.cookies.save()
106 | return True
107 | return False
108 |
109 | def _get_xsrf(self):
110 | """
111 |         Fetch the xsrf token from the _xsrf cookie set on the home page
112 | :return: str
113 | """
114 | resp = self.session.get('https://www.zhihu.com/', allow_redirects=False)
115 | xsrf = resp.cookies['_xsrf']
116 | return xsrf
117 |
118 | def _get_udid(self, headers):
119 | """
120 |         Get the udid from the udid endpoint
121 |         :param headers: request headers carrying the auth info
122 | :return: str
123 | """
124 | resp = self.session.post('https://www.zhihu.com/udid', headers=headers)
125 | udid = re.search(r'[\w=\-]+', resp.cookies['d_c0'])[0]
126 | return udid
127 |
128 | def _get_captcha(self, lang, headers):
129 | """
130 |         Call the captcha API; it must be requested once whether or not a captcha is needed.
131 |         If a captcha is required, the image is returned as base64.
132 |         The captcha type follows the lang parameter and has to be entered manually.
133 |         :param lang: language of the returned captcha (en/cn)
134 |         :param headers: request headers carrying the auth info
135 |         :return: the captcha value to POST
136 | """
137 | if lang == 'cn':
138 | api = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=cn'
139 | else:
140 | api = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=en'
141 | resp = self.session.get(api, headers=headers)
142 | show_captcha = re.search(r'true', resp.text)
143 |
144 | if show_captcha:
145 | put_resp = self.session.put(api, headers=headers)
146 | json_data = json.loads(put_resp.text)
147 | img_base64 = json_data['img_base64'].replace(r'\n', '')
148 | with open('./captcha.jpg', 'wb') as f:
149 | f.write(base64.b64decode(img_base64))
150 | img = Image.open('./captcha.jpg')
151 | if lang == 'cn':
152 | plt.imshow(img)
153 |                 print('Click every upside-down character, then press Enter to submit')
154 | points = plt.ginput(7)
155 | capt = json.dumps({'img_size': [200, 44],
156 | 'input_points': [[i[0]/2, i[1]/2] for i in points]})
157 | else:
158 | img.show()
159 |                 capt = input('Enter the captcha shown in the image: ')
160 |             # the captcha must first be POSTed back to the captcha API
161 | self.session.post(api, data={'input_text': capt}, headers=headers)
162 | return capt
163 | return ''
164 |
165 | def _get_signature(self, timestamp):
166 | """
167 |         Compute and return the signature via HMAC;
168 |         it is really just a few fixed strings plus the timestamp
169 |         :param timestamp: timestamp in milliseconds
170 |         :return: the signature
171 | """
172 | ha = hmac.new(b'd1b964811afb40118a12068ff74a12f4', digestmod=hashlib.sha1)
173 | grant_type = self.login_data['grant_type']
174 | client_id = self.login_data['client_id']
175 | source = self.login_data['source']
176 | ha.update(bytes((grant_type + client_id + source + timestamp), 'utf-8'))
177 | return ha.hexdigest()
178 |
179 | def _check_user_pass(self, username, password):
180 | """
181 |         Check whether the username and password were provided; prompt for them if not
182 | """
183 | if username is None:
184 | username = self.login_data.get('username')
185 | if not username:
186 |                 username = input('Enter your phone number: ')
187 | if len(username) == 11 and username.isdigit() and '+86' not in username:
188 | username = '+86' + username
189 |
190 | if password is None:
191 | password = self.login_data.get('password')
192 | if not password:
193 |                 password = input('Enter your password: ')
194 | return username, password
195 |
196 |
197 | if __name__ == '__main__':
198 | account = ZhihuAccount()
199 | account.login(username=None, password=None, captcha_lang='en', load_cookies=True)
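    # Because check_login() saves the LWPCookieJar to ./cookies.txt after a successful login,
    # a later run can usually skip the password flow entirely:
    #
    #     account = ZhihuAccount()
    #     account.login(load_cookies=True)  # returns True via the saved cookies while they are still valid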
--------------------------------------------------------------------------------
/some/zhihu3.py:
--------------------------------------------------------------------------------
1 | # -*- coding:UTF-8 -*-
2 |
3 | import requests, time
4 | import hmac, json
5 | from bs4 import BeautifulSoup
6 | from hashlib import sha1
7 |
8 |
9 | def get_captcha(data, need_cap):
10 |     ''' Handle the captcha '''
11 | if need_cap is False:
12 | return
13 | with open('captcha.gif', 'wb') as fb:
14 | fb.write(data)
15 | return input('captcha:')
16 |
17 |
18 | def get_signature(grantType, clientId, source, timestamp):
19 |     ''' Compute the request signature '''
20 |
21 | hm = hmac.new(b'd1b964811afb40118a12068ff74a12f4', None, sha1)
22 | hm.update(str.encode(grantType))
23 | hm.update(str.encode(clientId))
24 | hm.update(str.encode(source))
25 | hm.update(str.encode(timestamp))
26 |
27 | return str(hm.hexdigest())
28 |
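# Worked example of what get_signature() produces: successive hm.update() calls hash the
# concatenation of their arguments, so the value sent as "signature" in login() below is
# HMAC-SHA1(grant_type + client_id + source + timestamp) under the fixed key above, e.g.:
#
#     ts = str(int(time.time() * 1000))
#     sig = get_signature('password', 'c3cef7c66a1843f8b3a9e6a1e3160e20', 'com.zhihu.web', ts)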
29 |
30 | def login(username, password, oncaptcha, sessiona, headers):
31 |     ''' Perform the login '''
32 |
33 |     resp1 = sessiona.get('https://www.zhihu.com/signin', headers=headers) # get the _xsrf cookie
34 |     resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',
35 |                          headers=headers) # get the capsion_ticket cookie
36 |     need_cap = json.loads(resp2.text)["show_captcha"] # {"show_captcha":false} means no captcha is required
37 |
38 | grantType = 'password'
39 | clientId = 'c3cef7c66a1843f8b3a9e6a1e3160e20'
40 | source = 'com.zhihu.web'
41 |     timestamp = str((time.time() * 1000)).split('.')[0] # the signature only varies with this timestamp
42 |
43 | captcha_content = sessiona.get('https://www.zhihu.com/captcha.gif?r=%d&type=login' % (time.time() * 1000),
44 | headers=headers).content
45 |
46 | data = {
47 | "client_id": clientId,
48 | "grant_type": grantType,
49 | "timestamp": timestamp,
50 | "source": source,
51 |         "signature": get_signature(grantType, clientId, source, timestamp), # compute the signature
52 | "username": username,
53 | "password": password,
54 | "lang": "cn",
55 |         "captcha": oncaptcha(captcha_content, need_cap), # get the image captcha
56 | "ref_source": "other_",
57 | "utm_source": ""
58 | }
59 |
60 | print("**2**: " + str(data))
61 | print("-" * 50)
62 | resp = sessiona.post('https://www.zhihu.com/api/v3/oauth/sign_in', data, headers=headers).content
63 | print(BeautifulSoup(resp, 'html.parser'))
64 |
65 | print("-" * 50)
66 | return resp
67 |
68 |
69 | if __name__ == "__main__":
70 | sessiona = requests.Session()
71 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0',
72 | 'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'}
73 |
74 |     login('fendushu@163.com', 'ZHANG2338', get_captcha, sessiona, headers) # replace with your own username and password
75 |     resp = sessiona.get('https://www.zhihu.com/inbox', headers=headers) # logged in; private messages are now readable
76 | print(BeautifulSoup(resp.content, 'html.parser'))
77 |
78 | ### chcp 65001 (switches the Windows cmd code page to UTF-8)
79 | ### python c:\python34\login_zhihu.py
80 | ### without this the output is garbled in a baffling way and it looks as if the code had no effect
--------------------------------------------------------------------------------
/zhihu/denglu.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jim-bin/Python-spider/511e2679925725f8e0a3e003bb0c9247faf73f4a/zhihu/denglu.py
--------------------------------------------------------------------------------