├── .gitignore
├── README.md
└── spider.py


/.gitignore:
--------------------------------------------------------------------------------
/.idea

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Jiepai

Jiepai Pictures of Toutiao

## Update 2018/8/21

Toutiao's search API has changed and the spider has been updated accordingly. Thanks to @Switch-vov (https://github.com/Python3WebSpider/Jiepai/issues/5) for the contribution.

## Update 2019/2/18

Thanks to @zhengbeiandy and @joeytai (https://github.com/Python3WebSpider/Jiepai/issues/13) for the contribution.

## Update 2019/7/29

Thanks to @Anodsaber (https://github.com/Python3WebSpider/Jiepai/issues/21) for the contribution.

## Update 2020/2/10

Thanks to @siuszy (https://github.com/Python3WebSpider/Jiepai/issues/25) for the contribution.

## Update 2020/2/28

Thanks to @A1bertY (https://github.com/Python3WebSpider/Jiepai/issues/25) for the contribution.

--------------------------------------------------------------------------------
/spider.py:
--------------------------------------------------------------------------------
import os
import re
from hashlib import md5

import requests
from selenium import webdriver


def get_cookies(url):
    """Open the page in headless Chrome and return its cookies as one header string."""
    cookie_str = ''
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    browser = webdriver.Chrome(options=options)
    browser.get(url)
    for cookie in browser.get_cookies():
        name = cookie.get('name')
        value = cookie.get('value')
        if name and value:
            cookie_str += name + '=' + value + ';'
    browser.quit()
    return cookie_str


def get_page(offset):
    """Request one page of search results for the keyword 街拍 and return the JSON."""
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = 'https://www.toutiao.com/api/search/content/'
    try:
        r = requests.get(url, params=params, headers=headers)
        if r.status_code == 200:
            return r.json()
        print('get_page: unexpected status code', r.status_code)
    except requests.ConnectionError:
        return None


def get_images(json):
    """Yield {'title', 'image'} dicts for every picture found in one page of results."""
    data = json.get('data')
    if not data:
        return
    for item in data:
        if not item.get('title'):
            continue
        title = re.sub('[\t]', '', item.get('title'))
        url = item.get('article_url')
        if url:
            # Article result: fetch the article page and pull the image list
            # out of the embedded JSON.parse("...") block.
            r = requests.get(url, headers=headers)
            if r.status_code == 200:
                images_pattern = re.compile(r'JSON.parse\("(.*?)"\),\n', re.S)
                result = re.search(images_pattern, r.text)
                if result:
                    b_url = 'http://p3.pstatp.com/origin/pgc-image/'
                    up = re.compile(r'url(.*?)"width', re.S)
                    results = re.findall(up, result.group(1))
                    for img in results:
                        yield {
                            'title': title,
                            'image': b_url + re.search(r'F([^F]*)\\",', img).group(1),
                        }
        else:
            # Gallery result: image URLs are listed directly in image_list.
            images = item.get('image_list') or []
            for image in images:
                # Rewrite to large/pgc-image; use origin/pgc-image instead
                # to get the original-size picture.
                origin_image = re.sub('list.*?pgc-image', 'large/pgc-image', image.get('url'))
                yield {
                    'image': origin_image,
                    'title': title,
                }


def save_image(item):
    """Download one image into img/<title>/, named by the MD5 of its content."""
    img_path = 'img' + os.path.sep + item.get('title')
    if not os.path.exists(img_path):
        os.makedirs(img_path)  # create the folder for this title
    try:
        resp = requests.get(item.get('image'))
        if resp.status_code == requests.codes.ok:
            # Hashing the content gives a stable file name and skips duplicates.
            file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format(
                file_name=md5(resp.content).hexdigest(),
                file_suffix='jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(resp.content)
                print('Downloaded image to %s' % file_path)
            else:
                print('Already downloaded', file_path)
    except Exception as e:
        print('save_image error:', e)


def main(offset):
    page_json = get_page(offset)
    if not page_json:
        return
    for item in get_images(page_json):
        save_image(item)


cookies = get_cookies('https://www.toutiao.com')
headers = {
    'cookie': cookies,
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
    'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
}

if __name__ == '__main__':
    # p.map(main, [0])  # Pool multiprocessing is not used because there is
    # currently no easy way to share the cookies across processes.
    # map(main, [x * 20 for x in range(3)]) produced no output: map() is lazy
    # in Python 3, so the calls never run unless the iterator is consumed.
    for offset in [x * 20 for x in range(3)]:
        main(offset)
--------------------------------------------------------------------------------
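
A note on running the spider as published: it needs requests and selenium, plus a Chrome installation that Selenium can drive (older Selenium releases also expect a matching chromedriver on the PATH). `python spider.py` then fetches three pages of 街拍 search results at offsets 0, 20 and 40 and saves each picture to img/<title>/<md5 of the file contents>.jpg. As the README's update log shows, Toutiao's endpoints and the image host have changed repeatedly, so the parameters in get_page and the regexes in get_images may need adjusting.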
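The commented-out `p.map(main, [0])` line says multiprocessing.Pool was avoided because the cookies could not be shared across processes. One common way around that is Pool's initializer hook, which lets the parent fetch the cookie string once and hand it to each worker. The sketch below only illustrates that pattern; `init_worker` and `crawl_offset` are made-up stand-ins for wiring the cookie into spider.main, not code from this repository.

# Sketch only: pass a pre-fetched cookie string to Pool workers via the
# initializer hook. crawl_offset stands in for spider.main(offset).
from multiprocessing import Pool

HEADERS = {}  # per-process header store, filled in by init_worker


def init_worker(cookie_str):
    # Runs once in every worker process; copies the cookie string fetched
    # by the parent (e.g. via get_cookies) into this process's headers.
    HEADERS['cookie'] = cookie_str


def crawl_offset(offset):
    # A real worker would call requests.get(..., headers=HEADERS) here,
    # the same way get_page and get_images use the module-level headers.
    return offset, HEADERS.get('cookie')


if __name__ == '__main__':
    cookie_str = 'tt_webid=example;'  # placeholder for get_cookies('https://www.toutiao.com')
    with Pool(processes=3, initializer=init_worker, initargs=(cookie_str,)) as pool:
        print(pool.map(crawl_offset, [x * 20 for x in range(3)]))

Because the cookie is captured once in the parent and copied into each worker at start-up, Selenium only has to launch Chrome a single time no matter how many processes crawl in parallel.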