├── .gitignore
├── README.md
└── spider.py


/.gitignore:
--------------------------------------------------------------------------------
/.idea

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Jiepai

Jiepai Pictures of Toutiao

## Update 2018/8/21

Toutiao's search API has changed and the spider has been updated accordingly. Thanks to @Switch-vov (https://github.com/Python3WebSpider/Jiepai/issues/5) for the contribution.

## Update 2019/2/18

Thanks to @zhengbeiandy and @joeytai (https://github.com/Python3WebSpider/Jiepai/issues/13) for the contribution.

## Update 2019/7/29

Thanks to @Anodsaber (https://github.com/Python3WebSpider/Jiepai/issues/21) for the contribution.

## Update 2020/2/10

Thanks to @siuszy (https://github.com/Python3WebSpider/Jiepai/issues/25) for the contribution.

## Update 2020/2/28

Thanks to @A1bertY (https://github.com/Python3WebSpider/Jiepai/issues/25) for the contribution.

--------------------------------------------------------------------------------
/spider.py:
--------------------------------------------------------------------------------
import os
import re
from hashlib import md5

import requests
from selenium import webdriver


def get_cookies(url):
    """Open the page in headless Chrome and return its cookies as one header string."""
    cookie_str = ''
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    browser = webdriver.Chrome(options=options)
    browser.get(url)
    for cookie in browser.get_cookies():
        name = cookie.get('name')
        value = cookie.get('value')
        if name and value:
            cookie_str += name + '=' + value + ';'
    browser.quit()
    return cookie_str


def get_page(offset):
    """Request one page of search results for the keyword 街拍 and return the JSON."""
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = 'https://www.toutiao.com/api/search/content/'
    try:
        r = requests.get(url, params=params, headers=headers)
        if r.status_code == 200:
            return r.json()
        print('get_page: unexpected status code', r.status_code)
    except requests.ConnectionError:
        return None


def get_images(json):
    """Yield {'title', 'image'} dicts for every picture found in one page of results."""
    data = json.get('data')
    if not data:
        return
    for item in data:
        if not item.get('title'):
            continue
        title = re.sub('[\t]', '', item.get('title'))
        url = item.get('article_url')
        if url:
            # Article result: fetch the article page and pull the image list
            # out of the embedded JSON.parse("...") block.
            r = requests.get(url, headers=headers)
            if r.status_code == 200:
                images_pattern = re.compile(r'JSON.parse\("(.*?)"\),\n', re.S)
                result = re.search(images_pattern, r.text)
                if result:
                    b_url = 'http://p3.pstatp.com/origin/pgc-image/'
                    up = re.compile(r'url(.*?)"width', re.S)
                    results = re.findall(up, result.group(1))
                    for img in results:
                        yield {
                            'title': title,
                            'image': b_url + re.search(r'F([^F]*)\\",', img).group(1),
                        }
        else:
            # Gallery result: image URLs are listed directly in image_list.
            images = item.get('image_list') or []
            for image in images:
                # Rewrite to large/pgc-image; use origin/pgc-image instead
                # to get the original-size picture.
                origin_image = re.sub('list.*?pgc-image', 'large/pgc-image', image.get('url'))
                yield {
                    'image': origin_image,
                    'title': title,
                }


def save_image(item):
    """Download one image into img/<title>/, named by the MD5 of its content."""
    img_path = 'img' + os.path.sep + item.get('title')
    if not os.path.exists(img_path):
        os.makedirs(img_path)  # create the folder for this title
    try:
        resp = requests.get(item.get('image'))
        if resp.status_code == requests.codes.ok:
            # Hashing the content gives a stable file name and skips duplicates.
            file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format(
                file_name=md5(resp.content).hexdigest(),
                file_suffix='jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(resp.content)
                print('Downloaded image to %s' % file_path)
            else:
                print('Already downloaded', file_path)
    except Exception as e:
        print('save_image error:', e)


def main(offset):
    page_json = get_page(offset)
    if not page_json:
        return
    for item in get_images(page_json):
        save_image(item)


cookies = get_cookies('https://www.toutiao.com')
headers = {
    'cookie': cookies,
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
    'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
}

if __name__ == '__main__':
    # p.map(main, [0])  # Pool multiprocessing is not used because there is
    # currently no easy way to share the cookies across processes.
    # map(main, [x * 20 for x in range(3)]) produced no output: map() is lazy
    # in Python 3, so the calls never run unless the iterator is consumed.
    for offset in [x * 20 for x in range(3)]:
        main(offset)
--------------------------------------------------------------------------------
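
A note on running the spider as published: it needs requests and selenium, plus a Chrome installation that Selenium can drive (older Selenium releases also expect a matching chromedriver on the PATH). `python spider.py` then fetches three pages of 街拍 search results at offsets 0, 20 and 40 and saves each picture to img/<title>/<md5 of the file contents>.jpg. As the README's update log shows, Toutiao's endpoints and the image host have changed repeatedly, so the parameters in get_page and the regexes in get_images may need adjusting.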
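The commented-out `p.map(main, [0])` line says multiprocessing.Pool was avoided because the cookies could not be shared across processes. One common way around that is Pool's initializer hook, which lets the parent fetch the cookie string once and hand it to each worker. The sketch below only illustrates that pattern; `init_worker` and `crawl_offset` are made-up stand-ins for wiring the cookie into spider.main, not code from this repository.

# Sketch only: pass a pre-fetched cookie string to Pool workers via the
# initializer hook. crawl_offset stands in for spider.main(offset).
from multiprocessing import Pool

HEADERS = {}  # per-process header store, filled in by init_worker


def init_worker(cookie_str):
    # Runs once in every worker process; copies the cookie string fetched
    # by the parent (e.g. via get_cookies) into this process's headers.
    HEADERS['cookie'] = cookie_str


def crawl_offset(offset):
    # A real worker would call requests.get(..., headers=HEADERS) here,
    # the same way get_page and get_images use the module-level headers.
    return offset, HEADERS.get('cookie')


if __name__ == '__main__':
    cookie_str = 'tt_webid=example;'  # placeholder for get_cookies('https://www.toutiao.com')
    with Pool(processes=3, initializer=init_worker, initargs=(cookie_str,)) as pool:
        print(pool.map(crawl_offset, [x * 20 for x in range(3)]))

Because the cookie is captured once in the parent and copied into each worker at start-up, Selenium only has to launch Chrome a single time no matter how many processes crawl in parallel.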