├── .gitattributes
├── README.md
├── instagram.py
└── insthreadpool.py

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Instagram_crawler
A Python crawler that downloads every photo and video posted by an Instagram account.

Usage:
paste your own cookie into the `headers` dict in the code,
adjust the save path for downloaded files if needed,
then run `python instagram.py user_name`  # replace `user_name` with the name of the account you want to crawl

See the [blog post](https://linqingmaoer.cn/?p=130) for details.
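
How it works: `instagram.py` reads the account's numeric `user_id` and its first 12 posts from the `window._sharedData` JSON embedded in the profile page, then pages through the remaining posts via Instagram's GraphQL endpoint, following `end_cursor` and `has_next_page`. The `variables` parameter of the hard-coded query URL decodes to `{"id": "<user_id>", "first": 12, "after": "<end_cursor>"}`. The sketch below shows that pagination loop in isolation (illustrative only: `iter_media_urls` is not a function in this repo, and a valid cookie in `headers` is still required):

```python
import requests

# The same GraphQL endpoint and query_hash that the scripts hard-code;
# variables URL-decodes to {"id": ..., "first": 12, "after": ...}.
URI = ('https://www.instagram.com/graphql/query/'
       '?query_hash=a5164aed103f24b03e7b7747a2d94e3c'
       '&variables=%7B%22id%22%3A%22{user_id}%22%2C%22first%22%3A12'
       '%2C%22after%22%3A%22{cursor}%22%7D')


def iter_media_urls(user_id, cursor, headers):
    """Yield photo/video URLs page by page until has_next_page is False."""
    while True:
        data = requests.get(URI.format(user_id=user_id, cursor=cursor),
                            headers=headers, timeout=10).json()
        media = data['data']['user']['edge_owner_to_timeline_media']
        for edge in media['edges']:
            node = edge['node']
            yield node['video_url'] if node['is_video'] else node['display_url']
        if not media['page_info']['has_next_page']:
            break
        cursor = media['page_info']['end_cursor']
```

`insthreadpool.py` walks the same pages but downloads each page's batch of URLs through a 4-thread pool (`multiprocessing.dummy.Pool`).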
--------------------------------------------------------------------------------
/instagram.py:
--------------------------------------------------------------------------------
import os
import re
import sys
import json
import time
import random
import requests
from hashlib import md5
from pyquery import PyQuery as pq

url_base = 'https://www.instagram.com/'
uri = 'https://www.instagram.com/graphql/query/?query_hash=a5164aed103f24b03e7b7747a2d94e3c&variables=%7B%22id%22%3A%22{user_id}%22%2C%22first%22%3A12%2C%22after%22%3A%22{cursor}%22%7D'


headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'cookie': 'paste your own cookie here'
}


def get_html(url):
    """Fetch the profile page HTML; return None on failure."""
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        print('Failed to fetch page HTML, status code:', response.status_code)
    except Exception as e:
        print(e)
    return None


def get_json(url):
    """Fetch one page of GraphQL results; on failure, back off 60-100 s and retry."""
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
        print('Failed to fetch JSON, status code:', response.status_code)
    except Exception as e:
        print(e)
    # Instagram rate-limits aggressively; wait 60-100 seconds before retrying.
    time.sleep(60 + random.randint(1, 4000) / 100)
    return get_json(url)
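
# An aside on the retry strategy above: get_json retries forever through
# recursion, so a long outage can eventually exhaust Python's recursion
# limit.  A bounded, iterative variant could look like this (illustrative
# sketch only -- it is not called anywhere below, and max_tries is a
# made-up parameter):
def get_json_bounded(url, max_tries=5):
    for _ in range(max_tries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.json()
            print('Failed to fetch JSON, status code:', response.status_code)
        except Exception as e:
            print(e)
        # Same 60-100 s backoff as get_json.
        time.sleep(60 + random.randint(1, 4000) / 100)
    return None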

def get_content(url):
    """Download the raw bytes of a photo or video; return None on failure."""
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.content
        print('Failed to fetch media bytes, status code:', response.status_code)
    except Exception as e:
        print(e)
    return None


def get_urls(html):
    """Collect every media URL for the profile, following GraphQL pagination."""
    urls = []
    user_id = re.findall('"profilePage_([0-9]+)"', html, re.S)[0]
    print('user_id:' + user_id)
    doc = pq(html)
    items = doc('script[type="text/javascript"]').items()
    for item in items:
        if item.text().strip().startswith('window._sharedData'):
            # Strip the leading 'window._sharedData = ' and the trailing ';'.
            js_data = json.loads(item.text()[21:-1])
            media = js_data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']
            edges = media['edges']
            page_info = media['page_info']
            cursor = page_info['end_cursor']
            flag = page_info['has_next_page']
            # First page: _sharedData only exposes display_url, so videos
            # fall back to their cover image here.
            for edge in edges:
                if edge['node']['display_url']:
                    display_url = edge['node']['display_url']
                    print(display_url)
                    urls.append(display_url)
            print(cursor, flag)
            while flag:
                url = uri.format(user_id=user_id, cursor=cursor)
                js_data = get_json(url)
                media = js_data['data']['user']['edge_owner_to_timeline_media']
                infos = media['edges']
                cursor = media['page_info']['end_cursor']
                flag = media['page_info']['has_next_page']
                for info in infos:
                    if info['node']['is_video']:
                        video_url = info['node']['video_url']
                        if video_url:
                            print(video_url)
                            urls.append(video_url)
                    elif info['node']['display_url']:
                        display_url = info['node']['display_url']
                        print(display_url)
                        urls.append(display_url)
                print(cursor, flag)
                # time.sleep(4 + random.randint(1, 800) / 200)  # enable for accounts with over ~2000 posts
    return urls


def main(user):
    url = url_base + user + '/'
    html = get_html(url)
    if html is None:
        print('Could not load the profile page, exiting.')
        return
    urls = get_urls(html)
    dirpath = os.path.join('.', user)
    if not os.path.exists(dirpath):
        os.mkdir(dirpath)
    for i in range(len(urls)):
        print('\nDownloading item {0}: '.format(i) + urls[i], ' {0} left'.format(len(urls) - i - 1))
        try:
            content = get_content(urls[i])
            if content is None:
                print('This photo or video failed to download')
                continue
            # Video URLs carry this marker; everything else is saved as a jpg.
            endw = 'mp4' if r'mp4?_nc_ht=scontent' in urls[i] else 'jpg'
            # Name files by content hash so re-runs skip already-saved media.
            file_path = os.path.join(dirpath, '{0}.{1}'.format(md5(content).hexdigest(), endw))
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(content)
                print('Item {0} saved: '.format(i) + urls[i])
            else:
                print('Item {0} was already downloaded'.format(i))
        except Exception as e:
            print(e)
            print('This photo or video failed to download')


if __name__ == '__main__':
    user_name = sys.argv[1]
    start = time.time()
    main(user_name)
    print('Complete!!!!!!!!!!')
    spend = time.time() - start
    hour = int(spend // 3600)
    minu = int(spend % 3600 // 60)
    sec = spend % 60
    print(f'Total time: {hour} h {minu} min {sec:.0f} s')
--------------------------------------------------------------------------------
/insthreadpool.py:
--------------------------------------------------------------------------------
import os
import re
import sys
import json
import time
import random
import requests
from hashlib import md5
from pyquery import PyQuery as pq
from multiprocessing.dummy import Pool

url_base = 'https://www.instagram.com/'
uri = 'https://www.instagram.com/graphql/query/?query_hash=a5164aed103f24b03e7b7747a2d94e3c&variables=%7B%22id%22%3A%22{user_id}%22%2C%22first%22%3A12%2C%22after%22%3A%22{cursor}%22%7D'


headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'cookie': 'paste your own cookie here'
}


def get_html(url):
    """Fetch the profile page HTML; return None on failure."""
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        print('Failed to fetch page HTML, status code:', response.status_code)
    except Exception as e:
        print(e)
    return None


def get_json(url):
    """Fetch one page of GraphQL results; on failure, back off 60-100 s and retry."""
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
        print('Failed to fetch JSON, status code:', response.status_code)
    except Exception as e:
        print(e)
    # Instagram rate-limits aggressively; wait 60-100 seconds before retrying.
    time.sleep(60 + random.randint(1, 4000) / 100)
    return get_json(url)


def get_content(url):
    """Download the raw bytes of a photo or video; return None on failure."""
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.content
        print('Failed to fetch media bytes, status code:', response.status_code)
    except Exception as e:
        print(e)
    return None


def get_urls(html):
    """Yield media URLs one page (up to 12 posts) at a time."""
    urls = []
    user_id = re.findall('"profilePage_([0-9]+)"', html, re.S)[0]
    print('user_id:' + user_id)
    doc = pq(html)
    items = doc('script[type="text/javascript"]').items()
    for item in items:
        if item.text().strip().startswith('window._sharedData'):
            # Strip the leading 'window._sharedData = ' and the trailing ';'.
            js_data = json.loads(item.text()[21:-1])
            media = js_data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']
            edges = media['edges']
            page_info = media['page_info']
            cursor = page_info['end_cursor']
            flag = page_info['has_next_page']
            # First page: _sharedData only exposes display_url, so videos
            # fall back to their cover image here.
            for edge in edges:
                if edge['node']['display_url']:
                    display_url = edge['node']['display_url']
                    print(display_url)
                    urls.append(display_url)
            yield urls
            print(cursor, flag)
            while flag:
                urls = []
                url = uri.format(user_id=user_id, cursor=cursor)
                js_data = get_json(url)
                media = js_data['data']['user']['edge_owner_to_timeline_media']
                infos = media['edges']
                cursor = media['page_info']['end_cursor']
                flag = media['page_info']['has_next_page']
                for info in infos:
                    if info['node']['is_video']:
                        video_url = info['node']['video_url']
                        if video_url:
                            print(video_url)
                            urls.append(video_url)
                    elif info['node']['display_url']:
                        display_url = info['node']['display_url']
                        print(display_url)
                        urls.append(display_url)
                yield urls
                print(cursor, flag)
                # time.sleep(4 + random.randint(1, 800) / 200)  # enable for accounts with over ~2000 posts


def main(user):
    url = url_base + user + '/'
    html = get_html(url)
    if html is None:
        print('Could not load the profile page, exiting.')
        return
    dirpath = os.path.join('.', user)
    if not os.path.exists(dirpath):
        os.mkdir(dirpath)
    for urls in get_urls(html):
        try:
            # Download this page's batch of URLs with 4 worker threads.
            pool = Pool(4)
            contents = pool.map(get_content, urls)
            pool.close()
            pool.join()
            for i, content in enumerate(contents):
                if content is None:
                    print('Download failed:', urls[i])
                    continue
                # Video URLs carry this marker; everything else is saved as a jpg.
                endw = 'mp4' if r'mp4?_nc_ht=scontent' in urls[i] else 'jpg'
                # Name files by content hash so re-runs skip already-saved media.
                file_path = os.path.join(dirpath, '{0}.{1}'.format(md5(content).hexdigest(), endw))
                if not os.path.exists(file_path):
                    with open(file_path, 'wb') as f:
                        f.write(content)
                    print('Saved:', urls[i])
                else:
                    print('Item {0} was already downloaded'.format(i))
        except Exception as e:
            print(e)
            print('This batch of photos/videos failed to download')


if __name__ == '__main__':
    user_name = sys.argv[1]
    start = time.time()
    main(user_name)
    print('Complete!!!!!!!!!!')
    spend = time.time() - start
    hour = int(spend // 3600)
    minu = int(spend % 3600 // 60)
    sec = spend % 60
    print(f'Total time: {hour} h {minu} min {sec:.0f} s')
--------------------------------------------------------------------------------