"""InstagramSearch: fetch the most recent posts for a hashtag via Instagram's search engine.

The write-up about this library can be found at
http://tomkdickinson.co.uk/2016/12/extracting-instagram-data-part-1/

This listing was flattened from a repository dump that contained:

    .gitignore            (ignores ``.idea/`` and ``venv/``)
    README.md             (the description and link above)
    instagram_search.py   (this module)
"""
import json
import logging as log
import re
import sys
from abc import ABCMeta, abstractmethod
from json import JSONDecodeError  # noqa: F401 - kept so the module's importable names are unchanged
from urllib.parse import quote, urljoin


class InstagramUser:
    def __init__(self, user_id, username=None, bio=None, followers_count=None, following_count=None, is_private=False):
        """
        A class to represent an Instagram User

        :param user_id: User ID of instagram user
        :param username: Username of Instagram user
        :param bio: Bio text for user
        :param followers_count: Number of followers
        :param following_count: Number of people following
        :param is_private: Boolean to indicate if account is private or not
        """
        self.id = user_id
        self.username = username
        self.bio = bio
        self.followers_count = followers_count
        self.following_count = following_count
        self.is_private = is_private

    def __repr__(self):
        return "%s(user_id=%r, username=%r)" % (type(self).__name__, self.id, self.username)


class InstagramPost:
    def __init__(self, post_id, code, user=None, caption="", display_src=None, is_video=False, created_at=None):
        """
        A class to represent a post on Instagram

        :param post_id: ID of the post
        :param code: Code of the post
        :param user: A user object representing the owner of the post
        :param caption: The caption/text of the post
        :param display_src: The URL of the image of the post
        :param is_video: A boolean value indicating it's a video
        :param created_at: The time it was created
        """
        self.post_id = post_id
        self.code = code
        self.caption = caption
        self.user = user
        self.display_src = display_src
        self.is_video = is_video
        self.created_at = created_at

    def __repr__(self):
        return "%s(post_id=%r, code=%r)" % (type(self).__name__, self.post_id, self.code)

    def processed_text(self):
        """
        Returns the caption with every newline / carriage return replaced by a space.

        :return: The single-line caption, or an empty string when there is no caption
        """
        if self.caption is None:
            return ""
        return re.sub(r"[\n\r]", " ", self.caption)

    def hashtags(self):
        """
        Simple hashtag extractor to return the hashtags in the post.

        ``\\w`` is used instead of ``[a-zA-Z0-9]`` so that tags containing
        underscores or non-ASCII letters are returned whole rather than truncated.

        :return: A list of hashtags (each including the leading ``#``), in caption order
        """
        if self.caption is None:
            return []
        return re.findall(r"#\w+", self.caption)


class HashTagSearch(metaclass=ABCMeta):
    instagram_root = "https://www.instagram.com"
    # Seconds to wait on each HTTP request; ``None`` disables the timeout.
    request_timeout = 30

    def __init__(self):
        """
        This class performs a search on Instagram's hashtag search engine, and extracts posts for that given hashtag.

        There are some limitations, as this does not extract all occurrences of the hash tag.

        Instead, it extracts the most recent uses of the tag.
        """
        super().__init__()

    def extract_recent_tag(self, tag):
        """
        Extracts Instagram posts for a given hashtag, handing each batch to ``save_results``.

        The first batch comes from the JSON embedded in the tag page; further batches are
        paged through the GraphQL endpoint until no ``end_cursor`` is returned.

        :param tag: Hashtag to extract (without the leading ``#``)
        :raises ValueError: If the tag page does not contain the embedded shared-data JSON
        :raises SystemExit: If none of the candidate query ids is accepted by the endpoint
        """
        # Imported here rather than at module level so the plain data classes and
        # parsing helpers stay importable without the scraping dependencies installed.
        import bs4

        url_string = "%s/explore/tags/%s/" % (self.instagram_root, quote(tag, safe=""))
        response = bs4.BeautifulSoup(self._http_get(url_string).text, "html.parser")

        shared_data = self.extract_shared_data(response)
        if shared_data is None:
            raise ValueError("No window._sharedData script found on the tag page for %r" % tag)

        tag_media = shared_data['entry_data']['TagPage'][0]['graphql']['hashtag']['edge_hashtag_to_media']
        self.save_results([self.extract_recent_instagram_post(edge['node']) for edge in tag_media['edges']])

        end_cursor = tag_media['page_info']['end_cursor']
        if end_cursor is None:
            # Single page of results: nothing to paginate, so there is no need to
            # probe for a query id (which could otherwise abort a finished run).
            return

        query_id = self._find_valid_query_id(self.get_query_id(response), tag, end_cursor)
        if query_id is None:
            log.error("Error extracting Query Id, exiting")
            sys.exit(1)

        while end_cursor is not None:
            data = json.loads(self._fetch_tag_page(query_id, tag, 12, end_cursor).text)
            page_media = data['data']['hashtag']['edge_hashtag_to_media']
            end_cursor = page_media['page_info']['end_cursor']
            self.save_results(
                [self.extract_recent_query_instagram_post(edge['node']) for edge in page_media['edges']]
            )

    def _http_get(self, url, params=None):
        """Issue a GET request with the class-wide timeout and return the response."""
        # Lazy import: see the note in extract_recent_tag.
        import requests

        return requests.get(url, params=params, timeout=self.request_timeout)

    def _fetch_tag_page(self, query_hash, tag, first, after):
        """Request one page of hashtag media from the GraphQL endpoint."""
        variables = {
            'tag_name': tag,
            'first': first,
            'after': after
        }
        # Passing ``params`` lets requests URL-encode the JSON blob.
        return self._http_get(
            "%s/graphql/query/" % self.instagram_root,
            params={'query_hash': query_hash, 'variables': json.dumps(variables)},
        )

    def _find_valid_query_id(self, candidates, tag, end_cursor):
        """Return the first candidate query id the endpoint accepts, or None if none works."""
        for candidate in candidates:
            try:
                data = self._fetch_tag_page(candidate, tag, 4, end_cursor).json()
            except ValueError:
                # No valid JSON returned, most likely a wrong query id resulting in
                # 'Oops, an error occurred.'  Every JSON decode error is a ValueError.
                log.debug("Query id %s did not return JSON, skipping", candidate)
                continue
            # NOTE(review): a payload without a 'status' field is treated as a failed
            # probe rather than raising KeyError -- confirm against real responses.
            if not isinstance(data, dict) or data.get('status', 'fail') == 'fail':
                continue
            return candidate
        return None

    @staticmethod
    def extract_shared_data(doc):
        """
        Finds the script whose text starts with ``window._sharedData =`` and parses its JSON.

        :param doc: A parsed document exposing ``find_all("script")``
        :return: The decoded JSON object, or None when no such script is present
        """
        prefix = "window._sharedData ="
        for script_tag in doc.find_all("script"):
            text = script_tag.text.strip()
            if text.startswith(prefix):
                payload = text[len(prefix):].strip()
                if payload.endswith(";"):
                    payload = payload[:-1]
                return json.loads(payload)
        return None

    @staticmethod
    def extract_recent_instagram_post(node):
        """
        Builds an InstagramPost from a media node of the tag page.

        :param node: Instagram's JSON media node
        :return: An InstagramPost whose user only carries the owner id
        """
        return InstagramPost(
            post_id=node['id'],
            code=node['shortcode'],
            user=InstagramUser(user_id=node['owner']['id']),
            caption=HashTagSearch.extract_caption(node),
            display_src=node['display_url'],
            is_video=node['is_video'],
            created_at=node['taken_at_timestamp']
        )

    @staticmethod
    def extract_recent_query_instagram_post(node):
        """
        Builds an InstagramPost from a media node of a GraphQL query page.

        The node is read exactly like a tag-page node, so this delegates to
        ``extract_recent_instagram_post``.

        :param node: Instagram's JSON media node
        :return: An InstagramPost
        """
        return HashTagSearch.extract_recent_instagram_post(node)

    @staticmethod
    def extract_caption(node):
        """
        Returns the text of the first caption edge of a media node.

        :param node: Instagram's JSON media node
        :return: The caption text, or None when the node has no caption edges
        """
        caption_edges = node['edge_media_to_caption']['edges']
        if caption_edges:
            return caption_edges[0]['node']['text']
        return None

    @staticmethod
    def extract_owner_details(owner):
        """
        Extracts the details of a user object.

        :param owner: Instagram's JSON user object
        :return: An Instagram User object
        """
        return InstagramUser(
            owner['id'],
            username=owner.get("username"),
            # Read the flag from the payload; defaults to False when absent.
            is_private=owner.get("is_private", False),
        )

    def get_query_id(self, doc):
        """
        Collects candidate query ids from every external script referenced by the page.

        :param doc: A parsed document exposing ``find_all("script")``
        :return: A list of candidate query ids, in discovery order
        """
        query_ids = []
        for script in doc.find_all("script"):
            if not script.has_attr("src"):
                continue
            # urljoin copes with relative, absolute and protocol-relative ``src`` values.
            text = self._http_get(urljoin(self.instagram_root, script['src'])).text
            query_ids.extend(re.findall(r'(?<=queryId:")[0-9A-Za-z]+', text))
        log.debug("Candidate query ids: %s", query_ids)
        return query_ids

    @abstractmethod
    def save_results(self, instagram_results):
        """
        Implement yourself to work out what to do with each extracted batch of posts

        :param instagram_results: A list of Instagram Posts
        """


class HashTagSearchExample(HashTagSearch):
    def __init__(self):
        """Example implementation that prints each post and keeps a running count."""
        super().__init__()
        self.total_posts = 0

    def save_results(self, instagram_results):
        """
        Prints one numbered line per post, numbering continuously across batches.

        :param instagram_results: A list of Instagram Posts
        """
        super().save_results(instagram_results)
        for post in instagram_results:
            self.total_posts += 1
            print("%i - %s" % (self.total_posts, post.processed_text()))


if __name__ == '__main__':
    log.basicConfig(level=log.INFO)
    HashTagSearchExample().extract_recent_tag("christmas")