├── .gitignore
├── README.md
└── instagram_search.py


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | venv/
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # InstagramSearch
2 | A Python library for searching the most recent uses of a hashtag via Instagram's search engine.
3 | 
4 | The post about this can be found at http://tomkdickinson.co.uk/2016/12/extracting-instagram-data-part-1/
5 | 


--------------------------------------------------------------------------------
/instagram_search.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import logging as log
  3 | import re
  4 | import sys
  5 | from abc import ABCMeta, abstractmethod
  6 | from json import JSONDecodeError
  7 | 
  8 | import bs4
  9 | import requests
 10 | 
 11 | 
 12 | class InstagramUser:
 13 |     def __init__(self, user_id, username=None, bio=None, followers_count=None, following_count=None, is_private=False):
 14 |         """
 15 |         A class to represent an Instagram User
 16 | 
 17 |         :param user_id: User ID of instagram user
 18 |         :param username: Username of Instagram user
 19 |         :param bio: Bio text for user
 20 |         :param followers_count: Number of followers
 21 |         :param following_count: Number of people following
 22 |         :param is_private: Boolean to indicate if account is private or not
 23 |         """
 24 |         self.id = user_id
 25 |         self.username = username
 26 |         self.bio = bio
 27 |         self.followers_count = followers_count
 28 |         self.following_count = following_count
 29 |         self.is_private = is_private
 30 | 
 31 | 
 32 | class InstagramPost:
 33 |     def __init__(self, post_id, code, user=None, caption="", display_src=None, is_video=False, created_at=None):
 34 |         """
 35 |         A class to represent a post on Instagram
 36 |         :param post_id: ID of the post
 37 |         :param code: Code of the post
 38 |         :param user: A user object representing the owner of the post
 39 |         :param caption: The caption/text of the post
 40 |         :param display_src: The URL of the image of the post
 41 |         :param is_video: A boolean value indicating it's a video
 42 |         :param created_at: The time it was created
 43 |         """
 44 |         self.post_id = post_id
 45 |         self.code = code
 46 |         self.caption = caption
 47 |         self.user = user
 48 |         self.display_src = display_src
 49 |         self.is_video = is_video
 50 |         self.created_at = created_at
 51 | 
 52 |     def processed_text(self):
 53 |         """
 54 |         Processes a caption to remove newlines in it.
 55 |         :return:
 56 |         """
 57 |         if self.caption is None:
 58 |             return ""
 59 |         else:
 60 |             text = re.sub('[\n\r]', ' ', self.caption)
 61 |             return text
 62 | 
 63 |     def hashtags(self):
 64 |         """
 65 |         Simple hashtag extractor to return the hastags in the post
 66 |         :return:
 67 |         """
 68 |         hashtags = []
 69 |         if self.caption is None:
 70 |             return hashtags
 71 |         else:
 72 |             for tag in re.findall("#[a-zA-Z0-9]+", self.caption):
 73 |                 hashtags.append(tag)
 74 |             return hashtags
 75 | 
 76 | 
 77 | class HashTagSearch(metaclass=ABCMeta):
 78 |     instagram_root = "https://www.instagram.com"
 79 | 
 80 |     def __init__(self, ):
 81 |         """
 82 |         This class performs a search on Instagrams hashtag search engine, and extracts posts for that given hashtag.
 83 | 
 84 |         There are some limitations, as this does not extract all occurrences of the hash tag.
 85 | 
 86 |         Instead, it extracts the most recent uses of the tag.
 87 |         """
 88 |         super().__init__()
 89 | 
 90 |     def extract_recent_tag(self, tag):
 91 |         """
 92 |         Extracts Instagram posts for a given hashtag
 93 |         :param tag: Hashtag to extract
 94 |         """
 95 | 
 96 |         url_string = "https://www.instagram.com/explore/tags/%s/" % tag
 97 |         response = bs4.BeautifulSoup(requests.get(url_string).text, "html.parser")
 98 |         potential_query_ids = self.get_query_id(response)
 99 |         shared_data = self.extract_shared_data(response)
100 | 
101 |         media = shared_data['entry_data']['TagPage'][0]['graphql']['hashtag']['edge_hashtag_to_media']['edges']
102 | 
103 |         posts = []
104 |         for node in media:
105 |             post = self.extract_recent_instagram_post(node['node'])
106 |             posts.append(post)
107 |         self.save_results(posts)
108 | 
109 |         end_cursor = shared_data['entry_data']['TagPage'][0]['graphql']['hashtag']['edge_hashtag_to_media']['page_info']['end_cursor']
110 | 
111 |         # figure out valid queryId
112 |         success = False
113 |         print(potential_query_ids)
114 |         for potential_id in potential_query_ids:
115 |             variables = {
116 |                 'tag_name': tag,
117 |                 'first': 4,
118 |                 'after': end_cursor
119 |             }
120 |             url = "https://www.instagram.com/graphql/query/?query_hash=%s&variables=%s" % (potential_id, json.dumps(variables))
121 |             try:
122 |                 data = requests.get(url).json()
123 |                 if data['status'] == 'fail':
124 |                     # empty response, skip
125 |                     continue
126 |                 query_id = potential_id
127 |                 success = True
128 |                 break
129 |             except JSONDecodeError as de:
130 |                 # no valid JSON retured, most likely wrong query_id resulting in 'Oops, an error occurred.'
131 |                 pass
132 |         if not success:
133 |             log.error("Error extracting Query Id, exiting")
134 |             sys.exit(1)
135 | 
136 |         while end_cursor is not None:
137 |             url = "https://www.instagram.com/graphql/query/?query_hash=%s&tag_name=%s&first=12&after=%s" % (
138 |                 query_id, tag, end_cursor)
139 |             data = json.loads(requests.get(url).text)
140 |             end_cursor = data['data']['hashtag']['edge_hashtag_to_media']['page_info']['end_cursor']
141 |             posts = []
142 |             for node in data['data']['hashtag']['edge_hashtag_to_media']['edges']:
143 |                 posts.append(self.extract_recent_query_instagram_post(node['node']))
144 |             self.save_results(posts)
145 | 
146 |     @staticmethod
147 |     def extract_shared_data(doc):
148 |         for script_tag in doc.find_all("script"):
149 |             if script_tag.text.startswith("window._sharedData ="):
150 |                 shared_data = re.sub("^window\._sharedData = ", "", script_tag.text)
151 |                 shared_data = re.sub(";$", "", shared_data)
152 |                 shared_data = json.loads(shared_data)
153 |                 return shared_data
154 | 
155 |     @staticmethod
156 |     def extract_recent_instagram_post(node):
157 |         return InstagramPost(
158 |             post_id=node['id'],
159 |             code=node['shortcode'],
160 |             user=InstagramUser(user_id=node['owner']['id']),
161 |             caption=HashTagSearch.extract_caption(node),
162 |             display_src=node['display_url'],
163 |             is_video=node['is_video'],
164 |             created_at=node['taken_at_timestamp']
165 |         )
166 | 
167 |     @staticmethod
168 |     def extract_recent_query_instagram_post(node):
169 |         return InstagramPost(
170 |             post_id=node['id'],
171 |             code=node['shortcode'],
172 |             user=InstagramUser(user_id=node['owner']['id']),
173 |             caption=HashTagSearch.extract_caption(node),
174 |             display_src=node['display_url'],
175 |             is_video=node['is_video'],
176 |             created_at=node['taken_at_timestamp']
177 |         )
178 | 
179 |     @staticmethod
180 |     def extract_caption(node):
181 |         if len(node['edge_media_to_caption']['edges']) > 0:
182 |             return node['edge_media_to_caption']['edges'][0]['node']['text']
183 |         else:
184 |             return None
185 | 
186 |     @staticmethod
187 |     def extract_owner_details(owner):
188 |         """
189 |         Extracts the details of a user object.
190 |         :param owner: Instagrams JSON user object
191 |         :return: An Instagram User object
192 |         """
193 |         username = None
194 |         if "username" in owner:
195 |             username = owner["username"]
196 |         is_private = False
197 |         if "is_private" in owner:
198 |             is_private = is_private
199 |         user = InstagramUser(owner['id'], username=username, is_private=is_private)
200 |         return user
201 | 
202 |     def get_query_id(self, doc):
203 |         query_ids = []
204 |         for script in doc.find_all("script"):
205 |             if script.has_attr("src"):
206 |                 text = requests.get("%s%s" % (self.instagram_root, script['src'])).text
207 |                 if "queryId" in text:
208 |                     for query_id in re.findall("(?<=queryId:\")[0-9A-Za-z]+", text):
209 |                         query_ids.append(query_id)
210 |         print(query_ids)
211 |         return query_ids
212 | 
213 |     @abstractmethod
214 |     def save_results(self, instagram_results):
215 |         """
216 |         Implement yourself to work out what to do with each extract batch of posts
217 |         :param instagram_results: A list of Instagram Posts
218 |         """
219 | 
220 | 
class HashTagSearchExample(HashTagSearch):
    """
    Example search that prints the caption of every extracted post, prefixed by a running counter.
    """

    def __init__(self):
        super().__init__()
        # Number of posts printed so far, across all batches.
        self.total_posts = 0

    def save_results(self, instagram_results):
        """
        Prints one line per post in the batch.
        :param instagram_results: A list of Instagram Posts
        """
        super().save_results(instagram_results)
        for post in instagram_results:
            self.total_posts = self.total_posts + 1
            line = "%i - %s" % (self.total_posts, post.processed_text())
            print(line)
231 | 
232 | 
if __name__ == '__main__':
    # Script entry point: enable INFO logging, then run the example search for one hashtag.
    log.basicConfig(level=log.INFO)
    example_search = HashTagSearchExample()
    example_search.extract_recent_tag("christmas")
236 | 


--------------------------------------------------------------------------------