"""Generate a JSON Feed for an Instagram user's profile page.

Installation::

    pip install -r requirements.txt

Usage::

    python fetch.py <username>

The username is taken from the last command-line argument. The profile page
is fetched and rendered with requests_html (https://html.python-requests.org)
and the resulting feed is printed to stdout as JSON.
"""
import json
import sys


class InstagramFeedError(RuntimeError):
    """Raised when the profile page cannot be fetched."""


class InstagramFeed:
    """Build a JSON Feed (version 1) dictionary from an Instagram profile page.

    Constructing an instance immediately fetches and renders the profile page
    for ``username``; the ``get_*`` methods then read from ``self.html``.
    """

    def __init__(self, username):
        """Store *username* and fetch its rendered profile page.

        Raises:
            InstagramFeedError: if the profile page does not answer HTTP 200.
        """
        self.username = username
        self.html = self.fetch_html()

    def generate(self):
        """Return the complete feed as a JSON-serialisable dict."""
        home = self.get_homepage_url()
        return {
            'version': 'https://jsonfeed.org/version/1',
            'title': self.get_title(),
            # JSON Feed names this field ``home_page_url``; ``url`` is kept
            # as well so existing consumers of this output keep working.
            'home_page_url': home,
            'url': home,
            'author': self.get_author_block(),
            'items': list(self.get_photos()),
        }

    def fetch_html(self):
        """Fetch the profile page, render its JavaScript and return the HTML.

        Returns:
            The rendered ``response.html`` object from requests_html.

        Raises:
            InstagramFeedError: if the response status is not 200. (The
                previous behaviour was to return ``None``, which only
                surfaced later as an ``AttributeError``.)
        """
        # Imported here rather than at module level so that the parsing
        # helpers can be imported without the headless-browser dependencies.
        from requests_html import HTMLSession

        url = self.get_homepage_url()

        # The context manager closes the session (and any browser that
        # render() started) even when an error is raised.
        with HTMLSession() as session:
            response = session.get(url)
            if response.status_code != 200:
                raise InstagramFeedError(
                    f'Could not fetch {url}: HTTP {response.status_code}'
                )
            response.html.render()
            return response.html

    def get_author_block(self):
        """Return the feed-level author object."""
        home = self.get_homepage_url()
        return {
            'name': self.username,
            'avatar': self.get_avatar_url(),
            # JSON Feed names the author link ``url``; ``home_page_url`` is
            # kept for consumers of the previous output.
            'url': home,
            'home_page_url': home,
        }

    def get_title(self):
        """Return the page title, cut after its first ``)``.

        Presumably profile titles look like ``Name (@user) ...`` and only the
        leading part is wanted -- confirm against live pages. A title without
        ``)`` is returned unchanged, and the username is used when the page
        has no ``<title>`` element.
        """
        titles = self.html.find('title')
        if not titles:
            return self.username

        text = titles[0].text
        head, paren, _ = text.partition(')')
        return head + paren if paren else text

    def get_homepage_url(self):
        """Return the profile URL for this username."""
        return f'https://instagram.com/{self.username}'

    def get_avatar_url(self):
        """Return the ``og:image`` meta content, or ``None`` if absent."""
        for meta in self.html.find('meta'):
            if meta.attrs.get('property') == 'og:image':
                return meta.attrs.get('content')
        return None

    def get_photos(self):
        """Yield one feed item per post link (``href`` starting with ``/p/``).

        Anchors without an ``href`` and post links without an ``<img>`` are
        skipped instead of raising. The ``image`` key is omitted when the
        image has no ``src``.
        """
        for link in self.html.find('a'):
            href = link.attrs.get('href', '')
            if not href.startswith('/p/'):
                continue

            images = link.find('img')
            if not images:
                continue
            img_attrs = images[0].attrs

            post_url = 'https://instagram.com' + href

            # Without a ``taken-by=`` marker the old split() returned the
            # whole URL as the author; fall back to the profile's username.
            _, marker, taken_by = href.rpartition('taken-by=')
            author = taken_by if marker and taken_by else self.username

            item = {
                'id': post_url,
                'url': post_url,
                'content_text': img_attrs.get('alt', ''),
                # NOTE(review): JSON Feed defines an item's author as an
                # object; kept as a string for backward compatibility.
                'author': author,
            }
            src = img_attrs.get('src')
            if src:
                item['image'] = src
            yield item


def main(username):
    """Fetch the feed for *username* and print it to stdout as JSON."""
    feed = InstagramFeed(username)
    print(json.dumps(feed.generate(), indent=2))


if __name__ == '__main__':
    # With no argument, sys.argv[-1] would be the script name itself.
    if len(sys.argv) < 2:
        sys.exit('usage: python fetch.py <username>')
    main(sys.argv[-1])