├── requirements.txt ├── README.md ├── LICENSE └── instagram_scraper.py /requirements.txt: -------------------------------------------------------------------------------- 1 | requests-html 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Instagram-Scraper 2 | 3 | ## Installation 4 | 5 | This scraper uses `requests_html` which requires python 3.6 or higher runtime. 6 | 7 | ```bash 8 | pip install -r requirements.txt 9 | ``` 10 | 11 | ## Usage 12 | 13 | ### As library 14 | 15 | ```python 16 | from instagram_scraper import scrape_instagram 17 | 18 | for url, caption, hashtags, mentions in scrape_instagram(['quotes', 'meet'], 5): 19 | print(url, caption, hashtags, mentions) 20 | ``` 21 | 22 | ### As script 23 | 24 | ```bash 25 | python3 instagram_scraper.py --tags software bugs --count 50 26 | ``` 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Meet Mangukiya 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /instagram_scraper.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import os 4 | import re 5 | from typing import List 6 | 7 | import requests 8 | from requests_html import HTMLSession 9 | 10 | # Source: http://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/ 11 | REGEXES = { 12 | 'hashtag': re.compile('(?:#)([A-Za-z0-9_](?:(?:[A-Za-z0-9_]|(?:\.(?!\.))){0,28}(?:[A-Za-z0-9_]))?)'), 13 | 'username': re.compile('(?:@)([A-Za-z0-9_](?:(?:[A-Za-z0-9_]|(?:\.(?!\.))){0,28}(?:[A-Za-z0-9_]))?)'), 14 | } 15 | 16 | 17 | def scrape_instagram_tag(tag: str, total_count: int=50, existing: set=None): 18 | """ 19 | Scrape and yield recently tagged instagram photos. 20 | """ 21 | if existing is None: 22 | existing = set() 23 | 24 | url = f'https://www.instagram.com/explore/tags/{tag}' 25 | session = HTMLSession() 26 | req = session.get(url) 27 | 28 | imgs = set(existing) 29 | count = 0 30 | page = 0 31 | 32 | while count <= total_count: 33 | req.html.render(scrolldown=page) 34 | images = req.html.xpath('//img[@alt]') 35 | page += 1 36 | for image in images: 37 | if count > total_count: 38 | break 39 | try: 40 | url, caption = image.attrs['src'], image.attrs['alt'] 41 | except: 42 | pass 43 | else: 44 | if url in imgs: 45 | continue 46 | imgs.add(url) 47 | hashtags = set(REGEXES['hashtag'].findall(caption)) 48 | mentions = set(REGEXES['username'].findall(caption)) 49 | count += 1 50 | yield url, caption, hashtags, mentions 51 | 52 | 53 | def scrape_instagram(tags: List[str], total_count: int=50, existing: set=None): 54 | """ 55 | :param tags: 56 | List of tags that need to be scraped. 57 | :param total_count: 58 | Total number of images to be scraped. 59 | :param existing: 60 | Set of URLs to escape. 61 | """ 62 | if existing is None: 63 | existing = set() 64 | 65 | for tag in tags: 66 | yield from scrape_instagram_tag(tag, total_count, existing) 67 | 68 | 69 | def main(tags, total_count, should_continue): 70 | def _single_tag_processing(tag, total_count, existing_links, start): 71 | os.makedirs(f'data/{tag}', exist_ok=True) 72 | with open(f'data/{tag}/data.csv', 'a' if existing_links else 'w', newline='', encoding='utf-8') as csvfile: 73 | writer = csv.writer(csvfile, delimiter=',') 74 | for count, (url, caption, hashtags, mentions) in enumerate(scrape_instagram_tag( 75 | tag, total_count, existing_links), start): 76 | 77 | try: 78 | req = requests.get(url) 79 | with open(f'data/{tag}/{count}.jpg', 'wb') as img: 80 | img.write(req.content) 81 | except: 82 | print(f'An error occured while downloading {url}') 83 | else: 84 | writer.writerow([ 85 | f'{count}.jpg', 86 | url, 87 | caption.replace('\n', '\\n'), 88 | ', '.join(hashtags), 89 | ', '.join(mentions) 90 | ]) 91 | print(f'[{tag}] downloaded {url} as {count}.jpg in data/{tag}') 92 | 93 | for tag in tags: 94 | existing_links = set() 95 | start = 0 96 | if os.path.exists(f'data/{tag}/data.csv') and should_continue: 97 | with open(f'data/{tag}/data.csv', newline='', encoding='utf-8') as csvfile: 98 | reader = csv.reader(csvfile) 99 | for i, row in enumerate(reader): 100 | existing_links.add(row[1]) 101 | start = i + 1 102 | _single_tag_processing(tag, total_count, existing_links, start) 103 | 104 | 105 | if __name__ == '__main__': 106 | parser = argparse.ArgumentParser() 107 | parser.add_argument('--tags', '-t', nargs='+', 108 | help='Tags to scrape images from') 109 | parser.add_argument('--count', '-c', type=int, default=50, 110 | help='Total number of images to scrape for each given ' 111 | 'tag.') 112 | parser.add_argument('--continue', '-C', 113 | default=False, action='store_true', dest='cont', 114 | help='See existing data, and do not parse those again, ' 115 | 'and append to the data file, instead of a rewrite') 116 | args = parser.parse_args() 117 | assert args.tags, "Enter tags to scrape! Use --tags option, see help." 118 | assert args.count, "Enter total number of images to scrape using --count option, see help." 119 | main(args.tags, args.count, args.cont) 120 | --------------------------------------------------------------------------------