├── requirements.txt
├── README.md
├── LICENSE
└── instagram_scraper.py


/requirements.txt:
--------------------------------------------------------------------------------
1 | requests-html
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Instagram-Scraper
 2 | 
 3 | ## Installation
 4 | 
 5 | This scraper uses `requests_html` which requires python 3.6 or higher runtime.
 6 | 
 7 | ```bash
 8 | pip install -r requirements.txt
 9 | ```
10 | 
11 | ## Usage
12 | 
13 | ### As library
14 | 
15 | ```python
16 | from instagram_scraper import scrape_instagram
17 | 
18 | for url, caption, hashtags, mentions in scrape_instagram(['quotes', 'meet'], 5):
19 |     print(url, caption, hashtags, mentions)
20 | ```
21 | 
22 | ### As script
23 | 
24 | ```bash
25 | python3 instagram_scraper.py --tags software bugs --count 50
26 | ```
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Meet Mangukiya
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/instagram_scraper.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import csv
  3 | import os
  4 | import re
  5 | from typing import List
  6 | 
  7 | import requests
  8 | from requests_html import HTMLSession
  9 | 
 10 | # Source: http://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/
 11 | REGEXES = {
 12 |     'hashtag': re.compile('(?:#)([A-Za-z0-9_](?:(?:[A-Za-z0-9_]|(?:\.(?!\.))){0,28}(?:[A-Za-z0-9_]))?)'),
 13 |     'username': re.compile('(?:@)([A-Za-z0-9_](?:(?:[A-Za-z0-9_]|(?:\.(?!\.))){0,28}(?:[A-Za-z0-9_]))?)'),
 14 | }
 15 | 
 16 | 
 17 | def scrape_instagram_tag(tag: str, total_count: int=50, existing: set=None):
 18 |     """
 19 |     Scrape and yield recently tagged instagram photos.
 20 |     """
 21 |     if existing is None:
 22 |         existing = set()
 23 | 
 24 |     url = f'https://www.instagram.com/explore/tags/{tag}'
 25 |     session = HTMLSession()
 26 |     req = session.get(url)
 27 | 
 28 |     imgs = set(existing)
 29 |     count = 0
 30 |     page = 0
 31 | 
 32 |     while count <= total_count:
 33 |         req.html.render(scrolldown=page)
 34 |         images = req.html.xpath('//img[@alt]')
 35 |         page += 1
 36 |         for image in images:
 37 |             if count > total_count:
 38 |                 break
 39 |             try:
 40 |                 url, caption = image.attrs['src'], image.attrs['alt']
 41 |             except:
 42 |                 pass
 43 |             else:
 44 |                 if url in imgs:
 45 |                     continue
 46 |                 imgs.add(url)
 47 |                 hashtags = set(REGEXES['hashtag'].findall(caption))
 48 |                 mentions = set(REGEXES['username'].findall(caption))
 49 |                 count += 1
 50 |                 yield url, caption, hashtags, mentions
 51 | 
 52 | 
 53 | def scrape_instagram(tags: List[str], total_count: int=50, existing: set=None):
 54 |     """
 55 |     :param tags:
 56 |         List of tags that need to be scraped.
 57 |     :param total_count:
 58 |         Total number of images to be scraped.
 59 |     :param existing:
 60 |         Set of URLs to escape.
 61 |     """
 62 |     if existing is None:
 63 |         existing = set()
 64 | 
 65 |     for tag in tags:
 66 |         yield from scrape_instagram_tag(tag, total_count, existing)
 67 | 
 68 | 
 69 | def main(tags, total_count, should_continue):
 70 |     def _single_tag_processing(tag, total_count, existing_links, start):
 71 |         os.makedirs(f'data/{tag}', exist_ok=True)
 72 |         with open(f'data/{tag}/data.csv', 'a' if existing_links else 'w', newline='', encoding='utf-8') as csvfile:
 73 |             writer = csv.writer(csvfile, delimiter=',')
 74 |             for count, (url, caption, hashtags, mentions) in enumerate(scrape_instagram_tag(
 75 |                 tag, total_count, existing_links), start):
 76 | 
 77 |                 try:
 78 |                     req = requests.get(url)
 79 |                     with open(f'data/{tag}/{count}.jpg', 'wb') as img:
 80 |                         img.write(req.content)
 81 |                 except:
 82 |                     print(f'An error occured while downloading {url}')
 83 |                 else:
 84 |                     writer.writerow([
 85 |                         f'{count}.jpg',
 86 |                         url,
 87 |                         caption.replace('\n', '\\n'),
 88 |                         ', '.join(hashtags),
 89 |                         ', '.join(mentions)
 90 |                     ])
 91 |                     print(f'[{tag}] downloaded {url} as {count}.jpg in data/{tag}')
 92 | 
 93 |     for tag in tags:
 94 |         existing_links = set()
 95 |         start = 0
 96 |         if os.path.exists(f'data/{tag}/data.csv') and should_continue:
 97 |             with open(f'data/{tag}/data.csv', newline='', encoding='utf-8') as csvfile:
 98 |                 reader = csv.reader(csvfile)
 99 |                 for i, row in enumerate(reader):
100 |                     existing_links.add(row[1])
101 |                 start = i + 1
102 |         _single_tag_processing(tag, total_count, existing_links, start)
103 | 
104 | 
105 | if __name__ == '__main__':
106 |     parser = argparse.ArgumentParser()
107 |     parser.add_argument('--tags', '-t', nargs='+',
108 |                         help='Tags to scrape images from')
109 |     parser.add_argument('--count', '-c', type=int, default=50,
110 |                         help='Total number of images to scrape for each given '
111 |                              'tag.')
112 |     parser.add_argument('--continue', '-C',
113 |                         default=False, action='store_true', dest='cont',
114 |                         help='See existing data, and do not parse those again, '
115 |                              'and append to the data file, instead of a rewrite')
116 |     args = parser.parse_args()
117 |     assert args.tags, "Enter tags to scrape! Use --tags option, see help."
118 |     assert args.count, "Enter total number of images to scrape using --count option, see help."
119 |     main(args.tags, args.count, args.cont)
120 | 


--------------------------------------------------------------------------------