├── images
│   └── .keep
├── .gitignore
├── requirements.txt
├── README.md
└── scraper.py

/images/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
env/
*.jpg
*.png
*.gif
*.jpeg
*.json
.DS_Store

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.6.0
certifi==2017.11.5
chardet==3.0.4
idna==2.6
requests==2.20.0
tqdm==4.19.5
urllib3==1.24.2

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Scrape Flickr

## Installation

1. Clone the repo

2. Create a virtual environment

```
virtualenv env
source env/bin/activate
```

3. Install requirements

`pip install -r requirements.txt`

## Usage

[Get an API key from Flickr](https://www.flickr.com/services/api/misc.api_keys.html) and create a file called `credentials.json` containing the following (replace the values with your own):

```
{"KEY": "YOUR_API_KEY", "SECRET": "YOUR_API_SECRET"}
```

To scrape for a particular search term:

`python scraper.py --search "SEARCH TERM"`

To scrape for a particular group:

`python scraper.py --group "GROUP URL"`

Where GROUP URL is something like https://www.flickr.com/groups/scenery/pool/

You can also add lat/lng coordinates to restrict a search to a geographic bounding box:

`python scraper.py --search "SEARCH TERM" --bbox "minimum_longitude minimum_latitude maximum_longitude maximum_latitude"`

Large-sized images (1024px width) will be downloaded by default. You can download the original images instead by passing the flag `--original`.

Limit the number of result pages downloaded by passing `--max-pages N`, where `N` is the number of pages (500 results per page). Specify the start page with `--start-page M`.
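Before launching a long scrape, it can help to confirm that your API key works and that your bounding box actually contains photos. The sketch below issues the same `flickr.photos.search` request that `scraper.py` makes and prints how many pages of 500 results Flickr reports. It is only an illustration: the filename `check_credentials.py` and the bounding-box values are examples, not part of this repo.

```
# check_credentials.py -- hypothetical helper, not included in this repo
import json
import requests

with open('credentials.json') as f:
    creds = json.load(f)

params = {
    'method': 'flickr.photos.search',
    'api_key': creds['KEY'],
    'text': 'SEARCH TERM',
    # example bbox: minimum_longitude,minimum_latitude,maximum_longitude,maximum_latitude
    'bbox': '-74.1,40.5,-73.7,40.9',
    'per_page': '500',
    'format': 'json',
    'nojsoncallback': 1,
}

resp = requests.get('https://api.flickr.com/services/rest', params=params).json()
if 'photos' in resp:
    print(resp['photos']['pages'], 'pages of results (500 per page)')
else:
    # an invalid key or malformed request returns an error payload instead
    print(resp)
```

If the key or secret is wrong, Flickr returns a JSON error payload instead of a `photos` block, which is also what `scraper.py` prints when a request fails.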
--------------------------------------------------------------------------------
/scraper.py:
--------------------------------------------------------------------------------
from __future__ import print_function
import time
import sys
import json
import re
import os
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup

with open('credentials.json') as infile:
    creds = json.load(infile)

KEY = creds['KEY']
SECRET = creds['SECRET']


def download_file(url, local_filename):
    # stream a file to disk in 1 KB chunks
    if local_filename is None:
        local_filename = url.split('/')[-1]
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return local_filename


def get_group_id_from_url(url):
    # resolve a group URL (e.g. https://www.flickr.com/groups/scenery/pool/) to a group id
    params = {
        'method': 'flickr.urls.lookupGroup',
        'url': url,
        'api_key': KEY,
        'format': 'json',
        'nojsoncallback': 1
    }
    results = requests.get('https://api.flickr.com/services/rest', params=params).json()
    return results['group']['id']


def get_photos(qs, qg, page=1, original=False, bbox=None):
    # fetch one page (up to 500 results) of photo metadata for a search term (qs) or group id (qg)
    params = {
        'content_type': '7',
        'per_page': '500',
        'media': 'photos',
        'format': 'json',
        'advanced': 1,
        'nojsoncallback': 1,
        'extras': 'media,realname,%s,o_dims,geo,tags,machine_tags,date_taken' % ('url_o' if original else 'url_l'),
        'page': page,
        'api_key': KEY
    }

    if qs is not None:
        params['method'] = 'flickr.photos.search'
        params['text'] = qs
    elif qg is not None:
        params['method'] = 'flickr.groups.pools.getPhotos'
        params['group_id'] = qg

    # bbox should be: minimum_longitude, minimum_latitude, maximum_longitude, maximum_latitude
    if bbox is not None and len(bbox) == 4:
        params['bbox'] = ','.join(bbox)

    results = requests.get('https://api.flickr.com/services/rest', params=params).json()
    if "photos" not in results:
        print(results)
        return None
    return results["photos"]


def search(qs, qg, bbox=None, original=False, max_pages=None, start_page=1, output_dir='images'):
    # create a folder for the query if it does not exist
    foldername = os.path.join(output_dir, re.sub(r'[\W]', '_', qs if qs is not None else "group_%s" % qg))
    if bbox is not None:
        foldername += '_'.join(bbox)

    if not os.path.exists(foldername):
        os.makedirs(foldername)

    jsonfilename = os.path.join(foldername, 'results' + str(start_page) + '.json')

    if not os.path.exists(jsonfilename):

        # fetch metadata page by page and save the results as a json file
        photos = []
        current_page = start_page

        results = get_photos(qs, qg, page=current_page, original=original, bbox=bbox)
        if results is None:
            return

        total_pages = results['pages']
        if max_pages is not None and total_pages > start_page + max_pages:
            total_pages = start_page + max_pages

        photos += results['photo']

        while current_page < total_pages:
            print('downloading metadata, page {} of {}'.format(current_page, total_pages))
            current_page += 1
            results = get_photos(qs, qg, page=current_page, original=original, bbox=bbox)
            if results is None:
                break
            photos += results['photo']
            time.sleep(0.5)

        with open(jsonfilename, 'w') as outfile:
            json.dump(photos, outfile)

    else:
        # reuse previously downloaded metadata
        with open(jsonfilename, 'r') as infile:
            photos = json.load(infile)

    # download images
    print('Downloading images')
    for photo in tqdm(photos):
        try:
            url = photo.get('url_o' if original else 'url_l')
            extension = url.split('.')[-1]
            localname = os.path.join(foldername, '{}.{}'.format(photo['id'], extension))
            if not os.path.exists(localname):
                download_file(url, localname)
        except Exception:
            # skip photos with no URL at the requested size or that fail to download
            continue


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Download images from Flickr')
    parser.add_argument('--search', '-s', dest='q_search', default=None, required=False, help='Search term')
    parser.add_argument('--group', '-g', dest='q_group', default=None, required=False, help='Group url, e.g. https://www.flickr.com/groups/scenery/')
    parser.add_argument('--original', '-o', dest='original', action='store_true', default=False, required=False, help='Download original sized photos if set, large (1024px) otherwise')
    parser.add_argument('--output_dir', '-t', dest='output_dir', default='images', required=False, help='Root directory to download to')
    parser.add_argument('--max-pages', '-m', dest='max_pages', required=False, help='Max pages (default none)')
    parser.add_argument('--start-page', '-st', dest='start_page', required=False, default=1, help='Start page (default 1)')
    parser.add_argument('--bbox', '-b', dest='bbox', required=False, help='Bounding box to search in, separated by spaces like so: minimum_longitude minimum_latitude maximum_longitude maximum_latitude')
    args = parser.parse_args()

    qs = args.q_search
    qg = args.q_group
    original = args.original
    output_dir = args.output_dir

    if qs is None and qg is None:
        sys.exit('Must specify a search term or group id')

    try:
        bbox = args.bbox.split(' ')
    except AttributeError:
        bbox = None

    if bbox and len(bbox) != 4:
        bbox = None

    if qg is not None:
        qg = get_group_id_from_url(qg)

    print('Searching for {}'.format(qs if qs is not None else "group %s" % qg))
    if bbox:
        print('Within', bbox)

    max_pages = None
    if args.max_pages:
        max_pages = int(args.max_pages)

    start_page = int(args.start_page)

    search(qs, qg, bbox, original, max_pages, start_page, output_dir)

--------------------------------------------------------------------------------