├── images
│   └── .keep
├── .gitignore
├── requirements.txt
├── README.md
└── scraper.py

/images/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
env/
*.jpg
*.png
*.gif
*.jpeg
*.json
.DS_Store

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.6.0
certifi==2017.11.5
chardet==3.0.4
idna==2.6
requests==2.20.0
tqdm==4.19.5
urllib3==1.24.2

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Scrape Flickr

## Installation

1. Clone the repo

2. Create a virtual environment

```
virtualenv env
source env/bin/activate
```

3. Install requirements

`pip install -r requirements.txt`

## Usage

[Get an API key from Flickr](https://www.flickr.com/services/api/misc.api_keys.html) and create a file called `credentials.json` containing the following (replace the values with your own):

```
{"KEY": "YOUR_API_KEY", "SECRET": "YOUR_API_SECRET"}
```

To scrape for a particular search term:

`python scraper.py --search "SEARCH TERM"`

To scrape for a particular group:

`python scraper.py --group "GROUP URL"`

Where GROUP URL is something like https://www.flickr.com/groups/scenery/pool/

You can also add lat/lng coordinates to restrict a search to a geographic bounding box:

`python scraper.py --search "SEARCH TERM" --bbox "minimum_longitude minimum_latitude maximum_longitude maximum_latitude"`

Large-sized images (1024px width) will be downloaded by default. You can download the original images instead by passing the flag `--original`.

Limit the number of result pages downloaded by passing `--max-pages N`, where `N` is the number of pages (500 results per page). Specify the start page with `--start-page M`.
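Before launching a long scrape, it can help to confirm that your API key works and that your bounding box actually contains photos. The sketch below issues the same `flickr.photos.search` request that `scraper.py` makes and prints how many pages of 500 results Flickr reports. It is only an illustration: the filename `check_credentials.py` and the bounding-box values are examples, not part of this repo.

```
# check_credentials.py -- hypothetical helper, not included in this repo
import json
import requests

with open('credentials.json') as f:
    creds = json.load(f)

params = {
    'method': 'flickr.photos.search',
    'api_key': creds['KEY'],
    'text': 'SEARCH TERM',
    # example bbox: minimum_longitude,minimum_latitude,maximum_longitude,maximum_latitude
    'bbox': '-74.1,40.5,-73.7,40.9',
    'per_page': '500',
    'format': 'json',
    'nojsoncallback': 1,
}

resp = requests.get('https://api.flickr.com/services/rest', params=params).json()
if 'photos' in resp:
    print(resp['photos']['pages'], 'pages of results (500 per page)')
else:
    # an invalid key or malformed request returns an error payload instead
    print(resp)
```

If the key or secret is wrong, Flickr returns a JSON error payload instead of a `photos` block, which is also what `scraper.py` prints when a request fails.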
--------------------------------------------------------------------------------
/scraper.py:
--------------------------------------------------------------------------------
from __future__ import print_function
import time
import sys
import json
import re
import os
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup

with open('credentials.json') as infile:
    creds = json.load(infile)

KEY = creds['KEY']
SECRET = creds['SECRET']


def download_file(url, local_filename):
    # stream a file to disk in 1 KB chunks
    if local_filename is None:
        local_filename = url.split('/')[-1]
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return local_filename


def get_group_id_from_url(url):
    # resolve a group URL (e.g. https://www.flickr.com/groups/scenery/pool/) to a group id
    params = {
        'method': 'flickr.urls.lookupGroup',
        'url': url,
        'api_key': KEY,
        'format': 'json',
        'nojsoncallback': 1
    }
    results = requests.get('https://api.flickr.com/services/rest', params=params).json()
    return results['group']['id']


def get_photos(qs, qg, page=1, original=False, bbox=None):
    # fetch one page (up to 500 results) of photo metadata for a search term (qs) or group id (qg)
    params = {
        'content_type': '7',
        'per_page': '500',
        'media': 'photos',
        'format': 'json',
        'advanced': 1,
        'nojsoncallback': 1,
        'extras': 'media,realname,%s,o_dims,geo,tags,machine_tags,date_taken' % ('url_o' if original else 'url_l'),
        'page': page,
        'api_key': KEY
    }

    if qs is not None:
        params['method'] = 'flickr.photos.search'
        params['text'] = qs
    elif qg is not None:
        params['method'] = 'flickr.groups.pools.getPhotos'
        params['group_id'] = qg

    # bbox should be: minimum_longitude, minimum_latitude, maximum_longitude, maximum_latitude
    if bbox is not None and len(bbox) == 4:
        params['bbox'] = ','.join(bbox)

    results = requests.get('https://api.flickr.com/services/rest', params=params).json()
    if "photos" not in results:
        print(results)
        return None
    return results["photos"]


def search(qs, qg, bbox=None, original=False, max_pages=None, start_page=1, output_dir='images'):
    # create a folder for the query if it does not exist
    foldername = os.path.join(output_dir, re.sub(r'[\W]', '_', qs if qs is not None else "group_%s" % qg))
    if bbox is not None:
        foldername += '_'.join(bbox)

    if not os.path.exists(foldername):
        os.makedirs(foldername)

    jsonfilename = os.path.join(foldername, 'results' + str(start_page) + '.json')

    if not os.path.exists(jsonfilename):

        # fetch metadata page by page and save the results as a json file
        photos = []
        current_page = start_page

        results = get_photos(qs, qg, page=current_page, original=original, bbox=bbox)
        if results is None:
            return

        total_pages = results['pages']
        if max_pages is not None and total_pages > start_page + max_pages:
            total_pages = start_page + max_pages

        photos += results['photo']

        while current_page < total_pages:
            print('downloading metadata, page {} of {}'.format(current_page, total_pages))
            current_page += 1
            results = get_photos(qs, qg, page=current_page, original=original, bbox=bbox)
            if results is None:
                break
            photos += results['photo']
            time.sleep(0.5)

        with open(jsonfilename, 'w') as outfile:
            json.dump(photos, outfile)

    else:
        # reuse previously downloaded metadata
        with open(jsonfilename, 'r') as infile:
            photos = json.load(infile)

    # download images
    print('Downloading images')
    for photo in tqdm(photos):
        try:
            url = photo.get('url_o' if original else 'url_l')
            extension = url.split('.')[-1]
            localname = os.path.join(foldername, '{}.{}'.format(photo['id'], extension))
            if not os.path.exists(localname):
                download_file(url, localname)
        except Exception:
            # skip photos with no URL at the requested size or that fail to download
            continue


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Download images from Flickr')
    parser.add_argument('--search', '-s', dest='q_search', default=None, required=False, help='Search term')
    parser.add_argument('--group', '-g', dest='q_group', default=None, required=False, help='Group url, e.g. https://www.flickr.com/groups/scenery/')
    parser.add_argument('--original', '-o', dest='original', action='store_true', default=False, required=False, help='Download original sized photos if set, large (1024px) otherwise')
    parser.add_argument('--output_dir', '-t', dest='output_dir', default='images', required=False, help='Root directory to download to')
    parser.add_argument('--max-pages', '-m', dest='max_pages', required=False, help='Max pages (default none)')
    parser.add_argument('--start-page', '-st', dest='start_page', required=False, default=1, help='Start page (default 1)')
    parser.add_argument('--bbox', '-b', dest='bbox', required=False, help='Bounding box to search in, separated by spaces like so: minimum_longitude minimum_latitude maximum_longitude maximum_latitude')
    args = parser.parse_args()

    qs = args.q_search
    qg = args.q_group
    original = args.original
    output_dir = args.output_dir

    if qs is None and qg is None:
        sys.exit('Must specify a search term or group id')

    try:
        bbox = args.bbox.split(' ')
    except AttributeError:
        bbox = None

    if bbox and len(bbox) != 4:
        bbox = None

    if qg is not None:
        qg = get_group_id_from_url(qg)

    print('Searching for {}'.format(qs if qs is not None else "group %s" % qg))
    if bbox:
        print('Within', bbox)

    max_pages = None
    if args.max_pages:
        max_pages = int(args.max_pages)

    start_page = int(args.start_page)

    search(qs, qg, bbox, original, max_pages, start_page, output_dir)

--------------------------------------------------------------------------------