├── .gitignore
├── README.md
└── scrape.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.swp

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# google image search scraper

## install

```sh
pip install --upgrade google-api-python-client
pip install requests
```

## use

`python scrape.py --query="cute pandas" --n=100`

Downloads the first 100 Google image search results for "cute pandas" into a directory `cute pandas/[time of query]/`. That directory will contain a file `query-results.json` with the result metadata, along with the 100 cute panda images, each named by result index + original filename.

You'll also need a file in this directory called `keys.json` with the format

```json
{
  "developerKey": "your google developer key",
  "cx": "your google custom search engine id - cse.google.com/cse/manage/all"
}
```

--------------------------------------------------------------------------------
/scrape.py:
--------------------------------------------------------------------------------
# coding: utf-8
import argparse
import json
from time import sleep

from googleapiclient.discovery import build

parser = argparse.ArgumentParser(description='Download some images with a query')
parser.add_argument('--query', dest='query', required=True,
                    help='Query to search.')
parser.add_argument('--n', dest='n', type=int, default=100,
                    help='Number of items to return.')

args = parser.parse_args()

# API credentials; see the README for the expected keys.json format
with open('keys.json') as f:
    keys = json.load(f)

# build the Custom Search client once and reuse it across pages
service = build("customsearch", "v1", developerKey=keys['developerKey'])

def image_search(query, start=1):
    """Fetch one page of image results (the API returns at most 10 per request)."""
    return service.cse().list(
        q=query,
        filter="1",    # filter duplicate results
        start=start,   # 1-indexed offset of the first result to return
        cx=keys['cx'],
        searchType='image',
    ).execute()

def images_from(res):
    """Pull the image URLs out of one page of results."""
    return [item['link'] for item in res['items']]

def query_images(q, n=100):
    """Page through the API until n results have been collected."""
    resList = []
    queried = 0
    while queried < n:
        res = image_search(q, start=queried + 1)
        if not res.get('items'):
            break  # no more results available
        resList.append(res)
        queried += len(res['items'])
        sleep(1)  # pause between pages to stay under the API rate limit
    return resList
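
# NOTE: the dump of scrape.py is truncated partway through query_images, so the
# download step itself is missing. What follows is a minimal sketch of that
# step, reconstructed from the README's description (a "<query>/[time of query]/"
# directory containing query-results.json plus the images, each named by result
# index + filename). The helper name `download_images`, the timestamp format,
# the 1-second-per-page pacing above, and the filename handling are assumptions
# for illustration, not the repo's actual code.
import os
import time

import requests

def download_images(query, n):
    # one timestamped directory per run, so repeated queries never overwrite
    out_dir = os.path.join(query, time.strftime('%Y-%m-%dT%H-%M-%S'))
    os.makedirs(out_dir)

    results = query_images(query, n)

    # keep the raw API responses alongside the downloaded images
    with open(os.path.join(out_dir, 'query-results.json'), 'w') as f:
        json.dump(results, f, indent=2)

    urls = [url for res in results for url in images_from(res)]
    for i, url in enumerate(urls[:n]):
        # name each file "<result index>-<original filename>"
        name = url.split('/')[-1].split('?')[0] or 'image'
        try:
            r = requests.get(url, timeout=10)
            r.raise_for_status()
        except requests.RequestException as e:
            print('skipping %s: %s' % (url, e))
            continue
        with open(os.path.join(out_dir, '%d-%s' % (i, name)), 'wb') as f:
            f.write(r.content)

download_images(args.query, args.n)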