├── .gitignore
├── README.md
└── scrape.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.swp

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# google image search scraper

## install

```sh
pip install --upgrade google-api-python-client
pip install requests
```

## use

`python scrape.py --query="cute pandas" --n=100`

Downloads the first 100 Google image search results for "cute pandas" into a directory `cute pandas/[time of query]/`. That directory will contain a file `query-results.json` with the result metadata, along with the 100 cute panda images, each named by result index + original filename.

You'll also need a file in this directory called `keys.json` with the format

```json
{
  "developerKey": "your google developer key",
  "cx": "your google custom search engine id - cse.google.com/cse/manage/all"
}
```

--------------------------------------------------------------------------------
/scrape.py:
--------------------------------------------------------------------------------
# coding: utf-8
import argparse
import json
from time import sleep

from googleapiclient.discovery import build

parser = argparse.ArgumentParser(description='Download some images with a query')
parser.add_argument('--query', dest='query', required=True,
                    help='Query to search.')
parser.add_argument('--n', dest='n', type=int, default=100,
                    help='Number of items to return.')

args = parser.parse_args()

# API credentials; see the README for the expected keys.json format
with open('keys.json') as f:
    keys = json.load(f)

# build the Custom Search client once and reuse it across pages
service = build("customsearch", "v1", developerKey=keys['developerKey'])

def image_search(query, start=1):
    """Fetch one page of image results (the API returns at most 10 per request)."""
    return service.cse().list(
        q=query,
        filter="1",    # filter duplicate results
        start=start,   # 1-indexed offset of the first result to return
        cx=keys['cx'],
        searchType='image',
    ).execute()

def images_from(res):
    """Pull the image URLs out of one page of results."""
    return [item['link'] for item in res['items']]

def query_images(q, n=100):
    """Page through the API until n results have been collected."""
    resList = []
    queried = 0
    while queried < n:
        res = image_search(q, start=queried + 1)
        if not res.get('items'):
            break  # no more results available
        resList.append(res)
        queried += len(res['items'])
        sleep(1)  # pause between pages to stay under the API rate limit
    return resList
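
# NOTE: the dump of scrape.py is truncated partway through query_images, so the
# download step itself is missing. What follows is a minimal sketch of that
# step, reconstructed from the README's description (a "<query>/[time of query]/"
# directory containing query-results.json plus the images, each named by result
# index + filename). The helper name `download_images`, the timestamp format,
# the 1-second-per-page pacing above, and the filename handling are assumptions
# for illustration, not the repo's actual code.
import os
import time

import requests

def download_images(query, n):
    # one timestamped directory per run, so repeated queries never overwrite
    out_dir = os.path.join(query, time.strftime('%Y-%m-%dT%H-%M-%S'))
    os.makedirs(out_dir)

    results = query_images(query, n)

    # keep the raw API responses alongside the downloaded images
    with open(os.path.join(out_dir, 'query-results.json'), 'w') as f:
        json.dump(results, f, indent=2)

    urls = [url for res in results for url in images_from(res)]
    for i, url in enumerate(urls[:n]):
        # name each file "<result index>-<original filename>"
        name = url.split('/')[-1].split('?')[0] or 'image'
        try:
            r = requests.get(url, timeout=10)
            r.raise_for_status()
        except requests.RequestException as e:
            print('skipping %s: %s' % (url, e))
            continue
        with open(os.path.join(out_dir, '%d-%s' % (i, name)), 'wb') as f:
            f.write(r.content)

download_images(args.query, args.n)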