├── .gitignore
├── mo-downloader.py
├── viper-downloader.py
├── README.md
├── viper-scraper.py
├── combiner.py
└── mo-scraper.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
data/*

--------------------------------------------------------------------------------
/mo-downloader.py:
--------------------------------------------------------------------------------
# downloads every image in the moillusions_data.json file
# and stores it in the 'data' folder (which is in .gitignore),
# with each entry's index in the json file as the filename

import json
import requests
import os
import shutil
import multiprocessing

if not os.path.exists('data/'):
    os.makedirs('data/')

def download_image(image_dict):
    index = image_dict['index']
    url = image_dict['img_src']

    # fix protocol-relative urls
    if url[0] == '/':
        url = 'https:' + url

    # name files with a left-padded 5 digit number plus the extension
    extension = url.split('.')[-1]
    # this happens when no extension is supplied in the url
    if len(extension) > 4:
        extension = ''
    filename = str(index).zfill(5) + '.' + extension

    # skip if it is already downloaded
    if os.path.isfile('data/' + filename):
        # print('have ', filename)
        return

    try:
        response = requests.get(url, stream=True)
    except requests.exceptions.RequestException:
        print('failed', image_dict)
        return

    print(filename)
    with open('data/' + filename, 'wb') as out_file:
        shutil.copyfileobj(response.raw, out_file)
    del response

# load the json file
moillusions = json.load(open('moillusions_data.json'))
# add 'index' to each entry
for i, data in enumerate(moillusions):
    data['index'] = i

p = multiprocessing.Pool(10)
p.map(download_image, moillusions)
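# A minimal sketch, not part of the original script: because filenames are the
# zero-padded index into moillusions_data.json, a downloaded file can be mapped
# back to its metadata. Defined only for illustration, never called here.
def lookup_metadata(filename, entries=moillusions):
    # e.g. 'data/00042.jpg' -> entry 42 of moillusions_data.json
    index = int(os.path.splitext(os.path.basename(filename))[0])
    return entries[index]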
--------------------------------------------------------------------------------
/viper-downloader.py:
--------------------------------------------------------------------------------
# downloads every image in the viperlib_data.json file
# and stores it in the 'data' folder (which is in .gitignore),
# with each entry's index in the json file as the filename

import json
import requests
import os
import shutil
import multiprocessing

if not os.path.exists('data/'):
    os.makedirs('data/')

def download_image(image_dict):
    index = image_dict['index']
    url = image_dict['url']

    # fix site-relative urls by prepending the site root
    if url[0] == '/':
        url = 'http://viperlib.york.ac.uk' + url

    # drop the query string
    url = url.split('?')[0]

    # name files with a left-padded 5 digit number plus the extension
    extension = url.split('.')[-1]

    # this happens when no extension is supplied in the url
    if len(extension) > 4:
        extension = ''
    filename = str(index).zfill(5) + '.' + extension

    # skip if it is already downloaded
    if os.path.isfile('data/' + filename):
        # print('have ', filename)
        return

    try:
        response = requests.get(url, stream=True)
    except requests.exceptions.RequestException:
        print('failed', image_dict)
        return

    print(filename)
    with open('data/' + filename, 'wb') as out_file:
        shutil.copyfileobj(response.raw, out_file)
    del response

# load the json file
viper = json.load(open('viperlib_data.json'))
# add 'index' to each entry
for i, data in enumerate(viper):
    data['index'] = i

p = multiprocessing.Pool(16)
p.map(download_image, viper)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# optical-illusion-dataset
JSON files with image links and metadata for optical illusions. Stay tuned as the collection grows and the images are released as a gzip archive from a university host.

## Files:
* mo-scraper.py: final version of the web scraping script for moillusions
* moillusions_data.json: JSON of image URLs and metadata for moillusions
* mo-downloader.py: downloads the images linked in the JSON above into data/

* viper-scraper.py: viperlib scraper script
* viperlib_data.json: viperlib image URLs and metadata
* viper-downloader.py: downloads the images linked in the JSON above into data/

## Sources:

6436 image links and metadata from https://www.moillusions.com/

1454 image links and metadata from http://viperlib.york.ac.uk/

## Download JPEGs:

The full image download can be found here (**6725** images):


A greatly reduced dataset of only images that have eye-bending patterns is here (**569** images, hand picked):


## My dataset build process:
If you want to replicate it, pick up more recent images, or if some weird bug appears and documentation becomes important:
- run a scraper to make the JSON file of all images
- run a downloader to download the images to data/
- make sure `rename`, `mogrify`, and `file` are installed on your system
- run `bash cleanconvert.sh` to convert everything to jpg and verify all images using `file` (a rough sketch of this step is at the end of this README)
- investigate any lines it prints out, but I doubt you'll get anything unless you're unlucky
- check that data/ contains lots of #####.jpg files and nothing else
- rename data/ to something like viper-data/ etc.

## Next steps
- come up with a scheme to combine the website-provided categories (some effort towards this in `combiner.py`)
- train a GAN on the filtered images
- learn a latent space for the images and do dimensionality reduction of some sort on it
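## Rough sketch of the cleanconvert step:

`cleanconvert.sh` itself isn't committed here, so the snippet below is only a guess at what that step does, pieced together from the bullets above: check every file in data/ with `file`, convert anything that isn't already a JPEG with ImageMagick's `mogrify`, and print anything suspicious. The structure and flags are assumptions, not the actual script.

```python
import os
import subprocess

for name in sorted(os.listdir('data/')):
    path = os.path.join('data', name)
    # ask `file` for the real mime type, ignoring the extension
    mime = subprocess.run(['file', '--brief', '--mime-type', path],
                          capture_output=True, text=True).stdout.strip()
    if not mime.startswith('image/'):
        print('not an image:', path, mime)   # these are the lines worth investigating
    elif mime != 'image/jpeg':
        # mogrify -format jpg writes a .jpg copy next to the original
        result = subprocess.run(['mogrify', '-format', 'jpg', path])
        if result.returncode == 0:
            os.remove(path)
        else:
            print('conversion failed:', path)
    elif not name.endswith('.jpg'):
        # already a JPEG, just normalise the extension
        os.rename(path, os.path.splitext(path)[0] + '.jpg')
```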
--------------------------------------------------------------------------------
/viper-scraper.py:
--------------------------------------------------------------------------------
# web scraper to scrape a few sections of viperlib
# http://viperlib.york.ac.uk/
# for thumbnail images and store the URLs in JSON format. Mmmm JSON....

from bs4 import BeautifulSoup

import requests
import time
import json


start_pages = ['http://viperlib.york.ac.uk/areas/16-lightness-brightness/contributions?page=*&sort=by_title',
               'http://viperlib.york.ac.uk/areas/12-colour/contributions?page=*&sort=by_title',
               'http://viperlib.york.ac.uk/areas/18-depth/contributions?page=*&sort=by_title',
               'http://viperlib.york.ac.uk/areas/25-faces/contributions?page=*&sort=by_title',
               'http://viperlib.york.ac.uk/areas/20-illusions/contributions?page=*&sort=by_title',
               'http://viperlib.york.ac.uk/areas/13-motion/contributions?page=*&sort=by_title',
               'http://viperlib.york.ac.uk/areas/14-spatial-vision/contributions?page=*&sort=by_title',]

def scrape_images_and_text(url, category):
    print(url)
    # obtain 'soup' which lets us find parts in the page
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data, 'lxml')

    # for each div class=contribution, collect data,
    # package it into a dict, and yield it
    for sample in soup.find_all('div', {'class': 'contribution'}):
        image = sample.find('img')
        image_dict = {'url': image.get('src'),
                      'title': image.get('alt'),
                      'page_url': sample.find('a').get('href'),
                      'category': category,
                      'description': tuple(sample.find_all('p'))[-1].text}
        yield image_dict
        print('\t', image_dict['title'])
        print('\t\t', image_dict)

def write_to_json(data):
    with open('data.json', 'w') as json_file:
        json_file.write(json.dumps(data))

images = list()
for base_url in start_pages:
    category = base_url.split('/')[4]
    i = 1
    # keep requesting pages until one comes back with no contributions
    while True:
        url = base_url.replace('*', str(i))
        start_len = len(images)
        images += scrape_images_and_text(url, category)
        if (start_len == len(images)):
            break
        i += 1
write_to_json(images)
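# Note (an assumption, not stated in the repo): write_to_json saves to
# 'data.json', but viper-downloader.py and the README expect 'viperlib_data.json',
# so the output presumably gets renamed before the download step, e.g.
#     os.replace('data.json', 'viperlib_data.json')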
--------------------------------------------------------------------------------
/combiner.py:
--------------------------------------------------------------------------------
# combines the two JSON files and leaves blank fields where needed
"""
{ moillusions_data.json
  "url": "https://www.moillusions.com/4-dots-illusion/",
  "img_src": "https://www.moillusions.com/wp-content/uploads/2018/01/4-dots-illusion-1-580x342.jpg",
  "alt_text": "dots illusion",
  "categories": [
    "Disappearing Effect"
  ]
},
{ viperlib_data.json
  "url": "/images/thumbnails/0000/8908/3d_photo_log_illusion_medium.jpg?1238543179",
  "title": "3d photo log illusion",
  "page_url": "/areas/16-lightness-brightness/contributions/1372-3d-photo-log-illusion",
  "category": "16-lightness-brightness",
  "description": "Photo of a 3D version of the Logvinenko Illusion. The illusory effect was weaker althou..."
},
"""
import json

moillusions = json.load(open('moillusions_data.json'))
viperlib = json.load(open('viperlib_data.json'))

# get the categories so I can manually determine which viper
# categories I should map to which mo categories
viper_categories = set()
mo_categories = set()
for sample in moillusions:
    mo_categories.update(set(sample['categories']))

for sample in viperlib:
    viper_categories.add(sample['category'])

print(viper_categories)
print(mo_categories)

# my choices are somewhat arbitrary, because mo categories are
# much more fine grained than the viper ones
viper_to_mo = {'12-colour': ['Color Adapting']}  # only the first mapping was ever filled in


viper_cats = {'12-colour', '25-faces', '18-depth', '16-lightness-brightness', '13-motion', '14-spatial-vision', '20-illusions'}
mo_cats = {'Animals', 'Escher Style', 'Anamorphosis', 'Transportation', 'Google Earth', 'Spot The Object', 'Brain Teaser', 'Games', 'Stereograms', 'Outdoor Illusions', 'Audio Illusions', 'Afterimages', 'Toys', 'Color Adapting', 'Relative Sizes', 'Scary Illusions', 'Skull Illusions', 'Sand Sculptures', 'Billboards', 'World of Weird', 'Animations', 'Art Illusion', 'Science Stuff', 'Photo Illusions', 'Video Illusions', 'Seemingly Bent', 'Art Installation', 'Disappearing Effect', 'Moving Optical Illusions', 'Body Paint', 'Transparent', 'Celebrities', 'Multiple Meanings', '3D Chalk Drawings', 'Perspective Illusions', 'Greatest Hits', 'Murals', 'Holiday Theme', 'News', 'Impossible Objects', 'Motion Illusions', 'David Blaine', 'Funny Illusions', 'Tests'}

# actually, I'm just going to use mo illusions for now....
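# A minimal sketch of the combine step the header comment describes; it is not
# part of the original script. The unified field names below are one possible
# choice, and it assumes viper_to_mo eventually gets an entry for every
# viperlib category. Defined only, never called here.
def combine(moillusions, viperlib, viper_to_mo):
    combined = []
    for sample in moillusions:
        combined.append({'img_src': sample['img_src'],
                         'title': sample['alt_text'],
                         'page_url': sample['url'],
                         'categories': sample['categories'],
                         'description': ''})   # moillusions entries have no description
    for sample in viperlib:
        combined.append({'img_src': sample['url'],
                         'title': sample['title'],
                         'page_url': sample['page_url'],
                         'categories': viper_to_mo.get(sample['category'], []),
                         'description': sample['description']})
    return combined

# e.g. json.dump(combine(moillusions, viperlib, viper_to_mo), open('combined_data.json', 'w'))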
--------------------------------------------------------------------------------
/mo-scraper.py:
--------------------------------------------------------------------------------
# Web scraper to collect all illusions from the site
# https://www.moillusions.com/
# they also have other content but we don't want that
# getting the tags (categories) would also be nice

from bs4 import BeautifulSoup

import requests
import time
import json

start_url = 'https://www.moillusions.com/one-two-face-illusion/'
url = start_url
outfile_name = str(time.time())

# image_list is a list of dicts with url, img_src, alt_text, categories for each image
image_list = list()

while True:
    try:
        print(url)
        r = requests.get(url)
        data = r.text
        soup = BeautifulSoup(data, 'lxml')

        # collect all categories
        categories = list()
        for link in soup.find_all('a', {'rel': 'category tag'}):
            categories.append(link.text)

        # get all image links, except avatar images
        for link in soup.find_all('img'):
            img_src = link.get('src')

            # easy way to filter out avatar images
            if 'https://secure.gravatar.com/avatar/' in img_src:
                continue

            alt_text = link.get('alt')
            image_data = {'url': url, 'img_src': img_src, 'alt_text': alt_text, 'categories': categories[:]}
            image_list.append(image_data)

            print('\t', img_src)

        # every 20 images collected, write a backup and print how it's going
        if (len(image_list) % 20 == 0):
            print("images scraped:", len(image_list))
            with open('.backup_' + str(len(image_list)) + '.json', 'w') as backup_file:
                backup_file.write(json.dumps(image_list))


        # get the prev button. There should only be one
        new_url = None
        for link in soup.find_all('a', {'rel': 'prev'}):
            new_url = link.get('href')

        # if no "prev" url is supplied, exit the loop. Either a failure state or the end of the pages.
        if new_url == None:
            print("failed to get next url, ending")
            break
        if new_url == start_url:
            print("looped back to start, ending")
            break
        else:
            url = new_url

        # sleep so we don't get booted
        # eh, let's not and say we did: time.sleep(0.5)

    # I needed to save when there is an error. I hope this is the right way to do it!
    except Exception as err:
        print('there has been an issue. writing json file and aborting')
        with open('failure_backup.json', 'w') as backup_file:
            backup_file.write(json.dumps(image_list))
        raise err

# after we're done or failed or blocked, write to data.json
with open('data.json', 'w') as json_file:
    json_file.write(json.dumps(image_list))

--------------------------------------------------------------------------------