├── .gitignore
├── mo-downloader.py
├── viper-downloader.py
├── README.md
├── viper-scraper.py
├── combiner.py
└── mo-scraper.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
data/*

--------------------------------------------------------------------------------
/mo-downloader.py:
--------------------------------------------------------------------------------
# downloads every image in the moillusions_data.json file
# and stores it in the 'data' folder (which is in .gitignore),
# with each entry's index in the json file as the filename

import json
import requests
import os
import shutil
import multiprocessing

if not os.path.exists('data/'):
    os.makedirs('data/')

def download_image(image_dict):
    index = image_dict['index']
    url = image_dict['img_src']

    # fix protocol-relative urls
    if url[0] == '/':
        url = 'https:' + url

    # name files with a left-padded 5 digit number plus the extension
    extension = url.split('.')[-1]
    # this happens when no extension is supplied in the url
    if len(extension) > 4:
        extension = ''
    filename = str(index).zfill(5) + '.' + extension

    # skip if it is already downloaded
    if os.path.isfile('data/' + filename):
        # print('have ', filename)
        return

    try:
        response = requests.get(url, stream=True)
    except requests.exceptions.RequestException:
        print('failed', image_dict)
        return

    print(filename)
    with open('data/' + filename, 'wb') as out_file:
        shutil.copyfileobj(response.raw, out_file)
    del response

# load the json file
moillusions = json.load(open('moillusions_data.json'))
# add 'index' to each entry
for i, data in enumerate(moillusions):
    data['index'] = i

p = multiprocessing.Pool(10)
p.map(download_image, moillusions)
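# A minimal sketch, not part of the original script: because filenames are the
# zero-padded index into moillusions_data.json, a downloaded file can be mapped
# back to its metadata. Defined only for illustration, never called here.
def lookup_metadata(filename, entries=moillusions):
    # e.g. 'data/00042.jpg' -> entry 42 of moillusions_data.json
    index = int(os.path.splitext(os.path.basename(filename))[0])
    return entries[index]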
--------------------------------------------------------------------------------
/viper-downloader.py:
--------------------------------------------------------------------------------
# downloads every image in the viperlib_data.json file
# and stores it in the 'data' folder (which is in .gitignore),
# with each entry's index in the json file as the filename

import json
import requests
import os
import shutil
import multiprocessing

if not os.path.exists('data/'):
    os.makedirs('data/')

def download_image(image_dict):
    index = image_dict['index']
    url = image_dict['url']

    # fix site-relative urls by prepending the site root
    if url[0] == '/':
        url = 'http://viperlib.york.ac.uk' + url

    # drop the query string
    url = url.split('?')[0]

    # name files with a left-padded 5 digit number plus the extension
    extension = url.split('.')[-1]

    # this happens when no extension is supplied in the url
    if len(extension) > 4:
        extension = ''
    filename = str(index).zfill(5) + '.' + extension

    # skip if it is already downloaded
    if os.path.isfile('data/' + filename):
        # print('have ', filename)
        return

    try:
        response = requests.get(url, stream=True)
    except requests.exceptions.RequestException:
        print('failed', image_dict)
        return

    print(filename)
    with open('data/' + filename, 'wb') as out_file:
        shutil.copyfileobj(response.raw, out_file)
    del response

# load the json file
viper = json.load(open('viperlib_data.json'))
# add 'index' to each entry
for i, data in enumerate(viper):
    data['index'] = i

p = multiprocessing.Pool(16)
p.map(download_image, viper)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# optical-illusion-dataset
JSON files with image links and metadata for optical illusions. Stay tuned as the collection grows and the images are released as a gzip archive from a university host.

## Files:
* mo-scraper.py: final version of the web scraping script for moillusions
* moillusions_data.json: JSON of image URLs and metadata for moillusions
* mo-downloader.py: downloads the images linked in the JSON above into data/

* viper-scraper.py: viperlib scraper script
* viperlib_data.json: viperlib image URLs and metadata
* viper-downloader.py: downloads the images linked in the JSON above into data/

## Sources:

6436 image links and metadata from https://www.moillusions.com/

1454 image links and metadata from http://viperlib.york.ac.uk/

## Download JPEGs:

The full image download can be found here (**6725** images):


A greatly reduced dataset of only images that have eye-bending patterns is here (**569** images, hand picked):


## My dataset build process:
If you want to replicate it, pick up more recent images, or if some weird bug appears and documentation becomes important:
- run a scraper to make the JSON file of all images
- run a downloader to download the images to data/
- make sure `rename`, `mogrify`, and `file` are installed on your system
- run `bash cleanconvert.sh` to convert everything to jpg and verify all images using `file` (a rough sketch of this step is at the end of this README)
- investigate any lines it prints out, but I doubt you'll get anything unless you're unlucky
- check that data/ contains lots of #####.jpg files and nothing else
- rename data/ to something like viper-data/ etc.

## Next steps
- come up with a scheme to combine the website-provided categories (some effort towards this in `combiner.py`)
- train a GAN on the filtered images
- learn a latent space for the images and do dimensionality reduction of some sort on it
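## Rough sketch of the cleanconvert step:

`cleanconvert.sh` itself isn't committed here, so the snippet below is only a guess at what that step does, pieced together from the bullets above: check every file in data/ with `file`, convert anything that isn't already a JPEG with ImageMagick's `mogrify`, and print anything suspicious. The structure and flags are assumptions, not the actual script.

```python
import os
import subprocess

for name in sorted(os.listdir('data/')):
    path = os.path.join('data', name)
    # ask `file` for the real mime type, ignoring the extension
    mime = subprocess.run(['file', '--brief', '--mime-type', path],
                          capture_output=True, text=True).stdout.strip()
    if not mime.startswith('image/'):
        print('not an image:', path, mime)   # these are the lines worth investigating
    elif mime != 'image/jpeg':
        # mogrify -format jpg writes a .jpg copy next to the original
        result = subprocess.run(['mogrify', '-format', 'jpg', path])
        if result.returncode == 0:
            os.remove(path)
        else:
            print('conversion failed:', path)
    elif not name.endswith('.jpg'):
        # already a JPEG, just normalise the extension
        os.rename(path, os.path.splitext(path)[0] + '.jpg')
```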
--------------------------------------------------------------------------------
/viper-scraper.py:
--------------------------------------------------------------------------------
# web scraper to scrape a few sections of viperlib
# http://viperlib.york.ac.uk/
# for thumbnail images and store the URLs in JSON format. Mmmm JSON....

from bs4 import BeautifulSoup

import requests
import time
import json


start_pages = ['http://viperlib.york.ac.uk/areas/16-lightness-brightness/contributions?page=*&sort=by_title',
               'http://viperlib.york.ac.uk/areas/12-colour/contributions?page=*&sort=by_title',
               'http://viperlib.york.ac.uk/areas/18-depth/contributions?page=*&sort=by_title',
               'http://viperlib.york.ac.uk/areas/25-faces/contributions?page=*&sort=by_title',
               'http://viperlib.york.ac.uk/areas/20-illusions/contributions?page=*&sort=by_title',
               'http://viperlib.york.ac.uk/areas/13-motion/contributions?page=*&sort=by_title',
               'http://viperlib.york.ac.uk/areas/14-spatial-vision/contributions?page=*&sort=by_title',]

def scrape_images_and_text(url, category):
    print(url)
    # obtain 'soup' which lets us find parts in the page
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data, 'lxml')

    # for each div class=contribution, collect data,
    # package it into a dict, and yield it
    for sample in soup.find_all('div', {'class': 'contribution'}):
        image = sample.find('img')
        image_dict = {'url': image.get('src'),
                      'title': image.get('alt'),
                      'page_url': sample.find('a').get('href'),
                      'category': category,
                      'description': tuple(sample.find_all('p'))[-1].text}
        yield image_dict
        print('\t', image_dict['title'])
        print('\t\t', image_dict)

def write_to_json(data):
    with open('data.json', 'w') as json_file:
        json_file.write(json.dumps(data))

images = list()
for base_url in start_pages:
    category = base_url.split('/')[4]
    i = 1
    # keep requesting pages until one comes back with no contributions
    while True:
        url = base_url.replace('*', str(i))
        start_len = len(images)
        images += scrape_images_and_text(url, category)
        if (start_len == len(images)):
            break
        i += 1
write_to_json(images)
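# Note (an assumption, not stated in the repo): write_to_json saves to
# 'data.json', but viper-downloader.py and the README expect 'viperlib_data.json',
# so the output presumably gets renamed before the download step, e.g.
#     os.replace('data.json', 'viperlib_data.json')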
--------------------------------------------------------------------------------
/combiner.py:
--------------------------------------------------------------------------------
# combines the two JSON files and leaves blank fields where needed
"""
{ moillusions_data.json
  "url": "https://www.moillusions.com/4-dots-illusion/",
  "img_src": "https://www.moillusions.com/wp-content/uploads/2018/01/4-dots-illusion-1-580x342.jpg",
  "alt_text": "dots illusion",
  "categories": [
    "Disappearing Effect"
  ]
},
{ viperlib_data.json
  "url": "/images/thumbnails/0000/8908/3d_photo_log_illusion_medium.jpg?1238543179",
  "title": "3d photo log illusion",
  "page_url": "/areas/16-lightness-brightness/contributions/1372-3d-photo-log-illusion",
  "category": "16-lightness-brightness",
  "description": "Photo of a 3D version of the Logvinenko Illusion. The illusory effect was weaker althou..."
},
"""
import json

moillusions = json.load(open('moillusions_data.json'))
viperlib = json.load(open('viperlib_data.json'))

# get the categories so I can manually determine which viper
# categories I should map to which mo categories
viper_categories = set()
mo_categories = set()
for sample in moillusions:
    mo_categories.update(set(sample['categories']))

for sample in viperlib:
    viper_categories.add(sample['category'])

print(viper_categories)
print(mo_categories)

# my choices are somewhat arbitrary, because mo categories are
# much more fine grained than the viper ones
viper_to_mo = {'12-colour': ['Color Adapting']}  # only the first mapping was ever filled in


viper_cats = {'12-colour', '25-faces', '18-depth', '16-lightness-brightness', '13-motion', '14-spatial-vision', '20-illusions'}
mo_cats = {'Animals', 'Escher Style', 'Anamorphosis', 'Transportation', 'Google Earth', 'Spot The Object', 'Brain Teaser', 'Games', 'Stereograms', 'Outdoor Illusions', 'Audio Illusions', 'Afterimages', 'Toys', 'Color Adapting', 'Relative Sizes', 'Scary Illusions', 'Skull Illusions', 'Sand Sculptures', 'Billboards', 'World of Weird', 'Animations', 'Art Illusion', 'Science Stuff', 'Photo Illusions', 'Video Illusions', 'Seemingly Bent', 'Art Installation', 'Disappearing Effect', 'Moving Optical Illusions', 'Body Paint', 'Transparent', 'Celebrities', 'Multiple Meanings', '3D Chalk Drawings', 'Perspective Illusions', 'Greatest Hits', 'Murals', 'Holiday Theme', 'News', 'Impossible Objects', 'Motion Illusions', 'David Blaine', 'Funny Illusions', 'Tests'}

# actually, I'm just going to use mo illusions for now....
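# A minimal sketch of the combine step the header comment describes; it is not
# part of the original script. The unified field names below are one possible
# choice, and it assumes viper_to_mo eventually gets an entry for every
# viperlib category. Defined only, never called here.
def combine(moillusions, viperlib, viper_to_mo):
    combined = []
    for sample in moillusions:
        combined.append({'img_src': sample['img_src'],
                         'title': sample['alt_text'],
                         'page_url': sample['url'],
                         'categories': sample['categories'],
                         'description': ''})   # moillusions entries have no description
    for sample in viperlib:
        combined.append({'img_src': sample['url'],
                         'title': sample['title'],
                         'page_url': sample['page_url'],
                         'categories': viper_to_mo.get(sample['category'], []),
                         'description': sample['description']})
    return combined

# e.g. json.dump(combine(moillusions, viperlib, viper_to_mo), open('combined_data.json', 'w'))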
--------------------------------------------------------------------------------
/mo-scraper.py:
--------------------------------------------------------------------------------
# Web scraper to collect all illusions from the site
# https://www.moillusions.com/
# they also have other content but we don't want that
# getting the tags (categories) would also be nice

from bs4 import BeautifulSoup

import requests
import time
import json

start_url = 'https://www.moillusions.com/one-two-face-illusion/'
url = start_url
outfile_name = str(time.time())

# image_list is a list of dicts with url, img_src, alt_text, categories for each image
image_list = list()

while True:
    try:
        print(url)
        r = requests.get(url)
        data = r.text
        soup = BeautifulSoup(data, 'lxml')

        # collect all categories
        categories = list()
        for link in soup.find_all('a', {'rel': 'category tag'}):
            categories.append(link.text)

        # get all image links, except avatar images
        for link in soup.find_all('img'):
            img_src = link.get('src')

            # easy way to filter out avatar images
            if 'https://secure.gravatar.com/avatar/' in img_src:
                continue

            alt_text = link.get('alt')
            image_data = {'url': url, 'img_src': img_src, 'alt_text': alt_text, 'categories': categories[:]}
            image_list.append(image_data)

            print('\t', img_src)

        # every 20 images collected, write a backup and print how it's going
        if (len(image_list) % 20 == 0):
            print("images scraped:", len(image_list))
            with open('.backup_' + str(len(image_list)) + '.json', 'w') as backup_file:
                backup_file.write(json.dumps(image_list))


        # get the prev button. There should only be one
        new_url = None
        for link in soup.find_all('a', {'rel': 'prev'}):
            new_url = link.get('href')

        # if no "prev" url is supplied, exit the loop. Either a failure state or the end of the pages.
        if new_url == None:
            print("failed to get next url, ending")
            break
        if new_url == start_url:
            print("looped back to start, ending")
            break
        else:
            url = new_url

        # sleep so we don't get booted
        # eh, let's not and say we did: time.sleep(0.5)

    # I needed to save when there is an error. I hope this is the right way to do it!
    except Exception as err:
        print('there has been an issue. writing json file and aborting')
        with open('failure_backup.json', 'w') as backup_file:
            backup_file.write(json.dumps(image_list))
        raise err

# after we're done or failed or blocked, write to data.json
with open('data.json', 'w') as json_file:
    json_file.write(json.dumps(image_list))

--------------------------------------------------------------------------------