├── .github └── workflows │ └── pythonapp.yml ├── .gitignore ├── README.md ├── async_tile_fetcher.py ├── decryption.py ├── example.py ├── gapdecoder.iml ├── requirements.txt └── tile_fetch.py /.github/workflows/pythonapp.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - uses: actions/setup-python@v1 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | - name: Test image "the water carrier la aguadora" 25 | run: python tile_fetch.py --zoom 1 "https://artsandculture.google.com/asset/the-water-carrier-la-aguadora/UwE2fGsMlWHuMg" 26 | - name: Test image "waterloo bridge claude monet" 27 | run: python tile_fetch.py --zoom 1 "https://artsandculture.google.com/asset/waterloo-bridge-claude-monet/DwE25c4VOCwboQ" 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.jp 3 | *.pyc 4 | .idea/ 5 | .vscode/ 6 | .DS_Store 7 | 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GAP decoder 2 | 3 | This project aims at making it possible to download images from 4 | [google arts and culture](https://artsandculture.google.com/) 5 | (formerly Google Art Project). 6 | 7 | ## How to use 8 | 9 | > **Note** : If you are not comfortable with installing a scripting language on your computer, you can use instead of gapdecoder: 10 | > - [dezoomify](https://ophir.alwaysdata.net/dezoomify/dezoomify.html), which can be used online without downloading anything to your computer, but limits the maximum size of downloaded images. 11 | > - [dezoomify-rs](https://github.com/lovasoa/dezoomify-rs#dezoomify-rs), which comes as ready-to-use executable. 12 | 13 | 14 | First, install [python 3](https://www.python.org/) on your system, 15 | and install the dependencies: 16 | 17 | ```bash 18 | python3 -m pip install -r requirements.txt 19 | ``` 20 | 21 | Then, run the code 22 | 23 | ```bash 24 | python3 tile_fetch.py --zoom 4 "https://artsandculture.google.com/asset/the-water-carrier-la-aguadora/UwE2fGsMlWHuMg" 25 | ``` 26 | 27 | You can of course change the zoom level and the URL. 28 | If you omit the zoom level, the script will display the list of available levels. 29 | 30 | Run with the '-h' flag for a list of available commands. 31 | 32 | ## Technical details 33 | 34 | This project required reverse-engineering google's code to find 35 | the protection measures in place and circumvent them. 36 | Here is what was found. 37 | 38 | ### Tile URLs 39 | 40 | The tile URLs are signed using HMAC. 41 | [See the details](./tile_fetch.py) 42 | 43 | ### Tile images 44 | 45 | The tile images are encoded using AES 128 CBC. 46 | [See the details](./decryption.py) 47 | -------------------------------------------------------------------------------- /async_tile_fetcher.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | 4 | def exponential_backoff(f, n=5, err=Exception): 5 | async def modified(*args, **kwargs): 6 | for i in range(n): 7 | try: 8 | return await f(*args, **kwargs) 9 | except err: 10 | if i < n - 1: 11 | await asyncio.sleep(2 ** i) 12 | else: 13 | raise err 14 | 15 | return modified 16 | 17 | 18 | @exponential_backoff 19 | async def fetch(session, url, destination): 20 | if destination.is_file(): 21 | return destination.read_bytes() 22 | async with session.get(url) as response: 23 | response.raise_for_status() 24 | file_bytes = await response.read() 25 | destination.write_bytes(file_bytes) 26 | return file_bytes 27 | 28 | 29 | async def gather_progress(awaitables): 30 | """ 31 | Gather awaitables, printing the completion ratio to stdout 32 | """ 33 | done = [] 34 | 35 | async def print_percent(awaitable, done): 36 | res = await awaitable 37 | done.append(res) 38 | msg = "{:.1f}%".format(100 * len(done) / len(awaitables)) 39 | print(msg, end='\r') 40 | return res 41 | 42 | total = await asyncio.gather(*[ 43 | print_percent(a, done) 44 | for i, a in enumerate(awaitables) 45 | ]) 46 | print() # Print a new line after the percentages 47 | return total 48 | -------------------------------------------------------------------------------- /decryption.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding: utf-8 3 | import struct 4 | from Crypto.Cipher import AES 5 | 6 | aes_key = bytes.fromhex('5b63db113b7af3e0b1435556c8f9530c') 7 | aes_iv = bytes.fromhex('71e70405353a778bfa6fbc30321b9592') 8 | 9 | 10 | def aes_decrypt_buffer(buffer): 11 | """ 12 | >>> aes_decrypt_buffer(b"0123456789abcdef"*2).hex() 13 | 'a35fd5bfdb47815bcbe4b39e596a9358e289e389da48c0e709b26ecc081563ac' 14 | """ 15 | cipher = AES.new(aes_key, AES.MODE_CBC, iv=aes_iv) 16 | return cipher.decrypt(buffer) 17 | 18 | 19 | def split_buffer_in_3(buf, idx1, idx2): 20 | return buf[:idx1], buf[idx1:idx2], buf[idx2:] 21 | 22 | 23 | def decrypt(image): 24 | """ 25 | >>> x = "0A0A0A0A BABAC0C0 10000000 01010101 01010101 01010101 01010101 DEADBEAF 04000000" 26 | >>> decrypt(bytes.fromhex(x)).hex() 27 | 'babac0c0ca251118030ff9aff186bdccbce26a4cdeadbeaf' 28 | """ 29 | # The file is composed of a constant header, a body, 30 | # and a last 4-byte word indicating the start of the encrypted part 31 | encryption_marker, body, index_bytes = split_buffer_in_3(image, 4, -4) 32 | 33 | # return if the encryption marker isn't present at the start of the file 34 | if encryption_marker != b"\x0A\x0A\x0A\x0A": 35 | return image 36 | 37 | # Use the last 4 bytes to get the index of the bytes to be replaced 38 | (index,) = struct.unpack(" 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pycryptodome 2 | lxml 3 | Pillow 4 | aiohttp -------------------------------------------------------------------------------- /tile_fetch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding: utf-8 3 | import asyncio 4 | import base64 5 | import hmac 6 | import io 7 | import itertools 8 | import re 9 | import shutil 10 | import string 11 | import urllib.parse 12 | import urllib.request 13 | from pathlib import Path 14 | 15 | import aiohttp 16 | from PIL import Image 17 | from lxml import etree 18 | 19 | import async_tile_fetcher 20 | from decryption import decrypt 21 | 22 | IV = bytes.fromhex("7b2b4e23de2cc5c5") 23 | 24 | 25 | def compute_url(path, token, x, y, z): 26 | """ 27 | >>> path = b'wGcDNN8L-2COcm9toX5BTp6HPxpMPPPuxrMU-ZL-W-nDHW8I_L4R5vlBJ6ITtlmONQ' 28 | >>> token = b'KwCgJ1QIfgprHn0a93x7Q-HhJ04' 29 | >>> compute_url(path, token, 0, 0, 7) 30 | 'https://lh3.googleusercontent.com/wGcDNN8L-2COcm9toX5BTp6HPxpMPPPuxrMU-ZL-W-nDHW8I_L4R5vlBJ6ITtlmONQ=x0-y0-z7-tHeJ3xylnSyyHPGwMZimI4EV3JP8' 31 | """ 32 | sign_path = b'%s=x%d-y%d-z%d-t%s' % (path, x, y, z, token) 33 | encoded = hmac.new(IV, sign_path, 'sha1').digest() 34 | signature = base64.b64encode(encoded, b'__')[:-1] 35 | url_bytes = b'https://lh3.googleusercontent.com/%s=x%d-y%d-z%d-t%s' % (path, x, y, z, signature) 36 | return url_bytes.decode('utf-8') 37 | 38 | 39 | class ImageInfo(object): 40 | RE_URL_PATH_TOKEN = re.compile(rb'],"(//[^"/]+/[^"/]+)",(?:"([^"]+)"|null)', re.MULTILINE) 41 | 42 | def __init__(self, url): 43 | page_source = urllib.request.urlopen(url).read() 44 | 45 | match = self.RE_URL_PATH_TOKEN.search(page_source) 46 | if match is None: 47 | raise ValueError("Unable to find google arts image token") 48 | url_no_proto, token = match.groups() 49 | assert url_no_proto, "Unable to extract required information from the page" 50 | self.path = url_no_proto.rsplit(b'/', 1)[1] 51 | self.token = token or b'' 52 | url_path = urllib.parse.unquote_plus(urllib.parse.urlparse(url).path) 53 | self.image_slug, image_id = url_path.split('/')[-2:] 54 | self.image_name = '%s - %s' % (string.capwords(self.image_slug.replace("-"," ")), image_id) 55 | 56 | meta_info_url = "https:{}=g".format(url_no_proto.decode('utf8')) 57 | meta_info_tree = etree.fromstring(urllib.request.urlopen(meta_info_url).read()) 58 | self.tile_width = int(meta_info_tree.attrib['tile_width']) 59 | self.tile_height = int(meta_info_tree.attrib['tile_height']) 60 | self.tile_info = [ 61 | ZoomLevelInfo(self, i, attrs.attrib) 62 | for i, attrs in enumerate(meta_info_tree.xpath('//pyramid_level')) 63 | ] 64 | 65 | def url(self, x, y, z): 66 | return compute_url(self.path, self.token, x, y, z) 67 | 68 | def __repr__(self): 69 | return '{} - zoom levels:\n{}'.format( 70 | self.image_slug, 71 | '\n'.join(map(str, self.tile_info)) 72 | ) 73 | 74 | 75 | class ZoomLevelInfo(object): 76 | def __init__(self, img_info, level_num, attrs): 77 | self.num = level_num 78 | self.num_tiles_x = int(attrs['num_tiles_x']) 79 | self.num_tiles_y = int(attrs['num_tiles_y']) 80 | self.empty_x = int(attrs['empty_pels_x']) 81 | self.empty_y = int(attrs['empty_pels_y']) 82 | self.img_info = img_info 83 | 84 | @property 85 | def size(self): 86 | return ( 87 | self.num_tiles_x * self.img_info.tile_width - self.empty_x, 88 | self.num_tiles_y * self.img_info.tile_height - self.empty_y 89 | ) 90 | 91 | @property 92 | def total_tiles(self): 93 | return self.num_tiles_x * self.num_tiles_y 94 | 95 | def __repr__(self): 96 | return 'level {level.num:2d}: {level.size[0]:6d} x {level.size[1]:6d} ({level.total_tiles:6d} tiles)'.format( 97 | level=self) 98 | 99 | 100 | async def fetch_tile(session, image_info, tiles_dir, x, y, z): 101 | file_path = tiles_dir / ('%sx%sx%s.jpg' % (x, y, z)) 102 | image_url = image_info.url(x, y, z) 103 | encrypted_bytes = await async_tile_fetcher.fetch(session, image_url, file_path) 104 | return x, y, encrypted_bytes 105 | 106 | 107 | async def load_tiles(info, z=-1, outfile=None, quality=90): 108 | if z >= len(info.tile_info): 109 | print( 110 | 'Invalid zoom level {z}. ' 111 | 'The maximum zoom level is {max}, using that instead.'.format( 112 | z=z, 113 | max=len(info.tile_info) - 1) 114 | ) 115 | z = len(info.tile_info) - 1 116 | 117 | z %= len(info.tile_info) # keep 0 <= z < len(tile_info) 118 | level = info.tile_info[z] 119 | 120 | img = Image.new(mode="RGB", size=level.size) 121 | 122 | tiles_dir = Path(info.image_name) 123 | tiles_dir.mkdir(exist_ok=True) 124 | 125 | async with aiohttp.ClientSession() as session: 126 | awaitable_tiles = [ 127 | fetch_tile(session, info, tiles_dir, x, y, z) 128 | for (x, y) in itertools.product( 129 | range(level.num_tiles_x), 130 | range(level.num_tiles_y)) 131 | ] 132 | print("Downloading tiles...") 133 | tiles = await async_tile_fetcher.gather_progress(awaitable_tiles) 134 | 135 | for x, y, encrypted_bytes in tiles: 136 | clear_bytes = decrypt(encrypted_bytes) 137 | tile_img = Image.open(io.BytesIO(clear_bytes)) 138 | img.paste(tile_img, (x * info.tile_width, y * info.tile_height)) 139 | 140 | print("Downloaded all tiles. Saving...") 141 | final_image_filename = outfile or (info.image_name + '.jpg') 142 | img.save(final_image_filename, quality=quality, subsampling=0) 143 | shutil.rmtree(tiles_dir) 144 | print("Saved the result as " + final_image_filename) 145 | 146 | 147 | def main(): 148 | import argparse 149 | 150 | parser = argparse.ArgumentParser(description='Download all image tiles from Google Arts and Culture website') 151 | parser.add_argument('url', type=str, nargs='?', help='an artsandculture.google.com url') 152 | parser.add_argument('--zoom', type=int, nargs='?', 153 | help='Zoom level to fetch, can be negative. Will print zoom levels if omitted') 154 | parser.add_argument('--outfile', type=str, nargs='?', 155 | help='The name of the file to create.') 156 | parser.add_argument('--quality', type=int, nargs='?', default=90, 157 | help='Compression level from 0-95. Higher is better.') 158 | args = parser.parse_args() 159 | 160 | assert 0 <= args.quality <= 95, "Image quality must be between 0 and 95" 161 | url = args.url or input("Enter the url of the image: ") 162 | 163 | print("Downloading image meta-information...") 164 | image_info = ImageInfo(url) 165 | 166 | zoom = args.zoom 167 | if zoom is None: 168 | print(image_info) 169 | while True: 170 | try: 171 | zoom = int(input("Which level do you want to download? ")) 172 | assert 0 <= zoom < len(image_info.tile_info) 173 | break 174 | except (ValueError, AssertionError): 175 | print("Not a valid zoom level.") 176 | 177 | coro = load_tiles(image_info, zoom, args.outfile, args.quality) 178 | loop = asyncio.get_event_loop() 179 | loop.run_until_complete(coro) 180 | 181 | 182 | if __name__ == '__main__': 183 | main() 184 | --------------------------------------------------------------------------------