├── .github
    └── workflows
    │   └── pythonapp.yml
├── .gitignore
├── README.md
├── async_tile_fetcher.py
├── decryption.py
├── example.py
├── gapdecoder.iml
├── requirements.txt
└── tile_fetch.py


/.github/workflows/pythonapp.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 3 | 
 4 | name: Python application
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ master ]
 9 |   pull_request:
10 |     branches: [ master ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 | 
17 |     steps:
18 |     - uses: actions/checkout@v2
19 |     - uses: actions/setup-python@v1
20 |     - name: Install dependencies
21 |       run: |
22 |         python -m pip install --upgrade pip
23 |         pip install -r requirements.txt
24 |     - name: Test image "the water carrier la aguadora"
25 |       run: python tile_fetch.py --zoom 1 "https://artsandculture.google.com/asset/the-water-carrier-la-aguadora/UwE2fGsMlWHuMg"
26 |     - name: Test image "waterloo bridge claude monet"
27 |       run: python tile_fetch.py --zoom 1 "https://artsandculture.google.com/asset/waterloo-bridge-claude-monet/DwE25c4VOCwboQ"
28 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.jpg
3 | *.pyc
4 | .idea/
5 | .vscode/
6 | .DS_Store
7 | 
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # GAP decoder
 2 | 
 3 | This project aims at making it possible to download images from
 4 | [google arts and culture](https://artsandculture.google.com/)
 5 | (formerly Google Art Project).
 6 | 
 7 | ## How to use
 8 | 
 9 | > **Note**: If you are not comfortable with installing a scripting language on your computer, you can use one of the following tools instead of gapdecoder:
10 | > - [dezoomify](https://ophir.alwaysdata.net/dezoomify/dezoomify.html), which can be used online without downloading anything to your computer, but limits the maximum size of downloaded images. 
11 | > - [dezoomify-rs](https://github.com/lovasoa/dezoomify-rs#dezoomify-rs), which comes as a ready-to-use executable.
12 | 
13 | 
14 | First, install [python 3](https://www.python.org/) on your system,
15 | and install the dependencies:
16 | 
17 | ```bash
18 | python3 -m pip install -r requirements.txt 
19 | ```
20 | 
21 | Then, run the code
22 | 
23 | ```bash
24 | python3 tile_fetch.py --zoom 4 "https://artsandculture.google.com/asset/the-water-carrier-la-aguadora/UwE2fGsMlWHuMg"
25 | ```
26 | 
27 | You can of course change the zoom level and the URL.
28 | If you omit the zoom level, the script will display the list of available levels.
29 | 
30 | Run with the '-h' flag for a list of available commands.
31 | 
32 | ## Technical details
33 | 
34 | This project required reverse-engineering Google's code to find
35 | the protection measures in place and circumvent them.
36 | Here is what was found.
37 | 
38 | ### Tile URLs
39 | 
40 | The tile URLs are signed using HMAC.
41 | [See the details](./tile_fetch.py)
42 | 
43 | ### Tile images
44 | 
45 | The tile images are encrypted using AES-128 in CBC mode.
46 | [See the details](./decryption.py)
47 | 


--------------------------------------------------------------------------------
/async_tile_fetcher.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | 
 3 | 
 4 | def exponential_backoff(f, n=5, err=Exception):
 5 |     async def modified(*args, **kwargs):
 6 |         for i in range(n):
 7 |             try:
 8 |                 return await f(*args, **kwargs)
 9 |             except err:
10 |                 if i < n - 1:
11 |                     await asyncio.sleep(2 ** i)
12 |                 else:
13 |                     raise err
14 | 
15 |     return modified
16 | 
17 | 
@exponential_backoff
async def fetch(session, url, destination):
    """
    Return the content of `url`, using `destination` as an on-disk cache.

    If `destination` is already an existing file its bytes are returned and
    no request is made. Otherwise `url` is requested through `session`, an
    HTTP error status raises, and the response body is written to
    `destination` before being returned.
    """
    if not destination.is_file():
        async with session.get(url) as response:
            response.raise_for_status()
            payload = await response.read()
            destination.write_bytes(payload)
            return payload
    return destination.read_bytes()
27 | 
28 | 
async def gather_progress(awaitables):
    """
    Gather awaitables, printing the completion ratio to stdout

    `awaitables` may be any iterable (list, tuple, generator, ...); it is
    consumed once up front so that the total count is known.
    Returns the list of results, in the same order as the input.
    """
    pending = list(awaitables)
    total_count = len(pending)
    finished = 0

    async def print_percent(awaitable):
        # Await one item, then report the share of items completed so far
        nonlocal finished
        res = await awaitable
        finished += 1
        msg = "{:.1f}%".format(100 * finished / total_count)
        # '\r' keeps the percentage on a single, continuously rewritten line
        print(msg, end='\r')
        return res

    total = await asyncio.gather(*[print_percent(a) for a in pending])
    print()  # Print a new line after the percentages
    return total
48 | 


--------------------------------------------------------------------------------
/decryption.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # coding: utf-8
 3 | import struct
 4 | from Crypto.Cipher import AES
 5 | 
 6 | aes_key = bytes.fromhex('5b63db113b7af3e0b1435556c8f9530c')
 7 | aes_iv = bytes.fromhex('71e70405353a778bfa6fbc30321b9592')
 8 | 
 9 | 
10 | def aes_decrypt_buffer(buffer):
11 |     """
12 |     >>> aes_decrypt_buffer(b"0123456789abcdef"*2).hex()
13 |     'a35fd5bfdb47815bcbe4b39e596a9358e289e389da48c0e709b26ecc081563ac'
14 |     """
15 |     cipher = AES.new(aes_key, AES.MODE_CBC, iv=aes_iv)
16 |     return cipher.decrypt(buffer)
17 | 
18 | 
def split_buffer_in_3(buf, idx1, idx2):
    """
    Cut `buf` at `idx1` and `idx2` and return the three resulting slices.

    The indices follow the usual slicing rules, so negative values count
    from the end of the buffer.
    """
    head = buf[:idx1]
    middle = buf[idx1:idx2]
    tail = buf[idx2:]
    return head, middle, tail
21 | 
22 | 
def decrypt(image):
    """
    Return the clear bytes of a tile, decrypting its encrypted section if any.

    Input that does not start with the 4-byte marker 0A 0A 0A 0A is returned
    unchanged. Otherwise the marker and the trailing 4-byte word are dropped
    and the encrypted section of the remaining body is replaced by its
    decrypted form.

    >>> x = "0A0A0A0A BABAC0C0 10000000 01010101 01010101 01010101 01010101 DEADBEAF 04000000"
    >>> decrypt(bytes.fromhex(x)).hex()
    'babac0c0ca251118030ff9aff186bdccbce26a4cdeadbeaf'
    """
    # Nothing to do when the encryption marker is absent
    if image[:4] != b"\x0A\x0A\x0A\x0A":
        return image

    # Layout: 4-byte marker | body | 4-byte little-endian offset into the body
    body = image[4:-4]
    (offset,) = struct.unpack("<i", image[-4:])

    # At `offset`, a 4-byte little-endian length precedes the encrypted bytes
    (encrypted_length,) = struct.unpack("<i", body[offset:offset + 4])
    payload = body[offset + 4:]

    # Reassemble: clear prefix, decrypted section, clear suffix
    return b"".join((
        body[:offset],
        aes_decrypt_buffer(payload[:encrypted_length]),
        payload[encrypted_length:],
    ))
49 | 


--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
 1 | from decryption import decrypt
 2 | import sys
 3 | import os
 4 | 
 5 | input = sys.argv[1]
 6 | output = sys.argv[2]
 7 | 
 8 | image = open(input, "rb").read()
 9 | 
10 | image = decrypt(image)
11 | 
12 | open(output, "wb").write(image)
13 | 
14 | print(f"Decrypted {os.path.basename(input)} to {os.path.basename(output)}.")
15 | 


--------------------------------------------------------------------------------
/gapdecoder.iml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <module type="PYTHON_MODULE" version="4">
3 |   <component name="NewModuleRootManager" inherit-compiler-output="true">
4 |     <exclude-output />
5 |     <content url="file://$MODULE_DIR$" />
6 |     <orderEntry type="inheritedJdk" />
7 |     <orderEntry type="sourceFolder" forTests="false" />
8 |   </component>
9 | </module>


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pycryptodome
2 | lxml
3 | Pillow
4 | aiohttp


--------------------------------------------------------------------------------
/tile_fetch.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # coding: utf-8
  3 | import asyncio
  4 | import base64
  5 | import hmac
  6 | import io
  7 | import itertools
  8 | import re
  9 | import shutil
 10 | import string
 11 | import urllib.parse
 12 | import urllib.request
 13 | from pathlib import Path
 14 | 
 15 | import aiohttp
 16 | from PIL import Image
 17 | from lxml import etree
 18 | 
 19 | import async_tile_fetcher
 20 | from decryption import decrypt
 21 | 
 22 | IV = bytes.fromhex("7b2b4e23de2cc5c5")
 23 | 
 24 | 
 25 | def compute_url(path, token, x, y, z):
 26 |     """
 27 |     >>> path = b'wGcDNN8L-2COcm9toX5BTp6HPxpMPPPuxrMU-ZL-W-nDHW8I_L4R5vlBJ6ITtlmONQ'
 28 |     >>> token = b'KwCgJ1QIfgprHn0a93x7Q-HhJ04'
 29 |     >>> compute_url(path, token, 0, 0, 7)
 30 |     'https://lh3.googleusercontent.com/wGcDNN8L-2COcm9toX5BTp6HPxpMPPPuxrMU-ZL-W-nDHW8I_L4R5vlBJ6ITtlmONQ=x0-y0-z7-tHeJ3xylnSyyHPGwMZimI4EV3JP8'
 31 |     """
 32 |     sign_path = b'%s=x%d-y%d-z%d-t%s' % (path, x, y, z, token)
 33 |     encoded = hmac.new(IV, sign_path, 'sha1').digest()
 34 |     signature = base64.b64encode(encoded, b'__')[:-1]
 35 |     url_bytes = b'https://lh3.googleusercontent.com/%s=x%d-y%d-z%d-t%s' % (path, x, y, z, signature)
 36 |     return url_bytes.decode('utf-8')
 37 | 
 38 | 
 39 | class ImageInfo(object):
 40 |     RE_URL_PATH_TOKEN = re.compile(rb'],"(//[^"/]+/[^"/]+)",(?:"([^"]+)"|null)', re.MULTILINE)
 41 | 
 42 |     def __init__(self, url):
 43 |         page_source = urllib.request.urlopen(url).read()
 44 | 
 45 |         match = self.RE_URL_PATH_TOKEN.search(page_source)
 46 |         if match is None:
 47 |             raise ValueError("Unable to find google arts image token")
 48 |         url_no_proto, token = match.groups()
 49 |         assert url_no_proto, "Unable to extract required information from the page"
 50 |         self.path = url_no_proto.rsplit(b'/', 1)[1]
 51 |         self.token = token or b''
 52 |         url_path = urllib.parse.unquote_plus(urllib.parse.urlparse(url).path)
 53 |         self.image_slug, image_id = url_path.split('/')[-2:]
 54 |         self.image_name = '%s - %s' % (string.capwords(self.image_slug.replace("-"," ")), image_id)
 55 | 
 56 |         meta_info_url = "https:{}=g".format(url_no_proto.decode('utf8'))
 57 |         meta_info_tree = etree.fromstring(urllib.request.urlopen(meta_info_url).read())
 58 |         self.tile_width = int(meta_info_tree.attrib['tile_width'])
 59 |         self.tile_height = int(meta_info_tree.attrib['tile_height'])
 60 |         self.tile_info = [
 61 |             ZoomLevelInfo(self, i, attrs.attrib)
 62 |             for i, attrs in enumerate(meta_info_tree.xpath('//pyramid_level'))
 63 |         ]
 64 | 
 65 |     def url(self, x, y, z):
 66 |         return compute_url(self.path, self.token, x, y, z)
 67 | 
 68 |     def __repr__(self):
 69 |         return '{} - zoom levels:\n{}'.format(
 70 |             self.image_slug,
 71 |             '\n'.join(map(str, self.tile_info))
 72 |         )
 73 | 
 74 | 
 75 | class ZoomLevelInfo(object):
 76 |     def __init__(self, img_info, level_num, attrs):
 77 |         self.num = level_num
 78 |         self.num_tiles_x = int(attrs['num_tiles_x'])
 79 |         self.num_tiles_y = int(attrs['num_tiles_y'])
 80 |         self.empty_x = int(attrs['empty_pels_x'])
 81 |         self.empty_y = int(attrs['empty_pels_y'])
 82 |         self.img_info = img_info
 83 | 
 84 |     @property
 85 |     def size(self):
 86 |         return (
 87 |             self.num_tiles_x * self.img_info.tile_width - self.empty_x,
 88 |             self.num_tiles_y * self.img_info.tile_height - self.empty_y
 89 |         )
 90 | 
 91 |     @property
 92 |     def total_tiles(self):
 93 |         return self.num_tiles_x * self.num_tiles_y
 94 | 
 95 |     def __repr__(self):
 96 |         return 'level {level.num:2d}: {level.size[0]:6d} x {level.size[1]:6d} ({level.total_tiles:6d} tiles)'.format(
 97 |             level=self)
 98 | 
 99 | 
async def fetch_tile(session, image_info, tiles_dir, x, y, z):
    """
    Fetch the tile at column `x`, row `y` of zoom level `z`.

    The tile is stored in `tiles_dir` under the name "<x>x<y>x<z>.jpg".
    Returns the tuple (x, y, bytes of the tile as fetched).
    """
    tile_name = '%sx%sx%s.jpg' % (x, y, z)
    tile_path = tiles_dir / tile_name
    tile_url = image_info.url(x, y, z)
    tile_bytes = await async_tile_fetcher.fetch(session, tile_url, tile_path)
    return x, y, tile_bytes
105 | 
106 | 
async def load_tiles(info, z=-1, outfile=None, quality=90):
    """
    Download all tiles of one zoom level, assemble them and save the result.

    info:    object exposing tile_info, tile_width, tile_height, image_name
             and url(), as ImageInfo does
    z:       zoom level; a value past the last level is replaced by the last
             level, and negative values are wrapped by a modulo
    outfile: name of the file to write; defaults to "<image_name>.jpg"
    quality: quality setting passed on when saving the image

    Tiles are kept in a directory named after the image while downloading;
    that directory is removed once the final image has been saved.
    """
    level_count = len(info.tile_info)
    if z >= level_count:
        warning = (
            'Invalid zoom level {z}. '
            'The maximum zoom level is {max}, using that instead.'
        )
        print(warning.format(z=z, max=level_count - 1))
        z = level_count - 1

    z = z % level_count  # keep 0 <= z < level_count
    level = info.tile_info[z]

    canvas = Image.new(mode="RGB", size=level.size)

    tiles_dir = Path(info.image_name)
    tiles_dir.mkdir(exist_ok=True)

    async with aiohttp.ClientSession() as session:
        # One download per grid cell, column by column
        pending = [
            fetch_tile(session, info, tiles_dir, x, y, z)
            for x in range(level.num_tiles_x)
            for y in range(level.num_tiles_y)
        ]
        print("Downloading tiles...")
        tiles = await async_tile_fetcher.gather_progress(pending)

    for x, y, encrypted_bytes in tiles:
        tile_img = Image.open(io.BytesIO(decrypt(encrypted_bytes)))
        # Each tile is pasted at its grid position scaled by the tile size
        canvas.paste(tile_img, (x * info.tile_width, y * info.tile_height))

    print("Downloaded all tiles. Saving...")
    final_image_filename = outfile if outfile else info.image_name + '.jpg'
    canvas.save(final_image_filename, quality=quality, subsampling=0)
    shutil.rmtree(tiles_dir)
    print("Saved the result as " + final_image_filename)
145 | 
146 | 
def main():
    """
    Command-line entry point.

    Parses the arguments, asks interactively for the url and the zoom level
    when they were not given, then downloads and saves the image.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Download all image tiles from Google Arts and Culture website')
    parser.add_argument('url', type=str, nargs='?', help='an artsandculture.google.com url')
    parser.add_argument('--zoom', type=int, nargs='?',
                        help='Zoom level to fetch, can be negative. Will print zoom levels if omitted')
    parser.add_argument('--outfile', type=str, nargs='?',
                        help='The name of the file to create.')
    parser.add_argument('--quality', type=int, nargs='?', default=90,
                        help='Compression level from 0-95. Higher is better.')
    args = parser.parse_args()

    # Explicit validation instead of `assert`, which is removed under `-O`.
    # `--quality` given without a value yields None because of nargs='?'.
    if args.quality is None or not 0 <= args.quality <= 95:
        parser.error("Image quality must be between 0 and 95")
    url = args.url or input("Enter the url of the image: ")

    print("Downloading image meta-information...")
    image_info = ImageInfo(url)

    zoom = args.zoom
    if zoom is None:
        print(image_info)
        # Ask again until an existing level number is entered
        while True:
            try:
                zoom = int(input("Which level do you want to download? "))
            except ValueError:
                zoom = None
            if zoom is not None and 0 <= zoom < len(image_info.tile_info):
                break
            print("Not a valid zoom level.")

    # asyncio.run creates and closes the event loop itself; calling
    # asyncio.get_event_loop() without a running loop is deprecated
    asyncio.run(load_tiles(image_info, zoom, args.outfile, args.quality))


if __name__ == '__main__':
    main()
184 | 


--------------------------------------------------------------------------------