├── README.md ├── LICENSE.md └── vma.py /README.md: -------------------------------------------------------------------------------- 1 | VMA extractor 2 | ============= 3 | 4 | `vma.py` implements an extraction tool for the VMA backup format used by 5 | [Proxmox](https://www.proxmox.com). The tool is implemented in Python3. 6 | 7 | Usage: 8 | ```sh 9 | ./vma.py path/to/source.vma path/to/target/directory 10 | ``` 11 | 12 | I think it is pretty important to be able to read Proxmox backups outside of a 13 | Proxmox environment. Yet, porting their VMA implementation to a standalone 14 | tool proved difficult. VMA-Reader and VMA-Writer are implemented as patches to 15 | the Proxmox-patched version of Qemu and are thus very difficult to compile on 16 | non-Proxmox systems. 17 | 18 | The format specification can be found on [git.proxmox.com](https://git.proxmox.com/?p=pve-qemu.git;a=blob_plain;f=vma_spec.txt;hb=refs/heads/master). 19 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019-2020 Jan Wolff 2 | 3 | This software is provided 'as-is', without any express or implied 4 | warranty. In no event will the authors be held liable for any damages 5 | arising from the use of this software. 6 | 7 | Permission is granted to anyone to use this software for any purpose, 8 | including commercial applications, and to alter it and redistribute it 9 | freely, subject to the following restrictions: 10 | 11 | 1. The origin of this software must not be misrepresented; you must not 12 | claim that you wrote the original software. If you use this software 13 | in a product, an acknowledgment in the product documentation would be 14 | appreciated but is not required. 15 | 2. Altered source versions must be plainly marked as such, and must not be 16 | misrepresented as being the original software. 17 | 3. 
# This notice may not be removed or altered from any source distribution.
#
# --------------------------------------------------------------------------------
# /vma.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
"""Extract configs and disk images from a Proxmox VMA backup archive.

Usage::

    ./vma.py path/to/source.vma path/to/target/directory

All multi-byte integers in the VMA container are big endian, except for the
two-byte blob sizes inside the blob buffer, which are little endian.
"""
import os
import sys
import hashlib
import struct
import argparse
import contextlib


# Size of one data block inside a cluster; a cluster holds 16 of them.
_BLOCK_SIZE = 4096
# One block of zeros, reused for sparse blocks and for padding.
_ZERO_BLOCK = b'\0' * _BLOCK_SIZE


class VmaFormatError(ValueError):
    """Raised when the input is not a well-formed VMA archive."""


def _read_exact(fo, size):
    """Read exactly *size* bytes from *fo* or raise VmaFormatError.

    A short read means the archive is truncated; continuing would silently
    misalign everything that follows.
    """
    data = fo.read(size)
    if len(data) != size:
        raise VmaFormatError(
            f'unexpected end of file: wanted {size} bytes, got {len(data)}')
    return data


def _read_uint(fo, size):
    """Read a big-endian unsigned integer that is *size* bytes wide."""
    return int.from_bytes(_read_exact(fo, size), 'big')


def _safe_join(destination, name):
    """Join *name* onto *destination*, refusing names that could escape it.

    File names come from the (untrusted) archive, so anything that is not a
    plain file name (empty, '.', '..', or containing a path separator) is
    rejected with VmaFormatError.
    """
    if not name or name in ('.', '..') or os.path.basename(name) != name:
        raise VmaFormatError(f'refusing unsafe file name in archive: {name!r}')
    return os.path.join(destination, name)


class VmaHeader():
    """The VMA file header, parsed from the start of the archive.

    Attributes set by the constructor: ``uuid``, ``ctime``, ``md5sum``,
    ``blob_buffer_offset``, ``blob_buffer_size``, ``header_size``,
    ``config_names``, ``config_data``, ``dev_info``, ``blob_buffer`` (dict
    mapping byte offsets inside the blob buffer to Blob objects) and
    ``generated_md5sum`` (None when hashing is skipped).
    """

    def __init__(self, fo, skip_hash):
        """Parse the header from *fo*, which must be positioned at offset 0.

        On return *fo* points at the end of the header (``header_size``).
        Raises VmaFormatError on a bad magic, bad version or truncated file.
        """
        # 0 - 3: magic
        #     VMA magic string ("VMA\x00")
        magic = _read_exact(fo, 4)
        if magic != b'VMA\0':
            raise VmaFormatError('not a VMA file (bad magic)')

        # 4 - 7: version
        #     Version number (valid value is 1)
        version = _read_uint(fo, 4)
        if version != 1:
            raise VmaFormatError(f'unsupported VMA version {version}')

        # 8 - 23: uuid
        #     Unique ID, Same uuid is used to mark extents.
        self.uuid = _read_exact(fo, 16)

        # 24 - 31: ctime
        #     Backup time stamp (seconds since epoch)
        self.ctime = _read_uint(fo, 8)

        # 32 - 47: md5sum
        #     Header checksum (from byte 0 to header_size). This field
        #     is filled with zero to generate the checksum.
        self.md5sum = _read_exact(fo, 16)

        # 48 - 51: blob_buffer_offset
        #     Start of blob buffer (multiple of 512)
        self.blob_buffer_offset = _read_uint(fo, 4)

        # 52 - 55: blob_buffer_size
        #     Size of blob buffer (multiple of 512)
        self.blob_buffer_size = _read_uint(fo, 4)

        # 56 - 59: header_size
        #     Overall size of this header (multiple of 512)
        self.header_size = _read_uint(fo, 4)

        # 60 - 2043: reserved
        fo.seek(1984, os.SEEK_CUR)

        # 2044 - 3067: uint32_t config_names[256]
        #     Offsets into blob_buffer table
        self.config_names = [_read_uint(fo, 4) for _ in range(256)]

        # 3068 - 4091: uint32_t config_data[256]
        #     Offsets into blob_buffer table
        self.config_data = [_read_uint(fo, 4) for _ in range(256)]

        # 4092 - 4095: reserved
        fo.seek(4, os.SEEK_CUR)

        # 4096 - 12287: VmaDeviceInfoHeader dev_info[256]
        #     The offset in this table is used as 'dev_id' inside
        #     the data streams.
        self.dev_info = [VmaDeviceInfoHeader(fo, self) for _ in range(256)]

        # 12288 - header_size: Blob buffer

        # the blob buffer layout is very odd. there appears to be an
        # additional byte of padding at the beginning, so the first blob
        # lives at offset 1. Seek there explicitly instead of relying on the
        # current position happening to equal blob_buffer_offset.
        fo.seek(self.blob_buffer_offset + 1, os.SEEK_SET)
        # since byte-wise offsets are used to address the blob buffer, the
        # blob metadata is stored in a hashmap, with the offsets as the keys
        self.blob_buffer = {}
        blob_buffer_end = self.blob_buffer_offset + self.blob_buffer_size
        blob_buffer_current_offset = 1
        while fo.tell() < blob_buffer_end:
            position_before = fo.tell()
            self.blob_buffer[blob_buffer_current_offset] = Blob(fo)
            if fo.tell() == position_before:
                # nothing could be read: the declared blob buffer runs past
                # the end of the file and the loop would never terminate
                raise VmaFormatError('blob buffer extends past end of file')
            blob_buffer_current_offset = fo.tell() - self.blob_buffer_offset

        # make sure the file object points at the end of the vma header
        fo.seek(self.header_size, os.SEEK_SET)

        # reread the header and generate a md5 checksum of the data
        if skip_hash:
            self.generated_md5sum = None
        else:
            self.generated_md5sum = self.__gen_md5sum(fo)

    def __gen_md5sum(self, fo):
        """Return the md5 digest of the header with its md5 field zeroed.

        The read position of *fo* is restored before returning.
        """
        p = fo.tell()
        fo.seek(0, os.SEEK_SET)
        h = hashlib.md5()

        data = fo.read(self.header_size)
        # bytes 32 - 47 hold the stored checksum and count as zeros
        data = data[:32] + b'\0' * 16 + data[48:]
        h.update(data)

        fo.seek(p, os.SEEK_SET)
        return h.digest()


class VmaDeviceInfoHeader():
    """One 32-byte entry of the ``dev_info`` table in the VMA header."""

    def __init__(self, fo, vma_header):
        """Parse one entry from *fo*; *vma_header* is kept for name lookup."""
        self.__vma_header = vma_header

        # 0 - 3: device name (offsets into blob_buffer table)
        self.device_name = _read_uint(fo, 4)

        # 4 - 7: reserved
        fo.seek(4, os.SEEK_CUR)

        # 8 - 15: device size in bytes
        self.device_size = _read_uint(fo, 8)

        # 16 - 31: reserved
        fo.seek(16, os.SEEK_CUR)

    def get_name(self):
        """Return the device name, read from the header's blob buffer.

        The blob is interpreted as a null-terminated utf-8 string.
        """
        name = self.__vma_header.blob_buffer[self.device_name].data
        return name.split(b'\0')[0].decode('utf-8')


class VmaExtentHeader():
    """A 512-byte extent header that precedes a run of cluster data."""

    def __init__(self, fo, vma_header, skip_hash):
        """Parse an extent header from the current position of *fo*.

        Raises VmaFormatError on a bad magic or truncated file.
        """
        self.pos_start = fo.tell()

        # 0 - 3: magic
        #     VMA extent magic string ("VMAE")
        magic = _read_exact(fo, 4)
        if magic != b'VMAE':
            raise VmaFormatError(
                f'bad extent magic at offset {self.pos_start}')

        # 4 - 5: reserved
        fo.seek(2, os.SEEK_CUR)

        # 6 - 7: block_count
        #     Overall number of contained 4K block
        self.block_count = _read_uint(fo, 2)

        # 8 - 23: uuid
        #     Unique ID, Same uuid as used in the VMA header.
        self.uuid = _read_exact(fo, 16)

        # 24 - 39: md5sum
        #     Header checksum (from byte 0 to header_size). This field
        #     is filled with zero to generate the checksum.
        self.md5sum = _read_exact(fo, 16)

        # 40 - 511: blockinfo[59]
        self.blockinfo = [Blockinfo(fo, vma_header) for _ in range(59)]

        self.pos_end = fo.tell()

        if skip_hash:
            self.generated_md5sum = None
        else:
            self.generated_md5sum = self.__gen_md5sum(fo)

    def __gen_md5sum(self, fo):
        """Return the md5 digest of this extent header, md5 field zeroed.

        The read position of *fo* is restored before returning.
        """
        p = fo.tell()
        fo.seek(self.pos_start, os.SEEK_SET)
        h = hashlib.md5()

        data = fo.read(self.pos_end - self.pos_start)
        # bytes 24 - 39 hold the stored checksum and count as zeros
        data = data[:24] + b'\0' * 16 + data[40:]
        h.update(data)

        fo.seek(p, os.SEEK_SET)
        return h.digest()


class Blob():
    """A length-prefixed entry of the blob buffer (``size`` and ``data``)."""

    def __init__(self, fo):
        """Read one blob from the current position of *fo*.

        Reads are deliberately tolerant of short data: the final entry of the
        buffer may straddle its end.
        """
        # the size of a blob is a two-byte int in LITTLE endian
        # source: original c code of vma-reader
        #     uint32_t size = vmar->head_data[bstart] +
        #         (vmar->head_data[bstart+1] << 8);
        self.size = int.from_bytes(fo.read(2), 'little')
        self.data = fo.read(self.size)


class Blockinfo():
    """One 8-byte blockinfo entry of an extent header.

    ``mask`` has one bit per 4K block of the cluster (bit set: the block is
    stored in the archive; bit clear: the block is all zeros), ``dev_id``
    indexes the header's dev_info table and ``cluster_num`` is the cluster's
    index within that device.
    """

    CLUSTER_SIZE = 65536

    def __init__(self, fo, vma_header):
        """Parse one blockinfo entry from the current position of *fo*."""
        self.__vma_header = vma_header

        # 0 - 1: mask
        self.mask = _read_uint(fo, 2)

        # 2: reserved
        fo.seek(1, os.SEEK_CUR)

        # 3: dev_id
        #     Device ID (offset into dev_info table)
        self.dev_id = _read_uint(fo, 1)

        # 4 - 7: cluster_num
        self.cluster_num = _read_uint(fo, 4)


def extract_configs(fo, args, vma_header):
    """
    Write every config stored in the VMA header into args.destination.

    Configs in VMA are composed of two blobs. One specifies the config's
    filename and the other contains the config's content.
    The filename seems to be a null-terminated string, while the content is not
    terminated.

    Raises VmaFormatError if a config name is not a plain file name.
    """

    if args.verbose:
        print('extracting configs...')

    for name_offset, data_offset in zip(vma_header.config_names,
                                        vma_header.config_data):
        # offset 0 marks an unused slot
        if name_offset == 0:
            continue
        config_name = vma_header.blob_buffer[name_offset].data
        # interpret filename as a null-terminated utf-8 string
        config_name = config_name.split(b'\0')[0].decode('utf-8')

        if args.verbose:
            print(f'{config_name}...', end='')

        config_data = vma_header.blob_buffer[data_offset].data

        target = _safe_join(args.destination, config_name)
        with open(target, 'wb') as config_fo:
            config_fo.write(config_data)

        if args.verbose:
            print(' OK')


def _seek_to_cluster(device_fo, cluster_num, cluster_num_prev, verbose):
    """Position *device_fo* at the start of a non-sequential cluster.

    When the cluster lies beyond what has been written so far, the gap is
    filled with zeros first.
    """
    if verbose:
        print('non sequential cluster encountered...')

    cluster_pos = cluster_num * Blockinfo.CLUSTER_SIZE
    if cluster_num > cluster_num_prev:
        # special case: cluster num is larger than current,
        # seek forward into file AND, if needed, fill missing size
        # with zeros
        device_fo.seek(0, os.SEEK_END)
        written_size = device_fo.tell()

        if written_size < cluster_pos:
            # add padding for missing clusters
            padding = cluster_pos - written_size
            if verbose:
                print(f'{cluster_num}')
                print(f'adding {padding} bytes of padding...')

            # write padding in chunks of 4096 bytes to avoid
            # memory errors
            while padding > 0:
                chunk = min(padding, _BLOCK_SIZE)
                device_fo.write(_ZERO_BLOCK[:chunk])
                padding -= chunk

    # seek to start of new cluster
    device_fo.seek(cluster_pos, os.SEEK_SET)


def extract(fo, args):
    """Extract all configs and device images from the VMA archive *fo*.

    *fo* is a seekable binary file object. *args* provides ``destination``
    (output directory, created if missing), ``verbose`` and ``skip_hash``.

    Raises VmaFormatError when the archive is malformed, truncated, or fails
    a checksum test.
    """
    os.makedirs(args.destination, exist_ok=True)

    fo.seek(0, os.SEEK_END)
    filesize = fo.tell()
    fo.seek(0, os.SEEK_SET)

    vma_header = VmaHeader(fo, args.skip_hash)

    # check the md5 checksum given in the header with the value calculated from
    # the file
    if (vma_header.generated_md5sum is not None
            and vma_header.md5sum != vma_header.generated_md5sum):
        raise VmaFormatError('VMA header md5 checksum mismatch')

    extract_configs(fo, args, vma_header)

    # extract_configs may move the read head somewhere into the blob buffer
    # make sure we are back at the end of the header
    fo.seek(vma_header.header_size, os.SEEK_SET)

    if args.verbose:
        print('extracting devices...')

    # the ExitStack closes every device file, even if extraction fails midway
    with contextlib.ExitStack() as stack:
        # open file handlers for all devices within the VMA
        # so we can easily append data to arbitrary devices
        device_fos = {}
        for dev_id, dev_info in enumerate(vma_header.dev_info):
            if dev_info.device_size > 0:
                device_name = dev_info.get_name()
                if args.verbose:
                    print(device_name)
                device_path = _safe_join(args.destination, device_name)
                device_fos[dev_id] = stack.enter_context(
                    open(device_path, 'wb'))

        if args.verbose:
            print('this may take a while...')

        # last cluster number written, tracked PER DEVICE: clusters of
        # different devices are interleaved in the stream, so one shared
        # counter would misjudge whether a cluster is sequential
        cluster_num_prev = {}

        while fo.tell() < filesize:
            # when there is data to read at this point, we can safely expect a
            # full extent header with additional clusters
            extent_header = VmaExtentHeader(fo, vma_header, args.skip_hash)
            if vma_header.uuid != extent_header.uuid:
                raise VmaFormatError('extent uuid does not match VMA header')

            # check the md5 checksum given in the header with the value
            # calculated from the file
            if (extent_header.generated_md5sum is not None
                    and extent_header.md5sum
                    != extent_header.generated_md5sum):
                raise VmaFormatError('extent header md5 checksum mismatch')

            for blockinfo in extent_header.blockinfo:
                # dev_id 0 marks an unused blockinfo slot
                if blockinfo.dev_id == 0:
                    continue

                device_fo = device_fos.get(blockinfo.dev_id)
                if device_fo is None:
                    raise VmaFormatError(
                        f'extent references unknown device {blockinfo.dev_id}')

                previous = cluster_num_prev.get(blockinfo.dev_id, -1)

                # non-sequential clusters encountered, handle this case
                if blockinfo.cluster_num != previous + 1:
                    _seek_to_cluster(device_fo, blockinfo.cluster_num,
                                     previous, args.verbose)

                cluster_num_prev[blockinfo.dev_id] = blockinfo.cluster_num

                for i in range(16):
                    # a 2-bytes wide bitmask indicates 4k blocks with only
                    # zeros (bit clear); only set bits have data in the file
                    if (1 << i) & blockinfo.mask:
                        device_fo.write(_read_exact(fo, _BLOCK_SIZE))
                    else:
                        device_fo.write(_ZERO_BLOCK)

        if args.verbose:
            print('closing file handles...')

    if args.verbose:
        print('done')


def main():
    """Command line entry point; returns the process exit code."""
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', type=str)
    parser.add_argument('destination', type=str)
    parser.add_argument('-v', '--verbose', default=False, action='store_true')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='overwrite target file if it exists')
    parser.add_argument('--skip-hash', default=False, action='store_true',
                        help='do not perform md5 checksum test of data')
    args = parser.parse_args()

    if not os.path.exists(args.filename):
        print('Error! Source file does not exist!')
        return 1

    if os.path.exists(args.destination) and not args.force:
        print('Error! Destination path exists!')
        return 1

    try:
        with open(args.filename, 'rb') as fo:
            extract(fo, args)
    except VmaFormatError as err:
        print(f'Error! {err}')
        return 1

    return 0


if __name__ == '__main__':
    sys.exit(main())