├── README.md
├── LICENSE.md
└── vma.py


/README.md:
--------------------------------------------------------------------------------
 1 | VMA extractor
 2 | =============
 3 | 
 4 | `vma.py` implements an extraction tool for the VMA backup format used by
 5 | [Proxmox](https://www.proxmox.com). The tool is implemented in Python3.
 6 | 
 7 | Usage:
 8 | ```sh
 9 | ./vma.py path/to/source.vma path/to/target/directory
10 | ```
11 | 
12 | I think it is pretty important to be able to read Proxmox backups outside of a
13 | Proxmox environment. Yet, porting their VMA implementation to a standalone
14 | tool proved difficult. VMA-Reader and VMA-Writer are implemented as patches to
15 | the Proxmox-patched version of Qemu and are thus very difficult to compile on
16 | non-Proxmox systems.
17 | 
18 | The format specification can be found on [git.proxmox.com](https://git.proxmox.com/?p=pve-qemu.git;a=blob_plain;f=vma_spec.txt;hb=refs/heads/master).
19 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2019-2020 Jan Wolff
 2 | 
 3 | This software is provided 'as-is', without any express or implied
 4 | warranty. In no event will the authors be held liable for any damages
 5 | arising from the use of this software.
 6 | 
 7 | Permission is granted to anyone to use this software for any purpose,
 8 | including commercial applications, and to alter it and redistribute it
 9 | freely, subject to the following restrictions:
10 | 
11 | 1. The origin of this software must not be misrepresented; you must not
12 |    claim that you wrote the original software. If you use this software
13 |    in a product, an acknowledgment in the product documentation would be
14 |    appreciated but is not required.
15 | 2. Altered source versions must be plainly marked as such, and must not be
16 |    misrepresented as being the original software.
17 | 3. This notice may not be removed or altered from any source distribution.
18 | 
19 | 


--------------------------------------------------------------------------------
/vma.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import os
  3 | import sys
  4 | import hashlib
  5 | import struct
  6 | import argparse
  7 | 
  8 | 
  9 | class VmaHeader():
 10 |     def __init__(self, fo, skip_hash):
 11 |         # 0 -  3:   magic
 12 |         #     VMA magic string ("VMA\x00")
 13 |         magic = fo.read(4)
 14 |         assert magic == b'VMA\0'
 15 | 
 16 |         # 4 -  7:   version
 17 |         #     Version number (valid value is 1)
 18 |         version = int.from_bytes(fo.read(4), 'big')
 19 |         assert version == 1
 20 | 
 21 |         # 8 - 23:   uuid
 22 |         #     Unique ID, Same uuid is used to mark extents.
 23 |         self.uuid = fo.read(16)
 24 | 
 25 |         # 24 - 31:   ctime
 26 |         #     Backup time stamp (seconds since epoch)
 27 |         self.ctime = int.from_bytes(fo.read(8), 'big')
 28 | 
 29 |         # 32 - 47:   md5sum
 30 |         #     Header checksum (from byte 0 to header_size). This field
 31 |         #     is filled with zero to generate the checksum.
 32 |         self.md5sum = fo.read(16)
 33 | 
 34 |         # 48 - 51:   blob_buffer_offset
 35 |         #     Start of blob buffer (multiple of 512)
 36 |         self.blob_buffer_offset = int.from_bytes(fo.read(4), 'big')
 37 | 
 38 |         # 52 - 55:   blob_buffer_size
 39 |         #     Size of blob buffer (multiple of 512)
 40 |         self.blob_buffer_size = int.from_bytes(fo.read(4), 'big')
 41 | 
 42 |         # 56 - 59:   header_size
 43 |         #     Overall size of this header (multiple of 512)
 44 |         self.header_size = int.from_bytes(fo.read(4), 'big')
 45 | 
 46 |         # 60 - 2043: reserved
 47 |         fo.seek(1984, os.SEEK_CUR)
 48 | 
 49 |         # 2044 - 3067: uint32_t config_names[256]
 50 |         #     Offsets into blob_buffer table
 51 |         self.config_names = []
 52 |         for i in range(256):
 53 |             self.config_names.append(int.from_bytes(fo.read(4), 'big'))
 54 | 
 55 |         # 3068 - 4091: uint32_t config_data[256]
 56 |         #     Offsets into blob_buffer table
 57 |         self.config_data = []
 58 |         for i in range(256):
 59 |             self.config_data.append(int.from_bytes(fo.read(4), 'big'))
 60 | 
 61 |         # 4092 - 4095: reserved
 62 |         fo.seek(4, os.SEEK_CUR)
 63 | 
 64 |         # 4096 - 12287: VmaDeviceInfoHeader dev_info[256]
 65 |         #     The offset in this table is used as 'dev_id' inside
 66 |         #     the data streams.
 67 |         self.dev_info = []
 68 |         for i in range(256):
 69 |             self.dev_info.append(VmaDeviceInfoHeader(fo, self))
 70 | 
 71 |         # 12288 - header_size: Blob buffer
 72 | 
 73 |         # the blob buffer layout is very odd. there appears to be an additional
 74 |         # byte of padding at the beginning
 75 |         fo.seek(1, os.SEEK_CUR)
 76 |         # since byte-wise offsets are used to address the blob buffer, the
 77 |         # blob metadata is stored in a hashmap, with the offsets as the keys
 78 |         self.blob_buffer = {}
 79 |         blob_buffer_current_offset = 1
 80 |         while(fo.tell() < self.blob_buffer_offset + self.blob_buffer_size):
 81 |             self.blob_buffer[blob_buffer_current_offset] = Blob(fo)
 82 |             blob_buffer_current_offset = fo.tell() - self.blob_buffer_offset
 83 | 
 84 |         # make sure the file object points at the end of the vma header
 85 |         fo.seek(self.header_size, os.SEEK_SET)
 86 | 
 87 |         # reread the header and generate a md5 checksum of the data
 88 |         if skip_hash:
 89 |             self.generated_md5sum = None
 90 |         else:
 91 |             self.generated_md5sum = self.__gen_md5sum(fo)
 92 | 
 93 | 
 94 |     def __gen_md5sum(self, fo):
 95 |         p = fo.tell()
 96 |         fo.seek(0, os.SEEK_SET)
 97 |         h = hashlib.md5()
 98 | 
 99 |         data = fo.read(self.header_size)
100 |         data = data[:32] + b'\0' * 16 + data[48:]
101 |         h.update(data)
102 | 
103 |         fo.seek(p, os.SEEK_SET)
104 |         return h.digest()
105 | 
106 | 
class VmaDeviceInfoHeader():
    """
    One 32-byte entry of the dev_info table inside the VMA header.

    Attributes:
        device_name: offset of the device name inside the blob buffer.
        device_size: device size in bytes.
    """

    def __init__(self, fo, vma_header):
        def read_uint(width):
            # all integers of this record are stored big endian
            return int.from_bytes(fo.read(width), 'big')

        # kept so get_name() can look the name up in the blob buffer later
        self.__vma_header = vma_header

        # 0 -  3:   device name (offsets into blob_buffer table)
        self.device_name = read_uint(4)

        # 4 -  7:   reserved
        fo.seek(4, os.SEEK_CUR)

        # 8 - 15:   device size in bytes
        self.device_size = read_uint(8)

        # 16 - 31:   reserved
        fo.seek(16, os.SEEK_CUR)


    def get_name(self):
        """
        Look the device name up in the header's blob buffer and return it as
        a str. Everything from the first null byte on is dropped.
        """
        blob = self.__vma_header.blob_buffer[self.device_name]
        terminated_name, _, _ = blob.data.partition(b'\0')
        return terminated_name.decode('utf-8')
127 | 
128 | 
class VmaExtentHeader():
    """
    Parsed representation of one VMA extent header.

    The constructor reads the header at the current position of `fo` and
    leaves `fo` positioned directly behind it.

    Attributes:
        pos_start: file offset at which the header starts.
        pos_end: file offset directly behind the header.
        block_count: overall number of contained 4K blocks.
        uuid: 16 raw bytes, same uuid as used in the VMA header.
        md5sum: 16-byte header checksum as stored in the file.
        blockinfo: list of 59 Blockinfo entries.
        generated_md5sum: checksum computed from the file, or None when
            `skip_hash` is true.

    Raises:
        ValueError: if the extent magic string is wrong.
    """

    def __init__(self, fo, vma_header, skip_hash):
        self.pos_start = fo.tell()

        # 0 -  3:   magic
        #     VMA extent magic string ("VMAE")
        # explicit raise instead of assert: asserts vanish under `python -O`
        magic = fo.read(4)
        if magic != b'VMAE':
            raise ValueError(f'bad extent magic {magic!r} at offset '
                             f'{self.pos_start}')

        # 4 -  5:   reserved
        fo.seek(2, os.SEEK_CUR)

        # 6 -  7:   block_count
        #     Overall number of contained 4K block
        self.block_count = int.from_bytes(fo.read(2), 'big')

        # 8 - 23:   uuid
        #     Unique ID, Same uuid as used in the VMA header.
        self.uuid = fo.read(16)

        # 24 - 39:   md5sum
        #     Header checksum (from byte 0 to header_size). This field
        #     is filled with zero to generate the checksum.
        self.md5sum = fo.read(16)

        # 40 - 511:   blockinfo[59]
        self.blockinfo = [Blockinfo(fo, vma_header) for _ in range(59)]

        self.pos_end = fo.tell()

        if skip_hash:
            self.generated_md5sum = None
        else:
            self.generated_md5sum = self.__gen_md5sum(fo)


    def __gen_md5sum(self, fo):
        """
        Return the md5 digest of the bytes between `pos_start` and `pos_end`,
        with the stored checksum field (bytes 24 - 39 of the extent header)
        replaced by zeros. The position of `fo` is restored before returning.
        """
        p = fo.tell()
        fo.seek(self.pos_start, os.SEEK_SET)
        h = hashlib.md5()

        data = fo.read(self.pos_end - self.pos_start)
        # the md5sum field is zero-filled while the checksum is generated
        data = data[:24] + b'\0' * 16 + data[40:]
        h.update(data)

        fo.seek(p, os.SEEK_SET)
        return h.digest()
178 | 
179 | 
class Blob():
    """
    A single length-prefixed entry of the blob buffer.

    Attributes:
        size: payload length as given by the two-byte prefix.
        data: the bytes read for the payload.
    """

    def __init__(self, fo):
        # the length prefix is a two-byte int in LITTLE endian, unlike the
        # other header fields. the original c code of vma-reader does:
        #    uint32_t size = vmar->head_data[bstart] +
        #        (vmar->head_data[bstart+1] << 8);
        length_prefix = fo.read(2)
        self.size = int.from_bytes(length_prefix, byteorder='little')

        # the payload follows the prefix directly
        self.data = fo.read(self.size)
188 | 
189 | 
class Blockinfo():
    """
    One 8-byte blockinfo entry of an extent header.

    Attributes:
        mask: two-byte bitmask, one bit per 4K block of the cluster.
        dev_id: device ID (offset into the dev_info table).
        cluster_num: number of the cluster this entry describes.
    """

    # size of one cluster in bytes (64 KiB)
    CLUSTER_SIZE = 64 * 1024

    def __init__(self, fo, vma_header):
        def read_uint(width):
            # all integers of this record are stored big endian
            return int.from_bytes(fo.read(width), 'big')

        self.__vma_header = vma_header

        # 0 - 1:   mask
        self.mask = read_uint(2)

        # 2:   reserved
        fo.seek(1, os.SEEK_CUR)

        # 3:   dev_id
        #    Device ID (offset into dev_info table)
        self.dev_id = read_uint(1)

        # 4 - 7:   cluster_num
        self.cluster_num = read_uint(4)
208 | 
209 | 
def extract_configs(fo, args, vma_header):
    """
    Write every config stored in the VMA header into args.destination.

    Configs in VMA are composed of two blobs. One specifies the config's
    filename and the other contains the config's content.
    The filename seems to be a null-terminated string, while the content is not
    terminated.

    Args:
        fo: the VMA file object (not used here; kept for the call signature).
        args: needs the attributes `verbose` and `destination`.
        vma_header: needs `config_names`, `config_data` and `blob_buffer`.

    Raises:
        ValueError: if a config name stored in the archive is not a plain
            file name (empty, '.', '..', or containing a directory part).
    """

    if args.verbose: print('extracting configs...')

    slots = zip(vma_header.config_names, vma_header.config_data)
    for name_offset, data_offset in slots:
        # offset 0 marks an unused config slot
        if name_offset == 0: continue

        config_name = vma_header.blob_buffer[name_offset].data
        # interpret filename as a null-terminated utf-8 string
        config_name = config_name.split(b'\0')[0].decode('utf-8')

        # the name comes from the archive and is not trustworthy: something
        # like '../x' or an absolute path would make os.path.join() point
        # outside of the destination directory
        if (config_name in ('', '.', '..')
                or os.path.basename(config_name) != config_name):
            raise ValueError('refusing to extract config with unsafe name '
                             f'{config_name!r}')

        if args.verbose: print(f'{config_name}...', end='')

        config_data = vma_header.blob_buffer[data_offset].data

        with open(os.path.join(args.destination, config_name), 'wb') as config_fo:
            config_fo.write(config_data)

        if args.verbose: print(' OK')
234 | 
235 | 
def _safe_device_path(destination, name):
    """
    Return the path of device image `name` inside `destination`. Raises
    ValueError when `name` is not a plain file name.
    """
    # the name comes from the archive and is not trustworthy: something like
    # '../x' or an absolute path would escape the destination directory
    if name in ('', '.', '..') or os.path.basename(name) != name:
        raise ValueError(f'refusing to extract device with unsafe name {name!r}')
    return os.path.join(destination, name)


def extract(fo, args):
    """
    Extract all configs and device images of the VMA archive `fo` into the
    directory args.destination (created if missing).

    Args:
        fo: seekable binary file object of the VMA archive.
        args: needs the attributes `destination`, `skip_hash` and `verbose`.

    Raises:
        ValueError: on a checksum or uuid mismatch, on an unsafe device name,
            or when a block refers to a device without an output file.
    """
    os.makedirs(args.destination, exist_ok=True)

    fo.seek(0, os.SEEK_END)
    filesize = fo.tell()
    fo.seek(0, os.SEEK_SET)

    vma_header = VmaHeader(fo, args.skip_hash)

    # check the md5 checksum given in the header with the value calculated from
    # the file. explicit raise instead of assert: asserts vanish under
    # `python -O`, which would silently disable the verification
    if (vma_header.generated_md5sum is not None
            and vma_header.md5sum != vma_header.generated_md5sum):
        raise ValueError('VMA header md5 checksum mismatch')

    extract_configs(fo, args, vma_header)

    # extract_configs may move the read head somewhere into the blob buffer
    # make sure we are back at the end of the header
    fo.seek(vma_header.header_size, os.SEEK_SET)

    if args.verbose: print('extracting devices...')

    zero_block = b'\0' * 4096

    # open file handlers for all devices within the VMA
    # so we can easily append data to arbitrary devices
    device_fos = {}
    try:
        for dev_id, dev_info in enumerate(vma_header.dev_info):
            if dev_info.device_size > 0:
                device_name = dev_info.get_name()
                if args.verbose: print(device_name)
                device_fos[dev_id] = open(
                    _safe_device_path(args.destination, device_name), 'wb')

        if args.verbose: print('this may take a while...')

        # last cluster written, tracked per device: clusters of different
        # devices may be interleaved, and only the device's own previous
        # cluster tells us where its write head currently is
        cluster_num_prev = {}

        while fo.tell() < filesize:
            # when there is data to read at this point, we can safely expect a
            # full extent header with additional clusters
            extent_header = VmaExtentHeader(fo, vma_header, args.skip_hash)
            if vma_header.uuid != extent_header.uuid:
                raise ValueError('extent at offset '
                                 f'{extent_header.pos_start} belongs to a '
                                 'different archive (uuid mismatch)')

            # check the md5 checksum given in the header with the value
            # calculated from the file
            if (extent_header.generated_md5sum is not None
                    and extent_header.md5sum != extent_header.generated_md5sum):
                raise ValueError('md5 checksum mismatch in extent at offset '
                                 f'{extent_header.pos_start}')

            for blockinfo in extent_header.blockinfo:
                # dev_id 0 marks an unused blockinfo entry
                if blockinfo.dev_id == 0: continue

                try:
                    device_fo = device_fos[blockinfo.dev_id]
                except KeyError:
                    raise ValueError('block refers to device '
                                     f'{blockinfo.dev_id}, which has no '
                                     'output file') from None

                prev = cluster_num_prev.get(blockinfo.dev_id, -1)

                # non-sequential clusters encountered, handle this case
                if blockinfo.cluster_num != prev + 1:
                    if args.verbose: print('non sequential cluster encountered...')

                    cluster_pos = blockinfo.cluster_num * Blockinfo.CLUSTER_SIZE
                    if blockinfo.cluster_num > prev:
                        # special case: cluster num is larger than current,
                        # seek forward into file AND, if needed, fill missing
                        # size with zeros
                        device_fo.seek(0, os.SEEK_END)
                        written_size = device_fo.tell()

                        if written_size < cluster_pos:
                            # add padding for missing clusters
                            padding = cluster_pos - written_size
                            if args.verbose:
                                print(f'{blockinfo.cluster_num}')
                                print(f'adding {padding} bytes of padding...')

                            # write padding in chucks of 4096 bytes to avoid
                            # memory errors
                            while padding > 0:
                                device_fo.write(zero_block[:min(padding, 4096)])
                                padding -= 4096

                    # seek to start of new cluster
                    device_fo.seek(cluster_pos, os.SEEK_SET)

                cluster_num_prev[blockinfo.dev_id] = blockinfo.cluster_num

                for i in range(16):
                    # a 2-bytes wide bitmask indicates 4k blocks with only
                    # zeros: a set bit means the block is stored in the
                    # archive, a cleared bit means it is all zeros
                    if (1 << i) & blockinfo.mask:
                        device_fo.write(fo.read(4096))
                    else:
                        device_fo.write(zero_block)
    finally:
        # close the outputs even when extraction fails half way through
        if args.verbose: print('closing file handles...')
        for device_fo in device_fos.values():
            device_fo.close()

    if args.verbose: print('done')
330 | 
331 | 
def main():
    """
    Command line entry point: parse the arguments, validate the source and
    destination paths and run the extraction.

    Returns 0 on success and 1 when the source file is missing or the
    destination already exists without --force.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('filename', type=str)
    cli.add_argument('destination', type=str)
    cli.add_argument('-v', '--verbose', default=False, action='store_true')
    cli.add_argument(
        '-f', '--force',
        default=False,
        action='store_true',
        help='overwrite target file if it exists',
    )
    cli.add_argument(
        '--skip-hash',
        default=False,
        action='store_true',
        help='do not perform md5 checksum test of data',
    )
    args = cli.parse_args()

    # guard clauses: bail out before the archive is opened
    if not os.path.exists(args.filename):
        print('Error! Source file does not exist!')
        return 1

    destination_taken = os.path.exists(args.destination)
    if destination_taken and not args.force:
        print('Error! Destination path exists!')
        return 1

    with open(args.filename, 'rb') as source_fo:
        extract(source_fo, args)

    return 0
355 | 
# run main() only when executed as a script; its return value becomes the
# exit status of the process
if __name__ == '__main__':
    raise SystemExit(main())
358 | 


--------------------------------------------------------------------------------