├── .gitignore
├── access_parser
│   ├── __init__.py
│   ├── utils.py
│   ├── parsing_primitives.py
│   └── access_parser.py
├── examples
│   ├── test.mdb
│   └── parse_db.py
├── setup.py
├── README.md
└── LICENSE

/.gitignore:
--------------------------------------------------------------------------------
1 | .venv/
2 | access_parser.egg-info/
3 | build/
4 | **/__pycache__/
5 | 
--------------------------------------------------------------------------------
/access_parser/__init__.py:
--------------------------------------------------------------------------------
1 | from access_parser.access_parser import AccessParser
2 | 
--------------------------------------------------------------------------------
/examples/test.mdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claroty/access_parser/HEAD/examples/test.mdb
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | 
3 | with open("README.md", "r") as f:
4 |     long_description = f.read()
5 | 
6 | setuptools.setup(
7 |     name="access_parser",
8 |     version="0.0.6",
9 |     author="Uri Katz",
10 |     author_email="uri.k@claroty.com",
11 |     description="Access database (*.mdb, *.accdb) parser",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/ClarotyICS/access_parser",
15 |     packages=setuptools.find_packages(),
16 |     classifiers=[
17 |         "Programming Language :: Python :: 3",
18 |         "License :: OSI Approved :: Apache Software License",
19 |         "Operating System :: OS Independent",
20 |     ],
21 |     python_requires='>=3.6',
22 |     install_requires=[
23 |         'construct',
24 |         'tabulate',
25 |     ],
26 | )
27 | 
--------------------------------------------------------------------------------
/examples/parse_db.py:
--------------------------------------------------------------------------------
1 | from access_parser import AccessParser
2 | from tabulate import tabulate
3 | import argparse
4 | 
5 | 
6 | def print_tables(db_path, only_catalog=False, specific_table=None):
7 |     db = AccessParser(db_path)
8 |     if only_catalog:
9 |         for k in db.catalog.keys():
10 |             print(f"{k}\n")
11 |     elif specific_table:
12 |         table = db.parse_table(specific_table)
13 |         print(f'TABLE NAME: {specific_table}\r\n')
14 |         print(tabulate(table, headers="keys", disable_numparse=True))
15 |         print("\n\n\n\n")
16 |     else:
17 |         db.print_database()
18 | 
19 | 
20 | if __name__ == '__main__':
21 |     parser = argparse.ArgumentParser()
22 |     parser.add_argument("-c", "--catalog", required=False, help="Print DB table names", action="store_true")
23 |     parser.add_argument("-f", "--file", required=True, help="*.mdb / *.accdb File")
24 |     parser.add_argument("-t", "--table", required=False, help="Table to print", default=None)
25 | 
26 |     args = parser.parse_args()
27 |     print_tables(args.file, args.catalog, args.table)
28 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AccessDB Parser (Pure Python)
2 | A parser for Microsoft Access (.mdb / .accdb) database files. The parsing logic is written entirely in Python and works without any external binary dependencies.
3 | 
4 | # Installing
5 | Use pip: `pip install access-parser`
6 | 
7 | Or install manually:
8 | ```bash
9 | git clone https://github.com/ClarotyICS/access_parser.git
10 | cd access_parser
11 | python3 setup.py install
12 | ```
13 | 
14 | # Demo
15 | [![asciicast](https://asciinema.org/a/345445.svg)](https://asciinema.org/a/345445)
16 | 
17 | # Usage Example
18 | ```python
19 | from access_parser import AccessParser
20 | 
21 | # .mdb or .accdb file
22 | db = AccessParser("/path/to/mdb/file.mdb")
23 | 
24 | # Print DB tables
25 | print(db.catalog)
26 | 
27 | # Tables are stored as defaultdict(list) -- table[column][row_index]
28 | table = db.parse_table("table_name")
29 | 
30 | # Pretty print all tables
31 | db.print_database()
32 | 
33 | ```
34 | 
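Since `parse_table` returns a column-major `defaultdict(list)`, turning the result into rows is a common next step. A minimal sketch (the table name and output path are placeholders, not part of the library's API):

```python
import csv
from access_parser import AccessParser

db = AccessParser("/path/to/mdb/file.mdb")
table = db.parse_table("table_name")  # {column: [values...]}

# Re-shape the column-major mapping into row dicts
columns = list(table.keys())
row_count = max((len(v) for v in table.values()), default=0)
rows = [{c: table[c][i] for c in columns if i < len(table[c])} for i in range(row_count)]

with open("table_name.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=columns)
    writer.writeheader()
    writer.writerows(rows)
```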
35 | ### Known Issues
36 | 
37 | 
38 | This library was tested on a limited subset of database files. Due to the differences between database versions and the complexity of the parsing, we expect to find more parsing edge cases.
39 | 
40 | To help us resolve issues faster, please provide as much data as you can when opening an issue - the DB file if possible and a full trace including log messages.
41 | 
42 | 
43 | ### Thanks
44 | * This library was made possible by the great work of the mdbtools project. The logic in this library relies heavily on their excellent documentation: https://github.com/brianb/mdbtools
45 | * Huge thanks to Mashav Sapir for the help debugging, code reviewing and contributing to this project: https://github.com/mashavs
46 | 
--------------------------------------------------------------------------------
/access_parser/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import struct
4 | import uuid
5 | import math
6 | from datetime import datetime, timedelta
7 | 
8 | LOGGER = logging.getLogger("access_parser.utils")
9 | 
10 | 
11 | TYPE_BOOLEAN = 1
12 | TYPE_INT8 = 2
13 | TYPE_INT16 = 3
14 | TYPE_INT32 = 4
15 | TYPE_MONEY = 5
16 | TYPE_FLOAT32 = 6
17 | TYPE_FLOAT64 = 7
18 | TYPE_DATETIME = 8
19 | TYPE_BINARY = 9
20 | TYPE_TEXT = 10
21 | TYPE_OLE = 11
22 | TYPE_MEMO = 12
23 | TYPE_GUID = 15
24 | TYPE_96_bit_17_BYTES = 16
25 | TYPE_COMPLEX = 18
26 | 
27 | TABLE_PAGE_MAGIC = b"\x02\x01"
28 | DATA_PAGE_MAGIC = b"\x01\x01"
29 | 
30 | 
31 | ACCESS_EPOCH = datetime(1899, 12, 30)
32 | 
33 | PERCENT_DEFAULT = '0.00%'
34 | EURO_DEFAULT = '€0.00'
35 | DOLLAR_DEFAULT = '$0.00'
36 | GENERAL_NUMBER_DEFAULT = '0'
37 | FIXED_AND_STANDARD_DEFAULT = '0.00'
38 | SCIENTIFIC_DEFAULT = '0.00E+00'
39 | 
40 | FORMAT_PERCENT = "Percent"
41 | FORMAT_DOLLAR = "$"
42 | FORMAT_EURO = "€"
43 | FORMAT_GENERAL_NUMBER = "General Number"
44 | FORMAT_FIXED = "Fixed"
45 | FORMAT_STANDARD = "Standard"
46 | FORMAT_SCIENTIFIC = "Scientific"
47 | 
48 | FORMAT_TO_DEFAULT_VALUE = {
49 |     FORMAT_DOLLAR: DOLLAR_DEFAULT,
50 |     FORMAT_STANDARD: FIXED_AND_STANDARD_DEFAULT,
51 |     FORMAT_FIXED: FIXED_AND_STANDARD_DEFAULT,
52 |     FORMAT_PERCENT: PERCENT_DEFAULT,
53 |     FORMAT_EURO: EURO_DEFAULT,
54 |     FORMAT_GENERAL_NUMBER: GENERAL_NUMBER_DEFAULT,
55 |     FORMAT_SCIENTIFIC: SCIENTIFIC_DEFAULT
56 | }
57 | 
58 | 
59 | # https://stackoverflow.com/questions/45560782
60 | def mdb_date_to_readable(double_time):
61 |     try:
62 |         dtime_bytes = struct.pack("Q", double_time)
63 | 
64 |         dtime_double = struct.unpack('<d', dtime_bytes)[0]
65 |         dtime_frac, dtime_whole = math.modf(dtime_double)
66 |         # The integer part counts days from the Access epoch (1899-12-30),
67 |         # e.g. 44197.5 is 2021-01-01 12:00:00; the fraction encodes the time of day
68 |         dtime = ACCESS_EPOCH + timedelta(days=dtime_whole, seconds=round(abs(dtime_frac) * 86400))
69 |         parsed = dtime.strftime("%Y-%m-%d %H:%M:%S")
70 |     except (struct.error, OverflowError, ValueError):
71 |         LOGGER.warning(f"Failed to parse datetime from value {double_time}")
72 |         return str(double_time)
73 |     return parsed
74 | 
75 | 
76 | def numeric_to_string(buffer, scale):
77 |     # A 17-byte numeric field: the first byte carries the sign bit, the
78 |     # remaining 16 bytes hold the digits as an integer
79 |     neg = buffer[0] & 0x80
80 |     full_number = str(int.from_bytes(buffer[1:], "little"))
81 |     # Insert the decimal point "scale" digits from the right
82 |     if len(full_number) > scale:
83 |         dot_len = len(full_number) - scale
84 |         full_number = full_number[:dot_len] + "."
+ full_number[dot_len:] 85 | numeric_string = "-" if neg else "" 86 | numeric_string += full_number 87 | return numeric_string 88 | 89 | 90 | def get_decoded_text(bytes_data): 91 | try: 92 | decoded = bytes_data.decode('utf-8') 93 | except UnicodeDecodeError: 94 | try: 95 | decoded = bytes_data.decode('latin1') 96 | except UnicodeDecodeError: 97 | decoded = bytes_data.decode('utf-8', errors='ignore') 98 | return decoded 99 | 100 | 101 | def parse_money_type(parsed, prop_format): 102 | """ 103 | Parse and format a money value according to the specified format. 104 | 105 | Args: 106 | parsed (int): The numerical value to be parsed. 107 | prop_format (str): The format string specifying the desired format. 108 | 109 | Returns: 110 | str: The parsed and formatted money value. 111 | """ 112 | parsed = str(parsed) 113 | if prop_format == FORMAT_PERCENT: 114 | special_format = "{:.2f}%" 115 | dot_location = -2 116 | elif prop_format.startswith(FORMAT_DOLLAR): 117 | special_format = '${:,.2f}' 118 | dot_location = -4 119 | elif prop_format.startswith(FORMAT_EURO): 120 | special_format = '€{:,.2f}' 121 | dot_location = -4 122 | elif prop_format == FORMAT_GENERAL_NUMBER: 123 | special_format = '{:,.1f}' 124 | dot_location = -4 125 | elif prop_format == FORMAT_SCIENTIFIC: 126 | special_format = '{:.2e}' 127 | dot_location = -4 128 | elif prop_format in [FORMAT_FIXED, FORMAT_STANDARD]: 129 | dot_location = -4 130 | special_format = '{:,.2f}' 131 | else: 132 | LOGGER.warning(f"parse_money_type - unsupported format: {prop_format} value {parsed} may be wrong") 133 | return parsed 134 | 135 | money_float = parsed[:dot_location] + "." + parsed[dot_location:] 136 | if special_format: 137 | money_float = special_format.format(float(money_float)) 138 | return money_float 139 | 140 | 141 | def parse_type(data_type, buffer, length=None, version=3, props=None): 142 | parsed = "" 143 | # Bool or int8 144 | if data_type == TYPE_INT8: 145 | parsed = struct.unpack_from("b", buffer)[0] 146 | elif data_type == TYPE_INT16: 147 | parsed = struct.unpack_from("h", buffer)[0] 148 | elif data_type == TYPE_INT32 or data_type == TYPE_COMPLEX: 149 | parsed = struct.unpack_from("i", buffer)[0] 150 | elif data_type == TYPE_MONEY: 151 | parsed = struct.unpack_from("q", buffer)[0] 152 | if props and "Format" in props: 153 | prop_format = props['Format'] 154 | if parsed == 0: 155 | parsed = [y for x, y in FORMAT_TO_DEFAULT_VALUE.items() if prop_format.startswith(x)] 156 | if not parsed: 157 | LOGGER.warning(f"parse_type got unknown format while parsing money field {prop_format}") 158 | else: 159 | parsed = parsed[0] 160 | else: 161 | parsed = parse_money_type(parsed, prop_format) 162 | elif data_type == TYPE_FLOAT32: 163 | parsed = struct.unpack_from("f", buffer)[0] 164 | elif data_type == TYPE_FLOAT64: 165 | parsed = struct.unpack_from("d", buffer)[0] 166 | elif data_type == TYPE_DATETIME: 167 | double_datetime = struct.unpack_from("q", buffer)[0] 168 | parsed = mdb_date_to_readable(double_datetime) 169 | elif data_type == TYPE_BINARY: 170 | parsed = buffer[:length] 171 | offset = length 172 | elif data_type == TYPE_OLE: 173 | parsed = buffer 174 | elif data_type == TYPE_GUID: 175 | parsed = buffer[:16] 176 | guid = uuid.UUID(parsed.hex()) 177 | parsed = str(guid) 178 | elif data_type == TYPE_96_bit_17_BYTES: 179 | parsed = buffer[:17] 180 | elif data_type == TYPE_TEXT: 181 | if version > 3: 182 | # Looks like if BOM is present text is already decoded 183 | if buffer.startswith(b"\xfe\xff") or buffer.startswith(b"\xff\xfe"): 184 | 
buff = buffer[2:] 185 | parsed = get_decoded_text(buff) 186 | else: 187 | parsed = buffer.decode("utf-16", errors='ignore') 188 | else: 189 | parsed = get_decoded_text(buffer) 190 | 191 | if "\x00" in parsed: 192 | LOGGER.debug(f"Parsed string contains NUL (0x00) characters: {parsed}") 193 | parsed = parsed.replace("\x00", "") 194 | else: 195 | LOGGER.debug(f"parse_type - unsupported data type: {data_type}") 196 | return parsed 197 | 198 | 199 | def categorize_pages(db_data, page_size): 200 | if len(db_data) % page_size: 201 | LOGGER.warning(f"DB is not full or PAGE_SIZE is wrong. page size: {page_size} DB length {len(db_data)}") 202 | pages = {i: db_data[i:i + page_size] for i in range(0, len(db_data), page_size)} 203 | data_pages = {} 204 | table_defs = {} 205 | for page in pages: 206 | if pages[page].startswith(DATA_PAGE_MAGIC): 207 | data_pages[page] = pages[page] 208 | elif pages[page].startswith(TABLE_PAGE_MAGIC): 209 | table_defs[page] = pages[page] 210 | return table_defs, data_pages, pages 211 | 212 | 213 | def read_db_file(path): 214 | if not os.path.isfile(path): 215 | LOGGER.error(f"File {path} not found") 216 | raise FileNotFoundError(f"File {path} not found") 217 | with open(path, "rb") as f: 218 | return f.read() 219 | -------------------------------------------------------------------------------- /access_parser/parsing_primitives.py: -------------------------------------------------------------------------------- 1 | from construct import * 2 | 3 | 4 | def version_specific(version, v3_subcon, v4_subcon): 5 | """ 6 | There are some differences in the parsing structure between v3 and v4. Some fields are different length and some 7 | exist only in one of the versions. this returns the relevant parsing structure by version 8 | :param version: int 3 or 4 9 | :param v3_subcon: the parsing struct if version is 3 10 | :param v4_subcon: the parsing struct if version is 4 11 | """ 12 | if version == 3: 13 | return v3_subcon 14 | else: 15 | return v4_subcon 16 | 17 | 18 | ACCESSHEADER = Struct( 19 | Const(b'\00\x01\x00\x00'), 20 | "jet_string" / CString("utf8"), 21 | "jet_version" / Int32ul, 22 | # RC4 encrypted with key 0x6b39dac7. 
Database metadata 23 | Padding(126)) 24 | 25 | MEMO = Struct( 26 | "memo_length" / Int32ul, 27 | "record_pointer" / Int32ul, 28 | "memo_unknown" / Int32ul, 29 | "memo_end" / Tell) 30 | 31 | VERSION_3_FLAGS = BitStruct( 32 | "hyperlink" / Flag, 33 | "auto_GUID" / Flag, 34 | "unk_1" / Flag, 35 | "replication" / Flag, 36 | "unk_2" / Flag, 37 | "autonumber" / Flag, 38 | "can_be_null" / Flag, 39 | "fixed_length" / Flag) 40 | 41 | VERSION_4_FLAGS = BitStruct( 42 | "hyperlink" / Flag, 43 | "auto_GUID" / Flag, 44 | "unk_1" / Flag, 45 | "replication" / Flag, 46 | "unk_2" / Flag, 47 | "autonumber" / Flag, 48 | "can_be_null" / Flag, 49 | "fixed_length" / Flag, 50 | "unk_3" / Flag, 51 | "unk_4" / Flag, 52 | "unk_5" / Flag, 53 | 'modern_package_type' / Flag, 54 | "unk_6" / Flag, 55 | "unk_7" / Flag, 56 | "unk_8" / Flag, 57 | "compressed_unicode" / Flag) 58 | 59 | TDEF_HEADER = Struct( 60 | Const(b'\02\x01'), 61 | "peek_version" / Peek(Int16ul), 62 | "tdef_ver" / IfThenElse(lambda x: x.peek_version == b"VC", Const(b"VC"), Int16ul), 63 | "next_page_ptr" / Int32ul, 64 | "header_end" / Tell) 65 | 66 | LVPROP_CHUNK_NAMES_INT = Struct( 67 | "name_length" / Int16ul, 68 | "name" / PaddedString(this.name_length, "utf16"), 69 | ) 70 | LVPROP_CHUNK_NAMES = Struct( 71 | "names" / GreedyRange(LVPROP_CHUNK_NAMES_INT), 72 | # "leftover" / GreedyBytes 73 | ) 74 | LVPROP_DATA = Struct( 75 | "data_length" / Int16ul, 76 | "ddl_flag" / Int8ul, 77 | "type" / Int8ul, 78 | "name_index" / Int16ul, 79 | "only_data_length" / Int16ul, 80 | "actual_data" / Bytes(this.only_data_length) 81 | ) 82 | LVPROP_VALUE = Struct( 83 | "val_length" / Int32ul, 84 | "name_length" / Int16ul, 85 | "column_name" / PaddedString(this.name_length, "utf16"), 86 | "data" / GreedyRange(LVPROP_DATA), 87 | "left" / GreedyBytes 88 | ) 89 | 90 | LVPROP_CHUNK = Struct( 91 | "length" / Int32ul, 92 | "chunk_type" / Int16ul, 93 | 94 | "data" / Prefixed(Computed(this.length - 6), Switch(this.chunk_type, { 95 | # 128: GreedyRange(LVPROP_CHUNK_NAMES) 96 | 128: LVPROP_CHUNK_NAMES, 97 | 0:LVPROP_VALUE, 98 | 1: LVPROP_VALUE 99 | }, default=Bytes(this.length - 4))) 100 | ) 101 | LVPROP = Struct( 102 | #'KKD\0' in Jet3 and 'MR2\0' in Jet 4. 
103 | "magic" / Bytes(4), 104 | "chunks" / GreedyRange(LVPROP_CHUNK), 105 | "leftover" / GreedyBytes 106 | ) 107 | 108 | def parse_table_head(buffer, version=3): 109 | return Struct( 110 | "TDEF_header" / TDEF_HEADER, 111 | # Table 112 | "table_definition_length" / Int32ul, 113 | "ver4_unknown" / If(lambda x: version > 3, Int32ul), 114 | "number_of_rows" / Int32ul, 115 | "autonumber" / Int32ul, 116 | "autonumber_increment" / If(lambda x: version > 3, Int32ul), 117 | "complex_autonumber" / If(lambda x: version > 3, Int32ul), 118 | "ver4_unknown_1" / If(lambda x: version > 3, Int32ul), 119 | "ver4_unknown_2" / If(lambda x: version > 3, Int32ul), 120 | # 0x53 system table 121 | # 0x4e user table 122 | "table_type_flags" / Int8ul, 123 | "next_column_id" / Int16ul, 124 | "variable_columns" / Int16ul, 125 | "column_count" / Int16ul, 126 | "index_count" / Int32ul, 127 | "real_index_count" / Int32ul, 128 | "row_page_map" / Int32ul, 129 | "free_space_page_map" / Int32ul, 130 | "tdef_header_end" / Tell).parse(buffer) 131 | 132 | 133 | def parse_table_data(buffer, index_count, real_index_count, column_count, version=3): 134 | REAL_INDEX = Struct( 135 | "unk1" / Int32ul, 136 | "index_row_count" / Int32ul, 137 | "ver4_always_zero" / If(lambda x: version > 3, Int32ul)) 138 | 139 | VARIOUS_TEXT_V3 = Struct( 140 | "LCID" / Int16ul, 141 | "code_page" / Int16ul, 142 | "various_text3_unknown" / Int16ul) 143 | 144 | VARIOUS_TEXT_V4 = Struct( 145 | "collation" / Int16ul, 146 | "various_text4_unknown" / Int8ul, 147 | "collation_version_number" / Int8ul) 148 | 149 | VARIOUS_TEXT = VARIOUS_TEXT_V3 if version == 3 else VARIOUS_TEXT_V4 150 | 151 | VARIOUS_DEC_V3 = Struct( 152 | "various_dec3_unknown" / Int16ul, 153 | "max_number_of_digits" / Int8ul, 154 | "number_of_decimal" / Int8ul, 155 | "various_dec3_unknown2" / Int16ul) 156 | 157 | VARIOUS_DEC_V4 = Struct( 158 | "max_num_of_digits" / Int8ul, 159 | "num_of_decimal_digits" / Int8ul, 160 | "various_dec4_unknown" / Int16ul) 161 | 162 | VARIOUS_DEC = VARIOUS_DEC_V3 if version == 3 else VARIOUS_DEC_V4 163 | 164 | VARIOUS_NUMERIC_V3 = Struct("prec" / Int8ul, "scale" / Int8ul, "unknown" / Int32ul) 165 | VARIOUS_NUMERIC_V4 = Struct("prec" / Int8ul, "scale" / Int8ul, "unknown" / Int16ul) 166 | VARIOUS_NUMERIC = VARIOUS_NUMERIC_V3 if version == 3 else VARIOUS_NUMERIC_V4 167 | 168 | COLUMN = Struct( 169 | "type" / Int8ul, 170 | "ver4_unknown_3" / If(lambda x: version > 3, Int32ul), 171 | "column_id" / Int16ul, 172 | "variable_column_number" / Int16ul, 173 | "column_index" / Int16ul, 174 | "various" / Switch(lambda ctx: ctx.type, 175 | { 176 | 9: VARIOUS_TEXT, 177 | 10: VARIOUS_TEXT, 178 | 11: VARIOUS_TEXT, 179 | 12: VARIOUS_TEXT, 180 | 16: VARIOUS_NUMERIC, 181 | 182 | 1: VARIOUS_DEC, 183 | 2: VARIOUS_DEC, 184 | 3: VARIOUS_DEC, 185 | 4: VARIOUS_DEC, 186 | 5: VARIOUS_DEC, 187 | 6: VARIOUS_DEC, 188 | 7: VARIOUS_DEC, 189 | 8: VARIOUS_DEC, 190 | 191 | }, default=version_specific(version, Bytes(6), Bytes(4))), 192 | "column_flags" / version_specific(version, VERSION_3_FLAGS, VERSION_4_FLAGS), 193 | "ver4_unknown_4" / If(lambda x: version > 3, Int32ul), 194 | "fixed_offset" / Int16ul, 195 | "length" / Int16ul) 196 | 197 | COLUMN_NAMES = Struct( 198 | "col_name_len" / version_specific(version, Int8ul, Int16ul), 199 | "col_name_str" / version_specific(version, 200 | PaddedString(lambda x: x.col_name_len, encoding="utf8"), 201 | PaddedString(lambda x: x.col_name_len, encoding="utf16")), 202 | ) 203 | 204 | REAL_INDEX2 = Struct( 205 | "unknown_b1" / If(lambda x: version > 3, 
Int32ul), 206 | "unk_struct" / Array(10, Struct("col_id" / Int16ul, "idx_flags" / Int8ul)), 207 | "runk" / Int32ul, 208 | "first_index_page" / Int32ul, 209 | "flags" / Int8ul, 210 | "unknown_b3" / If(lambda x: version > 3, Padding(9))) 211 | 212 | ALL_INDEXES = Struct( 213 | "unknown_c1" / If(lambda x: version > 3, Int32ul), 214 | "idx_num" / Int32ul, 215 | "idx_col_num" / Int32ul, 216 | "rel_tbl_type" / Int8ul, 217 | "rel_idx_num" / Int32sl, 218 | "rel_tbl_page" / Int32ul, 219 | "cascade_ups" / Int8ul, 220 | "cascade_dels" / Int8ul, 221 | "idx_type" / Int8ul, 222 | "unknown_c2" / If(lambda x: version > 3, Int32ul)) 223 | 224 | INDEX_NAMES = Struct( 225 | "idx_name_len" / version_specific(version, Int8ul, Int16ul), 226 | "idx_name_str" / version_specific(version, 227 | PaddedString(lambda x: x.idx_name_len, encoding="utf8"), 228 | PaddedString(lambda x: x.idx_name_len, encoding="utf16")), 229 | ) 230 | 231 | return Struct( 232 | "real_index" / Array(real_index_count, REAL_INDEX), 233 | "column" / Array(column_count, COLUMN), 234 | "column_names" / Array(column_count, COLUMN_NAMES), 235 | "real_index_2" / Array(real_index_count, REAL_INDEX2), 236 | "all_indexes" / Array(index_count, ALL_INDEXES), 237 | "index_names" / Array(index_count, INDEX_NAMES)).parse(buffer) 238 | 239 | 240 | def parse_data_page_header(buffer, version=3): 241 | return Struct( 242 | Const(b"\x01\x01"), 243 | "data_free_space" / Int16ul, 244 | "owner" / Int32ul, 245 | "ver4_unknown_dat1" / If(lambda x: version > 3, Int32ul), 246 | "record_count" / Int16ul, 247 | "record_offsets" / Array(lambda x: x.record_count, Int16ul)).parse(buffer) 248 | 249 | 250 | # buffer should be the record data in reverse 251 | def parse_relative_object_metadata_struct(buffer, variable_jump_tables_cnt=0, version=3): 252 | return Struct( 253 | "variable_length_field_count" / version_specific(version, Int8ub, Int16ub), 254 | "variable_length_jump_table" / If(lambda x: version == 3, Array(variable_jump_tables_cnt, Int8ub)), 255 | # This currently supports up to 255 columns for versions > 3 256 | "variable_length_field_offsets" / version_specific(version, 257 | Array(lambda x: x.variable_length_field_count, Int8ub), 258 | Array(lambda x: x.variable_length_field_count & 0xff, 259 | Int16ub)), 260 | "var_len_count" / version_specific(version, Int8ub, Int16ub), 261 | "relative_metadata_end" / Tell).parse(buffer) 262 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/access_parser/access_parser.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import struct
3 | from collections import defaultdict
4 | 
5 | from construct import ConstructError
6 | from tabulate import tabulate
7 | 
8 | from .parsing_primitives import parse_relative_object_metadata_struct, parse_table_head, parse_data_page_header, \
9 |     ACCESSHEADER, MEMO, parse_table_data, TDEF_HEADER, LVPROP
10 | from .utils import categorize_pages, parse_type, TYPE_MEMO, TYPE_TEXT, TYPE_BOOLEAN, read_db_file, numeric_to_string, \
11 |     TYPE_96_bit_17_BYTES, TYPE_OLE
12 | 
13 | # Page sizes
14 | PAGE_SIZE_V3 = 0x800
15 | PAGE_SIZE_V4 = 0x1000
16 | 
17 | # Versions
18 | VERSION_3 = 0x00
19 | VERSION_4 = 0x01
20 | VERSION_5 = 0x02
21 | VERSION_2010 = 0x03
22 | 
23 | ALL_VERSIONS = {VERSION_3: 3, VERSION_4: 4, VERSION_5: 5, VERSION_2010: 2010}
24 | NEW_VERSIONS = [VERSION_4, VERSION_5, VERSION_2010]
25 | 
26 | SYSTEM_TABLE_FLAGS = [-0x80000000, -0x00000002, 0x80000000, 0x00000002]
27 | 
28 | LOGGER = logging.getLogger("access_parser")
29 | 
30 | 
31 | class TableObj(object):
32 |     def __init__(self, offset, val):
33 |         self.value = val
34 |         self.offset = offset
35 |         self.linked_pages = []
36 | 
37 | 
38 | class AccessParser(object):
39 |     def __init__(self, db_path):
40 |         self.db_data = read_db_file(db_path)
41 |         self._parse_file_header(self.db_data)
42 |         self._table_defs, self._data_pages, self._all_pages = categorize_pages(self.db_data, self.page_size)
43 |         self._tables_with_data = self._link_tables_to_data()
44 |         self.catalog = self._parse_catalog()
45 |         self.extra_props = self.parse_msys_table()
46 | 
47 |     def parse_msys_table(self):
48 |         """The MSysObjects table contains extra metadata about tables and columns, such as the Format of money fields"""
49 |         msys_table = self.parse_table("MSysObjects")
50 |         if not msys_table:
51 |             return {}
52 |         if not msys_table.get('Name') or not msys_table.get('LvProp'):
53 |             return {}
54 |         table_to_lval_memo = {key: self.parse_lvprop(value) for key, value in zip(msys_table['Name'],
55 |                                                                                   msys_table['LvProp']) if value}
56 |         return table_to_lval_memo
57 | 
58 |     def _parse_file_header(self, db_data):
59 |         """
60 |         Parse the basic file header and determine the Access DB version based on the parsing results.
61 |         :param db_data: db file data
62 |         """
63 |         try:
64 |             head = ACCESSHEADER.parse(db_data)
65 |         except ConstructError:
66 |             # This is a very minimal parsing of the header. If it fails, this is probably not a valid mdb file
67 |             raise ValueError("Failed to parse DB file header. Check that it is a valid Access database")
68 |         version = head.jet_version
69 |         if version in NEW_VERSIONS:
70 |             if version == VERSION_4:
71 |                 self.version = ALL_VERSIONS[VERSION_4]
72 |             elif version == VERSION_5:
73 |                 self.version = ALL_VERSIONS[VERSION_5]
74 |             elif version == VERSION_2010:
75 |                 self.version = ALL_VERSIONS[VERSION_2010]
76 |             self.page_size = PAGE_SIZE_V4
77 | 
78 |         else:
79 |             if not version == VERSION_3:
80 |                 LOGGER.error(f"Unknown database version {version}. Trying to parse database as version 3")
81 |             self.version = ALL_VERSIONS[VERSION_3]
82 |             self.page_size = PAGE_SIZE_V3
83 |         LOGGER.info(f"Database version {self.version}")
84 | 
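    # Example (sketch, not part of the original API): the same header parsing
    # can be used standalone to sniff a file's version and page size before
    # building a full AccessParser. The file path below is hypothetical.
    #
    #   from access_parser.parsing_primitives import ACCESSHEADER
    #   from access_parser.utils import read_db_file
    #
    #   head = ACCESSHEADER.parse(read_db_file("examples/test.mdb"))
    #   page_size = 0x800 if head.jet_version == 0x00 else 0x1000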
85 |     def _link_tables_to_data(self):
86 |         """
87 |         Link table definitions to their data pages
88 |         :return: dict of {offset : TableObj}
89 |         """
90 |         tables_with_data = {}
91 |         # Link table definitions to data.
92 |         # The offset of the table definition page divided by the page size == the owner field of a data page
93 |         for offset, data in self._data_pages.items():
94 |             try:
95 |                 parsed_dp = parse_data_page_header(data, version=self.version)
96 |             except ConstructError:
97 |                 LOGGER.error(f"Failed to parse data page {data}")
98 |                 continue
99 |             page_offset = parsed_dp.owner * self.page_size
100 |             if page_offset in self._table_defs:
101 |                 table_page_value = self._table_defs.get(parsed_dp.owner * self.page_size)
102 |                 if page_offset not in tables_with_data:
103 |                     tables_with_data[page_offset] = TableObj(page_offset, table_page_value)
104 |                 tables_with_data[page_offset].linked_pages.append(data)
105 |         return tables_with_data
106 | 
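    # Worked example (illustrative numbers): with the version 4 page size of
    # 0x1000 bytes, a data page whose header reports owner == 4 is linked to
    # the table definition page at file offset 4 * 0x1000 == 0x4000.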
107 |     def _parse_catalog(self):
108 |         """
109 |         Parse the catalog to get the DB tables and their page indexes
110 |         :return: dict {table_name : page index}
111 |         """
112 |         catalog_page = self._tables_with_data[2 * self.page_size]
113 |         access_table = AccessTable(catalog_page, self.version, self.page_size, self._data_pages, self._table_defs)
114 |         catalog = access_table.parse()
115 |         tables_mapping = {}
116 |         for i, table_name in enumerate(catalog['Name']):
117 |             # We need the MSysObjects table for metadata, so exclude it from the system table filter.
118 |             if table_name == "MSysObjects":
119 |                 tables_mapping[table_name] = catalog['Id'][i]
120 |             # Visible user tables are type 1
121 |             table_type = 1
122 |             if catalog["Type"][i] == table_type:
123 |                 # Don't parse system tables
124 |                 if not catalog["Flags"][i] in SYSTEM_TABLE_FLAGS:
125 |                     tables_mapping[table_name] = catalog['Id'][i]
126 |                 else:
127 |                     LOGGER.debug(f"Not parsing system table - {table_name}")
128 |         return tables_mapping
129 | 
130 |     def get_table(self, table_name):
131 |         table_offset = self.catalog.get(table_name)
132 |         if not table_offset:
133 |             LOGGER.error(f"Could not find table {table_name} in database")
134 |             return
135 |         table_offset = table_offset * self.page_size
136 |         table = self._tables_with_data.get(table_offset)
137 |         if not table:
138 |             table_def = self._table_defs.get(table_offset)
139 |             if table_def:
140 |                 table = TableObj(offset=table_offset, val=table_def)
141 |                 LOGGER.info(f"Table {table_name} has no data")
142 |             else:
143 |                 LOGGER.error(f"Could not find table {table_name} offset {table_offset}")
144 |                 return
145 | 
146 |         # Try to get extra metadata for the table if it exists in the MSysObjects table
147 |         props = None
148 |         if table_name != "MSysObjects" and self.extra_props and table_name in self.extra_props:
149 |             props = self.extra_props[table_name]
150 | 
151 |         return AccessTable(table, self.version, self.page_size, self._data_pages, self._table_defs, props)
152 | 
153 |     def parse_lvprop(self, lvprop_raw):
154 |         try:
155 |             parsed = LVPROP.parse(lvprop_raw)
156 |         except ConstructError:
157 |             return None
158 |         if not parsed.get("chunks"):
159 |             return None
160 |         table_names = [x.name for x in parsed.chunks[0].data.names]
161 |         # Chunk type 0 does not have a column name, so we cannot link it to a column
162 |         chunk_type_one = [x for x in parsed.chunks if x.chunk_type == 1]
163 |         reconstructed_column_data = {}
164 |         for chunk in chunk_type_one:
165 |             if not chunk.data.column_name:
166 |                 LOGGER.error("Error while parsing MSysObjects table chunk.")
167 |                 continue
168 |             data_values = {}
169 |             for dv in chunk.data.data:
170 |                 val = parse_type(dv.type, dv.actual_data, version=self.version)
171 |                 try:
172 |                     name = table_names[dv.name_index]
173 |                     data_values[name] = val
174 |                 except IndexError:
175 |                     LOGGER.error("Error while parsing MSysObjects table chunk.")
176 |                     continue
177 |             reconstructed_column_data[chunk.data.column_name] = data_values
178 |         return reconstructed_column_data
179 | 
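    # Worked example (illustrative): money values are stored as 64-bit
    # integers scaled by 10**4, so a raw value of 12345 in a column whose
    # recovered properties include {"Format": "$"} is rendered by
    # parse_money_type in utils.py as "$1.23" (the last four digits are the
    # fractional part).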
180 |     def parse_table(self, table_name):
181 |         """
182 |         Parse a table from the db.
183 |         Table names are listed in self.catalog
184 |         :return defaultdict(list) with the parsed table -- table[column][row_index]
185 |         """
186 |         return self.get_table(table_name).parse()
187 | 
188 |     def print_database(self):
189 |         """
190 |         Print data from all database tables
191 |         """
192 |         table_names = self.catalog
193 |         for table_name in table_names:
194 |             table = self.parse_table(table_name)
195 |             if not table:
196 |                 continue
197 |             print(f'TABLE NAME: {table_name}\r\n')
198 |             print(tabulate(table, headers="keys", disable_numparse=True))
199 |             print('\r\n\r\n\r\n\r\n')
200 | 
201 | 
202 | class AccessTable(object):
203 |     def __init__(self, table, version, page_size, data_pages, table_defs, props=None):
204 |         self.version = version
205 |         self.props = props
206 |         self.page_size = page_size
207 |         self._data_pages = data_pages
208 |         self._table_defs = table_defs
209 |         self.table = table
210 |         self.parsed_table = defaultdict(list)
211 |         self.columns, self.primary_keys, self.table_header = self._get_table_columns()
212 | 
213 |     def create_empty_table(self):
214 |         parsed_table = defaultdict(list)
215 |         columns, *_ = self._get_table_columns()
216 |         for i, column in columns.items():
217 |             parsed_table[column.col_name_str] = ""
218 |         return parsed_table
219 | 
220 |     def parse(self):
221 |         """
222 |         This is the main table parsing function. We go through all of the data pages linked to the table, separate each
223 |         data page into rows (records) and parse each record.
224 |         :return defaultdict(list) with the parsed data -- table[column][row_index]
225 |         """
226 |         if not self.table.linked_pages:
227 |             return self.create_empty_table()
228 |         for data_chunk in self.table.linked_pages:
229 |             original_data = data_chunk
230 |             parsed_data = parse_data_page_header(original_data, version=self.version)
231 | 
232 |             last_offset = None
233 |             for rec_offset in parsed_data.record_offsets:
234 |                 # Deleted row - just skip it
235 |                 if rec_offset & 0x8000:
236 |                     last_offset = rec_offset & 0xfff
237 |                     continue
238 |                 # Overflow page
239 |                 if rec_offset & 0x4000:
240 |                     # An overflow ptr is 4 bits of flags, 12 bits of ptr
241 |                     rec_ptr_offset = rec_offset & 0xfff
242 |                     # Update the last pointer to the pointer without flags
243 |                     last_offset = rec_ptr_offset
244 |                     # The ptr is the offset in the current data page. We get a 4-byte record_pointer from that
245 |                     overflow_rec_ptr = original_data[rec_ptr_offset:rec_ptr_offset + 4]
246 |                     overflow_rec_ptr = struct.unpack("<I", overflow_rec_ptr)[0]
247 |                     record = self._get_overflow_record(overflow_rec_ptr)
248 |                     if record:
249 |                         self._parse_row(record)
250 |                     continue
251 |                 # Regular record: it starts at rec_offset and ends at the previous
252 |                 # record's offset (records are laid out from the end of the page)
253 |                 if last_offset:
254 |                     record = original_data[rec_offset:last_offset]
255 |                 else:
256 |                     record = original_data[rec_offset:]
257 |                 last_offset = rec_offset
258 |                 if record:
259 |                     self._parse_row(record)
260 |         return self.parsed_table
261 | 
262 |     def _parse_row(self, record):
263 |         """
264 |         Parse a single record (row) from a data page
265 |         :param record: raw record data
266 |         """
267 |         original_record = record
268 |         reverse_record = record[::-1]
269 |         null_table_len = (self.table_header.column_count + 7) // 8
270 |         if null_table_len and null_table_len < len(record):
271 |             null_table = record[-null_table_len:]
272 |             # The null table is a bitmap; expand it into a list of booleans
273 |             null_table = [(null_table[i // 8] & (1 << (i % 8))) != 0 for i in range(len(null_table) * 8)]
274 |         else:
275 |             LOGGER.error(f"Failed to parse null table, column count {self.table_header.column_count}")
276 |             return
277 | 
278 |         if self.version > 3:
279 |             field_count = struct.unpack_from("h", record)[0]
280 |             record = record[2:]
281 |         else:
282 |             field_count = struct.unpack_from("b", record)[0]
283 |             record = record[1:]
284 | 
285 |         relative_records_column_map = {}
286 |         # Iterate columns
287 |         for i, column in self.columns.items():
288 |             # Fixed length columns are handled before variable length.
If this is a variable length column add it to 289 | # mapping and continue 290 | if not column.column_flags.fixed_length: 291 | relative_records_column_map[i] = column 292 | continue 293 | 294 | self._parse_fixed_length_data(record, column, null_table) 295 | if relative_records_column_map: 296 | relative_records_column_map = dict(sorted(relative_records_column_map.items())) 297 | metadata = self._parse_dynamic_length_records_metadata(reverse_record, original_record, 298 | null_table_len) 299 | if not metadata: 300 | return 301 | if metadata.variable_length_field_offsets: 302 | self._parse_dynamic_length_data(original_record, metadata, relative_records_column_map, null_table) 303 | 304 | def _parse_fixed_length_data(self, original_record, column, null_table): 305 | """ 306 | Parse fixed-length data from record 307 | :param original_record: unmodified record 308 | :param column: column this data belongs to 309 | :param null_table: null table of the row 310 | """ 311 | column_name = column.col_name_str 312 | # The null table indicates null values in the row. 313 | # The only exception is BOOL fields which are encoded in the null table 314 | has_value = True 315 | if column.column_id > len(null_table): 316 | LOGGER.warning("Invalid null table. Bool values may be wrong, deleted values may be shown in the db.") 317 | if column.type == TYPE_BOOLEAN: 318 | has_value = None 319 | else: 320 | has_value = null_table[column.column_id] 321 | # Boolean fields are encoded in the null table 322 | if column.type == TYPE_BOOLEAN: 323 | parsed_type = has_value 324 | else: 325 | if column.fixed_offset > len(original_record): 326 | LOGGER.error(f"Column offset is bigger than the length of the record {column.fixed_offset}") 327 | return 328 | record = original_record[column.fixed_offset:] 329 | parsed_type = parse_type(column.type, record, version=self.version, props=column.extra_props or None) 330 | if not has_value: 331 | self.parsed_table[column_name].append(None) 332 | return 333 | self.parsed_table[column_name].append(parsed_type) 334 | 335 | def _parse_dynamic_length_records_metadata(self, reverse_record, original_record, null_table_length): 336 | """ 337 | parse the metadata of relative records. The metadata used to parse relative records is found at the end of the 338 | record so reverse_record is used for parsing from the bottom up. 339 | :param reverse_record: original record in reverse 340 | :param original_record: unmodified record 341 | :param null_table_length: 342 | :return: parsed relative record metadata 343 | """ 344 | if self.version > 3: 345 | reverse_record = reverse_record[null_table_length:] 346 | return parse_relative_object_metadata_struct(reverse_record, version=self.version) 347 | # Parse relative metadata. 
348 |         # Metadata is at the end of the record (reverse_record is used here)
349 |         variable_length_jump_table_cnt = (len(original_record) - 1) // 256
350 |         reverse_record = reverse_record[null_table_length:]
351 |         try:
352 |             relative_record_metadata = parse_relative_object_metadata_struct(reverse_record,
353 |                                                                              variable_length_jump_table_cnt,
354 |                                                                              self.version)
355 |             # relative_record_metadata = RELATIVE_OBJS.parse(reverse_record)
356 |             # We use this offset in original_record, so we have to account for the null table length
357 |             relative_record_metadata.relative_metadata_end = relative_record_metadata.relative_metadata_end + null_table_length
358 |         except ConstructError:
359 |             relative_record_metadata = None
360 |             LOGGER.error("Failed parsing record")
361 | 
362 |         if relative_record_metadata and \
363 |                 relative_record_metadata.variable_length_field_count != self.table_header.variable_columns:
364 | 
365 |             # Best effort - try to find the variable column count in the record and parse from there.
366 |             # This is limited to the first 10 bytes to reduce false positives.
367 |             # Most of the time I've seen this, there was an extra DWORD before the actual metadata
368 |             metadata_start = reverse_record.find(bytes([self.table_header.variable_columns]))
369 |             if metadata_start != -1 and metadata_start < 10:
370 |                 reverse_record = reverse_record[metadata_start:]
371 |                 try:
372 |                     relative_record_metadata = parse_relative_object_metadata_struct(reverse_record,
373 |                                                                                      variable_length_jump_table_cnt,
374 |                                                                                      self.version)
375 |                 except ConstructError:
376 |                     LOGGER.error(f"Failed to parse record metadata: {original_record}")
377 |                 relative_record_metadata.relative_metadata_end = relative_record_metadata.relative_metadata_end + \
378 |                                                                  metadata_start
379 |             else:
380 |                 LOGGER.warning(
381 |                     f"Record did not parse correctly. Number of columns: {self.table_header.variable_columns}"
382 |                     f" number of parsed columns: {relative_record_metadata.variable_length_field_count}")
383 |                 return None
384 |         return relative_record_metadata
385 | 
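    # Worked example (illustrative): for a version 3 record where
    # variable_length_field_offsets == [0x10, 0x18] and var_len_count == 0x20,
    # the first variable-length field is original_record[0x10:0x18] and the
    # second is original_record[0x18:0x20]. Offsets are single bytes in v3, so
    # whenever a field index appears in variable_length_jump_table the real
    # offset grows by 0x100 (tracked by jump_table_addition below).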
386 |     def _parse_dynamic_length_data(self, original_record, relative_record_metadata,
387 |                                    relative_records_column_map, null_table):
388 |         """
389 |         Parse dynamic (non-fixed-length) fields from a row
390 |         :param original_record: full unmodified record
391 |         :param relative_record_metadata: parsed record metadata
392 |         :param relative_records_column_map: relative records column mapping {index: column}
393 |         :param null_table: list indicating which columns have a null value
394 |         """
395 |         relative_offsets = relative_record_metadata.variable_length_field_offsets
396 |         jump_table_addition = 0
397 |         for i, column_index in enumerate(relative_records_column_map):
398 |             column = relative_records_column_map[column_index]
399 |             col_name = column.col_name_str
400 |             has_value = True
401 |             if column.column_id > len(null_table):
402 |                 LOGGER.warning("Invalid null table. Null values may be shown in the db.")
403 |             else:
404 |                 has_value = null_table[column.column_id]
405 |             if not has_value:
406 |                 self.parsed_table[col_name].append(None)
407 |                 continue
408 | 
409 |             if self.version == 3:
410 |                 if i in relative_record_metadata.variable_length_jump_table:
411 |                     jump_table_addition += 0x100
412 |             rel_start = relative_offsets[i]
413 |             # If this is the last field, use var_len_count as the end offset
414 |             if i + 1 == len(relative_offsets):
415 |                 rel_end = relative_record_metadata.var_len_count
416 |             else:
417 |                 rel_end = relative_offsets[i + 1]
418 | 
419 |             # If rel_start and rel_end are the same, there is no data in this slot
420 |             if rel_start == rel_end:
421 |                 self.parsed_table[col_name].append("")
422 |                 continue
423 | 
424 |             relative_obj_data = original_record[rel_start + jump_table_addition: rel_end + jump_table_addition]
425 |             # Parse types that require column data here, call parse_type on all other types
426 |             if column.type == TYPE_MEMO:
427 |                 try:
428 |                     parsed_type = self._parse_memo(relative_obj_data)
429 |                 except ConstructError:
430 |                     LOGGER.warning("Failed to parse memo field. Using data as bytes")
431 |                     parsed_type = relative_obj_data
432 |             elif column.type == TYPE_OLE:
433 |                 try:
434 |                     parsed_type = self._parse_memo(relative_obj_data, return_raw=True)
435 |                 except ConstructError:
436 |                     LOGGER.warning("Failed to parse OLE field. Using data as bytes")
437 |                     parsed_type = relative_obj_data
438 |             elif column.type == TYPE_96_bit_17_BYTES:
439 |                 if len(relative_obj_data) != 17:
440 |                     LOGGER.warning(f"Relative numeric field has invalid length {len(relative_obj_data)}, expected 17")
441 |                     parsed_type = relative_obj_data
442 |                 else:
443 |                     # Get the scale (number of decimal digits), defaulting to 6
444 |                     scale = column.get('various', {}).get('scale', 6)
445 |                     parsed_type = numeric_to_string(relative_obj_data, scale)
446 |             else:
447 |                 parsed_type = parse_type(column.type, relative_obj_data, len(relative_obj_data), version=self.version)
448 |             self.parsed_table[col_name].append(parsed_type)
449 | 
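    # Worked example (illustrative): an overflow/LVAL record pointer packs the
    # record slot into its low byte and the page number into the remaining
    # bytes, so a pointer of 0x00000305 means record slot 5 on data page 3,
    # i.e. the page starting at file offset 3 * page_size (used by _parse_memo
    # and _get_overflow_record below).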
450 |     def _get_table_columns(self):
451 |         """
452 |         Parse columns for a specific table
453 |         """
454 |         try:
455 |             table_header = parse_table_head(self.table.value, version=self.version)
456 |             merged_data = self.table.value[table_header.tdef_header_end:]
457 |             if table_header.TDEF_header.next_page_ptr:
458 |                 merged_data = merged_data + self._merge_table_data(table_header.TDEF_header.next_page_ptr)
459 | 
460 |             parsed_data = parse_table_data(
461 |                 merged_data,
462 |                 table_header.index_count,
463 |                 table_header.real_index_count,
464 |                 table_header.column_count,
465 |                 version=self.version,
466 |             )
467 | 
468 |             # Merge data back into table_header
469 |             table_header['column'] = parsed_data['column']
470 |             table_header['column_names'] = parsed_data['column_names']
471 |             table_header['real_index_2'] = parsed_data['real_index_2']
472 |             table_header["all_indexes"] = parsed_data["all_indexes"]
473 |             table_header["index_names"] = parsed_data["index_names"]
474 | 
475 |         except ConstructError:
476 |             LOGGER.error(f"Failed to parse table header {self.table.value}")
477 |             return
478 |         col_names = table_header.column_names
479 |         columns = table_header.column
480 | 
481 |         # Add names to the column metadata, so we can use only columns for parsing
482 |         for i, c in enumerate(columns):
483 |             c.col_name_str = col_names[i].col_name_str
484 |             c.extra_props = None
485 | 
486 |         # column_index is more accurate (id is always incremented, so it is wrong when a column is deleted).
487 |         # Some tables, like the catalog, don't have an index, so if the indexes are 0 use id.
488 | 
489 |         # Create a dict of index to column for easier access. The offset makes the mapping zero-based
490 |         offset = min(x.column_index for x in columns)
491 |         column_dict = {x.column_index - offset: x for x in columns}
492 |         # If the column index is not unique, try best effort
493 |         if len(column_dict) != len(columns):
494 |             # Create a dict of id to column instead
495 |             column_dict = {x.column_id: x for x in columns}
496 | 
497 |         # Add the extra properties relevant for the column
498 |         if self.props:
499 |             for i, col in column_dict.items():
500 |                 if col.col_name_str in self.props:
501 |                     col.extra_props = self.props[col.col_name_str]
502 | 
503 |         primary_keys = [
504 |             column_dict[col.col_id].col_name_str
505 |             for idx in table_header.all_indexes
506 |             for col in table_header.real_index_2[idx.idx_col_num].unk_struct
507 |             if idx.idx_type == 1 and col.col_id ^ 0xFFFF
508 |         ]
509 | 
510 |         if len(column_dict) != table_header.column_count:
511 |             LOGGER.debug(f"expected {table_header.column_count} columns got {len(column_dict)}")
512 |         return column_dict, primary_keys, table_header
513 | 
514 |     def _merge_table_data(self, first_page):
515 |         """
516 |         Merge the data of TDEF pages when the table definition does not fit in one page
517 |         :param first_page: page index of the first linked TDEF page
518 |         :return: merged data from all linked table definitions
519 |         """
520 |         table = self._table_defs.get(first_page * self.page_size)
521 |         parsed_header = TDEF_HEADER.parse(table)
522 |         data = table[parsed_header.header_end:]
523 |         while parsed_header.next_page_ptr:
524 |             table = self._table_defs.get(parsed_header.next_page_ptr * self.page_size)
525 |             parsed_header = TDEF_HEADER.parse(table)
526 |             data = data + table[parsed_header.header_end:]
527 |         return data
528 | 
529 |     def _parse_memo(self, relative_obj_data, return_raw=False):
530 |         LOGGER.debug(f"Parsing memo field {relative_obj_data}")
531 |         parsed_memo = MEMO.parse(relative_obj_data)
532 |         memo_type = TYPE_TEXT
533 |         if parsed_memo.memo_length & 0x80000000:
534 |             LOGGER.debug("memo data inline")
535 |             inline_memo_length = parsed_memo.memo_length & 0x3FFFFFFF
536 |             if len(relative_obj_data) < parsed_memo.memo_end + inline_memo_length:
537 |                 LOGGER.warning("Inline memo field has invalid length, using full data")
538 |                 memo_data = relative_obj_data[parsed_memo.memo_end:]
539 |             else:
540 |                 memo_data = relative_obj_data[parsed_memo.memo_end:parsed_memo.memo_end + inline_memo_length]
541 | 
542 |         elif parsed_memo.memo_length & 0x40000000:
543 |             LOGGER.debug("LVAL type 1")
544 |             memo_data = self._get_overflow_record(parsed_memo.record_pointer)
545 |         else:
546 |             LOGGER.debug("LVAL type 2")
547 |             rec_data = self._get_overflow_record(parsed_memo.record_pointer)
548 |             next_page = struct.unpack("I", rec_data[:4])[0]
549 |             # LVAL2 data spans multiple pages. The first 4 bytes of each page point to the next record, then the data follows.
550 |             # Concatenate the data until we get a 0 next_page.
551 | memo_data = b"" 552 | while next_page: 553 | memo_data += rec_data[4:] 554 | rec_data = self._get_overflow_record(next_page) 555 | next_page = struct.unpack("I", rec_data[:4])[0] 556 | memo_data += rec_data[4:] 557 | if memo_data: 558 | if return_raw: 559 | return memo_data 560 | parsed_type = parse_type(memo_type, memo_data, len(memo_data), version=self.version) 561 | return parsed_type 562 | 563 | def _get_overflow_record(self, record_pointer): 564 | """ 565 | Get the actual record from a record pointer 566 | :param record_pointer: 567 | :return: record 568 | """ 569 | record_offset = record_pointer & 0xff 570 | page_num = record_pointer >> 8 571 | record_page = self._data_pages.get(page_num * self.page_size) 572 | if not record_page: 573 | LOGGER.warning(f"Could not find overflow record data page overflow pointer: {record_pointer}") 574 | return 575 | parsed_data = parse_data_page_header(record_page, version=self.version) 576 | if record_offset > len(parsed_data.record_offsets): 577 | LOGGER.warning("Failed parsing overflow record offset") 578 | return 579 | start = parsed_data.record_offsets[record_offset] 580 | if start & 0x8000: 581 | start = start & 0xfff 582 | else: 583 | LOGGER.debug(f"Overflow record flag is not present {start}") 584 | if record_offset == 0: 585 | record = record_page[start:] 586 | else: 587 | end = parsed_data.record_offsets[record_offset - 1] 588 | if end & 0x8000 and (end & 0xff != 0): 589 | end = end & 0xfff 590 | record = record_page[start: end] 591 | return record 592 | --------------------------------------------------------------------------------