├── .gitignore
├── access_parser
│   ├── __init__.py
│   ├── utils.py
│   ├── parsing_primitives.py
│   └── access_parser.py
├── examples
│   ├── test.mdb
│   └── parse_db.py
├── setup.py
├── README.md
└── LICENSE

/.gitignore:
--------------------------------------------------------------------------------
1 | .venv/
2 | access_parser.egg-info/
3 | build/
4 | **/__pycache__/
5 | 
--------------------------------------------------------------------------------
/access_parser/__init__.py:
--------------------------------------------------------------------------------
1 | from access_parser.access_parser import AccessParser
2 | 
--------------------------------------------------------------------------------
/examples/test.mdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claroty/access_parser/HEAD/examples/test.mdb
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | 
3 | with open("README.md", "r") as f:
4 |     long_description = f.read()
5 | 
6 | setuptools.setup(
7 |     name="access_parser",
8 |     version="0.0.6",
9 |     author="Uri Katz",
10 |     author_email="uri.k@claroty.com",
11 |     description="Access database (*.mdb, *.accdb) parser",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/ClarotyICS/access_parser",
15 |     packages=setuptools.find_packages(),
16 |     classifiers=[
17 |         "Programming Language :: Python :: 3",
18 |         "License :: OSI Approved :: Apache Software License",
19 |         "Operating System :: OS Independent",
20 |     ],
21 |     python_requires='>=3.6',
22 |     install_requires=[
23 |         'construct',
24 |         'tabulate',
25 |     ],
26 | )
27 | 
--------------------------------------------------------------------------------
/examples/parse_db.py:
--------------------------------------------------------------------------------
1 | from access_parser import AccessParser
2 | from tabulate import tabulate
3 | import argparse
4 | 
5 | 
6 | def print_tables(db_path, only_catalog=False, specific_table=None):
7 |     db = AccessParser(db_path)
8 |     if only_catalog:
9 |         for k in db.catalog.keys():
10 |             print(f"{k}\n")
11 |     elif specific_table:
12 |         table = db.parse_table(specific_table)
13 |         print(f'TABLE NAME: {specific_table}\r\n')
14 |         print(tabulate(table, headers="keys", disable_numparse=True))
15 |         print("\n\n\n\n")
16 |     else:
17 |         db.print_database()
18 | 
19 | 
20 | if __name__ == '__main__':
21 |     parser = argparse.ArgumentParser()
22 |     parser.add_argument("-c", "--catalog", required=False, help="Print DB table names", action="store_true")
23 |     parser.add_argument("-f", "--file", required=True, help="*.mdb / *.accdb File")
24 |     parser.add_argument("-t", "--table", required=False, help="Table to print", default=None)
25 | 
26 |     args = parser.parse_args()
27 |     print_tables(args.file, args.catalog, args.table)
28 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AccessDB Parser (Pure Python)
2 | A parser for Microsoft Access (.mdb / .accdb) database files. The parsing logic is written entirely in Python and works without any external binary dependencies.
3 | 
4 | # Installing
5 | Use pip: `pip install access-parser`
6 | 
7 | Or install manually:
8 | ```bash
9 | git clone https://github.com/ClarotyICS/access_parser.git
10 | cd access_parser
11 | python3 setup.py install
12 | ```
13 | 
14 | # Demo
15 | [![asciicast](https://asciinema.org/a/345445.svg)](https://asciinema.org/a/345445)
16 | 
17 | # Usage Example
18 | ```python
19 | from access_parser import AccessParser
20 | 
21 | # .mdb or .accdb file
22 | db = AccessParser("/path/to/mdb/file.mdb")
23 | 
24 | # Print DB tables
25 | print(db.catalog)
26 | 
27 | # Tables are stored as defaultdict(list) -- table[column][row_index]
28 | table = db.parse_table("table_name")
29 | 
30 | # Pretty print all tables
31 | db.print_database()
32 | 
33 | ```
34 | 
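Since `parse_table` returns a column-major `defaultdict(list)`, turning the result into rows is a common next step. A minimal sketch (the table name and output path are placeholders, not part of the library's API):

```python
import csv
from access_parser import AccessParser

db = AccessParser("/path/to/mdb/file.mdb")
table = db.parse_table("table_name")  # {column: [values...]}

# Re-shape the column-major mapping into row dicts
columns = list(table.keys())
row_count = max((len(v) for v in table.values()), default=0)
rows = [{c: table[c][i] for c in columns if i < len(table[c])} for i in range(row_count)]

with open("table_name.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=columns)
    writer.writeheader()
    writer.writerows(rows)
```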
35 | ### Known Issues
36 | 
37 | 
38 | This library was tested on a limited subset of database files. Due to the differences between database versions and the complexity of the parsing, we expect to find more parsing edge cases.
39 | 
40 | To help us resolve issues faster, please provide as much data as you can when opening an issue - the DB file if possible and a full trace including log messages.
41 | 
42 | 
43 | ### Thanks
44 | * This library was made possible by the great work of the mdbtools project. The logic in this library relies heavily on their excellent documentation: https://github.com/brianb/mdbtools
45 | * Huge thanks to Mashav Sapir for the help debugging, code reviewing and contributing to this project: https://github.com/mashavs
46 | 
--------------------------------------------------------------------------------
/access_parser/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import struct
4 | import uuid
5 | import math
6 | from datetime import datetime, timedelta
7 | 
8 | LOGGER = logging.getLogger("access_parser.utils")
9 | 
10 | 
11 | TYPE_BOOLEAN = 1
12 | TYPE_INT8 = 2
13 | TYPE_INT16 = 3
14 | TYPE_INT32 = 4
15 | TYPE_MONEY = 5
16 | TYPE_FLOAT32 = 6
17 | TYPE_FLOAT64 = 7
18 | TYPE_DATETIME = 8
19 | TYPE_BINARY = 9
20 | TYPE_TEXT = 10
21 | TYPE_OLE = 11
22 | TYPE_MEMO = 12
23 | TYPE_GUID = 15
24 | TYPE_96_bit_17_BYTES = 16
25 | TYPE_COMPLEX = 18
26 | 
27 | TABLE_PAGE_MAGIC = b"\x02\x01"
28 | DATA_PAGE_MAGIC = b"\x01\x01"
29 | 
30 | 
31 | ACCESS_EPOCH = datetime(1899, 12, 30)
32 | 
33 | PERCENT_DEFAULT = '0.00%'
34 | EURO_DEFAULT = '€0.00'
35 | DOLLAR_DEFAULT = '$0.00'
36 | GENERAL_NUMBER_DEFAULT = '0'
37 | FIXED_AND_STANDARD_DEFAULT = '0.00'
38 | SCIENTIFIC_DEFAULT = '0.00E+00'
39 | 
40 | FORMAT_PERCENT = "Percent"
41 | FORMAT_DOLLAR = "$"
42 | FORMAT_EURO = "€"
43 | FORMAT_GENERAL_NUMBER = "General Number"
44 | FORMAT_FIXED = "Fixed"
45 | FORMAT_STANDARD = "Standard"
46 | FORMAT_SCIENTIFIC = "Scientific"
47 | 
48 | FORMAT_TO_DEFAULT_VALUE = {
49 |     FORMAT_DOLLAR: DOLLAR_DEFAULT,
50 |     FORMAT_STANDARD: FIXED_AND_STANDARD_DEFAULT,
51 |     FORMAT_FIXED: FIXED_AND_STANDARD_DEFAULT,
52 |     FORMAT_PERCENT: PERCENT_DEFAULT,
53 |     FORMAT_EURO: EURO_DEFAULT,
54 |     FORMAT_GENERAL_NUMBER: GENERAL_NUMBER_DEFAULT,
55 |     FORMAT_SCIENTIFIC: SCIENTIFIC_DEFAULT
56 | }
57 | 
58 | 
59 | # https://stackoverflow.com/questions/45560782
60 | def mdb_date_to_readable(double_time):
61 |     try:
62 |         dtime_bytes = struct.pack("Q", double_time)
63 | 
64 |         dtime_double = struct.unpack('<d', dtime_bytes)[0]
65 |         dtime_frac, dtime_whole = math.modf(dtime_double)
66 |         # The integer part counts days from the Access epoch (1899-12-30),
67 |         # e.g. 44197.5 is 2021-01-01 12:00:00; the fraction encodes the time of day
68 |         dtime = ACCESS_EPOCH + timedelta(days=dtime_whole, seconds=round(abs(dtime_frac) * 86400))
69 |         parsed = dtime.strftime("%Y-%m-%d %H:%M:%S")
70 |     except (struct.error, OverflowError, ValueError):
71 |         LOGGER.warning(f"Failed to parse datetime from value {double_time}")
72 |         return str(double_time)
73 |     return parsed
74 | 
75 | 
76 | def numeric_to_string(buffer, scale):
77 |     # A 17-byte numeric field: the first byte carries the sign bit, the
78 |     # remaining 16 bytes hold the digits as an integer
79 |     neg = buffer[0] & 0x80
80 |     full_number = str(int.from_bytes(buffer[1:], "little"))
81 |     # Insert the decimal point "scale" digits from the right
82 |     if len(full_number) > scale:
83 |         dot_len = len(full_number) - scale
84 |         full_number = full_number[:dot_len] + "."
+ full_number[dot_len:] 85 | numeric_string = "-" if neg else "" 86 | numeric_string += full_number 87 | return numeric_string 88 | 89 | 90 | def get_decoded_text(bytes_data): 91 | try: 92 | decoded = bytes_data.decode('utf-8') 93 | except UnicodeDecodeError: 94 | try: 95 | decoded = bytes_data.decode('latin1') 96 | except UnicodeDecodeError: 97 | decoded = bytes_data.decode('utf-8', errors='ignore') 98 | return decoded 99 | 100 | 101 | def parse_money_type(parsed, prop_format): 102 | """ 103 | Parse and format a money value according to the specified format. 104 | 105 | Args: 106 | parsed (int): The numerical value to be parsed. 107 | prop_format (str): The format string specifying the desired format. 108 | 109 | Returns: 110 | str: The parsed and formatted money value. 111 | """ 112 | parsed = str(parsed) 113 | if prop_format == FORMAT_PERCENT: 114 | special_format = "{:.2f}%" 115 | dot_location = -2 116 | elif prop_format.startswith(FORMAT_DOLLAR): 117 | special_format = '${:,.2f}' 118 | dot_location = -4 119 | elif prop_format.startswith(FORMAT_EURO): 120 | special_format = '€{:,.2f}' 121 | dot_location = -4 122 | elif prop_format == FORMAT_GENERAL_NUMBER: 123 | special_format = '{:,.1f}' 124 | dot_location = -4 125 | elif prop_format == FORMAT_SCIENTIFIC: 126 | special_format = '{:.2e}' 127 | dot_location = -4 128 | elif prop_format in [FORMAT_FIXED, FORMAT_STANDARD]: 129 | dot_location = -4 130 | special_format = '{:,.2f}' 131 | else: 132 | LOGGER.warning(f"parse_money_type - unsupported format: {prop_format} value {parsed} may be wrong") 133 | return parsed 134 | 135 | money_float = parsed[:dot_location] + "." + parsed[dot_location:] 136 | if special_format: 137 | money_float = special_format.format(float(money_float)) 138 | return money_float 139 | 140 | 141 | def parse_type(data_type, buffer, length=None, version=3, props=None): 142 | parsed = "" 143 | # Bool or int8 144 | if data_type == TYPE_INT8: 145 | parsed = struct.unpack_from("b", buffer)[0] 146 | elif data_type == TYPE_INT16: 147 | parsed = struct.unpack_from("h", buffer)[0] 148 | elif data_type == TYPE_INT32 or data_type == TYPE_COMPLEX: 149 | parsed = struct.unpack_from("i", buffer)[0] 150 | elif data_type == TYPE_MONEY: 151 | parsed = struct.unpack_from("q", buffer)[0] 152 | if props and "Format" in props: 153 | prop_format = props['Format'] 154 | if parsed == 0: 155 | parsed = [y for x, y in FORMAT_TO_DEFAULT_VALUE.items() if prop_format.startswith(x)] 156 | if not parsed: 157 | LOGGER.warning(f"parse_type got unknown format while parsing money field {prop_format}") 158 | else: 159 | parsed = parsed[0] 160 | else: 161 | parsed = parse_money_type(parsed, prop_format) 162 | elif data_type == TYPE_FLOAT32: 163 | parsed = struct.unpack_from("f", buffer)[0] 164 | elif data_type == TYPE_FLOAT64: 165 | parsed = struct.unpack_from("d", buffer)[0] 166 | elif data_type == TYPE_DATETIME: 167 | double_datetime = struct.unpack_from("q", buffer)[0] 168 | parsed = mdb_date_to_readable(double_datetime) 169 | elif data_type == TYPE_BINARY: 170 | parsed = buffer[:length] 171 | offset = length 172 | elif data_type == TYPE_OLE: 173 | parsed = buffer 174 | elif data_type == TYPE_GUID: 175 | parsed = buffer[:16] 176 | guid = uuid.UUID(parsed.hex()) 177 | parsed = str(guid) 178 | elif data_type == TYPE_96_bit_17_BYTES: 179 | parsed = buffer[:17] 180 | elif data_type == TYPE_TEXT: 181 | if version > 3: 182 | # Looks like if BOM is present text is already decoded 183 | if buffer.startswith(b"\xfe\xff") or buffer.startswith(b"\xff\xfe"): 184 | 
buff = buffer[2:] 185 | parsed = get_decoded_text(buff) 186 | else: 187 | parsed = buffer.decode("utf-16", errors='ignore') 188 | else: 189 | parsed = get_decoded_text(buffer) 190 | 191 | if "\x00" in parsed: 192 | LOGGER.debug(f"Parsed string contains NUL (0x00) characters: {parsed}") 193 | parsed = parsed.replace("\x00", "") 194 | else: 195 | LOGGER.debug(f"parse_type - unsupported data type: {data_type}") 196 | return parsed 197 | 198 | 199 | def categorize_pages(db_data, page_size): 200 | if len(db_data) % page_size: 201 | LOGGER.warning(f"DB is not full or PAGE_SIZE is wrong. page size: {page_size} DB length {len(db_data)}") 202 | pages = {i: db_data[i:i + page_size] for i in range(0, len(db_data), page_size)} 203 | data_pages = {} 204 | table_defs = {} 205 | for page in pages: 206 | if pages[page].startswith(DATA_PAGE_MAGIC): 207 | data_pages[page] = pages[page] 208 | elif pages[page].startswith(TABLE_PAGE_MAGIC): 209 | table_defs[page] = pages[page] 210 | return table_defs, data_pages, pages 211 | 212 | 213 | def read_db_file(path): 214 | if not os.path.isfile(path): 215 | LOGGER.error(f"File {path} not found") 216 | raise FileNotFoundError(f"File {path} not found") 217 | with open(path, "rb") as f: 218 | return f.read() 219 | -------------------------------------------------------------------------------- /access_parser/parsing_primitives.py: -------------------------------------------------------------------------------- 1 | from construct import * 2 | 3 | 4 | def version_specific(version, v3_subcon, v4_subcon): 5 | """ 6 | There are some differences in the parsing structure between v3 and v4. Some fields are different length and some 7 | exist only in one of the versions. this returns the relevant parsing structure by version 8 | :param version: int 3 or 4 9 | :param v3_subcon: the parsing struct if version is 3 10 | :param v4_subcon: the parsing struct if version is 4 11 | """ 12 | if version == 3: 13 | return v3_subcon 14 | else: 15 | return v4_subcon 16 | 17 | 18 | ACCESSHEADER = Struct( 19 | Const(b'\00\x01\x00\x00'), 20 | "jet_string" / CString("utf8"), 21 | "jet_version" / Int32ul, 22 | # RC4 encrypted with key 0x6b39dac7. 
Database metadata 23 | Padding(126)) 24 | 25 | MEMO = Struct( 26 | "memo_length" / Int32ul, 27 | "record_pointer" / Int32ul, 28 | "memo_unknown" / Int32ul, 29 | "memo_end" / Tell) 30 | 31 | VERSION_3_FLAGS = BitStruct( 32 | "hyperlink" / Flag, 33 | "auto_GUID" / Flag, 34 | "unk_1" / Flag, 35 | "replication" / Flag, 36 | "unk_2" / Flag, 37 | "autonumber" / Flag, 38 | "can_be_null" / Flag, 39 | "fixed_length" / Flag) 40 | 41 | VERSION_4_FLAGS = BitStruct( 42 | "hyperlink" / Flag, 43 | "auto_GUID" / Flag, 44 | "unk_1" / Flag, 45 | "replication" / Flag, 46 | "unk_2" / Flag, 47 | "autonumber" / Flag, 48 | "can_be_null" / Flag, 49 | "fixed_length" / Flag, 50 | "unk_3" / Flag, 51 | "unk_4" / Flag, 52 | "unk_5" / Flag, 53 | 'modern_package_type' / Flag, 54 | "unk_6" / Flag, 55 | "unk_7" / Flag, 56 | "unk_8" / Flag, 57 | "compressed_unicode" / Flag) 58 | 59 | TDEF_HEADER = Struct( 60 | Const(b'\02\x01'), 61 | "peek_version" / Peek(Int16ul), 62 | "tdef_ver" / IfThenElse(lambda x: x.peek_version == b"VC", Const(b"VC"), Int16ul), 63 | "next_page_ptr" / Int32ul, 64 | "header_end" / Tell) 65 | 66 | LVPROP_CHUNK_NAMES_INT = Struct( 67 | "name_length" / Int16ul, 68 | "name" / PaddedString(this.name_length, "utf16"), 69 | ) 70 | LVPROP_CHUNK_NAMES = Struct( 71 | "names" / GreedyRange(LVPROP_CHUNK_NAMES_INT), 72 | # "leftover" / GreedyBytes 73 | ) 74 | LVPROP_DATA = Struct( 75 | "data_length" / Int16ul, 76 | "ddl_flag" / Int8ul, 77 | "type" / Int8ul, 78 | "name_index" / Int16ul, 79 | "only_data_length" / Int16ul, 80 | "actual_data" / Bytes(this.only_data_length) 81 | ) 82 | LVPROP_VALUE = Struct( 83 | "val_length" / Int32ul, 84 | "name_length" / Int16ul, 85 | "column_name" / PaddedString(this.name_length, "utf16"), 86 | "data" / GreedyRange(LVPROP_DATA), 87 | "left" / GreedyBytes 88 | ) 89 | 90 | LVPROP_CHUNK = Struct( 91 | "length" / Int32ul, 92 | "chunk_type" / Int16ul, 93 | 94 | "data" / Prefixed(Computed(this.length - 6), Switch(this.chunk_type, { 95 | # 128: GreedyRange(LVPROP_CHUNK_NAMES) 96 | 128: LVPROP_CHUNK_NAMES, 97 | 0:LVPROP_VALUE, 98 | 1: LVPROP_VALUE 99 | }, default=Bytes(this.length - 4))) 100 | ) 101 | LVPROP = Struct( 102 | #'KKD\0' in Jet3 and 'MR2\0' in Jet 4. 
103 | "magic" / Bytes(4), 104 | "chunks" / GreedyRange(LVPROP_CHUNK), 105 | "leftover" / GreedyBytes 106 | ) 107 | 108 | def parse_table_head(buffer, version=3): 109 | return Struct( 110 | "TDEF_header" / TDEF_HEADER, 111 | # Table 112 | "table_definition_length" / Int32ul, 113 | "ver4_unknown" / If(lambda x: version > 3, Int32ul), 114 | "number_of_rows" / Int32ul, 115 | "autonumber" / Int32ul, 116 | "autonumber_increment" / If(lambda x: version > 3, Int32ul), 117 | "complex_autonumber" / If(lambda x: version > 3, Int32ul), 118 | "ver4_unknown_1" / If(lambda x: version > 3, Int32ul), 119 | "ver4_unknown_2" / If(lambda x: version > 3, Int32ul), 120 | # 0x53 system table 121 | # 0x4e user table 122 | "table_type_flags" / Int8ul, 123 | "next_column_id" / Int16ul, 124 | "variable_columns" / Int16ul, 125 | "column_count" / Int16ul, 126 | "index_count" / Int32ul, 127 | "real_index_count" / Int32ul, 128 | "row_page_map" / Int32ul, 129 | "free_space_page_map" / Int32ul, 130 | "tdef_header_end" / Tell).parse(buffer) 131 | 132 | 133 | def parse_table_data(buffer, index_count, real_index_count, column_count, version=3): 134 | REAL_INDEX = Struct( 135 | "unk1" / Int32ul, 136 | "index_row_count" / Int32ul, 137 | "ver4_always_zero" / If(lambda x: version > 3, Int32ul)) 138 | 139 | VARIOUS_TEXT_V3 = Struct( 140 | "LCID" / Int16ul, 141 | "code_page" / Int16ul, 142 | "various_text3_unknown" / Int16ul) 143 | 144 | VARIOUS_TEXT_V4 = Struct( 145 | "collation" / Int16ul, 146 | "various_text4_unknown" / Int8ul, 147 | "collation_version_number" / Int8ul) 148 | 149 | VARIOUS_TEXT = VARIOUS_TEXT_V3 if version == 3 else VARIOUS_TEXT_V4 150 | 151 | VARIOUS_DEC_V3 = Struct( 152 | "various_dec3_unknown" / Int16ul, 153 | "max_number_of_digits" / Int8ul, 154 | "number_of_decimal" / Int8ul, 155 | "various_dec3_unknown2" / Int16ul) 156 | 157 | VARIOUS_DEC_V4 = Struct( 158 | "max_num_of_digits" / Int8ul, 159 | "num_of_decimal_digits" / Int8ul, 160 | "various_dec4_unknown" / Int16ul) 161 | 162 | VARIOUS_DEC = VARIOUS_DEC_V3 if version == 3 else VARIOUS_DEC_V4 163 | 164 | VARIOUS_NUMERIC_V3 = Struct("prec" / Int8ul, "scale" / Int8ul, "unknown" / Int32ul) 165 | VARIOUS_NUMERIC_V4 = Struct("prec" / Int8ul, "scale" / Int8ul, "unknown" / Int16ul) 166 | VARIOUS_NUMERIC = VARIOUS_NUMERIC_V3 if version == 3 else VARIOUS_NUMERIC_V4 167 | 168 | COLUMN = Struct( 169 | "type" / Int8ul, 170 | "ver4_unknown_3" / If(lambda x: version > 3, Int32ul), 171 | "column_id" / Int16ul, 172 | "variable_column_number" / Int16ul, 173 | "column_index" / Int16ul, 174 | "various" / Switch(lambda ctx: ctx.type, 175 | { 176 | 9: VARIOUS_TEXT, 177 | 10: VARIOUS_TEXT, 178 | 11: VARIOUS_TEXT, 179 | 12: VARIOUS_TEXT, 180 | 16: VARIOUS_NUMERIC, 181 | 182 | 1: VARIOUS_DEC, 183 | 2: VARIOUS_DEC, 184 | 3: VARIOUS_DEC, 185 | 4: VARIOUS_DEC, 186 | 5: VARIOUS_DEC, 187 | 6: VARIOUS_DEC, 188 | 7: VARIOUS_DEC, 189 | 8: VARIOUS_DEC, 190 | 191 | }, default=version_specific(version, Bytes(6), Bytes(4))), 192 | "column_flags" / version_specific(version, VERSION_3_FLAGS, VERSION_4_FLAGS), 193 | "ver4_unknown_4" / If(lambda x: version > 3, Int32ul), 194 | "fixed_offset" / Int16ul, 195 | "length" / Int16ul) 196 | 197 | COLUMN_NAMES = Struct( 198 | "col_name_len" / version_specific(version, Int8ul, Int16ul), 199 | "col_name_str" / version_specific(version, 200 | PaddedString(lambda x: x.col_name_len, encoding="utf8"), 201 | PaddedString(lambda x: x.col_name_len, encoding="utf16")), 202 | ) 203 | 204 | REAL_INDEX2 = Struct( 205 | "unknown_b1" / If(lambda x: version > 3, 
Int32ul), 206 | "unk_struct" / Array(10, Struct("col_id" / Int16ul, "idx_flags" / Int8ul)), 207 | "runk" / Int32ul, 208 | "first_index_page" / Int32ul, 209 | "flags" / Int8ul, 210 | "unknown_b3" / If(lambda x: version > 3, Padding(9))) 211 | 212 | ALL_INDEXES = Struct( 213 | "unknown_c1" / If(lambda x: version > 3, Int32ul), 214 | "idx_num" / Int32ul, 215 | "idx_col_num" / Int32ul, 216 | "rel_tbl_type" / Int8ul, 217 | "rel_idx_num" / Int32sl, 218 | "rel_tbl_page" / Int32ul, 219 | "cascade_ups" / Int8ul, 220 | "cascade_dels" / Int8ul, 221 | "idx_type" / Int8ul, 222 | "unknown_c2" / If(lambda x: version > 3, Int32ul)) 223 | 224 | INDEX_NAMES = Struct( 225 | "idx_name_len" / version_specific(version, Int8ul, Int16ul), 226 | "idx_name_str" / version_specific(version, 227 | PaddedString(lambda x: x.idx_name_len, encoding="utf8"), 228 | PaddedString(lambda x: x.idx_name_len, encoding="utf16")), 229 | ) 230 | 231 | return Struct( 232 | "real_index" / Array(real_index_count, REAL_INDEX), 233 | "column" / Array(column_count, COLUMN), 234 | "column_names" / Array(column_count, COLUMN_NAMES), 235 | "real_index_2" / Array(real_index_count, REAL_INDEX2), 236 | "all_indexes" / Array(index_count, ALL_INDEXES), 237 | "index_names" / Array(index_count, INDEX_NAMES)).parse(buffer) 238 | 239 | 240 | def parse_data_page_header(buffer, version=3): 241 | return Struct( 242 | Const(b"\x01\x01"), 243 | "data_free_space" / Int16ul, 244 | "owner" / Int32ul, 245 | "ver4_unknown_dat1" / If(lambda x: version > 3, Int32ul), 246 | "record_count" / Int16ul, 247 | "record_offsets" / Array(lambda x: x.record_count, Int16ul)).parse(buffer) 248 | 249 | 250 | # buffer should be the record data in reverse 251 | def parse_relative_object_metadata_struct(buffer, variable_jump_tables_cnt=0, version=3): 252 | return Struct( 253 | "variable_length_field_count" / version_specific(version, Int8ub, Int16ub), 254 | "variable_length_jump_table" / If(lambda x: version == 3, Array(variable_jump_tables_cnt, Int8ub)), 255 | # This currently supports up to 255 columns for versions > 3 256 | "variable_length_field_offsets" / version_specific(version, 257 | Array(lambda x: x.variable_length_field_count, Int8ub), 258 | Array(lambda x: x.variable_length_field_count & 0xff, 259 | Int16ub)), 260 | "var_len_count" / version_specific(version, Int8ub, Int16ub), 261 | "relative_metadata_end" / Tell).parse(buffer) 262 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/access_parser/access_parser.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import struct
3 | from collections import defaultdict
4 | 
5 | from construct import ConstructError
6 | from tabulate import tabulate
7 | 
8 | from .parsing_primitives import parse_relative_object_metadata_struct, parse_table_head, parse_data_page_header, \
9 |     ACCESSHEADER, MEMO, parse_table_data, TDEF_HEADER, LVPROP
10 | from .utils import categorize_pages, parse_type, TYPE_MEMO, TYPE_TEXT, TYPE_BOOLEAN, read_db_file, numeric_to_string, \
11 |     TYPE_96_bit_17_BYTES, TYPE_OLE
12 | 
13 | # Page sizes
14 | PAGE_SIZE_V3 = 0x800
15 | PAGE_SIZE_V4 = 0x1000
16 | 
17 | # Versions
18 | VERSION_3 = 0x00
19 | VERSION_4 = 0x01
20 | VERSION_5 = 0x02
21 | VERSION_2010 = 0x03
22 | 
23 | ALL_VERSIONS = {VERSION_3: 3, VERSION_4: 4, VERSION_5: 5, VERSION_2010: 2010}
24 | NEW_VERSIONS = [VERSION_4, VERSION_5, VERSION_2010]
25 | 
26 | SYSTEM_TABLE_FLAGS = [-0x80000000, -0x00000002, 0x80000000, 0x00000002]
27 | 
28 | LOGGER = logging.getLogger("access_parser")
29 | 
30 | 
31 | class TableObj(object):
32 |     def __init__(self, offset, val):
33 |         self.value = val
34 |         self.offset = offset
35 |         self.linked_pages = []
36 | 
37 | 
38 | class AccessParser(object):
39 |     def __init__(self, db_path):
40 |         self.db_data = read_db_file(db_path)
41 |         self._parse_file_header(self.db_data)
42 |         self._table_defs, self._data_pages, self._all_pages = categorize_pages(self.db_data, self.page_size)
43 |         self._tables_with_data = self._link_tables_to_data()
44 |         self.catalog = self._parse_catalog()
45 |         self.extra_props = self.parse_msys_table()
46 | 
47 |     def parse_msys_table(self):
48 |         """The MSysObjects table contains extra metadata about tables and columns, such as the Format of money fields"""
49 |         msys_table = self.parse_table("MSysObjects")
50 |         if not msys_table:
51 |             return {}
52 |         if not msys_table.get('Name') or not msys_table.get('LvProp'):
53 |             return {}
54 |         table_to_lval_memo = {key: self.parse_lvprop(value) for key, value in zip(msys_table['Name'],
55 |                                                                                   msys_table['LvProp']) if value}
56 |         return table_to_lval_memo
57 | 
58 |     def _parse_file_header(self, db_data):
59 |         """
60 |         Parse the basic file header and determine the Access DB version based on the parsing results.
61 |         :param db_data: db file data
62 |         """
63 |         try:
64 |             head = ACCESSHEADER.parse(db_data)
65 |         except ConstructError:
66 |             # This is a very minimal parsing of the header. If it fails, this is probably not a valid mdb file
67 |             raise ValueError("Failed to parse DB file header. Check that it is a valid Access database")
68 |         version = head.jet_version
69 |         if version in NEW_VERSIONS:
70 |             if version == VERSION_4:
71 |                 self.version = ALL_VERSIONS[VERSION_4]
72 |             elif version == VERSION_5:
73 |                 self.version = ALL_VERSIONS[VERSION_5]
74 |             elif version == VERSION_2010:
75 |                 self.version = ALL_VERSIONS[VERSION_2010]
76 |             self.page_size = PAGE_SIZE_V4
77 | 
78 |         else:
79 |             if not version == VERSION_3:
80 |                 LOGGER.error(f"Unknown database version {version}. Trying to parse database as version 3")
81 |             self.version = ALL_VERSIONS[VERSION_3]
82 |             self.page_size = PAGE_SIZE_V3
83 |         LOGGER.info(f"Database version {self.version}")
84 | 
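    # Example (sketch, not part of the original API): the same header parsing
    # can be used standalone to sniff a file's version and page size before
    # building a full AccessParser. The file path below is hypothetical.
    #
    #   from access_parser.parsing_primitives import ACCESSHEADER
    #   from access_parser.utils import read_db_file
    #
    #   head = ACCESSHEADER.parse(read_db_file("examples/test.mdb"))
    #   page_size = 0x800 if head.jet_version == 0x00 else 0x1000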
85 |     def _link_tables_to_data(self):
86 |         """
87 |         Link table definitions to their data pages
88 |         :return: dict of {offset : TableObj}
89 |         """
90 |         tables_with_data = {}
91 |         # Link table definitions to data.
92 |         # The offset of the table definition page divided by the page size == the owner field of a data page
93 |         for offset, data in self._data_pages.items():
94 |             try:
95 |                 parsed_dp = parse_data_page_header(data, version=self.version)
96 |             except ConstructError:
97 |                 LOGGER.error(f"Failed to parse data page {data}")
98 |                 continue
99 |             page_offset = parsed_dp.owner * self.page_size
100 |             if page_offset in self._table_defs:
101 |                 table_page_value = self._table_defs.get(parsed_dp.owner * self.page_size)
102 |                 if page_offset not in tables_with_data:
103 |                     tables_with_data[page_offset] = TableObj(page_offset, table_page_value)
104 |                 tables_with_data[page_offset].linked_pages.append(data)
105 |         return tables_with_data
106 | 
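    # Worked example (illustrative numbers): with the version 4 page size of
    # 0x1000 bytes, a data page whose header reports owner == 4 is linked to
    # the table definition page at file offset 4 * 0x1000 == 0x4000.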
107 |     def _parse_catalog(self):
108 |         """
109 |         Parse the catalog to get the DB tables and their page indexes
110 |         :return: dict {table_name : page index}
111 |         """
112 |         catalog_page = self._tables_with_data[2 * self.page_size]
113 |         access_table = AccessTable(catalog_page, self.version, self.page_size, self._data_pages, self._table_defs)
114 |         catalog = access_table.parse()
115 |         tables_mapping = {}
116 |         for i, table_name in enumerate(catalog['Name']):
117 |             # We need the MSysObjects table for metadata, so exclude it from the system table filter.
118 |             if table_name == "MSysObjects":
119 |                 tables_mapping[table_name] = catalog['Id'][i]
120 |             # Visible user tables are type 1
121 |             table_type = 1
122 |             if catalog["Type"][i] == table_type:
123 |                 # Don't parse system tables
124 |                 if not catalog["Flags"][i] in SYSTEM_TABLE_FLAGS:
125 |                     tables_mapping[table_name] = catalog['Id'][i]
126 |                 else:
127 |                     LOGGER.debug(f"Not parsing system table - {table_name}")
128 |         return tables_mapping
129 | 
130 |     def get_table(self, table_name):
131 |         table_offset = self.catalog.get(table_name)
132 |         if not table_offset:
133 |             LOGGER.error(f"Could not find table {table_name} in database")
134 |             return
135 |         table_offset = table_offset * self.page_size
136 |         table = self._tables_with_data.get(table_offset)
137 |         if not table:
138 |             table_def = self._table_defs.get(table_offset)
139 |             if table_def:
140 |                 table = TableObj(offset=table_offset, val=table_def)
141 |                 LOGGER.info(f"Table {table_name} has no data")
142 |             else:
143 |                 LOGGER.error(f"Could not find table {table_name} offset {table_offset}")
144 |                 return
145 | 
146 |         # Try to get extra metadata for the table if it exists in the MSysObjects table
147 |         props = None
148 |         if table_name != "MSysObjects" and self.extra_props and table_name in self.extra_props:
149 |             props = self.extra_props[table_name]
150 | 
151 |         return AccessTable(table, self.version, self.page_size, self._data_pages, self._table_defs, props)
152 | 
153 |     def parse_lvprop(self, lvprop_raw):
154 |         try:
155 |             parsed = LVPROP.parse(lvprop_raw)
156 |         except ConstructError:
157 |             return None
158 |         if not parsed.get("chunks"):
159 |             return None
160 |         table_names = [x.name for x in parsed.chunks[0].data.names]
161 |         # Chunk type 0 does not have a column name, so we cannot link it to a column
162 |         chunk_type_one = [x for x in parsed.chunks if x.chunk_type == 1]
163 |         reconstructed_column_data = {}
164 |         for chunk in chunk_type_one:
165 |             if not chunk.data.column_name:
166 |                 LOGGER.error("Error while parsing MSysObjects table chunk.")
167 |                 continue
168 |             data_values = {}
169 |             for dv in chunk.data.data:
170 |                 val = parse_type(dv.type, dv.actual_data, version=self.version)
171 |                 try:
172 |                     name = table_names[dv.name_index]
173 |                     data_values[name] = val
174 |                 except IndexError:
175 |                     LOGGER.error("Error while parsing MSysObjects table chunk.")
176 |                     continue
177 |             reconstructed_column_data[chunk.data.column_name] = data_values
178 |         return reconstructed_column_data
179 | 
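    # Worked example (illustrative): money values are stored as 64-bit
    # integers scaled by 10**4, so a raw value of 12345 in a column whose
    # recovered properties include {"Format": "$"} is rendered by
    # parse_money_type in utils.py as "$1.23" (the last four digits are the
    # fractional part).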
180 |     def parse_table(self, table_name):
181 |         """
182 |         Parse a table from the db.
183 |         Table names are listed in self.catalog
184 |         :return defaultdict(list) with the parsed table -- table[column][row_index]
185 |         """
186 |         return self.get_table(table_name).parse()
187 | 
188 |     def print_database(self):
189 |         """
190 |         Print data from all database tables
191 |         """
192 |         table_names = self.catalog
193 |         for table_name in table_names:
194 |             table = self.parse_table(table_name)
195 |             if not table:
196 |                 continue
197 |             print(f'TABLE NAME: {table_name}\r\n')
198 |             print(tabulate(table, headers="keys", disable_numparse=True))
199 |             print('\r\n\r\n\r\n\r\n')
200 | 
201 | 
202 | class AccessTable(object):
203 |     def __init__(self, table, version, page_size, data_pages, table_defs, props=None):
204 |         self.version = version
205 |         self.props = props
206 |         self.page_size = page_size
207 |         self._data_pages = data_pages
208 |         self._table_defs = table_defs
209 |         self.table = table
210 |         self.parsed_table = defaultdict(list)
211 |         self.columns, self.primary_keys, self.table_header = self._get_table_columns()
212 | 
213 |     def create_empty_table(self):
214 |         parsed_table = defaultdict(list)
215 |         columns, *_ = self._get_table_columns()
216 |         for i, column in columns.items():
217 |             parsed_table[column.col_name_str] = ""
218 |         return parsed_table
219 | 
220 |     def parse(self):
221 |         """
222 |         This is the main table parsing function. We go through all of the data pages linked to the table, separate each
223 |         data page into rows (records) and parse each record.
224 |         :return defaultdict(list) with the parsed data -- table[column][row_index]
225 |         """
226 |         if not self.table.linked_pages:
227 |             return self.create_empty_table()
228 |         for data_chunk in self.table.linked_pages:
229 |             original_data = data_chunk
230 |             parsed_data = parse_data_page_header(original_data, version=self.version)
231 | 
232 |             last_offset = None
233 |             for rec_offset in parsed_data.record_offsets:
234 |                 # Deleted row - just skip it
235 |                 if rec_offset & 0x8000:
236 |                     last_offset = rec_offset & 0xfff
237 |                     continue
238 |                 # Overflow page
239 |                 if rec_offset & 0x4000:
240 |                     # An overflow ptr is 4 bits of flags, 12 bits of ptr
241 |                     rec_ptr_offset = rec_offset & 0xfff
242 |                     # Update the last pointer to the pointer without flags
243 |                     last_offset = rec_ptr_offset
244 |                     # The ptr is the offset in the current data page. We get a 4-byte record_pointer from that
245 |                     overflow_rec_ptr = original_data[rec_ptr_offset:rec_ptr_offset + 4]
246 |                     overflow_rec_ptr = struct.unpack("<I", overflow_rec_ptr)[0]
247 |                     record = self._get_overflow_record(overflow_rec_ptr)
248 |                     if record:
249 |                         self._parse_row(record)
250 |                     continue
251 |                 # Regular record: it starts at rec_offset and ends at the previous
252 |                 # record's offset (records are laid out from the end of the page)
253 |                 if last_offset:
254 |                     record = original_data[rec_offset:last_offset]
255 |                 else:
256 |                     record = original_data[rec_offset:]
257 |                 last_offset = rec_offset
258 |                 if record:
259 |                     self._parse_row(record)
260 |         return self.parsed_table
261 | 
262 |     def _parse_row(self, record):
263 |         """
264 |         Parse a single record (row) from a data page
265 |         :param record: raw record data
266 |         """
267 |         original_record = record
268 |         reverse_record = record[::-1]
269 |         null_table_len = (self.table_header.column_count + 7) // 8
270 |         if null_table_len and null_table_len < len(record):
271 |             null_table = record[-null_table_len:]
272 |             # The null table is a bitmap; expand it into a list of booleans
273 |             null_table = [(null_table[i // 8] & (1 << (i % 8))) != 0 for i in range(len(null_table) * 8)]
274 |         else:
275 |             LOGGER.error(f"Failed to parse null table, column count {self.table_header.column_count}")
276 |             return
277 | 
278 |         if self.version > 3:
279 |             field_count = struct.unpack_from("h", record)[0]
280 |             record = record[2:]
281 |         else:
282 |             field_count = struct.unpack_from("b", record)[0]
283 |             record = record[1:]
284 | 
285 |         relative_records_column_map = {}
286 |         # Iterate columns
287 |         for i, column in self.columns.items():
288 |             # Fixed length columns are handled before variable length.
If this is a variable length column add it to 289 | # mapping and continue 290 | if not column.column_flags.fixed_length: 291 | relative_records_column_map[i] = column 292 | continue 293 | 294 | self._parse_fixed_length_data(record, column, null_table) 295 | if relative_records_column_map: 296 | relative_records_column_map = dict(sorted(relative_records_column_map.items())) 297 | metadata = self._parse_dynamic_length_records_metadata(reverse_record, original_record, 298 | null_table_len) 299 | if not metadata: 300 | return 301 | if metadata.variable_length_field_offsets: 302 | self._parse_dynamic_length_data(original_record, metadata, relative_records_column_map, null_table) 303 | 304 | def _parse_fixed_length_data(self, original_record, column, null_table): 305 | """ 306 | Parse fixed-length data from record 307 | :param original_record: unmodified record 308 | :param column: column this data belongs to 309 | :param null_table: null table of the row 310 | """ 311 | column_name = column.col_name_str 312 | # The null table indicates null values in the row. 313 | # The only exception is BOOL fields which are encoded in the null table 314 | has_value = True 315 | if column.column_id > len(null_table): 316 | LOGGER.warning("Invalid null table. Bool values may be wrong, deleted values may be shown in the db.") 317 | if column.type == TYPE_BOOLEAN: 318 | has_value = None 319 | else: 320 | has_value = null_table[column.column_id] 321 | # Boolean fields are encoded in the null table 322 | if column.type == TYPE_BOOLEAN: 323 | parsed_type = has_value 324 | else: 325 | if column.fixed_offset > len(original_record): 326 | LOGGER.error(f"Column offset is bigger than the length of the record {column.fixed_offset}") 327 | return 328 | record = original_record[column.fixed_offset:] 329 | parsed_type = parse_type(column.type, record, version=self.version, props=column.extra_props or None) 330 | if not has_value: 331 | self.parsed_table[column_name].append(None) 332 | return 333 | self.parsed_table[column_name].append(parsed_type) 334 | 335 | def _parse_dynamic_length_records_metadata(self, reverse_record, original_record, null_table_length): 336 | """ 337 | parse the metadata of relative records. The metadata used to parse relative records is found at the end of the 338 | record so reverse_record is used for parsing from the bottom up. 339 | :param reverse_record: original record in reverse 340 | :param original_record: unmodified record 341 | :param null_table_length: 342 | :return: parsed relative record metadata 343 | """ 344 | if self.version > 3: 345 | reverse_record = reverse_record[null_table_length:] 346 | return parse_relative_object_metadata_struct(reverse_record, version=self.version) 347 | # Parse relative metadata. 
348 |         # Metadata is at the end of the record (reverse_record is used here)
349 |         variable_length_jump_table_cnt = (len(original_record) - 1) // 256
350 |         reverse_record = reverse_record[null_table_length:]
351 |         try:
352 |             relative_record_metadata = parse_relative_object_metadata_struct(reverse_record,
353 |                                                                              variable_length_jump_table_cnt,
354 |                                                                              self.version)
355 |             # relative_record_metadata = RELATIVE_OBJS.parse(reverse_record)
356 |             # We use this offset in original_record, so we have to account for the null table length
357 |             relative_record_metadata.relative_metadata_end = relative_record_metadata.relative_metadata_end + null_table_length
358 |         except ConstructError:
359 |             relative_record_metadata = None
360 |             LOGGER.error("Failed parsing record")
361 | 
362 |         if relative_record_metadata and \
363 |                 relative_record_metadata.variable_length_field_count != self.table_header.variable_columns:
364 | 
365 |             # Best effort - try to find the variable column count in the record and parse from there.
366 |             # This is limited to the first 10 bytes to reduce false positives.
367 |             # Most of the time I've seen this, there was an extra DWORD before the actual metadata
368 |             metadata_start = reverse_record.find(bytes([self.table_header.variable_columns]))
369 |             if metadata_start != -1 and metadata_start < 10:
370 |                 reverse_record = reverse_record[metadata_start:]
371 |                 try:
372 |                     relative_record_metadata = parse_relative_object_metadata_struct(reverse_record,
373 |                                                                                      variable_length_jump_table_cnt,
374 |                                                                                      self.version)
375 |                 except ConstructError:
376 |                     LOGGER.error(f"Failed to parse record metadata: {original_record}")
377 |                 relative_record_metadata.relative_metadata_end = relative_record_metadata.relative_metadata_end + \
378 |                                                                  metadata_start
379 |             else:
380 |                 LOGGER.warning(
381 |                     f"Record did not parse correctly. Number of columns: {self.table_header.variable_columns}"
382 |                     f" number of parsed columns: {relative_record_metadata.variable_length_field_count}")
383 |                 return None
384 |         return relative_record_metadata
385 | 
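    # Worked example (illustrative): for a version 3 record where
    # variable_length_field_offsets == [0x10, 0x18] and var_len_count == 0x20,
    # the first variable-length field is original_record[0x10:0x18] and the
    # second is original_record[0x18:0x20]. Offsets are single bytes in v3, so
    # whenever a field index appears in variable_length_jump_table the real
    # offset grows by 0x100 (tracked by jump_table_addition below).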
386 |     def _parse_dynamic_length_data(self, original_record, relative_record_metadata,
387 |                                    relative_records_column_map, null_table):
388 |         """
389 |         Parse dynamic (non-fixed-length) fields from a row
390 |         :param original_record: full unmodified record
391 |         :param relative_record_metadata: parsed record metadata
392 |         :param relative_records_column_map: relative records column mapping {index: column}
393 |         :param null_table: list indicating which columns have a null value
394 |         """
395 |         relative_offsets = relative_record_metadata.variable_length_field_offsets
396 |         jump_table_addition = 0
397 |         for i, column_index in enumerate(relative_records_column_map):
398 |             column = relative_records_column_map[column_index]
399 |             col_name = column.col_name_str
400 |             has_value = True
401 |             if column.column_id > len(null_table):
402 |                 LOGGER.warning("Invalid null table. Null values may be shown in the db.")
403 |             else:
404 |                 has_value = null_table[column.column_id]
405 |             if not has_value:
406 |                 self.parsed_table[col_name].append(None)
407 |                 continue
408 | 
409 |             if self.version == 3:
410 |                 if i in relative_record_metadata.variable_length_jump_table:
411 |                     jump_table_addition += 0x100
412 |             rel_start = relative_offsets[i]
413 |             # If this is the last field, use var_len_count as the end offset
414 |             if i + 1 == len(relative_offsets):
415 |                 rel_end = relative_record_metadata.var_len_count
416 |             else:
417 |                 rel_end = relative_offsets[i + 1]
418 | 
419 |             # If rel_start and rel_end are the same, there is no data in this slot
420 |             if rel_start == rel_end:
421 |                 self.parsed_table[col_name].append("")
422 |                 continue
423 | 
424 |             relative_obj_data = original_record[rel_start + jump_table_addition: rel_end + jump_table_addition]
425 |             # Parse types that require column data here, call parse_type on all other types
426 |             if column.type == TYPE_MEMO:
427 |                 try:
428 |                     parsed_type = self._parse_memo(relative_obj_data)
429 |                 except ConstructError:
430 |                     LOGGER.warning("Failed to parse memo field. Using data as bytes")
431 |                     parsed_type = relative_obj_data
432 |             elif column.type == TYPE_OLE:
433 |                 try:
434 |                     parsed_type = self._parse_memo(relative_obj_data, return_raw=True)
435 |                 except ConstructError:
436 |                     LOGGER.warning("Failed to parse OLE field. Using data as bytes")
437 |                     parsed_type = relative_obj_data
438 |             elif column.type == TYPE_96_bit_17_BYTES:
439 |                 if len(relative_obj_data) != 17:
440 |                     LOGGER.warning(f"Relative numeric field has invalid length {len(relative_obj_data)}, expected 17")
441 |                     parsed_type = relative_obj_data
442 |                 else:
443 |                     # Get the scale (number of decimal digits), defaulting to 6
444 |                     scale = column.get('various', {}).get('scale', 6)
445 |                     parsed_type = numeric_to_string(relative_obj_data, scale)
446 |             else:
447 |                 parsed_type = parse_type(column.type, relative_obj_data, len(relative_obj_data), version=self.version)
448 |             self.parsed_table[col_name].append(parsed_type)
449 | 
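    # Worked example (illustrative): an overflow/LVAL record pointer packs the
    # record slot into its low byte and the page number into the remaining
    # bytes, so a pointer of 0x00000305 means record slot 5 on data page 3,
    # i.e. the page starting at file offset 3 * page_size (used by _parse_memo
    # and _get_overflow_record below).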
450 |     def _get_table_columns(self):
451 |         """
452 |         Parse columns for a specific table
453 |         """
454 |         try:
455 |             table_header = parse_table_head(self.table.value, version=self.version)
456 |             merged_data = self.table.value[table_header.tdef_header_end:]
457 |             if table_header.TDEF_header.next_page_ptr:
458 |                 merged_data = merged_data + self._merge_table_data(table_header.TDEF_header.next_page_ptr)
459 | 
460 |             parsed_data = parse_table_data(
461 |                 merged_data,
462 |                 table_header.index_count,
463 |                 table_header.real_index_count,
464 |                 table_header.column_count,
465 |                 version=self.version,
466 |             )
467 | 
468 |             # Merge data back into table_header
469 |             table_header['column'] = parsed_data['column']
470 |             table_header['column_names'] = parsed_data['column_names']
471 |             table_header['real_index_2'] = parsed_data['real_index_2']
472 |             table_header["all_indexes"] = parsed_data["all_indexes"]
473 |             table_header["index_names"] = parsed_data["index_names"]
474 | 
475 |         except ConstructError:
476 |             LOGGER.error(f"Failed to parse table header {self.table.value}")
477 |             return
478 |         col_names = table_header.column_names
479 |         columns = table_header.column
480 | 
481 |         # Add names to the column metadata, so we can use only columns for parsing
482 |         for i, c in enumerate(columns):
483 |             c.col_name_str = col_names[i].col_name_str
484 |             c.extra_props = None
485 | 
486 |         # column_index is more accurate (id is always incremented, so it is wrong when a column is deleted).
487 |         # Some tables, like the catalog, don't have an index, so if the indexes are 0 use id.
488 | 
489 |         # Create a dict of index to column for easier access. The offset makes the mapping zero-based
490 |         offset = min(x.column_index for x in columns)
491 |         column_dict = {x.column_index - offset: x for x in columns}
492 |         # If the column index is not unique, try best effort
493 |         if len(column_dict) != len(columns):
494 |             # Create a dict of id to column instead
495 |             column_dict = {x.column_id: x for x in columns}
496 | 
497 |         # Add the extra properties relevant for the column
498 |         if self.props:
499 |             for i, col in column_dict.items():
500 |                 if col.col_name_str in self.props:
501 |                     col.extra_props = self.props[col.col_name_str]
502 | 
503 |         primary_keys = [
504 |             column_dict[col.col_id].col_name_str
505 |             for idx in table_header.all_indexes
506 |             for col in table_header.real_index_2[idx.idx_col_num].unk_struct
507 |             if idx.idx_type == 1 and col.col_id ^ 0xFFFF
508 |         ]
509 | 
510 |         if len(column_dict) != table_header.column_count:
511 |             LOGGER.debug(f"expected {table_header.column_count} columns got {len(column_dict)}")
512 |         return column_dict, primary_keys, table_header
513 | 
514 |     def _merge_table_data(self, first_page):
515 |         """
516 |         Merge the data of TDEF pages when the table definition does not fit in one page
517 |         :param first_page: page index of the first linked TDEF page
518 |         :return: merged data from all linked table definitions
519 |         """
520 |         table = self._table_defs.get(first_page * self.page_size)
521 |         parsed_header = TDEF_HEADER.parse(table)
522 |         data = table[parsed_header.header_end:]
523 |         while parsed_header.next_page_ptr:
524 |             table = self._table_defs.get(parsed_header.next_page_ptr * self.page_size)
525 |             parsed_header = TDEF_HEADER.parse(table)
526 |             data = data + table[parsed_header.header_end:]
527 |         return data
528 | 
529 |     def _parse_memo(self, relative_obj_data, return_raw=False):
530 |         LOGGER.debug(f"Parsing memo field {relative_obj_data}")
531 |         parsed_memo = MEMO.parse(relative_obj_data)
532 |         memo_type = TYPE_TEXT
533 |         if parsed_memo.memo_length & 0x80000000:
534 |             LOGGER.debug("memo data inline")
535 |             inline_memo_length = parsed_memo.memo_length & 0x3FFFFFFF
536 |             if len(relative_obj_data) < parsed_memo.memo_end + inline_memo_length:
537 |                 LOGGER.warning("Inline memo field has invalid length, using full data")
538 |                 memo_data = relative_obj_data[parsed_memo.memo_end:]
539 |             else:
540 |                 memo_data = relative_obj_data[parsed_memo.memo_end:parsed_memo.memo_end + inline_memo_length]
541 | 
542 |         elif parsed_memo.memo_length & 0x40000000:
543 |             LOGGER.debug("LVAL type 1")
544 |             memo_data = self._get_overflow_record(parsed_memo.record_pointer)
545 |         else:
546 |             LOGGER.debug("LVAL type 2")
547 |             rec_data = self._get_overflow_record(parsed_memo.record_pointer)
548 |             next_page = struct.unpack("I", rec_data[:4])[0]
549 |             # LVAL2 data spans multiple pages. The first 4 bytes of each page point to the next record, then the data follows.
550 |             # Concatenate the data until we get a 0 next_page.
551 | memo_data = b"" 552 | while next_page: 553 | memo_data += rec_data[4:] 554 | rec_data = self._get_overflow_record(next_page) 555 | next_page = struct.unpack("I", rec_data[:4])[0] 556 | memo_data += rec_data[4:] 557 | if memo_data: 558 | if return_raw: 559 | return memo_data 560 | parsed_type = parse_type(memo_type, memo_data, len(memo_data), version=self.version) 561 | return parsed_type 562 | 563 | def _get_overflow_record(self, record_pointer): 564 | """ 565 | Get the actual record from a record pointer 566 | :param record_pointer: 567 | :return: record 568 | """ 569 | record_offset = record_pointer & 0xff 570 | page_num = record_pointer >> 8 571 | record_page = self._data_pages.get(page_num * self.page_size) 572 | if not record_page: 573 | LOGGER.warning(f"Could not find overflow record data page overflow pointer: {record_pointer}") 574 | return 575 | parsed_data = parse_data_page_header(record_page, version=self.version) 576 | if record_offset > len(parsed_data.record_offsets): 577 | LOGGER.warning("Failed parsing overflow record offset") 578 | return 579 | start = parsed_data.record_offsets[record_offset] 580 | if start & 0x8000: 581 | start = start & 0xfff 582 | else: 583 | LOGGER.debug(f"Overflow record flag is not present {start}") 584 | if record_offset == 0: 585 | record = record_page[start:] 586 | else: 587 | end = parsed_data.record_offsets[record_offset - 1] 588 | if end & 0x8000 and (end & 0xff != 0): 589 | end = end & 0xfff 590 | record = record_page[start: end] 591 | return record 592 | --------------------------------------------------------------------------------