├── .gitignore ├── LICENSE ├── README.md ├── pandas_access └── __init__.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | PandasAccess.iml 3 | *.pyc 4 | dist/ 5 | MANIFEST 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 John Bjorn Nelson (tw: @generativist) 2 | 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | this software and associated documentation files (the "Software"), to deal in 6 | the Software without restriction, including without limitation the rights to 7 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 8 | of the Software, and to permit persons to whom the Software is furnished to do 9 | so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # What is this? 
2 | 3 | A tiny, `subprocess`-based tool for reading a 4 | [MS Access](https://products.office.com/en-us/access) 5 | database (`.mdb`) as a [Pandas](http://pandas.pydata.org/) 6 | [DataFrame](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html). 7 | 8 | ## Installation 9 | 10 | To read the database, this package (thinly!) wraps 11 | [MDBTools](http://mdbtools.sourceforge.net/). Since I assume you're already 12 | using Pandas, it should be your only installation requirement. 13 | 14 | If you are on `OSX`, install it via [Homebrew](http://brew.sh/): 15 | 16 | ```sh 17 | $ brew install mdbtools 18 | ``` 19 | Then, do, 20 | ```sh 21 | $ pip install pandas_access 22 | ``` 23 | 24 | ## Usage 25 | 26 | ```python 27 | import pandas_access as mdb 28 | 29 | # Listing the tables. 30 | for tbl in mdb.list_tables("my.mdb"): 31 | print(tbl) 32 | 33 | # Read a small table. 34 | df = mdb.read_table("my.mdb", "MyTable") 35 | 36 | # Read a huge table. 37 | accumulator = [] 38 | for chunk in mdb.read_table("my.mdb", "MyTable", chunksize=10000): 39 | accumulator.append(f(chunk)) 40 | ``` 41 | 42 | If you need more power than this, see: 43 | [pyodbc](https://github.com/mkleehammer/pyodbc). 44 | 45 | ## Testing 46 | 47 | I needed this code in a quick pinch -- I had no access to MS Access, and I had 48 | a single `.mdb` file. If someone with Access would like to create a tiny 49 | database for unit-testing purposes, I'd be much obliged. 
# ---------------------------------------------------------------------------
# /pandas_access/__init__.py
# ---------------------------------------------------------------------------
"""Read MS Access tables into pandas by shelling out to MDBTools."""
import re
import subprocess

import numpy as np
import pandas as pd

# Not used by the functions below; kept so the name stays importable from
# this module exactly as before.
try:
    from StringIO import StringIO as BytesIO
except ImportError:
    from io import BytesIO


# One ``CREATE TABLE [name] ( ... );`` statement.  Group 1 is the table name
# (anything up to the closing bracket, so names with spaces are accepted),
# group 2 is the column-definition body including the closing parenthesis.
TABLE_RE = re.compile(r"CREATE TABLE \[([^\]]+)\]\s+\((.*?\));",
                      re.MULTILINE | re.DOTALL)

# One ``[column]  Type`` line.  The trailing comma is optional so a final
# column definition written without one is still captured, and the type is
# anchored to the end of the line so ``Numeric (18, 2)`` is not cut at its
# inner comma.
DEF_RE = re.compile(r"\s*\[([^\]]+)\]\s*(.*?)\s*,?\s*$")


def list_tables(rdb_file, encoding="latin-1"):
    """
    List the tables of an MS Access database.

    :param rdb_file: The MS Access database file.
    :param encoding: The content encoding of the output. I assume `latin-1`
        because so many of MS files have that encoding. But, MDBTools may
        actually be UTF-8.
    :return: A list of the tables in a given database (empty if there are
        none).
    :raises subprocess.CalledProcessError: if `mdb-tables` exits non-zero.
    """
    # `-1` asks mdb-tables for one name per line; the default output is
    # space-delimited, which cannot represent table names containing spaces.
    output = subprocess.check_output(['mdb-tables', '-1', rdb_file])
    return [name for name in output.decode(encoding).splitlines()
            if name.strip()]


def _extract_dtype(data_type):
    """
    Map an Access column type string onto a NumPy scalar type.

    :param data_type: the type text captured from the schema DDL.
    :return: `np.float64` for types starting with "double", `np.int_` for
        types starting with "long", otherwise `None`.
    """
    # Note, this list is surely incomplete. But, I only had one .mdb file
    # at the time of creation. If you see a new data-type, patch-pull or just
    # open an issue.
    data_type = data_type.lower()
    if data_type.startswith('double'):
        # `np.float_` was an alias of `np.float64` and no longer exists in
        # NumPy 2.0, so name the concrete type.
        return np.float64
    if data_type.startswith('long'):
        return np.int_
    return None


def _extract_defs(defs_str):
    """
    Parse the body of one `CREATE TABLE` statement.

    :param defs_str: the text captured by group 2 of `TABLE_RE`.
    :return: a dictionary of column -> access_data_type; lines that do not
        look like a column definition are skipped.
    """
    defs = {}
    for line in defs_str.splitlines():
        m = DEF_RE.match(line)
        if m:
            defs[m.group(1)] = m.group(2)
    return defs


def read_schema(rdb_file, encoding='utf8'):
    """
    Read the database schema by parsing the output of `mdb-schema`.

    :param rdb_file: The MS Access database file.
    :param encoding: The schema encoding. I'm almost positive that MDBTools
        spits out UTF-8, exclusively.
    :return: a dictionary of table -> column -> access_data_type
    :raises subprocess.CalledProcessError: if `mdb-schema` exits non-zero.
    """
    output = subprocess.check_output(['mdb-schema', rdb_file])
    lines = output.decode(encoding).splitlines()
    # Drop blank lines and lines starting with '-' before matching
    # statements.
    schema_ddl = "\n".join(
        line for line in lines if line and not line.startswith('-'))

    return {table: _extract_defs(defs)
            for table, defs in TABLE_RE.findall(schema_ddl)}


def to_pandas_schema(schema, implicit_string=True):
    """
    Translate Access type names into dtypes usable by `pd.read_csv`.

    :param schema: the output of `read_schema`
    :param implicit_string: mark strings and unknown dtypes as `np.str_`.
        When false, such columns are left out of the result.
    :return: a dictionary of table -> column -> np.dtype
    """
    pd_schema = {}
    for tbl, defs in schema.items():
        sub_schema = {}
        for column, data_type in defs.items():
            dtype = _extract_dtype(data_type)
            if dtype is not None:
                sub_schema[column] = dtype
            elif implicit_string:
                sub_schema[column] = np.str_
        pd_schema[tbl] = sub_schema
    return pd_schema


def read_table(rdb_file, table_name, *args, **kwargs):
    """
    Read a MS Access database as a Pandas DataFrame.

    Unless you set `converters_from_schema=False`, this function assumes you
    want to infer the schema from the Access database's schema. This sets the
    `dtype` argument of `read_csv`, which makes things much faster, in most
    cases. If you set the `dtype` keyword argument also, it overrides
    inferences: a dictionary is merged over the inferred per-column dtypes,
    any other value replaces them entirely. The `schema_encoding` keyword
    argument passes through to `read_schema`. The `implicit_string` argument
    passes through to `to_pandas_schema`.

    I recommend setting `chunksize=k`, where k is some reasonable number of
    rows. This is a simple interface, that doesn't do basic things like
    counting the number of rows ahead of time. You may inadvertently start
    reading a 100TB file into memory. (Although, being a MS product, I assume
    the Access format breaks after 2^32 bytes -- har, har.)

    :param rdb_file: The MS Access database file.
    :param table_name: The name of the table to process.
    :param args: positional arguments passed to `pd.read_csv`
    :param kwargs: keyword arguments passed to `pd.read_csv`
    :return: a pandas `DataFrame` (or, `TextFileReader` if you set
        `chunksize=k`)
    """
    # Always consume the wrapper-only options so they never reach
    # `pd.read_csv`, which would reject them as unknown keywords.
    infer_from_schema = kwargs.pop('converters_from_schema', True)
    schema_encoding = kwargs.pop('schema_encoding', 'utf8')
    implicit_string = kwargs.pop('implicit_string', True)

    if infer_from_schema:
        specified_dtypes = kwargs.pop('dtype', None)
        if specified_dtypes is None or isinstance(specified_dtypes, dict):
            schemas = to_pandas_schema(
                read_schema(rdb_file, schema_encoding), implicit_string)
            # A table the schema parser did not recognise simply gets no
            # inferred dtypes instead of raising KeyError.
            dtypes = dict(schemas.get(table_name, {}))
            dtypes.update(specified_dtypes or {})
            if dtypes:
                kwargs['dtype'] = dtypes
        else:
            # A single dtype for every column cannot be merged per-column;
            # the caller's choice wins outright.
            kwargs['dtype'] = specified_dtypes

    cmd = ['mdb-export', rdb_file, table_name]
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)

    if kwargs.get('chunksize') is not None or kwargs.get('iterator'):
        # The returned reader pulls from the pipe lazily, so the child must
        # stay alive after this function returns.
        return pd.read_csv(proc.stdout, *args, **kwargs)

    try:
        return pd.read_csv(proc.stdout, *args, **kwargs)
    finally:
        # Close our end first so a child still writing cannot block forever,
        # then reap it so it does not linger as a zombie.
        proc.stdout.close()
        proc.wait()


# ---------------------------------------------------------------------------
# /requirements.txt
# ---------------------------------------------------------------------------
# pandas


# ---------------------------------------------------------------------------
# /setup.py
# ---------------------------------------------------------------------------
# Guarded so that importing this concatenated listing never triggers
# packaging; the script still runs when executed directly.
if __name__ == "__main__":
    import os

    # distutils was removed from the standard library in Python 3.12; prefer
    # setuptools and fall back only where it is unavailable.
    try:
        from setuptools import setup
    except ImportError:
        from distutils.core import setup

    README_FILE = os.path.join(os.path.dirname(__file__), 'README.md')

    with open(README_FILE) as readme:
        long_description = readme.read()

    setup(
        name="pandas_access",
        version="0.0.1",
        packages=["pandas_access"],  # Basically, reserve that namespace.
        license="MIT",
        classifiers=["License :: OSI Approved :: MIT License"],
        author="John Bjorn Nelson",
        author_email="jbn@abreka.com",
        description="A tiny, subprocess-based tool for reading a MS Access "
                    "database (.mdb) as a Pandas DataFrame.",
        long_description=long_description,
        data_files=['README.md'],
        url="https://github.com/jbn/pandas_access",
    )