├── .gitignore
├── LICENSE
├── README.md
├── pandas_access
    └── __init__.py
├── requirements.txt
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | PandasAccess.iml
3 | *.pyc
4 | dist/
5 | MANIFEST
6 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2016 John Bjorn Nelson (tw: @generativist)
 2 | 
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 5 | this software and associated documentation files (the "Software"), to deal in
 6 | the Software without restriction, including without limitation the rights to
 7 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 8 | of the Software, and to permit persons to whom the Software is furnished to do
 9 | so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in all
12 | copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 | SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # What is this?
 2 | 
 3 | A tiny, `subprocess`-based tool for reading a 
 4 | [MS Access](https://products.office.com/en-us/access) 
 5 | database (`.mdb`) as a [Pandas](http://pandas.pydata.org/) 
 6 | [DataFrame](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html). 
 7 | 
 8 | ## Installation
 9 | 
10 | To read the database, this package (thinly!) wraps 
11 | [MDBTools](http://mdbtools.sourceforge.net/). Since I assume you're already 
12 | using Pandas, MDBTools should be your only additional installation requirement. 
13 | 
14 | If you are on `OSX`, install it via [Homebrew](http://brew.sh/):
15 | 
16 | ```sh
17 | $ brew install mdbtools
18 | ```
19 | Then, do,
20 | ```sh
21 | $ pip install pandas_access
22 | ```
23 | 
24 | ## Usage
25 | 
26 | ```python
27 | import pandas_access as mdb
28 | 
29 | # Listing the tables.
30 | for tbl in mdb.list_tables("my.mdb"):
31 |     print(tbl)
32 |     
33 | # Read a small table.
34 | df = mdb.read_table("my.mdb", "MyTable")
35 | 
36 | # Read a huge table.
37 | accumulator = []
38 | for chunk in mdb.read_table("my.mdb", "MyTable", chunksize=10000):
39 |     accumulator.append(f(chunk))
40 | ```
41 | 
42 | If you need more power than this, see: 
43 | [pyodbc](https://github.com/mkleehammer/pyodbc).
44 | 
45 | ## Testing
46 | 
47 | I needed this code in a quick pinch -- I had no access to MS Access, and I had
48 | a single `.mdb` file. If someone with Access would like to create a tiny 
49 | database for unit-testing purposes, I'd be much obliged. 
50 | 
51 | 


--------------------------------------------------------------------------------
/pandas_access/__init__.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import subprocess
  3 | import pandas as pd
  4 | import numpy as np
  5 | try:
  6 |     from StringIO import StringIO as BytesIO
  7 | except ImportError:
  8 |     from io import BytesIO
  9 | 
 10 | 
 11 | TABLE_RE = re.compile("CREATE TABLE \[(\w+)\]\s+\((.*?\));",
 12 |                       re.MULTILINE | re.DOTALL)
 13 | 
 14 | DEF_RE = re.compile("\s*\[(\w+)\]\s*(.*?),")
 15 | 
 16 | 
 17 | def list_tables(rdb_file, encoding="latin-1"):
 18 |     """
 19 |     :param rdb_file: The MS Access database file.
 20 |     :param encoding: The content encoding of the output. I assume `latin-1`
 21 |         because so many of MS files have that encoding. But, MDBTools may
 22 |         actually be UTF-8.
 23 |     :return: A list of the tables in a given database.
 24 |     """
 25 |     tables = subprocess.check_output(['mdb-tables', rdb_file]).decode(encoding)
 26 |     return tables.strip().split(" ")
 27 | 
 28 | 
 29 | def _extract_dtype(data_type):
 30 |     # Note, this list is surely incomplete. But, I only had one .mdb file
 31 |     # at the time of creation. If you see a new data-type, patch-pull or just
 32 |     # open an issue.
 33 |     data_type = data_type.lower()
 34 |     if data_type.startswith('double'):
 35 |         return np.float_
 36 |     elif data_type.startswith('long'):
 37 |         return np.int_
 38 |     else:
 39 |         return None
 40 | 
 41 | 
 42 | def _extract_defs(defs_str):
 43 |     defs = {}
 44 |     lines = defs_str.splitlines()
 45 |     for line in lines:
 46 |         m = DEF_RE.match(line)
 47 |         if m:
 48 |             defs[m.group(1)] = m.group(2)
 49 |     return defs
 50 | 
 51 | 
 52 | def read_schema(rdb_file, encoding='utf8'):
 53 |     """
 54 |     :param rdb_file: The MS Access database file.
 55 |     :param encoding: The schema encoding. I'm almost positive that MDBTools
 56 |         spits out UTF-8, exclusively.
 57 |     :return: a dictionary of table -> column -> access_data_type
 58 |     """
 59 |     output = subprocess.check_output(['mdb-schema', rdb_file])
 60 |     lines = output.decode(encoding).splitlines()
 61 |     schema_ddl = "\n".join(l for l in lines if l and not l.startswith('-'))
 62 | 
 63 |     schema = {}
 64 |     for table, defs in TABLE_RE.findall(schema_ddl):
 65 |         schema[table] = _extract_defs(defs)
 66 | 
 67 |     return schema
 68 | 
 69 | 
 70 | def to_pandas_schema(schema, implicit_string=True):
 71 |     """
 72 |     :param schema: the output of `read_schema`
 73 |     :param implicit_string: mark strings and unknown dtypes as `np.str_`.
 74 |     :return: a dictionary of table -> column -> np.dtype
 75 |     """
 76 |     pd_schema = {}
 77 |     for tbl, defs in schema.items():
 78 |         pd_schema[tbl] = None
 79 |         sub_schema = {}
 80 |         for column, data_type in defs.items():
 81 |             dtype = _extract_dtype(data_type)
 82 |             if dtype is not None:
 83 |                 sub_schema[column] = dtype
 84 |             elif implicit_string:
 85 |                 sub_schema[column] = np.str_
 86 |         pd_schema[tbl] = sub_schema
 87 |     return pd_schema
 88 | 
 89 | 
 90 | def read_table(rdb_file, table_name, *args, **kwargs):
 91 |     """
 92 |     Read a MS Access database as a Pandas DataFrame.
 93 | 
 94 |     Unless you set `converters_from_schema=False`, this function assumes you
 95 |     want to infer the schema from the Access database's schema. This sets the
 96 |     `dtype` argument of `read_csv`, which makes things much faster, in most
 97 |     cases. If you set the `dtype` keyword argument also, it overrides
 98 |     inferences. The `schema_encoding keyword argument passes through to
 99 |     `read_schema`. The `implicit_string` argument passes through to
100 |     `to_pandas_schema`.
101 | 
102 |     I recommend setting `chunksize=k`, where k is some reasonable number of
103 |     rows. This is a simple interface, that doesn't do basic things like
104 |     counting the number of rows ahead of time. You may inadvertently start
105 |     reading a 100TB file into memory. (Although, being a MS product, I assume
106 |     the Access format breaks after 2^32 bytes -- har, har.)
107 | 
108 |     :param rdb_file: The MS Access database file.
109 |     :param table_name: The name of the table to process.
110 |     :param args: positional arguments passed to `pd.read_csv`
111 |     :param kwargs: keyword arguments passed to `pd.read_csv`
112 |     :return: a pandas `DataFrame` (or, `TextFileReader` if you set
113 |         `chunksize=k`)
114 |     """
115 |     if kwargs.pop('converters_from_schema', True):
116 |         specified_dtypes = kwargs.pop('dtype', {})
117 |         schema_encoding = kwargs.pop('schema_encoding', 'utf8')
118 |         schemas = to_pandas_schema(read_schema(rdb_file, schema_encoding),
119 |                                    kwargs.pop('implicit_string', True))
120 |         dtypes = schemas[table_name]
121 |         dtypes.update(specified_dtypes)
122 |         if dtypes != {}:
123 |             kwargs['dtype'] = dtypes
124 | 
125 |     cmd = ['mdb-export', rdb_file, table_name]
126 |     proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
127 |     return pd.read_csv(proc.stdout, *args, **kwargs)
128 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from distutils.core import setup
 3 | 
 4 | 
 5 | README_FILE = os.path.join(os.path.dirname(__file__), 'README.md')
 6 | 
 7 | setup(
 8 |     name="pandas_access",
 9 |     version="0.0.1",
10 |     packages=["pandas_access"], # Basically, reserve that namespace.
11 |     license="License :: OSI Approved :: MIT License",
12 |     author="John Bjorn Nelson",
13 |     author_email="jbn@abreka.com",
14 |     description="A tiny, subprocess-based tool for reading a MS Access database(.rdb) as a Pandas DataFrame.",
15 |     long_description=open(README_FILE).read(),
16 |     data_files=['README.md'],
17 |     url="https://github.com/jbn/pandas_access"
18 | )


--------------------------------------------------------------------------------